From 3cca89e79ace53d5bb8e21629587c684b8ea82f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=9F=B3=E6=99=93=E4=BC=9F?= <39303645+Shixiaowei02@users.noreply.github.com> Date: Mon, 7 Feb 2022 15:06:54 +0800 Subject: [PATCH] Merge the develop branch (#39362) --- .gitignore | 7 +- CMakeLists.txt | 24 + cmake/configure.cmake | 5 + cmake/generic.cmake | 75 ++ cmake/neuware.cmake | 9 +- cmake/operators.cmake | 22 +- cmake/pten.cmake | 88 +- cmake/xpu_kp.cmake | 239 +++++ paddle/fluid/distributed/common/utils.h | 2 +- .../distributed/ps/service/brpc_ps_client.cc | 146 ++- .../distributed/ps/service/brpc_ps_client.h | 4 + .../ps/service/communicator/communicator.cc | 40 +- .../ps/service/communicator/communicator.h | 10 +- .../fluid/distributed/ps/service/ps_client.h | 11 + .../fluid/distributed/ps/table/CMakeLists.txt | 5 +- .../ps/table/depends/geo_recorder.h | 4 - .../ps/table/memory_sparse_geo_table.cc | 220 +++++ .../ps/table/memory_sparse_geo_table.h | 78 ++ paddle/fluid/distributed/ps/table/table.cc | 2 + paddle/fluid/distributed/test/CMakeLists.txt | 3 + .../distributed/test/memory_geo_table_test.cc | 123 +++ paddle/fluid/eager/CMakeLists.txt | 6 +- .../fluid/eager/accumulation/CMakeLists.txt | 3 +- .../eager/accumulation/accumulation_node.cc | 4 +- .../accumulation/gradient_accumulation.cc | 291 ------ paddle/fluid/eager/api/generated/.gitignore | 1 + .../eager_generated/backwards/CMakeLists.txt | 5 + .../eager_generated/forwards/CMakeLists.txt | 5 + .../eager/auto_code_generator/CMakeLists.txt | 2 +- .../auto_code_generator/eager_generator.cc | 18 +- .../final_state_generator/CMakeLists.txt | 9 +- .../final_state_generator/eager_gen.py | 177 ++-- .../generate_file_structures.py | 49 +- paddle/fluid/eager/eager_tensor.h | 7 +- paddle/fluid/eager/grad_tensor_holder.cc | 10 +- paddle/fluid/eager/legacy/CMakeLists.txt | 2 - paddle/fluid/eager/legacy/amp_auto_cast.cc | 262 ----- paddle/fluid/eager/legacy/amp_auto_cast.h | 90 -- paddle/fluid/eager/legacy/execution_context.h | 214 ----- .../fluid/eager/legacy/infer_shape_context.h | 427 --------- .../eager/legacy/infer_var_type_context.h | 264 ----- paddle/fluid/eager/legacy/op_runner.cc | 200 ---- paddle/fluid/eager/legacy/op_runner.h | 31 - .../fluid/eager/legacy/prepared_operator.cc | 364 ------- paddle/fluid/eager/legacy/prepared_operator.h | 96 -- paddle/fluid/eager/legacy/tensor_helper.cc | 114 --- paddle/fluid/eager/legacy/tensor_helper.h | 33 - paddle/fluid/eager/legacy/type_def.h | 44 - .../performance_tests/benchmark_eager_cpu.cc | 5 - .../performance_tests/benchmark_eager_cuda.cc | 4 - .../performance_tests/benchmark_fluid_cpu.cc | 5 - .../performance_tests/benchmark_fluid_cuda.cc | 5 - .../performance_tests/benchmark_utils.cc | 10 +- paddle/fluid/eager/utils.cc | 39 + paddle/fluid/eager/utils.h | 10 + paddle/fluid/framework/CMakeLists.txt | 6 +- paddle/fluid/framework/custom_kernel.cc | 8 +- paddle/fluid/framework/custom_kernel_test.cc | 6 +- paddle/fluid/framework/custom_operator.cc | 131 ++- paddle/fluid/framework/data_feed.cc | 20 + paddle/fluid/framework/data_feed.h | 5 + paddle/fluid/framework/data_feed.proto | 5 +- paddle/fluid/framework/data_set.cc | 19 +- paddle/fluid/framework/data_set.h | 4 + .../framework/data_type_transform_test.cu | 5 +- .../fluid/framework/details/nan_inf_utils.h | 19 +- paddle/fluid/framework/dim.h | 2 +- paddle/fluid/framework/downpour_worker.cc | 26 +- paddle/fluid/framework/fleet/CMakeLists.txt | 2 + paddle/fluid/framework/fleet/metrics.cc | 380 ++++++++ paddle/fluid/framework/fleet/metrics.h | 693 
++++++++++++++ paddle/fluid/framework/infershape_utils.cc | 39 +- .../framework/ir/mkldnn/cpu_quantize_pass.cc | 13 +- .../ir/shuffle_channel_detect_pass.cc | 3 +- paddle/fluid/framework/library_type.h | 7 +- ...interpretercore_event_garbage_collector.cc | 3 +- .../new_executor/interpretercore_util.cc | 1 - .../new_executor/interpretercore_util.h | 6 +- .../workqueue/nonblocking_threadpool.h | 12 +- .../workqueue/thread_data_registry.h | 126 +++ .../new_executor/workqueue/workqueue.cc | 24 +- .../new_executor/workqueue/workqueue.h | 29 +- .../new_executor/workqueue/workqueue_test.cc | 12 +- paddle/fluid/framework/operator.cc | 63 +- paddle/fluid/framework/operator.h | 26 +- paddle/fluid/framework/parallel_executor.cc | 2 +- paddle/fluid/framework/pten_utils.cc | 10 +- paddle/fluid/framework/pten_utils.h | 7 + paddle/fluid/framework/tensor.cc | 1 - paddle/fluid/framework/tensor_util.cc | 6 +- paddle/fluid/framework/tensor_util_test.cc | 20 + paddle/fluid/framework/type_defs.h | 2 +- paddle/fluid/imperative/CMakeLists.txt | 11 +- paddle/fluid/imperative/amp_auto_cast.cc | 107 ++- paddle/fluid/imperative/amp_auto_cast.h | 11 +- paddle/fluid/imperative/execution_context.h | 67 +- paddle/fluid/imperative/gloo_context.cc | 11 + .../fluid/imperative/gradient_accumulator.cc | 67 +- .../fluid/imperative/gradient_accumulator.h | 6 +- paddle/fluid/imperative/infer_shape_context.h | 77 +- .../fluid/imperative/infer_var_type_context.h | 42 +- .../imperative/jit/program_desc_tracer.cc | 7 + .../imperative/jit/program_desc_tracer.h | 4 + paddle/fluid/imperative/layer.cc | 59 +- paddle/fluid/imperative/layer.h | 6 + paddle/fluid/imperative/op_base.h | 8 + paddle/fluid/imperative/prepared_operator.cc | 60 +- paddle/fluid/imperative/prepared_operator.h | 80 +- paddle/fluid/imperative/tests/CMakeLists.txt | 2 +- paddle/fluid/imperative/tests/test_eager.cc | 100 ++ .../tests/test_gradient_accmulator.cc | 4 +- paddle/fluid/imperative/tests/test_hooks.cc | 9 +- paddle/fluid/imperative/tests/test_layer.cc | 7 +- paddle/fluid/imperative/tests/test_tracer.cc | 46 +- paddle/fluid/imperative/tracer.cc | 82 +- paddle/fluid/imperative/tracer.h | 22 +- paddle/fluid/imperative/type_defs.h | 2 +- paddle/fluid/imperative/var_helper.cc | 261 +++++ paddle/fluid/imperative/var_helper.h | 70 ++ .../fluid/inference/lite/test_engine_lite.cc | 4 + .../tensorrt/plugin/special_slice_plugin.cu | 8 +- .../fluid/inference/tensorrt/test_engine.cc | 12 + .../allocation/best_fit_allocator_test.cu | 5 + paddle/fluid/memory/malloc_test.cu | 35 +- .../fluid/operators/arg_min_max_op_base.cu.h | 2 +- paddle/fluid/operators/argsort_op.cu | 4 +- .../fluid/operators/broadcast_tensors_op.cu | 4 +- paddle/fluid/operators/cast_op.cc | 5 - paddle/fluid/operators/cholesky_solve_op.cu | 3 +- paddle/fluid/operators/clip_by_norm_op.cu | 2 +- paddle/fluid/operators/concat_op.cc | 9 - paddle/fluid/operators/conj_op.cc | 19 +- .../operators/controlflow/compare_all_op.cu | 3 +- .../fluid/operators/copy_cross_scope_test.cc | 8 + paddle/fluid/operators/digamma_op.cc | 16 +- paddle/fluid/operators/digamma_op.h | 81 -- .../elementwise/elementwise_functor.h | 2 +- .../elementwise/elementwise_mul_op.cu | 5 +- .../operators/elementwise/elementwise_op.h | 44 - .../elementwise/elementwise_op_function.h | 3 +- paddle/fluid/operators/empty_op.cc | 14 - paddle/fluid/operators/feed_forward_test.cu | 12 + paddle/fluid/operators/fill_any_like_op.cc | 5 - paddle/fluid/operators/fill_constant_op.cc | 23 - paddle/fluid/operators/flatten_op.cc | 12 - 
paddle/fluid/operators/fused/attn_gemm.h | 3 +- .../operators/fused/fused_dropout_helper.h | 29 +- .../fused_layernorm_residual_dropout_bias.h | 25 + paddle/fluid/operators/gelu_op.cu | 4 +- .../fluid/operators/graph_khop_sampler_op.cu | 4 +- paddle/fluid/operators/graph_send_recv_op.cu | 4 +- paddle/fluid/operators/gumbel_softmax_op.cu | 2 +- paddle/fluid/operators/index_sample_op.cu | 6 +- paddle/fluid/operators/interpolate_v2_op.cu | 8 +- paddle/fluid/operators/kron_op.h | 4 +- paddle/fluid/operators/kthvalue_op.cu | 2 +- paddle/fluid/operators/layer_norm_kernel.cu.h | 557 ++++++++++- .../operators/lite/lite_engine_op_test.cc | 4 + .../operators/margin_cross_entropy_op.cu | 4 +- .../fluid/operators/math/beam_search_test.cc | 79 ++ paddle/fluid/operators/math/blas_impl.cu.h | 577 +++++++++++ paddle/fluid/operators/math/blas_impl.hip.h | 406 ++++++++ paddle/fluid/operators/math/concat_test.cc | 27 + .../math/cusparse_conversion_api_test.cc | 10 + paddle/fluid/operators/math/im2col_test.cc | 162 ++++ paddle/fluid/operators/math/inclusive_scan.h | 2 +- paddle/fluid/operators/math/math_function.cu | 13 + .../operators/math/math_function_test.cc | 1 + .../operators/math/math_function_test.cu | 36 + paddle/fluid/operators/math/vol2col_test.cc | 121 ++- paddle/fluid/operators/mean_op.cu | 3 +- .../fluid/operators/nccl/nccl_op_test.cu.cc | 7 +- paddle/fluid/operators/p_norm_op.cu | 12 +- paddle/fluid/operators/pool_op.h | 4 +- paddle/fluid/operators/prelu_op.cu | 3 +- .../pscore/send_and_recv_op_gpu_test.cc | 4 + .../fluid/operators/reduce_ops/reduce_op.cu.h | 6 +- paddle/fluid/operators/renorm_op.cu | 10 +- paddle/fluid/operators/reshape_op.cc | 23 +- paddle/fluid/operators/roll_op.cu | 2 +- paddle/fluid/operators/scale_op.cc | 10 +- paddle/fluid/operators/scale_op.h | 28 +- paddle/fluid/operators/scatter.cu.h | 3 +- paddle/fluid/operators/sign_op.cc | 3 +- paddle/fluid/operators/sign_op.h | 47 - .../softmax_with_cross_entropy_op.cc | 17 +- .../softmax_with_cross_entropy_op_mlu.cc | 151 +++ paddle/fluid/operators/solve_op.h | 3 +- paddle/fluid/operators/strided_memcpy_test.cc | 9 + .../tensorrt/tensorrt_engine_op_test.cc | 12 + paddle/fluid/operators/top_k_function_cuda.h | 2 +- paddle/fluid/operators/trace_op.cu | 3 +- paddle/fluid/operators/triangular_solve_op.cu | 3 +- paddle/fluid/operators/viterbi_decode_op.cu | 2 +- paddle/fluid/platform/CMakeLists.txt | 11 +- paddle/fluid/platform/bfloat16_test.cu | 4 + paddle/fluid/platform/collective_helper.cc | 15 +- paddle/fluid/platform/collective_helper.h | 100 ++ .../fluid/platform/device/gpu/CMakeLists.txt | 4 +- .../platform/device/gpu/cuda/CMakeLists.txt | 1 - .../platform/device/gpu/cuda/cuda_helper.h | 5 +- .../device/gpu/cuda/cusparse_helper.h | 6 +- paddle/fluid/platform/device/gpu/gpu_info.cc | 120 ++- paddle/fluid/platform/device/gpu/gpu_info.h | 10 +- .../fluid/platform/device/gpu/nccl_helper.h | 17 +- .../platform/device/gpu/rocm/CMakeLists.txt | 2 - .../fluid/platform/device/mlu/CMakeLists.txt | 1 + .../fluid/platform/device/mlu/cncl_helper.h | 57 ++ .../platform/device/mlu/device_context.h | 15 + paddle/fluid/platform/device/mlu/enforce.h | 14 + .../fluid/platform/device/mlu/enforce_test.cc | 10 + .../device/mlu/mlu_collective_helper.cc | 179 ++++ paddle/fluid/platform/device/mlu/mlu_info.h | 6 + .../fluid/platform/device/xpu/xpu2_op_list.h | 7 +- .../xpu/xpu_op_kpfirst_list.h} | 32 +- .../fluid/platform/device/xpu/xpu_op_list.cc | 44 +- .../fluid/platform/device/xpu/xpu_op_list.h | 6 + paddle/fluid/platform/device_context.cc | 229 
++--- paddle/fluid/platform/device_context.h | 156 ++- paddle/fluid/platform/device_context_test.cu | 28 + paddle/fluid/platform/device_event_gpu.cc | 5 +- paddle/fluid/platform/device_event_test.cc | 5 +- paddle/fluid/platform/enforce.h | 4 +- paddle/fluid/platform/event.h | 6 +- paddle/fluid/platform/flags.cc | 22 +- paddle/fluid/platform/float16_test.cu | 4 + paddle/fluid/platform/for_range.h | 36 + paddle/fluid/platform/os_info.cc | 81 +- paddle/fluid/platform/profiler.cc | 61 +- paddle/fluid/platform/profiler/CMakeLists.txt | 4 +- paddle/fluid/platform/profiler/common_event.h | 66 ++ .../fluid/platform/profiler/event_tracing.h | 13 +- .../platform/profiler/host_event_recorder.h | 78 +- paddle/fluid/platform/profiler/host_tracer.cc | 71 ++ paddle/fluid/platform/profiler/host_tracer.h | 63 ++ paddle/fluid/platform/profiler/profiler.cc | 76 ++ paddle/fluid/platform/profiler/profiler.h | 74 ++ .../fluid/platform/profiler/profiler_test.cc | 52 + paddle/fluid/platform/profiler/trace_event.h | 10 +- .../platform/profiler/trace_event_collector.h | 49 +- paddle/fluid/platform/stream/cuda_stream.cc | 16 +- paddle/fluid/platform/stream/cuda_stream.h | 16 +- paddle/fluid/platform/transform_test.cu | 9 + paddle/fluid/pybind/CMakeLists.txt | 3 +- paddle/fluid/pybind/cuda_streams_py.cc | 8 +- paddle/fluid/pybind/data_set_py.cc | 2 + paddle/fluid/pybind/eager.cc | 2 +- paddle/fluid/pybind/eager_functions.cc | 2 +- paddle/fluid/pybind/eager_method.cc | 2 +- paddle/fluid/pybind/eager_properties.cc | 2 +- paddle/fluid/pybind/eager_utils.cc | 2 +- paddle/fluid/pybind/imperative.cc | 30 +- paddle/fluid/pybind/metrics_py.cc | 55 ++ paddle/fluid/pybind/metrics_py.h | 28 + paddle/fluid/pybind/pybind.cc | 159 +++- paddle/pten/api/CMakeLists.txt | 3 +- paddle/pten/api/all.h | 3 +- paddle/pten/api/ext/op_meta_info.h | 210 ++-- paddle/pten/api/include/kernel_signature.h | 128 --- .../api/include/{utils.h => manual_api.h} | 5 + .../api/include/sparse_api.h} | 24 +- paddle/pten/api/lib/CMakeLists.txt | 21 +- paddle/pten/api/lib/api_declare.h | 1 + paddle/pten/api/lib/api_utils.h | 37 +- paddle/pten/api/lib/data_transform.cc | 232 +++++ paddle/pten/api/lib/data_transform.h | 75 ++ paddle/pten/api/lib/kernel_dispatch.cc | 2 +- .../pten/api/lib/{utils.cc => manual_api.cc} | 13 +- paddle/pten/api/lib/op_meta_info.cc | 92 ++ paddle/pten/api/lib/sparse_api.cc | 102 ++ paddle/pten/api/lib/tensor.cc | 14 +- paddle/pten/api/lib/utils/storage.h | 18 +- paddle/pten/api/lib/utils/tensor_utils.cc | 66 +- paddle/pten/api/lib/utils/tensor_utils.h | 7 +- paddle/pten/backends/CMakeLists.txt | 8 + paddle/pten/backends/cpu/cpu_context.cc | 68 +- paddle/pten/backends/cpu/cpu_context.h | 34 +- paddle/pten/backends/gpu/CMakeLists.txt | 9 + paddle/pten/backends/gpu/cuda/CMakeLists.txt | 1 + paddle/pten/backends/gpu/cuda/cuda_helper.h | 72 ++ .../backends}/gpu/cuda/cuda_info.cc | 131 ++- paddle/pten/backends/gpu/forwards.h | 111 +++ paddle/pten/backends/gpu/gpu_context.cc | 899 ++++++++++++++++++ paddle/pten/backends/gpu/gpu_context.h | 165 +++- paddle/pten/backends/gpu/gpu_decls.h | 70 ++ .../backends/gpu/gpu_helper.h} | 17 +- paddle/pten/backends/gpu/gpu_info.cc | 60 ++ paddle/pten/backends/gpu/gpu_info.h | 132 +++ paddle/pten/backends/gpu/gpu_launch_config.h | 181 ++++ paddle/pten/backends/gpu/gpu_types.h | 73 ++ paddle/pten/backends/gpu/rocm/CMakeLists.txt | 1 + paddle/pten/backends/gpu/rocm/rocm_helper.h | 72 ++ .../backends}/gpu/rocm/rocm_info.cc | 128 ++- paddle/pten/backends/xpu/xpu_context.cc | 102 +- 
paddle/pten/backends/xpu/xpu_context.h | 32 +- paddle/pten/core/CMakeLists.txt | 21 +- paddle/pten/core/allocator.h | 6 +- paddle/pten/core/compat/CMakeLists.txt | 13 +- paddle/pten/core/compat/arg_map_context.cc | 2 +- paddle/pten/core/compat/arg_map_context.h | 3 + .../pten/core/{ => compat}/convert_utils.cc | 68 +- paddle/pten/core/{ => compat}/convert_utils.h | 17 +- paddle/pten/core/compat/op_utils.h | 94 +- paddle/pten/core/compat/type_defs.h | 96 ++ paddle/pten/core/ddim.cc | 66 +- paddle/pten/core/ddim.h | 14 +- paddle/pten/core/dense_tensor.cc | 6 +- paddle/pten/core/dense_tensor.h | 27 +- paddle/pten/core/dense_tensor_impl.cc | 3 +- paddle/pten/core/device_context.cc | 14 +- paddle/pten/core/device_context.h | 6 +- paddle/pten/core/enforce.h | 2 +- paddle/pten/core/infermeta_utils.h | 12 +- paddle/pten/core/kernel_alias_name.h | 53 -- paddle/pten/core/kernel_context.h | 12 +- paddle/pten/core/kernel_def.h | 38 - paddle/pten/core/kernel_factory.cc | 10 +- paddle/pten/core/kernel_factory.h | 15 +- paddle/pten/core/kernel_registry.h | 15 +- paddle/pten/core/kernel_utils.h | 14 +- paddle/pten/core/lod_utils.cc | 4 +- paddle/pten/core/meta_tensor.cc | 49 +- paddle/pten/core/meta_tensor.h | 25 +- paddle/pten/core/sparse_csr_tensor.cc | 11 +- paddle/pten/core/storage.h | 31 +- paddle/pten/core/tensor_base.h | 6 +- paddle/pten/core/tensor_meta.h | 2 - .../core/{compat_utils.h => tensor_utils.h} | 18 +- paddle/pten/core/type_defs.h | 118 +-- paddle/pten/core/{ => utils}/array.h | 15 +- paddle/pten/core/{ => utils}/dim.h | 2 +- paddle/pten/core/utils/rw_lock.h | 26 +- .../pten/core/{ => utils}/unroll_array_ops.h | 0 paddle/pten/infermeta/CMakeLists.txt | 4 +- paddle/pten/infermeta/backward.cc | 16 +- paddle/pten/infermeta/backward.h | 16 +- paddle/pten/infermeta/binary.cc | 68 +- paddle/pten/infermeta/binary.h | 55 +- paddle/pten/infermeta/multiary.cc | 23 +- paddle/pten/infermeta/multiary.h | 10 +- paddle/pten/infermeta/nullary.cc | 24 +- paddle/pten/infermeta/nullary.h | 29 +- paddle/pten/infermeta/unary.cc | 139 +-- paddle/pten/infermeta/unary.h | 67 +- paddle/pten/kernels/CMakeLists.txt | 3 +- paddle/pten/kernels/cast_kernel.h | 5 +- paddle/pten/kernels/complex_kernel.h | 5 +- paddle/pten/kernels/concat_kernel.h | 12 +- paddle/pten/kernels/cpu/copy_kernel.cc | 2 +- .../pten/kernels/cpu/digamma_grad_kernel.cc | 23 + paddle/pten/kernels/cpu/digamma_kernel.cc | 23 + paddle/pten/kernels/cpu/scale_kernel.cc | 2 +- paddle/pten/kernels/digamma_grad_kernel.h | 27 + paddle/pten/kernels/digamma_kernel.h | 24 + paddle/pten/kernels/dot_kernel.h | 5 +- paddle/pten/kernels/empty_kernel.h | 10 +- paddle/pten/kernels/flatten_kernel.h | 5 +- paddle/pten/kernels/full_kernel.h | 10 +- paddle/pten/kernels/funcs/concat_funcs.h | 2 +- .../pten/kernels/funcs/cuda_kernel_config.h | 3 +- paddle/pten/kernels/funcs/elementwise_base.h | 5 +- paddle/pten/kernels/funcs/transpose.cu | 1 + paddle/pten/kernels/gpu/concat_and_split.h | 11 +- paddle/pten/kernels/gpu/copy_kernel.cu | 22 +- .../pten/kernels/gpu/digamma_grad_kernel.cu | 22 + paddle/pten/kernels/gpu/digamma_kernel.cu | 23 + paddle/pten/kernels/gpu/elementwise.h | 8 +- paddle/pten/kernels/gpu/math_kernel.cu | 2 +- paddle/pten/kernels/gpu/reduce.h | 28 +- paddle/pten/kernels/gpu/scale_kernel.cu | 2 +- .../kernels/impl/digamma_grad_kernel_impl.h | 55 ++ .../pten/kernels/impl/digamma_kernel_impl.h | 49 + .../kernels/impl/matmul_grad_kernel_impl.h | 8 +- paddle/pten/kernels/math_kernel.h | 32 +- paddle/pten/kernels/matmul_kernel.h | 5 +- 
paddle/pten/kernels/reshape_kernel.cc | 11 +- paddle/pten/kernels/reshape_kernel.h | 5 +- paddle/pten/kernels/scale_kernel.h | 14 +- .../kernels/selected_rows/scale_kernel.cc | 68 ++ paddle/pten/kernels/sign_kernel.h | 5 +- paddle/pten/kernels/sparse/CMakeLists.txt | 3 + .../kernels/sparse/cpu/sparse_utils_kernel.cc | 187 ++++ .../kernels/sparse/gpu/sparse_utils_kernel.cu | 360 +++++++ .../pten/kernels/sparse/sparse_utils_kernel.h | 76 ++ paddle/pten/kernels/transfer_layout_kernel.cc | 77 ++ paddle/pten/kernels/transfer_layout_kernel.h | 43 + paddle/pten/kernels/xpu/cast_kernel.cc | 14 +- paddle/pten/kernels/xpu/copy_kernel.cc | 2 +- paddle/pten/kernels/xpu/full_kernel.cc | 18 +- paddle/pten/kernels/xpu/scale_kernel.cc | 2 +- paddle/pten/ops/compat/cast_sig.cc | 25 + paddle/pten/ops/compat/concat_sig.cc | 28 + paddle/pten/ops/compat/elementwise_sig.cc | 83 ++ paddle/pten/ops/compat/empty_sig.cc | 31 + paddle/pten/ops/compat/fill_any_like_sig.cc | 28 + paddle/pten/ops/compat/fill_constant_sig.cc | 73 ++ paddle/pten/ops/compat/flatten_sig.cc | 37 + paddle/pten/ops/compat/matmul_sig.cc | 22 + paddle/pten/ops/compat/reduce_sig.cc | 52 + paddle/pten/ops/compat/reshape_sig.cc | 35 + paddle/pten/tests/api/CMakeLists.txt | 7 +- paddle/pten/tests/api/scale_api.h | 13 +- paddle/pten/tests/api/test_cast_api.cc | 2 +- paddle/pten/tests/api/test_concat_api.cc | 7 +- paddle/pten/tests/api/test_conj_api.cc | 2 +- paddle/pten/tests/api/test_data_transform.cc | 100 ++ paddle/pten/tests/api/test_dot_api.cc | 4 +- paddle/pten/tests/api/test_elementwise_api.cc | 16 +- paddle/pten/tests/api/test_empty_api.cc | 6 +- paddle/pten/tests/api/test_fill_api.cc | 12 +- paddle/pten/tests/api/test_flatten_api.cc | 2 +- paddle/pten/tests/api/test_matmul_api.cc | 15 +- paddle/pten/tests/api/test_mean_api.cc | 2 +- paddle/pten/tests/api/test_reshape_api.cc | 2 +- .../pten/tests/api/test_sparse_utils_api.cc | 105 ++ paddle/pten/tests/api/test_sum_api.cc | 2 +- paddle/pten/tests/api/test_to_api.cc | 4 +- paddle/pten/tests/core/CMakeLists.txt | 2 + paddle/pten/tests/core/test_convert_utils.cc | 2 +- paddle/pten/tests/core/test_device_context.cc | 36 +- paddle/pten/tests/core/test_dim.cu | 4 +- paddle/pten/tests/core/test_meta_fn_utils.cc | 2 +- .../{ => tests}/core/unroll_array_ops_test.cc | 4 +- paddle/pten/tests/kernels/CMakeLists.txt | 1 + .../pten/tests/kernels/test_cast_dev_api.cc | 17 +- .../pten/tests/kernels/test_concat_dev_api.cc | 26 +- .../pten/tests/kernels/test_conj_dev_api.cc | 17 +- .../pten/tests/kernels/test_copy_dev_api.cc | 12 +- .../tests/kernels/test_creation_dev_api.cc | 50 +- paddle/pten/tests/kernels/test_dot_dev_api.cc | 26 +- .../tests/kernels/test_elementwise_dev_api.cc | 104 +- .../tests/kernels/test_flatten_dev_api.cc | 10 +- .../pten/tests/kernels/test_matmul_dev_api.cc | 12 +- .../pten/tests/kernels/test_mean_dev_api.cc | 17 +- .../tests/kernels/test_reshape_dev_api.cc | 10 +- .../pten/tests/kernels/test_scale_dev_api.cc | 38 +- .../kernels/test_sparse_utils_dev_api.cc | 386 ++++++++ paddle/pten/tests/kernels/test_sum_dev_api.cc | 18 +- python/paddle/autograd/functional.py | 67 +- .../distributed/auto_parallel/completion.py | 29 +- .../auto_parallel/operators/__init__.py | 1 + .../auto_parallel/operators/dist_split.py | 115 +++ .../paddle/distributed/auto_parallel/utils.py | 2 +- .../distributed/fleet/dataset/dataset.py | 34 + .../sharding_optimizer_stage2.py | 6 +- .../meta_parallel/sharding/sharding_stage3.py | 6 + .../distributed/fleet/runtime/the_one_ps.py | 3 +- 
python/paddle/distributed/metric/__init__.py | 16 + python/paddle/distributed/metric/metrics.py | 138 +++ python/paddle/distributed/passes/cpp_pass.py | 13 + python/paddle/distribution/__init__.py | 4 +- python/paddle/distribution/beta.py | 47 +- python/paddle/distribution/categorical.py | 82 +- python/paddle/distribution/dirichlet.py | 50 +- python/paddle/distribution/distribution.py | 28 +- .../paddle/distribution/exponential_family.py | 2 + python/paddle/distribution/kl.py | 22 +- python/paddle/distribution/multinomial.py | 184 ++++ python/paddle/fluid/backward.py | 8 +- .../fluid/incubate/fleet/base/role_maker.py | 3 +- python/paddle/fluid/layers/loss.py | 2 +- .../fluid/tests/custom_op/custom_relu_op.cc | 60 ++ .../fluid/tests/custom_op/custom_relu_op.cu | 28 + .../custom_op/test_custom_relu_op_jit.py | 3 +- .../fluid/tests/unittests/CMakeLists.txt | 2 +- ...bian_static.py => test_autograd_static.py} | 288 +++--- ...test_dist_fuse_relu_depthwise_conv_pass.py | 92 ++ .../test_distribution_categorical.py | 2 +- .../test_distribution_multinomial.py | 139 +++ .../test_distribution_multinomial_static.py | 168 ++++ .../dygraph_sharding_optimizer_stage2.py | 13 +- .../unittests/dygraph_sharding_stage3.py | 27 +- .../test_softmax_with_cross_entropy_op_mlu.py | 161 ++++ .../paddle/fluid/tests/unittests/op_test.py | 21 +- .../fluid/tests/unittests/test_dataset.py | 8 + .../fluid/tests/unittests/test_diag_v2.py | 6 + .../fluid/tests/unittests/test_diagonal_op.py | 12 +- .../fluid/tests/unittests/test_digamma_op.py | 12 + .../test_get_all_registered_op_kernels.py | 43 + .../fluid/tests/unittests/test_trunc_op.py | 5 + .../unittests/xpu/test_softmax_op_xpu.py | 102 +- .../incubate/nn/layer/fused_transformer.py | 2 +- python/paddle/nn/quant/quant_layers.py | 2 +- python/paddle/optimizer/optimizer.py | 6 +- python/paddle/tensor/search.py | 57 +- python/paddle/utils/code_gen/api.yaml | 2 +- python/paddle/utils/code_gen/api_gen.py | 39 +- .../paddle/utils/code_gen/backward_api_gen.py | 49 +- python/paddle/utils/code_gen/gen_utils.py | 66 +- python/setup.py.in | 9 +- tools/check_file_diff_approvals.sh | 16 +- 491 files changed, 15757 insertions(+), 6063 deletions(-) create mode 100644 cmake/xpu_kp.cmake create mode 100644 paddle/fluid/distributed/ps/table/memory_sparse_geo_table.cc create mode 100644 paddle/fluid/distributed/ps/table/memory_sparse_geo_table.h create mode 100644 paddle/fluid/distributed/test/memory_geo_table_test.cc delete mode 100644 paddle/fluid/eager/accumulation/gradient_accumulation.cc delete mode 100644 paddle/fluid/eager/legacy/CMakeLists.txt delete mode 100644 paddle/fluid/eager/legacy/amp_auto_cast.cc delete mode 100644 paddle/fluid/eager/legacy/amp_auto_cast.h delete mode 100644 paddle/fluid/eager/legacy/execution_context.h delete mode 100644 paddle/fluid/eager/legacy/infer_shape_context.h delete mode 100644 paddle/fluid/eager/legacy/infer_var_type_context.h delete mode 100644 paddle/fluid/eager/legacy/op_runner.cc delete mode 100644 paddle/fluid/eager/legacy/op_runner.h delete mode 100644 paddle/fluid/eager/legacy/prepared_operator.cc delete mode 100644 paddle/fluid/eager/legacy/prepared_operator.h delete mode 100644 paddle/fluid/eager/legacy/tensor_helper.cc delete mode 100644 paddle/fluid/eager/legacy/tensor_helper.h delete mode 100644 paddle/fluid/eager/legacy/type_def.h create mode 100644 paddle/fluid/framework/fleet/metrics.cc create mode 100644 paddle/fluid/framework/fleet/metrics.h create mode 100644 paddle/fluid/framework/new_executor/workqueue/thread_data_registry.h 
create mode 100644 paddle/fluid/imperative/tests/test_eager.cc create mode 100644 paddle/fluid/imperative/var_helper.cc create mode 100644 paddle/fluid/imperative/var_helper.h delete mode 100644 paddle/fluid/operators/sign_op.h create mode 100644 paddle/fluid/operators/softmax_with_cross_entropy_op_mlu.cc create mode 100644 paddle/fluid/platform/device/mlu/cncl_helper.h create mode 100644 paddle/fluid/platform/device/mlu/mlu_collective_helper.cc rename paddle/fluid/platform/{profiler/host_event_recorder.cc => device/xpu/xpu_op_kpfirst_list.h} (50%) create mode 100644 paddle/fluid/platform/profiler/common_event.h create mode 100644 paddle/fluid/platform/profiler/host_tracer.cc create mode 100644 paddle/fluid/platform/profiler/host_tracer.h create mode 100644 paddle/fluid/platform/profiler/profiler.cc create mode 100644 paddle/fluid/platform/profiler/profiler.h create mode 100644 paddle/fluid/platform/profiler/profiler_test.cc create mode 100644 paddle/fluid/pybind/metrics_py.cc create mode 100644 paddle/fluid/pybind/metrics_py.h delete mode 100644 paddle/pten/api/include/kernel_signature.h rename paddle/pten/api/include/{utils.h => manual_api.h} (88%) rename paddle/{fluid/operators/digamma_op.cu => pten/api/include/sparse_api.h} (51%) create mode 100644 paddle/pten/api/lib/data_transform.cc create mode 100644 paddle/pten/api/lib/data_transform.h rename paddle/pten/api/lib/{utils.cc => manual_api.cc} (91%) create mode 100644 paddle/pten/api/lib/sparse_api.cc create mode 100644 paddle/pten/backends/gpu/CMakeLists.txt create mode 100644 paddle/pten/backends/gpu/cuda/CMakeLists.txt create mode 100644 paddle/pten/backends/gpu/cuda/cuda_helper.h rename paddle/{fluid/platform/device => pten/backends}/gpu/cuda/cuda_info.cc (71%) create mode 100644 paddle/pten/backends/gpu/forwards.h create mode 100644 paddle/pten/backends/gpu/gpu_context.cc create mode 100644 paddle/pten/backends/gpu/gpu_decls.h rename paddle/{fluid/eager/accumulation/gradient_accumulation.h => pten/backends/gpu/gpu_helper.h} (69%) create mode 100644 paddle/pten/backends/gpu/gpu_info.cc create mode 100644 paddle/pten/backends/gpu/gpu_info.h create mode 100644 paddle/pten/backends/gpu/gpu_launch_config.h create mode 100644 paddle/pten/backends/gpu/gpu_types.h create mode 100644 paddle/pten/backends/gpu/rocm/CMakeLists.txt create mode 100644 paddle/pten/backends/gpu/rocm/rocm_helper.h rename paddle/{fluid/platform/device => pten/backends}/gpu/rocm/rocm_info.cc (72%) rename paddle/pten/core/{ => compat}/convert_utils.cc (81%) rename paddle/pten/core/{ => compat}/convert_utils.h (74%) create mode 100644 paddle/pten/core/compat/type_defs.h delete mode 100644 paddle/pten/core/kernel_alias_name.h delete mode 100644 paddle/pten/core/kernel_def.h rename paddle/pten/core/{compat_utils.h => tensor_utils.h} (81%) rename paddle/pten/core/{ => utils}/array.h (88%) rename paddle/pten/core/{ => utils}/dim.h (98%) rename paddle/pten/core/{ => utils}/unroll_array_ops.h (100%) create mode 100644 paddle/pten/kernels/cpu/digamma_grad_kernel.cc create mode 100644 paddle/pten/kernels/cpu/digamma_kernel.cc create mode 100644 paddle/pten/kernels/digamma_grad_kernel.h create mode 100644 paddle/pten/kernels/digamma_kernel.h create mode 100644 paddle/pten/kernels/gpu/digamma_grad_kernel.cu create mode 100644 paddle/pten/kernels/gpu/digamma_kernel.cu create mode 100644 paddle/pten/kernels/impl/digamma_grad_kernel_impl.h create mode 100644 paddle/pten/kernels/impl/digamma_kernel_impl.h create mode 100644 paddle/pten/kernels/selected_rows/scale_kernel.cc create 
mode 100644 paddle/pten/kernels/sparse/CMakeLists.txt create mode 100644 paddle/pten/kernels/sparse/cpu/sparse_utils_kernel.cc create mode 100644 paddle/pten/kernels/sparse/gpu/sparse_utils_kernel.cu create mode 100644 paddle/pten/kernels/sparse/sparse_utils_kernel.h create mode 100644 paddle/pten/kernels/transfer_layout_kernel.cc create mode 100644 paddle/pten/kernels/transfer_layout_kernel.h create mode 100644 paddle/pten/ops/compat/cast_sig.cc create mode 100644 paddle/pten/ops/compat/concat_sig.cc create mode 100644 paddle/pten/ops/compat/elementwise_sig.cc create mode 100644 paddle/pten/ops/compat/empty_sig.cc create mode 100644 paddle/pten/ops/compat/fill_any_like_sig.cc create mode 100644 paddle/pten/ops/compat/fill_constant_sig.cc create mode 100644 paddle/pten/ops/compat/flatten_sig.cc create mode 100644 paddle/pten/ops/compat/matmul_sig.cc create mode 100644 paddle/pten/ops/compat/reduce_sig.cc create mode 100644 paddle/pten/ops/compat/reshape_sig.cc create mode 100644 paddle/pten/tests/api/test_data_transform.cc create mode 100644 paddle/pten/tests/api/test_sparse_utils_api.cc rename paddle/pten/{ => tests}/core/unroll_array_ops_test.cc (96%) create mode 100644 paddle/pten/tests/kernels/test_sparse_utils_dev_api.cc create mode 100644 python/paddle/distributed/auto_parallel/operators/dist_split.py create mode 100644 python/paddle/distributed/metric/__init__.py create mode 100644 python/paddle/distributed/metric/metrics.py create mode 100644 python/paddle/distribution/multinomial.py rename python/paddle/fluid/tests/unittests/autograd/{test_jacobian_static.py => test_autograd_static.py} (53%) create mode 100644 python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_relu_depthwise_conv_pass.py create mode 100644 python/paddle/fluid/tests/unittests/distribution/test_distribution_multinomial.py create mode 100644 python/paddle/fluid/tests/unittests/distribution/test_distribution_multinomial_static.py create mode 100644 python/paddle/fluid/tests/unittests/mlu/test_softmax_with_cross_entropy_op_mlu.py create mode 100644 python/paddle/fluid/tests/unittests/test_get_all_registered_op_kernels.py diff --git a/.gitignore b/.gitignore index 14b75fce515..ae61959a4bd 100644 --- a/.gitignore +++ b/.gitignore @@ -4,10 +4,13 @@ paddle/fluid/API_DEV.spec paddle/fluid/API_PR.spec paddle/fluid/op_use_default_grad_maker_DEV.spec paddle/fluid/op_use_default_grad_maker_PR.spec -paddle/pten/api/*/api.* -paddle/pten/api/*/backward* +paddle/pten/api/include/api.h +paddle/pten/api/lib/api.cc +paddle/pten/api/backward/backward_api.h +paddle/pten/api/lib/backward_api.cc paddle/pten/include/* paddle/pten/extension.h +paddle/fluid/eager/api/generated/* *.DS_Store *.vs diff --git a/CMakeLists.txt b/CMakeLists.txt index 047db58cfdf..cd131e2d708 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -43,6 +43,7 @@ option(WITH_ONEMKL "Compile PaddlePaddle with oneMKL" OFF) option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND}) option(WITH_TENSORRT "Compile PaddlePaddle with NVIDIA TensorRT" OFF) option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN XPU" OFF) +option(WITH_XPU_KP "Compile PaddlePaddle with BAIDU XPU compiler " OFF) option(WITH_MLU "Compile PaddlePaddle with CAMBRICON MLU" OFF) option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode" OFF) option(WITH_ASCEND "Compile PaddlePaddle with ASCEND" OFF) @@ -59,6 +60,9 @@ include(generic) # simplify cmake module if (WITH_GPU AND WITH_XPU) message(FATAL_ERROR "Error when compile GPU and XPU at the same time") endif() 
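For orientation, the WITH_XPU_KP switch introduced in this hunk is driven from the cmake command line; a minimal configure sketch, assuming the default toolchain locations that cmake/xpu_kp.cmake (added later in this patch) falls back to — treat both paths as placeholders, not requirements:

  cmake .. -DWITH_XPU=ON -DWITH_XPU_KP=ON \
      -DXPU_TOOLCHAIN=/workspace/paddle/xpu-demo/XTDK \
      -DHOST_SYSROOT=/opt/compiler/gcc-8.2

Passing -DWITH_XPU=ON explicitly is optional, since a later hunk in this file forces WITH_XPU to ON whenever WITH_XPU_KP is enabled.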
+if (WITH_GPU AND WITH_XPU_KP)
+  message(FATAL_ERROR "Error when compile GPU and XPU2 at the same time")
+endif()
 if (WITH_GPU AND WITH_ASCEND)
   message(FATAL_ERROR "Error when compile GPU and ASCEND at the same time")
 endif()
@@ -226,6 +230,7 @@ option(WITH_INFRT "Compile PaddlePaddle with INFRT" OFF)
 option(WITH_NCCL "Compile PaddlePaddle with NCCL support" ON)
 option(WITH_RCCL "Compile PaddlePaddle with RCCL support" ON)
 option(WITH_XPU_BKCL "Compile PaddlePaddle with BAIDU KUNLUN XPU BKCL" OFF)
+option(WITH_CNCL "Compile PaddlePaddle with CNCL support" OFF)
 option(WITH_CRYPTO "Compile PaddlePaddle with crypto support" ON)
 option(WITH_ARM "Compile PaddlePaddle with arm support" OFF)
 option(WITH_SW "Compile PaddlePaddle with sw support" OFF)
@@ -273,6 +278,14 @@ if (NOT WITH_GPU AND WITH_NCCL)
     "Disable NCCL when compiling without GPU" FORCE)
 endif()
 
+# force WITH_XPU on when WITH_XPU_KP
+if (WITH_XPU_KP AND NOT WITH_XPU)
+  MESSAGE(WARNING
+    "Enable WITH_XPU when compiling with WITH_XPU_KP. Force WITH_XPU=ON.")
+  set(WITH_XPU ON CACHE STRING
+    "Enable WITH_XPU when compiling with WITH_XPU_KP" FORCE)
+endif()
+
 if (NOT WITH_XPU AND WITH_XPU_BKCL)
   MESSAGE(WARNING
     "Disable BKCL when compiling without XPU. Force WITH_XPU_BKCL=OFF.")
@@ -280,6 +293,13 @@ if (NOT WITH_XPU AND WITH_XPU_BKCL)
     "Disable BKCL when compiling without XPU" FORCE)
 endif()
 
+if (NOT WITH_MLU AND WITH_CNCL)
+  MESSAGE(WARNING
+    "Disable CNCL when compiling without MLU. Force WITH_MLU=OFF.")
+  set(WITH_MLU OFF CACHE STRING
+    "Disable CNCL when compiling without MLU" FORCE)
+endif()
+
 if(WITH_NCCL)
   add_definitions("-DPADDLE_WITH_NCCL")
   include(nccl)
@@ -317,6 +337,10 @@ if(WITH_ROCM)
   include(miopen) # set miopen libraries, must before configure
 endif(WITH_ROCM)
 
+if(WITH_XPU_KP)
+  include(xpu_kp)
+endif()
+
 if (NOT WITH_ROCM AND WITH_RCCL)
   MESSAGE(WARNING
     "Disable RCCL when compiling without ROCM.
Force WITH_RCCL=OFF.") diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 88e8dde8add..9ebde06bd01 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -99,6 +99,11 @@ if(WITH_XPU) add_definitions(-DPADDLE_WITH_XPU) endif() +if(WITH_XPU_KP) + message(STATUS "Compile with XPU_KP!") + add_definitions(-DPADDLE_WITH_XPU_KP) +endif() + if(WITH_IPU) message(STATUS "Compile with IPU!") add_definitions(-DPADDLE_WITH_IPU) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 2004abcbfa1..6655963e728 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -654,6 +654,81 @@ function(hip_test TARGET_NAME) endif() endfunction(hip_test) +function(xpu_library TARGET_NAME) + if (WITH_XPU_KP) + set(options STATIC static SHARED shared) + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(xpu_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + if(xpu_library_SRCS) + if (xpu_library_SHARED OR xpu_library_shared) # build *.so + message(FATAL_ERROR "XPU kernel currently does not support dynamic links") + else() + xpu_add_library(${TARGET_NAME} STATIC ${xpu_library_SRCS} DEPENDS ${xpu_library_DEPS}) + find_fluid_modules(${TARGET_NAME}) + endif() + if (xpu_library_DEPS) + add_dependencies(${TARGET_NAME} ${xpu_library_DEPS}) + target_link_libraries(${TARGET_NAME} ${xpu_library_DEPS}) + endif() + # cpplint code style + foreach(source_file ${xpu_library_SRCS}) + string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file}) + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + list(APPEND xpu_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + endif() + endforeach() + else(xpu_library_SRCS) + if (xpu_library_DEPS) + list(REMOVE_DUPLICATES xpu_library_DEPS) + generate_dummy_static_lib(LIB_NAME ${TARGET_NAME} FILE_PATH ${target_SRCS} GENERATOR "generic.cmake:xpu_library") + target_link_libraries(${TARGET_NAME} ${xpu_library_DEPS}) + add_dependencies(${TARGET_NAME} ${xpu_library_DEPS}) + else() + message(FATAL "Please specify source file or library in xpu_library.") + endif() + endif(xpu_library_SRCS) + endif() +endfunction(xpu_library) + +function(xpu_binary TARGET_NAME) + if (WITH_XPU_KP) + set(options "") + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(xpu_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + add_executable(${TARGET_NAME} ${xpu_binary_SRCS}) + if(xpu_binary_DEPS) + target_link_libraries(${TARGET_NAME} ${xpu_binary_DEPS}) + add_dependencies(${TARGET_NAME} ${xpu_binary_DEPS}) + common_link(${TARGET_NAME}) + endif() + endif() +endfunction(xpu_binary) + +function(xpu_test TARGET_NAME) + # The environment variable `CI_SKIP_CPP_TEST` is used to skip the compilation + # and execution of test in CI. `CI_SKIP_CPP_TEST` is set to ON when no files + # other than *.py are modified. 
+ if (WITH_XPU_KP AND WITH_TESTING AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON") + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(xpu_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + add_executable(${TARGET_NAME} ${xpu_test_SRCS}) + # "-pthread -ldl -lrt" is defined in CMAKE_CXX_LINK_EXECUTABLE + target_link_options(${TARGET_NAME} PRIVATE -pthread -ldl -lrt) + get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) + target_link_libraries(${TARGET_NAME} ${xpu_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog ${os_dependency_modules}) + add_dependencies(${TARGET_NAME} ${xpu_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog) + common_link(${TARGET_NAME}) + add_test(${TARGET_NAME} ${TARGET_NAME}) + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true) + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true) + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true) + endif() +endfunction(xpu_test) + function(go_library TARGET_NAME) set(options STATIC static SHARED shared) set(oneValueArgs "") diff --git a/cmake/neuware.cmake b/cmake/neuware.cmake index 7219f5f7259..811c8d664a0 100644 --- a/cmake/neuware.cmake +++ b/cmake/neuware.cmake @@ -19,4 +19,11 @@ set(CNRT_LIB ${NEUWARE_LIB_DIR}/libcnrt.so) set(CNDRV_LIB ${NEUWARE_LIB_DIR}/libcndrv.so) generate_dummy_static_lib(LIB_NAME "neuware_lib" GENERATOR "neuware.cmake") -TARGET_LINK_LIBRARIES(neuware_lib ${CNNL_LIB} ${CNRT_LIB} ${CNDRV_LIB}) +if(WITH_CNCL) + MESSAGE(STATUS "Compile with CNCL!") + ADD_DEFINITIONS(-DPADDLE_WITH_CNCL) + set(CNCL_LIB ${NEUWARE_LIB_DIR}/libcncl.so) + TARGET_LINK_LIBRARIES(neuware_lib ${CNCL_LIB} ${CNNL_LIB} ${CNRT_LIB} ${CNDRV_LIB}) +else() + TARGET_LINK_LIBRARIES(neuware_lib ${CNNL_LIB} ${CNRT_LIB} ${CNDRV_LIB}) +endif() diff --git a/cmake/operators.cmake b/cmake/operators.cmake index d7742c34737..e58dbf77b4c 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -34,6 +34,7 @@ function(op_library TARGET) set(cu_cc_srcs) set(hip_cc_srcs) set(xpu_cc_srcs) + set(xpu_kp_cc_srcs) set(npu_cc_srcs) set(mlu_cc_srcs) set(cudnn_cu_cc_srcs) @@ -120,6 +121,11 @@ function(op_library TARGET) list(APPEND xpu_cc_srcs ${XPU_FILE}.cc) endif() endif() + if(WITH_XPU_KP) + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.xpu) + list(APPEND xpu_kp_cc_srcs ${TARGET}.xpu) + endif() + endif() if(WITH_ASCEND_CL) string(REPLACE "_op" "_op_npu" NPU_FILE "${TARGET}") if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${NPU_FILE}.cc) @@ -154,6 +160,8 @@ function(op_library TARGET) list(APPEND mkldnn_cc_srcs ${src}) elseif(WITH_XPU AND ${src} MATCHES ".*_op_xpu.cc$") list(APPEND xpu_cc_srcs ${src}) + elseif(WITH_XPU_KP AND ${src} MATCHES ".*\\.xpu$") + list(APPEND xpu_kp_cc_srcs ${src}) elseif(WITH_ASCEND_CL AND ${src} MATCHES ".*_op_npu.cc$") list(APPEND npu_cc_srcs ${src}) elseif(WITH_MLU AND ${src} MATCHES ".*_op_mlu.cc$") @@ -161,11 +169,13 @@ function(op_library TARGET) elseif(${src} MATCHES ".*\\.cc$") list(APPEND cc_srcs ${src}) else() - message(FATAL_ERROR "${TARGET} Source file ${src} should only be .cc or .cu") + message(FATAL_ERROR "${TARGET} Source file ${src} should only be .cc or .cu or .xpu") endif() endforeach() endif() - + + list(LENGTH xpu_cc_srcs xpu_cc_srcs_len) + list(LENGTH xpu_kp_cc_srcs xpu_kp_cc_srcs_len) list(LENGTH cc_srcs cc_srcs_len) if (${cc_srcs_len} EQUAL 0) message(FATAL_ERROR "The op library ${TARGET} should contains at least one 
.cc file") @@ -231,6 +241,8 @@ function(op_library TARGET) list(REMOVE_ITEM hip_srcs "decode_jpeg_op.cu") hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cc_srcs} ${miopen_cu_cc_srcs} ${miopen_cu_srcs} ${mkldnn_cc_srcs} ${hip_srcs} DEPS ${op_library_DEPS} ${op_common_deps}) + elseif (WITH_XPU_KP AND ${xpu_kp_cc_srcs_len} GREATER 0) + xpu_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} ${xpu_kp_cc_srcs} DEPS ${op_library_DEPS} ${op_common_deps}) else() # Unity Build relies on global option `WITH_UNITY_BUILD` and local option `UNITY`. if(WITH_UNITY_BUILD AND op_library_UNITY) @@ -359,6 +371,11 @@ function(op_library TARGET) endif() endif() + # pybind USE_OP_DEVICE_KERNEL for XPU KP + if (WITH_XPU_KP AND ${xpu_kp_cc_srcs_len} GREATER 0) + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, KP);\n") + endif() + # pybind USE_OP_DEVICE_KERNEL for NPU if (WITH_ASCEND_CL AND ${npu_cc_srcs_len} GREATER 0) foreach(npu_src ${npu_cc_srcs}) @@ -438,7 +455,6 @@ function(op_library TARGET) endif() endfunction() - function(register_operators) set(options "") set(oneValueArgs "") diff --git a/cmake/pten.cmake b/cmake/pten.cmake index 8e1d2339862..2a040c73b98 100644 --- a/cmake/pten.cmake +++ b/cmake/pten.cmake @@ -88,11 +88,12 @@ function(kernel_library TARGET) set(cpu_srcs) set(gpu_srcs) set(xpu_srcs) + set(selected_rows_srcs) # parse and save the deps kerenl targets set(all_srcs) set(kernel_deps) - set(oneValueArgs "") + set(oneValueArgs SUB_DIR) set(multiValueArgs SRCS DEPS) cmake_parse_arguments(kernel_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -106,6 +107,9 @@ function(kernel_library TARGET) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/cpu/${TARGET}.cc) list(APPEND cpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/${TARGET}.cc) endif() + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/selected_rows/${TARGET}.cc) + list(APPEND selected_rows_srcs ${CMAKE_CURRENT_SOURCE_DIR}/selected_rows/${TARGET}.cc) + endif() if (WITH_GPU OR WITH_ROCM) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu) list(APPEND gpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu) @@ -131,8 +135,17 @@ function(kernel_library TARGET) foreach(src ${all_srcs}) file(READ ${src} target_content) string(REGEX MATCHALL "#include \"paddle\/pten\/kernels\/[a-z0-9_]+_kernel.h\"" include_kernels ${target_content}) + if ("${kernel_library_SUB_DIR}" STREQUAL "") + string(REGEX MATCHALL "#include \"paddle\/pten\/kernels\/[a-z0-9_]+_kernel.h\"" include_kernels ${target_content}) + else() + string(REGEX MATCHALL "#include \"paddle\/pten\/kernels\/${kernel_library_SUB_DIR}\/[a-z0-9_]+_kernel.h\"" include_kernels ${target_content}) + endif() foreach(include_kernel ${include_kernels}) + if ("${kernel_library_SUB_DIR}" STREQUAL "") string(REGEX REPLACE "#include \"paddle\/pten\/kernels\/" "" kernel_name ${include_kernel}) + else() + string(REGEX REPLACE "#include \"paddle\/pten\/kernels\/${kernel_library_SUB_DIR}\/" "" kernel_name ${include_kernel}) + endif() string(REGEX REPLACE ".h\"" "" kernel_name ${kernel_name}) list(APPEND kernel_deps ${kernel_name}) endforeach() @@ -144,27 +157,30 @@ function(kernel_library TARGET) list(LENGTH cpu_srcs cpu_srcs_len) list(LENGTH gpu_srcs gpu_srcs_len) list(LENGTH xpu_srcs xpu_srcs_len) + list(LENGTH selected_rows_srcs selected_rows_srcs_len) # Build Target according different src organization if((${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR - ${xpu_srcs_len} GREATER 0) AND ${common_srcs_len} GREATER 0) - # If the common_srcs depends on specific device srcs, build 
target using this rule. + ${xpu_srcs_len} GREATER 0) AND (${common_srcs_len} GREATER 0 OR + ${selected_rows_srcs_len} GREATER 0)) + # If the common_srcs/selected_rows_srcs depends on specific device srcs, build target using this rule. if (WITH_GPU) if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0) nv_library(${TARGET}_part SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - nv_library(${TARGET} SRCS ${common_srcs} DEPS ${TARGET}_part) + nv_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part) endif() elseif (WITH_ROCM) if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0) hip_library(${TARGET}_part SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - hip_library(${TARGET} SRCS ${common_srcs} DEPS ${TARGET}_part) + hip_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part) endif() else() if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0) cc_library(${TARGET}_part SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - cc_library(${TARGET} SRCS ${common_srcs} DEPS ${TARGET}_part) + cc_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part) endif() endif() + # If there are only specific device srcs, build target using this rule. elseif (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0) if (WITH_GPU) if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0) @@ -179,25 +195,42 @@ function(kernel_library TARGET) cc_library(${TARGET} SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) endif() endif() - else() - if (${common_srcs_len} EQUAL 0) - message(FATAL_ERROR "Cannot find any implementation for ${TARGET}") + # If the selected_rows_srcs depends on common_srcs, build target using this rule. + elseif (${common_srcs_len} GREATER 0 AND ${selected_rows_srcs_len} GREATER 0) + if (WITH_GPU) + nv_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + nv_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part) + elseif (WITH_ROCM) + hip_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + hip_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part) else() - # If the kernel has a device independent public implementation, - # we will use this implementation and will not adopt the implementation - # under specific devices - if (WITH_GPU) - nv_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - elseif (WITH_ROCM) - hip_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - else() - cc_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - endif() - endif() + cc_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + cc_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part) + endif() + # If there are only common_srcs or selected_rows_srcs, build target using below rules. 
+ elseif (${common_srcs_len} GREATER 0) + if (WITH_GPU) + nv_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + elseif (WITH_ROCM) + hip_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + else() + cc_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + endif() + elseif (${selected_rows_srcs_len} GREATER 0) + if (WITH_GPU) + nv_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + elseif (WITH_ROCM) + hip_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + else() + cc_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + endif() + else() + message(FATAL_ERROR "Cannot find any implementation for ${TARGET}") endif() if (${common_srcs_len} GREATER 0 OR ${cpu_srcs_len} GREATER 0 OR - ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0) + ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR + ${selected_rows_srcs_len} GREATER 0) # append target into PTEN_KERNELS property get_property(pten_kernels GLOBAL PROPERTY PTEN_KERNELS) set(pten_kernels ${pten_kernels} ${TARGET}) @@ -219,11 +252,14 @@ function(kernel_library TARGET) if (${xpu_srcs_len} GREATER 0) kernel_declare(${xpu_srcs}) endif() + if (${selected_rows_srcs_len} GREATER 0) + kernel_declare(${selected_rows_srcs}) + endif() endfunction() function(register_kernels) set(options "") - set(oneValueArgs "") + set(oneValueArgs SUB_DIR) set(multiValueArgs EXCLUDES DEPS) cmake_parse_arguments(register_kernels "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -236,9 +272,9 @@ function(register_kernels) list(FIND register_kernels_EXCLUDES ${target} _index) if (${_index} EQUAL -1) if (${register_kernels_DEPS_len} GREATER 0) - kernel_library(${target} DEPS ${register_kernels_DEPS}) + kernel_library(${target} DEPS ${register_kernels_DEPS} SUB_DIR ${register_kernels_SUB_DIR}) else() - kernel_library(${target}) + kernel_library(${target} SUB_DIR ${register_kernels_SUB_DIR}) endif() endif() endforeach() @@ -246,9 +282,9 @@ endfunction() function(append_op_util_declare TARGET) file(READ ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET} target_content) - string(REGEX MATCH "(PT_REGISTER_API_NAME|PT_REGISTER_ARG_MAPPING_FN)\\([ \t\r\n]*[a-z0-9_]*" util_registrar "${target_content}") + string(REGEX MATCH "(PT_REGISTER_BASE_KERNEL_NAME|PT_REGISTER_ARG_MAPPING_FN)\\([ \t\r\n]*[a-z0-9_]*" util_registrar "${target_content}") string(REPLACE "PT_REGISTER_ARG_MAPPING_FN" "PT_DECLARE_ARG_MAPPING_FN" util_declare "${util_registrar}") - string(REPLACE "PT_REGISTER_API_NAME" "PT_REGISTER_API_NAME" util_declare "${util_declare}") + string(REPLACE "PT_REGISTER_BASE_KERNEL_NAME" "PT_DECLARE_BASE_KERNEL_NAME" util_declare "${util_declare}") string(APPEND util_declare ");") file(APPEND ${op_utils_header} "${util_declare}") endfunction() diff --git a/cmake/xpu_kp.cmake b/cmake/xpu_kp.cmake new file mode 100644 index 00000000000..f8ab9693db0 --- /dev/null +++ b/cmake/xpu_kp.cmake @@ -0,0 +1,239 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +if(NOT WITH_XPU_KP) + return() +endif() + +if(NOT XPU_TOOLCHAIN) + set(XPU_TOOLCHAIN /workspace/paddle/xpu-demo/XTDK) + get_filename_component(XPU_TOOLCHAIN ${XPU_TOOLCHAIN} REALPATH) +endif() +if(NOT IS_DIRECTORY ${XPU_TOOLCHAIN}) + message(FATAL_ERROR "Directory ${XPU_TOOLCHAIN} not found!") +endif() +message(STATUS "Build with XPU_TOOLCHAIN=" ${XPU_TOOLCHAIN}) +set(XPU_CLANG ${XPU_TOOLCHAIN}/bin/clang++) +message(STATUS "Build with XPU_CLANG=" ${XPU_CLANG}) + +# The host sysroot of XPU compiler is gcc-8.2 +if(NOT HOST_SYSROOT) + set(HOST_SYSROOT /opt/compiler/gcc-8.2) +endif() + +if(NOT IS_DIRECTORY ${HOST_SYSROOT}) + message(FATAL_ERROR "Directory ${HOST_SYSROOT} not found!") +endif() + +if(NOT API_ARCH) + set(API_ARCH x86_64-baidu-linux-gnu) +endif() + +if(API_ARCH MATCHES "x86_64") +if(EXISTS ${HOST_SYSROOT}/bin/g++) + set(HOST_CXX ${HOST_SYSROOT}/bin/g++) + set(HOST_AR ${HOST_SYSROOT}/bin/ar) +else() + set(HOST_CXX /usr/bin/g++) + set(HOST_AR /usr/bin/ar) +endif() +else() + set(HOST_CXX ${CMAKE_CXX_COMPILER}) + set(HOST_AR ${CMAKE_AR}) +endif() + +set(TOOLCHAIN_ARGS ) + +if(OPT_LEVEL) + set(OPT_LEVEL ${OPT_LEVEL}) +else() + set(OPT_LEVEL "-O3") +endif() + +message(STATUS "Build with API_ARCH=" ${API_ARCH}) +message(STATUS "Build with TOOLCHAIN_ARGS=" ${TOOLCHAIN_ARGS}) +message(STATUS "Build with HOST_SYSROOT=" ${HOST_SYSROOT}) +message(STATUS "Build with HOST_CXX=" ${HOST_CXX}) +message(STATUS "Build with HOST_AR=" ${HOST_AR}) + +macro(compile_kernel COMPILE_ARGS) + set(options "") + set(oneValueArgs "") + set(multiValueArgs KERNEL DIRPATH XNAME DEVICE HOST XPU DEPENDS) + cmake_parse_arguments(xpu_add_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + set(kernel_path ${xpu_add_library_DIRPATH}) + set(kernel_name ${xpu_add_library_XNAME}) + set(device_o_extra_flags ${xpu_add_library_DEVICE}) + set(host_o_extra_flags ${xpu_add_library_HOST}) + set(xpu_1_or_2 ${xpu_add_library_XPU}) + set(cc_depends ${xpu_add_library_DEPENDS}) + + set(kernel_target ${kernel_name}_kernel) + add_custom_target(${kernel_target} + WORKING_DIRECTORY + ${CMAKE_CURRENT_BINARY_DIR} + DEPENDS + kernel_build/${kernel_name}.host.o + kernel_build/${kernel_name}.bin.o + COMMENT + ${kernel_target} + VERBATIM + ) + + if(cc_depends) + add_dependencies(${kernel_target} ${xpu_add_library_DEPENDS}) + endif() + + set(arg_device_o_extra_flags ${device_o_extra_flags}) + separate_arguments(arg_device_o_extra_flags) + set(arg_host_o_extra_flags ${host_o_extra_flags}) + separate_arguments(arg_host_o_extra_flags) + + set(XTDK_DIR ${XPU_TOOLCHAIN}) + set(CXX_DIR ${HOST_SYSROOT}) + set(XPU_CXX_FLAGS -Wno-error=pessimizing-move -Wno-error=constant-conversion -Wno-error=c++11-narrowing -Wno-error=shift-count-overflow -Wno-error=unused-local-typedef -Wno-error=deprecated-declarations -Wno-deprecated-declarations -std=c++14 -m64 -fPIC -fno-omit-frame-pointer -Wall -Wno-inconsistent-missing-override -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function -Wno-error=unused-local-typedefs -Wno-error=ignored-attributes -Wno-error=int-in-bool-context 
-Wno-error=parentheses -Wno-error=address -Wno-ignored-qualifiers -Wno-ignored-attributes -Wno-parentheses -DNDEBUG ) + + #include path + get_property(dirs DIRECTORY ${CMAKE_SOURCE_DIR} PROPERTY INCLUDE_DIRECTORIES) + set(XPU_CXX_INCLUDES "") + foreach(dir IN LISTS dirs) + list(APPEND XPU_CXX_INCLUDES "-I${dir}") + endforeach() + string(REPLACE ";" " " XPU_CXX_INCLUDES "${XPU_CXX_INCLUDES}" ) + separate_arguments(XPU_CXX_INCLUDES UNIX_COMMAND "${XPU_CXX_INCLUDES}") + + #related flags + get_directory_property( DirDefs DIRECTORY ${CMAKE_SOURCE_DIR} COMPILE_DEFINITIONS ) + set(XPU_CXX_DEFINES "") + foreach(def IN LISTS DirDefs) + list(APPEND XPU_CXX_DEFINES "-D${def}") + endforeach() + string(REPLACE ";" " " XPU_CXX_DEFINES "${XPU_CXX_DEFINES}" ) + separate_arguments(XPU_CXX_DEFINES UNIX_COMMAND "${XPU_CXX_DEFINES}") + + add_custom_command( + OUTPUT + kernel_build/${kernel_name}.bin.o + COMMAND + ${CMAKE_COMMAND} -E make_directory kernel_build + COMMAND + ${XPU_CLANG} --sysroot=${CXX_DIR} -std=c++11 -D_GLIBCXX_USE_CXX11_ABI=1 ${OPT_LEVEL} -fno-builtin -mcpu=xpu2 -fPIC ${XPU_CXX_DEFINES} ${XPU_CXX_FLAGS} ${XPU_CXX_INCLUDES} + -I. -o kernel_build/${kernel_name}.bin.o.sec ${kernel_path}/${kernel_name}.xpu + --xpu-device-only -c -v + COMMAND + ${XTDK_DIR}/bin/xpu2-elfconv kernel_build/${kernel_name}.bin.o.sec kernel_build/${kernel_name}.bin.o ${XPU_CLANG} --sysroot=${CXX_DIR} + WORKING_DIRECTORY + ${CMAKE_CURRENT_BINARY_DIR} + DEPENDS + ${xpu_add_library_DEPENDS} + COMMENT + kernel_build/${kernel_name}.bin.o + VERBATIM + ) + list(APPEND xpu_kernel_depends kernel_build/${kernel_name}.bin.o) + + add_custom_command( + OUTPUT + kernel_build/${kernel_name}.host.o + COMMAND + ${CMAKE_COMMAND} -E make_directory kernel_build + COMMAND + ${XPU_CLANG} --sysroot=${CXX_DIR} -std=c++11 -D_GLIBCXX_USE_CXX11_ABI=1 ${OPT_LEVEL} -fno-builtin -mcpu=xpu2 -fPIC ${XPU_CXX_DEFINES} ${XPU_CXX_FLAGS} ${XPU_CXX_INCLUDES} + -I. 
-o kernel_build/${kernel_name}.host.o ${kernel_path}/${kernel_name}.xpu + --xpu-host-only -c -v + WORKING_DIRECTORY + ${CMAKE_CURRENT_BINARY_DIR} + DEPENDS + ${xpu_add_library_DEPENDS} + COMMENT + kernel_build/${kernel_name}.host.o + VERBATIM + ) + list(APPEND xpu_kernel_depends kernel_build/${kernel_name}.host.o) +endmacro() + +############################################################################### +# XPU_ADD_LIBRARY +############################################################################### +macro(xpu_add_library TARGET_NAME) + # Separate the sources from the options + set(options "") + set(oneValueArgs "") + set(multiValueArgs STATIC DEPENDS) + cmake_parse_arguments(xpu_add_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + set(xpu_srcs ${xpu_add_library_STATIC}) + set(xpu_target ${TARGET_NAME}) + set(cc_srcs_depends ${xpu_add_library_DEPENDS}) + + file(GLOB_RECURSE xpu_srcs_lists ${xpu_srcs}) + list(LENGTH xpu_srcs_lists xpu_srcs_lists_num) + + set(XPU1_DEVICE_O_EXTRA_FLAGS " ") + set(XPU1_HOST_O_EXTRA_FLAGS " ") + + # Distinguish .xpu file from other files + foreach(cur_xpu_src IN LISTS xpu_srcs_lists) + get_filename_component(language_type_name ${cur_xpu_src} EXT) + if(${language_type_name} STREQUAL ".xpu") + list(APPEND xpu_kernel_lists ${cur_xpu_src}) + else() + list(APPEND cc_kernel_lists ${cur_xpu_src}) + endif() + endforeach() + + # Ensure that there is only one xpu kernel + list(LENGTH xpu_kernel_lists xpu_kernel_lists_num) + list(LENGTH cc_srcs_depends cc_srcs_depends_num) + + if(${xpu_kernel_lists_num}) + foreach(xpu_kernel IN LISTS xpu_kernel_lists) + get_filename_component(kernel_name ${xpu_kernel} NAME_WE) + get_filename_component(kernel_dir ${xpu_kernel} DIRECTORY) + set(kernel_rules ${kernel_dir}/${kernel_name}.rules) + set(kernel_name ${kernel_name}) + compile_kernel( KERNEL ${xpu_kernel} DIRPATH ${kernel_dir} XNAME ${kernel_name} DEVICE ${XPU1_DEVICE_O_EXTRA_FLAGS} HOST ${XPU1_HOST_O_EXTRA_FLAGS} XPU "xpu2" DEPENDS ${cc_srcs_depends}) + endforeach() + + add_custom_target(${xpu_target}_src ALL + WORKING_DIRECTORY + ${CMAKE_CURRENT_BINARY_DIR} + DEPENDS + ${xpu_kernel_depends} + ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a + COMMENT + ${xpu_target}_src + VERBATIM + ) + + add_custom_command( + OUTPUT + ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a + COMMAND + ${HOST_AR} rcs ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a ${xpu_kernel_depends} + WORKING_DIRECTORY + ${CMAKE_CURRENT_BINARY_DIR} + DEPENDS + ${xpu_kernel_depends} + COMMENT + ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a + VERBATIM + ) + + add_library(${xpu_target} STATIC ${cc_kernel_lists}) + add_dependencies(${xpu_target} ${xpu_target}_src) + target_link_libraries(${TARGET_NAME} ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a) + else() + add_library(${xpu_target} STATIC ${cc_kernel_lists}) + endif() +endmacro() diff --git a/paddle/fluid/distributed/common/utils.h b/paddle/fluid/distributed/common/utils.h index fb2189b8f5a..85b89d75b98 100644 --- a/paddle/fluid/distributed/common/utils.h +++ b/paddle/fluid/distributed/common/utils.h @@ -33,7 +33,7 @@ namespace distributed { template inline paddle::operators::math::BlasT GetBlas() { - auto cpu_ctx = paddle::platform::CPUDeviceContext(); + paddle::platform::CPUDeviceContext cpu_ctx; return paddle::operators::math::GetBlas(cpu_ctx); } diff --git a/paddle/fluid/distributed/ps/service/brpc_ps_client.cc b/paddle/fluid/distributed/ps/service/brpc_ps_client.cc index e855fcbd025..301136794d4 100644 --- 
a/paddle/fluid/distributed/ps/service/brpc_ps_client.cc +++ b/paddle/fluid/distributed/ps/service/brpc_ps_client.cc @@ -213,6 +213,7 @@ int32_t BrpcPsClient::initialize() { auto &profiler = CostProfiler::instance(); profiler.register_profiler("pserver_client_pull_dense"); profiler.register_profiler("pserver_client_pull_sparse"); + profiler.register_profiler("pserver_client_pull_sparse_param"); profiler.register_profiler("pserver_client_pull_sparse_local"); profiler.register_profiler("pserver_client_push_sparse"); profiler.register_profiler("pserver_client_push_sparse_parse"); @@ -543,6 +544,7 @@ std::future BrpcPsClient::pull_geo_param(size_t table_id, return fut; } +// for GEO std::future BrpcPsClient::push_sparse_param( size_t table_id, const uint64_t *keys, const float **update_values, size_t num, void *done) { @@ -558,18 +560,8 @@ std::future BrpcPsClient::push_sparse_param( ids.resize(request_call_num); value_ptrs.resize(request_call_num); - const auto &server_param = _config.server_param().downpour_server_param(); - uint64_t shard_num = FLAGS_pserver_sparse_table_shard_num; - for (int i = 0; i < server_param.downpour_table_param_size(); ++i) { - const auto &table_param = server_param.downpour_table_param(i); - if (table_param.table_id() == table_id) { - shard_num = table_param.shard_num(); - break; - } - } - for (size_t i = 0; i < num; ++i) { - size_t pserver_idx = get_sparse_shard(shard_num, request_call_num, keys[i]); + size_t pserver_idx = keys[i] % request_call_num; ids[pserver_idx].push_back(keys[i]); value_ptrs[pserver_idx].push_back(update_values[i]); } @@ -1003,6 +995,120 @@ std::future BrpcPsClient::pull_sparse(float **select_values, return fut; } +// for GEO +std::future BrpcPsClient::pull_sparse_param(float **select_values, + size_t table_id, + const uint64_t *keys, + size_t num, + bool is_training) { + auto timer = std::make_shared("pserver_client_pull_sparse_param"); + size_t request_call_num = _server_channels.size(); + + auto shard_sorted_kvs = std::make_shared< + std::vector>>>(); + shard_sorted_kvs->resize(request_call_num); + + for (size_t i = 0; i < num; ++i) { + size_t shard_id = keys[i] % request_call_num; + shard_sorted_kvs->at(shard_id).push_back({keys[i], select_values[i]}); + } + + auto *accessor = table_accessor(table_id); + size_t value_size = accessor->select_size(); + + DownpourBrpcClosure *closure = new DownpourBrpcClosure( + request_call_num, [shard_sorted_kvs, value_size](void *done) { + int ret = 0; + auto *closure = reinterpret_cast(done); + for (size_t i = 0; i < shard_sorted_kvs->size(); ++i) { + if (closure->check_response(i, PS_PULL_SPARSE_TABLE) != 0) { + ret = -1; + break; + } + + auto &request_kvs = shard_sorted_kvs->at(i); + auto &res_io_buffer = closure->cntl(i)->response_attachment(); + butil::IOBufBytesIterator io_buffer_itr(res_io_buffer); + uint64_t last_key = UINT64_MAX; + float *last_value_data = NULL; + + // can remove sort&unique + for (size_t kv_idx = 0; kv_idx < request_kvs.size(); ++kv_idx) { + auto *kv_pair = &(request_kvs[kv_idx]); + if (kv_pair->first == last_key) { + memcpy(reinterpret_cast(kv_pair->second), + reinterpret_cast(last_value_data), value_size); + } else { + last_key = kv_pair->first; + last_value_data = kv_pair->second; + if (value_size != + io_buffer_itr.copy_and_forward( + reinterpret_cast(last_value_data), value_size)) { + LOG(WARNING) << "res data is lack or not in format"; + ret = -1; + break; + } + } + } + } + closure->set_promise_value(ret); + }); + closure->add_timer(timer); + auto promise = 
std::make_shared>(); + closure->add_promise(promise); + std::future fut = promise->get_future(); + + for (size_t i = 0; i < request_call_num; ++i) { + auto &sorted_kvs = shard_sorted_kvs->at(i); + std::sort(sorted_kvs.begin(), sorted_kvs.end(), + [](const std::pair &k1, + const std::pair &k2) { + return k1.first < k2.first; + }); + + uint64_t last_key = UINT64_MAX; + uint32_t kv_request_count = 0; + size_t sorted_kv_size = sorted_kvs.size(); + auto &request_buffer = closure->cntl(i)->request_attachment(); + + request_buffer.append(reinterpret_cast(&is_training), sizeof(bool)); + std::vector keys_counter; + keys_counter.reserve(sorted_kv_size); + + for (size_t kv_idx = 0; kv_idx < sorted_kv_size; ++kv_idx) { + ++kv_request_count; + uint32_t keys = 1; + last_key = sorted_kvs[kv_idx].first; + request_buffer.append(reinterpret_cast(&last_key), + sizeof(uint64_t)); + while (kv_idx < sorted_kv_size - 1 && + last_key == sorted_kvs[kv_idx + 1].first) { + ++kv_idx; + ++keys; + } + keys_counter.push_back(keys); + } + + request_buffer.append(reinterpret_cast(keys_counter.data()), + sizeof(uint32_t) * keys_counter.size()); + + if (kv_request_count == 0) { + closure->Run(); + } else { + closure->request(i)->set_cmd_id(PS_PULL_SPARSE_TABLE); + closure->request(i)->set_table_id(table_id); + closure->request(i)->set_client_id(_client_id); + closure->request(i)->add_params((char *)&kv_request_count, // NOLINT + sizeof(uint32_t)); + PsService_Stub rpc_stub(get_cmd_channel(i)); + closure->cntl(i)->set_log_id(butil::gettimeofday_ms()); + rpc_stub.service(closure->cntl(i), closure->request(i), + closure->response(i), closure); + } + } + return fut; +} + std::future BrpcPsClient::send_client2client_msg( int msg_type, int to_client_id, const std::string &msg) { auto promise = std::make_shared>(); @@ -1067,12 +1173,14 @@ int32_t BrpcPsClient::recv_and_save_table(const uint64_t table_id, std::string var_name = ""; int64_t var_num = 0; int64_t var_shape = 0; + std::string table_class; const auto &worker_param = _config.worker_param().downpour_worker_param(); for (size_t i = 0; i < worker_param.downpour_table_param_size(); ++i) { if (worker_param.downpour_table_param(i).table_id() == table_id) { var_name = worker_param.downpour_table_param(i).common().table_name(); var_num = worker_param.downpour_table_param(i).common().table_num(); var_shape = worker_param.downpour_table_param(i).common().table_dim(); + table_class = worker_param.downpour_table_param(i).table_class(); break; } } @@ -1094,9 +1202,19 @@ int32_t BrpcPsClient::recv_and_save_table(const uint64_t table_id, save_vec.push_back(save_huge_vec.data() + i * var_shape); } - auto status = pull_sparse(reinterpret_cast(save_vec.data()), - table_id, save_key.data(), save_key.size(), true); - status.wait(); + VLOG(2) << "recv_and_save_table: table_class: " << table_class; + // TODO(zhaocaibei123): new GeoBrpcPSClient, move this to its + // recv_and_save_table + if (table_class == "MemorySparseGeoTable") { + auto status = + pull_sparse_param(reinterpret_cast(save_vec.data()), table_id, + save_key.data(), save_key.size(), true); + status.wait(); + } else { + auto status = pull_sparse(reinterpret_cast(save_vec.data()), + table_id, save_key.data(), save_key.size(), true); + status.wait(); + } // create lod tensor std::shared_ptr scope; diff --git a/paddle/fluid/distributed/ps/service/brpc_ps_client.h b/paddle/fluid/distributed/ps/service/brpc_ps_client.h index 70f406ee248..59ed59933db 100644 --- a/paddle/fluid/distributed/ps/service/brpc_ps_client.h +++ 
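// Sketch only, not part of the patch: the GEO pull path above routes each key
// to a pserver channel with "key % request_call_num", sorts the per-shard keys,
// and collapses duplicates into one wire entry plus a per-key counter that the
// response reader uses to fan the value back out. The helper below reproduces
// that grouping in isolation; all names here are invented for illustration.
#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

using ShardRequests = std::vector<std::vector<std::pair<uint64_t, uint32_t>>>;

ShardRequests GroupKeysForGeoPull(const std::vector<uint64_t>& keys,
                                  size_t request_call_num) {
  std::vector<std::vector<uint64_t>> buckets(request_call_num);
  for (uint64_t key : keys) {
    buckets[key % request_call_num].push_back(key);  // same routing rule as above
  }
  ShardRequests requests(request_call_num);
  for (size_t shard = 0; shard < request_call_num; ++shard) {
    auto& shard_keys = buckets[shard];
    std::sort(shard_keys.begin(), shard_keys.end());
    for (size_t i = 0; i < shard_keys.size();) {
      size_t j = i;
      while (j < shard_keys.size() && shard_keys[j] == shard_keys[i]) ++j;
      // one request entry per unique key; the counter records how many
      // result slots share the returned value
      requests[shard].push_back({shard_keys[i], static_cast<uint32_t>(j - i)});
      i = j;
    }
  }
  return requests;
}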
b/paddle/fluid/distributed/ps/service/brpc_ps_client.h @@ -194,6 +194,10 @@ class BrpcPsClient : public PSClient { size_t table_id, const uint64_t *keys, size_t num, bool is_training); + virtual std::future pull_sparse_param(float **select_values, + size_t table_id, + const uint64_t *keys, + size_t num, bool is_training); virtual std::future print_table_stat(uint32_t table_id); diff --git a/paddle/fluid/distributed/ps/service/communicator/communicator.cc b/paddle/fluid/distributed/ps/service/communicator/communicator.cc index a73f87c1d88..99973ee8bdd 100644 --- a/paddle/fluid/distributed/ps/service/communicator/communicator.cc +++ b/paddle/fluid/distributed/ps/service/communicator/communicator.cc @@ -354,7 +354,7 @@ void Communicator::RpcRecvSparse(const std::string &varname, int table_id, bool training = true; - auto status = _worker_ptr->pull_sparse( + auto status = _worker_ptr->pull_sparse_param( (float **)push_g_vec.data(), table_id, // NOLINT sparse_push_keys.data(), sparse_push_keys.size(), training); status.wait(); @@ -1029,7 +1029,7 @@ void GeoCommunicator::Send(const std::vector &var_names, auto &sparse_ids_set = iter.second; auto sparse_ids_vec = std::make_shared>(); sparse_ids_vec->assign(sparse_ids_set.begin(), sparse_ids_set.end()); - sparse_id_queues_.at(key)->Push(sparse_ids_vec); + sparse_id_queues_.at(key)->Put(sparse_ids_vec); VLOG(3) << "push " << sparse_ids_vec->size() << " ids to " << key << "'s queue"; } @@ -1051,7 +1051,10 @@ void GeoCommunicator::InitImpl(const RpcCtxMap &send_varname_to_ctx, for (auto &iter : send_varname_to_ctx_) { auto &ctx = iter.second; - if (!ctx.is_sparse) continue; + if (!ctx.is_sparse) { + parallel_task_nums_ += 1; + continue; + } auto &varnames = ctx.origin_varnames; PADDLE_ENFORCE_EQ( varnames.size(), 1, @@ -1060,12 +1063,11 @@ void GeoCommunicator::InitImpl(const RpcCtxMap &send_varname_to_ctx, for (auto &splited_var : ctx.splited_varnames) { parallel_task_nums_ += 1; sparse_id_queues_.insert( - std::pair>>>>( + std::pair>>>( splited_var, - std::make_shared< - BlockingQueue>>>( - send_queue_size_))); + paddle::framework::MakeChannel< + std::shared_ptr>>(send_queue_size_))); } } @@ -1153,7 +1155,7 @@ void GeoCommunicator::SendDense(const CommContext &send_ctx) { auto &t_latest = var_latest->Get(); auto t_timestamp = var_timestamp->GetMutable(); - auto cpu_ctx = paddle::platform::CPUDeviceContext(); + paddle::platform::CPUDeviceContext cpu_ctx; auto *var_delta = delta_scope_->Var(varname); auto *t_delta = var_delta->GetMutable(); t_delta->mutable_data(t_latest.dims(), cpu_ctx.GetPlace()); @@ -1183,7 +1185,7 @@ void GeoCommunicator::RecvDense(const CommContext &send_ctx) { RpcRecvDense(varnames, table_id, pserver_scope_.get()); // 2.1 pserver - old => delta; 2.2 latest + old => latest 2.3 old => pserver - auto cpu_ctx = paddle::platform::CPUDeviceContext(); + paddle::platform::CPUDeviceContext cpu_ctx; for (auto &varname : varnames) { auto *var_latest = recv_scope_->FindVar(varname); auto t_latest = var_latest->GetMutable(); @@ -1242,8 +1244,8 @@ std::vector GeoCommunicator::MergeSparseIds( VLOG(3) << "Merge Number of " << send_varname << " = " << merge_num; if (sparse_id_queues_.at(send_varname)->Size() > 0) { wait_times = 0; - std::shared_ptr> pop_ids = - sparse_id_queues_.at(send_varname)->Pop(); + std::shared_ptr> pop_ids = nullptr; + sparse_id_queues_.at(send_varname)->Get(pop_ids); for (size_t j = 0; j < pop_ids->size(); j++) { sparse_ids.insert(pop_ids->at(j)); } @@ -1268,6 +1270,9 @@ void GeoCommunicator::SendSparse(const 
std::string &varname, std::vector &sparse_ids, int table_id, int ep_idx) { platform::RecordEvent record_event("GeoCommunicator->SendSparse"); + if (sparse_ids.size() == 0) { + return; + } std::string param_name = SplitedGradToParam(varname); VLOG(1) << "In GeoCommunicator::SendSparse(" << varname << " " << param_name << ", ids.size = " << sparse_ids.size() << ", table_id: " << table_id @@ -1287,7 +1292,7 @@ void GeoCommunicator::SendSparse(const std::string &varname, auto *t_old = var_old->GetMutable(); auto dims1 = t_latest.dims()[1]; - auto cpu_ctx = paddle::platform::CPUDeviceContext(); + paddle::platform::CPUDeviceContext cpu_ctx; auto *var_delta = delta_scope_->Var(varname); auto *t_delta = var_delta->GetMutable(); @@ -1313,6 +1318,10 @@ void GeoCommunicator::SendSparse(const std::string &varname, t_value + j * dims1, t_old->data() + sparse_ids[j] * dims1); push_g_vec.push_back(t_value + j * dims1); + + VLOG(5) << "DEBUG GeoCommunicator::SendSparse send sparse key " + << sparse_ids[j] << " value[0] " << push_g_vec[j][0] + << " value[-1] " << push_g_vec[j][dims1 - 1]; } ++_async_call_num; @@ -1361,12 +1370,15 @@ void GeoCommunicator::RecvSparse(const std::string &varname, int table_id, std::vector v_delta; v_delta.resize(numel); - auto cpu_ctx = paddle::platform::CPUDeviceContext(); + paddle::platform::CPUDeviceContext cpu_ctx; auto blas = paddle::operators::math::GetBlas( cpu_ctx); for (auto j = 0; j < static_cast(keys.size()); ++j) { + VLOG(5) << "DEBUG GeoCommunicator::RecvSparse recv sparse key" << keys[j] + << "value[0] " << values[j * dims1] << " value[-1] " + << values[j * dims1 + dims1 - 1]; float *latest_data = t_latest->data() + keys[j] * dims1; float *old_data = t_old->data() + keys[j] * dims1; // pserver - old => delta diff --git a/paddle/fluid/distributed/ps/service/communicator/communicator.h b/paddle/fluid/distributed/ps/service/communicator/communicator.h index 570e668d9d5..da4e2f1a128 100644 --- a/paddle/fluid/distributed/ps/service/communicator/communicator.h +++ b/paddle/fluid/distributed/ps/service/communicator/communicator.h @@ -30,6 +30,7 @@ limitations under the License. */ #include "gflags/gflags.h" #include "paddle/fluid/distributed/ps/service/communicator/communicator_common.h" +#include "paddle/fluid/framework/channel.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/framework/variable_helper.h" @@ -178,7 +179,7 @@ inline void MergeVars(const std::string &var_name, } // set output tensor to 0. 
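// Sketch only, not part of the patch: the sparse-id queues above were switched
// from a BlockingQueue of shared_ptr<vector<int64_t>> to a channel created with
// paddle::framework::MakeChannel(capacity) and used through Put(item) and
// Get(item). The tiny bounded queue below illustrates the Put/Get contract the
// communicator relies on; it is an invented stand-in, not the real Channel.
#include <condition_variable>
#include <cstddef>
#include <deque>
#include <mutex>

template <typename T>
class BoundedQueue {
 public:
  explicit BoundedQueue(size_t capacity) : capacity_(capacity) {}

  void Put(T item) {                       // blocks while the queue is full
    std::unique_lock<std::mutex> lock(mu_);
    not_full_.wait(lock, [this] { return items_.size() < capacity_; });
    items_.push_back(std::move(item));
    not_empty_.notify_one();
  }

  void Get(T* item) {                      // blocks while the queue is empty
    std::unique_lock<std::mutex> lock(mu_);
    not_empty_.wait(lock, [this] { return !items_.empty(); });
    *item = std::move(items_.front());
    items_.pop_front();
    not_full_.notify_one();
  }

  size_t Size() {
    std::lock_guard<std::mutex> lock(mu_);
    return items_.size();
  }

 private:
  const size_t capacity_;
  std::mutex mu_;
  std::deque<T> items_;
  std::condition_variable not_full_, not_empty_;
};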
- auto cpu_ctx = paddle::platform::CPUDeviceContext(); + paddle::platform::CPUDeviceContext cpu_ctx; paddle::operators::math::SetConstant constant_functor; constant_functor(cpu_ctx, out_t, static_cast(0)); @@ -203,7 +204,7 @@ inline void MergeVars(const std::string &var_name, for (auto &var : vars) { inputs.push_back(&var->Get()); } - auto dev_ctx = paddle::platform::CPUDeviceContext(); + paddle::platform::CPUDeviceContext dev_ctx; if (merge_add) { paddle::operators::math::scatter::MergeAdd< paddle::platform::CPUDeviceContext, T> @@ -626,9 +627,8 @@ class GeoCommunicator : public AsyncCommunicator { // parameter on pserver std::shared_ptr pserver_scope_; - std::unordered_map< - std::string, - std::shared_ptr>>>> + std::unordered_map>>> sparse_id_queues_; }; diff --git a/paddle/fluid/distributed/ps/service/ps_client.h b/paddle/fluid/distributed/ps/service/ps_client.h index 7db8b0c1244..21719fbdbf1 100644 --- a/paddle/fluid/distributed/ps/service/ps_client.h +++ b/paddle/fluid/distributed/ps/service/ps_client.h @@ -128,6 +128,17 @@ class PSClient { const uint64_t *keys, size_t num, bool is_training) = 0; + virtual std::future pull_sparse_param(float **select_values, + size_t table_id, + const uint64_t *keys, + size_t num, bool is_training) { + VLOG(0) << "Did not implement"; + std::promise promise; + std::future fut = promise.get_future(); + promise.set_value(-1); + return fut; + } + virtual ::std::future pull_sparse_ptr(char **select_values, size_t table_id, const uint64_t *keys, diff --git a/paddle/fluid/distributed/ps/table/CMakeLists.txt b/paddle/fluid/distributed/ps/table/CMakeLists.txt index b0a553f2100..9aa9ecc2afd 100644 --- a/paddle/fluid/distributed/ps/table/CMakeLists.txt +++ b/paddle/fluid/distributed/ps/table/CMakeLists.txt @@ -47,6 +47,9 @@ cc_library(sparse_sgd_rule SRCS sparse_sgd_rule.cc DEPS ${TABLE_DEPS} ps_framewo cc_library(ctr_accessor SRCS ctr_accessor.cc DEPS ${TABLE_DEPS} ps_framework_proto sparse_sgd_rule) cc_library(memory_sparse_table SRCS memory_sparse_table.cc DEPS ps_framework_proto ${TABLE_DEPS} fs afs_wrapper ctr_accessor common_table) -cc_library(table SRCS table.cc DEPS memory_sparse_table common_table tensor_accessor tensor_table ps_framework_proto string_helper device_context gflags glog boost) +set_source_files_properties(memory_sparse_geo_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_library(memory_sparse_geo_table SRCS memory_sparse_geo_table.cc DEPS ps_framework_proto ${TABLE_DEPS} common_table) + +cc_library(table SRCS table.cc DEPS memory_sparse_table memory_sparse_geo_table common_table tensor_accessor tensor_table ps_framework_proto string_helper device_context gflags glog boost) target_link_libraries(table -fopenmp) diff --git a/paddle/fluid/distributed/ps/table/depends/geo_recorder.h b/paddle/fluid/distributed/ps/table/depends/geo_recorder.h index ad094f0dfbc..adab0ee344b 100644 --- a/paddle/fluid/distributed/ps/table/depends/geo_recorder.h +++ b/paddle/fluid/distributed/ps/table/depends/geo_recorder.h @@ -15,13 +15,9 @@ #pragma once #include -#include #include // NOLINT #include -#include -#include #include -#include #include namespace paddle { diff --git a/paddle/fluid/distributed/ps/table/memory_sparse_geo_table.cc b/paddle/fluid/distributed/ps/table/memory_sparse_geo_table.cc new file mode 100644 index 00000000000..f16f4fc7f34 --- /dev/null +++ b/paddle/fluid/distributed/ps/table/memory_sparse_geo_table.cc @@ -0,0 +1,220 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
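// Sketch only, not part of the patch: the PSClient::pull_sparse_param fallback
// above (promise/future element type assumed here to be int32_t) logs
// "Did not implement" and returns a future that is already satisfied with -1,
// so a caller that waits on an unimplemented client never blocks. A standalone
// version of that pattern, with an invented function name:
#include <cstdint>
#include <future>

std::future<int32_t> AlreadyFailedFuture() {
  std::promise<int32_t> promise;
  std::future<int32_t> fut = promise.get_future();
  promise.set_value(-1);  // resolve before returning; -1 signals "not supported"
  return fut;             // the shared state keeps the value alive after the promise dies
}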
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/ps/table/memory_sparse_geo_table.h" + +namespace paddle { +namespace distributed { + +int32_t MemorySparseGeoTable::push_sparse_param(const uint64_t* keys, + const float* values, + size_t num) { + VLOG(5) << "DEBUG MemorySparseGeoTable::push_sparse_param begin " + "push_sparse_param " + << num; + auto shard_num = _task_pool_size; + std::vector> offset_bucket; + offset_bucket.resize(shard_num); + + for (int x = 0; x < num; ++x) { + auto y = keys[x] % shard_num; + offset_bucket[y].push_back(x); + if (x < 10) { + VLOG(5) << "DEBUG MemorySparseGeoTable::push_sparse_param key: " + << keys[x] << " shard: " << y; + } + } + + std::vector> tasks(shard_num); + + for (int shard_id = 0; shard_id < shard_num; ++shard_id) { + tasks[shard_id] = _shards_task_pool[shard_id]->enqueue( + [this, shard_id, &keys, &offset_bucket, &values]() -> int { + auto& local_shard = _local_shards[shard_id]; + auto& offsets = offset_bucket[shard_id]; + + for (int i = 0; i < offsets.size(); ++i) { + auto offset = offsets[i]; + auto id = keys[offset]; + auto& feature_value = local_shard[id]; + feature_value.resize(_dim); + std::copy_n(values + _dim * offset, _dim, feature_value.data()); + if (i < 10) { + VLOG(5) << "MemorySparseGeoTable::push_sparse_param " + "push_sparse_param key " + << id << " value[0]: " << (values + _dim * offset)[0] + << " data: " << feature_value.data()[0] + << " value[-1]: " << (values + _dim * offset)[_dim - 1] + << " data: " << feature_value.data()[_dim - 1]; + } + } + return 0; + }); + } + + for (size_t shard_id = 0; shard_id < tasks.size(); ++shard_id) { + tasks[shard_id].wait(); + } + return 0; +} + +int32_t MemorySparseGeoTable::pull_geo_param(const uint32_t trainer_id, + std::vector* values, + std::vector* ids) { + _geo_recorder->GetAndClear(trainer_id, ids); + VLOG(5) + << "DEBUG MemorySparseGeoTable::pull_geo_param pull_geo_param trainer_id " + << trainer_id << " id_num: " << ids->size(); + + std::vector frequencies; + frequencies.resize(ids->size(), 1); + + auto pull_value = PullSparseValue(ids->size(), _dim); + pull_value.is_training_ = true; + pull_value.feasigns_ = ids->data(); + pull_value.frequencies_ = frequencies.data(); + + values->resize(ids->size() * _dim); + pull_sparse(values->data(), pull_value); + return 0; +} + +int32_t MemorySparseGeoTable::push_sparse(const uint64_t* keys, + const float* values, size_t num) { + VLOG(5) << "DEBUG MemorySparseGeoTable::push_sparse keys[0]" << keys[0] + << " key_num: " << num; + std::vector ids; + ids.resize(num); + std::copy_n(keys, num, ids.begin()); + _geo_recorder->Update(ids); + _push_sparse(keys, values, num); + return 0; +} + +int32_t MemorySparseGeoTable::initialize() { + if (!_geo_recorder) { + auto trainers = _config.common().trainer_num(); + _geo_recorder = std::make_shared(trainers); + } + + _dim = _config.common().dims()[0]; + _shards_task_pool.resize(_task_pool_size); + for (int i = 0; i < _shards_task_pool.size(); ++i) { + 
_shards_task_pool[i].reset(new ::ThreadPool(1)); + } + + _local_shards.reset(new shard_type[_task_pool_size]); + return 0; +} + +int32_t MemorySparseGeoTable::pull_sparse(float* pull_values, + const PullSparseValue& pull_value) { + auto shard_num = _task_pool_size; + std::vector> tasks(shard_num); + + std::vector>> task_keys(shard_num); + size_t num = pull_value.numel_; + for (size_t i = 0; i < num; ++i) { + int shard_id = pull_value.feasigns_[i] % shard_num; + task_keys[shard_id].push_back({pull_value.feasigns_[i], i}); + } + + for (int shard_id = 0; shard_id < shard_num; ++shard_id) { + tasks[shard_id] = _shards_task_pool[shard_id]->enqueue( + [this, shard_id, &task_keys, pull_values]() -> int { + auto& local_shard = _local_shards[shard_id]; + auto& keys = task_keys[shard_id]; + for (size_t i = 0; i < keys.size(); i++) { + uint64_t key = keys[i].first; + auto offset = keys[i].second; + float* select_data = pull_values + _dim * offset; + + auto itr = local_shard.find(key); + if (itr == local_shard.end()) { + // ++missed_keys; + auto& feature_value = local_shard[key]; + feature_value.resize(_dim); + memset(feature_value.data(), 0, sizeof(float) * _dim); + VLOG(0) << "MemorySparseGeoTable pull_sparse key not found!!! " + << key; + itr = local_shard.find(key); + } + memcpy(select_data, itr.value().data(), _dim * sizeof(float)); + + VLOG(5) << "DEBUG MemorySparseGeoTable::pull_sparse key: " << key + << " select_data[0] " << select_data[0] + << " value[0]: " << itr.value().data()[0]; + } + return 0; + }); + } + + for (size_t shard_id = 0; shard_id < tasks.size(); ++shard_id) { + tasks[shard_id].wait(); + } + + return 0; +} + +int32_t MemorySparseGeoTable::_push_sparse(const uint64_t* keys, + const float* values, size_t num) { + auto shard_num = _task_pool_size; + std::vector> tasks(shard_num); + std::vector>> task_keys(shard_num); + for (size_t i = 0; i < num; ++i) { + int shard_id = keys[i] % shard_num; + task_keys[shard_id].push_back({keys[i], i}); + } + + for (size_t shard_id = 0; shard_id < shard_num; ++shard_id) { + tasks[shard_id] = _shards_task_pool[shard_id]->enqueue( + [this, shard_id, values, &task_keys]() -> int { + auto& keys = task_keys[shard_id]; + auto& local_shard = _local_shards[shard_id]; + auto blas = GetBlas(); + + for (int i = 0; i < keys.size(); ++i) { + uint64_t key = keys[i].first; + uint64_t push_data_idx = keys[i].second; + const float* update_data = values + push_data_idx * _dim; + auto itr = local_shard.find(key); + if (itr == local_shard.end()) { + VLOG(0) << "sparse geo table push not found key!!! 
" << key; + auto& feature_value = local_shard[key]; + feature_value.resize(_dim); + memset(feature_value.data(), 0, sizeof(float) * _dim); + itr = local_shard.find(key); + } + + auto& feature_value = itr.value(); + float* value_data = feature_value.data(); + VLOG(5) << "DEBUG MemorySparseGeoTable::_push_sparse before key: " + << key << " update_data[0] " << update_data[0] + << " value[0]: " << value_data[0]; + blas.VADD(_dim, update_data, value_data, value_data); + VLOG(5) << "DEBUG MemorySparseGeoTable::_push_sparse after key: " + << key << " value[0]: " << value_data[0]; + } + return 0; + }); + } + + for (size_t shard_id = 0; shard_id < tasks.size(); ++shard_id) { + tasks[shard_id].wait(); + } + return 0; +} + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/ps/table/memory_sparse_geo_table.h b/paddle/fluid/distributed/ps/table/memory_sparse_geo_table.h new file mode 100644 index 00000000000..89c4fc15ae2 --- /dev/null +++ b/paddle/fluid/distributed/ps/table/memory_sparse_geo_table.h @@ -0,0 +1,78 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +// #include +#include +#include +#include // NOLINT +#include +#include +#include + +#include "paddle/fluid/distributed/ps/table/accessor.h" +#include "paddle/fluid/distributed/ps/table/common_table.h" +#include "paddle/fluid/distributed/ps/table/depends/feature_value.h" +#include "paddle/fluid/distributed/ps/table/depends/geo_recorder.h" +#include "paddle/fluid/string/string_helper.h" + +namespace paddle { +namespace distributed { + +class GeoRecorder; + +class MemorySparseGeoTable : public SparseTable { + public: + typedef SparseTableShard shard_type; + MemorySparseGeoTable() { _geo_recorder = nullptr; } + virtual ~MemorySparseGeoTable() {} + + virtual int32_t initialize(); + virtual int32_t initialize_shard() { return 0; } + virtual int32_t load(const std::string& path, const std::string& param) { + return 0; + } + virtual int32_t save(const std::string& path, const std::string& param) { + return 0; + } + virtual int32_t flush() { return 0; } + virtual int32_t shrink(const std::string& param) { return 0; } + virtual void clear() { return; } + virtual int32_t pull_sparse(float* values, const PullSparseValue& pull_value); + + int32_t push_sparse_param(const uint64_t* keys, const float* values, + size_t num); + // TODO(zhaocaibei123): change to pull_sparse, and rename pull_sparse + int32_t pull_geo_param(const uint32_t trainer_id, std::vector* values, + std::vector* keys); + + int32_t push_sparse(const uint64_t* keys, const float* values, + size_t num) override; + + int32_t _push_sparse(const uint64_t* keys, const float* values, size_t num); + // int32_t _pull_sparse(float* pull_values, const PullSparseValue& + // pull_value); + + private: + std::shared_ptr _geo_recorder; + const int _task_pool_size = 10; + std::vector> _shards_task_pool; + std::unique_ptr _local_shards; + int _dim; +}; + +} // namespace 
distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/ps/table/table.cc b/paddle/fluid/distributed/ps/table/table.cc index b9b5ff12fc9..fa8169da07a 100644 --- a/paddle/fluid/distributed/ps/table/table.cc +++ b/paddle/fluid/distributed/ps/table/table.cc @@ -20,6 +20,7 @@ #include "paddle/fluid/distributed/ps/table/common_dense_table.h" #include "paddle/fluid/distributed/ps/table/common_graph_table.h" #include "paddle/fluid/distributed/ps/table/common_sparse_table.h" +#include "paddle/fluid/distributed/ps/table/memory_sparse_geo_table.h" #include "paddle/fluid/distributed/ps/table/sparse_geo_table.h" #ifdef PADDLE_WITH_HETERPS #include "paddle/fluid/distributed/ps/table/ssd_sparse_table.h" @@ -43,6 +44,7 @@ REGISTER_PSCORE_CLASS(Table, TensorTable); REGISTER_PSCORE_CLASS(Table, DenseTensorTable); REGISTER_PSCORE_CLASS(Table, GlobalStepTable); REGISTER_PSCORE_CLASS(Table, MemorySparseTable); +REGISTER_PSCORE_CLASS(Table, MemorySparseGeoTable); REGISTER_PSCORE_CLASS(ValueAccessor, CommMergeAccessor); REGISTER_PSCORE_CLASS(ValueAccessor, CtrCommonAccessor); REGISTER_PSCORE_CLASS(SparseValueSGDRule, StdAdaGradSGDRule); diff --git a/paddle/fluid/distributed/test/CMakeLists.txt b/paddle/fluid/distributed/test/CMakeLists.txt index 62de82832e1..2223334ccc4 100644 --- a/paddle/fluid/distributed/test/CMakeLists.txt +++ b/paddle/fluid/distributed/test/CMakeLists.txt @@ -35,3 +35,6 @@ cc_test(ctr_accessor_test SRCS ctr_accessor_test.cc DEPS ${COMMON_DEPS} boost ta set_source_files_properties(memory_sparse_table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(memory_sparse_table_test SRCS memory_sparse_table_test.cc DEPS ${COMMON_DEPS} boost table) + +set_source_files_properties(memory_geo_table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_test(memory_sparse_geo_table_test SRCS memory_geo_table_test.cc DEPS ${COMMON_DEPS} boost table) diff --git a/paddle/fluid/distributed/test/memory_geo_table_test.cc b/paddle/fluid/distributed/test/memory_geo_table_test.cc new file mode 100644 index 00000000000..fb48b38c76a --- /dev/null +++ b/paddle/fluid/distributed/test/memory_geo_table_test.cc @@ -0,0 +1,123 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include + +#include +#include +#include // NOLINT + +#include "google/protobuf/text_format.h" +#include "gtest/gtest.h" +#include "paddle/fluid/distributed/ps.pb.h" +#include "paddle/fluid/distributed/ps/table/depends/sparse_utils.h" +#include "paddle/fluid/distributed/ps/table/memory_sparse_geo_table.h" +#include "paddle/fluid/distributed/ps/table/table.h" + +namespace paddle { +namespace distributed { + +// MemorySparseGeoTable +TEST(MemorySparseGeoTable, SSUM) { + int emb_dim = 10; + int trainers = 2; + + TableParameter table_config; + table_config.set_table_class("MemorySparseGeoTable"); + FsClientParameter fs_config; + Table *table = new MemorySparseGeoTable(); + TableAccessorParameter *accessor_config = table_config.mutable_accessor(); + accessor_config->set_accessor_class("CommMergeAccessor"); + accessor_config->set_fea_dim(10); + CommonAccessorParameter *common_config = table_config.mutable_common(); + common_config->set_name("sum"); + common_config->set_table_name("ssum_test_table"); + common_config->set_trainer_num(trainers); + common_config->add_params("Param"); + common_config->add_dims(emb_dim); + common_config->add_initializers("fill_constant&1.0"); + + auto ret = table->initialize(table_config, fs_config); + ASSERT_EQ(ret, 0); + + // test push_sparse_param, and create params + std::vector init_keys = {0, 1, 2, 3, 4}; + std::vector init_fres = {1, 1, 1, 1, 1}; + std::vector init_values; + for (size_t i = 0; i < init_keys.size() * emb_dim; i++) { + init_values.push_back(0.0); + } + table->push_sparse_param(init_keys.data(), init_values.data(), + init_keys.size()); + + std::vector pull_values(init_values.size()); + auto value = PullSparseValue(init_keys, init_fres, emb_dim); + table->pull_sparse(pull_values.data(), value); + + for (size_t i = 0; i < init_keys.size() * emb_dim; i++) { + ASSERT_TRUE(abs(pull_values[i] - init_values[i]) < 1e-5); + } + + std::vector> trainer_keys; + std::vector> trainer_values; + trainer_keys.resize(trainers); + trainer_values.resize(trainers); + float start = 0.0; + for (int i = 0; i < trainers; i++) { + trainer_keys[i] = init_keys; + for (size_t j = 0; j < trainer_keys[i].size(); j++) { + auto id = trainer_keys[i][j]; + for (int k = 0; k < emb_dim; k++) { + trainer_values[i].push_back(start); + pull_values[id * emb_dim + k] += start; + start += 0.1; + } + } + } + + std::shared_ptr<::ThreadPool> pool_ = + std::make_shared<::ThreadPool>(trainers); + std::vector> task_status; + for (int i = 0; i < trainers; i++) { + auto &push_keys = trainer_keys[i]; + auto &push_values = trainer_values[i]; + auto task = [table, &push_keys, &push_values] { + table->push_sparse(push_keys.data(), push_values.data(), + push_keys.size()); + }; + task_status.push_back(pool_->enqueue(std::move(task))); + } + for (auto &status : task_status) { + status.wait(); + } + + std::vector> geo_pull_ids; + std::vector> geo_pull_values; + geo_pull_ids.resize(trainers); + geo_pull_values.resize(trainers); + for (int i = 0; i < trainers; i++) { + table->pull_geo_param(i, &geo_pull_values[i], &geo_pull_ids[i]); + ASSERT_EQ(geo_pull_values[i].size(), geo_pull_ids[i].size() * emb_dim); + for (size_t j = 0; j < geo_pull_ids[i].size(); ++j) { + auto id = geo_pull_ids[i][j]; + for (int k = 0; k < emb_dim; k++) { + ASSERT_TRUE(abs(geo_pull_values[i][j * emb_dim + k] - + pull_values[id * emb_dim + k]) < 1e-5); + } + } + } +} + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/eager/CMakeLists.txt b/paddle/fluid/eager/CMakeLists.txt index 
df000011e65..711c46e9952 100644 --- a/paddle/fluid/eager/CMakeLists.txt +++ b/paddle/fluid/eager/CMakeLists.txt @@ -1,4 +1,4 @@ -set(eager_deps pten pten_api hook_utils tensor_utils utils global_utils backward pten_tensor legacy autograd_meta grad_node_info grad_tensor_holder gradient_accumulation accumulation_node) +set(eager_deps pten pten_api hook_utils tensor_utils utils global_utils backward pten_tensor tracer layer autograd_meta grad_node_info grad_tensor_holder accumulation_node) set(fluid_deps tracer layer proto_desc operator op_registry variable_helper memcpy) set(generated_deps dygraph_function dygraph_node) @@ -9,14 +9,12 @@ endif() add_subdirectory(api) add_subdirectory(accumulation) -add_subdirectory(legacy) cc_library(grad_node_info SRCS grad_node_info.cc DEPS pten pten_api) -cc_library(grad_tensor_holder SRCS grad_tensor_holder.cc DEPS grad_node_info gradient_accumulation) +cc_library(grad_tensor_holder SRCS grad_tensor_holder.cc DEPS grad_node_info gradient_accumulator) cc_library(autograd_meta SRCS autograd_meta.cc DEPS pten pten_api) cc_library(utils SRCS utils.cc DEPS pten pten_api global_utils layer proto_desc operator op_registry variable_helper memcpy scale_op autograd_meta hook_utils) -cc_library(legacy SRCS ${DYGRAPH_LEGACY} DEPS global_utils proto_desc operator pten pten_api op_registry variable_helper memcpy) cc_library(backward SRCS backward.cc DEPS grad_tensor_holder utils autograd_meta grad_node_info) add_subdirectory(tests) diff --git a/paddle/fluid/eager/accumulation/CMakeLists.txt b/paddle/fluid/eager/accumulation/CMakeLists.txt index bfc7b54bef1..632e289ba23 100644 --- a/paddle/fluid/eager/accumulation/CMakeLists.txt +++ b/paddle/fluid/eager/accumulation/CMakeLists.txt @@ -1,2 +1 @@ -cc_library(gradient_accumulation SRCS gradient_accumulation.cc DEPS blas pten pten_api var_type_traits layer math_function) -cc_library(accumulation_node SRCS accumulation_node.cc DEPS gradient_accumulation pten pten_api grad_node_info) +cc_library(accumulation_node SRCS accumulation_node.cc DEPS gradient_accumulator pten pten_api grad_node_info) diff --git a/paddle/fluid/eager/accumulation/accumulation_node.cc b/paddle/fluid/eager/accumulation/accumulation_node.cc index 823c0153d71..f6d66ac81b5 100644 --- a/paddle/fluid/eager/accumulation/accumulation_node.cc +++ b/paddle/fluid/eager/accumulation/accumulation_node.cc @@ -13,8 +13,8 @@ // limitations under the License. #include "paddle/fluid/eager/accumulation/accumulation_node.h" -#include "paddle/fluid/eager/accumulation/gradient_accumulation.h" #include "paddle/fluid/eager/eager_tensor.h" +#include "paddle/fluid/imperative/gradient_accumulator.h" #include "paddle/pten/api/all.h" #include "paddle/pten/core/dense_tensor.h" @@ -35,7 +35,7 @@ static void CopyOrAddTensor(egr::EagerTensor* tensor, *tensor = t; } else { // Accumulation - egr::TensorAdd(t, tensor); + paddle::imperative::TensorAdd(t, tensor); } } diff --git a/paddle/fluid/eager/accumulation/gradient_accumulation.cc b/paddle/fluid/eager/accumulation/gradient_accumulation.cc deleted file mode 100644 index 1224b92dec8..00000000000 --- a/paddle/fluid/eager/accumulation/gradient_accumulation.cc +++ /dev/null @@ -1,291 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/eager/accumulation/gradient_accumulation.h" -#include -#include -#include -#include "paddle/fluid/eager/eager_tensor.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/imperative/gradient_accumulator.h" -#include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/operators/math/math_function_impl.h" -#include "paddle/fluid/operators/math/selected_rows_functor.h" -#include "paddle/fluid/platform/complex.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/float16.h" -#include "paddle/pten/api/all.h" -#include "paddle/pten/core/convert_utils.h" -#include "unsupported/Eigen/CXX11/Tensor" -#ifdef PADDLE_WITH_XPU -#include "xpu/refactor/math.h" -#endif -#ifdef PADDLE_WITH_ASCEND_CL -#include "paddle/fluid/platform/device/npu/npu_op_runner.h" -#endif - -namespace egr { -template -class TensorAddFunctor : public boost::static_visitor<> { - public: - TensorAddFunctor(int64_t numel, const T* x, T* y) - : numel_(numel), x_(x), y_(y) {} - - void operator()(const paddle::platform::CPUPlace& place) const { - paddle::platform::CPUDeviceContext* ctx = - dynamic_cast( - paddle::platform::DeviceContextPool::Instance().Get(place)); - auto blas = - paddle::operators::math::GetBlas( - *ctx); - blas.AXPY(numel_, 1., x_, y_); - } - -// TODO(jiabin): Support xpu here from gradient_accumulator.cc - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - void operator()(const paddle::platform::CUDAPlace& place) const { - paddle::platform::CUDADeviceContext* ctx = - dynamic_cast( - paddle::platform::DeviceContextPool::Instance().Get(place)); - auto blas = - paddle::operators::math::GetBlas(*ctx); - blas.AXPY(numel_, 1., x_, y_); - } -#else - void operator()(const paddle::platform::CUDAPlace& place) const { - PADDLE_THROW(paddle::platform::errors::PermissionDenied( - "Gradient accumulation on place (%s) " - "is not supported in imperative mode", - place)); - } -#endif - - // TODO(jiabin): Support Npu here from gradient_accumulator.cc - // there is NO blas in CUDAPinnedPlace - void operator()(const paddle::platform::CUDAPinnedPlace& place) const { - PADDLE_THROW(paddle::platform::errors::PermissionDenied( - "Gradient accumulation on place (%s) " - "is not supported in imperative mode", - place)); - } - -#ifdef PADDLE_WITH_ASCEND_CL - void operator()(const paddle::platform::NPUPlace& place) const { - PADDLE_THROW(paddle::platform::errors::PermissionDenied( - "Gradient accumulation on place (%s) " - "is not supported in imperative mode", - place)); - } -#else - void operator()(const paddle::platform::NPUPlace& place) const { - PADDLE_THROW(paddle::platform::errors::PermissionDenied( - "Gradient accumulation on place (%s) " - "is not supported in imperative mode", - place)); - } -#endif - -#ifdef PADDLE_WITH_XPU - void operator()(const paddle::platform::XPUPlace& place) const { - paddle::platform::XPUDeviceContext* ctx = - dynamic_cast( - paddle::platform::DeviceContextPool::Instance().Get(place)); - 
xpu::add(ctx->x_context(), x_, y_, y_, static_cast(numel_)); - } -#else - void operator()(const paddle::platform::XPUPlace& place) const { - PADDLE_THROW(paddle::platform::errors::PermissionDenied( - "Gradient accumulation on place (%s) " - "is not supported in imperative mode", - place)); - } -#endif - -#ifdef PADDLE_WITH_MLU - void operator()(const paddle::platform::MLUPlace& place) const { - PADDLE_THROW(paddle::platform::errors::PermissionDenied( - "Gradient accumulation on place (%s) " - "is not supported in imperative mode", - place)); - } -#else - void operator()(const paddle::platform::MLUPlace& place) const { - PADDLE_THROW(paddle::platform::errors::PermissionDenied( - "Gradient accumulation on place (%s) " - "is not supported in imperative mode", - place)); - } -#endif - -#ifdef PADDLE_WITH_IPU - void operator()(const paddle::platform::IPUPlace& place) const { - PADDLE_THROW(paddle::platform::errors::PermissionDenied( - "Gradient accumulation on place (%s) " - "is not supported in imperative mode", - place)); - } -#else - void operator()(const paddle::platform::IPUPlace& place) const { - PADDLE_THROW(paddle::platform::errors::PermissionDenied( - "Gradient accumulation on place (%s) " - "is not supported in imperative mode", - place)); - } -#endif - - void operator()(const paddle::platform::NPUPinnedPlace& place) const { - PADDLE_THROW(paddle::platform::errors::PermissionDenied( - "Gradient accumulation on place (%s) " - "is not supported in imperative mode", - place)); - } - - private: - int64_t numel_; - const T* x_; - mutable T* y_; -}; - -template -void TensorAddImpl(const std::shared_ptr& src, - pten::DenseTensor* dst, - const paddle::platform::Place& place) { - paddle::platform::DeviceContextPool& pool = - paddle::platform::DeviceContextPool::Instance(); - paddle::platform::DeviceContext* ctx = pool.Get(place); - auto dev_ctx = dynamic_cast(ctx); - paddle::operators::math::ElementwiseAddTo func; - func(dev_ctx, *(src.get()), dst); -} - -template -void TensorAddImpl(const paddle::framework::Tensor& src, - paddle::framework::Tensor* dst, - const paddle::platform::Place& place) { - paddle::platform::DeviceContextPool& pool = - paddle::platform::DeviceContextPool::Instance(); - paddle::platform::DeviceContext* ctx = pool.Get(place); - auto dev_ctx = dynamic_cast(ctx); - paddle::operators::math::ElementwiseAddTo func; - func(dev_ctx, src, dst); -} - -void TensorAdd(const egr::EagerTensor& src, egr::EagerTensor* dst) { - // TODO(jiabin): Support other tensor type later - std::shared_ptr dst_tensor = - std::dynamic_pointer_cast(dst->impl()); - std::shared_ptr src_tensor = - std::dynamic_pointer_cast(src.impl()); - - auto numel = src_tensor->numel(); - - if (numel == 0) { - return; - } - - PADDLE_ENFORCE_EQ( - dst_tensor->numel(), numel, - paddle::platform::errors::PreconditionNotMet( - "The number of elements of source tensor and destination tensor " - "should be equal, but got the number of elements of source tensor is " - "%zu and the number of elements of destination tensor is %zu.", - numel, dst_tensor->numel())); - - auto data_type = pten::TransToProtoVarType(src_tensor->dtype()); - auto place = src_tensor->place(); - - PADDLE_ENFORCE_EQ(pten::TransToProtoVarType(dst_tensor->dtype()), data_type, - paddle::platform::errors::PreconditionNotMet( - "The data type of source tensor and destination tensor " - "should be equal, Otherwise, the calculation results " - "will be incorrect.")); - -#define PADDLE_TENSOR_ADD(cpp_type) \ - if (data_type == 
paddle::framework::DataTypeTrait::DataType()) { \ - TensorAddFunctor func( \ - numel, src_tensor->data(), \ - dst_tensor->mutable_data(place)); \ - paddle::platform::VisitPlace(place, func); \ - return; \ - } - - // TODO(jiabin): Support NPU here - PADDLE_TENSOR_ADD(float); -// NOTE(phlrain): xpu only support float -#ifndef PADDLE_WITH_XPU - PADDLE_TENSOR_ADD(double); - // NOTE(chenweihang): only support complex grad tensor accumulated, - // support selected rows if needed in the future - PADDLE_TENSOR_ADD(paddle::platform::complex); - PADDLE_TENSOR_ADD(paddle::platform::complex); -#endif -#undef PADDLE_TENSOR_ADD - - if (data_type == paddle::framework::proto::VarType::FP16) { - if (paddle::platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - return TensorAddImpl(src_tensor, - dst_tensor.get(), place); -#else - PADDLE_THROW(paddle::platform::errors::Unimplemented( - "Gradient accumulation of data type (%s) on place (%s) is not " - "supported in imperative mode", - paddle::framework::DataTypeToString(data_type), place)); -#endif - } else if (paddle::platform::is_cpu_place(place)) { - return TensorAddImpl(src_tensor, - dst_tensor.get(), place); - } - } - PADDLE_THROW(paddle::platform::errors::Unimplemented( - "Gradient accumulation of data type (%s) on place (%s) is not " - "supported in imperative mode", - paddle::framework::DataTypeToString(data_type), place)); -} - -void VariableAdd(const egr::EagerTensor& src_tensor, - egr::EagerTensor* dst_tensor) { - auto& src = src_tensor.Var(); - auto* dst = dst_tensor->MutableVar(); - - if (dst->IsType()) { - if (src.IsType()) { - paddle::imperative::TensorAdd(src, dst); - } else if (src.IsType()) { - paddle::imperative::SelectedRowsAddToTensor(src, dst); - } else { - PADDLE_THROW(paddle::platform::errors::InvalidArgument( - "Unexpected branch, output variable type is %s", - paddle::framework::ToTypeName(dst->Type()))); - } - } else { - if (src.IsType()) { - paddle::framework::Variable new_dst; - paddle::imperative::SelectedRowsAddTensor(*dst, src, &new_dst); - *dst = std::move(new_dst); - } else { - PADDLE_THROW(paddle::platform::errors::InvalidArgument( - "Unexpected branch, output variable type is %s", - paddle::framework::ToTypeName(dst->Type()))); - } - } -} - -} // namespace egr diff --git a/paddle/fluid/eager/api/generated/.gitignore b/paddle/fluid/eager/api/generated/.gitignore index 7b49528feab..a57128902ff 100644 --- a/paddle/fluid/eager/api/generated/.gitignore +++ b/paddle/fluid/eager/api/generated/.gitignore @@ -1 +1,2 @@ fluid_generated/** +eager_generated/** \ No newline at end of file diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt b/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt index 1084f0ec573..e3fafb265ad 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt +++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt @@ -1 +1,6 @@ cc_library(scale_node SRCS scale_node.cc DEPS global_utils pten pten_api grad_node_info) + +if(NOT ON_INFER) +cc_library(final_dygraph_node SRCS nodes.cc DEPS ${eager_deps}) +add_dependencies(final_dygraph_node eager_final_state_codegen) +endif() diff --git a/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt b/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt index ed04e0b6f5a..8ede139ddc0 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt +++ 
b/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt @@ -1 +1,6 @@ cc_library(eager_scale SRCS scale.cc DEPS pten_api pten autograd_meta scale_node) + +if(NOT ON_INFER) +cc_library(final_dygraph_function SRCS dygraph_functions.cc DEPS ${eager_deps}) +add_dependencies(final_dygraph_function eager_final_state_codegen) +endif() diff --git a/paddle/fluid/eager/auto_code_generator/CMakeLists.txt b/paddle/fluid/eager/auto_code_generator/CMakeLists.txt index c504a126dde..668e60d857b 100644 --- a/paddle/fluid/eager/auto_code_generator/CMakeLists.txt +++ b/paddle/fluid/eager/auto_code_generator/CMakeLists.txt @@ -1,4 +1,4 @@ -#add_subdirectory(final_state_generator) +add_subdirectory(final_state_generator) set(EAGER_GENERETOR_DEPS ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} pybind proto_desc executor layer tracer engine imperative_profiler imperative_flag) diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index 341f91c1c6a..edfe1b832b5 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -1220,7 +1220,8 @@ static std::pair GenerateForwardFunctionContents( // According to op_proto->attrs() - egr::legacy::RunOp("op_type", ins, outs, attr_map, + Controller.Instance().GetCurrentTracer()->TraceOp("op_type", ins, outs, + attr_map, Controller.Instance().GetExpectedPlace(), {}); // According to fwd_outputs_names @@ -1401,7 +1402,8 @@ static std::pair GenerateForwardFunctionContents( const char* FWD_TRACE_OP_TEMPLATE = " paddle::framework::AttributeMap attrs = attr_map;\n" " paddle::framework::AttributeMap default_attrs;\n" - " egr::legacy::RunOp(\"%s\", ins, outs, attrs, \n" + " egr::Controller::Instance().GetCurrentTracer()->TraceOp(\"%s\", ins, " + "outs, attrs, \n" " egr::Controller::Instance().GetExpectedPlace(),\n" " &default_attrs, true, {});\n"; std::string trace_op_str = @@ -1712,7 +1714,8 @@ static std::string GenerateSingleOpBase( " // Pass the entire attribute map to TraceOp\n" " // The underlying kernel will pickup whatever attribute they need " "at runtime\n" - " egr::legacy::RunOp(\"%s\", %s, %s, %s,\n" + " egr::Controller::Instance().GetCurrentTracer()->TraceOp(\"%s\", %s, " + "%s, %s,\n" " egr::Controller::Instance().GetExpectedPlace(),\n" " &this->default_attr_map_, false, {});\n"; std::string trace_opbase_str = paddle::string::Sprintf( @@ -1822,7 +1825,8 @@ static std::string GenerateGradNodeCCContents( // Visit each OpBase for(auto iter = "grad_node->begin()"; iter < "grad_node->end()"; iter++) { // Simply pass entire attribute map to kernels - egr::legacy::RunOp("iter->Type()", ins, outs, this->attr_map_, + Controller.Instance().GetCurrentTracer()->TraceOp("iter->Type()", ins, + outs, this->attr_map_, egr::Controller::Instance().ExpectedPlace(), false, {}); } @@ -2054,6 +2058,7 @@ static std::string GenerateDygraphHFileIncludes() { "#include \"paddle/fluid/eager/autograd_meta.h\"\n" "#include \"paddle/pten/api/all.h\"\n" "#include \"paddle/fluid/eager/utils.h\"\n" + "#include \"paddle/fluid/imperative/tracer.h\"\n" "#include \"paddle/fluid/framework/op_registry.h\"\n\n"; dygraph_forward_api_includes_str += @@ -2084,8 +2089,7 @@ static void GenerateForwardDygraphFile(const std::string& forward_cc_path, "dygraph_forward_api.h\"\n" "#include " "\"paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes.h\"\n\n" - "#include \"paddle/fluid/eager/api/utils/global_utils.h\"\n" - "#include 
\"paddle/fluid/eager/legacy/op_runner.h\"\n"; + "#include \"paddle/fluid/eager/api/utils/global_utils.h\"\n"; std::string forward_cc_include_str = paddle::string::Sprintf(FORWARD_INCLUDE_TEMPLATE); std::ofstream forward_cc_stream(forward_cc_path, std::ios::out); @@ -2099,7 +2103,7 @@ static void GenerateNodeHFile(const std::string& node_h_path, std::string node_h_include_str = "#pragma once\n" "#include \"paddle/fluid/eager/tensor_wrapper.h\"\n" - "#include \"paddle/fluid/eager/legacy/op_runner.h\"\n" + "#include \"paddle/fluid/imperative/tracer.h\"\n" "#include \"paddle/fluid/eager/grad_node_info.h\"\n\n"; std::ofstream node_h_stream(node_h_path, std::ios::out); node_h_stream << node_h_include_str; diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt b/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt index 56ba4acc62b..0a96cbc9c97 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt @@ -2,13 +2,14 @@ set(api_yaml_path "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/api.yaml") set(backward_yaml_path "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/backward.yaml") set(tmp_forwards_cc_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/forwards/tmp_dygraph_functions.cc") set(tmp_forwards_h_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/forwards/tmp_dygraph_functions.h") -set(tmp_nodes_cc_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/backwards/tmp_node.cc") -set(tmp_nodes_h_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/backwards/tmp_node.h") +set(tmp_nodes_cc_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/backwards/tmp_nodes.cc") +set(tmp_nodes_h_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/backwards/tmp_nodes.h") set(forwards_cc_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.cc") set(forwards_h_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h") -set(nodes_cc_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/backwards/node.cc") -set(nodes_h_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/backwards/node.h") +set(nodes_cc_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/backwards/nodes.cc") +set(nodes_h_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/backwards/nodes.h") +message("Final State Eager CodeGen") add_custom_target(eager_final_state_codegen COMMAND "${PYTHON_EXECUTABLE}" "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py" "--api_yaml_path=${api_yaml_path}" diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index 0031d47a383..63a74fd1008 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -15,6 +15,7 @@ import yaml import re import argparse +import os def ParseArguments(): @@ -71,6 +72,24 @@ def GetConstReference(string): return ret +def RemoveConstAndReference(string): + ret = string + if string.startswith("const "): + ret = ret[6:] + if 
string.endswith("&"): + ret = ret[:-1] + + return ret + + +def GetGradNodeName(string): + return f"FinalGradNode{string}" + + +def GetForwardFunctionName(string): + return f"{string}_final_state_dygraph_function" + + def GetAutoGradMetaName(string): return f"{string}_autograd_meta" @@ -84,17 +103,17 @@ def GetAutoGradMetaVectorName(string): ###################### def ReadFwdFile(filepath): f = open(filepath, 'r') - contents = yaml.load(f) + contents = yaml.load(f, Loader=yaml.FullLoader) return contents def ReadBwdFile(filepath): f = open(filepath, 'r') - contents = yaml.load(f) + contents = yaml.load(f, Loader=yaml.FullLoader) ret = {} for content in contents: - assert 'grad_api' in content.keys() - api_name = content['grad_api'] + assert 'backward_api' in content.keys() + api_name = content['backward_api'] ret[api_name] = content return ret @@ -134,13 +153,13 @@ def ParseYamlArgs(string): def ParseYamlReturns(string): # Example: Tensor, Tensor - # list = [ [ret_type, orig_position], ...] + # list = [ ["", ret_type, orig_position], ...] returns_list = [] returns = [x.strip() for x in string.strip().split(",")] for i in range(len(returns)): ret = returns[i] - returns_list.append([ret, i]) + returns_list.append(["", ret, i]) return returns_list @@ -249,8 +268,8 @@ def ForwardsValidationCheck(forward_inputs_list, forward_attrs_list, assert orig_attr_pos == forward_attr_pos for i in range(len(forward_returns_list)): - orig_return_type = orig_forward_returns_list[i][0] - orig_return_pos = orig_forward_returns_list[i][1] + orig_return_type = orig_forward_returns_list[i][1] + orig_return_pos = orig_forward_returns_list[i][2] forward_return_type = forward_returns_list[i][1] forward_return_pos = forward_returns_list[i][2] @@ -435,19 +454,20 @@ def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map, aname, GetConstReference(atype), aname, saved_attr_name, aname) ATTRIBUTE_MEMBER_TEMPLATE = """ - {} {}; + {} {} = {}; """ attribute_members_str += ATTRIBUTE_MEMBER_TEMPLATE.format( - GetConstReference(atype), saved_attr_name) + RemoveConstAndReference(atype), saved_attr_name, default_val) # End: SetAttributes & Attribute Members + grad_node_name = GetGradNodeName(fwd_api_name) NODE_DECLARATION_TEMPLATE = """ -class GradNode{} : public egr::GradNodeBase {{ +class {} : public egr::GradNodeBase {{ public: - GradNode{}() : egr::GradNodeBase() {{}} - GradNode{}(size_t bwd_in_slot_num, size_t bwd_out_slot_num) : + {}() : egr::GradNodeBase() {{}} + {}(size_t bwd_in_slot_num, size_t bwd_out_slot_num) : egr::GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) {{}} - ~GradNode{}() override = default; + ~{}() override = default; virtual std::vector> operator()( const std::vector>& grads) override; @@ -465,7 +485,7 @@ class GradNode{} : public egr::GradNodeBase {{ }}; """ node_declaration_str = NODE_DECLARATION_TEMPLATE.format( - forward_op_name, forward_op_name, forward_op_name, forward_op_name, + grad_node_name, grad_node_name, grad_node_name, grad_node_name, set_tensor_wrapper_methods_str, set_attribute_methods_str, tensor_wrapper_members_str, attribute_members_str) @@ -489,17 +509,18 @@ def GenerateNodeDefinition(fwd_api_name, bwd_api_name, backward_fwd_input_map, for name, (_, is_fwd_input, grad_api_position), in backward_fwd_input_map.items(): tensor_wrapper_name = GetSavedName(name) - if is_fwd_input: + grad_api_args[ + grad_api_position] = f"egr::EagerUtils::SyncToPtenTensors( egr::EagerUtils::RecoverTensorWrapper(&this->{tensor_wrapper_name}, nullptr) )" + + for _, (ttype, fwd_position, + 
grad_api_position) in backward_grad_input_map.items(): + if IsPlainTensorType(ttype): grad_api_args[ - grad_api_position] = f"egr::EagerUtils::RecoverTensorWrapper(&this->{tensor_wrapper_name}, true)" + grad_api_position] = f"egr::EagerUtils::SyncToPtenTensors( grads[{fwd_position}][0] )" else: + assert IsVectorTensorType(ttype) grad_api_args[ - grad_api_position] = f"egr::EagerUtils::RecoverTensorWrapper(&this->{tensor_wrapper_name}, false)" - - for _, (_, fwd_position, - grad_api_position) in backward_grad_input_map.items(): - grad_api_args[ - grad_api_position] = f"*grads[{fwd_position}].Tensor().get()" + grad_api_position] = f"egr::EagerUtils::SyncToPtenTensors( grads[{fwd_position}] )" for name, _, _, grad_api_position in backward_attrs_list: saved_attribute_name = GetSavedName(name) @@ -507,40 +528,34 @@ def GenerateNodeDefinition(fwd_api_name, bwd_api_name, backward_fwd_input_map, grad_api_args_str = ", ".join(grad_api_args) # Construct grad_api returns - num_outputs = len(backward_grad_output_map.keys()) - returns_list = ["" for i in range(num_outputs)] + num_bwd_outputs = len(backward_grad_output_map.keys()) + returns_str = f"std::vector> returns({num_bwd_outputs});\n" for _, (ttype, fwd_position, grad_api_position) in backward_grad_output_map.items(): # Infer Grad API Return Type - if num_outputs == 1: + if num_bwd_outputs == 1: # Single tensor output, return as is if IsPlainTensorType(ttype): - returns_list[0] = "{grad_api_returns}" + returns_str += "returns[0] = { egr::EagerUtils::CreateEagerTensorFromTensor(grad_api_returns) };\n" else: assert IsVectorTensorType(ttype) - returns_list[0] = "grad_api_returns" + returns_str += "returns[0] = egr::EagerUtils::CreateEagerTensorFromTensor(grad_api_returns);\n" else: # Rearrange output order accordingly - if IsPlainTensorType(ttype): - returns_list[ - fwd_position] = f"{{ grad_api_returns[{grad_api_position}] }}" - else: - assert IsVectorTensorType(ttype) - returns_list[ - fwd_position] = f"grad_api_returns[{grad_api_position}]" - returns_str = ", ".join(returns_list) - returns_str = f"{{ {returns_str} }}" + returns_str += f"returns[{fwd_position}] = egr::EagerUtils::CreateEagerTensorFromTensor( grad_api_returns[{grad_api_position}] );\n" + returns_str += f"return returns;\n" + grad_node_name = GetGradNodeName(fwd_api_name) FUNCTION_TEMPLATE = """ -std::vector> GradNode{}::operator()(const std::vector>& grads) {{ +std::vector> {}::operator()(const std::vector>& grads) {{ // Call grad_api function - auto grad_api_returns = {}({}); - return {}; + auto grad_api_returns = paddle::experimental::{}({}); + {} }} """ node_definition_str = FUNCTION_TEMPLATE.format( - fwd_api_name, bwd_api_name, grad_api_args_str, returns_str) + grad_node_name, bwd_api_name, grad_api_args_str, returns_str) return node_definition_str @@ -565,12 +580,12 @@ def GenerateNodeCreationCodes(fwd_api_name, bwd_api_name, for name, (ttype, pos) in forward_inputs_position_map.items(): input_autograd_meta_name = GetAutoGradMetaName(name) if IsPlainTensorType(ttype): - input_autograd_meta = f" egr::EagerTensor* {input_autograd_meta_name} = egr::EagerUtils::nullable_autograd_meta({name});" + input_autograd_meta = f" egr::AutogradMeta* {input_autograd_meta_name} = egr::EagerUtils::nullable_autograd_meta({name});" else: assert IsVectorTensorType(ttype) input_autograd_meta_vec_name = GetAutoGradMetaVectorName(name) - input_autograd_meta = f" std::vector {input_autograd_meta_vec_name} = egr::EagerUtils::nullable_autograd_meta({name});\n" - input_autograd_meta += f" std::vector* 
{input_autograd_meta_name} = &{input_autograd_meta_vec_name};" + input_autograd_meta = f" std::vector {input_autograd_meta_vec_name} = egr::EagerUtils::nullable_autograd_meta({name});\n" + input_autograd_meta += f" std::vector* {input_autograd_meta_name} = &{input_autograd_meta_vec_name};" inputs_autograd_meta_list.append(input_autograd_meta) compute_require_grad_args_list.append(input_autograd_meta_name) @@ -586,19 +601,19 @@ def GenerateNodeCreationCodes(fwd_api_name, bwd_api_name, output_autograd_meta_vec_name = GetAutoGradMetaVectorName(name) if num_fwd_outputs == 1: if IsPlainTensorType(rtype): - output_autograd_meta = f" egr::EagerTensor* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(outputs);" + output_autograd_meta = f" egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&outputs);" else: assert IsVectorTensorType(rtype) - output_autograd_meta = f" std::vector {output_autograd_meta_vec_name} = egr::EagerUtils::nullable_autograd_meta(outputs);\n" - output_autograd_meta += f" std::vector* {output_autograd_meta_name} = &{output_autograd_meta_vec_name};" + output_autograd_meta = f" std::vector {output_autograd_meta_vec_name} = egr::EagerUtils::autograd_meta(&outputs);\n" + output_autograd_meta += f" std::vector* {output_autograd_meta_name} = &{output_autograd_meta_vec_name};" else: # Tuple api_result if IsPlainTensorType(rtype): - outputs_autograd_meta = f" egr::EagerTensor* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(outputs[{pos}]);" + outputs_autograd_meta = f" egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&outputs[{pos}]);" else: assert IsVectorTensorType(rtype) - output_autograd_meta = f" std::vector {output_autograd_meta_vec_name} = egr::EagerUtils::nullable_autograd_meta(outputs[{pos}]);\n" - output_autograd_meta += f" std::vector* {output_autograd_meta_name} = &{output_autograd_meta_vec_name};" + output_autograd_meta = f" std::vector {output_autograd_meta_vec_name} = egr::EagerUtils::autograd_meta(&outputs[{pos}]);\n" + output_autograd_meta += f" std::vector* {output_autograd_meta_name} = &{output_autograd_meta_vec_name};" outputs_autograd_meta_list.append(output_autograd_meta) pass_stop_gradient_args_list.append(output_autograd_meta_name) @@ -610,19 +625,23 @@ def GenerateNodeCreationCodes(fwd_api_name, bwd_api_name, # Node Construction num_bwd_inputs = len(backward_grad_input_map.keys()) num_bwd_outputs = len(backward_grad_output_map.keys()) - node_construction_str = f" auto grad_node = std::make_shared({num_bwd_inputs}, {num_bwd_outputs});" + grad_node_name = GetGradNodeName(fwd_api_name) + node_construction_str = f" auto grad_node = std::make_shared<{grad_node_name}>({num_bwd_inputs}, {num_bwd_outputs});" # SetAttributes set_attributes_list = [] for name, _, _, _ in backward_attrs_list: - set_attributes = " grad_node->SetAttribute{name}({name});" + set_attributes = f" grad_node->SetAttribute{name}({name});" set_attributes_list.append(set_attributes) set_attributes_str = "\n".join(set_attributes_list) # SetTensorWrappers set_tensor_wrappers_list = [] - for name, (_, _, _) in backward_fwd_input_map.items(): - set_tensor_wrappers = f" grad_node->SetTensorWrapper{name}({name});" + for name, (_, is_fwd_input, _) in backward_fwd_input_map.items(): + if is_fwd_input: + set_tensor_wrappers = f" grad_node->SetTensorWrapper{name}({name}, true);" + else: + set_tensor_wrappers = f" grad_node->SetTensorWrapper{name}({name}, false);" set_tensor_wrappers_list.append(set_tensor_wrappers) 
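[Editor's note] For readers of the generator, a minimal self-contained Python sketch (with a hypothetical backward_fwd_input_map; the real map is built by SlotNameMatching) of the C++ lines the SetTensorWrappers loop above now emits. The added boolean appears to record nothing more than whether the wrapped tensor was a forward input:

    # Hypothetical map: name -> (type, is_fwd_input, grad_api_position)
    backward_fwd_input_map = {"X": ("Tensor", True, 0), "Out": ("Tensor", False, 1)}
    emitted = []
    for name, (_, is_fwd_input, _) in backward_fwd_input_map.items():
        flag = "true" if is_fwd_input else "false"
        emitted.append(f"  grad_node->SetTensorWrapper{name}({name}, {flag});")
    print("\n".join(emitted))
    # Expected output:
    #   grad_node->SetTensorWrapperX(X, true);
    #   grad_node->SetTensorWrapperOut(Out, false);
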
set_tensor_wrappers_str = "\n".join(set_tensor_wrappers_list) @@ -727,7 +746,7 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name, inputs_args_list = ["" for i in range(num_inputs)] inputs_call_list = ["" for i in range(num_inputs)] for name, (ttype, pos) in forward_inputs_position_map.items(): - inputs_call_list[pos] = f"*{name}.Tensor().get()" + inputs_call_list[pos] = f"egr::EagerUtils::SyncToPtenTensors({name})" if IsPlainTensorType(ttype): inputs_args_list[pos] = f"const egr::EagerTensor& {name}" else: @@ -746,7 +765,7 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name, inputs_call_args_str = ", ".join(inputs_call_list) # Forward Full Logic - forward_call_str = f"auto api_result = {fwd_api_name}({inputs_call_args_str});" + forward_call_str = f"auto api_result = paddle::experimental::{fwd_api_name}({inputs_call_args_str});" # Get return type list & outputs num_outputs = len(forward_outputs_position_map.keys()) @@ -783,7 +802,7 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name, backward_grad_output_map, backward_attrs_list) FORWARD_FUNCTION_TEMPLATE = """ -{} {}_dygraph_function({}) {{ +{} {}({}) {{ // Forward API Call {} @@ -796,11 +815,11 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name, }} """ + forward_function_name = GetForwardFunctionName(fwd_api_name) forward_function_str = FORWARD_FUNCTION_TEMPLATE.format( - returns_type_str, fwd_api_name, inputs_args_str, forward_call_str, - returns_str, node_creation_str) - - forward_function_declaration_str = f"{returns_type_str} {fwd_api_name}_dygraph_function({inputs_args_str});" + returns_type_str, forward_function_name, inputs_args_str, + forward_call_str, returns_str, node_creation_str) + forward_function_declaration_str = f"{returns_type_str} {forward_function_name}({inputs_args_str});" return forward_function_str, forward_function_declaration_str @@ -809,11 +828,12 @@ def GenerateNodeCCFile(filepath, node_definition_str): file_contents = """ #include "glog/logging.h" #include "paddle/pten/api/all.h" +#include "paddle/pten/api/backward/backward_api.h" #include "paddle/fluid/imperative/tracer.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/eager/utils.h" #include "paddle/fluid/eager/api/utils/global_utils.h" -#include "paddle/fluid/eager/api/generated/eager_generated/nodes/nodes.h" +#include "paddle/fluid/eager/api/generated/eager_generated/backwards/nodes.h" """ file_contents += node_definition_str @@ -825,7 +845,6 @@ def GenerateNodeHFile(filepath, node_declaration_str): file_contents = """ #pragma once #include "paddle/fluid/eager/tensor_wrapper.h" -#include "paddle/fluid/eager/legacy/op_runner.h" #include "paddle/fluid/eager/grad_node_info.h" """ @@ -836,11 +855,10 @@ def GenerateNodeHFile(filepath, node_declaration_str): def GenerateForwardCCFile(filepath, forward_definition_str): file_contents = """ -#include "paddle/fluid/eager/api/generated/eager_generated/dygraph_forward_api.h" -#include "paddle/fluid/eager/api/generated/eager_generated/nodes/nodes.h" +#include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" +#include "paddle/fluid/eager/api/generated/eager_generated/backwards/nodes.h" #include "paddle/fluid/eager/api/utils/global_utils.h" -#include "paddle/fluid/eager/legacy/op_runner.h" """ file_contents += forward_definition_str @@ -905,10 +923,17 @@ if __name__ == "__main__": # Collect Forward Inputs/Outputs forward_inputs_list, forward_attrs_list, forward_returns_list = ParseYamlForwardFromBackward( bwd_forward_str) + 
print("Parsed Forward Inputs List: ", forward_inputs_list) + print("Prased Forward Attrs List: ", forward_attrs_list) + print("Parsed Forward Returns List: ", forward_returns_list) # Collect Original Forward Inputs/Outputs and then perform validation checks orig_forward_inputs_list, orig_forward_attrs_list, orig_forward_returns_list = ParseYamlForward( fwd_args_str, fwd_returns_str) + print("Parsed Original Forward Inputs List: ", orig_forward_inputs_list) + print("Prased Original Forward Attrs List: ", orig_forward_attrs_list) + print("Parsed Original Forward Returns List: ", + orig_forward_returns_list) # Forward Validation Checks ForwardsValidationCheck(forward_inputs_list, forward_attrs_list, @@ -919,15 +944,25 @@ if __name__ == "__main__": # Parse Backward Inputs/Outputs backward_inputs_list, backward_attrs_list, backward_returns_list = ParseYamlBackward( bwd_args_str, bwd_returns_str) + print("Parsed Backward Inputs List: ", backward_inputs_list) + print("Prased Backward Attrs List: ", backward_attrs_list) + print("Parsed Backward Returns List: ", backward_returns_list) # Determine Forward Inputs/Outputs Position forward_inputs_position_map, forward_outputs_position_map = DetermineForwardPositionMap( forward_inputs_list, forward_returns_list) + print("Generated Forward Input Position Map: ", + forward_inputs_position_map) + print("Generated Forward Output Position Map: ", + forward_outputs_position_map) # SlotName Matching backward_fwd_input_map, backward_grad_input_map, backward_grad_output_map = SlotNameMatching( backward_inputs_list, backward_returns_list, forward_inputs_position_map, forward_outputs_position_map) + print("Generated Backward Fwd Input Map: ", backward_fwd_input_map) + print("Generated Backward Grad Input Map: ", backward_grad_input_map) + print("Generated Backward Grad Output Map: ", backward_grad_output_map) # Backward Validation Check BackwardValidationCheck(backward_fwd_input_map, backward_grad_input_map, @@ -936,11 +971,13 @@ if __name__ == "__main__": # Node Declaration Generation node_declaration_str += GenerateNodeDeclaration( fwd_api_name, backward_fwd_input_map, backward_attrs_list) + print("Generated Node Declaration: ", node_declaration_str) node_definition_str += GenerateNodeDefinition( fwd_api_name, bwd_api_name, backward_fwd_input_map, backward_grad_input_map, backward_grad_output_map, backward_attrs_list) + print("Generated Node Definition: ", node_definition_str) # Node Definition Generation definition_declaration_pair = GenerateForwardDefinition( @@ -948,6 +985,8 @@ if __name__ == "__main__": forward_outputs_position_map, forward_attrs_list, backward_fwd_input_map, backward_grad_input_map, backward_grad_output_map, backward_attrs_list) + print("Generated Forward Definition: ", forward_definition_str) + print("Generated Forward Declaration: ", forward_declaration_str) forward_definition_str += definition_declaration_pair[0] forward_declaration_str += definition_declaration_pair[1] @@ -957,6 +996,12 @@ if __name__ == "__main__": forwards_h_path = args.forwards_h_path forwards_cc_path = args.forwards_cc_path + for path in [ + nodes_cc_path, nodes_h_path, forwards_h_path, forwards_cc_path + ]: + if os.path.exists(path): + os.remove(path) + GenerateNodeCCFile(nodes_cc_path, node_definition_str) GenerateNodeHFile(nodes_h_path, node_declaration_str) GenerateForwardCCFile(forwards_cc_path, forward_definition_str) diff --git a/paddle/fluid/eager/auto_code_generator/generate_file_structures.py 
b/paddle/fluid/eager/auto_code_generator/generate_file_structures.py index 56ec287561c..fdb8529515d 100644 --- a/paddle/fluid/eager/auto_code_generator/generate_file_structures.py +++ b/paddle/fluid/eager/auto_code_generator/generate_file_structures.py @@ -15,9 +15,45 @@ import sys import os -if __name__ == "__main__": - assert len(sys.argv) == 2 - eager_dir = sys.argv[1] + +def GenerateFileStructureForFinalDygraph(eager_dir): + """ + paddle/fluid/eager + |- generated + | |- CMakeLists.txt + | | "add_subdirectory(forwards), add_subdirectory(backwards)" + | + | |- forwards + | |- "dygraph_functions.cc" + | |- "dygraph_functions.h" + | + | |- backwards + | |- "nodes.cc" + | |- "nodes.h" + """ + # Directory Generation + generated_dir = os.path.join(eager_dir, "api/generated/eager_generated") + forwards_dir = os.path.join(generated_dir, "forwards") + nodes_dir = os.path.join(generated_dir, "backwards") + dirs = [generated_dir, forwards_dir, nodes_dir] + for directory in dirs: + if not os.path.exists(directory): + os.mkdir(directory) + + # Empty files + dygraph_forward_api_h_path = os.path.join(generated_dir, + "dygraph_functions.h") + empty_files = [dygraph_forward_api_h_path] + empty_files.append(os.path.join(forwards_dir, "dygraph_functions.cc")) + empty_files.append(os.path.join(nodes_dir, "nodes.cc")) + empty_files.append(os.path.join(nodes_dir, "nodes.h")) + + for path in empty_files: + if not os.path.exists(path): + open(path, 'a').close() + + +def GenerateFileStructureForIntermediateDygraph(eager_dir): """ paddle/fluid/eager |- generated @@ -79,3 +115,10 @@ if __name__ == "__main__": with open(generated_level_cmakelist_path, "w") as f: f.write("add_subdirectory(forwards)\nadd_subdirectory(nodes)") + + +if __name__ == "__main__": + assert len(sys.argv) == 2 + eager_dir = sys.argv[1] + GenerateFileStructureForIntermediateDygraph(eager_dir) + GenerateFileStructureForFinalDygraph(eager_dir) diff --git a/paddle/fluid/eager/eager_tensor.h b/paddle/fluid/eager/eager_tensor.h index a15b16f06a0..e11a471946a 100644 --- a/paddle/fluid/eager/eager_tensor.h +++ b/paddle/fluid/eager/eager_tensor.h @@ -18,10 +18,10 @@ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/variable.h" // pten deps -#include "paddle/pten/api/all.h" +#include "paddle/pten/api/include/tensor.h" #include "paddle/pten/api/lib/api_declare.h" #include "paddle/pten/api/lib/utils/tensor_utils.h" -#include "paddle/pten/core/convert_utils.h" +#include "paddle/pten/core/compat/convert_utils.h" /** * This class is used by Eager mode for now. It's painful to do this in Eager * Mode, the better @@ -245,8 +245,7 @@ class EagerTensor final { auto tensor_dense = std::dynamic_pointer_cast(tensor_->impl()); if (tensor_dense && tensor_dense.get()) { - paddle::experimental::SharesStorage(tensor_dense.get(), - framework_tensor); + *framework_tensor = *tensor_dense; } else { PADDLE_THROW(paddle::platform::errors::Fatal( "Unrecognized egr::EagerTensor type, only " diff --git a/paddle/fluid/eager/grad_tensor_holder.cc b/paddle/fluid/eager/grad_tensor_holder.cc index c0344e20fb9..0183f88772f 100644 --- a/paddle/fluid/eager/grad_tensor_holder.cc +++ b/paddle/fluid/eager/grad_tensor_holder.cc @@ -13,7 +13,7 @@ // limitations under the License. 
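[Editor's note] For orientation, a hedged sketch of how the updated generate_file_structures.py above is meant to be invoked; the path argument is an assumption based on its sys.argv handling (CMake normally drives this step):

    # Hypothetical manual invocation from the repository root:
    import subprocess
    subprocess.check_call([
        "python",
        "paddle/fluid/eager/auto_code_generator/generate_file_structures.py",
        "paddle/fluid/eager",
    ])
    # On success, api/generated/eager_generated/forwards and .../backwards exist,
    # pre-populated with empty dygraph_functions.* and nodes.* placeholders that
    # the later code-generation targets fill in.
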
#include "paddle/fluid/eager/grad_tensor_holder.h" -#include "paddle/fluid/eager/accumulation/gradient_accumulation.h" +#include "paddle/fluid/imperative/gradient_accumulator.h" #include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/operators/math/math_function.h" @@ -72,17 +72,17 @@ void GradTensorHolder::add(size_t slot_id, size_t rank, } else { // Accumulation if (t.initialized() && buffer_tensor.initialized()) { - TensorAdd(t, &buffer_tensor); + paddle::imperative::TensorAdd(t, &buffer_tensor); } else if (t.Var().IsInitialized() && buffer_tensor.Var().IsInitialized()) { - VariableAdd(t, &buffer_tensor); + paddle::imperative::VariableAdd(t, &buffer_tensor); } else if (t.Var().IsInitialized() && buffer_tensor.initialized()) { // TODO(jiabin): This can be merge to upper if case. buffer_tensor.SyncToVar(); - VariableAdd(t, &buffer_tensor); + paddle::imperative::VariableAdd(t, &buffer_tensor); } else if (t.initialized() && buffer_tensor.Var().IsInitialized()) { buffer_tensor.SyncToTensor(); - TensorAdd(t, &buffer_tensor); + paddle::imperative::TensorAdd(t, &buffer_tensor); } else { // Should not happend case // 1. both not init diff --git a/paddle/fluid/eager/legacy/CMakeLists.txt b/paddle/fluid/eager/legacy/CMakeLists.txt deleted file mode 100644 index ac3a9af6d14..00000000000 --- a/paddle/fluid/eager/legacy/CMakeLists.txt +++ /dev/null @@ -1,2 +0,0 @@ -file(GLOB DYGRAPH_LEGACY "*.cpp" "*.cc") -set(DYGRAPH_LEGACY ${DYGRAPH_LEGACY} PARENT_SCOPE) diff --git a/paddle/fluid/eager/legacy/amp_auto_cast.cc b/paddle/fluid/eager/legacy/amp_auto_cast.cc deleted file mode 100644 index 5e52f984a1b..00000000000 --- a/paddle/fluid/eager/legacy/amp_auto_cast.cc +++ /dev/null @@ -1,262 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/eager/legacy/amp_auto_cast.h" -#include -#include -#include "paddle/fluid/eager/legacy/op_runner.h" -#include "paddle/fluid/eager/legacy/tensor_helper.h" -#include "paddle/fluid/framework/operator.h" - -namespace egr { -namespace legacy { - -AmpOperators::AmpOperators() - : allow_ops_(new std::unordered_set()), - block_ops_(new std::unordered_set()), - unsupported_fp16_ops_(new std::unordered_set()) { - auto& all_kernels = paddle::framework::OperatorWithKernel::AllOpKernels(); - auto fp16_dtype = paddle::framework::proto::VarType::FP16; - for (auto it = all_kernels.begin(); it != all_kernels.end(); it++) { - bool supported = false; - for (auto& kernel_type : it->second) { - if ((paddle::platform::is_gpu_place(kernel_type.first.place_) || - paddle::platform::is_xpu_place(kernel_type.first.place_)) && - kernel_type.first.data_type_ == fp16_dtype) { - supported = true; - } - } - if (!supported) { - unsupported_fp16_ops_->insert(it->first); - } - } -} - -AmpOperators::~AmpOperators() {} - -AmpOperators& AmpOperators::Instance() { - static AmpOperators instance; - return instance; -} - -std::shared_ptr> -AmpOperators::GetMutableAllowOps() { - return allow_ops_; -} - -std::shared_ptr> -AmpOperators::GetMutableBlockOps() { - return block_ops_; -} - -std::shared_ptr> -AmpOperators::GetMutableUnsupportedFp16Ops() { - return unsupported_fp16_ops_; -} - -std::ostream& operator<<(std::ostream& os, AmpOperators& ops) { - os << "allow ops: "; - auto allow_ops = ops.GetMutableAllowOps(); - std::copy((*allow_ops).begin(), (*allow_ops).end(), - std::ostream_iterator(os, " ")); - os << "\n"; - os << "block ops: "; - auto block_ops = ops.GetMutableBlockOps(); - std::copy((*block_ops).begin(), (*block_ops).end(), - std::ostream_iterator(os, " ")); - os << "\n"; - os << "unsupported fp16 ops: "; - auto unsupported_fp16_ops = ops.GetMutableUnsupportedFp16Ops(); - std::copy((*unsupported_fp16_ops).begin(), (*unsupported_fp16_ops).end(), - std::ostream_iterator(os, " ")); - return os; -} - -inline std::string GetDtypeStr( - const std::shared_ptr& tensor) { - return paddle::framework::DataTypeToString( - egr::legacy::GetDtypeFromVar(tensor->Var())); -} - -inline bool NeedCast(const std::shared_ptr& tensor) { - auto place = egr::legacy::GetPlaceFromVar(tensor->Var()); - auto data_type = egr::legacy::GetDtypeFromVar(tensor->Var()); - if (paddle::platform::is_gpu_place(place) || - paddle::platform::is_cuda_pinned_place(place) || - paddle::platform::is_xpu_place(place)) { - // CudaPinndePlace is added for varbase created by dataloader - if (data_type == paddle::framework::proto::VarType::FP32 || - data_type == paddle::framework::proto::VarType::FP16) { - return true; - } - } - return false; -} - -// NOTE: Trace a cast op, so if a var is casted from fp32 to fp16, then the grad -// var will be cast back from fp16 to fp32 during backward phase. 
-static inline std::shared_ptr CastToType( - const std::shared_ptr& tensor, - const paddle::framework::proto::VarType::Type dst_type) { - NameTensorMap ins = {{"X", {tensor}}}; - auto in_data_type = egr::legacy::GetDtypeFromVar(tensor->Var()); - paddle::framework::AttributeMap attrs = {{"in_dtype", in_data_type}, - {"out_dtype", dst_type}}; - auto out = std::shared_ptr(new egr::EagerTensor()); - NameTensorMap outs = {{"Out", {out}}}; - - { - AutoCastGuard guard(paddle::imperative::AmpLevel::O0); - paddle::framework::AttributeMap default_attrs; - RunOp("cast", ins, outs, std::move(attrs), {}, &default_attrs, true); - } - - return out; -} - -static inline std::shared_ptr CastToFP16( - const std::shared_ptr& tensor) { - auto dst_type = paddle::framework::proto::VarType::FP16; - if (NeedCast(tensor) && - (egr::legacy::GetDtypeFromVar(tensor->Var()) != dst_type)) { - return CastToType(tensor, dst_type); - } - return tensor; -} - -static inline std::shared_ptr CastToFP32( - const std::shared_ptr& tensor) { - auto dst_type = paddle::framework::proto::VarType::FP32; - if (NeedCast(tensor) && - (egr::legacy::GetDtypeFromVar(tensor->Var()) != dst_type)) { - return CastToType(tensor, dst_type); - } - return tensor; -} - -static inline paddle::framework::proto::VarType::Type GetPromoteType( - const std::string& op_type, const NameTensorMap& ins) { - auto dst_type = paddle::framework::proto::VarType::FP16; - for (const auto& pair : ins) { - for (const auto& tensor : pair.second) { - if (egr::legacy::GetDtypeFromVar(tensor->Var()) == - paddle::framework::proto::VarType::FP32) { - dst_type = egr::legacy::GetDtypeFromVar(tensor->Var()); - break; - } - } - } - - // NOTE(juncai): moving_average_abs_max_scale only consider the - // dtype of input(X) - if (op_type == "moving_average_abs_max_scale") { - for (const auto& pair : ins) { - if (pair.first == "X" && - egr::legacy::GetDtypeFromVar(pair.second.front()->Var()) == - paddle::framework::proto::VarType::FP16) { - dst_type = paddle::framework::proto::VarType::FP16; - } - } - } - - return dst_type; -} - -NameTensorMap AutoCastInputs(const std::string& op_type, - const NameTensorMap& ins) { - NameTensorMap new_ins(ins); - if (AmpOperators::Instance().GetMutableAllowOps()->count(op_type)) { - for (auto& pair : new_ins) { - // NOTE(zhiqiu): batch_norm and layer_norm support only input x is fp16. - if ((op_type == "batch_norm" || op_type == "layer_norm" || - op_type == "sync_batch_norm") && - pair.first != "X") { - continue; - } - - VLOG(5) << "Op(" << op_type << "): Cast " << pair.first << " from " - << GetDtypeStr(*pair.second.cbegin()) << " to float16"; - for (auto& var : pair.second) { - var = CastToFP16(var); - } - } - return new_ins; - } else if (AmpOperators::Instance().GetMutableBlockOps()->count(op_type)) { - for (auto& pair : new_ins) { - VLOG(5) << "Op(" << op_type << "): Cast " << pair.first << " from " - << GetDtypeStr(*pair.second.cbegin()) << " to float"; - for (auto& var : pair.second) { - var = CastToFP32(var); - } - } - return new_ins; - } else { - auto dst_type = GetPromoteType(op_type, ins); - - // NOTE(zhiqiu): if the op has op fp16 kernel, fall back to fp32. - if (dst_type == paddle::framework::proto::VarType::FP16 && - AmpOperators::Instance().GetMutableUnsupportedFp16Ops()->count( - op_type)) { - dst_type = paddle::framework::proto::VarType::FP32; - } - for (auto& pair : new_ins) { - // NOTE(zhiqiu): batch_norm and layer_norm support only input x is fp16. 
- if ((op_type == "batch_norm" || op_type == "layer_norm" || - op_type == "sync_batch_norm") && - pair.first == "X" && - dst_type == paddle::framework::proto::VarType::FP32) { - continue; - } - VLOG(5) << "Op(" << op_type << "): Cast " << pair.first << " from " - << GetDtypeStr(*pair.second.cbegin()) << " to " - << paddle::framework::DataTypeToString(dst_type); - for (auto& var : pair.second) { - var = (dst_type == paddle::framework::proto::VarType::FP32 - ? CastToFP32(var) - : CastToFP16(var)); - } - } - return new_ins; - } - return new_ins; -} - -NameTensorMap CastPureFp16Inputs(const std::string& op_type, - const NameTensorMap& ins) { - NameTensorMap new_ins(ins); - auto dst_type = paddle::framework::proto::VarType::FP16; - if (AmpOperators::Instance().GetMutableUnsupportedFp16Ops()->count(op_type) || - AmpOperators::Instance().GetMutableBlockOps()->count(op_type)) { - dst_type = paddle::framework::proto::VarType::FP32; - } - for (auto& pair : new_ins) { - if ((op_type == "batch_norm" || op_type == "layer_norm" || - op_type == "sync_batch_norm") && - pair.first != "X") { - continue; - } - VLOG(5) << "Op(" << op_type << "): Cast " << pair.first << " from " - << GetDtypeStr(*pair.second.cbegin()) << " to " - << paddle::framework::DataTypeToString(dst_type); - for (auto& var : pair.second) { - var = (dst_type == paddle::framework::proto::VarType::FP32 - ? CastToFP32(var) - : CastToFP16(var)); - } - } - return new_ins; -} - -} // namespace legacy -} // namespace egr diff --git a/paddle/fluid/eager/legacy/amp_auto_cast.h b/paddle/fluid/eager/legacy/amp_auto_cast.h deleted file mode 100644 index e457e72cb5d..00000000000 --- a/paddle/fluid/eager/legacy/amp_auto_cast.h +++ /dev/null @@ -1,90 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include -#include -#include - -#include "paddle/fluid/eager/api/utils/global_utils.h" -#include "paddle/fluid/eager/eager_tensor.h" -#include "paddle/fluid/eager/legacy/type_def.h" -#include "paddle/fluid/imperative/amp_auto_cast.h" - -namespace egr { -namespace legacy { - -class AmpOperators { - public: - ~AmpOperators(); - AmpOperators(const AmpOperators& o) = delete; - const AmpOperators& operator=(const AmpOperators& o) = delete; - - static AmpOperators& Instance(); - - std::shared_ptr> GetMutableAllowOps(); - - std::shared_ptr> GetMutableBlockOps(); - - std::shared_ptr> - GetMutableUnsupportedFp16Ops(); - - private: - AmpOperators(); // forbid calling default constructor - - // The set of ops that support fp16 calculation and are considered numerically - // safe and performance critical. These ops are always converted to fp16. - std::shared_ptr> allow_ops_; - - // The set of ops that support fp16 calculation and are considered numerically - // dangerous and whose effects may also be observed in downstream ops. - std::shared_ptr> block_ops_; - - // The set of ops that has no fp16 CUDA kennel. 
- std::shared_ptr> unsupported_fp16_ops_; -}; - -std::ostream& operator<<(std::ostream& os, AmpOperators& ops); - -// NOTE(zhiqiu): AutoCastGuard is used for RAII. -class AutoCastGuard { - public: - explicit AutoCastGuard(paddle::imperative::AmpLevel guard_level) { - pre_amp_level_ = Controller::Instance().GetAMPLevel(); - - if (pre_amp_level_ != guard_level) { - Controller::Instance().SetAMPLevel(guard_level); - } - } - - ~AutoCastGuard() { Controller::Instance().SetAMPLevel(pre_amp_level_); } - - // forbid copy and operator= - AutoCastGuard(const AutoCastGuard& guard) = delete; - AutoCastGuard& operator=(const AutoCastGuard& guard) = delete; - - private: - paddle::imperative::AmpLevel pre_amp_level_; -}; - -NameTensorMap AutoCastInputs(const std::string& op_type, - const NameTensorMap& ins); - -NameTensorMap CastPureFp16Inputs(const std::string& op_type, - const NameTensorMap& ins); - -} // namespace legacy -} // namespace egr diff --git a/paddle/fluid/eager/legacy/execution_context.h b/paddle/fluid/eager/legacy/execution_context.h deleted file mode 100644 index e51b6bf5417..00000000000 --- a/paddle/fluid/eager/legacy/execution_context.h +++ /dev/null @@ -1,214 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include "paddle/fluid/eager/eager_tensor.h" -#include "paddle/fluid/eager/legacy/type_def.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/type_defs.h" -#include "paddle/fluid/framework/variable.h" -namespace egr { -namespace legacy { - -class EagerExecutionContext : public paddle::framework::ExecutionContext { - using Variable = paddle::framework::Variable; - - public: - EagerExecutionContext(const paddle::framework::OperatorBase& op, - const paddle::framework::Scope& scope, - const paddle::platform::DeviceContext& device_context, - const paddle::framework::RuntimeContext& ctx, - const NameTensorMap& tensor_map_in, - const NameTensorMap& tensor_map_out, - const paddle::framework::AttributeMap& attrs, - const paddle::framework::AttributeMap& default_attrs) - : ExecutionContext(op, scope, device_context, ctx), - tensor_map_in_(tensor_map_in), - tensor_map_out_(tensor_map_out), - attrs_(attrs), - default_attrs_(default_attrs) {} - - std::string InputName(const std::string& name) const override { - auto it = tensor_map_in_.find(name); - PADDLE_ENFORCE_NE(it, tensor_map_in_.end(), - paddle::platform::errors::PreconditionNotMet( - "Can not find [%s] in Input", name)); - // TODO(jiabin): This is used for egr::EagerTensor temporally, - // once we have name, remove it. - return it->second[0] ? 
it->second[0]->name() - : paddle::framework::kEmptyVarName; - } - - std::vector InputNames(const std::string& name) const override { - auto it = tensor_map_in_.find(name); - PADDLE_ENFORCE_NE( - it, tensor_map_in_.end(), - paddle::platform::errors::NotFound("Can not find [%s] in Input", name)); - std::vector vec_res; - vec_res.reserve(it->second.size()); - for (size_t i = 0; i < it->second.size(); ++i) { - if (it->second[i]) { - // TODO(jiabin): This is used for egr::EagerTensor - // temporally, once we have name, remove it. - vec_res.push_back(it->second[i]->name()); - } else { - vec_res.push_back(paddle::framework::kEmptyVarName); - } - } - return vec_res; - } - - std::string OutputName(const std::string& name) const override { - auto it = tensor_map_out_.find(name); - PADDLE_ENFORCE_NE(it, tensor_map_out_.end(), - paddle::platform::errors::NotFound( - "Can not find [%s] in Output", name)); - return it->second[0] ? it->second[0]->name() - : paddle::framework::kEmptyVarName; - } - - std::vector OutputNames(const std::string& name) const override { - auto it = tensor_map_out_.find(name); - PADDLE_ENFORCE_NE(it, tensor_map_out_.end(), - paddle::platform::errors::NotFound( - "Can not find [%s] in Output", name)); - std::vector vec_res; - vec_res.reserve(it->second.size()); - for (size_t i = 0; i < it->second.size(); ++i) { - if (it->second[i]) { - vec_res.push_back(it->second[i]->name()); - } else { - vec_res.push_back(paddle::framework::kEmptyVarName); - } - } - return vec_res; - } - - bool HasAttr(const std::string& name) const override { - return attrs_.count(name) != 0 || default_attrs_.count(name) != 0; - } - - const paddle::framework::AttributeMap& Attrs() const override { - return attrs_; - } - - const paddle::framework::Attribute& GetAttr( - const std::string& name) const override { - auto it = attrs_.find(name); - - if (it == attrs_.end()) { - it = default_attrs_.find(name); - if (it == default_attrs_.end()) { - PADDLE_THROW(paddle::platform::errors::NotFound( - "Can not find [%s] in attributes of op %s.", name, - this->GetOp().Type())); - } - } - - return it->second; - } - - std::vector InNameList() const override { - std::vector vec_temp; - vec_temp.reserve(tensor_map_in_.size()); - - for (auto& v : tensor_map_in_) { - vec_temp.push_back(v.first); - } - - return vec_temp; - } - - bool HasInput(const std::string& name) const override { - auto it = tensor_map_in_.find(name); - return (it != tensor_map_in_.end() && it->second.size() > 0); - } - - bool HasOutput(const std::string& name) const override { - auto it = tensor_map_out_.find(name); - return (it != tensor_map_out_.end() && it->second.size() > 0); - } - - size_t InputSize(const std::string& name) const override { - return InputNames(name).size(); - } - - size_t OutputSize(const std::string& name) const override { - return OutputNames(name).size(); - } - - const Variable* InputVar(const std::string& name) const override { - auto it = tensor_map_in_.find(name); - if (it == tensor_map_in_.end()) { - return nullptr; - } - - return it->second.empty() || it->second[0] == nullptr - ? nullptr - : it->second[0]->MutableVar(); - } - - Variable* OutputVar(const std::string& name) const override { - auto it = tensor_map_out_.find(name); - if (it == tensor_map_out_.end()) { - return nullptr; - } - - return it->second.empty() || it->second[0] == nullptr - ? 
nullptr - : it->second[0]->MutableVar(); - } - - const std::vector MultiInputVar( - const std::string& name) const override { - auto it = tensor_map_in_.find(name); - if (it == tensor_map_in_.end()) { - return {}; - } - std::vector vec_res; - vec_res.reserve(it->second.size()); - for (size_t i = 0; i < it->second.size(); ++i) { - vec_res.push_back(it->second[i] ? it->second[i]->MutableVar() : nullptr); - } - - return vec_res; - } - - std::vector MultiOutputVar( - const std::string& name) const override { - auto it = tensor_map_out_.find(name); - if (it == tensor_map_out_.end()) { - return {}; - } - std::vector vec_res; - vec_res.reserve(it->second.size()); - for (size_t i = 0; i < it->second.size(); ++i) { - vec_res.push_back(it->second[i] ? it->second[i]->MutableVar() : nullptr); - } - - return vec_res; - } - - private: - const NameTensorMap& tensor_map_in_; - const NameTensorMap& tensor_map_out_; - const paddle::framework::AttributeMap& attrs_; - const paddle::framework::AttributeMap& default_attrs_; -}; - -} // namespace legacy -} // namespace egr diff --git a/paddle/fluid/eager/legacy/infer_shape_context.h b/paddle/fluid/eager/legacy/infer_shape_context.h deleted file mode 100644 index b43eda7abc3..00000000000 --- a/paddle/fluid/eager/legacy/infer_shape_context.h +++ /dev/null @@ -1,427 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include - -#include "paddle/fluid/eager/eager_tensor.h" -#include "paddle/fluid/eager/legacy/type_def.h" -#include "paddle/fluid/framework/ddim.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/shape_inference.h" -#include "paddle/fluid/framework/type_defs.h" -#include "paddle/fluid/framework/var_type.h" -namespace egr { -namespace legacy { - -class EagerInferShapeContext : public paddle::framework::InferShapeContext { - using DDim = paddle::framework::DDim; - - public: - EagerInferShapeContext( - const NameTensorMap* in, const NameTensorMap* out, - const paddle::framework::AttributeMap* attr, - const paddle::framework::AttributeMap* default_attr, - const std::string op_type, - const paddle::framework::OpKernelType* op_kernel_type = nullptr) - : tensor_in_(in), - tensor_out_(out), - attrs_(attr), - default_attrs_(default_attr), - op_type_(op_type), - op_kernel_type_(op_kernel_type) {} - - bool HasInput(const std::string& name) const override { - // has only one input - auto it = tensor_in_->find(name); - - if (it == tensor_in_->end()) { - return false; - } - const auto& in = it->second; - if (in.size() == 0) return false; - PADDLE_ENFORCE_EQ( - in.size(), 1UL, - paddle::platform::errors::PreconditionNotMet( - "Input %s should not have more than one inputs", name)); - return in[0] != nullptr; - } - - bool HasOutput(const std::string& name) const override { - // has only one output - auto it = tensor_out_->find(name); - if (it == tensor_out_->end()) { - return false; - } - const auto& out = it->second; - if (out.size() == 0) { - return false; - } - PADDLE_ENFORCE_EQ( - out.size(), 1UL, - paddle::platform::errors::PreconditionNotMet( - "Output %s should not have more than one outputs", name)); - return out[0] != nullptr; - } - - bool HasInputs(const std::string& name) const override { - auto it = tensor_in_->find(name); - if (it == tensor_in_->end() || it->second.empty()) { - return false; - } - for (auto& input : it->second) { - if (input == nullptr) { - return false; - } - } - return true; - } - - bool HasOutputs(const std::string& name) const override { - auto it = tensor_out_->find(name); - if (it == tensor_out_->end() || it->second.empty()) { - return false; - } - for (auto& output : it->second) { - if (output == nullptr) { - return false; - } - } - return true; - } - - paddle::framework::AttrReader Attrs() const override { - return paddle::framework::AttrReader(*attrs_, *default_attrs_); - } - - std::vector Inputs(const std::string& name) const override { - std::vector vec_res; - auto it = tensor_in_->find(name); - PADDLE_ENFORCE_NE( - it, tensor_in_->end(), - paddle::platform::errors::NotFound("can not find [%s] in input", name)); - - vec_res.reserve(it->second.size()); - for (auto& var : it->second) { - if (var) { - vec_res.push_back(var->name()); - } else { - vec_res.push_back(paddle::framework::kEmptyVarName); - } - } - - return vec_res; - } - - std::vector Outputs(const std::string& name) const override { - std::vector vec_res; - auto it = tensor_out_->find(name); - PADDLE_ENFORCE_NE(it, tensor_out_->end(), - paddle::platform::errors::NotFound( - "can not find [%s] in output", name)); - - vec_res.reserve(it->second.size()); - for (auto& var : it->second) { - if (var) { - vec_res.push_back(var->name()); - } else { - vec_res.push_back(paddle::framework::kEmptyVarName); - } - } - - return vec_res; - } - std::string GetInputNameByIdx(size_t idx) const override { - auto& op_proto = - 
paddle::framework::OpInfoMap::Instance().Get(op_type_).proto_; - PADDLE_ENFORCE_LT(idx, op_proto->inputs().size(), - paddle::platform::errors::OutOfRange( - "The index should be less than the size of inputs of " - "operator %s, but got index is %d and size is %d", - op_type_, idx, op_proto->inputs().size())); - return op_proto->inputs()[idx].name(); - } - - std::string GetOutputNameByIdx(size_t idx) const override { - auto& op_proto = - paddle::framework::OpInfoMap::Instance().Get(op_type_).proto_; - PADDLE_ENFORCE_LT( - idx, op_proto->outputs().size(), - paddle::platform::errors::OutOfRange( - "The index should be less than the size of outputs of " - "operator %s, but got index is %d and size is %d", - op_type_, idx, op_proto->outputs().size())); - return op_proto->outputs()[idx].name(); - } - - void ShareDim(const std::string& in, const std::string& out, size_t i = 0, - size_t j = 0) override { - auto in_it = tensor_in_->find(in); - auto out_it = tensor_out_->find(out); - PADDLE_ENFORCE_NE( - in_it, tensor_in_->end(), - paddle::platform::errors::NotFound("can not found [%s] in input", in)); - PADDLE_ENFORCE_GT(in_it->second.size(), i, - paddle::platform::errors::PreconditionNotMet( - "Inputs %s should have %llu argument", in, i)); - PADDLE_ENFORCE_NE( - out_it, tensor_out_->end(), - paddle::platform::errors::NotFound("can not found [%s] in input", in)); - PADDLE_ENFORCE_GT(out_it->second.size(), j, - paddle::platform::errors::PreconditionNotMet( - "Outputs %s should have %llu argument", out, j)); - - paddle::framework::Variable* in_var = in_it->second[i]->MutableVar(); - paddle::framework::Variable* out_var = out_it->second[j]->MutableVar(); - - PADDLE_ENFORCE_EQ(in_var->Type(), out_var->Type(), - paddle::platform::errors::PreconditionNotMet( - "The type of %s and %s is not the same.", in, out)); - - if (in_var->IsType()) { - auto& in_lod_tensor = in_var->Get(); - auto* out_lod_tensor = - out_var->GetMutable(); - out_lod_tensor->Resize(in_lod_tensor.dims()); - } else { - auto& in_sele_rows = in_var->Get(); - auto out_sele_rows = out_var->GetMutable(); - out_sele_rows->mutable_value()->Resize(in_sele_rows.value().dims()); - out_sele_rows->set_rows(in_sele_rows.rows()); - out_sele_rows->set_height(in_sele_rows.height()); - } - } - - void ShareAllLoD(const std::string& in, - const std::string& out) const override { - // do nothing - } - void ShareLoD(const std::string& in, const std::string& out, size_t i = 0, - size_t j = 0) const override { - // do nothing - } - - bool IsRuntime() const override { return true; } - - bool IsRunMKLDNNKernel() const override { - return (op_kernel_type_ && (op_kernel_type_->data_layout_ == - paddle::framework::DataLayout::kMKLDNN)); - } - - std::vector GetInputVarPtrs( - const std::string& name) const override { - std::vector res; - auto it = tensor_in_->find(name); - PADDLE_ENFORCE_NE(it, tensor_in_->end(), - paddle::platform::errors::NotFound( - "Can not find [%s] in inputs.", name)); - for (auto& tensor : it->second) { - res.emplace_back(tensor->MutableVar()); - } - return res; - } - - std::vector GetOutputVarPtrs( - const std::string& name) const override { - std::vector res; - auto it = tensor_out_->find(name); - PADDLE_ENFORCE_NE(it, tensor_out_->end(), - paddle::platform::errors::NotFound( - "Can not find [%s] in outputs.", name)); - for (auto& tensor : it->second) { - res.emplace_back(tensor->MutableVar()); - } - return res; - } - - DDim GetInputDim(const std::string& name) const override { - auto it = tensor_in_->find(name); - PADDLE_ENFORCE_NE( - 
it, tensor_in_->end(), - paddle::platform::errors::NotFound("can not find [%s] in input", name)); - PADDLE_ENFORCE_EQ( - it->second.size(), 1UL, - paddle::platform::errors::PreconditionNotMet( - "Input(%s) should hold one element, but now it holds %d", name, - it->second.size())); - return this->GetDim(it->second[0]->MutableVar()); - } - - std::vector GetInputsDim(const std::string& name) const override { - // const std::vector& vars = InputVars(name); - std::vector vec_res; - auto it = tensor_in_->find(name); - PADDLE_ENFORCE_NE(it, tensor_in_->end(), - paddle::platform::errors::NotFound( - "can not find [%s] in output", name)); - vec_res.reserve(it->second.size()); - for (size_t i = 0; i < it->second.size(); ++i) { - if (it->second[i]) { - vec_res.emplace_back(GetDim(it->second[i]->MutableVar())); - } else { - vec_res.emplace_back(); - } - } - - return vec_res; - } - - std::vector GetInputsVarType( - const std::string& name) const override { - std::vector vec_res; - auto it = tensor_in_->find(name); - PADDLE_ENFORCE_NE( - it, tensor_in_->end(), - paddle::platform::errors::NotFound("can not find [%s] in input", name)); - vec_res.reserve(it->second.size()); - for (size_t i = 0; i < it->second.size(); ++i) { - if (it->second[i]) { - vec_res.emplace_back( - paddle::framework::ToVarType(it->second[i]->MutableVar()->Type())); - } else { - vec_res.emplace_back(); - } - } - return vec_res; - } - - std::vector GetOutputsVarType( - const std::string& name) const override { - std::vector vec_res; - auto it = tensor_out_->find(name); - PADDLE_ENFORCE_NE(it, tensor_out_->end(), - paddle::platform::errors::NotFound( - "can not find [%s] in output", name)); - vec_res.reserve(it->second.size()); - for (size_t i = 0; i < it->second.size(); ++i) { - if (it->second[i]) { - vec_res.emplace_back( - paddle::framework::ToVarType(it->second[i]->MutableVar()->Type())); - } else { - vec_res.emplace_back( - static_cast(-1)); - } - } - return vec_res; - } - - void SetOutputDim(const std::string& name, const DDim& dim) override { - auto it = tensor_out_->find(name); - PADDLE_ENFORCE_NE(it, tensor_out_->end(), - paddle::platform::errors::NotFound( - "can not find [%s] in output", name)); - - if (it->second[0]) { - SetDim(it->second[0]->MutableVar(), dim); - } - } - - void SetOutputsDim(const std::string& name, - const std::vector& dims) override { - auto it = tensor_out_->find(name); - PADDLE_ENFORCE_NE(it, tensor_out_->end(), - paddle::platform::errors::NotFound( - "can not find [%s] in output", name)); - - PADDLE_ENFORCE_EQ(dims.size(), it->second.size(), - paddle::platform::errors::InvalidArgument( - "The number of dims is expected to be equal to the " - "number of Outputs(%s). 
But receieved: the number of " - "dims = %d, the number of Outputs(%s) = %d.", - name, dims.size(), name, it->second.size())); - - for (size_t i = 0; i < dims.size(); ++i) { - if (it->second[i]) { - SetDim(it->second[i]->MutableVar(), dims[i]); - } - } - } - - int32_t GetLoDLevel(const std::string& in, size_t i = 0) const override { - PADDLE_THROW(paddle::platform::errors::PermissionDenied( - "GetLoDLevel function not support in dygraph mode")); - } - - void SetLoDLevel(const std::string& out, int32_t lod_level, - size_t j = 0) const override { - PADDLE_THROW(paddle::platform::errors::PermissionDenied( - "SetLoDLevel function not support in dygraph mode")); - } - - protected: - DDim GetDim(paddle::framework::Variable* var) const { - PADDLE_ENFORCE_NOT_NULL(var, paddle::platform::errors::PreconditionNotMet( - "Input variable should not be null")); - if (var->IsType()) { - return var->Get().dims(); - } else if (var->IsType()) { - return var->Get().GetCompleteDims(); - } else { - PADDLE_THROW(paddle::platform::errors::PermissionDenied( - "Only LoDTensor/SelectedRows support 'GetDim', but Variables " - "type_id is xx.")); - } - } - - std::vector GetRepeatedDims(const std::string& name) const override { - PADDLE_THROW(paddle::platform::errors::PermissionDenied( - "GetRepeatedDims not support in dygraph runtime")); - } - - void SetDim(paddle::framework::Variable* var, const DDim& dim) { - if (var->IsType()) { - var->GetMutable()->Resize(dim); - } else if (var->IsType()) { - var->GetMutable()->set_height(dim[0]); - } else { - PADDLE_THROW(paddle::platform::errors::PermissionDenied( - "Variable type_id %s, expect LoDTensor/SelectedRows.")); - } - } - - void SetDims(const std::vector& vars, - const std::vector& dims) { - size_t length = vars.size(); - PADDLE_ENFORCE_EQ( - length, dims.size(), - paddle::platform::errors::PreconditionNotMet( - "Vars number [%d] should be equal with dims number [%d]", length, - dims.size())); - for (size_t i = 0; i < length; ++i) { - if (vars[i] == nullptr) { - continue; - } - SetDim(vars[i], dims[i]); - } - } - - void SetRepeatedDims(const std::string& name, - const std::vector& dims) override { - PADDLE_THROW(paddle::platform::errors::PermissionDenied( - "SetRepeatedDims not support in dygraph runtime")); - } - - private: - const NameTensorMap* tensor_in_; - const NameTensorMap* tensor_out_; - const paddle::framework::AttributeMap* attrs_; - const paddle::framework::AttributeMap* default_attrs_; - const std::string op_type_; - const paddle::framework::OpKernelType* op_kernel_type_; -}; - -} // namespace legacy -} // namespace egr diff --git a/paddle/fluid/eager/legacy/infer_var_type_context.h b/paddle/fluid/eager/legacy/infer_var_type_context.h deleted file mode 100644 index 0bc37bee25e..00000000000 --- a/paddle/fluid/eager/legacy/infer_var_type_context.h +++ /dev/null @@ -1,264 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include -#include -#include -#include "paddle/fluid/eager/eager_tensor.h" -#include "paddle/fluid/eager/legacy/tensor_helper.h" -#include "paddle/fluid/eager/legacy/type_def.h" -#include "paddle/fluid/framework/type_defs.h" -#include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/framework/var_type_inference.h" -#include "paddle/fluid/framework/var_type_traits.h" -#include "paddle/pten/api/all.h" - -namespace egr { -namespace legacy { - -// infer var type context for imperative mode -class TensorRuntimeInferVarTypeContext - : public paddle::framework::InferVarTypeContext { - public: - TensorRuntimeInferVarTypeContext( - const NameTensorMap& inputs, const NameTensorMap& outputs, - const paddle::framework::AttributeMap& attrs_map, - const paddle::framework::AttributeMap& default_attrs_map) - : InferVarTypeContext(nullptr, nullptr), - inputs_(inputs), - outputs_(outputs), - attrs_(attrs_map), - default_attrs_(default_attrs_map) {} - - virtual ~TensorRuntimeInferVarTypeContext() {} - - paddle::framework::Attribute GetAttr(const std::string& name) const override { - auto it = attrs_.find(name); - - if (it == attrs_.end()) { - it = default_attrs_.find(name); - if (it == default_attrs_.end()) { - PADDLE_THROW(paddle::platform::errors::NotFound( - "Can not find [%s] in attributes.", name)); - } - } - - return it->second; - } - - bool HasInput(const std::string& name) const override { - auto it = inputs_.find(name); - return (it != inputs_.end() && it->second.size() > 0); - } - - bool HasOutput(const std::string& name) const override { - auto it = outputs_.find(name); - return (it != outputs_.end() && it->second.size() > 0); - } - - size_t InputSize(const std::string& name) const { - return inputs_.at(name).size(); - } - - const std::string& InputVarName(const std::string& name, - const int index = 0) const { - // TODO(jiabin): Support this usage inputs_.at(name)[index]->Name() - auto it = inputs_.find(name); - PADDLE_ENFORCE_NE(it, inputs_.end(), - paddle::platform::errors::PreconditionNotMet( - "Can not find [%s] in Input", name)); - return inputs_.at(name)[index]->name(); - } - - bool InputTypeAnyOf( - const std::string& name, - paddle::framework::proto::VarType::Type type) const override { - auto& inputs = inputs_.at(name); - return std::any_of( - inputs.begin(), inputs.end(), - [&type](const std::shared_ptr& var) { - return paddle::framework::ToVarType(var->Var().Type()) == type; - }); - } - - bool InputTypeAllOf( - const std::string& name, - paddle::framework::proto::VarType::Type type) const override { - auto& inputs = inputs_.at(name); - return std::all_of( - inputs.begin(), inputs.end(), - [&type](const std::shared_ptr& var) { - return paddle::framework::ToVarType(var->Var().Type()) == type; - }); - } - - void SyncTypeAndDataType(const std::string& input_name, - const std::string& output_name, - int index = 0) override { - auto in_tensor = inputs_.at(input_name)[index]; - auto out_tensor = outputs_.at(output_name)[index]; - if (in_tensor != out_tensor) { - this->SetTensorType( - out_tensor, paddle::framework::ToVarType(in_tensor->Var().Type())); - } - } - - void SetOutputType(const std::string& name, - paddle::framework::proto::VarType::Type type, - int index = 0) override { - if (index == paddle::framework::ALL_ELEMENTS) { - for (auto& item : outputs_.at(name)) { - this->SetTensorType(item, type); - } - } else { - auto& var = outputs_.at(name)[index]; - this->SetTensorType(var, type); - } - } - - void SetTensorType(std::shared_ptr out, - 
paddle::framework::proto::VarType::Type type) { - switch (type) { - case paddle::framework::proto::VarType::LOD_TENSOR: { - out->MutableVar()->GetMutable(); - break; - } - case paddle::framework::proto::VarType::SELECTED_ROWS: { - out->MutableVar()->GetMutable(); - break; - } - default: { - PADDLE_THROW(paddle::platform::errors::NotFound( - "Cannot found var type: %s while running runtime InferVarType", - paddle::framework::ToTypeName(type))); - } - } - } - - paddle::framework::proto::VarType::Type GetInputType( - const std::string& name, const int& index = 0) const override { - return paddle::framework::ToVarType(inputs_.at(name)[index]->Var().Type()); - } - - paddle::framework::proto::VarType::Type GetOutputType( - const std::string& name, const int& index = 0) const override { - // TODO(jiabin): Support SelectedRows when we have it. - return paddle::framework::proto::VarType::LOD_TENSOR; - } - - paddle::framework::proto::VarType::Type GetInputDataType( - const std::string& name, const int& index = 0) const override { - return inputs_.at(name)[index] - ->Var() - .Get() - .type(); - } - - void SetOutputDataType(const std::string& name, - paddle::framework::proto::VarType::Type type, - int index = 0) override { - // TODO(jiabin): It seems doesn't make sense to set data_type in EagerMode. - } - - bool IsDygraph() const override { return true; } - - protected: - bool HasVar(const std::string& name) const override { - PADDLE_THROW(paddle::platform::errors::PermissionDenied( - "HasVar is not supported in runtime InferVarType")); - } - - const std::vector& InputVars( - const std::string& name) const override { - PADDLE_THROW(paddle::platform::errors::PermissionDenied( - "InputVars is not supported in runtime InferVarType")); - } - - const std::vector& OutputVars( - const std::string& name) const override { - PADDLE_THROW(paddle::platform::errors::PermissionDenied( - "OutputVars is not supported in runtime InferVarType")); - } - - paddle::framework::proto::VarType::Type GetVarType( - const std::string& name) const override { - PADDLE_THROW(paddle::platform::errors::PermissionDenied( - "Do not manipulate var in runtime InferVarType")); - } - - void SetVarType(const std::string& name, - paddle::framework::proto::VarType::Type type) override { - PADDLE_THROW(paddle::platform::errors::PermissionDenied( - "Do not manipulate var in runtime InferVarType")); - } - - paddle::framework::proto::VarType::Type GetVarDataType( - const std::string& name) const override { - PADDLE_THROW(paddle::platform::errors::PermissionDenied( - "Do not manipulate var in runtime InferVarType")); - } - - void SetVarDataType(const std::string& name, - paddle::framework::proto::VarType::Type type) override { - PADDLE_THROW(paddle::platform::errors::PermissionDenied( - "Do not manipulate var in runtime InferVarType")); - } - - std::vector GetVarDataTypes( - const std::string& name) const override { - PADDLE_THROW(paddle::platform::errors::PermissionDenied( - "GetVarDataTypes is not supported in runtime InferVarType")); - } - - void SetVarDataTypes( - const std::string& name, - const std::vector& - multiple_data_type) override { - PADDLE_THROW(paddle::platform::errors::PermissionDenied( - "SetVarDataTypes is not supported in runtime InferVarType")); - } - - std::vector GetVarShape(const std::string& name) const override { - PADDLE_THROW(paddle::platform::errors::PermissionDenied( - "Do not handle Shape in runtime InferVarType")); - } - - void SetVarShape(const std::string& name, - const std::vector& dims) override { - 
PADDLE_THROW(paddle::platform::errors::PermissionDenied( - "Do not handle Shape in runtime InferVarType")); - } - - int32_t GetVarLoDLevel(const std::string& name) const override { - PADDLE_THROW(paddle::platform::errors::PermissionDenied( - "Do not handle LoDLevel in runtime InferVarType")); - } - - void SetVarLoDLevel(const std::string& name, int32_t lod_level) override { - PADDLE_THROW(paddle::platform::errors::PermissionDenied( - "Do not handle LoDLevel in runtime InferVarType")); - } - - private: - const NameTensorMap& inputs_; - const NameTensorMap& outputs_; - const paddle::framework::AttributeMap& attrs_; - const paddle::framework::AttributeMap& default_attrs_; -}; - -} // namespace legacy -} // namespace egr diff --git a/paddle/fluid/eager/legacy/op_runner.cc b/paddle/fluid/eager/legacy/op_runner.cc deleted file mode 100644 index 4f88346dab9..00000000000 --- a/paddle/fluid/eager/legacy/op_runner.cc +++ /dev/null @@ -1,200 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/eager/legacy/op_runner.h" -#include -#include -#include -#include -#include "paddle/fluid/eager/legacy/amp_auto_cast.h" -#include "paddle/fluid/eager/legacy/infer_var_type_context.h" -#include "paddle/fluid/eager/legacy/prepared_operator.h" -#include "paddle/fluid/eager/legacy/tensor_helper.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/denormal.h" -#include "paddle/fluid/string/string_helper.h" - -DECLARE_bool(use_mkldnn); -DECLARE_string(tracer_mkldnn_ops_on); -DECLARE_string(tracer_mkldnn_ops_off); - -namespace egr { -namespace legacy { - -void OpRunImpl(const paddle::framework::OperatorBase& op, - const NameTensorMap& ins, const NameTensorMap& outs, - const paddle::framework::AttributeMap& attrs, - const paddle::framework::AttributeMap& default_attrs, - const paddle::platform::Place& place) { - VLOG(6) << "Get Opertor With Kernel"; - auto* op_kernel = - dynamic_cast(&op); - PADDLE_ENFORCE_NOT_NULL( - op_kernel, paddle::platform::errors::PermissionDenied( - "Only support operator with kernel in Dygraph mode.")); - auto& info = op.Info(); - if (info.infer_var_type_) { - VLOG(6) << "Run InferVarType"; - egr::legacy::TensorRuntimeInferVarTypeContext infer_var_type_ctx( - ins, outs, attrs, default_attrs); - VLOG(9) << "Actual Run InferVarType"; - info.infer_var_type_(&infer_var_type_ctx); - } - VLOG(6) << "Initialize output tensor"; - // Initialize output tensor - for (auto& tensor_pair : outs) { - for (auto& tensor : tensor_pair.second) { - if (tensor && tensor.get() && (!tensor->Var().IsInitialized())) { - InitializeVariable(tensor->MutableVar(), - paddle::framework::proto::VarType::LOD_TENSOR); - } - } - } - - /** - * [ Why need temporary inputs here? ] - * - * PrepareData should not change original input tensor inplace. 
- * Suppose the user defines a tensor(int), enters an op to execute, - * and then this op rewrites GetExpectedKernelForVar, and converts - * this tensor to float type during execution. After the dynamic - * graph is executed, the user-defined variable will be lost, and - * the user cannot get the originally defined int tensor, because - * it has been converted to float, this should be regarded as a bug - * in certain usage scenarios - * - * In static graph mode, when op is executed, a temporary scope - * `transfer_scope` is created before PrepareData, the data after - * transform is stored in the temporary scope, and then discarded - * after the execution of op, but the original input is directly - * overwritten in the previous dynamic graph implemention. - */ - VLOG(6) << "Prepare Op"; - auto prepared_op = egr::legacy::PreparedOp::Prepare( - ins, outs, *op_kernel, place, attrs, default_attrs); - VLOG(6) << "Prepare Data"; - auto tmp_ins_ptr = - egr::legacy::PrepareData(*op_kernel, ins, prepared_op.kernel_type()); - VLOG(6) << "Run Prepared Op"; - if (tmp_ins_ptr == nullptr) { - prepared_op.Run(ins, outs, attrs, default_attrs); - } else { - prepared_op.Run(*tmp_ins_ptr, outs, attrs, default_attrs); - } - - VLOG(6) << "Run Prepared Op end"; - // TODO(jiabin): Set the output var's grad Forward DataType -} - -void RunOp(const std::string& type, const NameTensorMap& ins, - const NameTensorMap& outs, paddle::framework::AttributeMap attrs, - const paddle::platform::Place& place, - paddle::framework::AttributeMap* default_attrs, - bool override_default_attr_map, - const std::map& inplace_map) { - VLOG(1) << "Run Op: " << type; - if (FLAGS_use_mkldnn) { - // if both lists are empty all ops are enabled (default for - // FLAGS_use_mkldnn=1) - // if ops_on list is not empty only ops from that list are enabled - if (!FLAGS_tracer_mkldnn_ops_on.empty()) { - auto is_on = FLAGS_tracer_mkldnn_ops_on.find(type) != std::string::npos; - attrs["use_mkldnn"] = is_on; - } else { - // if ops_on list is empty all ops are enabled except types from off_list - auto is_off = FLAGS_tracer_mkldnn_ops_off.find(type) != std::string::npos; - attrs["use_mkldnn"] = !is_off; - } - } - auto op = paddle::framework::OpRegistry::CreateOp(type, {}, {}, {}, false); - - PADDLE_ENFORCE_NOT_NULL(default_attrs, - paddle::platform::errors::PermissionDenied( - "Detected default_attrs = nullptr.")); - - if (override_default_attr_map) { - const auto& op_info = op->Info(); - auto* attr_checker = op_info.Checker(); - if (attr_checker) { - attr_checker->Check(&attrs, true, /*only_check_exist_value=*/true); - } - - static paddle::framework::AttributeMap empty_attrs_map = {}; - *default_attrs = attr_checker == nullptr - ? 
empty_attrs_map - : attr_checker->GetDefaultAttrMap(); - } - - auto amp_level = egr::Controller::Instance().GetAMPLevel(); - VLOG(6) << "Check AMP status"; - NameTensorMap new_ins = ins; - if (amp_level == paddle::imperative::AmpLevel::O1) { - VLOG(5) << "Auto mixed precision run operator: " << type; - new_ins = AutoCastInputs(type, ins); - } else if (amp_level == paddle::imperative::AmpLevel::O2) { - VLOG(5) << "Pure fp16 run operator: " << type; - new_ins = CastPureFp16Inputs(type, ins); - } - - try { - VLOG(6) << "Get Device id"; - if (paddle::platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - paddle::platform::SetDeviceId(place.device); -#else - PADDLE_THROW(paddle::platform::errors::PreconditionNotMet( - "PaddlePaddle should compile with GPU if use CUDAPlace.")); -#endif - } else if (paddle::platform::is_xpu_place(place)) { -#ifdef PADDLE_WITH_XPU - paddle::platform::SetXPUDeviceId(place.device); -#else - PADDLE_THROW(paddle::platform::errors::PreconditionNotMet( - "PaddlePaddle should compile with XPU if use XPUPlace.")); -#endif - } else if (paddle::platform::is_npu_place(place)) { -#ifdef PADDLE_WITH_ASCEND_CL - paddle::platform::SetNPUDeviceId(place.device); -#else - PADDLE_THROW(paddle::platform::errors::PreconditionNotMet( - "PaddlePaddle should compile with NPU if use NPUPlace.")); -#endif - } - VLOG(6) << "Step in OpRunImpl"; - OpRunImpl(*op, new_ins, outs, attrs, *default_attrs, place); - } catch (paddle::platform::EnforceNotMet& exception) { - paddle::framework::AppendErrorOpHint(type, &exception); - throw std::move(exception); - } catch (std::exception& ex) { - PADDLE_THROW(paddle::platform::errors::Fatal( - "Operator %s raises an %s exception.\n" - "The exception content is\n:%s.", - type, paddle::platform::demangle(typeid(ex).name()), ex.what())); - } catch (...) { - // NOTE: this branch represents a very serious bug with - // low probability of occurrence, and we can't get its - // exception content here. - PADDLE_THROW(paddle::platform::errors::Fatal( - "Operator %s raises an unknown exception.", type)); - } - VLOG(6) << "Finish Run Op"; - // TODO(jiabin): Support this later - // if (enable_program_desc_tracing_) { - // VLOG(5) << "Trace op " << type << " into ProgramDesc"; - // program_desc_tracer_->InsertOp(type, new_ins, outs, attrs); - // } -} - -} // namespace legacy -} // namespace egr diff --git a/paddle/fluid/eager/legacy/op_runner.h b/paddle/fluid/eager/legacy/op_runner.h deleted file mode 100644 index 34bd0782dd9..00000000000 --- a/paddle/fluid/eager/legacy/op_runner.h +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
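The "[ Why need temporary inputs here? ]" note in the removed OpRunImpl above argues that data preparation must never overwrite the caller's tensors in place; instead a temporary input map is allocated lazily and only the transformed slots are repointed, which is why the code runs either with the original `ins` or with `*tmp_ins_ptr`. Below is a minimal standalone sketch of that copy-on-transform pattern; the types and names (ToyTensor, NameMap, the dtype strings) are illustrative stand-ins, not Paddle's NameTensorMap API.

#include <iostream>
#include <map>
#include <memory>
#include <string>
#include <vector>

// Toy stand-in for a named tensor: just a dtype tag plus a value.
struct ToyTensor { std::string dtype; double value; };
using NameMap = std::map<std::string, std::vector<std::shared_ptr<ToyTensor>>>;

// Returns nullptr when no transform was needed (caller keeps the original map);
// otherwise returns a copy of the map in which only transformed slots point at
// freshly allocated tensors, so the user's original tensors stay untouched.
std::shared_ptr<NameMap> PrepareData(const NameMap& ins,
                                     const std::string& expected_dtype) {
  std::shared_ptr<NameMap> tmp_ins;  // allocated lazily, like tmp_ins_ptr above
  for (const auto& pair : ins) {
    for (size_t i = 0; i < pair.second.size(); ++i) {
      const auto& t = pair.second[i];
      if (t->dtype == expected_dtype) continue;               // nothing to do
      if (!tmp_ins) tmp_ins = std::make_shared<NameMap>(ins);  // copy of handles
      auto casted = std::make_shared<ToyTensor>(*t);
      casted->dtype = expected_dtype;  // "transform" the copy only
      (*tmp_ins)[pair.first][i] = casted;
    }
  }
  return tmp_ins;
}

int main() {
  NameMap ins{{"X", {std::make_shared<ToyTensor>(ToyTensor{"int64", 3})}}};
  auto tmp = PrepareData(ins, "float32");
  std::cout << "original dtype: " << ins["X"][0]->dtype        // still int64
            << ", kernel sees: " << (*tmp)["X"][0]->dtype << "\n";  // float32
}

The same idea explains the nullptr check in OpRunImpl: when no dtype transform happened, the original map is reused and no copy cost is paid.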
- -#pragma once -#include "paddle/fluid/eager/legacy/type_def.h" -#include "paddle/fluid/imperative/jit/program_desc_tracer.h" -#include "paddle/pten/core/tensor_meta.h" - -namespace egr { -namespace legacy { - -void RunOp(const std::string& type, const NameTensorMap& ins, - const NameTensorMap& outs, paddle::framework::AttributeMap attrs, - const paddle::platform::Place& place, - paddle::framework::AttributeMap* default_attrs, - bool override_default_attr_map, - const std::map& inplace_map = {}); - -} // namespace legacy -} // namespace egr diff --git a/paddle/fluid/eager/legacy/prepared_operator.cc b/paddle/fluid/eager/legacy/prepared_operator.cc deleted file mode 100644 index fcdf4162685..00000000000 --- a/paddle/fluid/eager/legacy/prepared_operator.cc +++ /dev/null @@ -1,364 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/eager/legacy/prepared_operator.h" -#include "paddle/fluid/imperative/prepared_operator.h" - -#include "paddle/fluid/eager/legacy/infer_shape_context.h" -#include "paddle/fluid/framework/data_type_transform.h" -#include "paddle/fluid/framework/details/nan_inf_utils.h" -#include "paddle/fluid/framework/pten_utils.h" -#include "paddle/utils/small_vector.h" -#ifdef PADDLE_WITH_XPU -#include "paddle/fluid/platform/device/xpu/xpu_op_list.h" -#endif -DECLARE_bool(check_nan_inf); -DECLARE_bool(run_pten_kernel); - -namespace egr { -namespace legacy { - -const paddle::framework::Tensor* GetTensorFromVar( - const paddle::framework::Variable& var) { - if (var.IsType()) { - return &(var.Get()); - } else if (var.IsType()) { - return &(var.Get().value()); - } else { - return nullptr; - } -} - -static const paddle::framework::Attribute& GetAttr( - const paddle::framework::AttributeMap& attrs, - const paddle::framework::AttributeMap& default_attrs, - const std::string& name) { - auto it = attrs.find(name); - bool found = it != attrs.end(); - if (!found) { - it = default_attrs.find(name); - found = it != default_attrs.end(); - } - PADDLE_ENFORCE_EQ(found, true, - paddle::platform::errors::NotFound( - "(%s) is not found in AttributeMap.", name)); - return it->second; -} - -static void HandleComplexGradToRealGrad(const NameTensorMap& outs) { - // TODO(jiabin): Support complex forward datatype later. 
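Both the removed GetAttr helper in prepared_operator.cc and the InferVarType context earlier in this patch resolve an attribute by checking the op's explicit attributes first and the checker-generated defaults second, raising NotFound only when both maps miss. A small self-contained illustration of that fallback lookup, with std::map standing in for paddle::framework::AttributeMap (names here are hypothetical):

#include <iostream>
#include <map>
#include <stdexcept>
#include <string>

using AttrMap = std::map<std::string, int>;  // toy stand-in for AttributeMap

// Look in the explicit attrs first, then in the defaults; throw if missing,
// mirroring the NotFound error raised by the removed GetAttr helper.
const int& GetAttr(const AttrMap& attrs, const AttrMap& default_attrs,
                   const std::string& name) {
  auto it = attrs.find(name);
  if (it == attrs.end()) {
    it = default_attrs.find(name);
    if (it == default_attrs.end())
      throw std::out_of_range("(" + name + ") is not found in AttributeMap.");
  }
  return it->second;
}

int main() {
  AttrMap attrs{{"axis", 1}};
  AttrMap defaults{{"axis", 0}, {"use_mkldnn", 0}};
  std::cout << GetAttr(attrs, defaults, "axis") << "\n";        // 1, explicit wins
  std::cout << GetAttr(attrs, defaults, "use_mkldnn") << "\n";  // 0, from defaults
}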
-} - -PreparedOp::PreparedOp( - const paddle::framework::OperatorBase& op, - const paddle::framework::RuntimeContext& ctx, - const paddle::framework::OpKernelType& kernel_type, - const paddle::framework::OperatorWithKernel::OpKernelFunc& func, - paddle::platform::DeviceContext* dev_ctx) - : op_(op), - ctx_(ctx), - kernel_type_(kernel_type), - func_(func), - dev_ctx_(dev_ctx) {} - -PreparedOp::PreparedOp( - const paddle::framework::OperatorBase& op, - const paddle::framework::RuntimeContext& ctx, - const paddle::framework::OpKernelType& kernel_type, - const paddle::framework::KernelSignature& kernel_signature, - const pten::Kernel& pt_kernel, paddle::platform::DeviceContext* dev_ctx) - : op_(op), - ctx_(ctx), - kernel_type_(kernel_type), - func_(nullptr), - dev_ctx_(dev_ctx), - run_pten_kernel_(true), - pt_kernel_signature_(kernel_signature), - pt_kernel_(pt_kernel) {} - -PreparedOp PrepareImpl(const NameTensorMap& ins, const NameTensorMap& outs, - const paddle::framework::OperatorWithKernel& op, - const paddle::platform::Place& place, - const paddle::framework::AttributeMap& attrs, - const paddle::framework::AttributeMap& default_attrs) { - VLOG(6) << "Preparing an Op"; - paddle::platform::DeviceContextPool& pool = - paddle::platform::DeviceContextPool::Instance(); - auto* dev_ctx = pool.Get(place); - - paddle::framework::RuntimeContext ctx({}, {}); - -#ifdef PADDLE_WITH_MKLDNN - // MKLDNN variant of code reads attributes in some of GetKernelTypeForVar and - // GetKernelType functions, so we need to copy the attributes there. - // Const qualifier of Attrs had to be discarded to overwrite it. - if (FLAGS_use_mkldnn) { - auto& mutable_op_attrs = - const_cast(op.Attrs()); - mutable_op_attrs = default_attrs; - for (auto& attr : attrs) { - mutable_op_attrs[attr.first] = attr.second; - } - } -#endif - - // 1. get expected kernel key - auto dygraph_exe_ctx = egr::legacy::EagerExecutionContext( - op, paddle::framework::Scope(), *dev_ctx, ctx, ins, outs, attrs, - default_attrs); - auto expected_kernel_key = op.GetExpectedKernelType(dygraph_exe_ctx); - VLOG(3) << "expected_kernel_key:" << expected_kernel_key; - - // fit for pten - pten::KernelSignature pt_kernel_signature; - pten::KernelKey pt_kernel_key; - std::string pt_kernel_name; - if (pten::KernelFactory::Instance().HasCompatiblePtenKernel(op.Type())) { - pt_kernel_signature = op.GetExpectedPtenKernelArgs(dygraph_exe_ctx); - VLOG(6) << pt_kernel_signature; - - pt_kernel_name = pt_kernel_signature.name; - pt_kernel_key = TransOpKernelTypeToPtenKernelKey(expected_kernel_key); - auto pt_kernel = pten::KernelFactory::Instance().SelectKernel( - pt_kernel_name, pt_kernel_key); - - if (pt_kernel.IsValid()) { - VLOG(6) << "Dynamic mode PrepareImpl - kernel name: " << pt_kernel_name - << " | kernel key: " << pt_kernel_key - << " | kernel: " << pt_kernel; - - // TODO(chenweihang): using CPUKernel when miss device kernel case - return PreparedOp(op, ctx, expected_kernel_key, pt_kernel_signature, - pt_kernel, dev_ctx); - } else { - VLOG(6) << "Dynamic mode ChoosePtenKernel - kernel `" << pt_kernel_name - << "` not found."; - } - } - - // 2. check if op[type] has kernel registered. 
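The removed PrepareImpl above first asks the kernel registry for a kernel matching the expected key for the requested place, and the hunk that follows falls back to a CPU key when no device kernel is registered before finally erroring out. A condensed standalone sketch of that try-device-then-CPU lookup; the registry and key types are simplified stand-ins, not Paddle's KernelFactory API.

#include <functional>
#include <iostream>
#include <map>
#include <optional>
#include <string>
#include <utility>

enum class Place { kCPU, kGPU };
using KernelKey = std::pair<std::string, Place>;            // (op name, place)
using Registry = std::map<KernelKey, std::function<void()>>;

// Prefer the kernel registered for the requested place; if it is missing,
// retry with a CPU key, mirroring the FallBackToCpu path in this patch.
std::optional<std::function<void()>> SelectKernel(const Registry& reg,
                                                  const std::string& op,
                                                  Place place) {
  if (auto it = reg.find({op, place}); it != reg.end()) return it->second;
  if (auto it = reg.find({op, Place::kCPU}); it != reg.end()) return it->second;
  return std::nullopt;  // caller raises "no kernels registered" at this point
}

int main() {
  Registry reg{{{"scale", Place::kCPU}, [] { std::cout << "scale on CPU\n"; }}};
  if (auto k = SelectKernel(reg, "scale", Place::kGPU)) (*k)();  // falls back to CPU
}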
- auto& all_op_kernels = op.AllOpKernels(); - auto kernels_iter = all_op_kernels.find(op.Type()); - - if (kernels_iter == all_op_kernels.end() || - kernels_iter->second.find(expected_kernel_key) == - kernels_iter->second.end() -#ifdef PADDLE_WITH_XPU - || - paddle::platform::is_xpu_place(expected_kernel_key.place_) && - !paddle::platform::is_xpu_support_op(op.Type(), - expected_kernel_key) || - paddle::platform::is_in_xpu_black_list(op.Type()) -#endif - ) { - if (pten::KernelFactory::Instance().HasCompatiblePtenKernel(op.Type())) { - auto pt_cpu_kernel_key = - FallBackToCpu(expected_kernel_key, pt_kernel_key, op); - auto pt_cpu_kernel = pten::KernelFactory::Instance().SelectKernel( - pt_kernel_name, pt_cpu_kernel_key); - if (pt_cpu_kernel.IsValid()) { - VLOG(6) << "Dynamic mode PrepareImpl - kernel name: " << pt_kernel_name - << " | kernel key: " << pt_cpu_kernel_key - << " | kernel: " << pt_cpu_kernel; - return PreparedOp(op, ctx, expected_kernel_key, pt_kernel_signature, - pt_cpu_kernel, dev_ctx); - } - } - } - - PADDLE_ENFORCE_NE( - kernels_iter, all_op_kernels.end(), - paddle::platform::errors::NotFound( - "There are no kernels which are registered in the %s operator.", - op.Type())); - auto& kernels = kernels_iter->second; - auto kernel_iter = kernels.find(expected_kernel_key); - -#ifdef PADDLE_WITH_XPU - if (paddle::platform::is_xpu_place(expected_kernel_key.place_) && - (kernel_iter == kernels.end() || - !paddle::platform::is_xpu_support_op(op.Type(), expected_kernel_key) || - paddle::platform::is_in_xpu_black_list(op.Type()))) { - VLOG(3) << "missing XPU kernel: " << op.Type() - << ", expected_kernel_key:" << expected_kernel_key - << ", fallbacking to CPU one!"; - expected_kernel_key.place_ = paddle::platform::CPUPlace(); - kernel_iter = kernels.find(expected_kernel_key); - } -#endif -#ifdef PADDLE_WITH_ASCEND_CL - if (kernel_iter == kernels.end() && - paddle::platform::is_npu_place(expected_kernel_key.place_)) { - VLOG(3) << "missing NPU kernel: " << op.Type() - << ", expected_kernel_key:" << expected_kernel_key - << ", fallbacking to CPU one!"; - expected_kernel_key.place_ = paddle::platform::CPUPlace(); - kernel_iter = kernels.find(expected_kernel_key); - } -#endif - // TODO(jiabin): Add operator.cc's line 1000 part back when we need that - // case - PADDLE_ENFORCE_NE(kernel_iter, kernels.end(), - paddle::platform::errors::NotFound( - "Operator %s does not have kernel for %s.", op.Type(), - KernelTypeToString(expected_kernel_key))); - - if (!(expected_kernel_key.place_ == place)) { - dev_ctx = pool.Get(expected_kernel_key.place_); - } - VLOG(6) << "Construct Prepared Op"; - return PreparedOp(op, ctx, expected_kernel_key, kernel_iter->second, dev_ctx); -} - -PreparedOp PreparedOp::Prepare( - const NameTensorMap& ins, const NameTensorMap& outs, - const paddle::framework::OperatorWithKernel& op, - const paddle::platform::Place& place, - const paddle::framework::AttributeMap& attrs, - const paddle::framework::AttributeMap& default_attrs) { - return PrepareImpl(ins, outs, op, place, attrs, default_attrs); -} - -static void PreparedOpRunImpl( - const paddle::framework::OperatorBase& op, - const paddle::framework::RuntimeContext& ctx, - const paddle::framework::OpKernelType& kernel_type, - const paddle::framework::OperatorWithKernel::OpKernelFunc& func, - paddle::platform::DeviceContext* dev_ctx, const NameTensorMap& ins, - const NameTensorMap& outs, const paddle::framework::AttributeMap& attrs, - const paddle::framework::AttributeMap& default_attrs) { - // TODO(zjl): remove scope in 
dygraph - VLOG(6) << "Runing Prepared Op"; - paddle::framework::Scope scope; - - EagerInferShapeContext infer_shape_ctx(&ins, &outs, &attrs, &default_attrs, - op.Type(), &kernel_type); - op.Info().infer_shape_(&infer_shape_ctx); - - func(EagerExecutionContext(op, scope, *dev_ctx, ctx, ins, outs, attrs, - default_attrs)); - - if (FLAGS_check_nan_inf) { - paddle::framework::details::CheckOpHasNanOrInfInEager( - op.Type(), outs, dev_ctx->GetPlace()); - } - - /** - * [ Why need handle complex gradient to real gradient? ] - * - * After the introduction of complex number calculations, Ops that support - * complex number calculations generally support type promotion, such as - * x(float32) + y(complex64) = out(complex64), then the type of the grad - * tensor should be dout(complex64), dx(float32), dy (complex64). - * - * But because the dout is complex64, the dx is also complex64 after - * grad op kernel executed, we need to recognize this situation and - * convert dx to float32 type. HandleComplexGradToRealGrad does this thing. - */ - if (paddle::framework::IsComplexType(kernel_type.data_type_)) { - HandleComplexGradToRealGrad(outs); - } - VLOG(6) << "Finish Runing Prepared Op"; -} - -static void PreparedOpRunPtImpl( - const paddle::framework::OperatorBase& op, - const paddle::framework::OpKernelType& kernel_type, - const paddle::framework::KernelSignature& pt_kernel_signature, - const pten::Kernel& pt_kernel, paddle::platform::DeviceContext* dev_ctx, - const NameTensorMap& ins, const NameTensorMap& outs, - const paddle::framework::AttributeMap& attrs, - const paddle::framework::AttributeMap& default_attrs) { - EagerInferShapeContext infer_shape_ctx(&ins, &outs, &attrs, &default_attrs, - op.Type()); - static_cast(op).InferShape( - &infer_shape_ctx); - - paddle::imperative::PreparePtenData( - pt_kernel, pt_kernel_signature, - static_cast(ins)); - - pten::KernelContext pt_kernel_context; - paddle::imperative::BuildDygraphPtenKernelContext( - pt_kernel_signature, pt_kernel, - static_cast(ins), - static_cast(outs), attrs, - default_attrs, dev_ctx, &pt_kernel_context); - - pt_kernel(&pt_kernel_context); - - // TODO(chenweihang): add debug flags later - // TODO(chenweihang): deal with complex cases later -} - -void PreparedOp::Run(const NameTensorMap& ins, const NameTensorMap& outs, - const paddle::framework::AttributeMap& attrs, - const paddle::framework::AttributeMap& default_attrs) { - if (run_pten_kernel_) { - PreparedOpRunPtImpl(op_, kernel_type_, pt_kernel_signature_, pt_kernel_, - dev_ctx_, ins, outs, attrs, default_attrs); - } else { - PreparedOpRunImpl(op_, ctx_, kernel_type_, func_, dev_ctx_, ins, outs, - attrs, default_attrs); - } -} - -std::shared_ptr PrepareData( - const paddle::framework::OperatorWithKernel& op, const NameTensorMap& ins, - const paddle::framework::OpKernelType& expected_kernel_key) { - std::shared_ptr tmp_ins_ptr = nullptr; - for (const auto& name_pair : ins) { - for (size_t i = 0; i < name_pair.second.size(); ++i) { - auto& egr_tensor = name_pair.second[i]; - const auto* tensor = GetTensorFromVar(egr_tensor->Var()); - if (tensor && tensor->IsInitialized()) { - auto kernel_type_for_var = op.GetKernelTypeForVar( - name_pair.first, *tensor, expected_kernel_key); - if (!NeedTransform(kernel_type_for_var, expected_kernel_key)) { - continue; - } else { - // TODO(jiabin): Support Cache later - VLOG(3) << "Transform Variable " << egr_tensor->name() << " from " - << kernel_type_for_var << " to " << expected_kernel_key; - paddle::framework::Tensor out; - 
TransformData(expected_kernel_key, kernel_type_for_var, *tensor, - &out); - if (NeedTransformDataType(kernel_type_for_var, expected_kernel_key)) { - // To avoid NameVarMap copy construction overhead in general - // scenarios, if inplace transformed, return original input - // directly - if (tmp_ins_ptr == nullptr) { - tmp_ins_ptr = std::make_shared(ins); - } - auto tmp_egr_tensor = - std::make_shared(egr_tensor->name()); - SetTensorToVariable(egr_tensor->Var(), out, - tmp_egr_tensor->MutableVar()); - (*tmp_ins_ptr)[name_pair.first][i] = tmp_egr_tensor; - } else { - // if dtype is same, transform inplace will not change the - // original - // value, transform inplace to avoid multiple copy - SetTensorToVariable(egr_tensor->Var(), out, - egr_tensor->MutableVar()); - } - } - } - } - } - return tmp_ins_ptr; -} - -} // namespace legacy -} // namespace egr diff --git a/paddle/fluid/eager/legacy/prepared_operator.h b/paddle/fluid/eager/legacy/prepared_operator.h deleted file mode 100644 index c0cb56d99dc..00000000000 --- a/paddle/fluid/eager/legacy/prepared_operator.h +++ /dev/null @@ -1,96 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include -#include - -#include "paddle/fluid/eager/legacy/execution_context.h" -#include "paddle/fluid/eager/legacy/type_def.h" -#include "paddle/fluid/framework/data_transform.h" -#include "paddle/fluid/framework/op_kernel_type.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/type_defs.h" - -DECLARE_bool(use_mkldnn); - -namespace paddle { -namespace framework { -class Variable; -} // namespace framework -} // namespace paddle - -namespace pten { -class DenseTensor; -} // namespace pten - -namespace egr { -namespace legacy { - -const paddle::framework::Tensor* GetTensorFromVar( - const paddle::framework::Variable& var); - -std::shared_ptr PrepareData( - const paddle::framework::OperatorWithKernel& op, const NameTensorMap& ins, - const paddle::framework::OpKernelType& expected_kernel_key); - -class PreparedOp { - public: - PreparedOp(const paddle::framework::OperatorBase& op, - const paddle::framework::RuntimeContext& ctx, - const paddle::framework::OpKernelType& kernel_type, - const paddle::framework::OperatorWithKernel::OpKernelFunc& func, - paddle::platform::DeviceContext* dev_ctx); - - PreparedOp(const paddle::framework::OperatorBase& op, - const paddle::framework::RuntimeContext& ctx, - const paddle::framework::OpKernelType& kernel_type, - const paddle::framework::KernelSignature& kernel_signature, - const pten::Kernel& pt_kernel, - paddle::platform::DeviceContext* dev_ctx); - - static PreparedOp Prepare( - const NameTensorMap& ins, const NameTensorMap& outs, - const paddle::framework::OperatorWithKernel& op, - const paddle::platform::Place& place, - const paddle::framework::AttributeMap& attrs, - const paddle::framework::AttributeMap& default_attrs); - - void Run(const NameTensorMap& in, const NameTensorMap& 
out, - const paddle::framework::AttributeMap& attrs, - const paddle::framework::AttributeMap& default_attrs); - - const paddle::framework::OpKernelType& kernel_type() const { - return kernel_type_; - } - - private: - const paddle::framework::OperatorBase& op_; - const paddle::framework::RuntimeContext& ctx_; - paddle::framework::OpKernelType kernel_type_; - paddle::framework::OperatorWithKernel::OpKernelFunc func_; - paddle::platform::DeviceContext* dev_ctx_; - - // NOTE(chenweihang): Similar op members are used to adapt to - // new pten kernel, if there is a better design in the future, - // we may polish the implementation here - bool run_pten_kernel_{false}; - paddle::framework::KernelSignature pt_kernel_signature_; - pten::Kernel pt_kernel_; -}; - -} // namespace legacy -} // namespace egr diff --git a/paddle/fluid/eager/legacy/tensor_helper.cc b/paddle/fluid/eager/legacy/tensor_helper.cc deleted file mode 100644 index fbf3205be2f..00000000000 --- a/paddle/fluid/eager/legacy/tensor_helper.cc +++ /dev/null @@ -1,114 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/eager/legacy/tensor_helper.h" - -#include "paddle/fluid/framework/feed_fetch_type.h" -#include "paddle/fluid/framework/lod_rank_table.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/lod_tensor_array.h" -#include "paddle/fluid/framework/reader.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows_utils.h" -#include "paddle/fluid/framework/var_type_traits.h" -#include "paddle/fluid/platform/place.h" - -namespace egr { -namespace legacy { - -void InitializeVariable(paddle::framework::Variable *var, - paddle::framework::proto::VarType::Type var_type) { - if (var_type == paddle::framework::proto::VarType::LOD_TENSOR) { - var->GetMutable(); - } else if (var_type == paddle::framework::proto::VarType::SELECTED_ROWS) { - var->GetMutable(); - } else if (var_type == paddle::framework::proto::VarType::FEED_MINIBATCH) { - var->GetMutable(); - } else if (var_type == paddle::framework::proto::VarType::FETCH_LIST) { - var->GetMutable(); - } else if (var_type == paddle::framework::proto::VarType::STEP_SCOPES) { - var->GetMutable>(); - } else if (var_type == paddle::framework::proto::VarType::LOD_RANK_TABLE) { - var->GetMutable(); - } else if (var_type == paddle::framework::proto::VarType::LOD_TENSOR_ARRAY) { - var->GetMutable(); - } else if (var_type == paddle::framework::proto::VarType::STRINGS) { - var->GetMutable(); - } else if (var_type == paddle::framework::proto::VarType::VOCAB) { - var->GetMutable(); - } else if (var_type == paddle::framework::proto::VarType::PLACE_LIST) { - var->GetMutable(); - } else if (var_type == paddle::framework::proto::VarType::READER) { - var->GetMutable(); - } else if (var_type == paddle::framework::proto::VarType::RAW) { - // GetMutable will be called in operator - } else { - PADDLE_THROW(paddle::platform::errors::Unavailable( - 
"paddle::framework::Variable type %d is not in " - "[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, " - "LOD_RANK_TABLE, PLACE_LIST, READER, RAW].", - var_type)); - } -} - -void CopyVariable(const paddle::framework::Variable &src_var, - paddle::framework::Variable *dst_var) { - // only support cpu now - auto cpu_place = paddle::platform::CPUPlace(); - - if (src_var.IsType()) { - auto *tmp_grad_tensor = dst_var->GetMutable(); - auto &src_tensor = src_var.Get(); - tmp_grad_tensor->set_lod(src_tensor.lod()); - paddle::framework::TensorCopy(src_tensor, cpu_place, tmp_grad_tensor); - } else if (src_var.IsType()) { - auto &src_slr = src_var.Get(); - auto *tmp_grad_slr = dst_var->GetMutable(); - tmp_grad_slr->set_rows(src_slr.rows()); - tmp_grad_slr->set_height(src_slr.height()); - auto &src_t = src_slr.value(); - auto *dst_t = tmp_grad_slr->mutable_value(); - paddle::framework::TensorCopy(src_t, cpu_place, dst_t); - } else { - PADDLE_THROW(paddle::platform::errors::Unavailable( - "Unknown variable type to copy.")); - } -} -paddle::framework::proto::VarType::Type GetDtypeFromVar( - const paddle::framework::Variable &var) { - if (var.IsType()) { - return var.Get().type(); - } else if (var.IsType()) { - return var.Get().value().type(); - } else { - PADDLE_THROW(paddle::platform::errors::InvalidArgument( - "Variable type is %s, expect LoDTensor or SelectedRows.", - paddle::framework::ToTypeName(var.Type()))); - } -} -const paddle::platform::Place &GetPlaceFromVar( - const paddle::framework::Variable &var) { - if (var.IsType()) { - return var.Get().place(); - } else if (var.IsType()) { - return var.Get().place(); - } else { - PADDLE_THROW(paddle::platform::errors::InvalidArgument( - "Variable type is %s, expect LoDTensor or SelectedRows.", - paddle::framework::ToTypeName(var.Type()))); - } -} - -} // namespace legacy -} // namespace egr diff --git a/paddle/fluid/eager/legacy/tensor_helper.h b/paddle/fluid/eager/legacy/tensor_helper.h deleted file mode 100644 index ce407f8965a..00000000000 --- a/paddle/fluid/eager/legacy/tensor_helper.h +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include "paddle/fluid/eager/eager_tensor.h" -#include "paddle/pten/api/all.h" -namespace egr { -namespace legacy { - -void InitializeVariable(paddle::framework::Variable* var, - paddle::framework::proto::VarType::Type var_type); -paddle::framework::proto::VarType::Type GetDtypeFromVar( - const paddle::framework::Variable& var); -const paddle::platform::Place& GetPlaceFromVar( - const paddle::framework::Variable& var); -void CopyVariable(const paddle::framework::Variable& src_var, - paddle::framework::Variable* dst_var); - -} // namespace legacy -} // namespace egr diff --git a/paddle/fluid/eager/legacy/type_def.h b/paddle/fluid/eager/legacy/type_def.h deleted file mode 100644 index c209c48e384..00000000000 --- a/paddle/fluid/eager/legacy/type_def.h +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include -#include - -#include "paddle/fluid/platform/macros.h" -namespace egr { - -class EagerTensor; - -namespace legacy { - -namespace details { -template -struct NameVarMapTrait {}; - -template <> -struct NameVarMapTrait { - using Type = - std::map>>; -}; -} // namespace details -template -using NameMap = typename details::NameVarMapTrait::Type; - -using NameTensorMap = NameMap; - -} // namespace legacy -} // namespace egr diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc index c100e3b70f3..93cd0d1338f 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc @@ -36,11 +36,6 @@ using namespace egr; // NOLINT using namespace egr_utils_api; // NOLINT -// Disable pten path -DECLARE_bool(run_pten_kernel); - -TEST(Benchmark, Init) { FLAGS_run_pten_kernel = false; } - TEST(Benchmark, EagerScaleCPU) { // Prepare Device Contexts eager_test::InitEnv(paddle::platform::CPUPlace()); diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc index c8f4b1b32e4..2df44bfcab5 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc @@ -35,10 +35,6 @@ using namespace egr; // NOLINT using namespace egr_utils_api; // NOLINT -DECLARE_bool(run_pten_kernel); - -TEST(Benchmark, Init) { FLAGS_run_pten_kernel = false; } - #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) TEST(Benchmark, EagerScaleCUDA) { diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc index 68e7512eedb..b2a96468ece 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc @@ -34,11 +34,6 @@ #include 
"gperftools/profiler.h" #endif -// Disable pten path -DECLARE_bool(run_pten_kernel); - -TEST(Benchmark, Init) { FLAGS_run_pten_kernel = false; } - namespace paddle { namespace imperative { diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc index 50423b5a64f..7f8b845b070 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc @@ -34,11 +34,6 @@ #include "gperftools/profiler.h" #endif -// Disable pten path -DECLARE_bool(run_pten_kernel); - -TEST(Benchmark, Init) { FLAGS_run_pten_kernel = false; } - #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) namespace paddle { diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc index e05a63a69d0..b50d7713d30 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc @@ -214,7 +214,7 @@ void benchmark_fluid_scale(const std::shared_ptr& X, {std::shared_ptr( new imperative::VarBase(true, "Out"))}}}; - tracer.TraceOp("scale", ins, outs, attrs, place, true); + tracer.TraceOp("scale", ins, outs, attrs, place, true); tmp_out = outs["Out"][0]; } @@ -250,7 +250,7 @@ void benchmark_fluid_matmul(const std::shared_ptr& X, {std::shared_ptr( new imperative::VarBase(true, "Out"))}}}; - tracer.TraceOp("matmul_v2", ins, outs, attrs, place, true); + tracer.TraceOp("matmul_v2", ins, outs, attrs, place, true); tmp_out = outs["Out"][0]; } @@ -288,7 +288,7 @@ void benchmark_fluid_mlp( {std::shared_ptr( new imperative::VarBase(true, "Out"))}}}; - tracer.TraceOp("matmul_v2", ins, outs, attrs, place, true); + tracer.TraceOp("matmul_v2", ins, outs, attrs, place, true); // EW-Add0 ins = {{"X", outs["Out"]}, {"Y", {Bs[i]}}}; @@ -296,7 +296,7 @@ void benchmark_fluid_mlp( {std::shared_ptr( new imperative::VarBase(true, "Out"))}}}; - tracer.TraceOp("elementwise_add", ins, outs, attrs, place, true); + tracer.TraceOp("elementwise_add", ins, outs, attrs, place, true); input0 = outs["Out"][0]; } @@ -307,7 +307,7 @@ void benchmark_fluid_mlp( new imperative::VarBase(true, "Out"))}}}; attrs = {{"reduce_all", true}}; - tracer.TraceOp("reduce_sum", ins, outs, attrs, place, true); + tracer.TraceOp("reduce_sum", ins, outs, attrs, place, true); auto* engine = tracer.GetEngine(); std::vector> grad_tensors{nullptr}; diff --git a/paddle/fluid/eager/utils.cc b/paddle/fluid/eager/utils.cc index 98e6a8fc5d2..88030d91bf9 100644 --- a/paddle/fluid/eager/utils.cc +++ b/paddle/fluid/eager/utils.cc @@ -286,4 +286,43 @@ void EagerUtils::CheckAndRetainGrad( } } +paddle::experimental::Tensor EagerUtils::SyncToPtenTensors( + const egr::EagerTensor& tensor) { + const_cast(&tensor)->SyncToTensor(); + return *tensor.Tensor().get(); +} + +std::vector EagerUtils::SyncToPtenTensors( + const std::vector& tensors) { + std::vector res; + size_t num = tensors.size(); + res.reserve(num); + for (size_t i = 0; i < num; i++) { + const_cast(&(tensors[i]))->SyncToTensor(); + res.push_back(*tensors[i].Tensor().get()); + } + return res; +} + +egr::EagerTensor EagerUtils::CreateEagerTensorFromTensor( + const paddle::experimental::Tensor& tensor) { + egr::EagerTensor ret; + ret.set_tensor(std::make_shared(tensor)); + return ret; +} + +std::vector EagerUtils::CreateEagerTensorFromTensor( + const std::vector& tensors) { + std::vector res; + size_t num = 
tensors.size(); + res.reserve(num); + for (size_t i = 0; i < num; i++) { + egr::EagerTensor tmp; + tmp.set_tensor(std::make_shared(tensors[i])); + res.emplace_back(std::move(tmp)); + } + + return res; +} + } // namespace egr diff --git a/paddle/fluid/eager/utils.h b/paddle/fluid/eager/utils.h index ef3ecf27c3c..73839d34ec2 100644 --- a/paddle/fluid/eager/utils.h +++ b/paddle/fluid/eager/utils.h @@ -170,6 +170,16 @@ class EagerUtils { static void CheckAndRetainGrad(const egr::EagerTensor& tensor); static void CheckAndRetainGrad(const std::vector& tensors); + + static paddle::experimental::Tensor SyncToPtenTensors( + const egr::EagerTensor& tensor); + static std::vector SyncToPtenTensors( + const std::vector& tensors); + + static egr::EagerTensor CreateEagerTensorFromTensor( + const paddle::experimental::Tensor& tensor); + static std::vector CreateEagerTensorFromTensor( + const std::vector& tensors); }; } // namespace egr diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index de3a957df08..95c814380e3 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -293,7 +293,7 @@ if(WITH_DISTRIBUTE) ps_gpu_trainer.cc downpour_worker.cc downpour_worker_opt.cc pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry device_context scope framework_proto trainer_desc_proto glog fs shell - fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper lodtensor_printer + fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper metrics lodtensor_printer lod_rank_table feed_fetch_method collective_helper ${GLOB_DISTRIBUTE_DEPS} graph_to_program_pass variable_helper data_feed_proto timer monitor heter_service_proto fleet_executor ${BRPC_DEP}) @@ -315,7 +315,7 @@ if(WITH_DISTRIBUTE) pull_dense_worker.cc section_worker.cc heter_section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog index_sampler index_wrapper sampler index_dataset_proto - lod_rank_table fs shell fleet_wrapper heter_wrapper box_wrapper lodtensor_printer feed_fetch_method + lod_rank_table fs shell fleet_wrapper heter_wrapper box_wrapper metrics lodtensor_printer feed_fetch_method graph_to_program_pass variable_helper timer monitor heter_service_proto fleet heter_server brpc fleet_executor) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) @@ -336,7 +336,7 @@ if(WITH_DISTRIBUTE) ps_gpu_trainer.cc downpour_worker.cc downpour_worker_opt.cc pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog - lod_rank_table fs shell fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper lodtensor_printer feed_fetch_method + lod_rank_table fs shell fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper metrics lodtensor_printer feed_fetch_method graph_to_program_pass variable_helper timer monitor fleet_executor) endif() elseif(WITH_PSLIB) diff --git a/paddle/fluid/framework/custom_kernel.cc b/paddle/fluid/framework/custom_kernel.cc index a5498623941..a04838c9d4f 100644 --- a/paddle/fluid/framework/custom_kernel.cc +++ b/paddle/fluid/framework/custom_kernel.cc @@ -25,12 +25,10 @@ limitations under the License. 
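The new EagerUtils helpers added in utils.cc/utils.h above convert whole vectors of tensors between the eager wrapper and the pten-facing type, reserving the output vector once and moving each element in. A minimal illustration of that reserve-and-emplace pattern with toy types; EagerTensor and Tensor here are stand-ins, not the real classes.

#include <iostream>
#include <string>
#include <utility>
#include <vector>

struct Tensor { std::string name; };   // stand-in for the pten-facing tensor type
struct EagerTensor { Tensor inner; };  // stand-in for egr::EagerTensor

// Convert a batch in one pass: reserve once so no reallocation happens,
// then move each temporary into place, as the new utils.cc helpers do.
std::vector<EagerTensor> WrapAll(const std::vector<Tensor>& tensors) {
  std::vector<EagerTensor> res;
  res.reserve(tensors.size());
  for (const auto& t : tensors) {
    EagerTensor tmp{t};
    res.emplace_back(std::move(tmp));
  }
  return res;
}

int main() {
  std::vector<Tensor> ts{{"x"}, {"y"}};
  auto wrapped = WrapAll(ts);
  std::cout << wrapped.size() << " tensors wrapped, first: "
            << wrapped[0].inner.name << "\n";
}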
*/ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/pten/api/ext/op_kernel_info.h" -#include "paddle/pten/core/convert_utils.h" +#include "paddle/pten/core/compat/convert_utils.h" #include "paddle/pten/core/kernel_context.h" #include "paddle/pten/core/kernel_registry.h" -DECLARE_bool(run_pten_kernel); - namespace paddle { namespace framework { @@ -279,10 +277,6 @@ static void RunKernelFunc(pten::KernelContext* ctx, void RegisterKernelWithMetaInfo( const std::vector& op_kernel_infos) { - PADDLE_ENFORCE_EQ(FLAGS_run_pten_kernel, true, - platform::errors::Unimplemented( - "Custom Kernel depends on pten kernel enabled,")); - for (size_t i = 0; i < op_kernel_infos.size(); ++i) { auto& kernel_info = op_kernel_infos[i]; auto op_type = OpKernelInfoHelper::GetOpName(kernel_info); diff --git a/paddle/fluid/framework/custom_kernel_test.cc b/paddle/fluid/framework/custom_kernel_test.cc index 708b7bbe8a5..5f01681624e 100644 --- a/paddle/fluid/framework/custom_kernel_test.cc +++ b/paddle/fluid/framework/custom_kernel_test.cc @@ -212,11 +212,13 @@ TEST(CustomKernel, custom_kernel_dot) { kernel_context.EmplaceBackAttr(fake_attr_int64_vec); kernel_context.EmplaceBackAttr(fake_attr_int_vec); - auto out_meta = pten::DotInferMeta(dense_x->meta(), dense_y->meta()); auto dense_out = std::make_shared( pten::make_intrusive( pten::TransToFluidPlace(backend)), - std::move(out_meta)); + pten::DenseTensorMeta()); + + pten::MetaTensor meta_out(dense_out.get()); + pten::DotInferMeta(*dense_x, *dense_y, &meta_out); kernel_context.EmplaceBackOutput(dense_out.get()); // idx:0 index:[0,1) // fake_input_vec: idx:1, index:[1,3) diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc index 30fbee57787..68445e7976e 100644 --- a/paddle/fluid/framework/custom_operator.cc +++ b/paddle/fluid/framework/custom_operator.cc @@ -37,7 +37,7 @@ limitations under the License. 
*/ #include "paddle/pten/api/lib/api_declare.h" #include "paddle/pten/api/lib/ext_compat_utils.h" #include "paddle/pten/api/lib/utils/tensor_utils.h" -#include "paddle/pten/core/convert_utils.h" +#include "paddle/pten/core/compat/convert_utils.h" #include "paddle/utils/any.h" namespace paddle { @@ -110,8 +110,8 @@ static void RunKernelFunc(const framework::ExecutionContext& ctx, const std::vector& outputs, const std::vector& attrs) { VLOG(3) << "Custom Operator: Start run KernelFunc."; - std::vector custom_ins; - std::vector> custom_vec_ins; + // prepare CustomOpKernelContext + paddle::CustomOpKernelContext kernel_ctx; for (auto& in_name : inputs) { VLOG(3) << "Custom Operator: input name - " << in_name; if (detail::IsDuplicableVar(in_name)) { @@ -136,7 +136,7 @@ static void RunKernelFunc(const framework::ExecutionContext& ctx, custom_t.set_impl(std::make_shared(*x)); custom_vec_in.emplace_back(custom_t); } - custom_vec_ins.emplace_back(custom_vec_in); + kernel_ctx.EmplaceBackInputs(std::move(custom_vec_in)); } else { auto* x = ctx.Input(in_name); PADDLE_ENFORCE_NOT_NULL(x, platform::errors::NotFound( @@ -146,33 +146,32 @@ static void RunKernelFunc(const framework::ExecutionContext& ctx, "Input tensor (%s) is not initialized.", in_name)); paddle::experimental::Tensor custom_in; custom_in.set_impl(std::make_shared(*x)); - custom_ins.emplace_back(custom_in); + kernel_ctx.EmplaceBackInput(std::move(custom_in)); } } - std::vector custom_attrs; for (auto& attr_str : attrs) { auto attr_name_and_type = detail::ParseAttrStr(attr_str); auto attr_name = attr_name_and_type[0]; auto attr_type_str = attr_name_and_type[1]; if (attr_type_str == "bool") { - custom_attrs.emplace_back(ctx.Attr(attr_name)); + kernel_ctx.EmplaceBackAttr(ctx.Attr(attr_name)); } else if (attr_type_str == "int") { - custom_attrs.emplace_back(ctx.Attr(attr_name)); + kernel_ctx.EmplaceBackAttr(ctx.Attr(attr_name)); } else if (attr_type_str == "float") { - custom_attrs.emplace_back(ctx.Attr(attr_name)); + kernel_ctx.EmplaceBackAttr(ctx.Attr(attr_name)); } else if (attr_type_str == "int64_t") { - custom_attrs.emplace_back(ctx.Attr(attr_name)); + kernel_ctx.EmplaceBackAttr(ctx.Attr(attr_name)); } else if (attr_type_str == "std::string") { - custom_attrs.emplace_back(ctx.Attr(attr_name)); + kernel_ctx.EmplaceBackAttr(ctx.Attr(attr_name)); } else if (attr_type_str == "std::vector") { - custom_attrs.emplace_back(ctx.Attr>(attr_name)); + kernel_ctx.EmplaceBackAttr(ctx.Attr>(attr_name)); } else if (attr_type_str == "std::vector") { - custom_attrs.emplace_back(ctx.Attr>(attr_name)); + kernel_ctx.EmplaceBackAttr(ctx.Attr>(attr_name)); } else if (attr_type_str == "std::vector") { - custom_attrs.emplace_back(ctx.Attr>(attr_name)); + kernel_ctx.EmplaceBackAttr(ctx.Attr>(attr_name)); } else if (attr_type_str == "std::vector") { - custom_attrs.emplace_back(ctx.Attr>(attr_name)); + kernel_ctx.EmplaceBackAttr(ctx.Attr>(attr_name)); } else { PADDLE_THROW(platform::errors::Unimplemented( "Unsupported `%s` type value as custom attribute now. 
" @@ -185,39 +184,75 @@ static void RunKernelFunc(const framework::ExecutionContext& ctx, } } - VLOG(3) << "Custom Operator: Run ComputeFunc."; - try { - auto outs = func(custom_ins, custom_vec_ins, custom_attrs); + VLOG(3) << "Custom Operator: push outputs into CustomOpKernelContext."; + // cache the target tensor pointers + std::vector true_out_ptrs; + for (size_t i = 0; i < outputs.size(); ++i) { + auto out_name = outputs[i]; + if (detail::IsDuplicableVar(out_name)) { + PADDLE_ENFORCE(i == 0UL && outputs.size() == 1UL, + platform::errors::PreconditionNotMet( + "If custom operator's outputs contains `paddle::Vec(" + ")` type, " + "it only can hold one output.")); + auto vec_out = ctx.MultiOutput(out_name); + PADDLE_ENFORCE_NE(vec_out.empty(), true, + platform::errors::NotFound( + "Output vector (%s) is empty.", out_name)); + std::vector custom_vec_out; + for (size_t j = 0; j < vec_out.size(); ++j) { + auto* out = vec_out[j]; + PADDLE_ENFORCE_NOT_NULL( + out, + platform::errors::NotFound( + "The %d-th tensor in output vector (%s) is nullptr.", j, + out_name)); + true_out_ptrs.emplace_back(out); + paddle::experimental::Tensor custom_t; + // here only can copy the output tensor into context + custom_t.set_impl(std::make_shared(*out)); + custom_vec_out.emplace_back(custom_t); + } + kernel_ctx.EmplaceBackOutputs(std::move(custom_vec_out)); + } else { + auto* out = ctx.Output(out_name); + PADDLE_ENFORCE_NOT_NULL( + out, platform::errors::NotFound("Output tensor (%s) is nullptr.", + out_name)); + true_out_ptrs.emplace_back(out); + paddle::experimental::Tensor custom_out; + // here only can copy the output tensor into context + custom_out.set_impl(std::make_shared(*out)); + kernel_ctx.EmplaceBackOutput(std::move(custom_out)); + } + } - VLOG(3) << "Custom Operator: Share outputs into ExecutionContext."; - for (size_t i = 0; i < outputs.size(); ++i) { - auto out_name = outputs[i]; - if (detail::IsDuplicableVar(out_name)) { - PADDLE_ENFORCE(i == 0UL && outputs.size() == 1UL, - platform::errors::PreconditionNotMet( - "If custom operator's outputs contains `paddle::Vec(" - ")` type, " - "it only can hold one output.")); - auto vec_true_outs = ctx.MultiOutput(out_name); - PADDLE_ENFORCE_EQ( - vec_true_outs.size(), outs.size(), - platform::errors::InvalidArgument( - "The number of element in custom operator outputs is wrong, " - "expected contains %d Tensors, but actually contains %d " - "Tensors.", - vec_true_outs.size(), outs.size())); - for (size_t j = 0; j < vec_true_outs.size(); ++j) { - experimental::SharesStorage( - std::dynamic_pointer_cast(outs.at(j).impl()) - .get(), - vec_true_outs.at(j)); - } - } else { - auto* true_out = ctx.Output(out_name); - experimental::SharesStorage( - std::dynamic_pointer_cast(outs.at(i).impl()) - .get(), - true_out); + try { + VLOG(3) << "Custom Operator: Run ComputeFunc."; + func(&kernel_ctx); + + // sync output tensor data into original output + auto* calc_outs = kernel_ctx.AllMutableOutput(); + PADDLE_ENFORCE_EQ( + true_out_ptrs.size(), calc_outs->size(), + platform::errors::InvalidArgument( + "The number of element in custom operator outputs is wrong, " + "expected contains %d Tensors, but actually contains %d " + "Tensors.", + true_out_ptrs.size(), calc_outs->size())); + for (size_t i = 0; i < true_out_ptrs.size(); ++i) { + auto* true_out = true_out_ptrs.at(i); + auto calc_out = + std::dynamic_pointer_cast(calc_outs->at(i).impl()); + // assgin meta info + auto* true_out_meta = pten::DenseTensorUtils::GetMutableMeta(true_out); + true_out_meta->dims = 
calc_out->dims(); + true_out_meta->dtype = calc_out->dtype(); + true_out_meta->layout = calc_out->layout(); + // lod and offset no need to be reset + // reset holder if needed + if (true_out->Holder() != calc_out->Holder()) { + true_out->ResetHolder(calc_out->Holder()); } } } catch (platform::EnforceNotMet& exception) { @@ -613,7 +648,7 @@ void RegisterOperatorWithMetaInfo( auto op_name = OpMetaInfoHelper::GetOpName(base_op_meta); if (OpInfoMap::Instance().Has(op_name)) { - LOG(WARNING) << "Operator (" << op_name << ")has been registered."; + LOG(WARNING) << "Operator (" << op_name << ") has been registered."; return; } diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc index 2d089b4721b..0a6d5889c37 100644 --- a/paddle/fluid/framework/data_feed.cc +++ b/paddle/fluid/framework/data_feed.cc @@ -340,6 +340,7 @@ InMemoryDataFeed::InMemoryDataFeed() { this->thread_id_ = 0; this->thread_num_ = 1; this->parse_ins_id_ = false; + this->parse_uid_ = false; this->parse_content_ = false; this->parse_logkey_ = false; this->enable_pv_merge_ = false; @@ -498,6 +499,11 @@ void InMemoryDataFeed::SetParseInsId(bool parse_ins_id) { parse_ins_id_ = parse_ins_id; } +template +void InMemoryDataFeed::SetParseUid(bool parse_uid) { + parse_uid_ = parse_uid; +} + template void InMemoryDataFeed::LoadIntoMemory() { #ifdef _LINUX @@ -1047,6 +1053,7 @@ void MultiSlotInMemoryDataFeed::Init( use_slots_shape_.push_back(local_shape); } } + uid_slot_ = multi_slot_desc.uid_slot(); feed_vec_.resize(use_slots_.size()); const int kEstimatedFeasignNumPerSlot = 5; // Magic Number for (size_t i = 0; i < all_slot_num; i++) { @@ -1160,6 +1167,19 @@ bool MultiSlotInMemoryDataFeed::ParseOneInstanceFromPipe(Record* instance) { "\nWe detect the feasign number of this slot is %d, " "which is illegal.", str, i, num)); +#ifdef PADDLE_WITH_PSLIB + if (parse_uid_ && all_slots_[i] == uid_slot_) { + PADDLE_ENFORCE(num == 1 && all_slots_type_[i][0] == 'u', + platform::errors::PreconditionNotMet( + "The uid has to be uint64 and single.\n" + "please check this error line: %s", + str)); + + char* uidptr = endptr; + uint64_t feasign = (uint64_t)strtoull(uidptr, &uidptr, 10); + instance->uid_ = feasign; + } +#endif if (idx != -1) { if (all_slots_type_[i][0] == 'f') { // float for (int j = 0; j < num; ++j) { diff --git a/paddle/fluid/framework/data_feed.h b/paddle/fluid/framework/data_feed.h index 313ee9cd68a..5e8b6c135d5 100644 --- a/paddle/fluid/framework/data_feed.h +++ b/paddle/fluid/framework/data_feed.h @@ -191,6 +191,7 @@ struct Record { uint64_t search_id; uint32_t rank; uint32_t cmatch; + std::string uid_; }; inline SlotRecord make_slotrecord() { @@ -562,6 +563,7 @@ class DataFeed { virtual void SetThreadNum(int thread_num) {} // This function will do nothing at default virtual void SetParseInsId(bool parse_ins_id) {} + virtual void SetParseUid(bool parse_uid) {} virtual void SetParseContent(bool parse_content) {} virtual void SetParseLogKey(bool parse_logkey) {} virtual void SetEnablePvMerge(bool enable_pv_merge) {} @@ -645,6 +647,7 @@ class DataFeed { std::vector ins_id_vec_; std::vector ins_content_vec_; platform::Place place_; + std::string uid_slot_; // The input type of pipe reader, 0 for one sample, 1 for one batch int input_type_; @@ -709,6 +712,7 @@ class InMemoryDataFeed : public DataFeed { virtual void SetThreadId(int thread_id); virtual void SetThreadNum(int thread_num); virtual void SetParseInsId(bool parse_ins_id); + virtual void SetParseUid(bool parse_uid); virtual void SetParseContent(bool 
parse_content); virtual void SetParseLogKey(bool parse_logkey); virtual void SetEnablePvMerge(bool enable_pv_merge); @@ -737,6 +741,7 @@ class InMemoryDataFeed : public DataFeed { int thread_id_; int thread_num_; bool parse_ins_id_; + bool parse_uid_; bool parse_content_; bool parse_logkey_; bool enable_pv_merge_; diff --git a/paddle/fluid/framework/data_feed.proto b/paddle/fluid/framework/data_feed.proto index c1149ed7518..6964446f209 100644 --- a/paddle/fluid/framework/data_feed.proto +++ b/paddle/fluid/framework/data_feed.proto @@ -22,7 +22,10 @@ message Slot { repeated int32 shape = 5; // we can define N-D Tensor } -message MultiSlotDesc { repeated Slot slots = 1; } +message MultiSlotDesc { + repeated Slot slots = 1; + optional string uid_slot = 2; +} message DataFeedDesc { optional string name = 1; diff --git a/paddle/fluid/framework/data_set.cc b/paddle/fluid/framework/data_set.cc index df1840794af..b4ae9949f2c 100644 --- a/paddle/fluid/framework/data_set.cc +++ b/paddle/fluid/framework/data_set.cc @@ -57,6 +57,8 @@ DatasetImpl::DatasetImpl() { parse_logkey_ = false; preload_thread_num_ = 0; global_index_ = 0; + shuffle_by_uid_ = false; + parse_uid_ = false; } // set filelist, file_idx_ will reset to zero. @@ -150,6 +152,12 @@ void DatasetImpl::SetMergeBySid(bool is_merge) { merge_by_sid_ = is_merge; } +template +void DatasetImpl::SetShuffleByUid(bool enable_shuffle_uid) { + shuffle_by_uid_ = enable_shuffle_uid; + parse_uid_ = true; +} + template void DatasetImpl::SetEnablePvMerge(bool enable_pv_merge) { enable_pv_merge_ = enable_pv_merge; @@ -664,11 +672,14 @@ void MultiSlotDataset::GlobalShuffle(int thread_num) { << input_channel_->Size(); auto get_client_id = [this, fleet_ptr](const Record& data) -> size_t { - if (!this->merge_by_insid_) { - return fleet_ptr->LocalRandomEngine()() % this->trainer_num_; - } else { + if (this->merge_by_insid_) { return XXH64(data.ins_id_.data(), data.ins_id_.length(), 0) % this->trainer_num_; + } else if (this->shuffle_by_uid_) { + return XXH64(data.uid_.data(), data.uid_.length(), 0) % + this->trainer_num_; + } else { + return fleet_ptr->LocalRandomEngine()() % this->trainer_num_; } }; @@ -902,6 +913,7 @@ void DatasetImpl::CreateReaders() { readers_[i]->SetFeaNum(&total_fea_num_); readers_[i]->SetFileList(filelist_); readers_[i]->SetParseInsId(parse_ins_id_); + readers_[i]->SetParseUid(parse_uid_); readers_[i]->SetParseContent(parse_content_); readers_[i]->SetParseLogKey(parse_logkey_); readers_[i]->SetEnablePvMerge(enable_pv_merge_); @@ -972,6 +984,7 @@ void DatasetImpl::CreatePreLoadReaders() { preload_readers_[i]->SetFeaNumMutex(&mutex_for_fea_num_); preload_readers_[i]->SetFeaNum(&total_fea_num_); preload_readers_[i]->SetParseInsId(parse_ins_id_); + preload_readers_[i]->SetParseUid(parse_uid_); preload_readers_[i]->SetParseContent(parse_content_); preload_readers_[i]->SetParseLogKey(parse_logkey_); preload_readers_[i]->SetEnablePvMerge(enable_pv_merge_); diff --git a/paddle/fluid/framework/data_set.h b/paddle/fluid/framework/data_set.h index 58223a2f28b..1947c669e9b 100644 --- a/paddle/fluid/framework/data_set.h +++ b/paddle/fluid/framework/data_set.h @@ -81,6 +81,7 @@ class Dataset { virtual void SetEnablePvMerge(bool enable_pv_merge) = 0; virtual bool EnablePvMerge() = 0; virtual void SetMergeBySid(bool is_merge) = 0; + virtual void SetShuffleByUid(bool enable_shuffle_uid) = 0; // set merge by ins id virtual void SetMergeByInsId(int merge_size) = 0; virtual void SetGenerateUniqueFeasign(bool gen_uni_feasigns) = 0; @@ -189,6 +190,7 @@ class 
DatasetImpl : public Dataset { virtual void SetParseLogKey(bool parse_logkey); virtual void SetEnablePvMerge(bool enable_pv_merge); virtual void SetMergeBySid(bool is_merge); + virtual void SetShuffleByUid(bool enable_shuffle_uid); virtual void SetMergeByInsId(int merge_size); virtual void SetGenerateUniqueFeasign(bool gen_uni_feasigns); @@ -307,6 +309,8 @@ class DatasetImpl : public Dataset { bool parse_content_; bool parse_logkey_; bool merge_by_sid_; + bool shuffle_by_uid_; + bool parse_uid_; bool enable_pv_merge_; // True means to merge pv int current_phase_; // 1 join, 0 update size_t merge_size_; diff --git a/paddle/fluid/framework/data_type_transform_test.cu b/paddle/fluid/framework/data_type_transform_test.cu index 0874509a879..6b84fdf0ec9 100644 --- a/paddle/fluid/framework/data_type_transform_test.cu +++ b/paddle/fluid/framework/data_type_transform_test.cu @@ -21,7 +21,10 @@ TEST(DataTypeTransform, GPUTransform) { auto cpu_place = paddle::platform::CPUPlace(); auto gpu_place = paddle::platform::CUDAPlace(0); paddle::platform::CUDADeviceContext context(gpu_place); - + context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(gpu_place, context.stream()) + .get()); + context.PartialInitWithAllocator(); auto kernel_fp16 = paddle::framework::OpKernelType( paddle::framework::proto::VarType::FP16, gpu_place, paddle::framework::DataLayout::kAnyLayout, diff --git a/paddle/fluid/framework/details/nan_inf_utils.h b/paddle/fluid/framework/details/nan_inf_utils.h index be5ffef27ca..81f679823d3 100644 --- a/paddle/fluid/framework/details/nan_inf_utils.h +++ b/paddle/fluid/framework/details/nan_inf_utils.h @@ -17,12 +17,11 @@ #include #include -#include "paddle/fluid/eager/legacy/type_def.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/imperative/type_defs.h" +#include "paddle/fluid/imperative/var_helper.h" #include "paddle/fluid/platform/place.h" - namespace paddle { namespace framework { namespace details { @@ -49,20 +48,8 @@ void CheckOpHasNanOrInfInDygraph(const std::string& op_type, for (const auto& ivar : pair.second) { auto* var = ivar->MutableVar(); if (var == nullptr) continue; - CheckVarHasNanOrInf(op_type, ivar->Name(), var, place); - } - } -} - -template -static void CheckOpHasNanOrInfInEager( - const std::string& op_type, const egr::legacy::NameMap& op_outs, - platform::Place place) { - for (const auto& pair : op_outs) { - for (const auto& tensor : pair.second) { - auto* var = tensor->MutableVar(); - if (var == nullptr) continue; - CheckVarHasNanOrInf(op_type, tensor->name(), var, place); + CheckVarHasNanOrInf(op_type, paddle::imperative::GetNameFromVar(ivar), + var, place); } } } diff --git a/paddle/fluid/framework/dim.h b/paddle/fluid/framework/dim.h index 6abae4e7318..21975485add 100644 --- a/paddle/fluid/framework/dim.h +++ b/paddle/fluid/framework/dim.h @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. #pragma once -#include "paddle/pten/core/dim.h" +#include "paddle/pten/core/utils/dim.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/downpour_worker.cc b/paddle/fluid/framework/downpour_worker.cc index bea23469f11..77201640ef2 100644 --- a/paddle/fluid/framework/downpour_worker.cc +++ b/paddle/fluid/framework/downpour_worker.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/framework/device_worker.h" +#include "paddle/fluid/framework/fleet/metrics.h" #include "paddle/fluid/platform/cpu_helper.h" namespace pten { @@ -32,7 +33,6 @@ class Variable; namespace paddle { namespace framework { - void DownpourWorker::Initialize(const TrainerDesc& desc) { param_ = desc.downpour_param(); for (int i = 0; i < param_.sparse_table_size(); ++i) { @@ -740,6 +740,23 @@ void DownpourWorker::TrainFilesWithProfiler() { } } +#ifdef PADDLE_WITH_PSLIB +/** + * @brief add auc monitor + */ +inline void AddAucMonitor(const Scope* scope, const platform::Place& place) { + auto metric_ptr = Metric::GetInstance(); + auto& metric_list = metric_ptr->GetMetricList(); + for (auto iter = metric_list.begin(); iter != metric_list.end(); iter++) { + auto* metric_msg = iter->second; + if (metric_ptr->Phase() != metric_msg->MetricPhase()) { + continue; + } + metric_msg->add_data(scope, place); + } +} +#endif + void DownpourWorker::TrainFiles() { VLOG(3) << "Begin to train files"; platform::SetNumThreads(1); @@ -837,6 +854,13 @@ void DownpourWorker::TrainFiles() { } } +#ifdef PADDLE_WITH_PSLIB + // add data for MetricMsg + if (Metric::GetInstance() != nullptr) { + AddAucMonitor(thread_scope_, place_); + } +#endif + // check inf and nan for (std::string& var_name : check_nan_var_names_) { Variable* var = thread_scope_->FindVar(var_name); diff --git a/paddle/fluid/framework/fleet/CMakeLists.txt b/paddle/fluid/framework/fleet/CMakeLists.txt index 65214cb2591..c3304e3f902 100644 --- a/paddle/fluid/framework/fleet/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/CMakeLists.txt @@ -42,8 +42,10 @@ endif(WITH_BOX_PS) if(WITH_GLOO) cc_library(gloo_wrapper SRCS gloo_wrapper.cc DEPS framework_proto variable_helper scope gloo) + cc_library(metrics SRCS metrics.cc DEPS gloo_wrapper) else() cc_library(gloo_wrapper SRCS gloo_wrapper.cc DEPS framework_proto variable_helper scope) + cc_library(metrics SRCS metrics.cc DEPS gloo_wrapper) endif(WITH_GLOO) if(WITH_PSLIB) diff --git a/paddle/fluid/framework/fleet/metrics.cc b/paddle/fluid/framework/fleet/metrics.cc new file mode 100644 index 00000000000..7b6f054ee0c --- /dev/null +++ b/paddle/fluid/framework/fleet/metrics.cc @@ -0,0 +1,380 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#include "paddle/fluid/framework/fleet/metrics.h" + +#include +#include +#include +#include +#include "paddle/fluid/framework/lod_tensor.h" + +#if defined(PADDLE_WITH_PSLIB) +namespace paddle { +namespace framework { + +std::shared_ptr Metric::s_instance_ = nullptr; + +void BasicAucCalculator::init(int table_size) { + set_table_size(table_size); + + // init CPU memory + for (int i = 0; i < 2; i++) { + _table[i] = std::vector(); + } + + // reset + reset(); +} + +void BasicAucCalculator::reset() { + // reset CPU counter + for (int i = 0; i < 2; i++) { + _table[i].assign(_table_size, 0.0); + } + _local_abserr = 0; + _local_sqrerr = 0; + _local_pred = 0; +} + +void BasicAucCalculator::add_data(const float* d_pred, const int64_t* d_label, + int batch_size, + const paddle::platform::Place& place) { + thread_local std::vector h_pred; + thread_local std::vector h_label; + h_pred.resize(batch_size); + h_label.resize(batch_size); + memcpy(h_pred.data(), d_pred, sizeof(float) * batch_size); + memcpy(h_label.data(), d_label, sizeof(int64_t) * batch_size); + std::lock_guard lock(_table_mutex); + for (int i = 0; i < batch_size; ++i) { + add_unlock_data(h_pred[i], h_label[i]); + } +} + +void BasicAucCalculator::add_unlock_data(double pred, int label) { + PADDLE_ENFORCE_GE(pred, 0.0, platform::errors::PreconditionNotMet( + "pred should be greater than 0")); + PADDLE_ENFORCE_LE(pred, 1.0, platform::errors::PreconditionNotMet( + "pred should be lower than 1")); + PADDLE_ENFORCE_EQ( + label * label, label, + platform::errors::PreconditionNotMet( + "label must be equal to 0 or 1, but its value is: %d", label)); + int pos = std::min(static_cast(pred * _table_size), _table_size - 1); + PADDLE_ENFORCE_GE( + pos, 0, + platform::errors::PreconditionNotMet( + "pos must be equal or greater than 0, but its value is: %d", pos)); + PADDLE_ENFORCE_LT( + pos, _table_size, + platform::errors::PreconditionNotMet( + "pos must be less than table_size, but its value is: %d", pos)); + _local_abserr += fabs(pred - label); + _local_sqrerr += (pred - label) * (pred - label); + _local_pred += pred; + ++_table[label][pos]; +} + +// add mask data +void BasicAucCalculator::add_mask_data(const float* d_pred, + const int64_t* d_label, + const int64_t* d_mask, int batch_size, + const paddle::platform::Place& place) { + thread_local std::vector h_pred; + thread_local std::vector h_label; + thread_local std::vector h_mask; + h_pred.resize(batch_size); + h_label.resize(batch_size); + h_mask.resize(batch_size); + + memcpy(h_pred.data(), d_pred, sizeof(float) * batch_size); + memcpy(h_label.data(), d_label, sizeof(int64_t) * batch_size); + memcpy(h_mask.data(), d_mask, sizeof(int64_t) * batch_size); + + std::lock_guard lock(_table_mutex); + for (int i = 0; i < batch_size; ++i) { + if (h_mask[i]) { + add_unlock_data(h_pred[i], h_label[i]); + } + } +} + +void BasicAucCalculator::compute() { +#if defined(PADDLE_WITH_GLOO) + double area = 0; + double fp = 0; + double tp = 0; + + auto gloo_wrapper = paddle::framework::GlooWrapper::GetInstance(); + if (!gloo_wrapper->IsInitialized()) { + VLOG(0) << "GLOO is not inited"; + gloo_wrapper->Init(); + } + + if (gloo_wrapper->Size() > 1) { + auto neg_table = gloo_wrapper->AllReduce(_table[0], "sum"); + auto pos_table = gloo_wrapper->AllReduce(_table[1], "sum"); + for (int i = _table_size - 1; i >= 0; i--) { + double newfp = fp + neg_table[i]; + double newtp = tp + pos_table[i]; + area += (newfp - fp) * (tp + newtp) / 2; + fp = newfp; + tp = newtp; + } + } else { + for (int i = _table_size - 1; i >= 0; 
i--) { + double newfp = fp + _table[0][i]; + double newtp = tp + _table[1][i]; + area += (newfp - fp) * (tp + newtp) / 2; + fp = newfp; + tp = newtp; + } + } + + if (fp < 1e-3 || tp < 1e-3) { + _auc = -0.5; // which means all nonclick or click + } else { + _auc = area / (fp * tp); + } + + if (gloo_wrapper->Size() > 1) { + // allreduce sum + std::vector local_abserr_vec(1, _local_abserr); + std::vector local_sqrerr_vec(1, _local_sqrerr); + std::vector local_pred_vec(1, _local_pred); + auto global_abserr_vec = gloo_wrapper->AllReduce(local_abserr_vec, "sum"); + auto global_sqrerr_vec = gloo_wrapper->AllReduce(local_sqrerr_vec, "sum"); + auto global_pred_vec = gloo_wrapper->AllReduce(local_pred_vec, "sum"); + _mae = global_abserr_vec[0] / (fp + tp); + _rmse = sqrt(global_sqrerr_vec[0] / (fp + tp)); + _predicted_ctr = global_pred_vec[0] / (fp + tp); + } else { + _mae = _local_abserr / (fp + tp); + _rmse = sqrt(_local_sqrerr / (fp + tp)); + _predicted_ctr = _local_pred / (fp + tp); + } + _actual_ctr = tp / (fp + tp); + + _size = fp + tp; + + calculate_bucket_error(); +#endif +} + +void BasicAucCalculator::calculate_bucket_error() { +#if defined(PADDLE_WITH_GLOO) + double last_ctr = -1; + double impression_sum = 0; + double ctr_sum = 0.0; + double click_sum = 0.0; + double error_sum = 0.0; + double error_count = 0; + auto gloo_wrapper = paddle::framework::GlooWrapper::GetInstance(); + if (gloo_wrapper->Size() > 1) { + auto neg_table = gloo_wrapper->AllReduce(_table[0], "sum"); + auto pos_table = gloo_wrapper->AllReduce(_table[1], "sum"); + for (int i = 0; i < _table_size; i++) { + double click = pos_table[i]; + double show = neg_table[i] + pos_table[i]; + double ctr = static_cast(i) / _table_size; + if (fabs(ctr - last_ctr) > kMaxSpan) { + last_ctr = ctr; + impression_sum = 0.0; + ctr_sum = 0.0; + click_sum = 0.0; + } + impression_sum += show; + ctr_sum += ctr * show; + click_sum += click; + double adjust_ctr = ctr_sum / impression_sum; + double relative_error = + sqrt((1 - adjust_ctr) / (adjust_ctr * impression_sum)); + if (relative_error < kRelativeErrorBound) { + double actual_ctr = click_sum / impression_sum; + double relative_ctr_error = fabs(actual_ctr / adjust_ctr - 1); + error_sum += relative_ctr_error * impression_sum; + error_count += impression_sum; + last_ctr = -1; + } + } + } else { + double* table[2] = {&_table[0][0], &_table[1][0]}; + for (int i = 0; i < _table_size; i++) { + double click = table[1][i]; + double show = table[0][i] + table[1][i]; + double ctr = static_cast(i) / _table_size; + if (fabs(ctr - last_ctr) > kMaxSpan) { + last_ctr = ctr; + impression_sum = 0.0; + ctr_sum = 0.0; + click_sum = 0.0; + } + impression_sum += show; + ctr_sum += ctr * show; + click_sum += click; + double adjust_ctr = ctr_sum / impression_sum; + double relative_error = + sqrt((1 - adjust_ctr) / (adjust_ctr * impression_sum)); + if (relative_error < kRelativeErrorBound) { + double actual_ctr = click_sum / impression_sum; + double relative_ctr_error = fabs(actual_ctr / adjust_ctr - 1); + error_sum += relative_ctr_error * impression_sum; + error_count += impression_sum; + last_ctr = -1; + } + } + } + _bucket_error = error_count > 0 ? 
error_sum / error_count : 0.0; +#endif +} + +void BasicAucCalculator::reset_records() { + // reset wuauc_records_ + wuauc_records_.clear(); + _user_cnt = 0; + _size = 0; + _uauc = 0; + _wuauc = 0; +} + +// add uid data +void BasicAucCalculator::add_uid_data(const float* d_pred, + const int64_t* d_label, + const int64_t* d_uid, int batch_size, + const paddle::platform::Place& place) { + thread_local std::vector h_pred; + thread_local std::vector h_label; + thread_local std::vector h_uid; + h_pred.resize(batch_size); + h_label.resize(batch_size); + h_uid.resize(batch_size); + + memcpy(h_pred.data(), d_pred, sizeof(float) * batch_size); + memcpy(h_label.data(), d_label, sizeof(int64_t) * batch_size); + memcpy(h_uid.data(), d_uid, sizeof(uint64_t) * batch_size); + + std::lock_guard lock(_table_mutex); + for (int i = 0; i < batch_size; ++i) { + add_uid_unlock_data(h_pred[i], h_label[i], static_cast(h_uid[i])); + } +} + +void BasicAucCalculator::add_uid_unlock_data(double pred, int label, + uint64_t uid) { + PADDLE_ENFORCE_GE(pred, 0.0, platform::errors::PreconditionNotMet( + "pred should be greater than 0")); + PADDLE_ENFORCE_LE(pred, 1.0, platform::errors::PreconditionNotMet( + "pred should be lower than 1")); + PADDLE_ENFORCE_EQ( + label * label, label, + platform::errors::PreconditionNotMet( + "label must be equal to 0 or 1, but its value is: %d", label)); + + WuaucRecord record; + record.uid_ = uid; + record.label_ = label; + record.pred_ = pred; + wuauc_records_.emplace_back(std::move(record)); +} + +void BasicAucCalculator::computeWuAuc() { + std::sort(wuauc_records_.begin(), wuauc_records_.end(), + [](const WuaucRecord& lhs, const WuaucRecord& rhs) { + if (lhs.uid_ == rhs.uid_) { + if (lhs.pred_ == rhs.pred_) { + return lhs.label_ < rhs.label_; + } else { + return lhs.pred_ > rhs.pred_; + } + } else { + return lhs.uid_ > rhs.uid_; + } + }); + + WuaucRocData roc_data; + uint64_t prev_uid = 0; + size_t prev_pos = 0; + for (size_t i = 0; i < wuauc_records_.size(); ++i) { + if (wuauc_records_[i].uid_ != prev_uid) { + std::vector single_user_recs( + wuauc_records_.begin() + prev_pos, wuauc_records_.begin() + i); + roc_data = computeSingelUserAuc(single_user_recs); + if (roc_data.auc_ != -1) { + double ins_num = (roc_data.tp_ + roc_data.fp_); + _user_cnt += 1; + _size += ins_num; + _uauc += roc_data.auc_; + _wuauc += roc_data.auc_ * ins_num; + } + + prev_uid = wuauc_records_[i].uid_; + prev_pos = i; + } + } + + std::vector single_user_recs(wuauc_records_.begin() + prev_pos, + wuauc_records_.end()); + roc_data = computeSingelUserAuc(single_user_recs); + if (roc_data.auc_ != -1) { + double ins_num = (roc_data.tp_ + roc_data.fp_); + _user_cnt += 1; + _size += ins_num; + _uauc += roc_data.auc_; + _wuauc += roc_data.auc_ * ins_num; + } +} + +BasicAucCalculator::WuaucRocData BasicAucCalculator::computeSingelUserAuc( + const std::vector& records) { + double tp = 0.0; + double fp = 0.0; + double newtp = 0.0; + double newfp = 0.0; + double area = 0.0; + double auc = -1; + size_t i = 0; + + while (i < records.size()) { + newtp = tp; + newfp = fp; + if (records[i].label_ == 1) { + newtp += 1; + } else { + newfp += 1; + } + // check i+1 + while (i < records.size() - 1 && records[i].pred_ == records[i + 1].pred_) { + if (records[i + 1].label_ == 1) { + newtp += 1; + } else { + newfp += 1; + } + i += 1; + } + area += (newfp - fp) * (tp + newtp) / 2.0; + tp = newtp; + fp = newfp; + i += 1; + } + if (tp > 0 && fp > 0) { + auc = area / (fp * tp + 1e-9); + } else { + auc = -1; + } + return {tp, fp, auc}; +} + 
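// Illustrative sketch (not applied by this patch): compute() above is the usual
// histogram/trapezoid AUC. Predictions are first bucketed into _table[label][pos]
// with pos = min(int(pred * table_size), table_size - 1); the buckets are then
// walked from the highest score down, each bucket contributes one trapezoid on the
// ROC curve, and the area is normalized by total_positives * total_negatives.
#include <vector>

inline double HistogramAuc(const std::vector<double>& neg_table,
                           const std::vector<double>& pos_table) {
  double fp = 0.0, tp = 0.0, area = 0.0;
  for (int i = static_cast<int>(neg_table.size()) - 1; i >= 0; --i) {
    const double new_fp = fp + neg_table[i];
    const double new_tp = tp + pos_table[i];
    area += (new_fp - fp) * (tp + new_tp) / 2.0;  // one trapezoid step on the ROC
    fp = new_fp;
    tp = new_tp;
  }
  if (fp < 1e-3 || tp < 1e-3) return -0.5;  // all negatives or all positives
  return area / (fp * tp);
}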
+} // namespace framework +} // namespace paddle +#endif diff --git a/paddle/fluid/framework/fleet/metrics.h b/paddle/fluid/framework/fleet/metrics.h new file mode 100644 index 00000000000..7149c36a393 --- /dev/null +++ b/paddle/fluid/framework/fleet/metrics.h @@ -0,0 +1,693 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/variable_helper.h" +#include "paddle/fluid/platform/timer.h" +#include "paddle/fluid/string/string_helper.h" + +#if defined(PADDLE_WITH_GLOO) +#include +#include "paddle/fluid/framework/fleet/gloo_wrapper.h" +#endif + +#if defined(PADDLE_WITH_PSLIB) +namespace paddle { + +namespace framework { + +class BasicAucCalculator { + public: + BasicAucCalculator() {} + struct WuaucRecord { + uint64_t uid_; + int label_; + float pred_; + }; + + struct WuaucRocData { + double tp_; + double fp_; + double auc_; + }; + void init(int table_size); + void init_wuauc(int table_size); + void reset(); + void reset_records(); + // add single data in CPU with LOCK, deprecated + void add_unlock_data(double pred, int label); + void add_uid_unlock_data(double pred, int label, uint64_t uid); + // add batch data + void add_data(const float* d_pred, const int64_t* d_label, int batch_size, + const paddle::platform::Place& place); + // add mask data + void add_mask_data(const float* d_pred, const int64_t* d_label, + const int64_t* d_mask, int batch_size, + const paddle::platform::Place& place); + // add uid data + void add_uid_data(const float* d_pred, const int64_t* d_label, + const int64_t* d_uid, int batch_size, + const paddle::platform::Place& place); + + void compute(); + void computeWuAuc(); + WuaucRocData computeSingelUserAuc(const std::vector& records); + int table_size() const { return _table_size; } + double bucket_error() const { return _bucket_error; } + double auc() const { return _auc; } + double uauc() const { return _uauc; } + double wuauc() const { return _wuauc; } + double mae() const { return _mae; } + double actual_ctr() const { return _actual_ctr; } + double predicted_ctr() const { return _predicted_ctr; } + double user_cnt() const { return _user_cnt; } + double size() const { return _size; } + double rmse() const { return _rmse; } + std::unordered_set uid_keys() const { return _uid_keys; } + // lock and unlock + std::mutex& table_mutex(void) { return _table_mutex; } + + private: + void calculate_bucket_error(); + + protected: + double _local_abserr = 0; + double _local_sqrerr = 0; + double _local_pred = 0; + double _auc = 0; + double _uauc = 0; + double _wuauc = 0; + double _mae = 0; + double _rmse = 0; + double _actual_ctr = 0; + double _predicted_ctr = 0; + double _size; + double _user_cnt = 0; + double _bucket_error = 0; + 
std::unordered_set _uid_keys; + + private: + void set_table_size(int table_size) { _table_size = table_size; } + int _table_size; + std::vector _table[2]; + std::vector wuauc_records_; + static constexpr double kRelativeErrorBound = 0.05; + static constexpr double kMaxSpan = 0.01; + std::mutex _table_mutex; +}; + +class Metric { + public: + virtual ~Metric() {} + + Metric() { fprintf(stdout, "init fleet Metric\n"); } + + class MetricMsg { + public: + MetricMsg() {} + MetricMsg(const std::string& label_varname, const std::string& pred_varname, + int metric_phase, int bucket_size = 1000000) + : label_varname_(label_varname), + pred_varname_(pred_varname), + metric_phase_(metric_phase) { + calculator = new BasicAucCalculator(); + calculator->init(bucket_size); + } + virtual ~MetricMsg() {} + + int MetricPhase() const { return metric_phase_; } + BasicAucCalculator* GetCalculator() { return calculator; } + + // add_data + virtual void add_data(const Scope* exe_scope, + const paddle::platform::Place& place) { + int label_len = 0; + const int64_t* label_data = NULL; + int pred_len = 0; + const float* pred_data = NULL; + get_data(exe_scope, label_varname_, &label_data, &label_len); + get_data(exe_scope, pred_varname_, &pred_data, &pred_len); + PADDLE_ENFORCE_EQ(label_len, pred_len, + platform::errors::PreconditionNotMet( + "the predict data length should be consistent with " + "the label data length")); + calculator->add_data(pred_data, label_data, label_len, place); + } + + // get_data + template + static void get_data(const Scope* exe_scope, const std::string& varname, + const T** data, int* len) { + auto* var = exe_scope->FindVar(varname.c_str()); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::NotFound( + "Error: var %s is not found in scope.", varname.c_str())); + auto& cpu_tensor = var->Get(); + *data = cpu_tensor.data(); + *len = cpu_tensor.numel(); + } + + template + static void get_data(const Scope* exe_scope, const std::string& varname, + std::vector* data) { + auto* var = exe_scope->FindVar(varname.c_str()); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::NotFound( + "Error: var %s is not found in scope.", varname.c_str())); + auto& cpu_tensor = var->Get(); + auto* cpu_data = cpu_tensor.data(); + auto len = cpu_tensor.numel(); + data->resize(len); + memcpy(data->data(), cpu_data, sizeof(T) * len); + } + + // parse_cmatch_rank + static inline std::pair parse_cmatch_rank(uint64_t x) { + // only consider ignore_rank=True + return std::make_pair(static_cast(x), 0); + // first 32 bit store cmatch and second 32 bit store rank + // return std::make_pair(static_cast(x >> 32), + // static_cast(x & 0xff)); + } + + protected: + std::string label_varname_; + std::string pred_varname_; + int metric_phase_; + BasicAucCalculator* calculator; + }; + + class WuAucMetricMsg : public MetricMsg { + public: + WuAucMetricMsg(const std::string& label_varname, + const std::string& pred_varname, + const std::string& uid_varname, int metric_phase, + int bucket_size = 1000000) { + label_varname_ = label_varname; + pred_varname_ = pred_varname; + uid_varname_ = uid_varname; + metric_phase_ = metric_phase; + calculator = new BasicAucCalculator(); + } + virtual ~WuAucMetricMsg() {} + void add_data(const Scope* exe_scope, + const paddle::platform::Place& place) override { + int label_len = 0; + const int64_t* label_data = NULL; + get_data(exe_scope, label_varname_, &label_data, &label_len); + + int pred_len = 0; + const float* pred_data = NULL; + get_data(exe_scope, pred_varname_, &pred_data, &pred_len); + + 
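// Illustrative sketch (not applied by this patch): once computeWuAuc() has an AUC
// and instance count per user, the reported WuAUC is the instance-weighted mean of
// the per-user AUCs, wuauc = sum_u(n_u * auc_u) / sum_u(n_u), while uauc is the
// unweighted mean over users. UserAuc and WeightedUserAuc are hypothetical names.
#include <vector>

struct UserAuc {
  double auc;
  double ins_num;
};

inline double WeightedUserAuc(const std::vector<UserAuc>& users) {
  double weighted_sum = 0.0, total_ins = 0.0;
  for (const auto& u : users) {
    weighted_sum += u.auc * u.ins_num;
    total_ins += u.ins_num;
  }
  // Guard against empty input, matching the 1e-10 guard in GetWuAucMetricMsg.
  return weighted_sum / (total_ins + 1e-10);
}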
int uid_len = 0; + const int64_t* uid_data = NULL; + get_data(exe_scope, uid_varname_, &uid_data, &uid_len); + PADDLE_ENFORCE_EQ(label_len, uid_len, + platform::errors::PreconditionNotMet( + "the predict data length should be consistent with " + "the label data length")); + auto cal = GetCalculator(); + cal->add_uid_data(pred_data, label_data, uid_data, label_len, place); + } + + protected: + std::string uid_varname_; + }; + + class MultiTaskMetricMsg : public MetricMsg { + public: + MultiTaskMetricMsg(const std::string& label_varname, + const std::string& pred_varname_list, int metric_phase, + const std::string& cmatch_rank_group, + const std::string& cmatch_rank_varname, + int bucket_size = 1000000) { + label_varname_ = label_varname; + cmatch_rank_varname_ = cmatch_rank_varname; + metric_phase_ = metric_phase; + calculator = new BasicAucCalculator(); + calculator->init(bucket_size); + for (auto& cmatch_rank : string::split_string(cmatch_rank_group)) { + const std::vector& cur_cmatch_rank = + string::split_string(cmatch_rank, "_"); + PADDLE_ENFORCE_EQ( + cur_cmatch_rank.size(), 2, + platform::errors::PreconditionNotMet( + "illegal multitask auc spec: %s", cmatch_rank.c_str())); + cmatch_rank_v.emplace_back(atoi(cur_cmatch_rank[0].c_str()), + atoi(cur_cmatch_rank[1].c_str())); + } + for (const auto& pred_varname : string::split_string(pred_varname_list)) { + pred_v.emplace_back(pred_varname); + } + PADDLE_ENFORCE_EQ(cmatch_rank_v.size(), pred_v.size(), + platform::errors::PreconditionNotMet( + "cmatch_rank's size [%lu] should be equal to pred " + "list's size [%lu], but ther are not equal", + cmatch_rank_v.size(), pred_v.size())); + } + virtual ~MultiTaskMetricMsg() {} + void add_data(const Scope* exe_scope, + const paddle::platform::Place& place) override { + std::vector cmatch_rank_data; + get_data(exe_scope, cmatch_rank_varname_, &cmatch_rank_data); + std::vector label_data; + get_data(exe_scope, label_varname_, &label_data); + size_t batch_size = cmatch_rank_data.size(); + PADDLE_ENFORCE_EQ( + batch_size, label_data.size(), + platform::errors::PreconditionNotMet( + "illegal batch size: batch_size[%lu] and label_data[%lu]", + batch_size, label_data.size())); + + std::vector> pred_data_list(pred_v.size()); + for (size_t i = 0; i < pred_v.size(); ++i) { + get_data(exe_scope, pred_v[i], &pred_data_list[i]); + } + for (size_t i = 0; i < pred_data_list.size(); ++i) { + PADDLE_ENFORCE_EQ( + batch_size, pred_data_list[i].size(), + platform::errors::PreconditionNotMet( + "illegal batch size: batch_size[%lu] and pred_data[%lu]", + batch_size, pred_data_list[i].size())); + } + auto cal = GetCalculator(); + std::lock_guard lock(cal->table_mutex()); + for (size_t i = 0; i < batch_size; ++i) { + auto cmatch_rank_it = + std::find(cmatch_rank_v.begin(), cmatch_rank_v.end(), + parse_cmatch_rank(cmatch_rank_data[i])); + if (cmatch_rank_it != cmatch_rank_v.end()) { + cal->add_unlock_data(pred_data_list[std::distance( + cmatch_rank_v.begin(), cmatch_rank_it)][i], + label_data[i]); + } + } + } + + protected: + std::vector> cmatch_rank_v; + std::vector pred_v; + std::string cmatch_rank_varname_; + }; + + class CmatchRankMetricMsg : public MetricMsg { + public: + CmatchRankMetricMsg(const std::string& label_varname, + const std::string& pred_varname, int metric_phase, + const std::string& cmatch_rank_group, + const std::string& cmatch_rank_varname, + bool ignore_rank = false, int bucket_size = 1000000) { + label_varname_ = label_varname; + pred_varname_ = pred_varname; + cmatch_rank_varname_ = 
cmatch_rank_varname; + metric_phase_ = metric_phase; + ignore_rank_ = ignore_rank; + calculator = new BasicAucCalculator(); + calculator->init(bucket_size); + for (auto& cmatch_rank : string::split_string(cmatch_rank_group)) { + if (ignore_rank) { // CmatchAUC + cmatch_rank_v.emplace_back(atoi(cmatch_rank.c_str()), 0); + continue; + } + const std::vector& cur_cmatch_rank = + string::split_string(cmatch_rank, "_"); + PADDLE_ENFORCE_EQ( + cur_cmatch_rank.size(), 2, + platform::errors::PreconditionNotMet( + "illegal cmatch_rank auc spec: %s", cmatch_rank.c_str())); + cmatch_rank_v.emplace_back(atoi(cur_cmatch_rank[0].c_str()), + atoi(cur_cmatch_rank[1].c_str())); + } + } + virtual ~CmatchRankMetricMsg() {} + void add_data(const Scope* exe_scope, + const paddle::platform::Place& place) override { + std::vector cmatch_rank_data; + get_data(exe_scope, cmatch_rank_varname_, &cmatch_rank_data); + std::vector label_data; + get_data(exe_scope, label_varname_, &label_data); + std::vector pred_data; + get_data(exe_scope, pred_varname_, &pred_data); + size_t batch_size = cmatch_rank_data.size(); + PADDLE_ENFORCE_EQ( + batch_size, label_data.size(), + platform::errors::PreconditionNotMet( + "illegal batch size: cmatch_rank[%lu] and label_data[%lu]", + batch_size, label_data.size())); + PADDLE_ENFORCE_EQ( + batch_size, pred_data.size(), + platform::errors::PreconditionNotMet( + "illegal batch size: cmatch_rank[%lu] and pred_data[%lu]", + batch_size, pred_data.size())); + auto cal = GetCalculator(); + std::lock_guard lock(cal->table_mutex()); + for (size_t i = 0; i < batch_size; ++i) { + const auto& cur_cmatch_rank = parse_cmatch_rank(cmatch_rank_data[i]); + for (size_t j = 0; j < cmatch_rank_v.size(); ++j) { + bool is_matched = false; + if (ignore_rank_) { + is_matched = cmatch_rank_v[j].first == cur_cmatch_rank.first; + } else { + is_matched = cmatch_rank_v[j] == cur_cmatch_rank; + } + if (is_matched) { + cal->add_unlock_data(pred_data[i], label_data[i]); + break; + } + } + } + } + + protected: + std::vector> cmatch_rank_v; + std::string cmatch_rank_varname_; + bool ignore_rank_; + }; + + class MaskMetricMsg : public MetricMsg { + public: + MaskMetricMsg(const std::string& label_varname, + const std::string& pred_varname, int metric_phase, + const std::string& mask_varname, int bucket_size = 1000000) { + label_varname_ = label_varname; + pred_varname_ = pred_varname; + mask_varname_ = mask_varname; + metric_phase_ = metric_phase; + calculator = new BasicAucCalculator(); + calculator->init(bucket_size); + } + virtual ~MaskMetricMsg() {} + void add_data(const Scope* exe_scope, + const paddle::platform::Place& place) override { + int label_len = 0; + const int64_t* label_data = NULL; + get_data(exe_scope, label_varname_, &label_data, &label_len); + + int pred_len = 0; + const float* pred_data = NULL; + get_data(exe_scope, pred_varname_, &pred_data, &pred_len); + + int mask_len = 0; + const int64_t* mask_data = NULL; + get_data(exe_scope, mask_varname_, &mask_data, &mask_len); + PADDLE_ENFORCE_EQ(label_len, mask_len, + platform::errors::PreconditionNotMet( + "the predict data length should be consistent with " + "the label data length")); + auto cal = GetCalculator(); + cal->add_mask_data(pred_data, label_data, mask_data, label_len, place); + } + + protected: + std::string mask_varname_; + }; + + class CmatchRankMaskMetricMsg : public MetricMsg { + public: + CmatchRankMaskMetricMsg(const std::string& label_varname, + const std::string& pred_varname, int metric_phase, + const std::string& cmatch_rank_group, 
+ const std::string& cmatch_rank_varname, + bool ignore_rank = false, + const std::string& mask_varname = "", + int bucket_size = 1000000) { + label_varname_ = label_varname; + pred_varname_ = pred_varname; + cmatch_rank_varname_ = cmatch_rank_varname; + metric_phase_ = metric_phase; + ignore_rank_ = ignore_rank; + mask_varname_ = mask_varname; + calculator = new BasicAucCalculator(); + calculator->init(bucket_size); + for (auto& cmatch_rank : string::split_string(cmatch_rank_group)) { + if (ignore_rank) { // CmatchAUC + cmatch_rank_v.emplace_back(atoi(cmatch_rank.c_str()), 0); + continue; + } + const std::vector& cur_cmatch_rank = + string::split_string(cmatch_rank, "_"); + PADDLE_ENFORCE_EQ( + cur_cmatch_rank.size(), 2, + platform::errors::PreconditionNotMet( + "illegal cmatch_rank auc spec: %s", cmatch_rank.c_str())); + cmatch_rank_v.emplace_back(atoi(cur_cmatch_rank[0].c_str()), + atoi(cur_cmatch_rank[1].c_str())); + } + } + virtual ~CmatchRankMaskMetricMsg() {} + void add_data(const Scope* exe_scope, + const paddle::platform::Place& place) override { + std::vector cmatch_rank_data; + get_data(exe_scope, cmatch_rank_varname_, &cmatch_rank_data); + std::vector label_data; + get_data(exe_scope, label_varname_, &label_data); + std::vector pred_data; + get_data(exe_scope, pred_varname_, &pred_data); + size_t batch_size = cmatch_rank_data.size(); + PADDLE_ENFORCE_EQ( + batch_size, label_data.size(), + platform::errors::PreconditionNotMet( + "illegal batch size: cmatch_rank[%lu] and label_data[%lu]", + batch_size, label_data.size())); + PADDLE_ENFORCE_EQ( + batch_size, pred_data.size(), + platform::errors::PreconditionNotMet( + "illegal batch size: cmatch_rank[%lu] and pred_data[%lu]", + batch_size, pred_data.size())); + + std::vector mask_data; + if (!mask_varname_.empty()) { + get_data(exe_scope, mask_varname_, &mask_data); + PADDLE_ENFORCE_EQ( + batch_size, mask_data.size(), + platform::errors::PreconditionNotMet( + "illegal batch size: cmatch_rank[%lu] and mask_data[%lu]", + batch_size, mask_data.size())); + } + + auto cal = GetCalculator(); + std::lock_guard lock(cal->table_mutex()); + for (size_t i = 0; i < batch_size; ++i) { + const auto& cur_cmatch_rank = parse_cmatch_rank(cmatch_rank_data[i]); + for (size_t j = 0; j < cmatch_rank_v.size(); ++j) { + if (!mask_data.empty() && !mask_data[i]) { + continue; + } + bool is_matched = false; + if (ignore_rank_) { + is_matched = cmatch_rank_v[j].first == cur_cmatch_rank.first; + } else { + is_matched = cmatch_rank_v[j] == cur_cmatch_rank; + } + if (is_matched) { + cal->add_unlock_data(pred_data[i], label_data[i]); + break; + } + } + } + } + + protected: + std::vector> cmatch_rank_v; + std::string cmatch_rank_varname_; + bool ignore_rank_; + std::string mask_varname_; + }; + + static std::shared_ptr GetInstance() { + // PADDLE_ENFORCE_EQ( + // s_instance_ == nullptr, false, + // platform::errors::PreconditionNotMet( + // "GetInstance failed in Metric, you should use SetInstance + // firstly")); + return s_instance_; + } + + static std::shared_ptr SetInstance() { + static std::mutex mutex; + std::lock_guard lock(mutex); + if (nullptr == s_instance_) { + VLOG(3) << "s_instance_ is null"; + s_instance_.reset(new paddle::framework::Metric()); + } else { + LOG(WARNING) << "You have already used SetInstance() before"; + } + return s_instance_; + } + + const std::vector GetMetricNameList( + int metric_phase = -1) const { + VLOG(0) << "Want to Get metric phase: " << metric_phase; + if (metric_phase == -1) { + return metric_name_list_; + } else { + 
std::vector ret; + for (const auto& name : metric_name_list_) { + const auto iter = metric_lists_.find(name); + PADDLE_ENFORCE_NE( + iter, metric_lists_.end(), + platform::errors::InvalidArgument( + "The metric name you provided is not registered.")); + + if (iter->second->MetricPhase() == metric_phase) { + VLOG(3) << name << "'s phase is " << iter->second->MetricPhase() + << ", we want"; + ret.push_back(name); + } else { + VLOG(3) << name << "'s phase is " << iter->second->MetricPhase() + << ", not we want"; + } + } + return ret; + } + } + int Phase() const { return phase_; } + int PhaseNum() const { return phase_num_; } + void FlipPhase() { phase_ = (phase_ + 1) % phase_num_; } + std::map& GetMetricList() { return metric_lists_; } + + void InitMetric(const std::string& method, const std::string& name, + const std::string& label_varname, + const std::string& pred_varname, + const std::string& cmatch_rank_varname, + const std::string& mask_varname, + const std::string& uid_varname, int metric_phase, + const std::string& cmatch_rank_group, bool ignore_rank, + int bucket_size = 1000000) { + if (method == "AucCalculator") { + metric_lists_.emplace(name, new MetricMsg(label_varname, pred_varname, + metric_phase, bucket_size)); + } else if (method == "MultiTaskAucCalculator") { + metric_lists_.emplace( + name, new MultiTaskMetricMsg(label_varname, pred_varname, + metric_phase, cmatch_rank_group, + cmatch_rank_varname, bucket_size)); + } else if (method == "CmatchRankAucCalculator") { + metric_lists_.emplace(name, new CmatchRankMetricMsg( + label_varname, pred_varname, metric_phase, + cmatch_rank_group, cmatch_rank_varname, + ignore_rank, bucket_size)); + } else if (method == "MaskAucCalculator") { + metric_lists_.emplace( + name, new MaskMetricMsg(label_varname, pred_varname, metric_phase, + mask_varname, bucket_size)); + } else if (method == "CmatchRankMaskAucCalculator") { + metric_lists_.emplace(name, new CmatchRankMaskMetricMsg( + label_varname, pred_varname, metric_phase, + cmatch_rank_group, cmatch_rank_varname, + ignore_rank, mask_varname, bucket_size)); + } else if (method == "WuAucCalculator") { + metric_lists_.emplace( + name, new WuAucMetricMsg(label_varname, pred_varname, uid_varname, + metric_phase, bucket_size)); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "PSLIB Metrics only support AucCalculator, MultiTaskAucCalculator, " + "CmatchRankAucCalculator, MaskAucCalculator, WuAucCalculator and " + "CmatchRankMaskAucCalculator")); + } + metric_name_list_.emplace_back(name); + } + + const std::vector GetMetricMsg(const std::string& name) { + const auto iter = metric_lists_.find(name); + PADDLE_ENFORCE_NE(iter, metric_lists_.end(), + platform::errors::InvalidArgument( + "The metric name you provided is not registered.")); + std::vector metric_return_values_(8, 0.0); + auto* auc_cal_ = iter->second->GetCalculator(); + auc_cal_->compute(); + metric_return_values_[0] = auc_cal_->auc(); + metric_return_values_[1] = auc_cal_->bucket_error(); + metric_return_values_[2] = auc_cal_->mae(); + metric_return_values_[3] = auc_cal_->rmse(); + metric_return_values_[4] = auc_cal_->actual_ctr(); + metric_return_values_[5] = auc_cal_->predicted_ctr(); + metric_return_values_[6] = + auc_cal_->actual_ctr() / auc_cal_->predicted_ctr(); + metric_return_values_[7] = auc_cal_->size(); + auc_cal_->reset(); + return metric_return_values_; + } + + const std::vector GetWuAucMetricMsg(const std::string& name) { + const auto iter = metric_lists_.find(name); + PADDLE_ENFORCE_NE(iter, 
metric_lists_.end(), + platform::errors::InvalidArgument( + "The metric name you provided is not registered.")); + VLOG(0) << "begin GetWuAucMetricMsg"; + std::vector metric_return_values_(6, 0.0); + auto* auc_cal_ = iter->second->GetCalculator(); + auc_cal_->computeWuAuc(); + metric_return_values_[0] = auc_cal_->user_cnt(); + metric_return_values_[1] = auc_cal_->size(); + metric_return_values_[2] = auc_cal_->uauc(); + metric_return_values_[3] = auc_cal_->wuauc(); + metric_return_values_[4] = + metric_return_values_[2] / (metric_return_values_[0] + 1e-10); + metric_return_values_[5] = + metric_return_values_[3] / (metric_return_values_[1] + 1e-10); + +#if defined(PADDLE_WITH_GLOO) + auto gloo_wrapper = paddle::framework::GlooWrapper::GetInstance(); + if (gloo_wrapper->Size() > 1) { + auto global_metric_return_values_ = + gloo_wrapper->AllReduce(metric_return_values_, "sum"); + global_metric_return_values_[4] = + global_metric_return_values_[2] / + (global_metric_return_values_[0] + 1e-10); + global_metric_return_values_[5] = + global_metric_return_values_[3] / + (global_metric_return_values_[1] + 1e-10); + auc_cal_->reset_records(); + return global_metric_return_values_; + } else { + auc_cal_->reset_records(); + return metric_return_values_; + } +#else + auc_cal_->reset_records(); + return metric_return_values_; +#endif + } + + private: + static std::shared_ptr s_instance_; + + // Metric Related + int phase_ = 1; + int phase_num_ = 2; + std::map metric_lists_; + std::vector metric_name_list_; +}; +} // namespace framework +} // namespace paddle +#endif diff --git a/paddle/fluid/framework/infershape_utils.cc b/paddle/fluid/framework/infershape_utils.cc index 08b945159ad..e1d7190a9e4 100644 --- a/paddle/fluid/framework/infershape_utils.cc +++ b/paddle/fluid/framework/infershape_utils.cc @@ -18,12 +18,12 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/pten_utils.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/pten/core/compat/arg_map_context.h" +#include "paddle/pten/core/compat/convert_utils.h" #include "paddle/pten/core/compat/op_utils.h" -#include "paddle/pten/core/compat_utils.h" -#include "paddle/pten/core/convert_utils.h" #include "paddle/pten/core/dense_tensor.h" #include "paddle/pten/core/infermeta_utils.h" #include "paddle/pten/core/meta_tensor.h" +#include "paddle/pten/core/tensor_utils.h" namespace paddle { namespace framework { @@ -64,6 +64,16 @@ class InferShapeArgumentMappingContext : public pten::ArgumentMappingContext { return var_types[0] == proto::VarType::SELECTED_ROWS; } + bool IsDenseTensorOutput(const std::string& name) const override { + auto var_types = ctx_.GetOutputsVarType(name); + return var_types[0] == proto::VarType::LOD_TENSOR; + } + + bool IsSelectedRowsOutput(const std::string& name) const override { + auto var_types = ctx_.GetOutputsVarType(name); + return var_types[0] == proto::VarType::SELECTED_ROWS; + } + private: const InferShapeContext& ctx_; }; @@ -116,8 +126,9 @@ class CompatMetaTensor : public pten::MetaTensor { auto* var = BOOST_GET_CONST(Variable*, var_); return var->Get().layout(); } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported get layout for VarDesc now.")); + // NOTE(chenweihang): do nothing + // Unsupported get layout for VarDesc now + return DataLayout::UNDEFINED; } } @@ -125,7 +136,7 @@ class CompatMetaTensor : public pten::MetaTensor { if (is_runtime_) { auto* var = BOOST_GET(Variable*, var_); LoDTensor* tensor = var->GetMutable(); - pten::CompatibleDenseTensorUtils::GetMutableMeta( + pten::DenseTensorUtils::GetMutableMeta( static_cast(tensor)) ->dims = dims; } else { @@ -138,7 +149,7 @@ class CompatMetaTensor : public pten::MetaTensor { if (is_runtime_) { auto* var = BOOST_GET(Variable*, var_); LoDTensor* tensor = var->GetMutable(); - pten::CompatibleDenseTensorUtils::GetMutableMeta( + pten::DenseTensorUtils::GetMutableMeta( static_cast(tensor)) ->dtype = dtype; } else { @@ -151,12 +162,12 @@ class CompatMetaTensor : public pten::MetaTensor { if (is_runtime_) { auto* var = BOOST_GET(Variable*, var_); LoDTensor* tensor = var->GetMutable(); - pten::CompatibleDenseTensorUtils::GetMutableMeta( + pten::DenseTensorUtils::GetMutableMeta( static_cast(tensor)) ->layout = layout; } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported set layout for VarDesc now.")); + // NOTE(chenweihang): do nothing + // Unsupported set layout for VarDesc now } } @@ -164,7 +175,7 @@ class CompatMetaTensor : public pten::MetaTensor { if (is_runtime_) { auto* var = BOOST_GET(Variable*, var_); LoDTensor* tensor = var->GetMutable(); - pten::CompatibleDenseTensorUtils::GetMutableMeta( + pten::DenseTensorUtils::GetMutableMeta( static_cast(tensor)) ->lod = static_cast(meta_tensor).GetRuntimeLoD(); @@ -175,6 +186,14 @@ class CompatMetaTensor : public pten::MetaTensor { } } + void share_meta(const MetaTensor& meta_tensor) override { + set_dims(meta_tensor.dims()); + set_dtype(meta_tensor.dtype()); + // VarDesc doesn't contains layout, so we cannot share layout + // set_layout(meta_tensor.layout()); + share_lod(meta_tensor); + } + private: const LoD& GetRuntimeLoD() const { auto* var = BOOST_GET_CONST(Variable*, var_); diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc index 64d9bf60353..0f12261d3c4 100644 --- 
a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc @@ -95,8 +95,17 @@ void CPUQuantizePass::QuantizeInput(Graph* g, Node* op, Node* input, q_desc.SetAttr("Shift", shift); q_desc.SetAttr("is_negative_input", !is_input_unsigned); - q_desc.SetAttr("output_format", - Has("data_layout") ? Get("data_layout") : "NHWC"); + // fix to fc format error + if (op->Op()->Type() == "fc" && + op->Op()->GetAttrIfExists("in_num_col_dims") == 2) { + q_desc.SetAttr("output_format", Has("data_layout") + ? Get("data_layout") + : "NCHW"); + } else { + q_desc.SetAttr("output_format", Has("data_layout") + ? Get("data_layout") + : "NHWC"); + } auto quantize_op = g->CreateOpNode(&q_desc); // OpDesc will be copied. // update op's input diff --git a/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc b/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc index 63cd4f1f8ef..bcd7bedcc43 100644 --- a/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc +++ b/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc @@ -192,8 +192,7 @@ void ShuffleChannelDetectPass::ApplyImpl(ir::Graph* graph) const { if (reshape2_shape[1] != reshape1_shape[2] * reshape1_shape[1]) return; // trans on channel dim if (trans_axis[0] != 0 || trans_axis[3] != 3 || trans_axis[4] != 4) return; - - if (group != 1) { + if (group != 1 && i_c != 1) { if (trans_axis[1] != 2 && trans_axis[2] != 1) { return; } diff --git a/paddle/fluid/framework/library_type.h b/paddle/fluid/framework/library_type.h index 6fdd128b0d3..0cb6cac26a6 100644 --- a/paddle/fluid/framework/library_type.h +++ b/paddle/fluid/framework/library_type.h @@ -26,6 +26,7 @@ enum class LibraryType { kPlain = 0, kMKLDNN = 1, kCUDNN = 2, + kKP = 3, }; inline std::string LibraryTypeToString(const LibraryType& library_type) { @@ -36,10 +37,12 @@ inline std::string LibraryTypeToString(const LibraryType& library_type) { return "MKLDNN"; case LibraryType::kCUDNN: return "CUDNN"; + case LibraryType::kKP: + return "KP"; default: PADDLE_THROW(platform::errors::Unimplemented( "Unknown LibraryType code (%d), only supports library type include " - "PLAIN(0), MKLDNN(1), CUDNN(2).", + "PLAIN(0), MKLDNN(1), CUDNN(2), KP(3).", static_cast(library_type))); } } @@ -57,6 +60,8 @@ inline LibraryType StringToLibraryType(const char* ctype) { return LibraryType::kCUDNN; // To be compatible with register macro. // CPU, CUDA, PLAIN are same library type. 
+ } else if (s == std::string("KP")) { + return LibraryType::kKP; } else if (s == std::string("CPU")) { return LibraryType::kPlain; } else if (s == std::string("XPU")) { diff --git a/paddle/fluid/framework/new_executor/interpretercore_event_garbage_collector.cc b/paddle/fluid/framework/new_executor/interpretercore_event_garbage_collector.cc index ba81ee9166f..4ba83e6a30c 100644 --- a/paddle/fluid/framework/new_executor/interpretercore_event_garbage_collector.cc +++ b/paddle/fluid/framework/new_executor/interpretercore_event_garbage_collector.cc @@ -25,7 +25,8 @@ namespace paddle { namespace framework { InterpreterCoreEventGarbageCollector::InterpreterCoreEventGarbageCollector() { - WorkQueueOptions options(/*num_threads*/ 1, /*allow_spinning*/ true, + WorkQueueOptions options(/*name*/ "GarbageCollector", /*num_threads*/ 1, + /*allow_spinning*/ true, /*track_task*/ false); queue_ = CreateSingleThreadedWorkQueue(options); } diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.cc b/paddle/fluid/framework/new_executor/interpretercore_util.cc index fb0951e87aa..50725e5eea5 100644 --- a/paddle/fluid/framework/new_executor/interpretercore_util.cc +++ b/paddle/fluid/framework/new_executor/interpretercore_util.cc @@ -24,7 +24,6 @@ PADDLE_DEFINE_EXPORTED_bool( new_executor_sequential_run, false, "Enable sequential execution for standalone executor, used for debug"); -DECLARE_bool(run_pten_kernel); namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.h b/paddle/fluid/framework/new_executor/interpretercore_util.h index 5f403613c6b..81c05df62ec 100644 --- a/paddle/fluid/framework/new_executor/interpretercore_util.h +++ b/paddle/fluid/framework/new_executor/interpretercore_util.h @@ -58,13 +58,15 @@ class AsyncWorkQueue { : host_num_thread_(host_num_threads) { std::vector group_options; // for execute host Kernel - group_options.emplace_back(/*num_threads*/ host_num_threads, + group_options.emplace_back(/*name*/ "HostTasks", + /*num_threads*/ host_num_threads, /*allow_spinning*/ true, /*track_task*/ false, /*detached*/ true, /*events_waiter*/ waiter); // for launch device Kernel - group_options.emplace_back(/*num_threads*/ 1, + group_options.emplace_back(/*name*/ "DeviceKernelLaunch", + /*num_threads*/ 1, /*allow_spinning*/ true, /*track_task*/ false, /*detached*/ true, diff --git a/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h b/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h index 37044d3c19b..2ad76562c15 100644 --- a/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h +++ b/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h @@ -12,10 +12,12 @@ #include #include #include +#include "glog/logging.h" #include "paddle/fluid/framework/new_executor/workqueue/event_count.h" #include "paddle/fluid/framework/new_executor/workqueue/run_queue.h" #include "paddle/fluid/framework/new_executor/workqueue/thread_environment.h" #include "paddle/fluid/platform/os_info.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" namespace paddle { namespace framework { @@ -26,7 +28,7 @@ class ThreadPoolTempl { typedef typename Environment::Task Task; typedef RunQueue Queue; - ThreadPoolTempl(int num_threads, bool allow_spinning, + ThreadPoolTempl(const std::string& name, int num_threads, bool allow_spinning, Environment env = Environment()) : env_(env), allow_spinning_(allow_spinning), @@ -38,7 +40,8 @@ class ThreadPoolTempl { cancelled_(false), 
ec_(num_threads), num_threads_(num_threads), - thread_data_(num_threads) { + thread_data_(num_threads), + name_(name) { // Calculate coprimes of all numbers [1, num_threads]. // Coprimes are used for random walks over all threads in Steal // and NonEmptyQueueIndex. Iteration is based on the fact that if we take @@ -240,9 +243,13 @@ class ThreadPoolTempl { EventCount ec_; const int num_threads_; std::vector thread_data_; + std::string name_; // Main worker thread loop. void WorkerLoop(int thread_id) { + std::string thr_name = name_ + "_thread_" + std::to_string(thread_id); + VLOG(1) << thr_name << " started "; + platform::SetCurrentThreadName(thr_name); PerThread* pt = GetPerThread(); pt->pool = this; pt->rand = GlobalThreadIdHash(); @@ -401,6 +408,7 @@ class ThreadPoolTempl { ec_.Notify(true); return false; } + platform::RecordEvent("SleepWaitForWork"); ec_.CommitWait(waiter); blocked_--; return true; diff --git a/paddle/fluid/framework/new_executor/workqueue/thread_data_registry.h b/paddle/fluid/framework/new_executor/workqueue/thread_data_registry.h new file mode 100644 index 00000000000..a4a913cdff2 --- /dev/null +++ b/paddle/fluid/framework/new_executor/workqueue/thread_data_registry.h @@ -0,0 +1,126 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include + +namespace paddle { +namespace framework { + +static uint64_t main_tid = + std::hash()(std::this_thread::get_id()); + +template +class ThreadDataRegistry { + class ThreadDataHolder; + + public: + // Singleton + static ThreadDataRegistry& GetInstance() { + static ThreadDataRegistry instance; + return instance; + } + + T* GetMutableCurrentThreadData() { return &CurrentThreadData(); } + + const T& GetCurrentThreadData() { return CurrentThreadData(); } + + template ::value>> + void SetCurrentThreadData(const T& val) { + std::lock_guard lock(lock_); + CurrentThreadData() = val; + } + + // Returns current snapshot of all threads. Make sure there is no thread + // create/destory when using it. + template ::value>> + std::unordered_map GetAllThreadDataByValue() { + std::unordered_map data_copy; + std::lock_guard lock(lock_); + data_copy.reserve(tid_map_.size()); + for (auto& kv : tid_map_) { + data_copy.emplace(kv.first, kv.second->GetData()); + } + return data_copy; + } + + // Returns current snapshot of all threads. Make sure there is no thread + // create/destory when using it. 
+ std::unordered_map> + GetAllThreadDataByRef() { + std::unordered_map> data_ref; + std::lock_guard lock(lock_); + data_ref.reserve(tid_map_.size()); + for (auto& kv : tid_map_) { + data_ref.emplace(kv.first, std::ref(kv.second->GetData())); + } + return data_ref; + } + + void RegisterData(uint64_t tid, ThreadDataHolder* tls_obj) { + std::lock_guard lock(lock_); + tid_map_[tid] = tls_obj; + } + + void UnregisterData(uint64_t tid) { + if (tid == main_tid) { + return; + } + std::lock_guard lock(lock_); + tid_map_.erase(tid); + } + + private: + class ThreadDataHolder { + public: + ThreadDataHolder() { + tid_ = std::hash()(std::this_thread::get_id()); + ThreadDataRegistry::GetInstance().RegisterData(tid_, this); + } + + ~ThreadDataHolder() { + ThreadDataRegistry::GetInstance().UnregisterData(tid_); + } + + T& GetData() { return data_; } + + private: + uint64_t tid_; + T data_; + }; + + ThreadDataRegistry() = default; + + ThreadDataRegistry(const ThreadDataRegistry&) = delete; + + ThreadDataRegistry& operator=(const ThreadDataRegistry&) = delete; + + T& CurrentThreadData() { + static thread_local ThreadDataHolder thread_data; + return thread_data.GetData(); + } + + std::mutex lock_; + std::unordered_map tid_map_; // not owned +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/new_executor/workqueue/workqueue.cc b/paddle/fluid/framework/new_executor/workqueue/workqueue.cc index 45694349168..07c5298c2f2 100644 --- a/paddle/fluid/framework/new_executor/workqueue/workqueue.cc +++ b/paddle/fluid/framework/new_executor/workqueue/workqueue.cc @@ -11,6 +11,17 @@ namespace paddle { namespace framework { + +void WorkQueueOptions::Validate() const { + PADDLE_ENFORCE_GT(name.size(), 0, + platform::errors::InvalidArgument( + "WorkQueueOptions.name must be nonempty")); + PADDLE_ENFORCE_EQ( + name.find('_'), std::string::npos, + platform::errors::InvalidArgument( + "WorkQueueOptions.name shouldn't contain an underline")); +} + namespace { using TaskTracker = TaskTracker; @@ -30,7 +41,7 @@ class WorkQueueImpl : public WorkQueue { destruct_notifier_ = options.events_waiter->RegisterEvent(kQueueDestructEvent); } - queue_ = new NonblockingThreadPool(options_.num_threads, + queue_ = new NonblockingThreadPool(options_.name, options_.num_threads, options_.allow_spinning); } @@ -121,8 +132,8 @@ WorkQueueGroupImpl::WorkQueueGroupImpl( destruct_notifier_ = options.events_waiter->RegisterEvent(kQueueDestructEvent); } - queues_[idx] = new (&queues_storage_[idx]) - NonblockingThreadPool(options.num_threads, options.allow_spinning); + queues_[idx] = new (&queues_storage_[idx]) NonblockingThreadPool( + options.name, options.num_threads, options.allow_spinning); } } @@ -182,6 +193,8 @@ void WorkQueueGroupImpl::Cancel() { std::unique_ptr CreateSingleThreadedWorkQueue( const WorkQueueOptions& options) { + options.Validate(); + // extra check PADDLE_ENFORCE_EQ(options.num_threads, 1u, platform::errors::InvalidArgument( "For a SingleThreadedWorkQueue, " @@ -192,6 +205,8 @@ std::unique_ptr CreateSingleThreadedWorkQueue( std::unique_ptr CreateMultiThreadedWorkQueue( const WorkQueueOptions& options) { + options.Validate(); + // extra check PADDLE_ENFORCE_GT( options.num_threads, 1u, platform::errors::InvalidArgument("For a MultiThreadedWorkQueue, " @@ -207,6 +222,9 @@ std::unique_ptr CreateWorkQueueGroup( platform::errors::InvalidArgument( "For a WorkQueueGroup, the number of WorkQueueOptions " "must be greater than 1.")); + for (const auto& opts : queues_options) { + opts.Validate(); + } 
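// Illustrative usage sketch (not applied by this patch): ThreadDataRegistry<T>
// above keeps one T per thread via a thread_local holder that registers itself by
// thread id, so each writer touches only its own slot and a reader can later take
// a snapshot of every thread's value. OpCounter and the two functions below are
// hypothetical and assume the paddle::framework namespace is in scope.
#include <cstdint>

struct OpCounter {
  uint64_t launched = 0;
};

void OnKernelLaunch() {
  // Each worker mutates only its own per-thread counter.
  ThreadDataRegistry<OpCounter>::GetInstance()
      .GetMutableCurrentThreadData()
      ->launched++;
}

uint64_t TotalLaunched() {
  uint64_t total = 0;
  // Snapshot by value; as the comment above notes, callers must ensure no threads
  // are created or destroyed while iterating.
  for (const auto& kv :
       ThreadDataRegistry<OpCounter>::GetInstance().GetAllThreadDataByValue()) {
    total += kv.second.launched;
  }
  return total;
}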
std::unique_ptr ptr(new WorkQueueGroupImpl(queues_options)); return ptr; } diff --git a/paddle/fluid/framework/new_executor/workqueue/workqueue.h b/paddle/fluid/framework/new_executor/workqueue/workqueue.h index 068c54a21a4..6c8abee2f01 100644 --- a/paddle/fluid/framework/new_executor/workqueue/workqueue.h +++ b/paddle/fluid/framework/new_executor/workqueue/workqueue.h @@ -16,6 +16,7 @@ #include #include +#include #include namespace paddle { @@ -27,19 +28,31 @@ constexpr const char* kQueueDestructEvent = "QueueDestruct"; class EventsWaiter; struct WorkQueueOptions { - WorkQueueOptions(size_t num_threads, bool allow_spinning, bool track_task) - : num_threads(num_threads), + WorkQueueOptions(const std::string& name, size_t num_threads, + bool allow_spinning, bool track_task) + : name(name), + num_threads(num_threads), allow_spinning(allow_spinning), - track_task(track_task) {} - - WorkQueueOptions(size_t num_threads, bool allow_spinning, bool track_task, - bool detached, EventsWaiter* waiter) - : num_threads(num_threads), + track_task(track_task) { + Validate(); + } + + WorkQueueOptions(const std::string& name, size_t num_threads, + bool allow_spinning, bool track_task, bool detached, + EventsWaiter* waiter) + : name(name), + num_threads(num_threads), allow_spinning(allow_spinning), track_task(track_task), detached(detached), - events_waiter(waiter) {} + events_waiter(waiter) { + Validate(); + } + + // throw an exception if there is an invalid option + void Validate() const; + std::string name; size_t num_threads; bool allow_spinning; // If you need to blocking the calling thread to wait "queue empty", set diff --git a/paddle/fluid/framework/new_executor/workqueue/workqueue_test.cc b/paddle/fluid/framework/new_executor/workqueue/workqueue_test.cc index e06beb623be..25448da8f10 100644 --- a/paddle/fluid/framework/new_executor/workqueue/workqueue_test.cc +++ b/paddle/fluid/framework/new_executor/workqueue/workqueue_test.cc @@ -44,7 +44,8 @@ TEST(WorkQueue, TestSingleThreadedWorkQueue) { constexpr unsigned kLoopNum = 1000000; // CreateSingleThreadedWorkQueue EventsWaiter events_waiter; - WorkQueueOptions options(/*num_threads*/ 1, /*allow_spinning*/ true, + WorkQueueOptions options(/*name*/ "SingleThreadedWorkQueueForTesting", + /*num_threads*/ 1, /*allow_spinning*/ true, /*track_task*/ true, /*detached*/ true, &events_waiter); auto work_queue = CreateSingleThreadedWorkQueue(options); @@ -78,7 +79,8 @@ TEST(WorkQueue, TestMultiThreadedWorkQueue) { constexpr unsigned kLoopNum = 1000000; // CreateMultiThreadedWorkQueue EventsWaiter events_waiter; - WorkQueueOptions options(/*num_threads*/ 10, /*allow_spinning*/ true, + WorkQueueOptions options(/*name*/ "MultiThreadedWorkQueueForTesting", + /*num_threads*/ 10, /*allow_spinning*/ true, /*track_task*/ true, /*detached*/ false, &events_waiter); auto work_queue = CreateMultiThreadedWorkQueue(options); @@ -117,10 +119,12 @@ TEST(WorkQueue, TestWorkQueueGroup) { constexpr unsigned kLoopNum = 1000000; // ThreadedWorkQueueGroup EventsWaiter events_waiter; - WorkQueueOptions sq_options(/*num_threads*/ 1, /*allow_spinning*/ true, + WorkQueueOptions sq_options(/*name*/ "SingleThreadedWorkQueueForTesting", + /*num_threads*/ 1, /*allow_spinning*/ true, /*track_task*/ true, /*detached*/ false, &events_waiter); - WorkQueueOptions mq_options(/*num_threads*/ 10, /*allow_spinning*/ true, + WorkQueueOptions mq_options(/*name*/ "MultiThreadedWorkQueueForTesting", + /*num_threads*/ 10, /*allow_spinning*/ true, /*track_task*/ true, /*detached*/ false, &events_waiter); 
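// Illustrative sketch (not applied by this patch): WorkQueueOptions now requires a
// name, and Validate() rejects empty names as well as names containing '_' because
// worker threads are named "<name>_thread_<i>". A minimal setup under the new
// constructor signature (the queue name chosen here is hypothetical):
WorkQueueOptions opts(/*name*/ "ExampleQueue",  // must be non-empty, no '_'
                      /*num_threads*/ 1,
                      /*allow_spinning*/ true,
                      /*track_task*/ false);
auto example_queue = CreateSingleThreadedWorkQueue(opts);
// WorkQueueOptions bad("Example_Queue", 1, true, false);  // Validate() would
// reject the underscore in this name.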
auto queue_group = CreateWorkQueueGroup({sq_options, mq_options}); diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 426b5ac8ffd..50c315bf038 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -56,7 +56,6 @@ DECLARE_bool(check_nan_inf); DECLARE_bool(enable_unused_var_check); PADDLE_DEFINE_EXPORTED_int32(inner_op_parallelism, 0, "number of threads for inner op"); -DECLARE_bool(run_pten_kernel); DECLARE_bool(run_kp_kernel); namespace paddle { @@ -1386,6 +1385,7 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const { auto expected_kernel_key = InnerGetExpectedKernelType(ctx); auto kernel_iter = kernels.find(expected_kernel_key); + #ifdef PADDLE_WITH_MKLDNN // workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set if (kernel_iter == kernels.end() && @@ -1408,6 +1408,22 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const { kernel_iter = kernels.find(expected_kernel_key); } #endif + +#ifdef PADDLE_WITH_XPU_KP + bool use_xpu_kp_kernel_rt = + FLAGS_run_kp_kernel && + paddle::platform::is_xpu_kp_support_op(type_, expected_kernel_key); + bool use_xpu_kp_kernel_debug = + paddle::platform::is_in_xpu_kpwhite_list(type_); + if (platform::is_xpu_place(expected_kernel_key.place_) && + (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug)) { + expected_kernel_key.library_type_ = LibraryType::kKP; + kernel_iter = kernels.find(expected_kernel_key); + VLOG(3) << "using XPU KP kernel: " << type_ + << ", using_kernel_key:" << expected_kernel_key; + } +#endif + #ifdef PADDLE_WITH_IPU if (kernel_iter == kernels.end() && platform::is_ipu_place(expected_kernel_key.place_)) { @@ -1877,9 +1893,24 @@ Scope* OperatorWithKernel::PreparePtenData( "the size of kernel input_defs (%d).", input_names.size(), input_defs.size())); Scope* new_scope = nullptr; + auto& name_map = Inputs(); + const std::unordered_set* no_buffer_ins = nullptr; + if (info_) { + auto& no_buffer_inferer = info_->NoNeedBufferVarsInferer(); + // Some op may not register NoNeedBufferVarsInferer + if (no_buffer_inferer) { + no_buffer_ins = &(no_buffer_inferer(Inputs(), Outputs(), Attrs())); + if (no_buffer_ins->empty()) no_buffer_ins = nullptr; + } + } + for (size_t i = 0; i < input_defs.size(); ++i) { auto& in_def = input_defs.at(i); auto& ins_vector = ctx->inputs.at(input_names[i]); + auto& name_vec = name_map.at(input_names[i]); + bool should_skip_input = + no_buffer_ins && no_buffer_ins->count(input_names[i]) > 0; + for (size_t offset = 0; offset < ins_vector.size(); ++offset) { // Only tensor can be tranfer to another device. auto* var = ins_vector[offset]; @@ -1888,6 +1919,15 @@ Scope* OperatorWithKernel::PreparePtenData( } auto* tensor_in = GetLoDTensorOrSelectedRowsValueFromVar(*var); + + // When no_buffer_ins then checking of Tensor::holder_ is + // not a thread safe. 
And for infershape scenario checks + // to be omitted are not really needed + if (should_skip_input == true) { + // TODO(YuanRisheng) : There need to supplement MKLDNN code later + continue; + } + if (!tensor_in->IsInitialized()) { continue; } @@ -1905,7 +1945,7 @@ Scope* OperatorWithKernel::PreparePtenData( } // Create new var with the same name in transfer scopes - auto* trans_var = new_scope->Var(input_names[i]); + auto* trans_var = new_scope->Var(name_vec[offset]); ins_vector[offset] = trans_var; // Do transfer @@ -2105,24 +2145,5 @@ void OperatorWithKernel::BuildPtenKernelContext( } } -void OperatorWithKernel::WriteBackToOutputs( - RuntimeContext* ctx, pten::KernelContext* pt_kernel_context) const { - auto& output_names = std::get<2>(pt_kernel_signature_->args); - - for (size_t i = 0; i < output_names.size(); ++i) { - auto& outs_vector = ctx->outputs.at(output_names[i]); - - auto& range_pair = pt_kernel_context->OutputRangeAt(i); - auto pten_outs = pt_kernel_context->MutableOutputBetween( - range_pair.first, range_pair.second); - - for (size_t j = 0; j < pten_outs.size(); ++j) { - if (pten_outs[j]) { - experimental::MakeVariableFromPtenTensor(pten_outs[j], outs_vector[j]); - } - } - } -} - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 9ad13299a37..b6600796baf 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -461,11 +461,11 @@ class ExecutionArgumentMappingContext : public pten::ArgumentMappingContext { } size_t InputSize(const std::string& name) const override { - return ctx_.InputSize(name); + return ctx_.MultiInputVar(name).size(); } size_t OutputSize(const std::string& name) const override { - return ctx_.OutputSize(name); + return ctx_.MultiOutputVar(name).size(); } bool IsDenseTensorInput(const std::string& name) const override { @@ -476,6 +476,14 @@ class ExecutionArgumentMappingContext : public pten::ArgumentMappingContext { return ctx_.InputVar(name)->IsType(); } + bool IsDenseTensorOutput(const std::string& name) const override { + return ctx_.OutputVar(name)->IsType(); + } + + bool IsSelectedRowsOutput(const std::string& name) const override { + return ctx_.OutputVar(name)->IsType(); + } + private: const ExecutionContext& ctx_; }; @@ -527,11 +535,11 @@ class OperatorWithKernel : public OperatorBase { bool SupportGPU() const override { auto pten_kernels = pten::KernelFactory::Instance().SelectKernelMap( pten::TransToPtenKernelName(type_)); - auto has_pten_kernel = std::any_of( - pten_kernels.begin(), pten_kernels.end(), - [](pten::KernelFactory::KernelKeyMap::const_reference kern_pair) { - return kern_pair.first.backend() == pten::Backend::GPU; - }); + auto has_pten_kernel = + std::any_of(pten_kernels.begin(), pten_kernels.end(), + [](pten::KernelKeyMap::const_reference kern_pair) { + return kern_pair.first.backend() == pten::Backend::GPU; + }); if (has_pten_kernel) { return true; } else { @@ -616,9 +624,6 @@ class OperatorWithKernel : public OperatorBase { platform::DeviceContext* dev_ctx, pten::KernelContext* pt_kernel_context) const; - void WriteBackToOutputs(RuntimeContext* ctx, - pten::KernelContext* pt_kernel_context) const; - pten::KernelSignature* PtenKernelSignature() const { return pt_kernel_signature_.get(); } @@ -685,6 +690,7 @@ class OperatorWithKernel : public OperatorBase { // new pten kernel, if there is a better design in the future, // we may polish the implementation here mutable bool run_pten_kernel_ = false; + mutable bool 
run_kp_kernel = false; mutable std::unique_ptr pt_kernel_signature_; mutable std::unique_ptr pt_kernel_; }; diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 8767b55062c..7eebfb904cf 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -1361,7 +1361,7 @@ void ParallelExecutor::PrepareNCCLCommunicator(Scope *global_scope) { auto *dev_ctx = static_cast( pool.Get(member_->places_[dev_id])); auto &bkcl_ctx = bkcl_ctxs->at(member_->places_[dev_id]); - dev_ctx->set_bkcl_context(bkcl_ctx.comm()); + dev_ctx->SetBkclContext(bkcl_ctx.comm()); } #else PADDLE_THROW( diff --git a/paddle/fluid/framework/pten_utils.cc b/paddle/fluid/framework/pten_utils.cc index 336f8423d6f..1a27f971fa0 100644 --- a/paddle/fluid/framework/pten_utils.cc +++ b/paddle/fluid/framework/pten_utils.cc @@ -15,8 +15,8 @@ limitations under the License. */ #include #include "paddle/fluid/framework/pten_utils.h" +#include "paddle/pten/core/compat/convert_utils.h" #include "paddle/pten/core/compat/op_utils.h" -#include "paddle/pten/core/convert_utils.h" #include "paddle/pten/core/kernel_factory.h" #include "paddle/fluid/framework/lod_tensor.h" @@ -60,7 +60,8 @@ OpKernelType TransPtenKernelKeyToOpKernelType( const pten::KernelKey& kernel_key) { proto::VarType::Type data_type = pten::TransToProtoVarType(kernel_key.dtype()); - platform::Place place = pten::TransToFluidPlace(kernel_key.backend()); + // no need to set current device id here + platform::Place place = pten::TransToFluidPlace(kernel_key.backend(), false); DataLayout data_layout = kernel_key.layout(); LibraryType library_type = LibraryType::kPlain; if (kernel_key.backend() == pten::Backend::MKLDNN) { @@ -184,9 +185,8 @@ KernelArgsNameMakerByOpProto::GetAttrsArgsNames() { } KernelSignature KernelArgsNameMakerByOpProto::GetKernelSignature() { - return KernelSignature(pten::TransToPtenKernelName(op_proto_->type()), - GetInputArgsNames(), GetAttrsArgsNames(), - GetOutputArgsNames()); + return KernelSignature(op_proto_->type(), GetInputArgsNames(), + GetAttrsArgsNames(), GetOutputArgsNames()); } std::once_flag kernel_sig_map_init_flag; diff --git a/paddle/fluid/framework/pten_utils.h b/paddle/fluid/framework/pten_utils.h index 2d335fc9c98..ae0388079d2 100644 --- a/paddle/fluid/framework/pten_utils.h +++ b/paddle/fluid/framework/pten_utils.h @@ -77,6 +77,13 @@ struct ConvertToPtenContext { using TYPE = pten::CPUContext; }; +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +template <> +struct ConvertToPtenContext { + using TYPE = pten::GPUContext; +}; +#endif + #ifdef PADDLE_WITH_XPU template <> struct ConvertToPtenContext { diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc index bb8d7df7457..2f0c7c5b2c8 100644 --- a/paddle/fluid/framework/tensor.cc +++ b/paddle/fluid/framework/tensor.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/framework/tensor.h" -#include "paddle/pten/api/lib/utils/storage.h" DECLARE_bool(use_stream_safe_cuda_allocator); diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index dff48790960..b926a3cc765 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -1085,7 +1085,7 @@ void TensorFromStream(std::istream& is, Tensor* tensor, is.seekg(seekg, is.cur); void* buf; - auto ctx = platform::CPUDeviceContext(); + platform::CPUDeviceContext ctx; size_t size = tensor->numel() * framework::SizeOfType(desc.data_type()); if (platform::is_gpu_place(dev_ctx.GetPlace()) || platform::is_xpu_place(dev_ctx.GetPlace()) || @@ -1155,7 +1155,7 @@ void TensorFromStream(std::istream& is, Tensor* tensor, std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims)); tensor->Resize(framework::make_ddim(dims)); void* buf; - auto ctx = platform::CPUDeviceContext(); + platform::CPUDeviceContext ctx; size_t size = tensor->numel() * framework::SizeOfType(desc.data_type()); if (platform::is_gpu_place(dev_ctx.GetPlace()) || platform::is_xpu_place(dev_ctx.GetPlace()) || @@ -1432,4 +1432,4 @@ std::ostream& operator<<(std::ostream& os, const pten::DenseTensor& t) { VLOG(1) << "PrintVar: unrecognized data type:" << t.type(); return os; } -} +} // namespace pten diff --git a/paddle/fluid/framework/tensor_util_test.cc b/paddle/fluid/framework/tensor_util_test.cc index 8587ee8d1e9..fbbf29254d0 100644 --- a/paddle/fluid/framework/tensor_util_test.cc +++ b/paddle/fluid/framework/tensor_util_test.cc @@ -73,6 +73,10 @@ TEST(TensorCopy, Tensor) { // CPU Tensor to GPU Tensor auto gpu_place = new platform::CUDAPlace(0); platform::CUDADeviceContext gpu_ctx(*gpu_place); + gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(*gpu_place, gpu_ctx.stream()) + .get()); + gpu_ctx.PartialInitWithAllocator(); TensorCopy(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor); // GPU Tensor to CPU Tensor @@ -166,6 +170,10 @@ TEST(TensorFromVector, Tensor) { gpu_tensor.Resize(paddle::framework::make_ddim({3, 3})); auto gpu_place = new paddle::platform::CUDAPlace(); paddle::platform::CUDADeviceContext gpu_ctx(*gpu_place); + gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(*gpu_place, gpu_ctx.stream()) + .get()); + gpu_ctx.PartialInitWithAllocator(); paddle::framework::TensorFromVector(src_vec, gpu_ctx, &gpu_tensor); // Copy from GPU to CPU tensor for comparison paddle::framework::TensorCopy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor); @@ -230,6 +238,10 @@ TEST(TensorToVector, Tensor) { paddle::framework::Tensor gpu_tensor; paddle::platform::CUDAPlace place; paddle::platform::CUDADeviceContext gpu_ctx(place); + gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place, gpu_ctx.stream()) + .get()); + gpu_ctx.PartialInitWithAllocator(); paddle::framework::TensorFromVector(src_vec, gpu_ctx, &gpu_tensor); std::vector dst; @@ -267,6 +279,10 @@ TEST(TensorToVector, Tensor_bool) { paddle::framework::Tensor gpu_tensor; paddle::platform::CUDAPlace place; paddle::platform::CUDADeviceContext gpu_ctx(place); + gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place, gpu_ctx.stream()) + .get()); + gpu_ctx.PartialInitWithAllocator(); paddle::framework::TensorFromVector(src_vec, gpu_ctx, &gpu_tensor); std::vector dst; @@ -493,6 +509,10 @@ TEST(Tensor, FromAndToStream) { auto gpu_place = 
new platform::CUDAPlace(); platform::CUDADeviceContext gpu_ctx(*gpu_place); + gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(*gpu_place, gpu_ctx.stream()) + .get()); + gpu_ctx.PartialInitWithAllocator(); TensorCopy(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor); diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h index fe376a5669c..5a13df783ae 100644 --- a/paddle/fluid/framework/type_defs.h +++ b/paddle/fluid/framework/type_defs.h @@ -22,7 +22,7 @@ limitations under the License. */ #include #include #include "paddle/fluid/platform/variant.h" -#include "paddle/pten/core/type_defs.h" +#include "paddle/pten/core/compat/type_defs.h" #include "paddle/utils/small_vector.h" namespace paddle { diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index d0f8d39f927..89d9324039c 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -1,13 +1,14 @@ cc_library(imperative_flag SRCS flags.cc DEPS gflags flags) +cc_library(var_helper SRCS var_helper.cc DEPS tensor pten_api) IF(WITH_XPU) -cc_library(prepared_operator SRCS prepared_operator.cc DEPS xpu_op_list proto_desc operator device_context lod_tensor selected_rows_utils var_type_traits op_kernel_type data_transform nan_inf_utils pten pten_utils) +cc_library(prepared_operator SRCS prepared_operator.cc DEPS xpu_op_list proto_desc operator device_context lod_tensor selected_rows_utils var_type_traits op_kernel_type data_transform nan_inf_utils pten_api pten pten_utils var_helper) ELSE() -cc_library(prepared_operator SRCS prepared_operator.cc DEPS proto_desc operator device_context lod_tensor selected_rows_utils var_type_traits op_kernel_type data_transform nan_inf_utils pten pten_utils) +cc_library(prepared_operator SRCS prepared_operator.cc DEPS proto_desc operator device_context lod_tensor selected_rows_utils var_type_traits op_kernel_type data_transform nan_inf_utils pten_api pten pten_utils var_helper) ENDIF() -cc_library(layer SRCS layer.cc DEPS prepared_operator math_function imperative_flag variable_helper op_registry) +cc_library(layer SRCS layer.cc DEPS prepared_operator math_function imperative_flag variable_helper op_registry var_helper pten_api) add_subdirectory(jit) -cc_library(amp SRCS amp_auto_cast.cc DEPS layer ) -cc_library(tracer SRCS tracer.cc DEPS layer engine program_desc_tracer amp denormal garbage_collector) +cc_library(amp SRCS amp_auto_cast.cc DEPS layer var_helper) +cc_library(tracer SRCS tracer.cc DEPS layer engine program_desc_tracer amp denormal garbage_collector var_helper) cc_library(basic_engine SRCS basic_engine.cc DEPS layer gradient_accumulator) cc_library(engine SRCS basic_engine.cc partial_grad_engine.cc DEPS layer gradient_accumulator) cc_library(imperative_profiler SRCS profiler.cc DEPS flags) diff --git a/paddle/fluid/imperative/amp_auto_cast.cc b/paddle/fluid/imperative/amp_auto_cast.cc index a02d528a4a8..2d97a1e3b6f 100644 --- a/paddle/fluid/imperative/amp_auto_cast.cc +++ b/paddle/fluid/imperative/amp_auto_cast.cc @@ -13,11 +13,12 @@ // limitations under the License. 
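Before moving on to the imperative/amp changes below, a side note on the tensor_util_test.cc hunks earlier in this section: each GPU test now wires its manually constructed CUDADeviceContext to the allocator facade before performing any copy. Extracted here as a standalone sketch; the calls mirror those hunks, but the wrapper function itself is illustrative and not a utility shipped by the patch.

#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/platform/device_context.h"

void InitGpuContextForTest(paddle::platform::CUDADeviceContext* gpu_ctx,
                           const paddle::platform::CUDAPlace& place) {
  // Hand the context the stream-aware allocator managed by AllocatorFacade
  // for this place, matching what the updated tests do.
  gpu_ctx->SetAllocator(
      paddle::memory::allocation::AllocatorFacade::Instance()
          .GetAllocator(place, gpu_ctx->stream())
          .get());
  // Finish the deferred part of context initialization before TensorCopy /
  // TensorFromVector are called with this context.
  gpu_ctx->PartialInitWithAllocator();
}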
#include "paddle/fluid/imperative/amp_auto_cast.h" - #include #include - +#include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/imperative/tracer.h" +#include "paddle/fluid/imperative/type_defs.h" +#include "paddle/fluid/imperative/var_helper.h" namespace paddle { namespace imperative { @@ -96,18 +97,20 @@ std::ostream& operator<<(std::ostream& os, AmpOperators& ops) { return os; } -inline std::string GetDtypeStr( - const std::shared_ptr& var) { - return framework::DataTypeToString(var->DataType()); +template +inline std::string GetDtypeStr(const std::shared_ptr& var) { + return framework::DataTypeToString(GetDataType(var)); } - -inline bool NeedCast(const std::shared_ptr& var) { - if (platform::is_gpu_place(var->Place()) || - platform::is_cuda_pinned_place(var->Place()) || - platform::is_xpu_place(var->Place())) { +template +inline bool NeedCast(const std::shared_ptr& var) { + auto place = GetPlace(var); + auto data_type = GetDataType(var); + if (paddle::platform::is_gpu_place(place) || + paddle::platform::is_cuda_pinned_place(place) || + paddle::platform::is_xpu_place(place)) { // CudaPinndePlace is added for varbase created by dataloader - if (var->DataType() == framework::proto::VarType::FP32 || - var->DataType() == framework::proto::VarType::FP16) { + if (data_type == paddle::framework::proto::VarType::FP32 || + data_type == paddle::framework::proto::VarType::FP16) { return true; } } @@ -116,16 +119,17 @@ inline bool NeedCast(const std::shared_ptr& var) { // NOTE: Trace a cast op, so if a var is casted from fp32 to fp16, then the grad // var will be cast back from fp16 to fp32 during backward phase. -static inline std::shared_ptr CastToType( - const std::shared_ptr& var, +template +static inline std::shared_ptr CastToType( + const std::shared_ptr& var, const framework::proto::VarType::Type dst_type) { const auto& tracer = imperative::GetCurrentTracer(); - imperative::NameVarBaseMap ins = {{"X", {var}}}; - framework::AttributeMap attrs = {{"in_dtype", var->DataType()}, + imperative::NameVarMap ins = {{"X", {var}}}; + framework::AttributeMap attrs = {{"in_dtype", GetDataType(var)}, {"out_dtype", dst_type}}; - auto out = std::shared_ptr( - new imperative::VarBase(tracer->GenerateUniqueName())); - imperative::NameVarBaseMap outs = {{"Out", {out}}}; + auto out = + std::shared_ptr(new VarType(tracer->GenerateUniqueName())); + imperative::NameVarMap outs = {{"Out", {out}}}; { AutoCastGuard guard(tracer, AmpLevel::O0); @@ -134,32 +138,34 @@ static inline std::shared_ptr CastToType( return out; } - -static inline std::shared_ptr CastToFP16( - const std::shared_ptr& var) { +template +static inline std::shared_ptr CastToFP16( + const std::shared_ptr& var) { auto dst_type = framework::proto::VarType::FP16; - if (NeedCast(var) && (var->DataType() != dst_type)) { + if (NeedCast(var) && (GetDataType(var) != dst_type)) { return CastToType(var, dst_type); } return var; } -static inline std::shared_ptr CastToFP32( - const std::shared_ptr& var) { +template +static inline std::shared_ptr CastToFP32( + const std::shared_ptr& var) { auto dst_type = framework::proto::VarType::FP32; - if (NeedCast(var) && (var->DataType() != dst_type)) { + if (NeedCast(var) && (GetDataType(var) != dst_type)) { return CastToType(var, dst_type); } return var; } +template static inline framework::proto::VarType::Type GetPromoteType( - const std::string& op_type, const NameVarBaseMap& ins) { + const std::string& op_type, const NameVarMap& ins) { auto dst_type = framework::proto::VarType::FP16; for (const auto& pair 
: ins) { for (const auto& var : pair.second) { - if (var->DataType() == framework::proto::VarType::FP32) { - dst_type = var->DataType(); + if (GetDataType(var) == framework::proto::VarType::FP32) { + dst_type = GetDataType(var); break; } } @@ -170,7 +176,8 @@ static inline framework::proto::VarType::Type GetPromoteType( if (op_type == "moving_average_abs_max_scale") { for (const auto& pair : ins) { if (pair.first == "X" && - pair.second.front()->DataType() == framework::proto::VarType::FP16) { + GetDataType(pair.second.front()) == + framework::proto::VarType::FP16) { dst_type = framework::proto::VarType::FP16; } } @@ -179,9 +186,10 @@ static inline framework::proto::VarType::Type GetPromoteType( return dst_type; } -NameVarBaseMap AutoCastInputs(const std::string& op_type, - const NameVarBaseMap& ins) { - NameVarBaseMap new_ins(ins); +template +NameVarMap AutoCastInputs(const std::string& op_type, + const NameVarMap& ins) { + NameVarMap new_ins(ins); if (AmpOperators::Instance().GetMutableAllowOps()->count(op_type)) { for (auto& pair : new_ins) { // NOTE(zhiqiu): batch_norm and layer_norm support only input x is fp16. @@ -202,7 +210,7 @@ NameVarBaseMap AutoCastInputs(const std::string& op_type, VLOG(5) << "Op(" << op_type << "): Cast " << pair.first << " from " << GetDtypeStr(*pair.second.cbegin()) << " to float16"; for (auto& var : pair.second) { - var = CastToFP16(var); + var = CastToFP16(var); } } return new_ins; @@ -211,12 +219,12 @@ NameVarBaseMap AutoCastInputs(const std::string& op_type, VLOG(5) << "Op(" << op_type << "): Cast " << pair.first << " from " << GetDtypeStr(*pair.second.cbegin()) << " to float"; for (auto& var : pair.second) { - var = CastToFP32(var); + var = CastToFP32(var); } } return new_ins; } else { - auto dst_type = GetPromoteType(op_type, ins); + auto dst_type = GetPromoteType(op_type, ins); // NOTE(zhiqiu): if the op has op fp16 kernel, fall back to fp32. if (dst_type == framework::proto::VarType::FP16 && @@ -243,18 +251,23 @@ NameVarBaseMap AutoCastInputs(const std::string& op_type, << GetDtypeStr(*pair.second.cbegin()) << " to " << framework::DataTypeToString(dst_type); for (auto& var : pair.second) { - var = (dst_type == framework::proto::VarType::FP32 ? CastToFP32(var) - : CastToFP16(var)); + var = (dst_type == framework::proto::VarType::FP32 + ? CastToFP32(var) + : CastToFP16(var)); } } return new_ins; } return new_ins; } - -NameVarBaseMap CastPureFp16Inputs(const std::string& op_type, - const NameVarBaseMap& ins) { - NameVarBaseMap new_ins(ins); +template NameVarMap AutoCastInputs( + const std::string& op_type, const NameVarMap& ins); +template NameVarMap AutoCastInputs( + const std::string& op_type, const NameVarMap& ins); +template +NameVarMap CastPureFp16Inputs(const std::string& op_type, + const NameVarMap& ins) { + NameVarMap new_ins(ins); auto dst_type = framework::proto::VarType::FP16; if (AmpOperators::Instance().GetMutableUnsupportedFp16Ops()->count(op_type) || AmpOperators::Instance().GetMutableBlockOps()->count(op_type)) { @@ -284,12 +297,16 @@ NameVarBaseMap CastPureFp16Inputs(const std::string& op_type, << GetDtypeStr(*pair.second.cbegin()) << " to " << framework::DataTypeToString(dst_type); for (auto& var : pair.second) { - var = (dst_type == framework::proto::VarType::FP32 ? CastToFP32(var) - : CastToFP16(var)); + var = (dst_type == framework::proto::VarType::FP32 + ? 
CastToFP32(var) + : CastToFP16(var)); } } return new_ins; } - +template NameVarMap CastPureFp16Inputs( + const std::string& op_type, const NameVarMap& ins); +template NameVarMap CastPureFp16Inputs( + const std::string& op_type, const NameVarMap& ins); } // namespace imperative } // namespace paddle diff --git a/paddle/fluid/imperative/amp_auto_cast.h b/paddle/fluid/imperative/amp_auto_cast.h index 903e2652888..0a45798a52d 100644 --- a/paddle/fluid/imperative/amp_auto_cast.h +++ b/paddle/fluid/imperative/amp_auto_cast.h @@ -83,11 +83,12 @@ class AutoCastGuard { AmpLevel pre_amp_level_; }; -NameVarBaseMap AutoCastInputs(const std::string& op_type, - const NameVarBaseMap& ins); - -NameVarBaseMap CastPureFp16Inputs(const std::string& op_type, - const NameVarBaseMap& ins); +template +NameVarMap AutoCastInputs(const std::string& op_type, + const NameVarMap& ins); +template +NameVarMap CastPureFp16Inputs(const std::string& op_type, + const NameVarMap& ins); } // namespace imperative } // namespace paddle diff --git a/paddle/fluid/imperative/execution_context.h b/paddle/fluid/imperative/execution_context.h index 5446add8678..fe5ac73b004 100644 --- a/paddle/fluid/imperative/execution_context.h +++ b/paddle/fluid/imperative/execution_context.h @@ -20,6 +20,7 @@ #include "paddle/fluid/framework/type_defs.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/imperative/type_defs.h" +#include "paddle/fluid/imperative/var_helper.h" namespace paddle { namespace imperative { @@ -33,34 +34,35 @@ class DygraphExecutionContext : public framework::ExecutionContext { const framework::Scope& scope, const platform::DeviceContext& device_context, const framework::RuntimeContext& ctx, - const NameVarMap& var_base_map_in, - const NameVarMap& var_base_map_out, + const NameVarMap& var_map_in, + const NameVarMap& var_map_out, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs) : ExecutionContext(op, scope, device_context, ctx), - var_base_map_in_(var_base_map_in), - var_base_map_out_(var_base_map_out), + var_map_in_(var_map_in), + var_map_out_(var_map_out), attrs_(attrs), default_attrs_(default_attrs) {} std::string InputName(const std::string& name) const override { - auto it = var_base_map_in_.find(name); - PADDLE_ENFORCE_NE(it, var_base_map_in_.end(), + auto it = var_map_in_.find(name); + PADDLE_ENFORCE_NE(it, var_map_in_.end(), platform::errors::PreconditionNotMet( "Can not find [%s] in Input", name)); - return it->second[0] ? it->second[0]->Name() : framework::kEmptyVarName; + return it->second[0] ? 
GetNameFromVar(it->second[0]) + : framework::kEmptyVarName; } std::vector InputNames(const std::string& name) const override { - auto it = var_base_map_in_.find(name); + auto it = var_map_in_.find(name); PADDLE_ENFORCE_NE( - it, var_base_map_in_.end(), + it, var_map_in_.end(), platform::errors::NotFound("Can not find [%s] in Input", name)); std::vector vec_res; vec_res.reserve(it->second.size()); for (size_t i = 0; i < it->second.size(); ++i) { if (it->second[i]) { - vec_res.push_back(it->second[i]->Name()); + vec_res.push_back(GetNameFromVar(it->second[i])); } else { vec_res.push_back(framework::kEmptyVarName); } @@ -69,23 +71,24 @@ class DygraphExecutionContext : public framework::ExecutionContext { } std::string OutputName(const std::string& name) const override { - auto it = var_base_map_out_.find(name); + auto it = var_map_out_.find(name); PADDLE_ENFORCE_NE( - it, var_base_map_out_.end(), + it, var_map_out_.end(), platform::errors::NotFound("Can not find [%s] in Output", name)); - return it->second[0] ? it->second[0]->Name() : framework::kEmptyVarName; + return it->second[0] ? GetNameFromVar(it->second[0]) + : framework::kEmptyVarName; } std::vector OutputNames(const std::string& name) const override { - auto it = var_base_map_out_.find(name); + auto it = var_map_out_.find(name); PADDLE_ENFORCE_NE( - it, var_base_map_out_.end(), + it, var_map_out_.end(), platform::errors::NotFound("Can not find [%s] in Output", name)); std::vector vec_res; vec_res.reserve(it->second.size()); for (size_t i = 0; i < it->second.size(); ++i) { if (it->second[i]) { - vec_res.push_back(it->second[i]->Name()); + vec_res.push_back(GetNameFromVar(it->second[i])); } else { vec_res.push_back(framework::kEmptyVarName); } @@ -116,9 +119,9 @@ class DygraphExecutionContext : public framework::ExecutionContext { std::vector InNameList() const override { std::vector vec_temp; - vec_temp.reserve(var_base_map_in_.size()); + vec_temp.reserve(var_map_in_.size()); - for (auto& v : var_base_map_in_) { + for (auto& v : var_map_in_) { vec_temp.push_back(v.first); } @@ -126,13 +129,13 @@ class DygraphExecutionContext : public framework::ExecutionContext { } bool HasInput(const std::string& name) const override { - auto it = var_base_map_in_.find(name); - return (it != var_base_map_in_.end() && it->second.size() > 0); + auto it = var_map_in_.find(name); + return (it != var_map_in_.end() && it->second.size() > 0); } bool HasOutput(const std::string& name) const override { - auto it = var_base_map_out_.find(name); - return (it != var_base_map_out_.end() && it->second.size() > 0); + auto it = var_map_out_.find(name); + return (it != var_map_out_.end() && it->second.size() > 0); } size_t InputSize(const std::string& name) const override { @@ -144,8 +147,8 @@ class DygraphExecutionContext : public framework::ExecutionContext { } const Variable* InputVar(const std::string& name) const override { - auto it = var_base_map_in_.find(name); - if (it == var_base_map_in_.end()) { + auto it = var_map_in_.find(name); + if (it == var_map_in_.end()) { return nullptr; } @@ -155,8 +158,8 @@ class DygraphExecutionContext : public framework::ExecutionContext { } Variable* OutputVar(const std::string& name) const override { - auto it = var_base_map_out_.find(name); - if (it == var_base_map_out_.end()) { + auto it = var_map_out_.find(name); + if (it == var_map_out_.end()) { return nullptr; } @@ -167,8 +170,8 @@ class DygraphExecutionContext : public framework::ExecutionContext { const std::vector MultiInputVar( const std::string& name) const override 
{ - auto it = var_base_map_in_.find(name); - if (it == var_base_map_in_.end()) { + auto it = var_map_in_.find(name); + if (it == var_map_in_.end()) { return {}; } std::vector vec_res; @@ -182,8 +185,8 @@ class DygraphExecutionContext : public framework::ExecutionContext { std::vector MultiOutputVar( const std::string& name) const override { - auto it = var_base_map_out_.find(name); - if (it == var_base_map_out_.end()) { + auto it = var_map_out_.find(name); + if (it == var_map_out_.end()) { return {}; } std::vector vec_res; @@ -196,8 +199,8 @@ class DygraphExecutionContext : public framework::ExecutionContext { } private: - const NameVarMap& var_base_map_in_; - const NameVarMap& var_base_map_out_; + const NameVarMap& var_map_in_; + const NameVarMap& var_map_out_; const framework::AttributeMap& attrs_; const framework::AttributeMap& default_attrs_; }; diff --git a/paddle/fluid/imperative/gloo_context.cc b/paddle/fluid/imperative/gloo_context.cc index 44315e267ee..eeac500cac4 100644 --- a/paddle/fluid/imperative/gloo_context.cc +++ b/paddle/fluid/imperative/gloo_context.cc @@ -46,6 +46,17 @@ void GLOOParallelContext::Init() { gloo_wrapper->Init(); device_ = std::unique_ptr( new platform::CPUDeviceContext(platform::CPUPlace())); + device_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(platform::CPUPlace()) + .get()); + device_->SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + device_->SetZeroAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetZeroAllocator(platform::CPUPlace()) + .get()); } void GLOOParallelContext::InitWithRingID(int ring_id) { diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index ffd9aae8ff0..257953252bc 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -214,9 +214,37 @@ void TensorAddImpl(const framework::Tensor& src, framework::Tensor* dst, func(dev_ctx, src, dst); } -void TensorAdd(const framework::Variable& src, framework::Variable* dst) { +std::shared_ptr GetInnerDstTensor(egr::EagerTensor* dst) { + std::shared_ptr dst_tensor = + std::dynamic_pointer_cast(dst->impl()); + return dst_tensor; +} + +std::shared_ptr GetInnerSrcTensor( + const egr::EagerTensor& src) { + std::shared_ptr dst_tensor = + std::dynamic_pointer_cast(src.impl()); + return dst_tensor; +} + +std::shared_ptr GetInnerDstTensor(framework::Variable* dst) { auto* dst_tensor = dst->GetMutable(); + return std::make_shared(*dst_tensor); +} + +std::shared_ptr GetInnerSrcTensor( + const framework::Variable& src) { auto& src_tensor = src.Get(); + return std::make_shared(src_tensor); +} + +template +void TensorAdd(const VarType& src, VarType* dst) { + std::shared_ptr d_tensor = GetInnerDstTensor(dst); + std::shared_ptr s_tensor = GetInnerSrcTensor(src); + + auto* dst_tensor = d_tensor.get(); + auto& src_tensor = *s_tensor.get(); auto numel = src_tensor.numel(); @@ -336,6 +364,11 @@ void TensorAdd(const framework::Variable& src, framework::Variable* dst) { framework::DataTypeToString(data_type), place)); } +template void TensorAdd(const framework::Variable& src, + framework::Variable* dst); +template void TensorAdd(const egr::EagerTensor& src, + egr::EagerTensor* dst); + void SelectedRowsAddToTensor(const framework::Variable& src, framework::Variable* dst) { auto* dst_tensor = dst->GetMutable(); @@ -462,13 +495,41 @@ std::shared_ptr 
SelectedRowsMerge( framework::DataTypeToString(data_type))); } +void VariableAdd(const egr::EagerTensor& src_tensor, + egr::EagerTensor* dst_tensor) { + auto& src = src_tensor.Var(); + auto* dst = dst_tensor->MutableVar(); + + if (dst->IsType()) { + if (src.IsType()) { + paddle::imperative::TensorAdd(src, dst); + } else if (src.IsType()) { + paddle::imperative::SelectedRowsAddToTensor(src, dst); + } else { + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "Unexpected branch, output variable type is %s", + paddle::framework::ToTypeName(dst->Type()))); + } + } else { + if (src.IsType()) { + paddle::framework::Variable new_dst; + paddle::imperative::SelectedRowsAddTensor(*dst, src, &new_dst); + *dst = std::move(new_dst); + } else { + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "Unexpected branch, output variable type is %s", + paddle::framework::ToTypeName(dst->Type()))); + } + } +} + void VariableWrapperAdd(std::shared_ptr var, VariableWrapper* dst_var, bool unchange_input) { auto& src = var->Var(); auto* dst = dst_var->MutableVar(); if (dst->IsType()) { if (src.IsType()) { - TensorAdd(src, dst); + TensorAdd(src, dst); } else if (src.IsType()) { SelectedRowsAddToTensor(src, dst); } else { @@ -535,7 +596,7 @@ void GradientAccumulator::AccumulateGrad() { "previous gradient."; if (dst->IsType()) { if (src->IsType()) { - TensorAdd(*src, dst); + TensorAdd(*src, dst); } else if (src->IsType()) { SelectedRowsAddToTensor(*src, dst); } diff --git a/paddle/fluid/imperative/gradient_accumulator.h b/paddle/fluid/imperative/gradient_accumulator.h index f9f8081d30f..a57335d08a2 100644 --- a/paddle/fluid/imperative/gradient_accumulator.h +++ b/paddle/fluid/imperative/gradient_accumulator.h @@ -18,6 +18,7 @@ #include #include +#include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/imperative/hooks.h" #include "paddle/fluid/imperative/layer.h" @@ -170,7 +171,10 @@ void SelectedRowsAddTensor(const framework::Variable& src_selected_rows_var, const framework::Variable& src_tensor_var, framework::Variable* dst_tensor_var); -void TensorAdd(const framework::Variable& src, framework::Variable* dst); +template +void TensorAdd(const VarType& src, VarType* dst); + +void VariableAdd(const egr::EagerTensor& src, egr::EagerTensor* dst); } // namespace imperative } // namespace paddle diff --git a/paddle/fluid/imperative/infer_shape_context.h b/paddle/fluid/imperative/infer_shape_context.h index a39e58bba90..eb7d419c298 100644 --- a/paddle/fluid/imperative/infer_shape_context.h +++ b/paddle/fluid/imperative/infer_shape_context.h @@ -22,6 +22,7 @@ #include "paddle/fluid/framework/shape_inference.h" #include "paddle/fluid/framework/type_defs.h" #include "paddle/fluid/imperative/type_defs.h" +#include "paddle/fluid/imperative/var_helper.h" #include "paddle/fluid/imperative/variable_wrapper.h" namespace paddle { @@ -37,8 +38,8 @@ class DygraphInferShapeContext : public framework::InferShapeContext { const framework::AttributeMap* attr, const framework::AttributeMap* default_attr, const std::string op_type, const framework::OpKernelType* op_kernel_type = nullptr) - : var_base_map_in_(in), - var_base_map_out_(out), + : var_map_in_(in), + var_map_out_(out), attrs_(attr), default_attrs_(default_attr), op_type_(op_type), @@ -46,9 +47,9 @@ class DygraphInferShapeContext : public framework::InferShapeContext { bool HasInput(const std::string& name) const override { // has only one input - auto it = var_base_map_in_->find(name); + auto it = var_map_in_->find(name); - if (it == 
var_base_map_in_->end()) { + if (it == var_map_in_->end()) { return false; } const auto& in = it->second; @@ -62,8 +63,8 @@ class DygraphInferShapeContext : public framework::InferShapeContext { bool HasOutput(const std::string& name) const override { // has only one output - auto it = var_base_map_out_->find(name); - if (it == var_base_map_out_->end()) { + auto it = var_map_out_->find(name); + if (it == var_map_out_->end()) { return false; } const auto& out = it->second; @@ -78,8 +79,8 @@ class DygraphInferShapeContext : public framework::InferShapeContext { } bool HasInputs(const std::string& name) const override { - auto it = var_base_map_in_->find(name); - if (it == var_base_map_in_->end() || it->second.empty()) { + auto it = var_map_in_->find(name); + if (it == var_map_in_->end() || it->second.empty()) { return false; } for (auto& input : it->second) { @@ -91,8 +92,8 @@ class DygraphInferShapeContext : public framework::InferShapeContext { } bool HasOutputs(const std::string& name) const override { - auto it = var_base_map_out_->find(name); - if (it == var_base_map_out_->end() || it->second.empty()) { + auto it = var_map_out_->find(name); + if (it == var_map_out_->end() || it->second.empty()) { return false; } for (auto& output : it->second) { @@ -109,15 +110,15 @@ class DygraphInferShapeContext : public framework::InferShapeContext { std::vector Inputs(const std::string& name) const override { std::vector vec_res; - auto it = var_base_map_in_->find(name); + auto it = var_map_in_->find(name); PADDLE_ENFORCE_NE( - it, var_base_map_in_->end(), + it, var_map_in_->end(), platform::errors::NotFound("can not find [%s] in input", name)); vec_res.reserve(it->second.size()); for (auto& var : it->second) { if (var) { - vec_res.push_back(var->Name()); + vec_res.push_back(GetNameFromVar(var)); } else { vec_res.push_back(framework::kEmptyVarName); } @@ -128,15 +129,15 @@ class DygraphInferShapeContext : public framework::InferShapeContext { std::vector Outputs(const std::string& name) const override { std::vector vec_res; - auto it = var_base_map_out_->find(name); + auto it = var_map_out_->find(name); PADDLE_ENFORCE_NE( - it, var_base_map_out_->end(), + it, var_map_out_->end(), platform::errors::NotFound("can not find [%s] in output", name)); vec_res.reserve(it->second.size()); for (auto& var : it->second) { if (var) { - vec_res.push_back(var->Name()); + vec_res.push_back(GetNameFromVar(var)); } else { vec_res.push_back(framework::kEmptyVarName); } @@ -169,16 +170,16 @@ class DygraphInferShapeContext : public framework::InferShapeContext { void ShareDim(const std::string& in, const std::string& out, size_t i = 0, size_t j = 0) override { - auto in_it = var_base_map_in_->find(in); - auto out_it = var_base_map_out_->find(out); + auto in_it = var_map_in_->find(in); + auto out_it = var_map_out_->find(out); PADDLE_ENFORCE_NE( - in_it, var_base_map_in_->end(), + in_it, var_map_in_->end(), platform::errors::NotFound("can not found [%s] in input", in)); PADDLE_ENFORCE_GT(in_it->second.size(), i, platform::errors::PreconditionNotMet( "Inputs %s should have %llu argument", in, i)); PADDLE_ENFORCE_NE( - out_it, var_base_map_out_->end(), + out_it, var_map_out_->end(), platform::errors::NotFound("can not found [%s] in input", in)); PADDLE_ENFORCE_GT(out_it->second.size(), j, platform::errors::PreconditionNotMet( @@ -223,9 +224,9 @@ class DygraphInferShapeContext : public framework::InferShapeContext { std::vector GetInputVarPtrs( const std::string& name) const override { std::vector res; - auto it = 
var_base_map_in_->find(name); + auto it = var_map_in_->find(name); PADDLE_ENFORCE_NE( - it, var_base_map_in_->end(), + it, var_map_in_->end(), platform::errors::NotFound("Can not find [%s] in inputs.", name)); for (auto& var : it->second) { res.emplace_back(var->MutableVar()); @@ -236,9 +237,9 @@ class DygraphInferShapeContext : public framework::InferShapeContext { std::vector GetOutputVarPtrs( const std::string& name) const override { std::vector res; - auto it = var_base_map_out_->find(name); + auto it = var_map_out_->find(name); PADDLE_ENFORCE_NE( - it, var_base_map_out_->end(), + it, var_map_out_->end(), platform::errors::NotFound("Can not find [%s] in outputs.", name)); for (auto& var : it->second) { res.emplace_back(var->MutableVar()); @@ -247,9 +248,9 @@ class DygraphInferShapeContext : public framework::InferShapeContext { } DDim GetInputDim(const std::string& name) const override { - auto it = var_base_map_in_->find(name); + auto it = var_map_in_->find(name); PADDLE_ENFORCE_NE( - it, var_base_map_in_->end(), + it, var_map_in_->end(), platform::errors::NotFound("can not find [%s] in input", name)); PADDLE_ENFORCE_EQ( it->second.size(), 1UL, @@ -262,9 +263,9 @@ class DygraphInferShapeContext : public framework::InferShapeContext { std::vector GetInputsDim(const std::string& name) const override { // const std::vector& vars = InputVars(name); std::vector vec_res; - auto it = var_base_map_in_->find(name); + auto it = var_map_in_->find(name); PADDLE_ENFORCE_NE( - it, var_base_map_in_->end(), + it, var_map_in_->end(), platform::errors::NotFound("can not find [%s] in output", name)); vec_res.reserve(it->second.size()); for (size_t i = 0; i < it->second.size(); ++i) { @@ -281,9 +282,9 @@ class DygraphInferShapeContext : public framework::InferShapeContext { std::vector GetInputsVarType( const std::string& name) const override { std::vector vec_res; - auto it = var_base_map_in_->find(name); + auto it = var_map_in_->find(name); PADDLE_ENFORCE_NE( - it, var_base_map_in_->end(), + it, var_map_in_->end(), platform::errors::NotFound("can not find [%s] in input", name)); vec_res.reserve(it->second.size()); for (size_t i = 0; i < it->second.size(); ++i) { @@ -300,9 +301,9 @@ class DygraphInferShapeContext : public framework::InferShapeContext { std::vector GetOutputsVarType( const std::string& name) const override { std::vector vec_res; - auto it = var_base_map_out_->find(name); + auto it = var_map_out_->find(name); PADDLE_ENFORCE_NE( - it, var_base_map_out_->end(), + it, var_map_out_->end(), platform::errors::NotFound("can not find [%s] in output", name)); vec_res.reserve(it->second.size()); for (size_t i = 0; i < it->second.size(); ++i) { @@ -317,9 +318,9 @@ class DygraphInferShapeContext : public framework::InferShapeContext { } void SetOutputDim(const std::string& name, const DDim& dim) override { - auto it = var_base_map_out_->find(name); + auto it = var_map_out_->find(name); PADDLE_ENFORCE_NE( - it, var_base_map_out_->end(), + it, var_map_out_->end(), platform::errors::NotFound("can not find [%s] in output", name)); if (it->second[0]) { @@ -329,9 +330,9 @@ class DygraphInferShapeContext : public framework::InferShapeContext { void SetOutputsDim(const std::string& name, const std::vector& dims) override { - auto it = var_base_map_out_->find(name); + auto it = var_map_out_->find(name); PADDLE_ENFORCE_NE( - it, var_base_map_out_->end(), + it, var_map_out_->end(), platform::errors::NotFound("can not find [%s] in output", name)); PADDLE_ENFORCE_EQ(dims.size(), it->second.size(), @@ -413,8 
+414,8 @@ class DygraphInferShapeContext : public framework::InferShapeContext { } private: - const NameVarMap* var_base_map_in_; - const NameVarMap* var_base_map_out_; + const NameVarMap* var_map_in_; + const NameVarMap* var_map_out_; const framework::AttributeMap* attrs_; const framework::AttributeMap* default_attrs_; const std::string op_type_; diff --git a/paddle/fluid/imperative/infer_var_type_context.h b/paddle/fluid/imperative/infer_var_type_context.h index 7defc339f4f..297ec840db4 100644 --- a/paddle/fluid/imperative/infer_var_type_context.h +++ b/paddle/fluid/imperative/infer_var_type_context.h @@ -21,6 +21,7 @@ #include "paddle/fluid/framework/type_defs.h" #include "paddle/fluid/framework/var_type_inference.h" #include "paddle/fluid/imperative/type_defs.h" +#include "paddle/fluid/imperative/var_helper.h" #include "paddle/fluid/imperative/variable_wrapper.h" namespace paddle { @@ -72,7 +73,7 @@ class RuntimeInferVarTypeContext : public framework::InferVarTypeContext { const std::string& InputVarName(const std::string& name, const int index = 0) const { - return inputs_.at(name)[index]->Name(); + return GetNameFromVar(inputs_.at(name)[index]); } bool InputTypeAnyOf(const std::string& name, @@ -80,7 +81,7 @@ class RuntimeInferVarTypeContext : public framework::InferVarTypeContext { auto& inputs = inputs_.at(name); return std::any_of(inputs.begin(), inputs.end(), [&type](const std::shared_ptr& var) { - return var->Type() == type; + return GetType(var) == type; }); } @@ -89,7 +90,7 @@ class RuntimeInferVarTypeContext : public framework::InferVarTypeContext { auto& inputs = inputs_.at(name); return std::all_of(inputs.begin(), inputs.end(), [&type](const std::shared_ptr& var) { - return var->Type() == type; + return GetType(var) == type; }); } @@ -99,8 +100,7 @@ class RuntimeInferVarTypeContext : public framework::InferVarTypeContext { auto in_var = inputs_.at(input_name)[index]; auto out_var = outputs_.at(output_name)[index]; if (in_var != out_var) { - this->SetVarBaseType(out_var, in_var->Type()); - this->SetVarBaseDataType(out_var, in_var->DataType()); + this->SetVarType(out_var, GetType(in_var)); } } @@ -109,54 +109,44 @@ class RuntimeInferVarTypeContext : public framework::InferVarTypeContext { int index = 0) override { if (index == framework::ALL_ELEMENTS) { for (auto& item : outputs_.at(name)) { - this->SetVarBaseType(item, type); + this->SetVarType(item, type); } } else { auto& var = outputs_.at(name)[index]; - this->SetVarBaseType(var, type); + this->SetVarType(var, type); } } - void SetVarBaseType(std::shared_ptr out, - framework::proto::VarType::Type type) { - out->SetType(type); + void SetVarType(std::shared_ptr out, + framework::proto::VarType::Type type) { + SetType(out, type); if ((out->MutableVar()->IsInitialized() == true) && (out->MutableVar()->Type() != type)) { out->MutableVar()->Clear(); } } - void SetVarBaseDataType(std::shared_ptr out, - framework::proto::VarType::Type type) { - out->SetDataType(type); - } - framework::proto::VarType::Type GetInputType( const std::string& name, const int& index = 0) const override { - return inputs_.at(name)[index]->Type(); + return GetType(inputs_.at(name)[index]); } framework::proto::VarType::Type GetOutputType( const std::string& name, const int& index = 0) const override { - return outputs_.at(name)[index]->Type(); + return GetType(outputs_.at(name)[index]); } framework::proto::VarType::Type GetInputDataType( const std::string& name, const int& index = 0) const override { - return inputs_.at(name)[index]->DataType(); + return 
GetDataType(inputs_.at(name)[index]); } void SetOutputDataType(const std::string& name, framework::proto::VarType::Type type, int index = 0) override { - if (framework::ALL_ELEMENTS == index) { - for (auto& item : outputs_.at(name)) { - this->SetVarBaseDataType(item, type); - } - } else { - auto& var = outputs_.at(name)[index]; - this->SetVarBaseDataType(var, type); - } + VLOG(10) << "Set data type in infer var type of Eager mode is meaning less " + "for var: " + << name; } bool IsDygraph() const override { return true; } diff --git a/paddle/fluid/imperative/jit/program_desc_tracer.cc b/paddle/fluid/imperative/jit/program_desc_tracer.cc index 1a44f50275e..65309b66db5 100644 --- a/paddle/fluid/imperative/jit/program_desc_tracer.cc +++ b/paddle/fluid/imperative/jit/program_desc_tracer.cc @@ -140,6 +140,13 @@ void ProgramDescTracer::InsertOp(const std::string &type, } } +void ProgramDescTracer::InsertOp(const std::string &type, + const NameTensorMap &inputs, + const NameTensorMap &outputs, + const framework::AttributeMap &attrs) { + // TODO(jiabin): Support this later. +} + TracedProgramTuple ProgramDescTracer::CreateProgramDesc( const std::vector> &feed_vars, const std::string &feed_prefix, diff --git a/paddle/fluid/imperative/jit/program_desc_tracer.h b/paddle/fluid/imperative/jit/program_desc_tracer.h index b231efb0e53..47a96c155b8 100644 --- a/paddle/fluid/imperative/jit/program_desc_tracer.h +++ b/paddle/fluid/imperative/jit/program_desc_tracer.h @@ -61,6 +61,10 @@ class ProgramDescTracer { const NameVarBaseMap &outputs, const framework::AttributeMap &attrs); + void InsertOp(const std::string &type, const NameTensorMap &inputs, + const NameTensorMap &outputs, + const framework::AttributeMap &attrs); + TracedProgramTuple CreateProgramDesc( const std::vector> &feed_vars, const std::string &feed_prefix, diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index 65720c8a3cf..5b8974b3348 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -14,11 +14,12 @@ #include "paddle/fluid/imperative/layer.h" +#include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/imperative/infer_var_type_context.h" #include "paddle/fluid/imperative/op_base.h" #include "paddle/fluid/imperative/prepared_operator.h" +#include "paddle/fluid/imperative/var_helper.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" @@ -90,7 +91,7 @@ static std::string DebugString( ss << "NULL"; continue; } - ss << vars[i]->Name() << "["; + ss << GetNameFromVar(vars[i]) << "["; const framework::Variable& var = vars[i]->Var(); if (!var.IsInitialized()) { ss << "NOT_INITED_VAR"; @@ -169,6 +170,29 @@ std::string LayerDebugString(const std::string& op_type, return LayerDebugStringImpl(op_type, ins, outs); } +std::string LayerDebugString(const std::string& op_type, + const NameVarMap& ins, + const NameVarMap& outs) { + return LayerDebugStringImpl(op_type, ins, outs); +} + +template +static void SetForwardDataTypeOfGradVars(const NameVarMap& outs) { + for (auto& var_pair : outs) { + for (auto& var : var_pair.second) { + // NOTE(zhiqu): The ouput may be NULL because of pruning. + if (var) { + SetForwardDataTypeOfGradVar(var); + } + } + } +} +template <> +void SetForwardDataTypeOfGradVars( + const NameVarMap& outs) { + // In eager mode we don't need this. 
+} + VarBase::VarBase(const std::shared_ptr& var) : var_(var), grad_node_(var->GetGradNode()) { if (auto grad_var = var_->GetGradVar()) { @@ -407,8 +431,6 @@ void VarBase::_CopyGradientFrom(const VarBase& src) { } } -pten::KernelContext OpBase::pt_kernel_context_; - void OpBase::SetType(const std::string& type) { op_ = framework::OpRegistry::CreateOp(type, {}, {}, {}, false); } @@ -440,7 +462,7 @@ static void OpBaseRunImpl(const framework::OperatorBase& op, for (auto& var_pair : outs) { for (auto& var : var_pair.second) { if (var) { - InitializeVariable(var->MutableVar(), var->Type()); + InitializeVariable(var->MutableVar(), GetType(var)); } } } @@ -478,14 +500,7 @@ static void OpBaseRunImpl(const framework::OperatorBase& op, VLOG(4) << LayerDebugString(op.Type(), ins, outs); // set the output var - for (auto& var_pair : outs) { - for (auto& var : var_pair.second) { - // NOTE(zhiqu): The ouput may be NULL because of pruning. - if (var) { - SetForwardDataTypeOfGradVar(var); - } - } - } + SetForwardDataTypeOfGradVars(outs); } void OpBase::Run(const framework::OperatorBase& op, @@ -506,6 +521,15 @@ void OpBase::Run(const framework::OperatorBase& op, OpBaseRunImpl(op, ins, outs, attrs, default_attrs, place); } +void OpBase::Run(const framework::OperatorBase& op, + const NameVarMap& ins, + const NameVarMap& outs, + const framework::AttributeMap& attrs, + const framework::AttributeMap& default_attrs, + const platform::Place& place) { + OpBaseRunImpl(op, ins, outs, attrs, default_attrs, place); +} + void ClearNoNeedBufferInputs(OpBase* op) { auto& inferer = op->Info().NoNeedBufferVarsInferer(); if (!inferer) return; @@ -566,5 +590,14 @@ std::shared_ptr CreateGradOpNode( } } +std::shared_ptr CreateGradOpNode( + const framework::OperatorBase& op, const NameTensorMap& ins, + const NameTensorMap& outs, const framework::AttributeMap& attrs, + const framework::AttributeMap& default_attrs, const platform::Place& place, + const std::map& inplace_map) { + // Do Nothing in Eager Mode. 
+ return nullptr; +} + } // namespace imperative } // namespace paddle diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index d27460aeecc..756f26dceff 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -288,6 +288,12 @@ std::shared_ptr CreateGradOpNode( const framework::AttributeMap& default_attrs, const platform::Place& place, const std::map& inplace_map); +std::shared_ptr CreateGradOpNode( + const framework::OperatorBase& op, const NameTensorMap& ins, + const NameTensorMap& outs, const framework::AttributeMap& attrs, + const framework::AttributeMap& default_attrs, const platform::Place& place, + const std::map& inplace_map); + void ClearNoNeedBufferInputs(OpBase* op); } // namespace imperative diff --git a/paddle/fluid/imperative/op_base.h b/paddle/fluid/imperative/op_base.h index 8d27e4f42a5..58c77d0f4b6 100644 --- a/paddle/fluid/imperative/op_base.h +++ b/paddle/fluid/imperative/op_base.h @@ -121,6 +121,8 @@ class OpBase { const framework::AttributeMap& DefaultAttrsMap() { return *default_attrs_; } bool HasAttr(const std::string& name) const { + VLOG(6) << "Default attrs: " << default_attrs_; + VLOG(6) << "attrs: " << &attrs_; return attrs_.count(name) > 0 || default_attrs_->count(name) > 0; } @@ -182,6 +184,12 @@ class OpBase { const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs, const platform::Place& place); + static void Run(const framework::OperatorBase& op, + const NameVarMap& ins, + const NameVarMap& outs, + const framework::AttributeMap& attrs, + const framework::AttributeMap& default_attrs, + const platform::Place& place); bool HasVoidFunctionPostHook() const { return !void_function_post_hooks_.empty(); diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 5d6df145ab3..c8ff561f7af 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/imperative/prepared_operator.h" +#include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/framework/data_type_transform.h" #include "paddle/fluid/framework/details/nan_inf_utils.h" #include "paddle/fluid/imperative/infer_shape_context.h" @@ -24,11 +25,11 @@ #ifdef PADDLE_WITH_XPU #include "paddle/fluid/platform/device/xpu/xpu_op_list.h" #endif +#include "paddle/fluid/framework/library_type.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/profiler.h" DECLARE_bool(check_nan_inf); -DECLARE_bool(run_pten_kernel); DECLARE_bool(benchmark); DECLARE_bool(run_kp_kernel); @@ -56,7 +57,7 @@ const framework::Tensor* GetTensorFromVar(const framework::Variable& var) { } template -static void HandleComplexGradToRealGrad(const NameVarMap& outs) { +void HandleComplexGradToRealGrad(const NameVarMap& outs) { for (auto& pair : outs) { for (auto& var : pair.second) { if (var == nullptr) { @@ -87,6 +88,12 @@ static void HandleComplexGradToRealGrad(const NameVarMap& outs) { } } +template <> +void HandleComplexGradToRealGrad( + const NameVarMap& outs) { + // TODO(jiabin): Support Complex here. 
+} + PreparedOp::PreparedOp(const framework::OperatorBase& op, const framework::RuntimeContext& ctx, const framework::OpKernelType& kernel_type, @@ -145,7 +152,6 @@ PreparedOp PrepareImpl(const NameVarMap& ins, auto dygraph_exe_ctx = DygraphExecutionContext( op, framework::Scope(), *dev_ctx, ctx, ins, outs, attrs, default_attrs); auto expected_kernel_key = op.GetExpectedKernelType(dygraph_exe_ctx); - VLOG(3) << "expected_kernel_key:" << expected_kernel_key; framework::KernelSignature pt_kernel_signature; pten::KernelKey pt_kernel_key; @@ -228,7 +234,31 @@ PreparedOp PrepareImpl(const NameVarMap& ins, expected_kernel_key.place_ = platform::CPUPlace(); kernel_iter = kernels.find(expected_kernel_key); } + #endif + +#ifdef PADDLE_WITH_XPU_KP + bool use_xpu_kp_kernel_rt = + FLAGS_run_kp_kernel && + paddle::platform::is_xpu_kp_support_op(op.Type(), expected_kernel_key); + bool use_xpu_kp_kernel_debug = + paddle::platform::is_in_xpu_kpwhite_list(op.Type()); + if (use_xpu_kp_kernel_rt) { + VLOG(3) << "xpu_kp using rt mode "; + } + if (use_xpu_kp_kernel_debug) { + VLOG(3) << "xpu_kp using debug mode "; + } + if (paddle::platform::is_xpu_place(expected_kernel_key.place_) && + (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug)) { + expected_kernel_key.place_ = platform::XPUPlace(); + expected_kernel_key.library_type_ = paddle::framework::LibraryType::kKP; + kernel_iter = kernels.find(expected_kernel_key); + VLOG(3) << "using XPU KP kernel: " << op.Type() + << ", using_kernel_key:" << expected_kernel_key; + } +#endif + #ifdef PADDLE_WITH_ASCEND_CL if (kernel_iter == kernels.end() && paddle::platform::is_npu_place(expected_kernel_key.place_)) { @@ -282,6 +312,15 @@ PreparedOp PreparedOp::Prepare(const NameVarMap& ins, default_attrs); } +PreparedOp PreparedOp::Prepare(const NameVarMap& ins, + const NameVarMap& outs, + const framework::OperatorWithKernel& op, + const platform::Place& place, + const framework::AttributeMap& attrs, + const framework::AttributeMap& default_attrs) { + return PrepareImpl(ins, outs, op, place, attrs, + default_attrs); +} template static void PreparedOpRunImpl( const framework::OperatorBase& op, const framework::RuntimeContext& ctx, @@ -412,5 +451,20 @@ void PreparedOp::Run(const NameVarMap& ins, } } +void PreparedOp::Run(const NameVarMap& ins, + const NameVarMap& outs, + const framework::AttributeMap& attrs, + const framework::AttributeMap& default_attrs) { + if (run_pten_kernel_) { + PreparedOpRunPtImpl( + op_, kernel_type_, pt_kernel_signature_, pt_kernel_, dev_ctx_, ins, + outs, attrs, default_attrs); + } else { + PreparedOpRunImpl(op_, ctx_, kernel_type_, func_, + dev_ctx_, ins, outs, attrs, + default_attrs); + } +} + } // namespace imperative } // namespace paddle diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index f9165e8ee23..719036742d3 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ b/paddle/fluid/imperative/prepared_operator.h @@ -18,6 +18,7 @@ #include #include +#include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/framework/data_transform.h" #include "paddle/fluid/framework/op_kernel_type.h" #include "paddle/fluid/framework/operator.h" @@ -26,19 +27,10 @@ #include "paddle/fluid/imperative/execution_context.h" #include "paddle/fluid/imperative/layer.h" #include "paddle/fluid/imperative/type_defs.h" +#include "paddle/fluid/imperative/var_helper.h" DECLARE_bool(use_mkldnn); -namespace pten { -class DenseTensor; -} // namespace pten - -namespace paddle { -namespace framework { -class 
Variable; -} // namespace framework -} // namespace paddle - namespace paddle { namespace imperative { @@ -66,10 +58,14 @@ void SetForwardDataTypeOfGradVar(const std::shared_ptr& var) { } } -extern const std::shared_ptr& GetVariableWrapper( - const std::shared_ptr& var); -extern const std::shared_ptr& GetVariableWrapper( - const std::shared_ptr& var); +template <> +void SetForwardDataTypeOfGradVar( + const std::shared_ptr& var) { + VLOG(10) << "Var in Eager dose not support SetForwardDataTypeOfGradVar: " + << var->name(); + // TODO(jiabin): SetForwardDataType of Grad var is not supported yet in + // EagerMode. +} template std::shared_ptr> PrepareData( @@ -78,31 +74,32 @@ std::shared_ptr> PrepareData( std::shared_ptr> tmp_ins_ptr = nullptr; for (const auto& name_pair : ins) { for (size_t i = 0; i < name_pair.second.size(); ++i) { - auto& var_base = name_pair.second[i]; - SetForwardDataTypeOfGradVar(var_base); - const auto* tensor = GetTensorFromVar(var_base->Var()); + auto& template_var = name_pair.second[i]; + SetForwardDataTypeOfGradVar(template_var); + const auto* tensor = GetTensorFromVar(template_var->Var()); if (tensor && tensor->IsInitialized()) { auto kernel_type_for_var = op.GetKernelTypeForVar( name_pair.first, *tensor, expected_kernel_key); if (!NeedTransform(kernel_type_for_var, expected_kernel_key)) { continue; } else { - VLOG(3) << "Transform Variable " << var_base->Name() << " from " - << kernel_type_for_var << " to " << expected_kernel_key; + VLOG(3) << "Transform Variable " << GetNameFromVar(template_var) + << " from " << kernel_type_for_var << " to " + << expected_kernel_key; - if (GetVariableWrapper(var_base)->hasCacheKey(expected_kernel_key)) { + if (CheckCachedKey(template_var, expected_kernel_key)) { VLOG(3) << "Hit variable_wrapper cache: key=" << expected_kernel_key; std::shared_ptr cache_var = - GetVariableWrapper(var_base)->getCacheValue( - expected_kernel_key); + GetCachedValue(template_var, expected_kernel_key); if (tmp_ins_ptr == nullptr) { tmp_ins_ptr = std::make_shared>(ins); } const auto* tensor = GetTensorFromVar(cache_var->Var()); - auto tmp_var = std::make_shared(var_base->Name()); - tmp_var->SetType(var_base->Type()); + auto tmp_var = + std::make_shared(GetNameFromVar(template_var)); + SetType(tmp_var, GetType(template_var)); SetTensorToVariable(cache_var->Var(), *tensor, tmp_var->MutableVar()); (*tmp_ins_ptr)[name_pair.first][i] = tmp_var; @@ -118,20 +115,21 @@ std::shared_ptr> PrepareData( if (tmp_ins_ptr == nullptr) { tmp_ins_ptr = std::make_shared>(ins); } - auto tmp_var = std::make_shared(var_base->Name()); - tmp_var->SetType(var_base->Type()); - SetTensorToVariable(var_base->Var(), out, tmp_var->MutableVar()); + auto tmp_var = + std::make_shared(GetNameFromVar(template_var)); + SetType(tmp_var, GetType(template_var)); + SetTensorToVariable(template_var->Var(), out, + tmp_var->MutableVar()); (*tmp_ins_ptr)[name_pair.first][i] = tmp_var; - - GetVariableWrapper(var_base)->setCacheValue( - expected_kernel_key, GetVariableWrapper(tmp_var)); + SetCachedValue(template_var, expected_kernel_key, tmp_var); VLOG(3) << "Set cache to variable_wrapper: key=" << expected_kernel_key; } else { // if dtype is same, transform inplace will not change the // original // value, transform inplace to avoid multiple copy - SetTensorToVariable(var_base->Var(), out, var_base->MutableVar()); + SetTensorToVariable(template_var->Var(), out, + template_var->MutableVar()); } } } @@ -169,6 +167,13 @@ class PreparedOp { const framework::AttributeMap& attrs, const 
framework::AttributeMap& default_attrs); + static PreparedOp Prepare(const NameVarMap& ins, + const NameVarMap& outs, + const framework::OperatorWithKernel& op, + const platform::Place& place, + const framework::AttributeMap& attrs, + const framework::AttributeMap& default_attrs); + void Run(const NameVarMap& in, const NameVarMap& out, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs); @@ -178,6 +183,11 @@ class PreparedOp { const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs); + void Run(const NameVarMap& ins, + const NameVarMap& outs, + const framework::AttributeMap& attrs, + const framework::AttributeMap& default_attrs); + const framework::OpKernelType& kernel_type() const { return kernel_type_; } private: @@ -190,6 +200,7 @@ class PreparedOp { // new pten kernel, if there is a better design in the future, // we may polish the implementation here bool run_pten_kernel_{false}; + bool run_kp_kernel_{false}; framework::KernelSignature pt_kernel_signature_; pten::Kernel pt_kernel_; }; @@ -415,8 +426,8 @@ void PreparePtenData(const pten::Kernel& pt_kernel, auto& ins_vector = ins.at(input_names[i]); for (size_t offset = 0; offset < ins_vector.size(); ++offset) { - auto var_base = ins_vector[offset]; - const auto* tensor_in = GetTensorFromVar(var_base->Var()); + auto var = ins_vector[offset]; + const auto* tensor_in = GetTensorFromVar(var->Var()); if (tensor_in && tensor_in->IsInitialized()) { auto expected_place = pten::TransToFluidPlace(in_def.backend); if (platform::is_same_place(tensor_in->place(), expected_place)) { @@ -429,8 +440,7 @@ void PreparePtenData(const pten::Kernel& pt_kernel, framework::Tensor tmp_tensor; framework::TensorCopySync(*tensor_in, expected_place, &tmp_tensor); - SetTensorToVariable(var_base->Var(), tmp_tensor, - var_base->MutableVar()); + SetTensorToVariable(var->Var(), tmp_tensor, var->MutableVar()); } } } diff --git a/paddle/fluid/imperative/tests/CMakeLists.txt b/paddle/fluid/imperative/tests/CMakeLists.txt index 88f8076885e..56eb47a2ef1 100644 --- a/paddle/fluid/imperative/tests/CMakeLists.txt +++ b/paddle/fluid/imperative/tests/CMakeLists.txt @@ -17,7 +17,7 @@ cc_test(test_layer SRCS test_layer.cc DEPS layer proto_desc operator op_registry cc_test(test_prepare_op SRCS test_prepare_op.cc DEPS prepared_operator op_info split_op layer concat_and_split activation_op place) cc_test(test_tracer SRCS test_tracer.cc DEPS tracer layer proto_desc operator op_registry variable_helper mul_op reduce_sum_op elementwise_add_op memcpy) cc_test(test_hooks SRCS test_hooks.cc DEPS tracer basic_engine layer proto_desc operator op_registry variable_helper mul_op elementwise_add_op memcpy) - +cc_test(test_eager SRCS test_eager.cc DEPS tracer layer prepared_operator mul_op) if (WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL) cc_test(test_group SRCS test_group.cc DEPS reducer concat_and_split memcpy) endif() diff --git a/paddle/fluid/imperative/tests/test_eager.cc b/paddle/fluid/imperative/tests/test_eager.cc new file mode 100644 index 00000000000..d34cb924d56 --- /dev/null +++ b/paddle/fluid/imperative/tests/test_eager.cc @@ -0,0 +1,100 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/imperative/basic_engine.h" +#include "paddle/fluid/imperative/execution_context.h" +#include "paddle/fluid/imperative/layer.h" +#include "paddle/fluid/imperative/tracer.h" +#include "paddle/fluid/imperative/var_helper.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/pten/core/compat/type_defs.h" + +namespace paddle { +namespace imperative { +extern std::string LayerDebugString(const std::string& op_type, + const NameVarMap& ins, + const NameVarMap& outs); + +extern std::shared_ptr CreateGradOpNode( + const framework::OperatorBase& op, const NameTensorMap& ins, + const NameTensorMap& outs, const framework::AttributeMap& attrs, + const framework::AttributeMap& default_attrs, const platform::Place& place, + const std::map& inplace_map); + +TEST(test_eager, eager_debug) { + std::shared_ptr x_in(new egr::EagerTensor("x_in")); + std::shared_ptr y_in(new egr::EagerTensor("y_in")); + std::shared_ptr vout(new egr::EagerTensor("vout")); + imperative::NameVarMap ins = {{"X", {x_in}}, {"Y", {y_in}}}; + imperative::NameVarMap outs = {{"Out", {vout}}}; + LayerDebugString("mul", ins, outs); +} +TEST(test_create_node, eager_node) { + auto op = framework::OpRegistry::CreateOp("mul", {}, {}, {}, false); + framework::Scope scope; + auto ctx = framework::RuntimeContext({}, {}); + imperative::NameVarMap ins = {{"X", {nullptr}}, + {"Y", {nullptr}}}; + imperative::NameVarMap outs = {{"Out", {nullptr}}}; + CreateGradOpNode((*op.get()), ins, outs, framework::AttributeMap{}, + framework::AttributeMap{}, platform::CPUPlace(), {}); +} +TEST(test_var_helper, eager_var_helper) { + framework::Variable var0, var1, var2, var3, var4, var5, var6, var7, var8; + InitializeVariable(&var0, paddle::framework::proto::VarType::FEED_MINIBATCH); + InitializeVariable(&var1, paddle::framework::proto::VarType::STEP_SCOPES); + InitializeVariable(&var2, paddle::framework::proto::VarType::LOD_RANK_TABLE); + InitializeVariable(&var3, + paddle::framework::proto::VarType::LOD_TENSOR_ARRAY); + InitializeVariable(&var4, paddle::framework::proto::VarType::STRINGS); + InitializeVariable(&var5, paddle::framework::proto::VarType::VOCAB); + InitializeVariable(&var6, paddle::framework::proto::VarType::READER); + InitializeVariable(&var7, paddle::framework::proto::VarType::RAW); + ASSERT_ANY_THROW( + InitializeVariable(&var8, paddle::framework::proto::VarType::FP64)); + + auto egr_tensor = std::make_shared(); + auto egr_tensor2 = std::make_shared(); + egr_tensor->MutableVar() + ->GetMutable() + ->mutable_value() + ->mutable_data(platform::CPUPlace()); + egr_tensor2->MutableVar()->GetMutable(); + VLOG(6) << "egr_tensor create with "; + ASSERT_TRUE(platform::is_cpu_place(GetPlace(egr_tensor))); + ASSERT_TRUE(GetDataType(egr_tensor) == + framework::proto::VarType::FP32); + GetCachedValue( + egr_tensor, framework::OpKernelType(framework::proto::VarType::FP32, + platform::CPUPlace())); + SetCachedValue( + egr_tensor, 
framework::OpKernelType(framework::proto::VarType::FP32, + platform::CPUPlace()), + egr_tensor2); + ASSERT_ANY_THROW(GetPlace(egr_tensor2)); + ASSERT_ANY_THROW(SetType( + egr_tensor, paddle::framework::proto::VarType::LOD_TENSOR_ARRAY)); +} +} // namespace imperative +} // namespace paddle + +USE_OP(mul); diff --git a/paddle/fluid/imperative/tests/test_gradient_accmulator.cc b/paddle/fluid/imperative/tests/test_gradient_accmulator.cc index 25ffab47064..6210cb108bd 100644 --- a/paddle/fluid/imperative/tests/test_gradient_accmulator.cc +++ b/paddle/fluid/imperative/tests/test_gradient_accmulator.cc @@ -28,8 +28,6 @@ namespace framework = paddle::framework; namespace paddle { namespace imperative { -void TensorAdd(const framework::Variable& src, framework::Variable* dst); - template int TensorddTest(Place1 place1, Place2 place2, T t1, T t2) { framework::Variable var1; @@ -69,7 +67,7 @@ int TensorddTest(Place1 place1, Place2 place2, T t1, T t2) { sizeof(T) * dst_data.size(), 0); #endif } - imperative::TensorAdd(var1, &var2); + imperative::TensorAdd(var1, &var2); framework::LoDTensor rlt; platform::CPUPlace rlt_place; framework::TensorCopySync(*dst, rlt_place, &rlt); diff --git a/paddle/fluid/imperative/tests/test_hooks.cc b/paddle/fluid/imperative/tests/test_hooks.cc index 5c4e1538cf0..3a0bb7c52bf 100644 --- a/paddle/fluid/imperative/tests/test_hooks.cc +++ b/paddle/fluid/imperative/tests/test_hooks.cc @@ -107,7 +107,7 @@ TEST(TestHooks, TestGradVarLeafBackwardHook) { std::make_shared>([&]() { hook_value = 10; })); // 2. forward - tracer.TraceOp("mul", ins, outs, mul_attr_map, place, true); + tracer.TraceOp("mul", ins, outs, mul_attr_map, place, true); ASSERT_EQ(x->GradVarBase()->GradOpNum(), 0UL); ASSERT_EQ(y->GradVarBase()->GradOpNum(), 0UL); @@ -194,13 +194,13 @@ void GradVarLeafBackwardHookWithGradAccmulatedTest() { NameVarBaseMap outs = {out_xy_pair}; framework::AttributeMap mul_attr_map; mul_attr_map["use_mkldnn"] = false; - tracer.TraceOp("mul", ins, outs, mul_attr_map, place, true); + tracer.TraceOp("mul", ins, outs, mul_attr_map, place, true); var_pair z_pair = var_pair("Y", vb_vector(1, z)); var_pair out_xz_pair = var_pair("Out", vb_vector(1, out_xz)); ins = {x_pair, z_pair}; outs = {out_xz_pair}; - tracer.TraceOp("mul", ins, outs, mul_attr_map, place, true); + tracer.TraceOp("mul", ins, outs, mul_attr_map, place, true); var_pair xy_pair = var_pair("X", vb_vector(1, out_xy)); var_pair xz_pair = var_pair("Y", vb_vector(1, out_xz)); @@ -208,7 +208,8 @@ void GradVarLeafBackwardHookWithGradAccmulatedTest() { ins = {xy_pair, xz_pair}; outs = {out_pair}; framework::AttributeMap add_attr_map; - tracer.TraceOp("elementwise_add", ins, outs, add_attr_map, place, true); + tracer.TraceOp("elementwise_add", ins, outs, add_attr_map, place, + true); ASSERT_EQ(x->GradVarBase()->GradOpNum(), 0UL); ASSERT_EQ(y->GradVarBase()->GradOpNum(), 0UL); diff --git a/paddle/fluid/imperative/tests/test_layer.cc b/paddle/fluid/imperative/tests/test_layer.cc index c54ed34bb81..bcd4e62e57c 100644 --- a/paddle/fluid/imperative/tests/test_layer.cc +++ b/paddle/fluid/imperative/tests/test_layer.cc @@ -143,7 +143,8 @@ TEST(test_layer, test_runtime_context) { ctx->SyncTypeAndDataType("X", "Out"); - ASSERT_EQ(framework::proto::VarType::FP32, vout->DataType()); + // Remove DataType check, because it doesn't make sense of set dtype in + // dygraph ASSERT_EQ(framework::proto::VarType::LOD_TENSOR, ctx->GetOutputType("Out")); @@ -157,8 +158,8 @@ TEST(test_layer, test_runtime_context) { framework::ALL_ELEMENTS); 
ctx->SetOutputDataType("Out", framework::proto::VarType::INT8); - ASSERT_EQ(framework::proto::VarType::INT8, vout->DataType()); - ASSERT_EQ(framework::proto::VarType::FP64, vout_b->DataType()); + // Remove DataType check, because it doesn't make sense of set dtype in + // dygraph // no throw, but do nothing ASSERT_NO_THROW( diff --git a/paddle/fluid/imperative/tests/test_tracer.cc b/paddle/fluid/imperative/tests/test_tracer.cc index 76de413b3e6..ff3331be56c 100644 --- a/paddle/fluid/imperative/tests/test_tracer.cc +++ b/paddle/fluid/imperative/tests/test_tracer.cc @@ -16,17 +16,18 @@ // Created by Jiabin on 2019-08-16. // -#include - #include #include #include #include #include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/imperative/basic_engine.h" +#include "paddle/fluid/imperative/execution_context.h" #include "paddle/fluid/imperative/tracer.h" #include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/device_context.h" namespace imperative = paddle::imperative; namespace platform = paddle::platform; @@ -71,11 +72,11 @@ TEST(test_tracer, test_trace_op) { imperative::NameVarBaseMap outs = {out_pair}; framework::AttributeMap mul_attr_map; mul_attr_map["use_mkldnn"] = false; - tracer.TraceOp("mul", ins, outs, mul_attr_map, place, true); + tracer.TraceOp("mul", ins, outs, mul_attr_map, place, true); #ifndef PADDLE_WITH_XPU - ASSERT_THROW(tracer.TraceOp("mul", ins, outs, mul_attr_map, - platform::XPUPlace(0), true); + ASSERT_THROW(tracer.TraceOp("mul", ins, outs, mul_attr_map, + platform::XPUPlace(0), true); , platform::EnforceNotMet); #endif @@ -117,7 +118,7 @@ TEST(test_tracer, test_trace_op_with_backward) { imperative::NameVarBaseMap outs = {out_pair}; framework::AttributeMap mul_attr_map; mul_attr_map["use_mkldnn"] = false; - tracer.TraceOp("mul", ins, outs, mul_attr_map, place, true); + tracer.TraceOp("mul", ins, outs, mul_attr_map, place, true); const auto& out_tensor = vout->Var().Get(); for (int i = 0; i < vout->Var().Get().numel(); i++) { ASSERT_EQ(out_tensor.data()[i], 20.0); @@ -157,7 +158,7 @@ TEST(test_tracer, test_track_backward_output) { imperative::NameVarBaseMap outs = {out_pair}; framework::AttributeMap mul_attr_map; mul_attr_map["use_mkldnn"] = false; - tracer.TraceOp("mul", ins, outs, mul_attr_map, place, true); + tracer.TraceOp("mul", ins, outs, mul_attr_map, place, true); ASSERT_EQ(x_in->GradVarBase()->GradOpNum(), 0UL); ASSERT_EQ(y_in->GradVarBase()->GradOpNum(), 0UL); ASSERT_EQ(vout->GradVarBase()->GradOpNum(), 1UL); @@ -196,7 +197,7 @@ TEST(test_tracer, test_track_backward_input) { imperative::NameVarBaseMap outs = {out_pair}; framework::AttributeMap mul_attr_map; mul_attr_map["use_mkldnn"] = false; - tracer.TraceOp("mul", ins, outs, mul_attr_map, place, true); + tracer.TraceOp("mul", ins, outs, mul_attr_map, place, true); ASSERT_EQ(x_in->GradVarBase()->GradOpNum(), 0UL); ASSERT_EQ(y_in->GradVarBase()->GradOpNum(), 0UL); @@ -237,7 +238,8 @@ TEST(test_tracer, test_trace_op_with_multi_device_inputs) { imperative::NameVarBaseMap outs = {out_pair}; framework::AttributeMap mul_attr_map; mul_attr_map["use_mkldnn"] = false; - tracer.TraceOp("elementwise_add", ins, outs, mul_attr_map, gpu_place, true); + tracer.TraceOp("elementwise_add", ins, outs, mul_attr_map, gpu_place, + true); // run reduce sum std::shared_ptr reduce_sum_out( @@ -247,8 +249,8 @@ TEST(test_tracer, test_trace_op_with_multi_device_inputs) { imperative::NameVarBaseMap reduce_in = {reduce_sum_in_pair}; imperative::NameVarBaseMap reduce_out = 
{reduce_sum_out_pair}; framework::AttributeMap reduce_attr_map; - tracer.TraceOp("reduce_sum", reduce_in, reduce_out, reduce_attr_map, - gpu_place, true); + tracer.TraceOp("reduce_sum", reduce_in, reduce_out, reduce_attr_map, + gpu_place, true); imperative::BasicEngine engine; std::vector> tensors{reduce_sum_out}; @@ -368,7 +370,7 @@ TEST(test_tracer, test_var_without_grad_var) { imperative::NameVarBaseMap outs = {out_pair}; framework::AttributeMap mul_attr_map; mul_attr_map["use_mkldnn"] = false; - tracer.TraceOp("mul", ins, outs, mul_attr_map, place, true); + tracer.TraceOp("mul", ins, outs, mul_attr_map, place, true); const auto& out_tensor = vout->Var().Get(); for (int i = 0; i < vout->Var().Get().numel(); i++) { @@ -439,9 +441,9 @@ static void TestVarOpDestructionMain(const platform::Place& place, size_t op_base_num = op_bases.size(); auto z = std::make_shared("z_" + std::to_string(i)); - tracer.TraceOp("mul", NameVarBaseMap{{"X", {x}}, {"Y", {y}}}, - NameVarBaseMap{{"Out", {z}}}, framework::AttributeMap{}, - place, true); + tracer.TraceOp("mul", NameVarBaseMap{{"X", {x}}, {"Y", {y}}}, + NameVarBaseMap{{"Out", {z}}}, + framework::AttributeMap{}, place, true); ASSERT_EQ(z->GradOpNum(), 0UL); ASSERT_EQ(z->GradVarBase()->GradOpNum(), 1UL); @@ -530,6 +532,20 @@ TEST(test_tracer, test_var_op_destruction) { #endif } +TEST(test_tracer, test_execution_context) { + auto op = framework::OpRegistry::CreateOp("mul", {}, {}, {}, false); + framework::Scope scope; + auto ctx = framework::RuntimeContext({}, {}); + NameVarBaseMap ins = {{"X", {nullptr}}, {"Y", {nullptr}}}; + NameVarBaseMap outs = {{"Out", {nullptr}}}; + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto* dev_ctx = pool.Get(platform::CPUPlace()); + auto dy_ctx = DygraphExecutionContext( + (*op.get()), scope, *dev_ctx, ctx, ins, outs, framework::AttributeMap{}, + framework::AttributeMap{}); + ASSERT_EQ(dy_ctx.OutputName("Out"), framework::kEmptyVarName); +} + } // namespace imperative } // namespace paddle diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index e845ce10453..ca8adc97615 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -149,10 +149,14 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists( return gcs_.at(place).get(); } -void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins, - const NameVarBaseMap& outs, framework::AttributeMap attrs, +template +void Tracer::TraceOp(const std::string& type, const NameVarMap& ins, + const NameVarMap& outs, + framework::AttributeMap attrs, const platform::Place& place, bool trace_backward, - const std::map& inplace_map) { + const std::map& inplace_map, + paddle::framework::AttributeMap* passed_default_attrs_, + bool override_default_attr_map) { platform::RecordEvent op_type_record_event(type); platform::ScopedFlushDenormal flush; VLOG(1) << "Trace Op: " << type; @@ -181,13 +185,13 @@ void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins, attr_checker == nullptr ? 
empty_attrs_map : attr_checker->GetDefaultAttrMap(); - NameVarBaseMap new_ins = ins; + NameVarMap new_ins = ins; if (amp_level_ == AmpLevel::O1) { VLOG(5) << "Auto mixed precision run operator: " << type; - new_ins = AutoCastInputs(type, ins); + new_ins = AutoCastInputs(type, ins); } else if (amp_level_ == AmpLevel::O2) { VLOG(5) << "Pure fp16 run operator: " << type; - new_ins = CastPureFp16Inputs(type, ins); + new_ins = CastPureFp16Inputs(type, ins); } try { @@ -220,8 +224,20 @@ void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins, "PaddlePaddle should compile with MLU if use MLUPlace.")); #endif } - - OpBase::Run(*op, new_ins, outs, attrs, default_attrs, place); + if (!override_default_attr_map) { + PADDLE_ENFORCE_NOT_NULL(passed_default_attrs_, + paddle::platform::errors::PermissionDenied( + "Detected default_attrs = nullptr.")); + VLOG(6) << "Use passed in default attrs"; + OpBase::Run(*op, new_ins, outs, attrs, (*passed_default_attrs_), place); + } else { + VLOG(6) << "Use Checker's default attrs"; + if (passed_default_attrs_) { + // TODO(jiabin): Update this without copy + *passed_default_attrs_ = default_attrs; + } + OpBase::Run(*op, new_ins, outs, attrs, default_attrs, place); + } } catch (platform::EnforceNotMet& exception) { framework::AppendErrorOpHint(type, &exception); throw std::move(exception); @@ -249,13 +265,53 @@ void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins, } else { VLOG(3) << "No Grad to track for Op: " << type; } + VLOG(6) << "Finish Trace Op: " << type; } +template void Tracer::TraceOp( + const std::string& type, const NameVarMap& ins, + const NameVarMap& outs, framework::AttributeMap attrs, + const platform::Place& place, bool trace_backward, + const std::map& inplace_map, + paddle::framework::AttributeMap* default_attrs, + bool override_default_attr_map); + +template void Tracer::TraceOp( + const std::string& type, const NameVarMap& ins, + const NameVarMap& outs, framework::AttributeMap attrs, + const platform::Place& place, bool trace_backward, + const std::map& inplace_map_, + paddle::framework::AttributeMap* default_attrs, + bool override_default_attr_map); + void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins, const NameVarBaseMap& outs, framework::AttributeMap attrs, const std::map& inplace_map) { - TraceOp(type, ins, outs, std::move(attrs), expected_place_, has_grad_, - inplace_map); + TraceOp(type, ins, outs, std::move(attrs), expected_place_, + has_grad_, inplace_map); +} + +void Tracer::TraceOp(const std::string& type, const NameTensorMap& ins, + const NameTensorMap& outs, + paddle::framework::AttributeMap attrs, + const paddle::platform::Place& place, + paddle::framework::AttributeMap* default_attrs, + bool override_default_attr_map, + const std::map& inplace_map) { + VLOG(6) << "Running On Eager TraceOp with override_default_attr_map: " + << override_default_attr_map; + TraceOp(type, ins, outs, std::move(attrs), place, false, + inplace_map, default_attrs, + override_default_attr_map); +} + +void Tracer::TraceOp(const std::string& type, const NameTensorMap& ins, + const NameTensorMap& outs, + paddle::framework::AttributeMap attrs, + const std::map& inplace_map) { + VLOG(6) << "Running On Eager TraceOp(less): "; + TraceOp(type, ins, outs, std::move(attrs), expected_place_, + false, inplace_map, nullptr, true); } void Tracer::SetExpectedPlace(platform::Place place) { @@ -280,5 +336,11 @@ bool Tracer::ComputeRequiredGrad(const NameVarBaseMap& ins, return false; } +bool 
Tracer::ComputeRequiredGrad(const NameTensorMap& ins, + const NameTensorMap& outs, + bool trace_backward) { + return false; +} + } // namespace imperative } // namespace paddle diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index bd8521dabde..4e406a9482d 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -63,17 +63,33 @@ class Tracer { ~Tracer() = default; + template + void TraceOp(const std::string& type, const NameVarMap& ins, + const NameVarMap& outs, framework::AttributeMap attrs, + const platform::Place& place, bool trace_backward, + const std::map& inplace_map = {}, + paddle::framework::AttributeMap* passed_default_attrs_ = nullptr, + bool override_default_attr_map = true); + void TraceOp(const std::string& type, const NameVarBaseMap& ins, const NameVarBaseMap& outs, framework::AttributeMap attrs, - const platform::Place& place, bool trace_bacward, const std::map& inplace_map = {}); - void TraceOp(const std::string& type, const NameVarBaseMap& ins, - const NameVarBaseMap& outs, framework::AttributeMap attrs, + void TraceOp(const std::string& type, const NameTensorMap& ins, + const NameTensorMap& outs, paddle::framework::AttributeMap attrs, + const std::map& inplace_map = {}); + + void TraceOp(const std::string& type, const NameTensorMap& ins, + const NameTensorMap& outs, paddle::framework::AttributeMap attrs, + const paddle::platform::Place& place, + paddle::framework::AttributeMap* default_attrs, + bool override_default_attr_map, const std::map& inplace_map = {}); bool ComputeRequiredGrad(const NameVarBaseMap& ins, const NameVarBaseMap& outs, bool trace_backward); + bool ComputeRequiredGrad(const NameTensorMap& ins, const NameTensorMap& outs, + bool trace_backward); void SetEnableProgramDescTracing(bool enabled) { enable_program_desc_tracing_ = enabled; diff --git a/paddle/fluid/imperative/type_defs.h b/paddle/fluid/imperative/type_defs.h index a0258c7a880..a248f29ee9c 100644 --- a/paddle/fluid/imperative/type_defs.h +++ b/paddle/fluid/imperative/type_defs.h @@ -13,4 +13,4 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#include "paddle/pten/core/type_defs.h" +#include "paddle/pten/core/compat/type_defs.h" diff --git a/paddle/fluid/imperative/var_helper.cc b/paddle/fluid/imperative/var_helper.cc new file mode 100644 index 00000000000..4686aef5afd --- /dev/null +++ b/paddle/fluid/imperative/var_helper.cc @@ -0,0 +1,261 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
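The tracer change above and the per-type helpers in the file that follows rely on the same C++ mechanism: the function templates are defined in a .cc file and explicitly instantiated for the handful of variable types callers need, keeping the header light. A small illustrative sketch of that explicit-instantiation pattern, with invented names (Trace, VarLike, TensorLike) rather than the real TraceOp signature:

    #include <iostream>
    #include <string>

    // Hypothetical variable categories; the real code uses imperative::VarBase,
    // imperative::VariableWrapper and egr::EagerTensor.
    struct VarLike {};
    struct TensorLike {};

    // In the real layout the declaration lives in the header...
    template <typename VarType>
    void Trace(const std::string& op_type);

    // ...the definition stays in the .cc file...
    template <typename VarType>
    void Trace(const std::string& op_type) {
      std::cout << "trace " << op_type << "\n";
    }

    // ...followed by explicit instantiations for exactly the types that other
    // translation units are allowed to link against.
    template void Trace<VarLike>(const std::string&);
    template void Trace<TensorLike>(const std::string&);

    int main() {
      Trace<TensorLike>("mul");
      return 0;
    }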
+ +#include "paddle/fluid/imperative/var_helper.h" + +#include "paddle/fluid/eager/eager_tensor.h" +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/imperative/layer.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/pten/core/selected_rows.h" +namespace paddle { +namespace imperative { + +/* GetVariableWrapper */ +template <> +const std::shared_ptr &GetVariableWrapper( + const std::shared_ptr &var) { + return var->SharedVar(); +} +template <> +const std::shared_ptr &GetVariableWrapper( + const std::shared_ptr &var) { + return var; +} + +void InitializeVariable(paddle::framework::Variable *var, + paddle::framework::proto::VarType::Type var_type) { + if (var_type == paddle::framework::proto::VarType::LOD_TENSOR) { + var->GetMutable(); + } else if (var_type == paddle::framework::proto::VarType::SELECTED_ROWS) { + var->GetMutable(); + } else if (var_type == paddle::framework::proto::VarType::FEED_MINIBATCH) { + var->GetMutable(); + } else if (var_type == paddle::framework::proto::VarType::FETCH_LIST) { + var->GetMutable(); + } else if (var_type == paddle::framework::proto::VarType::STEP_SCOPES) { + var->GetMutable>(); + } else if (var_type == paddle::framework::proto::VarType::LOD_RANK_TABLE) { + var->GetMutable(); + } else if (var_type == paddle::framework::proto::VarType::LOD_TENSOR_ARRAY) { + var->GetMutable(); + } else if (var_type == paddle::framework::proto::VarType::STRINGS) { + var->GetMutable(); + } else if (var_type == paddle::framework::proto::VarType::VOCAB) { + var->GetMutable(); + } else if (var_type == paddle::framework::proto::VarType::PLACE_LIST) { + var->GetMutable(); + } else if (var_type == paddle::framework::proto::VarType::READER) { + var->GetMutable(); + } else if (var_type == paddle::framework::proto::VarType::RAW) { + // GetMutable will be called in operator + } else { + PADDLE_THROW(paddle::platform::errors::Unavailable( + "paddle::framework::Variable type %d is not in " + "[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, " + "LOD_RANK_TABLE, PLACE_LIST, READER, RAW].", + var_type)); + } +} + +/* GetPlace */ +template +const paddle::platform::Place &GetPlace(const std::shared_ptr &var) { + paddle::framework::Variable variable = var->Var(); + if (variable.IsType()) { + return variable.Get().place(); + } else if (variable.IsType()) { + return variable.Get().place(); + } else { + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "Variable type is %s, expect LoDTensor or SelectedRows.", + paddle::framework::ToTypeName(var->Var().Type()))); + } +} +template const paddle::platform::Place &GetPlace( + const std::shared_ptr &var); +template const paddle::platform::Place &GetPlace( + const std::shared_ptr &var); +template const paddle::platform::Place &GetPlace( + const std::shared_ptr &var); + +/* GetNameFromVar */ +template +const std::string &GetNameFromVar(std::shared_ptr var) { + return var->Name(); +} +template <> +const std::string &GetNameFromVar( + std::shared_ptr tensor) { + return tensor->name(); +} +template const std::string &GetNameFromVar( + std::shared_ptr var); +template const std::string &GetNameFromVar( + std::shared_ptr var); + +/* SetType */ +template +void 
SetType(std::shared_ptr var, + framework::proto::VarType::Type type) { + var->SetType(type); +} +template <> +void SetType(std::shared_ptr var, + framework::proto::VarType::Type type) { + switch (type) { + case paddle::framework::proto::VarType::LOD_TENSOR: { + var->MutableVar()->GetMutable(); + break; + } + case paddle::framework::proto::VarType::SELECTED_ROWS: { + var->MutableVar()->GetMutable(); + break; + } + default: { + PADDLE_THROW(paddle::platform::errors::NotFound( + "Cannot found var type: %s while running runtime InferVarType", + paddle::framework::ToTypeName(type))); + } + } +} +template void SetType(std::shared_ptr var, + framework::proto::VarType::Type type); +template void SetType(std::shared_ptr var, + framework::proto::VarType::Type type); + +/* GetType */ +template +framework::proto::VarType::Type GetType(std::shared_ptr var) { + return var->Type(); +} +template <> +framework::proto::VarType::Type GetType( + std::shared_ptr var) { + if (var->Var().IsInitialized()) { + return paddle::framework::ToVarType(var->Var().Type()); + } else { + return paddle::framework::proto::VarType::LOD_TENSOR; + } +} +template framework::proto::VarType::Type GetType( + std::shared_ptr var); +template framework::proto::VarType::Type GetType( + std::shared_ptr var); + +/* GetDataType */ +template +framework::proto::VarType::Type GetDataType(std::shared_ptr var) { + return var->DataType(); +} +template <> +framework::proto::VarType::Type GetDataType( + std::shared_ptr var) { + if (var->Var().IsType()) { + return var->Var().Get().value().type(); + } else if (var->Var().IsType()) { + return var->Var().Get().type(); + } else { + PADDLE_THROW(paddle::platform::errors::PermissionDenied( + "We only support pten::SelectedRows and framework::LoDTensor in " + "eager mode, but we got %s here, please checkout your var type of " + "tensor: %s", + paddle::framework::ToTypeName(framework::ToVarType(var->Var().Type())), + var->name())); + } +} +template framework::proto::VarType::Type GetDataType( + std::shared_ptr var); +template framework::proto::VarType::Type GetDataType( + std::shared_ptr var); + +/* CheckCachedKey */ +template +bool CheckCachedKey(std::shared_ptr var, + const paddle::framework::OpKernelType &key) { + return GetVariableWrapper(var)->hasCacheKey(key); +} +template <> +bool CheckCachedKey( + std::shared_ptr tensor, + const paddle::framework::OpKernelType &key) { + // TODO(jiabin): Support this later + // VLOG(10) << "CheckCachedKey with tensor: " << tensor->name() << "and key is + // equal to self: " << key == key. + return false; +} +template bool CheckCachedKey( + std::shared_ptr var, const paddle::framework::OpKernelType &key); +template bool CheckCachedKey( + std::shared_ptr var, + const paddle::framework::OpKernelType &key); + +/* GetCachedValue */ +template +std::shared_ptr GetCachedValue( + std::shared_ptr var, const paddle::framework::OpKernelType &key) { + return GetVariableWrapper(var)->getCacheValue(key); +} +template <> +std::shared_ptr GetCachedValue( + std::shared_ptr var, + const paddle::framework::OpKernelType &key) { + // TODO(jiabin): Support this later + // PADDLE_THROW(platform::errors::Fatal("In eager mode program should not + // reach this, support cache and remove this error check later, or this + // should not be supported.")); + // VLOG(10) << "CheckCachedKey with tensor: " << tensor->name() << "and key + // is equal to self: " << key == key. 
+ return std::make_shared(""); +} +template std::shared_ptr GetCachedValue( + std::shared_ptr var, const paddle::framework::OpKernelType &key); +template std::shared_ptr GetCachedValue( + std::shared_ptr var, + const paddle::framework::OpKernelType &key); + +/* SetCachedValue */ +template +void SetCachedValue(std::shared_ptr var, + const paddle::framework::OpKernelType &key, + std::shared_ptr res) { + GetVariableWrapper(var)->setCacheValue(key, GetVariableWrapper(res)); +} +template <> +void SetCachedValue( + std::shared_ptr tensor, + const paddle::framework::OpKernelType &key, + std::shared_ptr res) { + // PADDLE_THROW(platform::errors::Fatal("In eager mode program should not + // reach this, support cache and remove this error check later, or this + // should not be supported.")); + // VLOG(10) << "CheckCachedKey with tensor: " << tensor->name() << "and key + // is equal to self: " << key == key << " and res name is:" << res->Name(). +} +template void SetCachedValue( + std::shared_ptr var, const paddle::framework::OpKernelType &key, + std::shared_ptr res); +template void SetCachedValue( + std::shared_ptr var, + const paddle::framework::OpKernelType &key, + std::shared_ptr res); +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/var_helper.h b/paddle/fluid/imperative/var_helper.h new file mode 100644 index 00000000000..ff228e0ab84 --- /dev/null +++ b/paddle/fluid/imperative/var_helper.h @@ -0,0 +1,70 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
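The helpers implemented above exist so that code such as PrepareData and the tracer can be written once against free functions (GetNameFromVar, SetType, GetCachedValue, ...) instead of calling VarBase- or EagerTensor-specific members. A rough usage sketch under invented names (DummyVar, MakeShellCopy); the real declarations appear in the header that follows:

    #include <iostream>
    #include <memory>
    #include <string>

    // A made-up variable type; the real helpers are specialized for VarBase,
    // VariableWrapper and egr::EagerTensor, which expose names and types
    // through different member functions.
    struct DummyVar {
      std::string name;
      int type = 0;
    };

    // Free-function access points, analogous to GetNameFromVar / SetType.
    const std::string& GetNameFromVar(const std::shared_ptr<DummyVar>& var) {
      return var->name;
    }
    void SetType(const std::shared_ptr<DummyVar>& var, int type) {
      var->type = type;
    }

    // Generic code (think PrepareData) only touches variables through the
    // helpers, so the same template body serves every supported variable type.
    template <typename VarType>
    std::shared_ptr<VarType> MakeShellCopy(const std::shared_ptr<VarType>& src) {
      auto dst = std::make_shared<VarType>(*src);
      SetType(dst, /*pretend LOD_TENSOR=*/7);
      std::cout << "copied " << GetNameFromVar(dst) << "\n";
      return dst;
    }

    int main() {
      auto v = std::make_shared<DummyVar>();
      v->name = "x_in";
      MakeShellCopy(v);
      return 0;
    }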
+ +#pragma once + +#include +#include "paddle/fluid/framework/variable.h" + +namespace egr { +class EagerTensor; +} // namespace egr +namespace pten { +class DenseTensor; +} +namespace paddle { +namespace framework { +class Variable; +class OpKernelType; +} // namespace framework + +namespace imperative { + +class VarBase; +class VariableWrapper; + +void InitializeVariable(paddle::framework::Variable* var, + paddle::framework::proto::VarType::Type var_type); +template +const paddle::platform::Place& GetPlace(const std::shared_ptr& var); +template +const std::string& GetNameFromVar(std::shared_ptr var); + +template +bool CheckCachedKey(std::shared_ptr tensor, + const paddle::framework::OpKernelType& key); +template +void SetCachedValue(std::shared_ptr tensor, + const paddle::framework::OpKernelType& key, + std::shared_ptr res); +template +std::shared_ptr GetCachedValue( + std::shared_ptr tensor, + const paddle::framework::OpKernelType& key); + +template +void SetType(std::shared_ptr var, + framework::proto::VarType::Type type); + +template +framework::proto::VarType::Type GetType(std::shared_ptr var); + +template +framework::proto::VarType::Type GetDataType(std::shared_ptr var); + +template +const std::shared_ptr& GetVariableWrapper( + const std::shared_ptr& var); +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/inference/lite/test_engine_lite.cc b/paddle/fluid/inference/lite/test_engine_lite.cc index b2750fd070d..85f7d3ee363 100644 --- a/paddle/fluid/inference/lite/test_engine_lite.cc +++ b/paddle/fluid/inference/lite/test_engine_lite.cc @@ -77,6 +77,10 @@ void make_fake_model(std::string* model, std::string* param) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) platform::CUDAPlace place; platform::CUDADeviceContext ctx(place); + ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place, ctx.stream()) + .get()); + ctx.PartialInitWithAllocator(); #else platform::CPUPlace place; platform::CPUDeviceContext ctx(place); diff --git a/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.cu index 3c7b59cec72..ecf06e9bf15 100644 --- a/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.cu @@ -128,10 +128,14 @@ int SpecialSlicePluginDynamic::enqueue( auto input_dims = input_desc[0].dims; // (sum(S), hidden, 1, 1) auto out_dims = output_desc[0].dims; // (batch, hidden, 1, 1) - assert(input_desc[0].type == nvinfer1::DataType::kHALF); - assert(hidden % 128 == 0); + PADDLE_ENFORCE_EQ( + input_desc[0].type, nvinfer1::DataType::kHALF, + platform::errors::InvalidArgument("Type of input should be half.")); const int32_t hidden = input_dims.d[1]; + PADDLE_ENFORCE_EQ(hidden % 128, 0, platform::errors::InvalidArgument( + "hidden should be multiple of 128.")); + constexpr int num_threads = 128; const dim3 blocks(out_dims.d[0], hidden / num_threads); diff --git a/paddle/fluid/inference/tensorrt/test_engine.cc b/paddle/fluid/inference/tensorrt/test_engine.cc index d5cc69ea661..fa03d5320f2 100644 --- a/paddle/fluid/inference/tensorrt/test_engine.cc +++ b/paddle/fluid/inference/tensorrt/test_engine.cc @@ -27,6 +27,18 @@ class TensorRTEngineTest : public ::testing::Test { protected: void SetUp() override { ctx_ = new platform::CUDADeviceContext(platform::CUDAPlace(0)); + ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(platform::CUDAPlace(0), 
ctx_->stream()) + .get()); + ctx_->SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + ctx_->SetZeroAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetZeroAllocator(platform::CUDAPlace(0)) + .get()); + ctx_->PartialInitWithAllocator(); engine_ = new TensorRTEngine(10, 1 << 10); engine_->InitNetwork(); diff --git a/paddle/fluid/memory/allocation/best_fit_allocator_test.cu b/paddle/fluid/memory/allocation/best_fit_allocator_test.cu index 59c14103ca6..cad31f5bba9 100644 --- a/paddle/fluid/memory/allocation/best_fit_allocator_test.cu +++ b/paddle/fluid/memory/allocation/best_fit_allocator_test.cu @@ -18,6 +18,7 @@ #include #include "gtest/gtest.h" +#include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/fluid/memory/allocation/best_fit_allocator.h" #include "paddle/fluid/memory/allocation/cuda_allocator.h" #include "paddle/fluid/memory/allocation/locked_allocator.h" @@ -44,6 +45,10 @@ TEST(BestFitAllocator, concurrent_cuda) { platform::CUDAPlace gpu(0); platform::CUDADeviceContext dev_ctx(gpu); + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(gpu, dev_ctx.stream()) + .get()); + dev_ctx.PartialInitWithAllocator(); auto th_main = [&](std::random_device::result_type seed) { std::default_random_engine engine(seed); diff --git a/paddle/fluid/memory/malloc_test.cu b/paddle/fluid/memory/malloc_test.cu index d015ed7ce69..07577531d64 100644 --- a/paddle/fluid/memory/malloc_test.cu +++ b/paddle/fluid/memory/malloc_test.cu @@ -25,6 +25,7 @@ #include #include "gtest/gtest.h" +#include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/platform/device_context.h" @@ -105,8 +106,21 @@ TEST(Malloc, CUDADeviceContextMultiStream) { main_stream_alloc_ptr.reset(); for (int i = 0; i < NUM_STREAMS; ++i) { - dev_ctx.push_back(std::unique_ptr( - new platform::CUDADeviceContext(place))); + auto ctx = std::unique_ptr( + new platform::CUDADeviceContext(place)); + ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place, ctx->stream()) + .get()); + ctx->SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + ctx->SetZeroAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetZeroAllocator(place) + .get()); + ctx->PartialInitWithAllocator(); + dev_ctx.emplace_back(std::move(ctx)); MultiStreamCompute(&data[i], &second_data[i], *dev_ctx[i]); } @@ -144,8 +158,21 @@ TEST(Malloc, CUDADeviceContextMultiThreadMultiStream) { main_stream_alloc_ptr.reset(); for (int i = 0; i < NUM_STREAMS; ++i) { - dev_ctx.push_back(std::unique_ptr( - new platform::CUDADeviceContext(place))); + auto ctx = std::unique_ptr( + new platform::CUDADeviceContext(place)); + ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place, ctx->stream()) + .get()); + ctx->SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + ctx->SetZeroAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetZeroAllocator(place) + .get()); + ctx->PartialInitWithAllocator(); + dev_ctx.emplace_back(std::move(ctx)); threads.push_back(std::thread(MultiStreamCompute, &data[i], &second_data[i], std::cref(*dev_ctx[i]))); } diff --git a/paddle/fluid/operators/arg_min_max_op_base.cu.h 
b/paddle/fluid/operators/arg_min_max_op_base.cu.h index 2c34d6f8300..89ba2dfb925 100644 --- a/paddle/fluid/operators/arg_min_max_op_base.cu.h +++ b/paddle/fluid/operators/arg_min_max_op_base.cu.h @@ -110,7 +110,7 @@ void ComputeFullArg(const platform::CUDADeviceContext& ctx, const Tensor& input, return block_size; }; - int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize().x; + int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize()[0]; int64_t height = pre * post; int64_t width = n; int64_t grid_size = height < max_grid_dimx ? height : max_grid_dimx; diff --git a/paddle/fluid/operators/argsort_op.cu b/paddle/fluid/operators/argsort_op.cu index 6236a07de4b..b6a1f1f6d23 100644 --- a/paddle/fluid/operators/argsort_op.cu +++ b/paddle/fluid/operators/argsort_op.cu @@ -131,7 +131,7 @@ void ArgFullSort(const platform::CUDADeviceContext& ctx, const Tensor* input, int block_size = ComputeBlockSize(num_cols); - int maxGridDimX = ctx.GetCUDAMaxGridDimSize().x; + int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0]; // actually, int num_rows < max_grid_size int grid_size = num_rows < maxGridDimX ? num_rows : maxGridDimX; // Init a index array @@ -212,7 +212,7 @@ void ArgFullAssign(const platform::CUDADeviceContext& ctx, const Tensor* dO, int block_size = ComputeBlockSize(num_cols); - int maxGridDimX = ctx.GetCUDAMaxGridDimSize().x; + int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0]; // actually, int num_rows < max_grid_size int grid_size = num_rows < maxGridDimX ? num_rows : maxGridDimX; FillGrad<<>>( diff --git a/paddle/fluid/operators/broadcast_tensors_op.cu b/paddle/fluid/operators/broadcast_tensors_op.cu index 718e7ce3966..5a835c7678f 100644 --- a/paddle/fluid/operators/broadcast_tensors_op.cu +++ b/paddle/fluid/operators/broadcast_tensors_op.cu @@ -90,8 +90,8 @@ class CUDABroadcastTensorsGradOpKernel : public framework::OpKernel { // reduce_sum implementation on CUDA auto stream = context.cuda_device_context().stream(); TensorReduceFunctorImpl>( - *input_tensor, output_tensor, kps::IdentityFunctor(), - reduce_dims_vec, stream); + context.cuda_device_context(), *input_tensor, output_tensor, + kps::IdentityFunctor(), reduce_dims_vec, stream); } } } diff --git a/paddle/fluid/operators/cast_op.cc b/paddle/fluid/operators/cast_op.cc index 9e2fe6e2d06..0c49dbe7e0f 100644 --- a/paddle/fluid/operators/cast_op.cc +++ b/paddle/fluid/operators/cast_op.cc @@ -121,11 +121,6 @@ class CastOp : public framework::OperatorWithKernel { #endif return framework::OpKernelType(tensor->type(), tensor_place); } - - framework::KernelSignature GetExpectedPtenKernelArgs( - const framework::ExecutionContext &ctx) const override { - return framework::KernelSignature("cast", {"X"}, {"out_dtype"}, {"Out"}); - } }; } // namespace operators diff --git a/paddle/fluid/operators/cholesky_solve_op.cu b/paddle/fluid/operators/cholesky_solve_op.cu index f42364c9619..a5d5baf19da 100644 --- a/paddle/fluid/operators/cholesky_solve_op.cu +++ b/paddle/fluid/operators/cholesky_solve_op.cu @@ -115,7 +115,8 @@ class MatrixReduceSumFunctor { } gpuStream_t stream = ctx.cuda_device_context().stream(); TensorReduceFunctorImpl>( - in, out, kps::IdentityFunctor(), out_reduce_dims, stream); + ctx.cuda_device_context(), in, out, kps::IdentityFunctor(), + out_reduce_dims, stream); } }; diff --git a/paddle/fluid/operators/clip_by_norm_op.cu b/paddle/fluid/operators/clip_by_norm_op.cu index 4d04fdc8ce2..f2714d13785 100644 --- a/paddle/fluid/operators/clip_by_norm_op.cu +++ b/paddle/fluid/operators/clip_by_norm_op.cu @@ -77,7 +77,7 @@ class ClipByNormKernel {1}, 
dev_ctx); TensorReduceFunctorImpl>( - *input, &tmp, kps::SquareFunctor(), + dev_ctx, *input, &tmp, kps::SquareFunctor(), reduce_dims, dev_ctx.stream()); auto tmp_eigen = EigenVector::Flatten(tmp); auto x_norm = tmp_eigen.sqrt(); diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc index 68a4d09f3b9..2746f034530 100644 --- a/paddle/fluid/operators/concat_op.cc +++ b/paddle/fluid/operators/concat_op.cc @@ -104,15 +104,6 @@ class ConcatOp : public framework::OperatorWithKernel { return framework::OpKernelType(expected_kernel_type.data_type_, tensor.place(), tensor.layout()); } - - framework::KernelSignature GetExpectedPtenKernelArgs( - const framework::ExecutionContext &ctx) const override { - if (ctx.HasInput("AxisTensor")) { - return framework::KernelSignature("concat", {"X"}, {"AxisTensor"}, - {"Out"}); - } - return framework::KernelSignature("concat", {"X"}, {"axis"}, {"Out"}); - } }; class ConcatOpMaker : public framework::OpProtoAndCheckerMaker { diff --git a/paddle/fluid/operators/conj_op.cc b/paddle/fluid/operators/conj_op.cc index 4d801bc003e..c2211f9ab05 100644 --- a/paddle/fluid/operators/conj_op.cc +++ b/paddle/fluid/operators/conj_op.cc @@ -22,22 +22,16 @@ #include "paddle/fluid/platform/mkldnn_helper.h" #endif +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/pten/core/infermeta_utils.h" +#include "paddle/pten/infermeta/unary.h" + namespace paddle { namespace operators { class ConjOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "conj"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "conj"); - - auto in_dims = ctx->GetInputDim("X"); - - ctx->SetOutputDim("Out", in_dims); - ctx->ShareLoD("X", /*->*/ "Out"); - } }; class ConjOpMaker : public framework::OpProtoAndCheckerMaker { @@ -72,9 +66,12 @@ class ConjGradMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; +DELCARE_INFER_SHAPE_FUNCTOR(conj, ConjInferShapeFunctor, + PT_INFER_META(pten::UnchangedInferMeta)); REGISTER_OPERATOR(conj, ops::ConjOp, ops::ConjOpMaker, ops::ConjGradMaker, - ops::ConjGradMaker); + ops::ConjGradMaker, + ConjInferShapeFunctor); REGISTER_OP_CPU_KERNEL( conj, ops::ConjKernel>( - tmp, z, kps::IdentityFunctor(), reduce_dims, stream); + context.cuda_device_context(), tmp, z, kps::IdentityFunctor(), + reduce_dims, stream); } } }; diff --git a/paddle/fluid/operators/copy_cross_scope_test.cc b/paddle/fluid/operators/copy_cross_scope_test.cc index 5f951ad337e..95e30efda0f 100644 --- a/paddle/fluid/operators/copy_cross_scope_test.cc +++ b/paddle/fluid/operators/copy_cross_scope_test.cc @@ -131,12 +131,20 @@ void Compare2(f::Scope* scope, const p::DeviceContext& ctx, TEST(copy_cross_scope, CUDA_fp32) { f::Scope scope; p::CUDADeviceContext ctx(p::CUDAPlace(0)); + ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(p::CUDAPlace(0), ctx.stream()) + .get()); + ctx.PartialInitWithAllocator(); Compare1(&scope, ctx, "copy_cross_scope"); } TEST(copy_cross_scope_to_main_scope, CUDA_fp32) { f::Scope scope; p::CUDADeviceContext ctx(p::CUDAPlace(0)); + ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(p::CUDAPlace(0), ctx.stream()) + .get()); + ctx.PartialInitWithAllocator(); Compare2(&scope, ctx, "copy_cross_scope"); } #elif PADDLE_WITH_ASCEND_CL diff --git 
a/paddle/fluid/operators/digamma_op.cc b/paddle/fluid/operators/digamma_op.cc index b1a58817e06..eb0471fec12 100644 --- a/paddle/fluid/operators/digamma_op.cc +++ b/paddle/fluid/operators/digamma_op.cc @@ -64,6 +64,13 @@ class DigammaGradOp : public framework::OperatorWithKernel { ctx->SetOutputDim(framework::GradVarName("X"), dout_dims); ctx->ShareLoD(framework::GradVarName("Out"), framework::GradVarName("X")); } + + framework::KernelSignature GetExpectedPtenKernelArgs( + const framework::ExecutionContext &ctx) const override { + return framework::KernelSignature("digamma_grad", + {framework::GradVarName("Out"), "X"}, {}, + {framework::GradVarName("X")}); + } }; template @@ -89,12 +96,3 @@ REGISTER_OPERATOR(digamma, ops::DigammaOp, ops::DigammaOpMaker, ops::DigammaGradOpMaker, ops::DigammaGradOpMaker); REGISTER_OPERATOR(digamma_grad, ops::DigammaGradOp); - -REGISTER_OP_CPU_KERNEL( - digamma, ops::DigammaKernel, - ops::DigammaKernel); - -REGISTER_OP_CPU_KERNEL( - digamma_grad, - ops::DigammaGradKernel, - ops::DigammaGradKernel); diff --git a/paddle/fluid/operators/digamma_op.h b/paddle/fluid/operators/digamma_op.h index f82628f0204..85f9094e6a0 100644 --- a/paddle/fluid/operators/digamma_op.h +++ b/paddle/fluid/operators/digamma_op.h @@ -14,86 +14,5 @@ limitations under the License. */ #pragma once -#include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/platform/for_range.h" - -namespace paddle { -namespace operators { - -template -struct DigammaFunctor { - DigammaFunctor(const T* input, T* output, int64_t numel) - : input_(input), output_(output), numel_(numel) {} - - HOSTDEVICE void operator()(int64_t idx) const { - output_[idx] = Eigen::numext::digamma(input_[idx]); - } - - private: - const T* input_; - T* output_; - int64_t numel_; -}; - -template -struct DigammaGradFunctor { - DigammaGradFunctor(const T* dout, const T* x, T* output, int64_t numel) - : dout_(dout), x_(x), output_(output), numel_(numel) {} - - HOSTDEVICE void operator()(int64_t idx) const { - output_[idx] = dout_[idx] * Eigen::numext::polygamma(T(1), x_[idx]); - } - - private: - const T* dout_; - const T* x_; - T* output_; - int64_t numel_; -}; - -using Tensor = framework::Tensor; - -template -class DigammaKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* x = context.Input("X"); - Tensor* out = context.Output("Out"); - - auto numel = x->numel(); - auto* x_data = x->data(); - auto* out_data = out->mutable_data(context.GetPlace(), - size_t(x->numel() * sizeof(T))); - - auto& dev_ctx = context.template device_context(); - platform::ForRange for_range(dev_ctx, numel); - DigammaFunctor functor(x_data, out_data, numel); - for_range(functor); - } -}; - -template -class DigammaGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* d_out = context.Input(framework::GradVarName("Out")); - const Tensor* x = context.Input("X"); - auto* d_x = context.Output(framework::GradVarName("X")); - - auto numel = d_out->numel(); - auto* dout_data = d_out->data(); - auto* x_data = x->data(); - auto* dx_data = d_x->mutable_data( - context.GetPlace(), static_cast(numel * sizeof(T))); - - auto& dev_ctx = context.template device_context(); - platform::ForRange for_range(dev_ctx, numel); - DigammaGradFunctor functor(dout_data, x_data, dx_data, numel); - for_range(functor); - } -}; - -} // namespace 
operators -} // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_functor.h b/paddle/fluid/operators/elementwise/elementwise_functor.h index daca105ce46..19d3a6c385c 100644 --- a/paddle/fluid/operators/elementwise/elementwise_functor.h +++ b/paddle/fluid/operators/elementwise/elementwise_functor.h @@ -15,7 +15,7 @@ limitations under the License. */ #pragma once #include "paddle/fluid/platform/complex.h" -#include "paddle/pten/core/array.h" +#include "paddle/pten/core/utils/array.h" #include "paddle/pten/kernels/funcs/elementwise_functor.h" namespace paddle { diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu index 8923f1fd4b8..56580e6d595 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_mul_op.h" +#include "paddle/pten/backends/gpu/gpu_context.h" namespace ops = paddle::operators; namespace plat = paddle::platform; @@ -51,8 +52,8 @@ class ElementwiseMulKernel auto pt_x = paddle::experimental::MakePtenDenseTensor(*x_lod); auto pt_y = paddle::experimental::MakePtenDenseTensor(*y_lod); auto pt_z = paddle::experimental::MakePtenDenseTensor(*z_lod); - pten::MultiplyRawKernel(cuda_ctx, *pt_x.get(), *pt_y.get(), axis, - pt_z.get()); + pten::MultiplyRawKernel(static_cast(cuda_ctx), + *pt_x.get(), *pt_y.get(), axis, pt_z.get()); } else { PADDLE_THROW(platform::errors::InvalidArgument( "X's type[%s] is not supported by elementwise_op. X's type should be " diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h index 64beac0804d..e18ff9727b2 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_op.h @@ -137,50 +137,6 @@ class ElementwiseOp : public framework::OperatorWithKernel { tensor.place(), tensor.layout()); } } - - framework::KernelSignature GetExpectedPtenKernelArgs( - const framework::ExecutionContext &ctx) const override { - int axis = ctx.Attr("axis"); - if (Type() == "elementwise_add") { - if (ctx.InputVar("X")->IsType()) { - if (axis == -1) { - return framework::KernelSignature("add", {"X", "Y"}, {}, {"Out"}); - } - return framework::KernelSignature("add_raw", {"X", "Y"}, {"axis"}, - {"Out"}); - } - } - if (Type() == "elementwise_sub") { - if (ctx.InputVar("X")->IsType()) { - if (axis == -1) { - return framework::KernelSignature("subtract", {"X", "Y"}, {}, - {"Out"}); - } - return framework::KernelSignature("subtract_raw", {"X", "Y"}, {"axis"}, - {"Out"}); - } - } - if (Type() == "elementwise_div") { - if (ctx.InputVar("X")->IsType()) { - if (axis == -1) { - return framework::KernelSignature("divide", {"X", "Y"}, {}, {"Out"}); - } - return framework::KernelSignature("divide_raw", {"X", "Y"}, {"axis"}, - {"Out"}); - } - } - if (Type() == "elementwise_mul") { - if (ctx.InputVar("X")->IsType()) { - if (axis == -1) { - return framework::KernelSignature("multiply", {"X", "Y"}, {}, - {"Out"}); - } - return framework::KernelSignature("multiply_raw", {"X", "Y"}, {"axis"}, - {"Out"}); - } - } - return framework::KernelSignature("None", {"X"}, {}, {"Out"}); - } }; class ElementwiseOpInferVarType diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h 
b/paddle/fluid/operators/elementwise/elementwise_op_function.h index fdf04181de7..74e74870b8e 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -1189,7 +1189,8 @@ void ReduceWrapper(const platform::CUDADeviceContext &dev_ctx, int axis, framework::Tensor *src, framework::Tensor *dst) { std::vector reduce_dims = GetReduceDim(dst->dims(), src->dims(), axis); TensorReduceFunctorImpl>( - *src, dst, kps::IdentityFunctor(), reduce_dims, dev_ctx.stream()); + dev_ctx, *src, dst, kps::IdentityFunctor(), reduce_dims, + dev_ctx.stream()); } template diff --git a/paddle/fluid/operators/empty_op.cc b/paddle/fluid/operators/empty_op.cc index 71780971560..3d28ca90a5a 100644 --- a/paddle/fluid/operators/empty_op.cc +++ b/paddle/fluid/operators/empty_op.cc @@ -109,20 +109,6 @@ class EmptyOp : public framework::OperatorWithKernel { framework::proto::VarType::Type(context.Attr("dtype")), context.GetPlace()); } - - framework::KernelSignature GetExpectedPtenKernelArgs( - const framework::ExecutionContext& ctx) const override { - std::string shape; - if (ctx.HasInput("ShapeTensor")) { - shape = "ShapeTensor"; - } else if (ctx.MultiInput("ShapeTensorList").size()) { - shape = "ShapeTensorList"; - } else { - shape = "shape"; - } - - return framework::KernelSignature("empty", {}, {shape}, {"Out"}); - } }; class EmptyOpVarTypeInference : public framework::VarTypeInference { diff --git a/paddle/fluid/operators/feed_forward_test.cu b/paddle/fluid/operators/feed_forward_test.cu index 9f44c39a92c..dea427393b1 100644 --- a/paddle/fluid/operators/feed_forward_test.cu +++ b/paddle/fluid/operators/feed_forward_test.cu @@ -275,6 +275,18 @@ class TestFeedForward { output_size_ = 3 * num_head_ * dim_head_; input_size_ = dim_embed_; ctx_ = new platform::CUDADeviceContext(place_); + ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place_, ctx_->stream()) + .get()); + ctx_->SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + ctx_->SetZeroAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetZeroAllocator(place_) + .get()); + ctx_->PartialInitWithAllocator(); size_src_ = bsz_seq_ * dim_embed_; // src: [bs, seq_len, em_dim] size_weight_ = dim_embed_ * output_size_; // weight: [output_size, em_dim] diff --git a/paddle/fluid/operators/fill_any_like_op.cc b/paddle/fluid/operators/fill_any_like_op.cc index 245a8977c0b..1e908d5ead9 100644 --- a/paddle/fluid/operators/fill_any_like_op.cc +++ b/paddle/fluid/operators/fill_any_like_op.cc @@ -47,11 +47,6 @@ class FillAnyLikeOp : public framework::OperatorWithKernel { expected_kernel_type.place_, tensor.layout()); } - - framework::KernelSignature GetExpectedPtenKernelArgs( - const framework::ExecutionContext &ctx) const override { - return framework::KernelSignature("full_like", {}, {"value"}, {"Out"}); - } }; class FillAnyLikeOpMaker : public framework::OpProtoAndCheckerMaker { diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc index c0e2b4584d0..04c2d027cac 100644 --- a/paddle/fluid/operators/fill_constant_op.cc +++ b/paddle/fluid/operators/fill_constant_op.cc @@ -99,29 +99,6 @@ class FillConstantOp : public framework::OperatorWithKernel { return kt; } - - framework::KernelSignature GetExpectedPtenKernelArgs( - const framework::ExecutionContext& ctx) const override { - std::string shape; - if 
(ctx.HasInput("ShapeTensor")) { - shape = "ShapeTensor"; - } else if (ctx.MultiInput("ShapeTensorList").size()) { - shape = "ShapeTensorList"; - } else { - shape = "shape"; - } - std::string value; - if (ctx.HasInput("ValueTensor")) { - value = "ValueTensor"; - } else { - const auto& str_value = ctx.Attr("str_value"); - value = str_value.empty() ? "value" : "str_value"; - } - if (!ctx.OutputVar("Out")->IsType()) { - return framework::KernelSignature("full", {}, {shape, value}, {"Out"}); - } - return framework::KernelSignature("fill_constant.unregistered", {}, {}, {}); - } }; class FillConstantOpVarTypeInference : public framework::VarTypeInference { diff --git a/paddle/fluid/operators/flatten_op.cc b/paddle/fluid/operators/flatten_op.cc index 6b1ee00b55d..110e6f1d025 100644 --- a/paddle/fluid/operators/flatten_op.cc +++ b/paddle/fluid/operators/flatten_op.cc @@ -333,18 +333,6 @@ class FlattenContiguousRangeOp : public framework::OperatorWithKernel { return out_shape; } - - framework::KernelSignature GetExpectedPtenKernelArgs( - const framework::ExecutionContext &ctx) const override { - if (ctx.HasOutput("XShape")) { - return framework::KernelSignature("flatten_with_xshape", {"X"}, - {"start_axis", "stop_axis"}, - {"Out", "XShape"}); - } else { - return framework::KernelSignature("flatten", {"X"}, - {"start_axis", "stop_axis"}, {"Out"}); - } - } }; class FlattenContiguousRangeOpMaker : public FlattenOpMaker { diff --git a/paddle/fluid/operators/fused/attn_gemm.h b/paddle/fluid/operators/fused/attn_gemm.h index b9b881cf83e..1128997fd25 100644 --- a/paddle/fluid/operators/fused/attn_gemm.h +++ b/paddle/fluid/operators/fused/attn_gemm.h @@ -166,7 +166,8 @@ class AttnMatMul { if (support_case_1 || support_case_2) { gpuStream_t stream = dev_ctx_.stream(); TensorReduceFunctorImpl>( - *d_output, d_bias, kps::IdentityFunctor(), {0, 1}, stream); + dev_ctx_, *d_output, d_bias, kps::IdentityFunctor(), {0, 1}, + stream); } else { PADDLE_THROW(platform::errors::InvalidArgument( "Only support reduce when the input dims are [0,1,2,3,4] and " diff --git a/paddle/fluid/operators/fused/fused_dropout_helper.h b/paddle/fluid/operators/fused/fused_dropout_helper.h index 4aa8b65635e..782c5d70ee0 100644 --- a/paddle/fluid/operators/fused/fused_dropout_helper.h +++ b/paddle/fluid/operators/fused/fused_dropout_helper.h @@ -284,11 +284,30 @@ class FusedDropoutLayerNormHelper : public FusedDropoutHelper { P* d_layernorm_bias, T* d_dropout_src, T* d_bias, T* d_residual) { using U = LayerNormParamType; - LayerNormBackward( - layernorm_src, d_out, gamma, mean, variance, d_layernorm_src, d_scale, - d_layernorm_bias, epsilon_, this->rows_, this->cols_, ctx); - this->ResidualDropoutBiasGrad(ctx, d_layernorm_src, mask, d_dropout_src, - d_residual, d_bias); + bool can_call_1024_kernel = false; + // Fast impl for cases when cols is 1024 and linear_bias is nullptr. + // In fact, linear_bias is not nullptr is also feasible for impl. + // Here, we do not support it. 
+ if (this->cols_ == 1024 && d_bias == nullptr && d_scale != nullptr && + d_layernorm_bias != nullptr && sizeof(T) <= 4) { + can_call_1024_kernel = true; + } + VLOG(6) << "LaunchLayernormResidualDropoutGrad = " << can_call_1024_kernel; + + if (can_call_1024_kernel) { + LaunchLayernormResidualDropoutGrad( + ctx, this->rows_, this->cols_, epsilon_, + this->dropout_param_.dropout_prob, + this->dropout_param_.is_upscale_in_train, d_out, layernorm_src, gamma, + mean, variance, mask, d_scale, d_layernorm_bias, d_residual, + d_dropout_src); + } else { + LayerNormBackward( + layernorm_src, d_out, gamma, mean, variance, d_layernorm_src, d_scale, + d_layernorm_bias, epsilon_, this->rows_, this->cols_, ctx); + this->ResidualDropoutBiasGrad(ctx, d_layernorm_src, mask, d_dropout_src, + d_residual, d_bias); + } } protected: diff --git a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h index babf1c657f2..911c2cda575 100644 --- a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h @@ -441,5 +441,30 @@ void LaunchLayernormResidualDropoutBias( } } +template +void LaunchLayernormResidualDropoutGrad( + const platform::CUDADeviceContext &dev_ctx, const uint32_t rows, + const uint32_t cols, const float epsilon, const float dropout_prob, + const bool is_upscale_in_train, const T *d_out, const T *layernorm_src, + const LayerNormScaleBiasT *scale, + const LayerNormParamType *mean, const LayerNormParamType *var, + const MaskType *mask_data, + LayerNormScaleBiasT *d_scale, + LayerNormScaleBiasT *d_layernorm_bias, + T *d_residual, T *d_dropout_src) { + const T zero = static_cast(0.0f); + auto factor = dropout_prob == static_cast(1.0f) + ? 
zero + : static_cast(1.0f / (1.0f - dropout_prob)); + if (!is_upscale_in_train) { + factor = static_cast(1.0f); + } + ln_bwd_1024_kernel_driver< + T, U, LayerNormScaleBiasT, MaskType>( + dev_ctx, rows, cols, epsilon, layernorm_src, scale, mean, var, d_out, + d_residual, d_scale, d_layernorm_bias, mask_data, factor, d_dropout_src); +} + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/gelu_op.cu b/paddle/fluid/operators/gelu_op.cu index 73694471799..6b778eee434 100644 --- a/paddle/fluid/operators/gelu_op.cu +++ b/paddle/fluid/operators/gelu_op.cu @@ -108,7 +108,7 @@ static bool TryLaunchFP16FastGeluFwdVectorizeCUDAKernel( is_aligned(y, kAlignment)) { \ size_t thread = std::min(512, dev_ctx.GetMaxThreadsPerBlock()); \ size_t block = (n / __vec_size + thread - 1) / thread; \ - block = std::min(block, dev_ctx.GetCUDAMaxGridDimSize().x); \ + block = std::min(block, dev_ctx.GetCUDAMaxGridDimSize()[0]); \ VLOG(10) << "Use FP16 fast gelu fwd kernel, block = " << block \ << " , thread = " << thread; \ FP16FastGeluFwdCUDAKernel< \ @@ -144,7 +144,7 @@ static bool TryLaunchFP16FastGeluBwdVectorizeCUDAKernel( is_aligned(x_g, kAlignment)) { \ size_t thread = std::min(512, dev_ctx.GetMaxThreadsPerBlock()); \ size_t block = (n / __vec_size + thread - 1) / thread; \ - block = std::min(block, dev_ctx.GetCUDAMaxGridDimSize().x); \ + block = std::min(block, dev_ctx.GetCUDAMaxGridDimSize()[0]); \ VLOG(10) << "Use FP16 fast gelu bwd kernel, block = " << block \ << " , thread = " << thread; \ FP16FastGeluBwdCUDAKernel< \ diff --git a/paddle/fluid/operators/graph_khop_sampler_op.cu b/paddle/fluid/operators/graph_khop_sampler_op.cu index 777ec64f6e0..df977b43512 100644 --- a/paddle/fluid/operators/graph_khop_sampler_op.cu +++ b/paddle/fluid/operators/graph_khop_sampler_op.cu @@ -260,7 +260,7 @@ void FillHashTable(const framework::ExecutionContext& ctx, const T* input, int block = 1024; #endif const auto& dev_ctx = ctx.cuda_device_context(); - int max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize().x; + int max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0]; int grid_tmp = (num_input + block - 1) / block; int grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx; // 1. Insert data into keys and values. @@ -334,7 +334,7 @@ void ReindexFunc(const framework::ExecutionContext& ctx, int block = 1024; #endif const auto& dev_ctx = ctx.cuda_device_context(); - int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize().x; + int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0]; int64_t grid_tmp = (outputs->size() + block - 1) / block; int64_t grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx; ReindexSrcOutput< diff --git a/paddle/fluid/operators/graph_send_recv_op.cu b/paddle/fluid/operators/graph_send_recv_op.cu index 6e5e203e2d9..446ad2d97a7 100644 --- a/paddle/fluid/operators/graph_send_recv_op.cu +++ b/paddle/fluid/operators/graph_send_recv_op.cu @@ -197,7 +197,7 @@ void GraphSendRecvOpCUDAKernelLaunchHelper( #endif int64_t n = slice_size * index_size; const auto& dev_ctx = ctx.cuda_device_context(); - int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize().x; + int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0]; int64_t grid_tmp = (n + block - 1) / block; int64_t grid = grid_tmp < max_grid_dimx ? 
grid_tmp : max_grid_dimx; int64_t input_size = src_dims[0]; @@ -320,7 +320,7 @@ void GraphSendRecvGradOpCUDAKernelLaunchHelper( #endif int64_t n = slice_size * index_size; const auto& dev_ctx = ctx.cuda_device_context(); - int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize().x; + int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0]; int64_t grid_tmp = (n + block - 1) / block; int64_t grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx; int64_t input_size = src_dims[0]; diff --git a/paddle/fluid/operators/gumbel_softmax_op.cu b/paddle/fluid/operators/gumbel_softmax_op.cu index 4e91e689fa5..63577ed1e0f 100644 --- a/paddle/fluid/operators/gumbel_softmax_op.cu +++ b/paddle/fluid/operators/gumbel_softmax_op.cu @@ -92,7 +92,7 @@ struct OneHotGenerator { const int size_from_axis = SizeFromAxis(axis, X.dims()); const int size_out_axis = SizeOutAxis(axis, X.dims()); constexpr int thread_size = 512; - int64_t max_grid_dimx = context.GetCUDAMaxGridDimSize().x; + int64_t max_grid_dimx = context.GetCUDAMaxGridDimSize()[0]; int64_t height = size_to_axis * size_out_axis; int block_size = height < max_grid_dimx ? height : max_grid_dimx; diff --git a/paddle/fluid/operators/index_sample_op.cu b/paddle/fluid/operators/index_sample_op.cu index 45f63c2b2fb..4c9dec14000 100644 --- a/paddle/fluid/operators/index_sample_op.cu +++ b/paddle/fluid/operators/index_sample_op.cu @@ -27,10 +27,10 @@ namespace operators { namespace { void LimitGridDim(const framework::ExecutionContext& ctx, dim3* grid_dim) { - dim3 max_grid_dim = ctx.template device_context() + auto max_grid_dim = ctx.template device_context() .GetCUDAMaxGridDimSize(); - grid_dim->x = grid_dim->x < max_grid_dim.x ? grid_dim->x : max_grid_dim.x; - grid_dim->y = grid_dim->y < max_grid_dim.y ? grid_dim->y : max_grid_dim.y; + grid_dim->x = grid_dim->x < max_grid_dim[0] ? grid_dim->x : max_grid_dim[0]; + grid_dim->y = grid_dim->y < max_grid_dim[1] ? 
grid_dim->y : max_grid_dim[1]; } } diff --git a/paddle/fluid/operators/interpolate_v2_op.cu b/paddle/fluid/operators/interpolate_v2_op.cu index a4d5e75e728..3901a251545 100644 --- a/paddle/fluid/operators/interpolate_v2_op.cu +++ b/paddle/fluid/operators/interpolate_v2_op.cu @@ -45,11 +45,11 @@ inline platform::GpuLaunchConfig GetGpuLaunchConfig3D( int block_y = std::min(GetLastPow2(height), max_threads / block_x); int block_z = std::min(num_img, max_threads / block_x / block_y); - dim3 max_grid_dim = context.GetCUDAMaxGridDimSize(); - int grid_x = std::min(max_grid_dim.x, platform::DivUp(width, block_x)); - int grid_y = std::min(max_grid_dim.y, platform::DivUp(height, block_y)); + auto max_grid_dim = context.GetCUDAMaxGridDimSize(); + int grid_x = std::min(max_grid_dim[0], platform::DivUp(width, block_x)); + int grid_y = std::min(max_grid_dim[1], platform::DivUp(height, block_y)); int grid_z = - std::min(max_grid_dim.z, platform::DivUp(num_img, block_z * 4)); + std::min(max_grid_dim[2], platform::DivUp(num_img, block_z * 4)); const int capability = context.GetComputeCapability(); platform::GpuLaunchConfig config; diff --git a/paddle/fluid/operators/kron_op.h b/paddle/fluid/operators/kron_op.h index 6039d8c6240..ff3baf4d70f 100644 --- a/paddle/fluid/operators/kron_op.h +++ b/paddle/fluid/operators/kron_op.h @@ -306,11 +306,11 @@ struct KronGradOpFunctor { auto stream = dev_ctx.stream(); // it is a cuda device_context if (dx) { TensorReduceFunctorImpl>( - dout_x, dx, kps::IdentityFunctor(), {1}, stream); + dev_ctx, dout_x, dx, kps::IdentityFunctor(), {1}, stream); } if (dy) { TensorReduceFunctorImpl>( - dout_y, dy, kps::IdentityFunctor(), {1}, stream); + dev_ctx, dout_y, dy, kps::IdentityFunctor(), {1}, stream); } #else auto* place = dev_ctx.eigen_device(); diff --git a/paddle/fluid/operators/kthvalue_op.cu b/paddle/fluid/operators/kthvalue_op.cu index c6c62a763aa..0941f9d4c3b 100644 --- a/paddle/fluid/operators/kthvalue_op.cu +++ b/paddle/fluid/operators/kthvalue_op.cu @@ -54,7 +54,7 @@ bool SortKthvalue(const platform::CUDADeviceContext& ctx, input_indices.mutable_data(ctx.GetPlace()); size_t temp_storage_bytes = -1; int block_size = getBlockSize(num_cols); - unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize().x; + unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0]; unsigned int grid_size = num_rows < maxGridDimX ? 
static_cast(num_rows) : maxGridDimX; diff --git a/paddle/fluid/operators/layer_norm_kernel.cu.h b/paddle/fluid/operators/layer_norm_kernel.cu.h index bc00d875cd1..da4932543a0 100644 --- a/paddle/fluid/operators/layer_norm_kernel.cu.h +++ b/paddle/fluid/operators/layer_norm_kernel.cu.h @@ -385,6 +385,471 @@ __inline__ __device__ void cuLoadAddStridedInputs( } } +#ifdef PADDLE_WITH_CUDA +template < + bool isFusedDropoutResidualLn, typename T, typename U, typename ScaleT = U, + typename MaskType = uint8_t, int VecSize = 8, int WARPS_M = 4, + int WARPS_N = 1, int BYTES_PER_LDG = 16, int ELTS_PER_ROW = 1024, + int THREADS_PER_WARP = 32, int THREADS_PER_ROW = WARPS_N *THREADS_PER_WARP, + int THREADS_PER_CTA = WARPS_M *THREADS_PER_ROW, int ROWS_PER_CTA = WARPS_M, + int ELTS_PER_ROW_PER_CTA = THREADS_PER_ROW *VecSize, + int LDGS = ELTS_PER_ROW / ELTS_PER_ROW_PER_CTA> +__global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_bwd_1024_kernel( + const int rows, float epsilon, const T *__restrict__ x_ptr, + const ScaleT *__restrict__ gamma_ptr, const U *__restrict__ mean_ptr, + const U *__restrict__ var_ptr, const T *__restrict__ dout_ptr, + U *__restrict__ dgamma_temp_ptr, U *__restrict__ dbeta_temp_ptr, + T *__restrict__ dx_ptr, const MaskType *mask_ptr = nullptr, + T factor = static_cast(0), T *d_dropout_src_ptr = nullptr) { + using Vec = platform::AlignedVector; + using Vec_scale = platform::AlignedVector; + using MaskLoadT = platform::AlignedVector; + + const int tidx = threadIdx.x; + const int bidx = blockIdx.x; + const int lane = tidx % THREADS_PER_WARP; // 0, 1, ..., 31 + const int warp = tidx / THREADS_PER_WARP; // 0, 1, 2, 3 + const int warp_m = warp / WARPS_N; // 0, 1, 2, 3 + const int warp_n = warp % WARPS_N; // 0 + const int tid_r = warp_n * THREADS_PER_WARP + lane; // 0, 1, ..., 31 + + const int r = bidx * ROWS_PER_CTA + warp_m; + const int c = warp_n * THREADS_PER_WARP + lane; + + static_assert(LN_NUM_COLS == THREADS_PER_ROW * LDGS * VecSize, ""); + + // smem for column reduction + __shared__ U smem_[ROWS_PER_CTA * LN_NUM_COLS]; + + U dgamma_sum[LDGS * VecSize]; + U dbeta_sum[LDGS * VecSize]; + + memset(dgamma_sum, 0, sizeof(U) * LDGS * VecSize); + memset(dbeta_sum, 0, sizeof(U) * LDGS * VecSize); + + // Note: it is no use for WARP_N = 1 + __shared__ U smem_sum_loss1[ROWS_PER_CTA * WARPS_N]; // 4 + __shared__ U smem_sum_loss2[ROWS_PER_CTA * WARPS_N]; // 4 + U *sum_loss1_shared = &smem_sum_loss1[warp_m * WARPS_N]; + U *sum_loss2_shared = &smem_sum_loss2[warp_m * WARPS_N]; + + // step-1: compute dx and local results of dscale and dbias + constexpr float rn = 1.f / static_cast(LN_NUM_COLS); + Vec_scale gamma[LDGS]; + int col = c; +#pragma unroll + for (int it = 0; it < LDGS; it++) { + platform::Load(gamma_ptr + col * VecSize, &gamma[it]); + col += THREADS_PER_ROW; + } + +#pragma unroll 1 + for (int row = r; row < rows; row += gridDim.x * ROWS_PER_CTA) { + const U mean_cur_row = mean_ptr[row]; + const U var_cur_row = rsqrt_(var_ptr[row] + epsilon); + Vec dout[LDGS], x[LDGS]; + MaskLoadT mask_vec[LDGS]; + int col = c; +#pragma unroll + for (int it = 0; it < LDGS; it++) { + platform::Load(dout_ptr + row * LN_NUM_COLS + col * VecSize, + &dout[it]); + platform::Load(x_ptr + row * LN_NUM_COLS + col * VecSize, + &x[it]); + if (isFusedDropoutResidualLn) { + platform::Load( + mask_ptr + row * LN_NUM_COLS + col * VecSize, &mask_vec[it]); + } + + col += THREADS_PER_ROW; + } + + // local reductions + U dy[LDGS * VecSize]; + U y[LDGS * VecSize]; + + U sum_loss1 = 0.f; + U sum_loss2 = 0.f; +#pragma 
unroll + for (int it = 0; it < LDGS; it++) { +#pragma unroll + for (int jt = 0; jt < VecSize; jt++) { + U x_tmp = x[it][jt]; + U y_tmp = var_cur_row * (x_tmp - mean_cur_row); + U dy_tmp = static_cast(gamma[it][jt]) * + static_cast(dout[it][jt]); // scale * dy + U dout_tmp = dout[it][jt]; // dy + + // used for get dx (row reduction) + sum_loss1 += dy_tmp; // scale * dy, sum_1 + sum_loss2 += dy_tmp * y_tmp; // scale * dy * y, sum_2 + + dy[it * VecSize + jt] = dy_tmp; // scale * dy + y[it * VecSize + jt] = y_tmp; // y + + // used for get dscale and dbias (column reduction) + dgamma_sum[it * VecSize + jt] += dout_tmp * y_tmp; // dy * y + dbeta_sum[it * VecSize + jt] += dout_tmp; // dy + } + } + + // reduction across row for sum_loss1, sum_loss2 + if (WARPS_N == 1) { +#pragma unroll + // row reduction among 32 threads. + for (int it = 1; it < THREADS_PER_WARP; it *= 2) { + sum_loss1 += __shfl_xor_sync(uint32_t(-1), sum_loss1, it); + sum_loss2 += __shfl_xor_sync(uint32_t(-1), sum_loss2, it); + } + sum_loss1 *= rn; + sum_loss2 *= rn; + } else { +#pragma unroll + for (int it = 16; it > 0; it /= 2) { + sum_loss1 += __shfl_down_sync(uint32_t(-1), sum_loss1, it); + sum_loss2 += __shfl_down_sync(uint32_t(-1), sum_loss2, it); + } + + if (lane == 0) { + sum_loss1_shared[warp_n] = sum_loss1; + sum_loss2_shared[warp_n] = sum_loss2; + } + + __syncthreads(); + if (warp_n == 0 && lane == 0) { + sum_loss1 = 0.f; + sum_loss2 = 0.f; + for (int it = 0; it < WARPS_N; it++) { + sum_loss1 += sum_loss1_shared[it]; + sum_loss2 += sum_loss2_shared[it]; + } + sum_loss1_shared[0] = sum_loss1; + sum_loss2_shared[0] = sum_loss2; + } + __syncthreads(); + + sum_loss1 = sum_loss1_shared[0] * rn; + sum_loss2 = sum_loss2_shared[0] * rn; + } + +#pragma unroll + for (int it = 0; it < LDGS; it++) { +#pragma unroll + for (int jt = 0; jt < VecSize; jt++) { + U dy_tmp = dy[it * VecSize + jt]; // scale * dy + U y_tmp = y[it * VecSize + jt]; // y + // dx = var * (scale * dy - sum_loss2 * y - sum_loss1) + U dx_tmp = var_cur_row * (dy_tmp - sum_loss2 * y_tmp - sum_loss1); + // Note: reuse x and dout vec register to store dx and d_dropout_src. + x[it][jt] = static_cast(dx_tmp); + if (isFusedDropoutResidualLn) { + dout[it][jt] = x[it][jt] * static_cast(mask_vec[it][jt]) * factor; + } + } + } + + // store dx to global memory + col = c; +#pragma unroll + for (int it = 0; it < LDGS; it++) { + platform::Store(x[it], + dx_ptr + row * LN_NUM_COLS + col * VecSize); + if (isFusedDropoutResidualLn) { + platform::Store( + dout[it], d_dropout_src_ptr + row * LN_NUM_COLS + col * VecSize); + } + col += THREADS_PER_ROW; + } + } + + // step-2: column reduction of dscale and dbias for each thread block. + // each block's sum: [4 * 1024] -> [1 * 1024] + enum { NUM_RES = LN_NUM_COLS / THREADS_PER_CTA }; // 1024/128 = 8 + static_assert(NUM_RES * THREADS_PER_CTA == LN_NUM_COLS, ""); + + U *smem_write; + + smem_write = &smem_[warp_m * LN_NUM_COLS + tid_r * VecSize]; // [4 * 1024] +#pragma unroll + for (int it = 0; it < LDGS; it++) { +#pragma unroll + for (int jt = 0; jt < VecSize; jt++) { + smem_write[jt] = dbeta_sum[it * VecSize + jt]; + } + smem_write += THREADS_PER_ROW * VecSize; // 32*8 + } + __syncthreads(); + U cta_dbeta_sum[NUM_RES]; + memset(cta_dbeta_sum, 0, sizeof(U) * NUM_RES); + // column reduction for elems in smem: 4*1024 -> 1*1024. 
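// Illustrative aside, not part of this patch: a standalone sketch of the butterfly
// warp reduction used above for sum_loss1/sum_loss2 (the __shfl_xor_sync loop over
// offsets 1, 2, 4, 8, 16). After the loop every lane of the warp holds the full
// 32-lane sum, so each thread can scale its own dx without a shared-memory round
// trip. Kernel and variable names here are hypothetical.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void WarpButterflySum(const float* in, float* out) {
  float v = in[threadIdx.x];  // one value per lane of a single warp
#pragma unroll
  for (int offset = 1; offset < 32; offset *= 2) {
    v += __shfl_xor_sync(0xffffffffu, v, offset);  // exchange with lane ^ offset
  }
  out[threadIdx.x] = v;  // every lane now holds the warp-wide sum
}

int main() {
  float h_in[32], h_out[32];
  for (int i = 0; i < 32; ++i) h_in[i] = static_cast<float>(i);  // 0+1+...+31 = 496
  float *d_in, *d_out;
  cudaMalloc(&d_in, sizeof(h_in));
  cudaMalloc(&d_out, sizeof(h_out));
  cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);
  WarpButterflySum<<<1, 32>>>(d_in, d_out);
  cudaMemcpy(h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost);
  std::printf("lane 0 sum = %g (expect 496)\n", h_out[0]);
  cudaFree(d_in);
  cudaFree(d_out);
  return 0;
}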
+ for (int it = 0; it < ROWS_PER_CTA; it++) { + for (int jt = 0; jt < NUM_RES; jt++) { + cta_dbeta_sum[jt] += + smem_[it * LN_NUM_COLS + tidx + jt * THREADS_PER_CTA]; + } + } + __syncthreads(); + + smem_write = &smem_[warp_m * LN_NUM_COLS + tid_r * VecSize]; +#pragma unroll + for (int it = 0; it < LDGS; it++) { +#pragma unroll + for (int jt = 0; jt < VecSize; jt++) { + smem_write[jt] = dgamma_sum[it * VecSize + jt]; + } + smem_write += THREADS_PER_ROW * VecSize; + } + __syncthreads(); + U cta_dgamma_sum[NUM_RES]; + memset(cta_dgamma_sum, 0, sizeof(U) * NUM_RES); + for (int it = 0; it < ROWS_PER_CTA; it++) { + for (int jt = 0; jt < NUM_RES; jt++) { + cta_dgamma_sum[jt] += + smem_[it * LN_NUM_COLS + tidx + jt * THREADS_PER_CTA]; + } + } + + // the shape of results:(#blocks, 1024) + U *dgamma_part = + static_cast(dgamma_temp_ptr) + bidx * LN_NUM_COLS + tidx; + for (int jt = 0; jt < NUM_RES; jt++) { + *dgamma_part = cta_dgamma_sum[jt]; + dgamma_part += THREADS_PER_CTA; + } + + U *dbeta_part = static_cast(dbeta_temp_ptr) + bidx * LN_NUM_COLS + tidx; + for (int jt = 0; jt < NUM_RES; jt++) { + *dbeta_part = cta_dbeta_sum[jt]; + dbeta_part += THREADS_PER_CTA; + } +} + +/* This function carry out column reduction whose input is [rows, 1024] and + * output is [1, 1024]. + * #blocks: 32 + * #threads: 512 +*/ +// todo(@limin29): to think if there are better impl strategies +template < + typename U, typename ScaleT = U, int VecSize = 1, int WARPS_M = 16, + int WARPS_N = 1, int BYTES_PER_LDG = 4, int ELTS_PER_ROW = 1024, + int THREADS_PER_WARP = 32, int THREADS_PER_ROW = WARPS_N *THREADS_PER_WARP, + int THREADS_PER_CTA = WARPS_M *THREADS_PER_ROW, int ROWS_PER_CTA = WARPS_M, + int ELTS_PER_ROW_PER_CTA = THREADS_PER_ROW *VecSize, + int LDGS = ELTS_PER_ROW / ELTS_PER_ROW_PER_CTA, + int VEC_COLS = ELTS_PER_ROW / VecSize> +__global__ __launch_bounds__(THREADS_PER_CTA) void ln_bwd_1024_final_kernel( + const int rows, U *__restrict__ dg_part_, U *__restrict__ db_part_, + ScaleT *__restrict__ dg_, ScaleT *__restrict__ db_) { + using Vec = platform::AlignedVector; + static_assert(VEC_COLS == LN_NUM_COLS / VecSize, ""); + + const int tidx = threadIdx.x; + const int bidx = blockIdx.x; + const int lane = tidx % THREADS_PER_WARP; + const int warp = tidx / THREADS_PER_WARP; + const int warp_m = warp / WARPS_N; + const int warp_n = warp % WARPS_N; + const int tid_c = warp_n * THREADS_PER_WARP + lane; + + const int c = bidx * THREADS_PER_ROW + tid_c; + const int r = warp_m; + + __shared__ U smem_space[(WARPS_M - 1) * THREADS_PER_ROW * VecSize]; + + for (int col = c; col < VEC_COLS; col += gridDim.x * THREADS_PER_ROW) { + const U *dg_part_ptr = (dg_part_) + r * LN_NUM_COLS + col * VecSize; + const U *db_part_ptr = (db_part_) + r * LN_NUM_COLS + col * VecSize; + + U dg_sum[VecSize]; + U db_sum[VecSize]; + memset(dg_sum, 0, sizeof(U) * VecSize); + memset(db_sum, 0, sizeof(U) * VecSize); +#pragma unroll + for (int row = r; row < rows; row += ROWS_PER_CTA) { + Vec dg; + Vec db; + platform::Load(dg_part_ptr, &dg); + platform::Load(db_part_ptr, &db); + dg_part_ptr += ROWS_PER_CTA * LN_NUM_COLS; + db_part_ptr += ROWS_PER_CTA * LN_NUM_COLS; + +#pragma unroll + for (int jt = 0; jt < VecSize; jt++) { + dg_sum[jt] += dg[jt]; + db_sum[jt] += db[jt]; + } + } + + // reduction across rows of the thread block + U *smem_write; + smem_write = smem_space + (warp_m - 1) * THREADS_PER_ROW * VecSize + tid_c; + + if (warp_m > 0) { +#pragma unroll + for (int jt = 0; jt < VecSize; jt++) { + *smem_write = dg_sum[jt]; + smem_write += 
THREADS_PER_ROW; + } + } + __syncthreads(); + + U *smem_read; + smem_read = smem_space + tid_c; + if (warp_m == 0) { +#pragma unroll + for (int it = 0; it < WARPS_M - 1; it++) { +#pragma unroll + for (int jt = 0; jt < VecSize; jt++) { + dg_sum[jt] += *smem_read; + smem_read += THREADS_PER_ROW; + } + } + } + + __syncthreads(); + + smem_write = smem_space + (warp_m - 1) * THREADS_PER_ROW * VecSize + tid_c; + + if (warp_m > 0) { +#pragma unroll + for (int jt = 0; jt < VecSize; jt++) { + *smem_write = db_sum[jt]; + smem_write += THREADS_PER_ROW; + } + } + __syncthreads(); + + smem_read = smem_space + tid_c; + if (warp_m == 0) { +#pragma unroll + for (int it = 0; it < WARPS_M - 1; it++) { +#pragma unroll + for (int jt = 0; jt < VecSize; jt++) { + db_sum[jt] += *smem_read; + smem_read += THREADS_PER_ROW; + } + } + + union { + ScaleT raw; + ScaleT elt[VecSize]; + } dg_out, db_out; + +#pragma unroll + for (int jt = 0; jt < VecSize; jt++) { + dg_out.elt[jt] = dg_sum[jt]; + db_out.elt[jt] = db_sum[jt]; + } + ScaleT *dg_ptr = reinterpret_cast(dg_) + col; + ScaleT *db_ptr = reinterpret_cast(db_) + col; + *dg_ptr = dg_out.raw; + *db_ptr = db_out.raw; + } + } +} + +/* This function support two kinds of computations (only for float and fp16 +* type): +* +* Case-1: compute layer_norm_grad for layernorm op by setting mask_ptr and +* d_dropout_src_ptr to nullptr. Here, d_x_ptr returns the grad of layernorm +* input. +* +* Case-2: compute layer_norm_grad + residual_grad + dropout_grad for +* fused_dropout_residual_layernorm op. Here, dx_ptr returns residual_grad. +* +*/ +template +void ln_bwd_1024_kernel_driver( + const platform::CUDADeviceContext &dev_ctx, const int rows, const int cols, + float epsilon, const T *x_ptr, const ScaleT *scale_ptr, const U *mean_ptr, + const U *var_ptr, const T *dout_ptr, T *dx_ptr, ScaleT *dscale_ptr, + ScaleT *dbias_ptr, const MaskType *mask_ptr = nullptr, + T factor = static_cast(0), T *d_dropout_src_ptr = nullptr) { + auto stream = dev_ctx.stream(); + if (cols == 1024) { + // step-1: compute dx and reduced part results of dscale and dbias. + const int WARPS_M = 4; + const int WARPS_N = 1; + const int BYTES_PER_LDG = 16; + const int VecSize = BYTES_PER_LDG / sizeof(T); + + const int THREADS_PER_WARP = 32; + const int THREADS_PER_ROW = WARPS_N * THREADS_PER_WARP; + const int THREADS_PER_CTA = WARPS_M * THREADS_PER_ROW; + const int ROWS_PER_CTA = WARPS_M; + + // 4 * 1024 * 4 + const int SMEM_BYTES = ROWS_PER_CTA * cols * sizeof(U); + + // #blocks = 2 * #SM + const int gridx = 2 * dev_ctx.GetSMCount(); + + // get temp space for dscale and dbias. 
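// Illustrative aside, not part of this patch: the dscale/dbias reduction below is
// two-pass -- the backward kernel writes one partial row of column sums per block
// into a temporary [gridx, cols] buffer, and a second kernel collapses those partial
// rows into the final [1, cols] result. A minimal standalone sketch of that pattern,
// with hypothetical names and plain float data.
#include <cstdio>
#include <cuda_runtime.h>

// Pass 1: each x-block accumulates its strided share of the rows into one partial row.
__global__ void PartialColSum(const float* x, float* part, int rows, int cols) {
  int c = blockIdx.y * blockDim.x + threadIdx.x;
  if (c >= cols) return;
  float acc = 0.f;
  for (int r = blockIdx.x; r < rows; r += gridDim.x) acc += x[r * cols + c];
  part[blockIdx.x * cols + c] = acc;
}

// Pass 2: reduce the per-block partial rows into the final row.
__global__ void FinalColSum(const float* part, float* out, int num_parts, int cols) {
  int c = blockIdx.x * blockDim.x + threadIdx.x;
  if (c >= cols) return;
  float acc = 0.f;
  for (int p = 0; p < num_parts; ++p) acc += part[p * cols + c];
  out[c] = acc;
}

int main() {
  const int rows = 4096, cols = 1024, num_blocks = 32, threads = 256;
  float *x, *part, *out;
  cudaMallocManaged(&x, rows * cols * sizeof(float));
  cudaMallocManaged(&part, num_blocks * cols * sizeof(float));
  cudaMallocManaged(&out, cols * sizeof(float));
  for (int i = 0; i < rows * cols; ++i) x[i] = 1.f;
  dim3 grid1(num_blocks, (cols + threads - 1) / threads);
  PartialColSum<<<grid1, threads>>>(x, part, rows, cols);
  FinalColSum<<<(cols + threads - 1) / threads, threads>>>(part, out, num_blocks, cols);
  cudaDeviceSynchronize();
  std::printf("out[0] = %f (expect %d)\n", out[0], rows);
  cudaFree(x);
  cudaFree(part);
  cudaFree(out);
  return 0;
}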
+ framework::Tensor dscale_temp; + dscale_temp.Resize({gridx, cols}); + dscale_temp.mutable_data(dev_ctx.GetPlace()); + U *dscale_temp_ptr = dscale_temp.data(); + + framework::Tensor dbias_temp; + dbias_temp.Resize({gridx, cols}); + dbias_temp.mutable_data(dev_ctx.GetPlace()); + U *dbias_temp_ptr = dbias_temp.data(); + + if (mask_ptr != nullptr) { + if (d_dropout_src_ptr == nullptr) { + PADDLE_THROW(platform::errors::InvalidArgument( + "To compute fused_dropout_residual_ln grad, d_dropout_src_ptr " + "can't be null")); + } + fused_ln_bwd_1024_kernel< + true, T, U, ScaleT, MaskType, VecSize, WARPS_M, WARPS_N, + BYTES_PER_LDG><<>>( + rows, epsilon, x_ptr, scale_ptr, mean_ptr, var_ptr, dout_ptr, + dscale_temp_ptr, dbias_temp_ptr, dx_ptr, mask_ptr, factor, + d_dropout_src_ptr); + + } else { + fused_ln_bwd_1024_kernel< + false, T, U, ScaleT, MaskType, VecSize, WARPS_M, WARPS_N, + BYTES_PER_LDG><<>>( + rows, epsilon, x_ptr, scale_ptr, mean_ptr, var_ptr, dout_ptr, + dscale_temp_ptr, dbias_temp_ptr, dx_ptr); + } + const int WARPS_M_2 = 16; + const int WARPS_N_2 = 1; + const int BYTES_PER_LDG_2 = 4; + const int VecSize_2 = + std::max(1, static_cast(BYTES_PER_LDG_2 / sizeof(U))); // 1 + + const int THREADS_PER_WARP_2 = 32; + const int THREADS_PER_ROW_2 = WARPS_N_2 * THREADS_PER_WARP_2; // 32 + const int THREADS_PER_CTA_2 = + WARPS_M_2 * THREADS_PER_ROW_2; // 16 * 32 = 512 + const int ROWS_PER_CTA_2 = WARPS_M_2; // 16 + + const int gridx_2 = static_cast( + std::ceil(1024 / static_cast(THREADS_PER_ROW_2 * VecSize_2))); + // #blocks: 32,#threads_per_block: 512 + // Note: it is not supported for double type. + if (sizeof(U) > 4) { + PADDLE_THROW(platform::errors::InvalidArgument( + "Only support float and fp16 type")); + } else { + ln_bwd_1024_final_kernel< + U, ScaleT, VecSize_2, WARPS_M_2, WARPS_N_2, + BYTES_PER_LDG_2><<>>( + gridx, dscale_temp_ptr, dbias_temp_ptr, dscale_ptr, dbias_ptr); + } + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Fast layer_norm kernel is only used when feature_size is 1024")); + } +} +#endif + template __global__ void LayerNormBackwardPartGradGammaBeta( const T *__restrict__ dout, const T *__restrict__ input, const int64_t n1, @@ -983,42 +1448,62 @@ static void LayerNormBackward( break; case 7: // d_x != nullptr, d_scale != nullptr, d_bias != nullptr { - constexpr int VPT = 4; - constexpr int BDIMX2 = 32; - constexpr int BDIMY2 = 4; - dim3 threads2(BDIMX2, BDIMY2, 1); - constexpr int part_size = BDIMY2 * VPT; - const dim3 blocks2((feature_size + BDIMX2 - 1) / BDIMX2, part_size, 1); - - auto part_grad_gamma_ptr = - memory::Alloc(dev_ctx, part_size * feature_size * sizeof(U)); - auto part_grad_beta_ptr = - memory::Alloc(dev_ctx, part_size * feature_size * sizeof(U)); - U *part_grad_gamma = reinterpret_cast(part_grad_gamma_ptr->ptr()); - U *part_grad_beta = reinterpret_cast(part_grad_beta_ptr->ptr()); - - LayerNormBackwardPartGradGammaBeta<<>>( - d_y, x, batch_size, feature_size, mean, var, epsilon, part_grad_gamma, - part_grad_beta); // compute part_grad_gamma, beta - - constexpr int BDIMX3 = 32; - constexpr int BDIMY3 = 8; - dim3 threads3(BDIMX3, BDIMY3, 1); - const dim3 blocks3((feature_size + BDIMX2 - 1) / BDIMX2, 1, 1); - LayerNormBackwardSumGradGammaBeta< - T, U, BDIMX3, BDIMY3, - ScaleBiasWithSameTypeX><<>>( - part_grad_gamma, part_grad_beta, part_size, batch_size, feature_size, - d_scale, d_bias); - - constexpr int BDIMX1 = 32; - constexpr int BDIMY1 = 4; - dim3 threads1(BDIMX1, BDIMY1, 1); - LayerNormBackwardComputeGradInput< - T, U, BDIMX1, BDIMY1, - 
ScaleBiasWithSameTypeX><<>>( - d_y, x, batch_size, feature_size, mean, var, epsilon, scale, d_x); +#ifdef PADDLE_WITH_CUDA + bool can_call_1024_kernel = false; + // todo: rule out double type. + if (feature_size == 1024 && sizeof(T) <= 4) { + can_call_1024_kernel = true; + } + VLOG(6) << "can_call_1024_kernel = " << can_call_1024_kernel; + + if (can_call_1024_kernel) { + ln_bwd_1024_kernel_driver< + T, U, LayerNormScaleBiasT>( + dev_ctx, batch_size, feature_size, epsilon, x, scale, mean, var, + d_y, d_x, d_scale, d_bias); + } else { +#endif + constexpr int VPT = 4; + constexpr int BDIMX2 = 32; + constexpr int BDIMY2 = 4; + dim3 threads2(BDIMX2, BDIMY2, 1); + constexpr int part_size = BDIMY2 * VPT; + const dim3 blocks2((feature_size + BDIMX2 - 1) / BDIMX2, part_size, 1); + + auto part_grad_gamma_ptr = + memory::Alloc(dev_ctx, part_size * feature_size * sizeof(U)); + auto part_grad_beta_ptr = + memory::Alloc(dev_ctx, part_size * feature_size * sizeof(U)); + U *part_grad_gamma = reinterpret_cast(part_grad_gamma_ptr->ptr()); + U *part_grad_beta = reinterpret_cast(part_grad_beta_ptr->ptr()); + + LayerNormBackwardPartGradGammaBeta< + T, U, BDIMX2, BDIMY2, VPT><<>>( + d_y, x, batch_size, feature_size, mean, var, epsilon, + part_grad_gamma, + part_grad_beta); // compute part_grad_gamma, beta + + constexpr int BDIMX3 = 32; + constexpr int BDIMY3 = 8; + dim3 threads3(BDIMX3, BDIMY3, 1); + const dim3 blocks3((feature_size + BDIMX2 - 1) / BDIMX2, 1, 1); + LayerNormBackwardSumGradGammaBeta< + T, U, BDIMX3, BDIMY3, + ScaleBiasWithSameTypeX><<>>( + part_grad_gamma, part_grad_beta, part_size, batch_size, + feature_size, d_scale, d_bias); + + constexpr int BDIMX1 = 32; + constexpr int BDIMY1 = 4; + dim3 threads1(BDIMX1, BDIMY1, 1); + LayerNormBackwardComputeGradInput< + T, U, BDIMX1, BDIMY1, + ScaleBiasWithSameTypeX><<>>( + d_y, x, batch_size, feature_size, mean, var, epsilon, scale, d_x); +#ifdef PADDLE_WITH_CUDA + } +#endif + break; } default: diff --git a/paddle/fluid/operators/lite/lite_engine_op_test.cc b/paddle/fluid/operators/lite/lite_engine_op_test.cc index 053ba322d8f..01583cea312 100644 --- a/paddle/fluid/operators/lite/lite_engine_op_test.cc +++ b/paddle/fluid/operators/lite/lite_engine_op_test.cc @@ -72,6 +72,10 @@ TEST(LiteEngineOp, engine_op) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) platform::CUDAPlace place; platform::CUDADeviceContext ctx(place); + ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place, ctx.stream()) + .get()); + ctx.PartialInitWithAllocator(); #else platform::CPUPlace place; platform::CPUDeviceContext ctx(place); diff --git a/paddle/fluid/operators/margin_cross_entropy_op.cu b/paddle/fluid/operators/margin_cross_entropy_op.cu index e4fb4150f84..a8a3390c002 100644 --- a/paddle/fluid/operators/margin_cross_entropy_op.cu +++ b/paddle/fluid/operators/margin_cross_entropy_op.cu @@ -299,7 +299,7 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel { ctx.AllocateTmpTensor({N, 1}, dev_ctx); T* logits_max_buff = logits_max.mutable_data(place); TensorReduceFunctorImpl>( - softmax_2d, &logits_max, kps::IdentityFunctor(), {1}, + dev_ctx, softmax_2d, &logits_max, kps::IdentityFunctor(), {1}, dev_ctx.stream()); #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) @@ -321,7 +321,7 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel { ctx.AllocateTmpTensor({N, 1}, dev_ctx); T* sum_exp_logits_buff = sum_exp_logits.mutable_data(place); TensorReduceFunctorImpl>( - softmax_2d, &sum_exp_logits, 
kps::ExpFunctor(), {1}, + dev_ctx, softmax_2d, &sum_exp_logits, kps::ExpFunctor(), {1}, dev_ctx.stream()); #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) diff --git a/paddle/fluid/operators/math/beam_search_test.cc b/paddle/fluid/operators/math/beam_search_test.cc index ec2e9516fcd..e6f6a09a43b 100644 --- a/paddle/fluid/operators/math/beam_search_test.cc +++ b/paddle/fluid/operators/math/beam_search_test.cc @@ -15,6 +15,8 @@ limitations under the License. */ #include "paddle/fluid/operators/math/beam_search.h" #include +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/place.h" void PrepareCPUTensors(paddle::framework::LoDTensor* ids, paddle::framework::LoDTensor* scores, @@ -129,6 +131,83 @@ void TestBeamSearch() { delete context; } +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +template <> +void TestBeamSearch() { + paddle::framework::LoDTensor ids; + paddle::framework::LoDTensor scores; + paddle::framework::LoDTensor pre_ids; + paddle::framework::LoDTensor pre_scores; + + auto* place = new paddle::platform::CUDAPlace(); + auto* context = new paddle::platform::CUDADeviceContext(*place); + context->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(*place, context->stream()) + .get()); + context->PartialInitWithAllocator(); + if (paddle::platform::is_cpu_place(*place)) { + PrepareCPUTensors(&ids, &scores, &pre_ids, &pre_scores); + } else { + paddle::framework::LoDTensor cpu_ids; + paddle::framework::LoDTensor cpu_scores; + paddle::framework::LoDTensor cpu_pre_ids; + paddle::framework::LoDTensor cpu_pre_scores; + + PrepareCPUTensors(&cpu_ids, &cpu_scores, &cpu_pre_ids, &cpu_pre_scores); + + paddle::framework::TensorCopySync(cpu_ids, *place, &ids); + paddle::framework::TensorCopySync(cpu_scores, *place, &scores); + paddle::framework::TensorCopySync(cpu_pre_ids, *place, &pre_ids); + paddle::framework::TensorCopySync(cpu_pre_scores, *place, &pre_scores); + + ids.set_lod(cpu_ids.lod()); + scores.set_lod(cpu_scores.lod()); + pre_ids.set_lod(cpu_pre_ids.lod()); + pre_scores.set_lod(cpu_pre_scores.lod()); + } + + paddle::framework::LoDTensor selected_ids; + paddle::framework::LoDTensor selected_scores; + paddle::framework::LoDTensor parent_idx; + + size_t level = 0; + size_t beam_size = 2; + int end_id = 0; + paddle::operators::math::BeamSearchFunctor< + paddle::platform::CUDADeviceContext, float> + beamsearch; + beamsearch(*context, &pre_ids, &pre_scores, &ids, &scores, &selected_ids, + &selected_scores, &parent_idx, level, beam_size, end_id, true); + + ASSERT_EQ(selected_ids.lod(), selected_scores.lod()); + + paddle::framework::LoDTensor cpu_selected_ids; + paddle::framework::LoDTensor cpu_selected_scores; + if (paddle::platform::is_cpu_place(*place)) { + cpu_selected_ids = selected_ids; + cpu_selected_scores = selected_scores; + } else { + paddle::framework::TensorCopySync( + selected_ids, paddle::platform::CPUPlace(), &cpu_selected_ids); + paddle::framework::TensorCopySync( + selected_scores, paddle::platform::CPUPlace(), &cpu_selected_scores); + cpu_selected_ids.set_lod(selected_ids.lod()); + cpu_selected_scores.set_lod(selected_scores.lod()); + } + + std::vector expected_ids({4, 5, 3, 8}); + std::vector expected_scores({0.6f, 0.5f, 0.9f, 0.7f}); + for (int i = 0; i < 4; i++) { + ASSERT_EQ(expected_ids[i], cpu_selected_ids.data()[i]); + ASSERT_EQ(expected_scores[i], cpu_selected_scores.data()[i]); + } + + delete place; + delete context; +} +#endif + TEST(BeamSearch, CPU) { TestBeamSearch(); diff --git 
a/paddle/fluid/operators/math/blas_impl.cu.h b/paddle/fluid/operators/math/blas_impl.cu.h index 92162e639ff..7ffd2a7ab2d 100644 --- a/paddle/fluid/operators/math/blas_impl.cu.h +++ b/paddle/fluid/operators/math/blas_impl.cu.h @@ -18,6 +18,7 @@ #include "paddle/fluid/platform/dynload/cublas.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/pten/backends/gpu/gpu_context.h" DECLARE_bool(enable_cublas_tensor_op_math); @@ -92,6 +93,32 @@ struct CUBlas { #endif } + // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply. + // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode + template + static void GEMM_EX(pten::GPUContext *dev_ctx, cublasOperation_t transa, + cublasOperation_t transb, int m, int n, int k, + const float *alpha, const void *A, cudaDataType_t Atype, + int lda, const void *B, cudaDataType_t Btype, int ldb, + const float *beta, void *C, cudaDataType_t Ctype, + int ldc) { +// Because the gcc 4.8 doesn't expand template parameter pack that +// appears in a lambda-expression, I can not use template parameter pack +// here. +#if CUDA_VERSION >= 8000 + VLOG(5) << "use_tensor_op_math: " + << (dev_ctx->tensor_core_available() ? "True" : "False"); + dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasSgemmEx( + handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, + beta, C, Ctype, ldc)); + }); +#else + PADDLE_THROW(platform::errors::Unimplemented( + "cublasSgemmEx is not supported on cuda <= 7.5")); +#endif + } + template static void TRSM(ARGS... args) { PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasStrsm(args...)); @@ -273,6 +300,37 @@ struct CUBlas { "cublasGemmEx is not supported on cuda <= 7.5")); #endif } + + // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply. + // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode + template + static void GEMM_EX(pten::GPUContext *dev_ctx, cublasOperation_t transa, + cublasOperation_t transb, int m, int n, int k, + const void *alpha, const void *A, cudaDataType_t Atype, + int lda, const void *B, cudaDataType_t Btype, int ldb, + const void *beta, void *C, cudaDataType_t Ctype, int ldc, + cudaDataType_t computeType) { +#if CUDA_VERSION >= 8000 + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; +#if CUDA_VERSION >= 9000 + bool use_tensor_op_math = dev_ctx->tensor_core_available(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " + << (use_tensor_op_math ? "True" : "False"); +#endif // CUDA_VERSION >= 9000 + + dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasGemmEx( + handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, + beta, C, Ctype, ldc, computeType, algo)); + }); +#else + PADDLE_THROW(platform::errors::Unimplemented( + "cublasGemmEx is not supported on cuda <= 7.5")); +#endif + } }; template <> @@ -388,6 +446,37 @@ struct CUBlas> { #endif } + // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply. 
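// Illustrative aside, not part of this patch: the wrappers in this file hand their
// row-major operands to column-major cuBLAS by computing C^T = B^T * A^T, i.e.
// swapping A/B and exchanging M/N, which is why every call below passes
// (cuTransB, cuTransA, N, M, K, ..., B, ..., A, ..., C, N). A minimal standalone
// sketch of that trick with plain cublasSgemm, which is version-agnostic; the
// GEMM_EX paths added below use the same operand order plus dtype/algorithm
// arguments.
#include <cstdio>
#include <cublas_v2.h>
#include <cuda_runtime.h>

int main() {
  const int M = 2, K = 3, N = 2;
  // Row-major A (MxK) and B (KxN).
  float h_A[M * K] = {1, 2, 3, 4, 5, 6};
  float h_B[K * N] = {1, 0, 0, 1, 1, 1};
  float h_C[M * N] = {0};

  float *d_A, *d_B, *d_C;
  cudaMalloc(&d_A, sizeof(h_A));
  cudaMalloc(&d_B, sizeof(h_B));
  cudaMalloc(&d_C, sizeof(h_C));
  cudaMemcpy(d_A, h_A, sizeof(h_A), cudaMemcpyHostToDevice);
  cudaMemcpy(d_B, h_B, sizeof(h_B), cudaMemcpyHostToDevice);

  cublasHandle_t handle;
  cublasCreate(&handle);
  const float alpha = 1.f, beta = 0.f;
  // Row-major C = A * B  <=>  column-major C^T = B^T * A^T:
  // B comes first and the leading dimensions are N, K, N.
  cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, M, K, &alpha, d_B, N, d_A, K,
              &beta, d_C, N);

  cudaMemcpy(h_C, d_C, sizeof(h_C), cudaMemcpyDeviceToHost);
  std::printf("C = [%g %g; %g %g]\n", h_C[0], h_C[1], h_C[2], h_C[3]);  // [4 5; 10 11]

  cublasDestroy(handle);
  cudaFree(d_A);
  cudaFree(d_B);
  cudaFree(d_C);
  return 0;
}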
+ // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode + template + static void GEMM_EX(pten::GPUContext *dev_ctx, cublasOperation_t transa, + cublasOperation_t transb, int m, int n, int k, + const void *alpha, const void *A, cudaDataType_t Atype, + int lda, const void *B, cudaDataType_t Btype, int ldb, + const void *beta, void *C, cudaDataType_t Ctype, int ldc, + cudaDataType_t computeType) { +#if CUDA_VERSION >= 8000 + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; +#if CUDA_VERSION >= 9000 + bool use_tensor_op_math = dev_ctx->tensor_core_available(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " + << (use_tensor_op_math ? "True" : "False"); +#endif // CUDA_VERSION >= 9000 + + dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasGemmEx( + handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, + beta, C, Ctype, ldc, computeType, algo)); + }); +#else + PADDLE_THROW(platform::errors::Unimplemented( + "cublasGemmEx is not supported on cuda <= 7.5")); +#endif + } + static void TRSM_BATCH(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t transa, cublasDiagType_t diag, int m, int n, @@ -529,6 +618,37 @@ struct CUBlas> { "cublasGemmEx is not supported on cuda <= 7.5")); #endif } + + // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply. + // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode + template + static void GEMM_EX(pten::GPUContext *dev_ctx, cublasOperation_t transa, + cublasOperation_t transb, int m, int n, int k, + const void *alpha, const void *A, cudaDataType_t Atype, + int lda, const void *B, cudaDataType_t Btype, int ldb, + const void *beta, void *C, cudaDataType_t Ctype, int ldc, + cudaDataType_t computeType) { +#if CUDA_VERSION >= 8000 + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; +#if CUDA_VERSION >= 9000 + bool use_tensor_op_math = dev_ctx->tensor_core_available(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " + << (use_tensor_op_math ? "True" : "False"); +#endif // CUDA_VERSION >= 9000 + + dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasGemmEx( + handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, + beta, C, Ctype, ldc, computeType, algo)); + }); +#else + PADDLE_THROW(platform::errors::Unimplemented( + "cublasGemmEx is not supported on cuda <= 7.5")); +#endif + } }; template <> @@ -564,6 +684,39 @@ void Blas::GEMM(CBLAS_TRANSPOSE transA, #endif // CUDA_VERSION >= 8000 } +template <> +template +void Blas::GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, int M, int N, int K, + T alpha, const T *A, const T *B, T beta, + T *C) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? 
CUBLAS_OP_N : CUBLAS_OP_T; + +#if CUDA_VERSION >= 8000 + if (FLAGS_enable_cublas_tensor_op_math && std::is_same::value) { + auto &cuda_ctx = const_cast(context_); + CUBlas::GEMM_EX(&cuda_ctx, cuTransB, cuTransA, N, M, K, &alpha, B, + CUDA_R_32F, ldb, A, CUDA_R_32F, lda, &beta, C, + CUDA_R_32F, N); + } else { +#endif // CUDA_VERSION >= 8000 + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, + lda, &beta, C, N); + }); + +#if CUDA_VERSION >= 8000 + } +#endif // CUDA_VERSION >= 8000 +} + template <> template <> inline void Blas::GEMM( @@ -611,6 +764,55 @@ inline void Blas::GEMM( #endif // CUDA_VERSION >= 8000 } +template <> +template <> +inline void Blas::GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, int M, int N, + int K, platform::float16 alpha, + const platform::float16 *A, + const platform::float16 *B, + platform::float16 beta, + platform::float16 *C) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + + // TODO(kexinzhao): add processing code for compute capability < 53 case + PADDLE_ENFORCE_GE( + context_.GetComputeCapability(), 53, + platform::errors::InvalidArgument( + "cublas fp16 gemm requires GPU compute capability >= 53," + "but received %d", + context_.GetComputeCapability())); + + float h_alpha = static_cast(alpha); + float h_beta = static_cast(beta); + +#if CUDA_VERSION >= 8000 + // cublasHgemm does true FP16 computation which is slow for non-Volta + // GPUs. So use cublasGemmEx instead which does pesudo FP16 computation: + // input/output in fp16, computation in fp32, which can also be accelerated + // using tensor cores in volta GPUs. + auto &cuda_ctx = const_cast(context_); + CUBlas::GEMM_EX( + &cuda_ctx, cuTransB, cuTransA, N, M, K, &h_alpha, B, CUDA_R_16F, ldb, A, + CUDA_R_16F, lda, &h_beta, C, CUDA_R_16F, N, CUDA_R_32F); +#else + // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm + + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM(handle, cuTransB, cuTransA, N, M, K, + &h_alpha, h_B, ldb, h_A, lda, &h_beta, h_C, + N); + }); +#endif // CUDA_VERSION >= 8000 +} + template <> template <> inline void Blas::GEMM( @@ -659,6 +861,56 @@ inline void Blas::GEMM( #endif // CUDA_VERSION >= 8000 } +template <> +template <> +inline void Blas::GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, int M, int N, + int K, platform::complex alpha, + const platform::complex *A, + const platform::complex *B, + platform::complex beta, + platform::complex *C) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? 
CUBLAS_OP_N : CUBLAS_OP_T; + + // TODO(kexinzhao): add processing code for compute capability < 53 case + PADDLE_ENFORCE_GE( + context_.GetComputeCapability(), 53, + platform::errors::InvalidArgument( + "cublas complex64 gemm requires GPU compute capability >= 53," + "but received %d", + context_.GetComputeCapability())); + + thrust::complex c_alpha = + thrust::complex(alpha.real, alpha.imag); + thrust::complex c_beta = thrust::complex(beta.real, beta.imag); + +#if CUDA_VERSION >= 8000 + // cublasHgemm does true FP16 computation which is slow for non-Volta + // GPUs. So use cublasGemmEx instead which does pesudo FP16 computation: + // input/output in fp16, computation in fp32, which can also be accelerated + // using tensor cores in volta GPUs. + auto &cuda_ctx = const_cast(context_); + CUBlas>::GEMM_EX( + &cuda_ctx, cuTransB, cuTransA, N, M, K, &c_alpha, B, CUDA_C_32F, ldb, A, + CUDA_C_32F, lda, &c_beta, C, CUDA_C_32F, N, CUDA_C_32F); +#else + // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm + + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas>::GEMM(handle, cuTransB, cuTransA, N, M, K, + &c_alpha, h_B, ldb, h_A, lda, + &c_beta, h_C, N); + }); +#endif // CUDA_VERSION >= 8000 +} + template <> template <> inline void Blas::GEMM( @@ -708,6 +960,57 @@ inline void Blas::GEMM( #endif // CUDA_VERSION >= 8000 } +template <> +template <> +inline void Blas::GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, int M, int N, + int K, platform::complex alpha, + const platform::complex *A, + const platform::complex *B, + platform::complex beta, + platform::complex *C) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + + // TODO(kexinzhao): add processing code for compute capability < 53 case + PADDLE_ENFORCE_GE( + context_.GetComputeCapability(), 53, + platform::errors::InvalidArgument( + "cublas complex128 gemm requires GPU compute capability >= 53," + "but received %d", + context_.GetComputeCapability())); + + thrust::complex c_alpha = + thrust::complex(alpha.real, alpha.imag); + thrust::complex c_beta = + thrust::complex(beta.real, beta.imag); + +#if CUDA_VERSION >= 8000 + // cublasHgemm does true FP16 computation which is slow for non-Volta + // GPUs. So use cublasGemmEx instead which does pesudo FP16 computation: + // input/output in fp16, computation in fp32, which can also be accelerated + // using tensor cores in volta GPUs. 
+ auto &cuda_ctx = const_cast(context_); + CUBlas>::GEMM_EX( + &cuda_ctx, cuTransB, cuTransA, N, M, K, &c_alpha, B, CUDA_C_64F, ldb, A, + CUDA_C_64F, lda, &c_beta, C, CUDA_C_64F, N, CUDA_C_64F); +#else + // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm + + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas>::GEMM(handle, cuTransB, cuTransA, N, M, K, + &c_alpha, h_B, ldb, h_A, lda, + &c_beta, h_C, N); + }); +#endif // CUDA_VERSION >= 8000 +} + template <> template void Blas::GEMM(bool transA, bool transB, int M, @@ -738,6 +1041,35 @@ void Blas::GEMM(bool transA, bool transB, int M, #endif // CUDA_VERSION >= 8000 } +template <> +template +void Blas::GEMM(bool transA, bool transB, int M, int N, int K, + T alpha, const T *A, int lda, const T *B, + int ldb, T beta, T *C, int ldc) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N; + cublasOperation_t cuTransB = transB ? CUBLAS_OP_T : CUBLAS_OP_N; + +#if CUDA_VERSION >= 8000 + if (FLAGS_enable_cublas_tensor_op_math && std::is_same::value) { + auto &cuda_ctx = const_cast(context_); + CUBlas::GEMM_EX(&cuda_ctx, cuTransB, cuTransA, N, M, K, &alpha, B, + CUDA_R_32F, ldb, A, CUDA_R_32F, lda, &beta, C, + CUDA_R_32F, ldc); + } else { +#endif // CUDA_VERSION >= 8000 + + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, + lda, &beta, C, ldc); + }); + +#if CUDA_VERSION >= 8000 + } +#endif // CUDA_VERSION >= 8000 +} + template <> template <> inline void Blas::GEMM( @@ -755,6 +1087,25 @@ inline void Blas::GEMM( }); } +template <> +template <> +inline void Blas::GEMM(bool transA, bool transB, int M, int N, + int K, platform::float16 alpha, + const platform::float16 *A, int lda, + const platform::float16 *B, int ldb, + platform::float16 beta, + platform::float16 *C, int ldc) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N; + cublasOperation_t cuTransB = transB ? 
CUBLAS_OP_T : CUBLAS_OP_N; + + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha, + B, ldb, A, lda, &beta, C, ldc); + }); +} + template <> template void Blas::AXPY(int n, T alpha, const T *x, @@ -764,6 +1115,14 @@ void Blas::AXPY(int n, T alpha, const T *x, }); } +template <> +template +void Blas::AXPY(int n, T alpha, const T *x, T *y) const { + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::AXPY(handle, n, &alpha, x, 1, y, 1); + }); +} + template <> template void Blas::SCAL(int n, const T alpha, T *x) const { @@ -771,6 +1130,13 @@ void Blas::SCAL(int n, const T alpha, T *x) const { [&](cublasHandle_t handle) { CUBlas::SCAL(handle, n, &alpha, x, 1); }); } +template <> +template +void Blas::SCAL(int n, const T alpha, T *x) const { + context_.CublasCall( + [&](cublasHandle_t handle) { CUBlas::SCAL(handle, n, &alpha, x, 1); }); +} + template <> template void Blas::VCOPY(int n, const T *x, T *y) const { @@ -778,6 +1144,13 @@ void Blas::VCOPY(int n, const T *x, T *y) const { [&](cublasHandle_t handle) { CUBlas::VCOPY(handle, n, x, 1, y, 1); }); } +template <> +template +void Blas::VCOPY(int n, const T *x, T *y) const { + context_.CublasCall( + [&](cublasHandle_t handle) { CUBlas::VCOPY(handle, n, x, 1, y, 1); }); +} + template <> template void Blas::GEMV(bool trans_a, int M, int N, @@ -790,6 +1163,17 @@ void Blas::GEMV(bool trans_a, int M, int N, }); } +template <> +template +void Blas::GEMV(bool trans_a, int M, int N, T alpha, + const T *A, const T *B, T beta, T *C) const { + cublasOperation_t cuTransA = !trans_a ? CUBLAS_OP_T : CUBLAS_OP_N; + + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMV(handle, cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1); + }); +} + template <> template <> inline void Blas::GEMV( @@ -806,6 +1190,24 @@ inline void Blas::GEMV( } } +template <> +template <> +inline void Blas::GEMV(bool trans_a, int M, int N, + platform::float16 alpha, + const platform::float16 *A, + const platform::float16 *B, + platform::float16 beta, + platform::float16 *C) const { + // Because cublas doesn't support half gemv, we use cublasHgemm to achieve it. + if (trans_a) { + this->template GEMM(CblasNoTrans, CblasNoTrans, 1, N, M, + alpha, B, A, beta, C); + } else { + this->template GEMM(CblasNoTrans, CblasNoTrans, M, 1, N, + alpha, A, B, beta, C); + } +} + template <> template void Blas::BatchedGEMM( @@ -854,6 +1256,56 @@ void Blas::BatchedGEMM( #endif // CUDA_VERSION >= 9010 } +template <> +template +void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, int M, int N, + int K, T alpha, const T *A, const T *B, + T beta, T *C, int batchCount, + int64_t strideA, + int64_t strideB) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + const int64_t strideC = M * N; + +#if CUDA_VERSION >= 9010 + if ((FLAGS_enable_cublas_tensor_op_math && (std::is_same::value)) || + std::is_same::value) { + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; + bool use_tensor_op_math = context_.tensor_core_available(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " + << (use_tensor_op_math ? 
"True" : "False"); + + auto fp = std::is_same::value ? CUDA_R_32F : CUDA_R_16F; + context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasGemmStridedBatchedEx( + handle, cuTransB, cuTransA, N, M, K, &alpha, B, fp, ldb, strideB, A, + fp, lda, strideA, &beta, C, fp, ldc, strideC, batchCount, fp, algo)); + }); + } else { +#endif // CUDA_VERSION >= 9010 + + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM_STRIDED_BATCH(handle, cuTransB, cuTransA, N, M, K, &alpha, + B, ldb, strideB, A, lda, strideA, &beta, C, + ldc, strideC, batchCount); + }); + +#if CUDA_VERSION >= 9010 + } +#endif // CUDA_VERSION >= 9010 +} + template <> template void Blas::BatchedGEMM( @@ -865,6 +1317,19 @@ void Blas::BatchedGEMM( } } +template <> +template +void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, int M, int N, + int K, T alpha, const T **A, + const T **B, T beta, T **C, + int batchCount) const { + for (int k = 0; k < batchCount; ++k) { + this->template GEMM(transA, transB, M, N, K, alpha, A[k], B[k], beta, + C[k]); + } +} + template <> template <> inline void Blas::BatchedGEMM( @@ -878,6 +1343,19 @@ inline void Blas::BatchedGEMM( } } +template <> +template <> +inline void Blas::BatchedGEMM( + CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, + platform::float16 alpha, const platform::float16 **A, + const platform::float16 **B, platform::float16 beta, platform::float16 **C, + int batchCount) const { + for (int k = 0; k < batchCount; ++k) { + this->template GEMM(transA, transB, M, N, K, alpha, A[k], + B[k], beta, C[k]); + } +} + template <> template void Blas::TRSM(CBLAS_SIDE side, CBLAS_UPLO uplo, @@ -903,6 +1381,30 @@ void Blas::TRSM(CBLAS_SIDE side, CBLAS_UPLO uplo, }); } +template <> +template +void Blas::TRSM(CBLAS_SIDE side, CBLAS_UPLO uplo, + CBLAS_TRANSPOSE transA, CBLAS_DIAG diag, + int M, int N, T alpha, const T *A, int lda, + T *B, int ldb) const { + // solve row major `op ( A ) X = α B` by taking it as `X' op ( A' ) = α B'` + // where ' stands for transpose + cublasSideMode_t cuSide = + (side == CblasLeft) ? CUBLAS_SIDE_RIGHT : CUBLAS_SIDE_LEFT; + cublasFillMode_t cuUplo = + (uplo == CblasLower) ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER; + // use CUBLAS_OP_C (conjugate transpose) for complex + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasDiagType_t cuDiag = + (diag == CblasUnit) ? CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT; + + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::TRSM(handle, cuSide, cuUplo, cuTransA, cuDiag, N, M, &alpha, A, + lda, B, ldb); + }); +} + template <> template void Blas::BatchedGETRF(int n, T **a, int *ipiv, @@ -913,6 +1415,15 @@ void Blas::BatchedGETRF(int n, T **a, int *ipiv, }); } +template <> +template +void Blas::BatchedGETRF(int n, T **a, int *ipiv, int *info, + int batch_size) const { + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GETRF_BATCH(handle, n, a, n, ipiv, info, batch_size); + }); +} + template <> template void Blas::BatchedGETRI(int n, const T **a, @@ -931,6 +1442,23 @@ void Blas::BatchedGETRI(int n, const T **a, }); } +template <> +template +void Blas::BatchedGETRI(int n, const T **a, const int *ipiv, + T **a_inv, int *info, + int batch_size) const { + PADDLE_ENFORCE_NE( + a_inv, a, + platform::errors::InvalidArgument( + "cuBLAS fuction 'cublasgetrfBatched' cannot be executed " + "in-place. 
The memory space of output matrix (address: %p) cannot " + "overlap memory space of input matrix (address: %p).", + a_inv, a)); + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GETRI_BATCH(handle, n, a, n, ipiv, a_inv, n, info, batch_size); + }); +} + template <> template void Blas::BatchedMatInv(int n, const T **a, @@ -941,6 +1469,15 @@ void Blas::BatchedMatInv(int n, const T **a, }); } +template <> +template +void Blas::BatchedMatInv(int n, const T **a, T **a_inv, + int *info, int batch_size) const { + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::MATINV_BATCH(handle, n, a, n, a_inv, n, info, batch_size); + }); +} + template <> template void Blas::BatchedGETRS( @@ -955,6 +1492,21 @@ void Blas::BatchedGETRS( }); } +template <> +template +void Blas::BatchedGETRS(CBLAS_TRANSPOSE trans, int n, + int nrhs, const T **a, int lda, + int *ipiv, T **b, int ldb, int *info, + int batch_size) const { + // use CUBLAS_OP_C (conjugate transpose) for complex + cublasOperation_t cuTrans = + (trans == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GETRS_BATCH(handle, cuTrans, n, nrhs, a, lda, ipiv, b, ldb, info, + batch_size); + }); +} + template <> template void Blas::BatchedTRSM( @@ -979,6 +1531,31 @@ void Blas::BatchedTRSM( }); } +template <> +template +void Blas::BatchedTRSM(CBLAS_SIDE side, CBLAS_UPLO uplo, + CBLAS_TRANSPOSE transA, + CBLAS_DIAG diag, int M, int N, T alpha, + const T **A, int lda, T **B, int ldb, + int batch_size) const { + // solve row major `op ( A ) X = α B` by taking it as `X' op ( A' ) = α B'` + // where ' stands for transpose + cublasSideMode_t cuSide = + (side == CblasLeft) ? CUBLAS_SIDE_RIGHT : CUBLAS_SIDE_LEFT; + cublasFillMode_t cuUplo = + (uplo == CblasLower) ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER; + // use CUBLAS_OP_C (conjugate transpose) for complex + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasDiagType_t cuDiag = + (diag == CblasUnit) ? 
CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT; + + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::TRSM_BATCH(handle, cuSide, cuUplo, cuTransA, cuDiag, N, M, + &alpha, A, lda, B, ldb, batch_size); + }); +} + } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/blas_impl.hip.h b/paddle/fluid/operators/math/blas_impl.hip.h index 32479189eea..bf7d66f4853 100644 --- a/paddle/fluid/operators/math/blas_impl.hip.h +++ b/paddle/fluid/operators/math/blas_impl.hip.h @@ -17,6 +17,7 @@ #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/dynload/rocblas.h" +#include "paddle/pten/backends/gpu/gpu_context.h" DECLARE_bool(enable_cublas_tensor_op_math); @@ -221,6 +222,20 @@ struct CUBlas { beta, C, Ctype, ldc, C, Ctype, ldc, computeType, algo, 0, 0)); }); } + template + static void GEMM_EX(pten::GPUContext *dev_ctx, rocblas_operation transa, + rocblas_operation transb, int m, int n, int k, + const void *alpha, const void *A, rocblas_datatype Atype, + int lda, const void *B, rocblas_datatype Btype, int ldb, + const void *beta, void *C, rocblas_datatype Ctype, + int ldc, rocblas_datatype computeType) { + rocblas_gemm_algo algo = rocblas_gemm_algo_standard; + dev_ctx->TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_gemm_ex( + handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, + beta, C, Ctype, ldc, C, Ctype, ldc, computeType, algo, 0, 0)); + }); + } }; template <> @@ -305,6 +320,20 @@ struct CUBlas> { beta, C, Ctype, ldc, C, Ctype, ldc, computeType, algo, 0, 0)); }); } + template + static void GEMM_EX(pten::GPUContext *dev_ctx, rocblas_operation transa, + rocblas_operation transb, int m, int n, int k, + const void *alpha, const void *A, rocblas_datatype Atype, + int lda, const void *B, rocblas_datatype Btype, int ldb, + const void *beta, void *C, rocblas_datatype Ctype, + int ldc, rocblas_datatype computeType) { + rocblas_gemm_algo algo = rocblas_gemm_algo_standard; + dev_ctx->TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_gemm_ex( + handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, + beta, C, Ctype, ldc, C, Ctype, ldc, computeType, algo, 0, 0)); + }); + } }; template <> @@ -389,6 +418,20 @@ struct CUBlas> { beta, C, Ctype, ldc, C, Ctype, ldc, computeType, algo, 0, 0)); }); } + template + static void GEMM_EX(pten::GPUContext *dev_ctx, rocblas_operation transa, + rocblas_operation transb, int m, int n, int k, + const void *alpha, const void *A, rocblas_datatype Atype, + int lda, const void *B, rocblas_datatype Btype, int ldb, + const void *beta, void *C, rocblas_datatype Ctype, + int ldc, rocblas_datatype computeType) { + rocblas_gemm_algo algo = rocblas_gemm_algo_standard; + dev_ctx->TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_gemm_ex( + handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, + beta, C, Ctype, ldc, C, Ctype, ldc, computeType, algo, 0, 0)); + }); + } }; template <> @@ -412,6 +455,27 @@ void Blas::GEMM(CBLAS_TRANSPOSE transA, &beta, C, N); }); } +template <> +template +void Blas::GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, int M, int N, int K, + T alpha, const T *A, const T *B, T beta, + T *C) const { + // Note that cublas follows fortran order, so the order is different from 
+ // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + rocblas_operation cuTransA = (transA == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + rocblas_operation cuTransB = (transB == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + context_.CublasCall([&](rocblas_handle handle) { + CUBlas::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, + &beta, C, N); + }); +} template <> template <> @@ -448,6 +512,43 @@ inline void Blas::GEMM( rocblas_datatype_f16_r, ldb, A, rocblas_datatype_f16_r, lda, &h_beta, C, rocblas_datatype_f16_r, N, rocblas_datatype_f32_r); } +template <> +template <> +inline void Blas::GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, int M, int N, + int K, platform::float16 alpha, + const platform::float16 *A, + const platform::float16 *B, + platform::float16 beta, + platform::float16 *C) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + rocblas_operation cuTransA = (transA == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + rocblas_operation cuTransB = (transB == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + + // TODO(kexinzhao): add processing code for compute capability < 53 case + PADDLE_ENFORCE_GE( + context_.GetComputeCapability(), 53, + platform::errors::InvalidArgument( + "cublas fp16 gemm requires GPU compute capability >= 53," + "but received %d", + context_.GetComputeCapability())); + + float h_alpha = static_cast(alpha); + float h_beta = static_cast(beta); + + auto &cuda_ctx = const_cast(context_); + CUBlas::GEMM_EX( + &cuda_ctx, cuTransB, cuTransA, N, M, K, &h_alpha, B, + rocblas_datatype_f16_r, ldb, A, rocblas_datatype_f16_r, lda, &h_beta, C, + rocblas_datatype_f16_r, N, rocblas_datatype_f32_r); +} template <> template <> @@ -485,6 +586,44 @@ inline void Blas::GEMM( rocblas_datatype_f32_c, ldb, A, rocblas_datatype_f32_c, lda, &c_beta, C, rocblas_datatype_f32_c, N, rocblas_datatype_f32_c); } +template <> +template <> +inline void Blas::GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, int M, int N, + int K, platform::complex alpha, + const platform::complex *A, + const platform::complex *B, + platform::complex beta, + platform::complex *C) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + rocblas_operation cuTransA = (transA == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + rocblas_operation cuTransB = (transB == CblasNoTrans) + ? 
rocblas_operation_none + : rocblas_operation_transpose; + + // TODO(kexinzhao): add processing code for compute capability < 53 case + PADDLE_ENFORCE_GE( + context_.GetComputeCapability(), 53, + platform::errors::InvalidArgument( + "cublas complex64 gemm requires GPU compute capability >= 53," + "but received %d", + context_.GetComputeCapability())); + + thrust::complex c_alpha = + thrust::complex(alpha.real, alpha.imag); + thrust::complex c_beta = thrust::complex(beta.real, beta.imag); + + auto &cuda_ctx = const_cast(context_); + CUBlas>::GEMM_EX( + &cuda_ctx, cuTransB, cuTransA, N, M, K, &c_alpha, B, + rocblas_datatype_f32_c, ldb, A, rocblas_datatype_f32_c, lda, &c_beta, C, + rocblas_datatype_f32_c, N, rocblas_datatype_f32_c); +} template <> template <> @@ -523,6 +662,45 @@ inline void Blas::GEMM( rocblas_datatype_f64_c, ldb, A, rocblas_datatype_f64_c, lda, &c_beta, C, rocblas_datatype_f64_c, N, rocblas_datatype_f64_c); } +template <> +template <> +inline void Blas::GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, int M, int N, + int K, platform::complex alpha, + const platform::complex *A, + const platform::complex *B, + platform::complex beta, + platform::complex *C) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + rocblas_operation cuTransA = (transA == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + rocblas_operation cuTransB = (transB == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + + // TODO(kexinzhao): add processing code for compute capability < 53 case + PADDLE_ENFORCE_GE( + context_.GetComputeCapability(), 53, + platform::errors::InvalidArgument( + "cublas complex128 gemm requires GPU compute capability >= 53," + "but received %d", + context_.GetComputeCapability())); + + thrust::complex c_alpha = + thrust::complex(alpha.real, alpha.imag); + thrust::complex c_beta = + thrust::complex(beta.real, beta.imag); + + auto &cuda_ctx = const_cast(context_); + CUBlas>::GEMM_EX( + &cuda_ctx, cuTransB, cuTransA, N, M, K, &c_alpha, B, + rocblas_datatype_f64_c, ldb, A, rocblas_datatype_f64_c, lda, &c_beta, C, + rocblas_datatype_f64_c, N, rocblas_datatype_f64_c); +} template <> template @@ -541,6 +719,22 @@ void Blas::GEMM(bool transA, bool transB, int M, &beta, C, ldc); }); } +template <> +template +void Blas::GEMM(bool transA, bool transB, int M, int N, int K, + T alpha, const T *A, int lda, const T *B, + int ldb, T beta, T *C, int ldc) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + rocblas_operation cuTransA = + transA ? rocblas_operation_transpose : rocblas_operation_none; + rocblas_operation cuTransB = + transB ? rocblas_operation_transpose : rocblas_operation_none; + context_.CublasCall([&](rocblas_handle handle) { + CUBlas::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, + &beta, C, ldc); + }); +} template <> template <> @@ -560,6 +754,26 @@ inline void Blas::GEMM( B, ldb, A, lda, &beta, C, ldc); }); } +template <> +template <> +inline void Blas::GEMM(bool transA, bool transB, int M, int N, + int K, platform::float16 alpha, + const platform::float16 *A, int lda, + const platform::float16 *B, int ldb, + platform::float16 beta, + platform::float16 *C, int ldc) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. 
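+  // Sketch of the convention used throughout these wrappers: the row-major
+  // product C = op(A) * op(B) is obtained from the column-major library by
+  // computing C^T = op(B)^T * op(A)^T, i.e. the operands are swapped (B before
+  // A) and the dimensions are passed as (N, M, K); the buffer holding the
+  // column-major C^T is byte-for-byte the row-major C we want.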
+ rocblas_operation cuTransA = + transA ? rocblas_operation_transpose : rocblas_operation_none; + rocblas_operation cuTransB = + transB ? rocblas_operation_transpose : rocblas_operation_none; + + context_.CublasCall([&](rocblas_handle handle) { + CUBlas::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha, + B, ldb, A, lda, &beta, C, ldc); + }); +} template <> template @@ -569,6 +783,13 @@ void Blas::AXPY(int n, T alpha, const T *x, CUBlas::AXPY(handle, n, &alpha, x, 1, y, 1); }); } +template <> +template +void Blas::AXPY(int n, T alpha, const T *x, T *y) const { + context_.CublasCall([&](rocblas_handle handle) { + CUBlas::AXPY(handle, n, &alpha, x, 1, y, 1); + }); +} template <> template @@ -576,6 +797,12 @@ void Blas::SCAL(int n, const T alpha, T *x) const { context_.CublasCall( [&](rocblas_handle handle) { CUBlas::SCAL(handle, n, &alpha, x, 1); }); } +template <> +template +void Blas::SCAL(int n, const T alpha, T *x) const { + context_.CublasCall( + [&](rocblas_handle handle) { CUBlas::SCAL(handle, n, &alpha, x, 1); }); +} template <> template @@ -583,6 +810,12 @@ void Blas::VCOPY(int n, const T *x, T *y) const { context_.CublasCall( [&](rocblas_handle handle) { CUBlas::VCOPY(handle, n, x, 1, y, 1); }); } +template <> +template +void Blas::VCOPY(int n, const T *x, T *y) const { + context_.CublasCall( + [&](rocblas_handle handle) { CUBlas::VCOPY(handle, n, x, 1, y, 1); }); +} template <> template @@ -596,6 +829,17 @@ void Blas::GEMV(bool trans_a, int M, int N, CUBlas::GEMV(handle, cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1); }); } +template <> +template +void Blas::GEMV(bool trans_a, int M, int N, T alpha, + const T *A, const T *B, T beta, T *C) const { + rocblas_operation cuTransA = + !trans_a ? rocblas_operation_transpose : rocblas_operation_none; + + context_.CublasCall([&](rocblas_handle handle) { + CUBlas::GEMV(handle, cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1); + }); +} template <> template <> @@ -612,6 +856,23 @@ inline void Blas::GEMV( alpha, A, B, beta, C); } } +template <> +template <> +inline void Blas::GEMV(bool trans_a, int M, int N, + platform::float16 alpha, + const platform::float16 *A, + const platform::float16 *B, + platform::float16 beta, + platform::float16 *C) const { + // Because cublas doesn't support half gemv, we use cublasHgemm to achieve it. + if (trans_a) { + this->template GEMM(CblasNoTrans, CblasNoTrans, 1, N, M, + alpha, B, A, beta, C); + } else { + this->template GEMM(CblasNoTrans, CblasNoTrans, M, 1, N, + alpha, A, B, beta, C); + } +} template <> template @@ -637,6 +898,32 @@ void Blas::BatchedGEMM( ldc, strideC, batchCount); }); } +template <> +template +void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, int M, int N, + int K, T alpha, const T *A, const T *B, + T beta, T *C, int batchCount, + int64_t strideA, + int64_t strideB) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + rocblas_operation cuTransA = (transA == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + rocblas_operation cuTransB = (transB == CblasNoTrans) + ? 
rocblas_operation_none + : rocblas_operation_transpose; + const int64_t strideC = M * N; + context_.CublasCall([&](rocblas_handle handle) { + CUBlas::GEMM_STRIDED_BATCH(handle, cuTransB, cuTransA, N, M, K, &alpha, + B, ldb, strideB, A, lda, strideA, &beta, C, + ldc, strideC, batchCount); + }); +} template <> template @@ -648,6 +935,18 @@ void Blas::BatchedGEMM( C[k]); } } +template <> +template +void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, int M, int N, + int K, T alpha, const T **A, + const T **B, T beta, T **C, + int batchCount) const { + for (int k = 0; k < batchCount; ++k) { + this->template GEMM(transA, transB, M, N, K, alpha, A[k], B[k], beta, + C[k]); + } +} template <> template <> @@ -661,6 +960,18 @@ inline void Blas::BatchedGEMM( B[k], beta, C[k]); } } +template <> +template <> +inline void Blas::BatchedGEMM( + CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, + platform::float16 alpha, const platform::float16 **A, + const platform::float16 **B, platform::float16 beta, platform::float16 **C, + int batchCount) const { + for (int k = 0; k < batchCount; ++k) { + this->template GEMM(transA, transB, M, N, K, alpha, A[k], + B[k], beta, C[k]); + } +} template <> template @@ -687,6 +998,30 @@ void Blas::TRSM(CBLAS_SIDE side, CBLAS_UPLO uplo, lda, B, ldb); }); } +template <> +template +void Blas::TRSM(CBLAS_SIDE side, CBLAS_UPLO uplo, + CBLAS_TRANSPOSE transA, CBLAS_DIAG diag, + int M, int N, T alpha, const T *A, int lda, + T *B, int ldb) const { + // solve row major `op ( A ) X = α B` by taking it as `X' op ( A' ) = α B'` + // where ' stands for transpose + rocblas_side cuSide = + (side == CblasLeft) ? rocblas_side_right : rocblas_side_left; + rocblas_fill cuUplo = + (uplo == CblasLower) ? rocblas_fill_upper : rocblas_fill_lower; + // use CUBLAS_OP_C (conjugate transpose) for complex + rocblas_operation cuTransA = (transA == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + rocblas_diagonal cuDiag = + (diag == CblasUnit) ? rocblas_diagonal_unit : rocblas_diagonal_non_unit; + + context_.CublasCall([&](rocblas_handle handle) { + CUBlas::TRSM(handle, cuSide, cuUplo, cuTransA, cuDiag, N, M, &alpha, A, + lda, B, ldb); + }); +} template <> template @@ -697,6 +1032,14 @@ void Blas::BatchedGETRF(int n, T **a, int *ipiv, CUBlas::GETRF_BATCH(handle, n, a, n, ipiv, info, batch_size); }); } +template <> +template +void Blas::BatchedGETRF(int n, T **a, int *ipiv, int *info, + int batch_size) const { + context_.CublasCall([&](rocblas_handle handle) { + CUBlas::GETRF_BATCH(handle, n, a, n, ipiv, info, batch_size); + }); +} template <> template @@ -715,6 +1058,22 @@ void Blas::BatchedGETRI(int n, const T **a, CUBlas::GETRI_BATCH(handle, n, a, n, ipiv, a_inv, n, info, batch_size); }); } +template <> +template +void Blas::BatchedGETRI(int n, const T **a, const int *ipiv, + T **a_inv, int *info, + int batch_size) const { + PADDLE_ENFORCE_NE( + a_inv, a, + platform::errors::InvalidArgument( + "cuBLAS fuction 'cublasgetrfBatched' cannot be executed " + "in-place. 
The memory space of output matrix (address: %p) cannot " + "overlap memory space of input matrix (address: %p).", + a_inv, a)); + context_.CublasCall([&](rocblas_handle handle) { + CUBlas::GETRI_BATCH(handle, n, a, n, ipiv, a_inv, n, info, batch_size); + }); +} template <> template @@ -725,6 +1084,14 @@ void Blas::BatchedMatInv(int n, const T **a, CUBlas::MATINV_BATCH(handle, n, a, n, a_inv, n, info, batch_size); }); } +template <> +template +void Blas::BatchedMatInv(int n, const T **a, T **a_inv, + int *info, int batch_size) const { + context_.CublasCall([&](rocblas_handle handle) { + CUBlas::MATINV_BATCH(handle, n, a, n, a_inv, n, info, batch_size); + }); +} template <> template @@ -739,6 +1106,20 @@ void Blas::BatchedGETRS( batch_size); }); } +template <> +template +void Blas::BatchedGETRS(CBLAS_TRANSPOSE trans, int n, + int nrhs, const T **a, int lda, + int *ipiv, T **b, int ldb, int *info, + int batch_size) const { + rocblas_operation cuTrans = (trans == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + context_.CublasCall([&](rocblas_handle handle) { + CUBlas::GETRS_BATCH(handle, cuTrans, n, nrhs, a, lda, ipiv, b, ldb, info, + batch_size); + }); +} template <> template @@ -764,6 +1145,31 @@ void Blas::BatchedTRSM( &alpha, A, lda, B, ldb, batch_size); }); } +template <> +template +void Blas::BatchedTRSM(CBLAS_SIDE side, CBLAS_UPLO uplo, + CBLAS_TRANSPOSE transA, + CBLAS_DIAG diag, int M, int N, T alpha, + const T **A, int lda, T **B, int ldb, + int batch_size) const { + // solve row major `op ( A ) X = α B` by taking it as `X' op ( A' ) = α B'` + // where ' stands for transpose + rocblas_side cuSide = + (side == CblasLeft) ? rocblas_side_right : rocblas_side_left; + rocblas_fill cuUplo = + (uplo == CblasLower) ? rocblas_fill_upper : rocblas_fill_lower; + // use CUBLAS_OP_C (conjugate transpose) for complex + rocblas_operation cuTransA = (transA == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + rocblas_diagonal cuDiag = + (diag == CblasUnit) ? rocblas_diagonal_unit : rocblas_diagonal_non_unit; + + context_.CublasCall([&](rocblas_handle handle) { + CUBlas::TRSM_BATCH(handle, cuSide, cuUplo, cuTransA, cuDiag, N, M, + &alpha, A, lda, B, ldb, batch_size); + }); +} } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/math/concat_test.cc b/paddle/fluid/operators/math/concat_test.cc index c8e2acea451..65e48e58175 100644 --- a/paddle/fluid/operators/math/concat_test.cc +++ b/paddle/fluid/operators/math/concat_test.cc @@ -16,6 +16,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/math/concat_and_split.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/place.h" /** * case 1: @@ -441,6 +443,31 @@ void TestConcatMain() { delete context; } +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +template <> +void TestConcatMain() { + auto* context = + new paddle::platform::CUDADeviceContext(paddle::platform::CUDAPlace()); + context->SetAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CUDAPlace(), context->stream()) + .get()); + context->PartialInitWithAllocator(); + + ConcatCase1( + context); + ConcatCase2( + context); + ConcatCase3( + context); + ConcatCase4( + context); + + delete context; +} +#endif + TEST(math, concat) { TestConcatMain(); diff --git a/paddle/fluid/operators/math/cusparse_conversion_api_test.cc b/paddle/fluid/operators/math/cusparse_conversion_api_test.cc index d45b57420ee..a84b734fbeb 100644 --- a/paddle/fluid/operators/math/cusparse_conversion_api_test.cc +++ b/paddle/fluid/operators/math/cusparse_conversion_api_test.cc @@ -24,6 +24,11 @@ void TestNNZ(const std::vector& dense_data, const int correct_nnz, const int rows, const int cols) { paddle::platform::CUDADeviceContext* context = new paddle::platform::CUDADeviceContext(paddle::platform::CUDAPlace()); + context->SetAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CUDAPlace(), context->stream()) + .get()); + context->PartialInitWithAllocator(); auto sparse = paddle::operators::math::GetSparse(*context); @@ -61,6 +66,11 @@ void TestDenseToSparse(const std::vector& correct_dense_data, const std::string& mode) { paddle::platform::CUDADeviceContext* context = new paddle::platform::CUDADeviceContext(paddle::platform::CUDAPlace()); + context->SetAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CUDAPlace(), context->stream()) + .get()); + context->PartialInitWithAllocator(); // get sparse auto sparse = paddle::operators::math::GetSparse #include "paddle/fluid/operators/math/im2col_cfo_cpu.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/place.h" template void testIm2col() { @@ -60,6 +62,7 @@ void testIm2col() { auto* place = new Place(); DeviceContext* context = new DeviceContext(*place); + if (paddle::platform::is_cpu_place(*place)) { input = input_tmp; } else { @@ -164,6 +167,165 @@ void testIm2col() { delete context; } +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +template <> +void testIm2col() { + paddle::framework::Tensor input_tmp; + paddle::framework::Tensor input; + paddle::framework::Tensor output_cfo; + paddle::framework::Tensor output_ocf; + paddle::framework::Tensor output_tmp; + + /** + * input = [0, 1, 2, + * 3, 4, 5] + * + * output_cfo = [0, 1 + * 1, 2 + * 3, 4 + * 4, 5] + * + * output_ocf = [0, 1, 3, 4 + * 1, 2, 4, 5] + * + * col2im_cfo = [0, 2, 2 + * 3, 4, 5] + * + * col2im_ocf = [0, 2, 2 + * 3, 4, 5] + */ + int input_height = 2; + int input_width = 3; + int filter_size = 2; + std::vector stride({1, 1}); // stride_y, stride_x + std::vector padding( + {0, 0, 0, 0}); // up_pad, left_pad, down_pad, right_pad + std::vector dilation({1, 1}); // dilation_y, dilation_x + int output_height = + (input_height - filter_size + padding[0] + padding[1]) / stride[0] + 1; + int output_width = + (input_width - filter_size + padding[2] + padding[3]) / stride[1] + 1; + 
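+  // Worked out for this test: with a 2x3 input, a 2x2 filter, unit strides and
+  // zero padding, output_height = (2 - 2 + 0 + 0) / 1 + 1 = 1 and
+  // output_width = (3 - 2 + 0 + 0) / 1 + 1 = 2, so each im2col output buffer
+  // below holds 2 * 2 * 1 * 2 = 8 elements.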
float* input_ptr = input_tmp.mutable_data( + {1, input_height, input_width}, paddle::platform::CPUPlace()); + float arr[6] = {0, 1, 2, 3, 4, 5}; + memcpy(input_ptr, arr, 6 * sizeof(float)); + + auto* place = new paddle::platform::CUDAPlace(); + auto* context = new paddle::platform::CUDADeviceContext(*place); + context->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(*place, context->stream()) + .get()); + context->PartialInitWithAllocator(); + if (paddle::platform::is_cpu_place(*place)) { + input = input_tmp; + } else { + paddle::framework::TensorCopySync(input_tmp, *place, &input); + } + output_cfo.mutable_data( + {1, filter_size, filter_size, output_height, output_width}, *place); + output_ocf.mutable_data( + {output_height, output_width, 1, filter_size, filter_size}, *place); + + // Im2Col + paddle::operators::math::Im2ColFunctor< + paddle::operators::math::ColFormat::kCFO, + paddle::platform::CUDADeviceContext, float> + im2col; + paddle::operators::math::Im2ColFunctor< + paddle::operators::math::ColFormat::kOCF, + paddle::platform::CUDADeviceContext, float> + im2col_ocf; + + im2col(*context, input, dilation, stride, padding, &output_cfo); + im2col_ocf(*context, input, dilation, stride, padding, &output_ocf); + + float out_cfo_data[] = {0, 1, 1, 2, 3, 4, 4, 5}; + float out_ocf_data[] = {0, 1, 3, 4, 1, 2, 4, 5}; + + float* out_cfo_ptr; + if (paddle::platform::is_cpu_place(*place)) { + out_cfo_ptr = output_cfo.data(); + } else { + paddle::framework::TensorCopySync(output_cfo, paddle::platform::CPUPlace(), + &output_tmp); + out_cfo_ptr = output_tmp.data(); + } + for (int i = 0; i < 6; ++i) { + EXPECT_EQ(out_cfo_ptr[i], out_cfo_data[i]); + } + + float* out_ocf_ptr; + if (paddle::platform::is_cpu_place(*place)) { + out_ocf_ptr = output_ocf.data(); + } else { + paddle::framework::TensorCopySync(output_ocf, paddle::platform::CPUPlace(), + &output_tmp); + out_ocf_ptr = output_tmp.data(); + } + + for (int i = 0; i < 6; ++i) { + EXPECT_EQ(out_ocf_ptr[i], out_ocf_data[i]); + } + + // Col2Im: kCFO + paddle::operators::math::Col2ImFunctor< + paddle::operators::math::ColFormat::kCFO, + paddle::platform::CUDADeviceContext, float> + col2im; + paddle::operators::math::Col2ImFunctor< + paddle::operators::math::ColFormat::kOCF, + paddle::platform::CUDADeviceContext, float> + col2im_ocf; + float col2im_data[] = {0, 2, 2, 3, 8, 5}; + + memset(input_ptr, 0, 6 * sizeof(float)); + if (paddle::platform::is_cpu_place(*place)) { + input = input_tmp; + } else { + paddle::framework::TensorCopySync(input_tmp, *place, &input); + } + + col2im(*context, output_cfo, dilation, stride, padding, &input); + + float* in_ptr; + if (paddle::platform::is_cpu_place(*place)) { + in_ptr = input.data(); + } else { + paddle::framework::TensorCopySync(input, paddle::platform::CPUPlace(), + &input_tmp); + in_ptr = input_tmp.data(); + } + for (int i = 0; i < 6; ++i) { + EXPECT_EQ(in_ptr[i], col2im_data[i]); + } + + // Col2Im: kOCF + memset(input_ptr, 0, 6 * sizeof(float)); + if (paddle::platform::is_cpu_place(*place)) { + input = input_tmp; + } else { + paddle::framework::TensorCopySync(input_tmp, *place, &input); + } + + col2im_ocf(*context, output_ocf, dilation, stride, padding, &input); + + if (paddle::platform::is_cpu_place(*place)) { + in_ptr = input.data(); + } else { + paddle::framework::TensorCopySync(input, paddle::platform::CPUPlace(), + &input_tmp); + in_ptr = input_tmp.data(); + } + for (int i = 0; i < 6; ++i) { + EXPECT_EQ(in_ptr[i], col2im_data[i]); + } + + delete place; + delete 
context; +} +#endif + TEST(math, im2col) { testIm2col(); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) diff --git a/paddle/fluid/operators/math/inclusive_scan.h b/paddle/fluid/operators/math/inclusive_scan.h index 54a37db1df7..5fdc2889a88 100644 --- a/paddle/fluid/operators/math/inclusive_scan.h +++ b/paddle/fluid/operators/math/inclusive_scan.h @@ -194,7 +194,7 @@ static void InclusiveScanInnerDim(const T *x, T *y, size_t outer_dim, constexpr size_t kThreadNumY = 32; size_t grid_dim = (outer_dim + kThreadNumY - 1) / kThreadNumY; - grid_dim = std::min(grid_dim, dev_ctx.GetCUDAMaxGridDimSize().x); + grid_dim = std::min(grid_dim, dev_ctx.GetCUDAMaxGridDimSize()[0]); dim3 thread_dims(kThreadNumX, kThreadNumY); if (reverse) { InclusiveScanInnerDimCUDAKernel< diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu index 9ade45ee743..f0ef692b99f 100644 --- a/paddle/fluid/operators/math/math_function.cu +++ b/paddle/fluid/operators/math/math_function.cu @@ -21,6 +21,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/math_function_impl.h" #include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/pten/backends/gpu/gpu_context.h" #include "paddle/pten/kernels/funcs/eigen/common.h" namespace paddle { @@ -44,6 +45,18 @@ template struct SetConstant>; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant>; +template struct SetConstant>; + template struct SetConstant; template struct SetConstant(paddle::platform::CPUPlace()); auto* ctx = new paddle::platform::CPUDeviceContext(); + ctx->Init(); paddle::operators::math::set_constant(*ctx, &t, 10); for (int64_t i = 0; i < t.numel(); ++i) { PADDLE_ENFORCE_EQ(10, t.data()[i], diff --git a/paddle/fluid/operators/math/math_function_test.cu b/paddle/fluid/operators/math/math_function_test.cu index 44b1ee45a4f..39c91e96a70 100644 --- a/paddle/fluid/operators/math/math_function_test.cu +++ b/paddle/fluid/operators/math/math_function_test.cu @@ -46,6 +46,10 @@ TEST(math_function, notrans_mul_trans_fp32) { paddle::platform::CPUPlace cpu_place; paddle::platform::CUDAPlace gpu_place(0); paddle::platform::CUDADeviceContext context(gpu_place); + context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(gpu_place, context.stream()) + .get()); + context.PartialInitWithAllocator(); float* input1_ptr = input1.mutable_data({2, 3}, cpu_place); float arr[6] = {0, 1, 2, 3, 4, 5}; @@ -78,6 +82,10 @@ TEST(math_function, notrans_mul_trans_fp16) { paddle::platform::CPUPlace cpu_place; paddle::platform::CUDAPlace gpu_place(0); paddle::platform::CUDADeviceContext context(gpu_place); + context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(gpu_place, context.stream()) + .get()); + context.PartialInitWithAllocator(); // fp16 GEMM in cublas requires GPU compute capability >= 53 if (context.GetComputeCapability() < 53) { @@ -117,6 +125,10 @@ TEST(math_function, trans_mul_notrans_fp32) { paddle::platform::CPUPlace cpu_place; paddle::platform::CUDAPlace gpu_place(0); paddle::platform::CUDADeviceContext context(gpu_place); + context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(gpu_place, context.stream()) + 
.get()); + context.PartialInitWithAllocator(); float* input1_ptr = input1.mutable_data({2, 3}, cpu_place); float arr[6] = {0, 1, 2, 3, 4, 5}; @@ -155,6 +167,10 @@ TEST(math_function, trans_mul_notrans_fp16) { paddle::platform::CPUPlace cpu_place; paddle::platform::CUDAPlace gpu_place(0); paddle::platform::CUDADeviceContext context(gpu_place); + context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(gpu_place, context.stream()) + .get()); + context.PartialInitWithAllocator(); // fp16 GEMM in cublas requires GPU compute capability >= 53 if (context.GetComputeCapability() < 53) { @@ -200,6 +216,10 @@ TEST(math_function, gemm_notrans_cublas_fp32) { paddle::platform::CPUPlace cpu_place; paddle::platform::CUDAPlace gpu_place(0); paddle::platform::CUDADeviceContext context(gpu_place); + context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(gpu_place, context.stream()) + .get()); + context.PartialInitWithAllocator(); int m = 2; int n = 3; @@ -254,6 +274,10 @@ TEST(math_function, gemm_notrans_cublas_fp16) { paddle::platform::CPUPlace cpu_place; paddle::platform::CUDAPlace gpu_place(0); paddle::platform::CUDADeviceContext context(gpu_place); + context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(gpu_place, context.stream()) + .get()); + context.PartialInitWithAllocator(); // fp16 GEMM in cublas requires GPU compute capability >= 53 if (context.GetComputeCapability() < 53) { @@ -316,6 +340,10 @@ TEST(math_function, gemm_trans_cublas_fp32) { paddle::platform::CPUPlace cpu_place; paddle::platform::CUDAPlace gpu_place(0); paddle::platform::CUDADeviceContext context(gpu_place); + context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(gpu_place, context.stream()) + .get()); + context.PartialInitWithAllocator(); int m = 2; int n = 3; @@ -364,6 +392,10 @@ TEST(math_function, gemm_trans_cublas_fp16) { paddle::platform::CPUPlace cpu_place; paddle::platform::CUDAPlace gpu_place(0); paddle::platform::CUDADeviceContext context(gpu_place); + context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(gpu_place, context.stream()) + .get()); + context.PartialInitWithAllocator(); // fp16 GEMM in cublas requires GPU compute capability >= 53 if (context.GetComputeCapability() < 53) { @@ -418,6 +450,10 @@ void GemvTest(int m, int n, bool trans) { paddle::platform::CPUPlace cpu_place; paddle::platform::CUDAPlace gpu_place(0); paddle::platform::CUDADeviceContext context(gpu_place); + context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(gpu_place, context.stream()) + .get()); + context.PartialInitWithAllocator(); T* data_a = mat_a.mutable_data({m, n}, cpu_place); T* data_b = vec_b.mutable_data({trans ? m : n}, cpu_place); diff --git a/paddle/fluid/operators/math/vol2col_test.cc b/paddle/fluid/operators/math/vol2col_test.cc index 8cd28244658..210cf10d887 100644 --- a/paddle/fluid/operators/math/vol2col_test.cc +++ b/paddle/fluid/operators/math/vol2col_test.cc @@ -15,6 +15,8 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/math/vol2col.h" #include +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/place.h" template void testVol2col() { @@ -25,7 +27,6 @@ void testVol2col() { auto* place = new Place(); DeviceContext* context = new DeviceContext(*place); - /** * input = [[0, 1, 2, * 3, 4, 5] @@ -123,6 +124,124 @@ void testVol2col() { delete context; } +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +template <> +void testVol2col() { + paddle::framework::Tensor input; + paddle::framework::Tensor input_tmp; + paddle::framework::Tensor output; + paddle::framework::Tensor output_tmp; + + auto* place = new paddle::platform::CUDAPlace(); + auto* context = new paddle::platform::CUDADeviceContext(*place); + context->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(*place, context->stream()) + .get()); + context->PartialInitWithAllocator(); + + /** + * input = [[0, 1, 2, + * 3, 4, 5] + * [6, 7, 8, + * 9, 10, 11]] + * + * output = [0, 1 + * 1, 2 + * 3, 4 + * 4, 5 + * 6, 7 + * 7, 8 + * 9, 10 + * 10, 11] + * + * col2vol = [[0, 2, 2, + * 3, 8, 5] + * [6, 14, 8, + * 9, 20, 11]] + * + */ + int input_depth = 2; + int input_height = 2; + int input_width = 3; + int filter_size = 2; + std::vector strides({1, 1, 1}); + std::vector paddings({0, 0, 0}); + std::vector dilations({1, 1, 1}); + int output_depth = + (input_depth - filter_size + 2 * paddings[0]) / strides[0] + 1; + int output_height = + (input_height - filter_size + 2 * paddings[1]) / strides[1] + 1; + int output_width = + (input_width - filter_size + 2 * paddings[2]) / strides[2] + 1; + + // Vol2Col test + float* input_ptr = + input_tmp.mutable_data({1, input_depth, input_height, input_width}, + paddle::platform::CPUPlace()); + float arr[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; + memcpy(input_ptr, arr, 12 * sizeof(float)); + + if (paddle::platform::is_cpu_place(*place)) { + input = input_tmp; + } else { + paddle::framework::TensorCopySync(input_tmp, *place, &input); + } + output.mutable_data({1, filter_size, filter_size, filter_size, + output_depth, output_height, output_width}, + *place); + + paddle::operators::math::Vol2ColFunctor + vol2col; + vol2col(*context, input, dilations, strides, paddings, &output); + + float vol_2_col[] = {0, 1, 1, 2, 3, 4, 4, 5, 6, 7, 7, 8, 9, 10, 10, 11}; + float* out_cfo_ptr; + if (paddle::platform::is_cpu_place(*place)) { + out_cfo_ptr = output.data(); + } else { + paddle::framework::TensorCopySync(output, paddle::platform::CPUPlace(), + &output_tmp); + out_cfo_ptr = output_tmp.data(); + } + + for (int i = 0; i < 16; ++i) { + EXPECT_EQ(out_cfo_ptr[i], vol_2_col[i]); + } + + // Col2Vol test + float col_2_vol[] = {0, 2, 2, 3, 8, 5, 6, 14, 8, 9, 20, 11}; + memset(input_ptr, 0, 12 * sizeof(float)); + if (paddle::platform::is_cpu_place(*place)) { + input = input_tmp; + } else { + paddle::framework::TensorCopySync(input_tmp, *place, &input); + } + + paddle::operators::math::Col2VolFunctor + col2vol; + col2vol(*context, output, dilations, strides, paddings, &input); + + float* in_ptr; + if (paddle::platform::is_cpu_place(*place)) { + in_ptr = input.data(); + } else { + paddle::framework::TensorCopySync(input, paddle::platform::CPUPlace(), + &input_tmp); + in_ptr = input_tmp.data(); + } + + for (int i = 0; i < 12; ++i) { + EXPECT_EQ(in_ptr[i], col_2_vol[i]); + } + + delete place; + delete context; +} +#endif + TEST(math, vol2col) { testVol2col(); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) diff --git 
a/paddle/fluid/operators/mean_op.cu b/paddle/fluid/operators/mean_op.cu index 63b5b871aab..db8f586297c 100644 --- a/paddle/fluid/operators/mean_op.cu +++ b/paddle/fluid/operators/mean_op.cu @@ -66,7 +66,8 @@ class MeanCUDAKernel : public framework::OpKernel { reduce_dims.push_back(i); } TensorReduceFunctorImpl( - *input, output, Div(numel), reduce_dims, stream); + context.cuda_device_context(), *input, output, Div(numel), reduce_dims, + stream); } }; diff --git a/paddle/fluid/operators/nccl/nccl_op_test.cu.cc b/paddle/fluid/operators/nccl/nccl_op_test.cu.cc index 41c1b4d7a8f..9fcbfa90230 100644 --- a/paddle/fluid/operators/nccl/nccl_op_test.cu.cc +++ b/paddle/fluid/operators/nccl/nccl_op_test.cu.cc @@ -57,7 +57,12 @@ class NCCLTester : public ::testing::Test { paddle::platform::CPUPlace cpu_place; for (size_t i = 0; i < gpu_list_.size(); ++i) { p::CUDAPlace place(i); - dev_ctxs_.emplace_back(new p::CUDADeviceContext(place)); + auto *ctx = new p::CUDADeviceContext(place); + ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place, ctx->stream()) + .get()); + ctx->PartialInitWithAllocator(); + dev_ctxs_.emplace_back(ctx); } NCCLInitOp(); diff --git a/paddle/fluid/operators/p_norm_op.cu b/paddle/fluid/operators/p_norm_op.cu index c0bd906685d..e11fe478106 100644 --- a/paddle/fluid/operators/p_norm_op.cu +++ b/paddle/fluid/operators/p_norm_op.cu @@ -106,16 +106,20 @@ class PnormCUDAKernel : public framework::OpKernel { using MT = typename details::MPTypeTrait::Type; if (porder == 0) { TensorReduceFunctorImpl>( - *in_x, out_norm, NonzeroFunctor(), reduce_axis, stream); + ctx.cuda_device_context(), *in_x, out_norm, NonzeroFunctor(), + reduce_axis, stream); } else if (porder == INFINITY) { TensorReduceFunctorImpl>( - *in_x, out_norm, AbsFunctor(), reduce_axis, stream); + ctx.cuda_device_context(), *in_x, out_norm, AbsFunctor(), + reduce_axis, stream); } else if (porder == -INFINITY) { TensorReduceFunctorImpl>( - *in_x, out_norm, AbsFunctor(), reduce_axis, stream); + ctx.cuda_device_context(), *in_x, out_norm, AbsFunctor(), + reduce_axis, stream); } else { TensorReduceFunctorImpl>( - *in_x, out_norm, UnsignedPowFunctor(porder), reduce_axis, stream); + ctx.cuda_device_context(), *in_x, out_norm, + UnsignedPowFunctor(porder), reduce_axis, stream); const framework::Tensor* tmp_norm = out_norm; std::vector ins = {tmp_norm}; diff --git a/paddle/fluid/operators/pool_op.h b/paddle/fluid/operators/pool_op.h index 84c1988e29b..f7f60e82216 100644 --- a/paddle/fluid/operators/pool_op.h +++ b/paddle/fluid/operators/pool_op.h @@ -208,8 +208,8 @@ class PoolKernel : public framework::OpKernel { auto stream = dev_ctx.stream(); TensorReduceFunctorImpl>( - *in_x, out, kps::DivideFunctor(reduce_num), reduce_dim, - stream); + dev_ctx, *in_x, out, kps::DivideFunctor(reduce_num), + reduce_dim, stream); #else // for cpu paddle::operators::math::Pool2dFunctor< DeviceContext, paddle::operators::math::AvgPool, T> diff --git a/paddle/fluid/operators/prelu_op.cu b/paddle/fluid/operators/prelu_op.cu index c6997603bb1..9493b6d4391 100644 --- a/paddle/fluid/operators/prelu_op.cu +++ b/paddle/fluid/operators/prelu_op.cu @@ -186,7 +186,8 @@ class CUDAPReluGradKernel : public framework::OpKernel { } TensorReduceFunctorImpl>( - dalpha_tmp, dalpha, kps::IdentityFunctor(), reduce_dims, stream); + context.cuda_device_context(), dalpha_tmp, dalpha, + kps::IdentityFunctor(), reduce_dims, stream); } }; diff --git a/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc 
b/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc index 21f21cdc956..2c701bdae76 100644 --- a/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc +++ b/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc @@ -222,6 +222,10 @@ TEST(SENDANDRECV, GPU) { framework::Scope* scope = (*micro_scope)[0]; platform::CUDAPlace place; platform::CUDADeviceContext ctx(place); + ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place, ctx.stream()) + .get()); + ctx.PartialInitWithAllocator(); framework::Executor exe(place); // create var on local scope diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h index 730a84da88b..f741c5941eb 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h @@ -39,14 +39,16 @@ namespace operators { template class ReduceOp, typename TransformOp> -void TensorReduceFunctorImpl(const framework::Tensor& x, framework::Tensor* y, +void TensorReduceFunctorImpl(const platform::CUDADeviceContext& dev_ctx, + const framework::Tensor& x, framework::Tensor* y, const TransformOp& transform, const std::vector& origin_reduce_dims, gpuStream_t stream) { y->mutable_data(x.place()); pten::kernels::TensorReduceFunctorImpl( - x, y, transform, origin_reduce_dims, stream); + static_cast(dev_ctx), x, y, transform, + origin_reduce_dims, stream); } } // namespace operators diff --git a/paddle/fluid/operators/renorm_op.cu b/paddle/fluid/operators/renorm_op.cu index 1d76eaf27e8..ca6169d0410 100644 --- a/paddle/fluid/operators/renorm_op.cu +++ b/paddle/fluid/operators/renorm_op.cu @@ -156,7 +156,8 @@ class CUDARenormKernel : public framework::OpKernel { cuda_ctx, ins, &outs, func); std::vector reduce_axis = {0, 2}; TensorReduceFunctorImpl>( - pow_value, &dim_value, kps::IdentityFunctor(), reduce_axis, stream); + cuda_ctx, pow_value, &dim_value, kps::IdentityFunctor(), reduce_axis, + stream); RenormKernelFunc3<<>>( numel, dim_value.mutable_data(context.GetPlace()), p, max_norm); RenormKernelFunc4<<>>( @@ -213,10 +214,11 @@ class CUDAGradRenormKernel : public framework::OpKernel { dim_divisor); std::vector reduce_axis = {0, 2}; TensorReduceFunctorImpl>( - pow_value, &dim_value, kps::IdentityFunctor(), reduce_axis, stream); + ctx.cuda_device_context(), pow_value, &dim_value, + kps::IdentityFunctor(), reduce_axis, stream); TensorReduceFunctorImpl>( - mul_value, &weight_derivative, kps::IdentityFunctor(), reduce_axis, - stream); + ctx.cuda_device_context(), mul_value, &weight_derivative, + kps::IdentityFunctor(), reduce_axis, stream); RenormGradKernelFunc2<<>>( x_data, dout_data, dx_data, numel, dim_value.mutable_data(ctx.GetPlace()), diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index 6c2d5ebcc7d..99e40de3080 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -389,7 +389,8 @@ class ReshapeKernel { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(ctx.GetPlace())) { auto &dev_ctx = ctx.device_context(); - pten::ReshapeKernel(dev_ctx, *in, pt_scalar_shape, out); + pten::ReshapeKernel(static_cast(dev_ctx), *in, + pt_scalar_shape, out); } #endif #ifdef PADDLE_WITH_XPU @@ -417,7 +418,8 @@ class ReshapeGradKernel { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(ctx.GetPlace())) { auto &dev_ctx = ctx.device_context(); - pten::ReshapeGradKernel(dev_ctx, *d_out, d_x); + 
pten::ReshapeGradKernel(static_cast(dev_ctx), + *d_out, d_x); } #endif #ifdef PADDLE_WITH_XPU @@ -445,7 +447,8 @@ class ReshapeDoubleGradKernel { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(ctx.GetPlace())) { auto &dev_ctx = ctx.device_context(); - pten::ReshapeDoubleGradKernel(dev_ctx, *dd_x, dd_out); + pten::ReshapeDoubleGradKernel( + static_cast(dev_ctx), *dd_x, dd_out); } #endif #ifdef PADDLE_WITH_XPU @@ -485,20 +488,6 @@ class Reshape2Op : public ReshapeOp { ReshapeOp::InferShape(ctx); } - - framework::KernelSignature GetExpectedPtenKernelArgs( - const framework::ExecutionContext &ctx) const override { - std::string shape; - auto multi_inputs = ctx.MultiInput("ShapeTensor"); - if (multi_inputs.size() > 0) { - shape = "ShapeTensor"; - } else if (ctx.HasInput("Shape")) { - shape = "Shape"; - } else { - shape = "shape"; - } - return framework::KernelSignature("reshape", {"X"}, {shape}, {"Out"}); - } }; class Reshape2OpMaker : public ReshapeOpMaker { diff --git a/paddle/fluid/operators/roll_op.cu b/paddle/fluid/operators/roll_op.cu index 7e8e37bd2ee..0f861179b2d 100644 --- a/paddle/fluid/operators/roll_op.cu +++ b/paddle/fluid/operators/roll_op.cu @@ -17,7 +17,7 @@ #include "paddle/fluid/operators/roll_op.h" #include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" -#include "paddle/pten/core/array.h" +#include "paddle/pten/core/utils/array.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc index a1954527910..912af2c85b2 100644 --- a/paddle/fluid/operators/scale_op.cc +++ b/paddle/fluid/operators/scale_op.cc @@ -128,11 +128,19 @@ class ScaleGradMaker : public framework::SingleGradOpMaker { grad_op->SetInput("ScaleTensor", this->Input("ScaleTensor")); } grad_op->SetOutput("Out", this->InputGrad("X")); + VLOG(6) << "Finish SetOutput"; grad_op->SetAttr("scale", this->GetAttr("scale")); + VLOG(6) << "Finish Set Attr scale"; grad_op->SetAttr("bias", 0.0f); + VLOG(6) << "Finish Set Attr bias"; grad_op->SetAttr("bias_after_scale", true); - if (grad_op->HasAttr("use_mkldnn")) + VLOG(6) << "Finish Set Attr bias_after_scale"; + if (grad_op->HasAttr("use_mkldnn")) { + VLOG(6) << "Finish Check Attr use_mkldnn"; grad_op->SetAttr("use_mkldnn", this->GetAttr("use_mkldnn")); + VLOG(6) << "Finish Set Attr use_mkldnn"; + } + VLOG(6) << "Finish Apply"; } }; diff --git a/paddle/fluid/operators/scale_op.h b/paddle/fluid/operators/scale_op.h index 2a30d3f0b08..8ce0b7984cc 100644 --- a/paddle/fluid/operators/scale_op.h +++ b/paddle/fluid/operators/scale_op.h @@ -43,34 +43,36 @@ class ScaleKernel : public framework::OpKernel { public: virtual void Compute(const framework::ExecutionContext& ctx) const { auto* in_var = ctx.InputVar("X"); - auto* in = framework::GetLoDTensorOrSelectedRowsValueFromVar(*in_var); auto bias = ctx.Attr("bias"); auto bias_after_scale = ctx.Attr("bias_after_scale"); - auto scale = ctx.Attr("scale"); + auto* out_var = ctx.OutputVar("Out"); + if (ctx.HasInput("ScaleTensor")) { auto* scale_tensor = ctx.Input("ScaleTensor"); scale = static_cast(GetAttrFromTensor(scale_tensor)); } - auto* out_var = ctx.OutputVar("Out"); - if (in_var->IsType() && in_var != out_var) { - auto& in_slr = in_var->Get(); - auto* out_slr = out_var->GetMutable(); - out_slr->set_rows(in_slr.rows()); - out_slr->set_height(in_slr.height()); - } + auto* in = framework::GetLoDTensorOrSelectedRowsValueFromVar(*in_var); auto* out = 
framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(out_var); out->mutable_data(in->place()); auto& dev_ctx = ctx.device_context(); // call new kernel - pten::ScaleKernel( - static_cast::TYPE&>(dev_ctx), - *in, scale, bias, bias_after_scale, out); + if (in_var->IsType()) { + pten::ScaleSR( + static_cast::TYPE&>(dev_ctx), + in_var->Get(), scale, bias, bias_after_scale, + out_var->GetMutable()); + } else { + pten::ScaleKernel( + static_cast::TYPE&>(dev_ctx), + *in, scale, bias, bias_after_scale, out); + } } }; diff --git a/paddle/fluid/operators/scatter.cu.h b/paddle/fluid/operators/scatter.cu.h index 98311ff404b..13c08aea688 100644 --- a/paddle/fluid/operators/scatter.cu.h +++ b/paddle/fluid/operators/scatter.cu.h @@ -183,8 +183,7 @@ void GPUScatterGradForX(const platform::DeviceContext& ctx, const Tensor& index, int64_t max_grid_dimx = reinterpret_cast(ctx) - .GetCUDAMaxGridDimSize() - .x; + .GetCUDAMaxGridDimSize()[0]; int64_t grid = height < max_grid_dimx ? height : max_grid_dimx; ScatterInitCUDAKernel<<< diff --git a/paddle/fluid/operators/sign_op.cc b/paddle/fluid/operators/sign_op.cc index 3fd2a5bc5e4..cc7828c56c0 100644 --- a/paddle/fluid/operators/sign_op.cc +++ b/paddle/fluid/operators/sign_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/float16.h" @@ -60,7 +61,7 @@ class SignGradMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; DELCARE_INFER_SHAPE_FUNCTOR(sign, SignInferShapeFunctor, - PT_INFER_META(pten::UnchangedInferMetaNew)); + PT_INFER_META(pten::UnchangedInferMeta)); REGISTER_OPERATOR(sign, ops::SignOp, ops::SignOpMaker, ops::SignGradMaker, ops::SignGradMaker, diff --git a/paddle/fluid/operators/sign_op.h b/paddle/fluid/operators/sign_op.h deleted file mode 100644 index 41bcf9e8ae1..00000000000 --- a/paddle/fluid/operators/sign_op.h +++ /dev/null @@ -1,47 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/pten_utils.h" -#include "paddle/fluid/operators/eigen/eigen_function.h" - -#include "paddle/pten/kernels/sign_kernel.h" - -namespace paddle { -namespace operators { - -// See Note [ Why still keep the original kernel implementation? 
] -template -class SignKernel : public framework::OpKernel { - public: - virtual void Compute(const framework::ExecutionContext& context) const { - auto* x = context.Input("X"); - auto* out = context.Output("Out"); - auto& dev_ctx = context.device_context(); - out->mutable_data(x->place()); - - // call new kernel - pten::SignKernel::TYPE>( - static_cast::TYPE&>(dev_ctx), - *x, out); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc index 78e813edda9..cba779d0a77 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc @@ -40,7 +40,7 @@ class SoftmaxWithCrossEntropyOpMaker "The outputs value of softmax activation by given the input batch, " "which will be used in backward calculation.") .AsIntermediate(); -#ifdef PADDLE_WITH_ASCEND_CL +#if defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU) AddOutput( "Backprop", "(Tensor, default: Tensor), A tensor in same shape with " @@ -49,7 +49,7 @@ class SoftmaxWithCrossEntropyOpMaker "is :" "exp(logits -max_logits) / sum(exp(logits - max_logits)) - labels, " "where labels is ont-hot." - "Currently, the tensor is generated and used in npu kernel only. ") + "Currently, the tensor is generated and used in npu/mlu kernel. ") .AsIntermediate(); #endif AddOutput("Loss", @@ -131,7 +131,7 @@ class SoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(ctx->HasOutput("Softmax"), true, platform::errors::InvalidArgument( "Output(Softmax) should be not null.")); -#ifdef PADDLE_WITH_ASCEND_CL +#if defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU) PADDLE_ENFORCE_EQ(ctx->HasOutput("Backprop"), true, platform::errors::InvalidArgument( "Output(Backprop) should be not null.")); @@ -194,7 +194,7 @@ class SoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel { } ctx->SetOutputDim("Softmax", logits_dims); -#ifdef PADDLE_WITH_ASCEND_CL +#if defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU) ctx->SetOutputDim("Backprop", logits_dims); ctx->ShareLoD("Logits", /*->*/ "Backprop"); #endif @@ -225,7 +225,7 @@ class SoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(ctx->HasInput("Softmax"), true, platform::errors::InvalidArgument( "Input(Softmax) should be not null.")); -#ifdef PADDLE_WITH_ASCEND_CL +#if defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU) PADDLE_ENFORCE_EQ(ctx->HasInput("Backprop"), true, platform::errors::InvalidArgument( "Input(Backprop) should be not null.")); @@ -306,7 +306,7 @@ class SoftmaxGradMaker : public framework::SingleGradOpMaker { grad_op->SetType("softmax_with_cross_entropy_grad"); grad_op->SetInput("Label", this->Input("Label")); grad_op->SetInput("Softmax", this->Output("Softmax")); -#ifdef PADDLE_WITH_ASCEND_CL +#if defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU) grad_op->SetInput("Backprop", this->Output("Backprop")); #endif grad_op->SetInput(framework::GradVarName("Loss"), this->OutputGrad("Loss")); @@ -343,7 +343,7 @@ REGISTER_OP_CPU_KERNEL(softmax_with_cross_entropy_grad, ops::SoftmaxWithCrossEntropyGradKernel); REGISTER_OP_VERSION(softmax_with_cross_entropy) -#ifdef PADDLE_WITH_ASCEND_CL +#if defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU) .AddCheckpoint( R"ROC( Add a new attribute [use_softmax] )ROC", @@ -358,8 +358,7 @@ REGISTER_OP_VERSION(softmax_with_cross_entropy) "calculation is :" "exp(logits 
-max_logits) / sum(exp(logits - max_logits)) - labels, " "where labels is ont-hot." - "Currently, the tensor is generated and used in npu kernel " - "only. ")); + "Currently, the tensor is generated and used in npu/mlu kernel. ")); #else .AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op_mlu.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op_mlu.cc new file mode 100644 index 00000000000..0f14e6dabdb --- /dev/null +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op_mlu.cc @@ -0,0 +1,151 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/softmax_with_cross_entropy_op.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class SoftmaxWithCrossEntropyMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* logits = ctx.Input("Logits"); + auto* labels = ctx.Input("Label"); + auto* softmax = ctx.Output("Softmax"); + auto* loss = ctx.Output("Loss"); + auto* backprop = ctx.Output("Backprop"); + auto soft_label = ctx.Attr("soft_label"); + + PADDLE_ENFORCE_EQ(ctx.Attr("use_softmax"), true, + platform::errors::InvalidArgument( + "use_softmax=False is not supported in " + "the mlu kernel of softmax_with_cross_entropy.")); + + const int rank = logits->dims().size(); + const int axis = CanonicalAxis(ctx.Attr("axis"), rank); + + loss->mutable_data(ctx.GetPlace()); + backprop->mutable_data(ctx.GetPlace()); + softmax->mutable_data(ctx.GetPlace()); + + // cnnl softmax only support 3-dims, regard all shape as [d1, d2, d3] + const int cnnl_softmax_dims = 3; + const int d1 = SizeToAxis(axis, logits->dims()); + const int d2_logits = logits->dims()[axis]; + const int d2_labels = labels->dims()[axis]; + const int d3 = SizeOutAxis(axis, logits->dims()); + + // CNNL_SOFTMAX_MODE_LOW_DIMENSION has better perfermence, use it as much as + // possible. 
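    // Illustration only, not part of the patch: the [d1, d2, d3] values
    // computed above collapse every dimension before `axis` into d1 and every
    // dimension after `axis` into d3, matching what SizeToAxis/SizeOutAxis
    // return, and are then used in the regard_*_shape vectors below. A minimal
    // standalone sketch of that collapse over a plain shape vector (assuming
    // <vector> is available here):
    auto collapse_around_axis = [](const std::vector<int64_t>& dims, int ax) {
      std::vector<int64_t> view{1, dims[ax], 1};
      for (int i = 0; i < ax; ++i) view[0] *= dims[i];
      for (size_t i = ax + 1; i < dims.size(); ++i) view[2] *= dims[i];
      return view;  // e.g. dims = {8, 16, 10, 10}, ax = 1  ->  {8, 16, 100}
    };
    (void)collapse_around_axis;  // unused; documents the 3-D view only.
    // In the {8, 16, 10, 10} example view[2] != 1, so the kernel below falls
    // back to CNNL_SOFTMAX_MODE_MEDIUM_DIMENSION.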
+ cnnlSoftmaxMode_t mode = CNNL_SOFTMAX_MODE_LOW_DIMENSION; + std::vector regard_logits_shape{d1, 1, d2_logits}; + std::vector regard_labels_shape{d1, 1, d2_labels}; + std::vector regard_loss_shape{d1, 1, 1}; + if (d3 != 1) { + mode = CNNL_SOFTMAX_MODE_MEDIUM_DIMENSION; + regard_logits_shape = {d1, d2_logits, d3}; + regard_labels_shape = {d1, d2_labels, d3}; + regard_loss_shape = {d1, 1, d3}; + } + + MLUCnnlTensorDesc logits_desc(cnnl_softmax_dims, regard_logits_shape.data(), + ToCnnlDataType()); + MLUCnnlTensorDesc labels_desc(cnnl_softmax_dims, regard_labels_shape.data(), + ToCnnlDataType()); + MLUCnnlTensorDesc loss_desc(cnnl_softmax_dims, regard_loss_shape.data(), + ToCnnlDataType()); + + const cnnlSoftmaxAlgorithm_t algo = CNNL_SOFTMAX_ACCURATE; + MLUCnnl::SoftmaxForward(ctx, algo, mode, NULL, logits_desc.get(), + GetBasePtr(logits), NULL, logits_desc.get(), + GetBasePtr(softmax)); + + if (soft_label) { + const cnnlComputationPreference_t prefer = + CNNL_COMPUTATION_HIGH_PRECISION; + MLUCnnl::SoftmaxCrossEntropyWithLogits( + ctx, mode, prefer, logits_desc.get(), GetBasePtr(logits), + labels_desc.get(), GetBasePtr(labels), loss_desc.get(), + GetBasePtr(loss), logits_desc.get(), GetBasePtr(backprop)); + } else { + PADDLE_ENFORCE_EQ(d3, 1, + platform::errors::InvalidArgument( + "If soft_label=False, axis must be -1 or" + " can be regard as last dimention in mlu kernel.")); + framework::Tensor labels_int32(VT::INT32); + labels_int32.Resize(labels->dims()); + labels_int32.mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc labels_int64_desc(*labels); + MLUCnnlTensorDesc labels_int32_desc(labels_int32); + cnnlCastDataType_t cast_type = GetCastDataType(VT::INT64, VT::INT32); + MLUCnnl::Cast(ctx, cast_type, labels_int64_desc.get(), GetBasePtr(labels), + labels_int32_desc.get(), GetBasePtr(&labels_int32)); + + const int regard_sparse_shape[cnnl_softmax_dims - 1] = {d1, 1}; + MLUCnnlTensorDesc sparse_labels_desc(cnnl_softmax_dims - 1, + regard_sparse_shape, + ToCnnlDataType()); + MLUCnnlTensorDesc sparse_loss_desc( + cnnl_softmax_dims - 1, regard_sparse_shape, ToCnnlDataType()); + + MLUCnnl::SparseSoftmaxXentWithLogits( + ctx, mode, logits_desc.get(), GetBasePtr(logits), + sparse_labels_desc.get(), GetBasePtr(&labels_int32), + sparse_loss_desc.get(), GetBasePtr(loss), logits_desc.get(), + GetBasePtr(backprop)); + } + } +}; + +template +class SoftmaxWithCrossEntropyGradMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* backprop = ctx.Input("Backprop"); + auto* loss_grad = ctx.Input(framework::GradVarName("Loss")); + auto* logits_grad = ctx.Output(framework::GradVarName("Logits")); + PADDLE_ENFORCE_NOT_NULL(backprop, + platform::errors::PreconditionNotMet( + "backprop should not be null in MLU kernel of " + "softmax_with_cross_entropy_grad.")); + logits_grad->mutable_data(ctx.GetPlace()); + + MLUCnnlOpTensorDesc mul_op_desc(CNNL_OP_TENSOR_MUL, ToCnnlDataType(), + CNNL_NOT_PROPAGATE_NAN); + MLUCnnlTensorDesc backprop_desc(*backprop); + MLUCnnlTensorDesc loss_grad_desc(*loss_grad); + MLUCnnlTensorDesc logits_grad_desc(*logits_grad); + MLUCnnl::OpTensor(ctx, mul_op_desc.get(), backprop_desc.get(), + GetBasePtr(backprop), loss_grad_desc.get(), + GetBasePtr(loss_grad), logits_grad_desc.get(), + GetBasePtr(logits_grad), ToCnnlDataType()); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_MLU_KERNEL( + softmax_with_cross_entropy, 
ops::SoftmaxWithCrossEntropyMLUKernel, + ops::SoftmaxWithCrossEntropyMLUKernel); +REGISTER_OP_MLU_KERNEL( + softmax_with_cross_entropy_grad, + ops::SoftmaxWithCrossEntropyGradMLUKernel, + ops::SoftmaxWithCrossEntropyGradMLUKernel); diff --git a/paddle/fluid/operators/solve_op.h b/paddle/fluid/operators/solve_op.h index 0acef78484c..ea3a5aa5af9 100644 --- a/paddle/fluid/operators/solve_op.h +++ b/paddle/fluid/operators/solve_op.h @@ -46,7 +46,8 @@ void ReduceSumForSolve(const Tensor* input, Tensor* output, #if defined(__NVCC__) || defined(__HIPCC__) auto stream = ctx.cuda_device_context().stream(); TensorReduceFunctorImpl>( - *input, output, kps::IdentityFunctor(), reduce_dims, stream); + ctx.cuda_device_context(), *input, output, kps::IdentityFunctor(), + reduce_dims, stream); #else ReduceKernelFunctor( input, output, reduce_dims, keep_dim, false, ctx) diff --git a/paddle/fluid/operators/strided_memcpy_test.cc b/paddle/fluid/operators/strided_memcpy_test.cc index 1ab036e8692..514bdac9c55 100644 --- a/paddle/fluid/operators/strided_memcpy_test.cc +++ b/paddle/fluid/operators/strided_memcpy_test.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/fluid/operators/strided_memcpy.h" #include "gtest/gtest.h" +#include "paddle/fluid/memory/allocation/allocator_facade.h" namespace paddle { namespace operators { @@ -86,6 +87,10 @@ TEST(StridedMemcpy, GPUCrop) { platform::CPUPlace cpu; platform::CUDADeviceContext ctx(gpu0); + ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(gpu0, ctx.stream()) + .get()); + ctx.PartialInitWithAllocator(); auto src_allocation = memory::Alloc(gpu0, sizeof(src)); @@ -124,6 +129,10 @@ TEST(StridedMemcpy, GPUConcat) { platform::CUDAPlace gpu0(0); platform::CPUPlace cpu; platform::CUDADeviceContext ctx(gpu0); + ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(gpu0, ctx.stream()) + .get()); + ctx.PartialInitWithAllocator(); auto gpu_src_allocation = memory::Alloc(gpu0, sizeof(src)); int* gpu_src = reinterpret_cast(gpu_src_allocation->ptr()); memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream()); diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc index d2d04a4fa50..90d489c8df7 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc @@ -37,6 +37,10 @@ void CreateCUDATensor(framework::Scope* scope, const std::string& name, tensor->Resize(dims); platform::CUDAPlace place; platform::CUDADeviceContext ctx(place); + ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place, ctx.stream()) + .get()); + ctx.PartialInitWithAllocator(); inference::tensorrt::RandomizeTensor(tensor, place, ctx); } @@ -133,6 +137,10 @@ void DynamicShapeTest(bool allow_build_at_runtime) { framework::Scope scope; platform::CUDAPlace place; platform::CUDADeviceContext ctx(place); + ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place, ctx.stream()) + .get()); + ctx.PartialInitWithAllocator(); // Prepare variables. 
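  // Illustration only, not part of the patch: the construct-context,
  // SetAllocator, PartialInitWithAllocator sequence added just above recurs in
  // every test this patch touches. Assuming only the interfaces already used
  // in this hunk (plus <memory> for std::make_unique), the pattern could be
  // factored into a small helper such as:
  auto make_inited_cuda_ctx = [](const platform::CUDAPlace& p) {
    auto c = std::make_unique<platform::CUDADeviceContext>(p);
    c->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                        .GetAllocator(p, c->stream())
                        .get());
    c->PartialInitWithAllocator();
    return c;
  };
  (void)make_inited_cuda_ctx;  // sketch only; the tests keep the steps inline.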
if (allow_build_at_runtime) CreateCUDATensor(&scope, "x", std::vector({3, 4, 1, 1})); @@ -159,6 +167,10 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { framework::Scope scope; platform::CUDAPlace place; platform::CUDADeviceContext ctx(place); + ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place, ctx.stream()) + .get()); + ctx.PartialInitWithAllocator(); auto* block_ = program.Proto()->add_blocks(); block_->set_idx(0); diff --git a/paddle/fluid/operators/top_k_function_cuda.h b/paddle/fluid/operators/top_k_function_cuda.h index 05ae5c9188c..c077411e496 100644 --- a/paddle/fluid/operators/top_k_function_cuda.h +++ b/paddle/fluid/operators/top_k_function_cuda.h @@ -411,7 +411,7 @@ bool SortTopk(const platform::CUDADeviceContext& ctx, }; int block_size = ComputeBlockSize(num_cols); - unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize().x; + unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0]; // actually, int num_rows < max_grid_size unsigned int grid_size = num_rows < maxGridDimX ? static_cast(num_rows) diff --git a/paddle/fluid/operators/trace_op.cu b/paddle/fluid/operators/trace_op.cu index 98a77637f92..0b9e615eece 100644 --- a/paddle/fluid/operators/trace_op.cu +++ b/paddle/fluid/operators/trace_op.cu @@ -40,7 +40,8 @@ class TraceCUDAKernel : public framework::OpKernel { std::vector reduce_dims; reduce_dims.push_back(out->dims().size()); TensorReduceFunctorImpl>( - diag, out, kps::IdentityFunctor(), reduce_dims, stream); + context.cuda_device_context(), diag, out, kps::IdentityFunctor(), + reduce_dims, stream); } else { math::SetConstant functor; functor(context.device_context(), out, static_cast(0)); diff --git a/paddle/fluid/operators/triangular_solve_op.cu b/paddle/fluid/operators/triangular_solve_op.cu index b7ea5cd9531..28cdc56e2ae 100644 --- a/paddle/fluid/operators/triangular_solve_op.cu +++ b/paddle/fluid/operators/triangular_solve_op.cu @@ -45,7 +45,8 @@ class MatrixReduceSumFunctor { } gpuStream_t stream = ctx.cuda_device_context().stream(); TensorReduceFunctorImpl>( - in, out, kps::IdentityFunctor(), out_reduce_dims, stream); + ctx.cuda_device_context(), in, out, kps::IdentityFunctor(), + out_reduce_dims, stream); } }; diff --git a/paddle/fluid/operators/viterbi_decode_op.cu b/paddle/fluid/operators/viterbi_decode_op.cu index 7ad3335009b..b52f46e4a8a 100644 --- a/paddle/fluid/operators/viterbi_decode_op.cu +++ b/paddle/fluid/operators/viterbi_decode_op.cu @@ -148,7 +148,7 @@ struct Argmax { } const auto& dev_ctx = ctx.cuda_device_context(); auto cu_stream = dev_ctx.stream(); - int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize().x; + int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0]; int64_t height = pre * post; int64_t width = n; int64_t grid_size = height < max_grid_dimx ? 
height : max_grid_dimx; diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 7dc07942d44..07d3e419582 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -133,7 +133,12 @@ if(WITH_ASCEND_CL) target_link_libraries(collective_helper npu_collective_helper) endif() +if(WITH_CNCL) + target_link_libraries(collective_helper mlu_collective_helper) +endif() + if(WITH_GPU OR WITH_ROCM) + target_link_libraries(device_context gpu_info gpu_context pten_gpu_info) target_link_libraries(device_context gpu_resource_pool) endif() @@ -177,13 +182,13 @@ add_subdirectory(profiler) cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS}) if(WITH_GPU) - nv_library(profiler SRCS profiler.cc profiler.cu DEPS host_event_recorder os_info device_tracer gpu_info enforce dynload_cuda) + nv_library(profiler SRCS profiler.cc profiler.cu DEPS os_info device_tracer gpu_info enforce dynload_cuda) nv_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place) elseif(WITH_ROCM) - hip_library(profiler SRCS profiler.cc profiler.cu DEPS host_event_recorder os_info device_tracer gpu_info enforce) + hip_library(profiler SRCS profiler.cc profiler.cu DEPS os_info device_tracer gpu_info enforce) hip_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place) else() - cc_library(profiler SRCS profiler.cc DEPS host_event_recorder os_info device_tracer enforce) + cc_library(profiler SRCS profiler.cc DEPS os_info device_tracer enforce) cc_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info place) endif() diff --git a/paddle/fluid/platform/bfloat16_test.cu b/paddle/fluid/platform/bfloat16_test.cu index dbbb72920a5..a82043cd7c4 100644 --- a/paddle/fluid/platform/bfloat16_test.cu +++ b/paddle/fluid/platform/bfloat16_test.cu @@ -66,6 +66,10 @@ TEST(bfloat16, lod_tensor_on_gpu) { // CPU LoDTensor to GPU LoDTensor CUDAPlace gpu_place(0); CUDADeviceContext gpu_ctx(gpu_place); + gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(gpu_place, gpu_ctx.stream()) + .get()); + gpu_ctx.PartialInitWithAllocator(); framework::TensorCopy(src_tensor, gpu_place, gpu_ctx, &gpu_tensor); // GPU LoDTensor to CPU LoDTensor diff --git a/paddle/fluid/platform/collective_helper.cc b/paddle/fluid/platform/collective_helper.cc index dd2dc9a4079..ae1df10c45f 100644 --- a/paddle/fluid/platform/collective_helper.cc +++ b/paddle/fluid/platform/collective_helper.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/platform/collective_helper.h" #include +#include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/device/gpu/gpu_resource_pool.h" @@ -187,6 +188,18 @@ NCCLComm* NCCLCommContext::AssignNCCLComm(ncclComm_t comm, int nranks, int rank, int dev_id, int ring_id) { std::unique_ptr dev_ctx( new CUDADeviceContext(CUDAPlace(dev_id))); + dev_ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(CUDAPlace(dev_id), dev_ctx->stream()) + .get()); + dev_ctx->SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx->SetZeroAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetZeroAllocator(CUDAPlace(dev_id)) + .get()); + dev_ctx->PartialInitWithAllocator(); std::shared_ptr compute_event( 
platform::CudaEventResourcePool::Instance().New(dev_id)); @@ -329,7 +342,7 @@ BKCLComm* BKCLCommContext::AssignBKCLComm(BKCLContext_t comm, int nranks, auto* dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get( platform::XPUPlace(dev_id))); - dev_ctx->set_bkcl_context(comm); + dev_ctx->SetBkclContext(comm); } return comm_map_[ring_id][dev_id].get(); diff --git a/paddle/fluid/platform/collective_helper.h b/paddle/fluid/platform/collective_helper.h index 62a07669259..2c0067bb152 100644 --- a/paddle/fluid/platform/collective_helper.h +++ b/paddle/fluid/platform/collective_helper.h @@ -24,6 +24,9 @@ #include "paddle/fluid/platform/device/npu/dynload/hccl.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" +#if defined(PADDLE_WITH_CNCL) +#include "paddle/fluid/platform/device/mlu/device_context.h" +#endif namespace paddle { namespace platform { @@ -333,5 +336,102 @@ class BKCLCommContext { }; #endif +#if defined(PADDLE_WITH_CNCL) +// In order to apply hierarchical communication with CNCL, we need +// a communication ring contains CNCL communicators associated to a global +// cnclUniqueId. E.g. for a hierarchical case, +// +// 11 - 12 21 - 22 +// | | | | +// 13 - 14 - 23 - 24 +// | | +// 31 - 32 - 41 - 42 +// | | | | +// 33 - 34 43 - 44 +// +// we group (14,23,32,41) as the top, and (11,12,13,14), (21,22,23,24), +// (31,32,33,34), (41,42,43,44) as bottoms respectively. +// +// We could also use a single communication ring for the flatten case +// +// The CNCLComm instance is created and reversed in the CNCLCommContext +// singleton with a global user specified group id. +class MLUDeviceContext; + +class CNCLComm { + public: + virtual int ring_id() const = 0; + virtual int nranks() const = 0; + virtual int rank() const = 0; + virtual int device_id() const = 0; + virtual cnclComm_t comm() const = 0; + virtual mluStream stream() const = 0; + virtual MLUDeviceContext* dev_context() const = 0; + virtual ~CNCLComm() = default; +}; + +// A singleton CNCL communicator context reserves communication ring ids +class CNCLCommContext { + public: + static CNCLCommContext& Instance() { + static CNCLCommContext comm_ctx; + return comm_ctx; + } + + CNCLComm* CreateComm(cnclCliqueId* cncl_id, int nranks, int rank, int dev_id, + int ring_id = 0); + void CreateAllCNCLComms(const std::vector& dev_ids, int ring_id = 0); + + // a latter comm with the same dev_id and the same ring_id + // will override the former + CNCLComm* AssignCNCLComm(cnclComm_t comm, int nranks, int rank, int dev_id, + int ring_id = 0); + + // retrieve a communicator by the ring id in multiprocessing mode + CNCLComm* Get(int ring_id) const { + PADDLE_ENFORCE_GT( + comm_map_.count(ring_id), 0, + platform::errors::InvalidArgument( + "Communicator in ring id %d has not been initialized.", ring_id)); + PADDLE_ENFORCE_EQ(comm_map_.at(ring_id).size(), 1, + platform::errors::InvalidArgument( + "One device id should be specified to retrieve from " + "multiple communicators.")); + return comm_map_.at(ring_id).begin()->second.get(); + } + + // retrieve a communicator by the ring id and the device id + CNCLComm* Get(int ring_id, int dev_id) const { + PADDLE_ENFORCE_GT( + comm_map_.count(ring_id), 0, + platform::errors::InvalidArgument( + "Communicator of ring id %d has not been initialized.", ring_id)); + PADDLE_ENFORCE_GT( + comm_map_.at(ring_id).count(dev_id), 0, + platform::errors::InvalidArgument( + "Communicator at device id %d has not been initialized in ring %d.", + dev_id, ring_id)); + 
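    // Usage sketch (illustration only, not part of the patch): once a ring has
    // been set up through CreateComm or CreateAllCNCLComms, an MLU collective
    // op would typically fetch its communicator from this singleton, e.g.
    //
    //   auto* comm = platform::CNCLCommContext::Instance().Get(ring_id, dev_id);
    //   cnclComm_t raw_comm = comm->comm();      // handle passed to cncl calls
    //   mluStream mlu_stream = comm->stream();   // stream bound to the comm
    //
    // mirroring how NCCLCommContext is used on the CUDA side.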
return comm_map_.at(ring_id).at(dev_id).get(); + } + + // retrieve a communicator by the ring id and place + CNCLComm* Get(int ring_id, Place place) const { + return Get(ring_id, place.device); + } + + private: + std::once_flag once_flag_; + std::mutex comm_map_mutex_; + // ring id to dev-CNCLComm + std::map>> comm_map_; + + void ReleaseCNCLComms(); + + CNCLCommContext() = default; + DISABLE_COPY_AND_ASSIGN(CNCLCommContext); +}; + +#endif + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/device/gpu/CMakeLists.txt b/paddle/fluid/platform/device/gpu/CMakeLists.txt index 5cf2258204f..00f0cc2ac92 100644 --- a/paddle/fluid/platform/device/gpu/CMakeLists.txt +++ b/paddle/fluid/platform/device/gpu/CMakeLists.txt @@ -1,12 +1,12 @@ IF(WITH_GPU) add_subdirectory(cuda) - nv_library(gpu_info SRCS gpu_info.cc DEPS cuda_info gflags glog enforce monitor dynload_cuda) + nv_library(gpu_info SRCS gpu_info.cc DEPS pten_gpu_info gflags glog enforce monitor dynload_cuda) nv_test(cuda_helper_test SRCS cuda_helper_test.cu) nv_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda) ELSEIF(WITH_ROCM) add_subdirectory(rocm) - hip_library(gpu_info SRCS gpu_info.cc DEPS rocm_info gflags glog enforce monitor dynload_cuda) + hip_library(gpu_info SRCS gpu_info.cc DEPS pten_gpu_info gflags glog enforce monitor dynload_cuda) hip_test(cuda_helper_test SRCS cuda_helper_test.cu) hip_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda) diff --git a/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt b/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt index 5df1de1b00f..8f7fd3dcbc0 100644 --- a/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt +++ b/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt @@ -1,4 +1,3 @@ -nv_library(cuda_info SRCS cuda_info.cc DEPS gflags glog enforce monitor dynload_cuda) nv_library(cuda_graph SRCS cuda_graph.cc DEPS enforce allocator_facade) nv_library(cuda_profiler SRCS cuda_profiler.cc DEPS enforce) diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h b/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h index 3199af9c975..ab7d474c1ac 100644 --- a/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h +++ b/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h @@ -14,8 +14,10 @@ #pragma once +#include #include // NOLINT +#include "paddle/fluid/platform/device/gpu/gpu_types.h" #include "paddle/fluid/platform/dynload/cublas.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/macros.h" @@ -96,8 +98,7 @@ class CublasHandleHolder { PADDLE_RETRY_CUDA_SUCCESS(dynload::cublasDestroy(handle_)); } - template - inline void Call(Callback&& callback) const { + inline void Call(const std::function& callback) const { std::lock_guard guard(mtx_); callback(handle_); } diff --git a/paddle/fluid/platform/device/gpu/cuda/cusparse_helper.h b/paddle/fluid/platform/device/gpu/cuda/cusparse_helper.h index 43da9bb1fb4..cc2b7349d50 100644 --- a/paddle/fluid/platform/device/gpu/cuda/cusparse_helper.h +++ b/paddle/fluid/platform/device/gpu/cuda/cusparse_helper.h @@ -14,11 +14,13 @@ limitations under the License. 
*/ #pragma once +#include #include // NOLINT #include "paddle/fluid/platform/dynload/cusparse.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/macros.h" +#include "paddle/pten/backends/gpu/gpu_decls.h" namespace paddle { namespace platform { @@ -45,8 +47,8 @@ class CusparseHandleHolder { #endif } - template - inline void Call(Callback&& callback) const { + inline void Call( + const std::function& callback) const { std::lock_guard guard(mtx_); callback(handle_); } diff --git a/paddle/fluid/platform/device/gpu/gpu_info.cc b/paddle/fluid/platform/device/gpu/gpu_info.cc index e09d07a6e39..59fb26e696e 100644 --- a/paddle/fluid/platform/device/gpu/gpu_info.cc +++ b/paddle/fluid/platform/device/gpu/gpu_info.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include #include #include #include @@ -39,11 +40,12 @@ limitations under the License. */ #include "paddle/fluid/platform/place.h" #include "paddle/fluid/string/split.h" +#include "paddle/pten/backends/gpu/gpu_info.h" + DECLARE_double(fraction_of_gpu_memory_to_use); DECLARE_uint64(initial_gpu_memory_in_mb); DECLARE_uint64(reallocate_gpu_memory_in_mb); DECLARE_bool(enable_cublas_tensor_op_math); -DECLARE_string(selected_gpus); DECLARE_uint64(gpu_memory_limit_mb); constexpr static float fraction_reserve_gpu_memory = 0.05f; @@ -51,23 +53,6 @@ constexpr static float fraction_reserve_gpu_memory = 0.05f; USE_GPU_MEM_STAT; namespace paddle { namespace platform { -//! Get a list of device ids from environment variable or use all. -std::vector GetSelectedDevices() { - // use user specified GPUs in single-node multi-process mode. - std::vector devices; - if (!FLAGS_selected_gpus.empty()) { - auto devices_str = paddle::string::Split(FLAGS_selected_gpus, ','); - for (auto id : devices_str) { - devices.push_back(atoi(id.c_str())); - } - } else { - int count = GetGPUDeviceCount(); - for (int i = 0; i < count; ++i) { - devices.push_back(i); - } - } - return devices; -} void GpuMemoryUsage(size_t *available, size_t *total) { size_t actual_available, actual_total; @@ -244,17 +229,20 @@ class RecordedGpuMallocHelper { #endif } -#ifdef PADDLE_WITH_TESTING void *GetBasePtr(void *ptr) { +#ifdef PADDLE_WITH_TESTING auto it = gpu_ptrs.upper_bound(ptr); - if (it == gpu_ptrs.begin()) { return nullptr; } - return *(--it); - } +#else + PADDLE_THROW(platform::errors::Unimplemented( + "The RecordedGpuMallocHelper::GetBasePtr is only implemented with " + "testing, should not use for release.")); + return nullptr; #endif + } bool GetMemInfo(size_t *avail, size_t *total, size_t *actual_avail, size_t *actual_total) { @@ -375,11 +363,95 @@ void EmptyCache(void) { } } -#ifdef PADDLE_WITH_TESTING void *GetGpuBasePtr(void *ptr, int dev_id) { return RecordedGpuMallocHelper::Instance(dev_id)->GetBasePtr(ptr); } -#endif + +int DnnVersion() { return pten::backends::gpu::DnnVersion(); } + +int GetGPUDeviceCount() { return pten::backends::gpu::GetGPUDeviceCount(); } + +int GetGPUComputeCapability(int id) { + return pten::backends::gpu::GetGPUComputeCapability(id); +} + +int GetGPURuntimeVersion(int id) { + return pten::backends::gpu::GetGPURuntimeVersion(id); +} + +int GetGPUDriverVersion(int id) { + return pten::backends::gpu::GetGPUDriverVersion(id); +} + +bool TensorCoreAvailable() { + return pten::backends::gpu::TensorCoreAvailable(); +} + +int GetGPUMultiProcessors(int id) { + return 
pten::backends::gpu::GetGPUMultiProcessors(id); +} + +int GetGPUMaxThreadsPerMultiProcessor(int id) { + return pten::backends::gpu::GetGPUMaxThreadsPerMultiProcessor(id); +} + +int GetGPUMaxThreadsPerBlock(int id) { + return pten::backends::gpu::GetGPUMaxThreadsPerBlock(id); +} + +int GetCurrentDeviceId() { return pten::backends::gpu::GetCurrentDeviceId(); } + +std::array GetGpuMaxGridDimSize(int id) { + return pten::backends::gpu::GetGpuMaxGridDimSize(id); +} + +std::vector GetSelectedDevices() { + return pten::backends::gpu::GetSelectedDevices(); +} + +const gpuDeviceProp &GetDeviceProperties(int id) { + return pten::backends::gpu::GetDeviceProperties(id); +} + +void SetDeviceId(int device_id) { pten::backends::gpu::SetDeviceId(device_id); } + +gpuError_t GpuGetLastError() { return pten::backends::gpu::GpuGetLastError(); } + +void GpuStreamSync(gpuStream_t stream) { + pten::backends::gpu::GpuStreamSync(stream); +} + +void GpuDestroyStream(gpuStream_t stream) { + pten::backends::gpu::GpuDestroyStream(stream); +} + +void GpuDeviceSync() { pten::backends::gpu::GpuDeviceSync(); } + +void GpuMemcpyAsync(void *dst, const void *src, size_t count, + gpuMemcpyKind kind, gpuStream_t stream) { + pten::backends::gpu::GpuMemcpyAsync(dst, src, count, kind, stream); +} + +void GpuMemcpySync(void *dst, const void *src, size_t count, + gpuMemcpyKind kind) { + pten::backends::gpu::GpuMemcpySync(dst, src, count, kind); +} + +void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src, + int src_device, size_t count, gpuStream_t stream) { + pten::backends::gpu::GpuMemcpyPeerAsync(dst, dst_device, src, src_device, + count, stream); +} + +void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src, + int src_device, size_t count) { + pten::backends::gpu::GpuMemcpyPeerSync(dst, dst_device, src, src_device, + count); +} + +void GpuMemsetAsync(void *dst, int value, size_t count, gpuStream_t stream) { + pten::backends::gpu::GpuMemsetAsync(dst, value, count, stream); +} } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/device/gpu/gpu_info.h b/paddle/fluid/platform/device/gpu/gpu_info.h index 9bc4d70bc45..f6fb2ad8ce7 100644 --- a/paddle/fluid/platform/device/gpu/gpu_info.h +++ b/paddle/fluid/platform/device/gpu/gpu_info.h @@ -14,6 +14,7 @@ limitations under the License. */ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include +#include #include #include @@ -52,7 +53,7 @@ int GetGPUMaxThreadsPerBlock(int id); int GetCurrentDeviceId(); //! Get the maximum GridDim size for GPU buddy allocator. -dim3 GetGpuMaxGridDimSize(int); +std::array GetGpuMaxGridDimSize(int); //! Get a list of device ids from environment variable or use all. std::vector GetSelectedDevices(); @@ -110,7 +111,7 @@ void GpuStreamSync(gpuStream_t stream); void GpuDestroyStream(gpuStream_t stream); // ! Blocks until device has completed all operations. -void GpuDeviceync(); +void GpuDeviceSync(); //! CudaMalloc with recorded info gpuError_t RecordedGpuMalloc(void **ptr, size_t size, int dev_id); @@ -145,10 +146,9 @@ bool IsGpuMallocRecorded(int dev_id); //! Empty idle cached memory held by the allocator. void EmptyCache(void); -//! Get the primitive pointer return from cudaMalloc, just for testing -#ifdef PADDLE_WITH_TESTING +//! Get the primitive pointer return from cudaMalloc, just implemented with +//! 
testing, do not use for release void *GetGpuBasePtr(void *ptr, int dev_id); -#endif } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/device/gpu/nccl_helper.h b/paddle/fluid/platform/device/gpu/nccl_helper.h index 261916b2555..1d6ccdc1280 100644 --- a/paddle/fluid/platform/device/gpu/nccl_helper.h +++ b/paddle/fluid/platform/device/gpu/nccl_helper.h @@ -83,8 +83,21 @@ struct NCCLContext { std::unique_ptr ctx_; ncclComm_t comm_; - explicit NCCLContext(int dev_id) - : ctx_(new CUDADeviceContext(CUDAPlace(dev_id))), comm_{nullptr} {} + explicit NCCLContext(int dev_id) : comm_{nullptr} { + ctx_.reset(new CUDADeviceContext(CUDAPlace(dev_id))); + ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(CUDAPlace(dev_id), ctx_->stream()) + .get()); + ctx_->SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + ctx_->SetZeroAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetZeroAllocator(CUDAPlace(dev_id)) + .get()); + ctx_->PartialInitWithAllocator(); + } gpuStream_t stream() const { return ctx_->stream(); } ncclComm_t comm() const { return comm_; } diff --git a/paddle/fluid/platform/device/gpu/rocm/CMakeLists.txt b/paddle/fluid/platform/device/gpu/rocm/CMakeLists.txt index 86b9ecd5f54..988807258c1 100644 --- a/paddle/fluid/platform/device/gpu/rocm/CMakeLists.txt +++ b/paddle/fluid/platform/device/gpu/rocm/CMakeLists.txt @@ -1,3 +1 @@ -hip_library(rocm_info SRCS rocm_info.cc DEPS gflags glog enforce monitor dynload_cuda) - hip_test(miopen_helper_test SRCS miopen_helper_test.cc DEPS dynload_cuda) diff --git a/paddle/fluid/platform/device/mlu/CMakeLists.txt b/paddle/fluid/platform/device/mlu/CMakeLists.txt index e8b794a03e3..724776bfad2 100644 --- a/paddle/fluid/platform/device/mlu/CMakeLists.txt +++ b/paddle/fluid/platform/device/mlu/CMakeLists.txt @@ -8,3 +8,4 @@ cc_library(mlu_info SRCS mlu_info.cc DEPS enforce glog monitor neuware_lib) cc_library(mlu_stream SRCS mlu_stream.cc DEPS boost mlu_info stream_callback_manager eigen3 ${MKLDNN_CTX_DEPS}) cc_library(mlu_device_context SRCS device_context.cc DEPS mlu_stream) cc_test(mlu_device_context_test SRCS device_context_test.cc DEPS mlu_device_context) +cc_library(mlu_collective_helper SRCS mlu_collective_helper.cc DEPS mlu_stream mlu_info) diff --git a/paddle/fluid/platform/device/mlu/cncl_helper.h b/paddle/fluid/platform/device/mlu/cncl_helper.h new file mode 100644 index 00000000000..2f9bed01426 --- /dev/null +++ b/paddle/fluid/platform/device/mlu/cncl_helper.h @@ -0,0 +1,57 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#ifdef PADDLE_WITH_CNCL +#include + +#include +#include +#include +#include // NOLINT +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/device/mlu/enforce.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace platform { + +inline cnclDataType_t ToCNCLDataType(framework::proto::VarType::Type type) { + if (type == framework::proto::VarType::FP32) { + return cnclFloat32; + } else if (type == framework::proto::VarType::FP16) { + return cnclFloat16; + } else if (type == framework::proto::VarType::INT32) { + return cnclInt32; + } else if (type == framework::proto::VarType::INT16) { + return cnclInt16; + } else if (type == framework::proto::VarType::INT8) { + return cnclInt8; + } else if (type == framework::proto::VarType::UINT8) { + return cnclUint8; + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "This datatype in cncl is not supported.")); + } +} + +} // namespace platform +} // namespace paddle +#endif diff --git a/paddle/fluid/platform/device/mlu/device_context.h b/paddle/fluid/platform/device/mlu/device_context.h index 2692f3a248a..a3f3bda17c8 100644 --- a/paddle/fluid/platform/device/mlu/device_context.h +++ b/paddle/fluid/platform/device/mlu/device_context.h @@ -15,6 +15,9 @@ limitations under the License. */ #include "paddle/fluid/platform/device/mlu/enforce.h" #include "paddle/fluid/platform/device/mlu/mlu_stream.h" #include "paddle/fluid/platform/device_context.h" +#ifdef PADDLE_WITH_CNCL +#include +#endif namespace Eigen { struct DefaultDevice; @@ -88,6 +91,14 @@ class MLUDeviceContext : public DeviceContext { /*! \brief Return mlu stream in the device context. */ mluStream stream() const; +#ifdef PADDLE_WITH_CNCL + /*! \brief Return cncl communicators. */ + cnclComm_t cncl_comm() const { return cncl_comm_; } + + /*! \brief Set cncl communicators. */ + void set_cncl_comm(cnclComm_t comm) { cncl_comm_ = comm; } +#endif + template void RecordEvent(mluEventHandle ev, Callback callback) const { return context()->Stream()->RecordEvent(ev, callback); @@ -132,6 +143,10 @@ class MLUDeviceContext : public DeviceContext { thread_ctx_; static thread_local std::mutex ctx_mtx_; +#ifdef PADDLE_WITH_CNCL + cnclComm_t cncl_comm_{nullptr}; +#endif + DISABLE_COPY_AND_ASSIGN(MLUDeviceContext); }; diff --git a/paddle/fluid/platform/device/mlu/enforce.h b/paddle/fluid/platform/device/mlu/enforce.h index eecbad53cab..5c9871d7bce 100644 --- a/paddle/fluid/platform/device/mlu/enforce.h +++ b/paddle/fluid/platform/device/mlu/enforce.h @@ -42,6 +42,9 @@ struct MLUStatusType {}; DEFINE_MLU_STATUS_TYPE(cnrtStatus, cnrtSuccess, CNRT); DEFINE_MLU_STATUS_TYPE(cnnlStatus, CNNL_STATUS_SUCCESS, CNNL); DEFINE_MLU_STATUS_TYPE(cnStatus, CN_SUCCESS, CN); +#ifdef PADDLE_WITH_CNCL +DEFINE_MLU_STATUS_TYPE(cnclStatus, CNCL_RET_SUCCESS, CNCL); +#endif } // namespace details @@ -80,6 +83,17 @@ inline std::string build_mlu_error_msg(cnStatus stat) { return sout.str(); } +/*************** CNCL ERROR ***************/ +#ifdef PADDLE_WITH_CNCL +inline bool is_error(cnclStatus e) { return e != CNCL_RET_SUCCESS; } + +inline std::string build_mlu_error_msg(cnclStatus e) { + std::ostringstream sout; + sout << "MLU CNCL error(" << e << "), " << cnclGetErrorStr(e) << ". 
"; + return sout.str(); +} +#endif + #define PADDLE_ENFORCE_MLU_SUCCESS(COND) \ do { \ auto __cond__ = (COND); \ diff --git a/paddle/fluid/platform/device/mlu/enforce_test.cc b/paddle/fluid/platform/device/mlu/enforce_test.cc index 7241afba6aa..4ff7b12c446 100644 --- a/paddle/fluid/platform/device/mlu/enforce_test.cc +++ b/paddle/fluid/platform/device/mlu/enforce_test.cc @@ -58,5 +58,15 @@ TEST(mlu_enforce, mlu_success) { CheckMluStatusFailure(CN_ERROR_INVALID_VALUE, "invalid argument")); EXPECT_TRUE(CheckMluStatusFailure(CN_MEMORY_ERROR_OUT_OF_MEMORY, "device has no memory to alloc")); +#ifdef PADDLE_WITH_CNCL + EXPECT_TRUE(CheckMluStatusSuccess(CNCL_RET_SUCCESS)); + EXPECT_TRUE(CheckMluStatusFailure(CNCL_RET_ERR_INTERNAL, "CNCL error")); + EXPECT_TRUE(CheckMluStatusFailure(CNCL_RET_ERR_NULL_POINTER, "CNCL error")); + EXPECT_TRUE(CheckMluStatusFailure(CNCL_RET_ERR_INIT, "CNCL error")); + EXPECT_TRUE(CheckMluStatusFailure(CNCL_RET_ERR_NOT_INIT, "CNCL error")); + EXPECT_TRUE(CheckMluStatusFailure(CNCL_RET_ERR_REINIT, "CNCL error")); + EXPECT_TRUE( + CheckMluStatusFailure(CNCL_RET_ERR_INVALID_VERSION, "CNCL error")); +#endif } #endif diff --git a/paddle/fluid/platform/device/mlu/mlu_collective_helper.cc b/paddle/fluid/platform/device/mlu/mlu_collective_helper.cc new file mode 100644 index 00000000000..7708267c1bc --- /dev/null +++ b/paddle/fluid/platform/device/mlu/mlu_collective_helper.cc @@ -0,0 +1,179 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#if defined(PADDLE_WITH_CNCL) +#include +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/device/mlu/enforce.h" + +namespace paddle { +namespace platform { + +class CNCLCommImpl : public CNCLComm { + public: + void set_ring_id(int ring_id) { ring_id_ = ring_id; } + int ring_id() const override { return ring_id_; } + + void set_nranks(int nranks) { nranks_ = nranks; } + int nranks() const override { return nranks_; } + + void set_rank(int rank) { rank_ = rank; } + int rank() const override { return rank_; } + + int device_id() const override { return dev_ctx_->GetPlace().device; } + + void set_comm(cnclComm_t comm) { comm_ = comm; } + cnclComm_t comm() const override { return comm_; } + + mluStream stream() const override { return dev_ctx_->stream(); } + + void set_dev_ctx(std::unique_ptr&& dev_ctx) { + dev_ctx_ = std::move(dev_ctx); + } + MLUDeviceContext* dev_context() const override { return dev_ctx_.get(); } + + ~CNCLCommImpl() { + if (comm_) { + PADDLE_ENFORCE_MLU_SUCCESS(cnclFreeComm(comm_)); + } + } + + private: + int ring_id_; + int nranks_; + int rank_; + cnclComm_t comm_; + std::unique_ptr dev_ctx_; +}; + +CNCLComm* CNCLCommContext::CreateComm(cnclCliqueId* cncl_id, int nranks, + int rank, int dev_id, int ring_id) { + PADDLE_ENFORCE_NOT_NULL(cncl_id, + platform::errors::InvalidArgument( + "The cncl unique id should not be null.")); + PADDLE_ENFORCE_GT( + nranks, 1, + platform::errors::InvalidArgument( + "Expected nranks > 1. 
But received nranks is %d.", nranks)); + PADDLE_ENFORCE_GE(rank, 0, + platform::errors::InvalidArgument( + "Expected rank >= 0. But received rank is %d.", rank)); + PADDLE_ENFORCE_LT( + rank, nranks, + platform::errors::InvalidArgument( + "Expected rank < nranks. But received rank is %d, nranks is %d.", + rank, nranks)); + PADDLE_ENFORCE_GE( + dev_id, 0, + platform::errors::InvalidArgument( + "Expected dev_id >= 0. But received dev_id is %d.", dev_id)); + + cnclComm_t comm; + int dev_list[] = {dev_id}; + int rank_list[] = {rank}; + SetMLUDeviceId(dev_id); + PADDLE_ENFORCE_MLU_SUCCESS( + cnclInitComms(&comm, 1, dev_list, rank_list, nranks, cncl_id)); + + auto* comm_wrapper = AssignCNCLComm(comm, nranks, rank, dev_id, ring_id); + + VLOG(1) << "cncl communicator of rank " << rank << " in ring " << ring_id + << " has been created on device " << dev_id; + + std::call_once(once_flag_, []() { + std::atexit([]() { CNCLCommContext::Instance().ReleaseCNCLComms(); }); + }); + + return comm_wrapper; +} + +void CNCLCommContext::CreateAllCNCLComms(const std::vector& dev_ids, + int ring_id) { + PADDLE_ENFORCE_GT( + dev_ids.size(), 0, + platform::errors::InvalidArgument("Expected the size of dev_ids > 0. But " + "received the size of dev_ids is %d.", + dev_ids.size())); + + const int kDevices = dev_ids.size(); + cnclComm_t comms[kDevices]; + int* rank_list = new int[kDevices]; + for (int i = 0; i < kDevices; i++) { + rank_list[i] = i; + } + cnclCliqueId clique_id; + PADDLE_ENFORCE_MLU_SUCCESS(cnclGetCliqueId(&clique_id)); + PADDLE_ENFORCE_MLU_SUCCESS(cnclInitComms(comms, dev_ids.size(), + dev_ids.data(), rank_list, + dev_ids.size(), &clique_id)); + + PADDLE_ENFORCE_EQ(comm_map_.count(ring_id), 0, + platform::errors::InvalidArgument( + "Expected comm_map_.count(ring_id) = 0. But received " + "comm_map_.count(ring_id) is %d.", + comm_map_.count(ring_id))); + for (size_t i = 0; i < dev_ids.size(); ++i) { + AssignCNCLComm(comms[i], dev_ids.size(), i, dev_ids[i], ring_id); + VLOG(1) << "cncl communicator of rank " << i << " in ring " << ring_id + << " has been created on device " << dev_ids[i]; + } + + std::call_once(once_flag_, []() { + std::atexit([]() { CNCLCommContext::Instance().ReleaseCNCLComms(); }); + }); + delete[] rank_list; +} + +CNCLComm* CNCLCommContext::AssignCNCLComm(cnclComm_t comm, int nranks, int rank, + int dev_id, int ring_id) { + std::unique_ptr dev_ctx( + new MLUDeviceContext(MLUPlace(dev_id))); + + CNCLCommImpl* c = new CNCLCommImpl; + c->set_ring_id(ring_id); + c->set_nranks(nranks); + c->set_rank(rank); + c->set_comm(comm); + c->set_dev_ctx(std::move(dev_ctx)); + + comm_map_mutex_.lock(); + if (comm_map_.count(ring_id) == 0) { + comm_map_.emplace(ring_id, std::map>()); + } + auto& dev2comm = comm_map_[ring_id]; + + dev2comm.emplace(dev_id, std::unique_ptr(c)); + comm_map_mutex_.unlock(); + + if (ring_id == 0) { + auto* dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get( + platform::MLUPlace(dev_id))); + dev_ctx->set_cncl_comm(comm); + } + + return comm_map_[ring_id][dev_id].get(); +} + +void CNCLCommContext::ReleaseCNCLComms() { + for (auto& p : comm_map_) { + for (auto& q : p.second) { + q.second.reset(); + } + } +} + +} // namespace platform +} // namespace paddle +#endif diff --git a/paddle/fluid/platform/device/mlu/mlu_info.h b/paddle/fluid/platform/device/mlu/mlu_info.h index 4588dd66677..fcf06cb4f1c 100644 --- a/paddle/fluid/platform/device/mlu/mlu_info.h +++ b/paddle/fluid/platform/device/mlu/mlu_info.h @@ -18,6 +18,9 @@ limitations under the License. 
*/ #include #include #include +#ifdef PADDLE_WITH_CNCL +#include +#endif #include namespace paddle { @@ -25,6 +28,9 @@ namespace paddle { using cnStatus = CNresult; using cnrtStatus = cnrtRet_t; using cnnlStatus = cnnlStatus_t; +#ifdef PADDLE_WITH_CNCL +using cnclStatus = cnclResult_t; +#endif using mluStream = cnrtQueue_t; using mluCnnlHandle = cnnlHandle_t; using mluEventHandle = CNnotifier; diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h index 87644584330..cb2b57474d1 100644 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h @@ -307,8 +307,8 @@ XPUOpMap& get_kl2_ops() { {"slice", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace())})}, - {"softmax_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"softmax", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, {"softmax_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, @@ -317,9 +317,6 @@ XPUOpMap& get_kl2_ops() { pOpKernelType(vartype::FP16, XPUPlace())})}, {"softmax_with_cross_entropy", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"softmax", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"softmax", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, {"split", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace())})}, {"squeeze2_grad", diff --git a/paddle/fluid/platform/profiler/host_event_recorder.cc b/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h similarity index 50% rename from paddle/fluid/platform/profiler/host_event_recorder.cc rename to paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h index b8495ca45ca..aa020593454 100644 --- a/paddle/fluid/platform/profiler/host_event_recorder.cc +++ b/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h @@ -1,5 +1,5 @@ /* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -licensed under the Apache License, Version 2.0 (the "License"); +Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 @@ -8,26 +8,30 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#pragma once -#include "paddle/fluid/platform/profiler/host_event_recorder.h" -#include "paddle/fluid/platform/os_info.h" +#ifdef PADDLE_WITH_XPU_KP +#include +#include +#include + +#include "paddle/fluid/framework/op_kernel_type.h" namespace paddle { namespace platform { -ThreadEventRecorder::ThreadEventRecorder() { - thread_id_ = GetCurrentThreadSysId(); - HostEventRecorder::GetInstance().RegisterThreadRecorder(thread_id_, this); -} +using vartype = paddle::framework::proto::VarType; +using pOpKernelType = paddle::framework::OpKernelType; +using XPUKernelSet = + std::unordered_set; +using XPUOpMap = std::unordered_map; + +XPUOpMap& get_kp_ops() { + static XPUOpMap s_xpu_kp_kernels{}; -HostEventSection HostEventRecorder::GatherEvents() { - HostEventSection host_sec; - host_sec.thr_sections.reserve(thread_recorders_.size()); - for (auto &kv : thread_recorders_) { - host_sec.thr_sections.emplace_back(std::move(kv.second->GatherEvents())); - } - return host_sec; + return s_xpu_kp_kernels; } } // namespace platform } // namespace paddle +#endif diff --git a/paddle/fluid/platform/device/xpu/xpu_op_list.cc b/paddle/fluid/platform/device/xpu/xpu_op_list.cc index e9b494024bd..88d803bdf18 100644 --- a/paddle/fluid/platform/device/xpu/xpu_op_list.cc +++ b/paddle/fluid/platform/device/xpu/xpu_op_list.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include "paddle/fluid/platform/device/xpu/xpu1_op_list.h" #include "paddle/fluid/platform/device/xpu/xpu2_op_list.h" #include "paddle/fluid/platform/device/xpu/xpu_info.h" +#include "paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h" #include "paddle/fluid/platform/device/xpu/xpu_op_list.h" namespace paddle { @@ -74,6 +75,48 @@ bool is_in_xpu_black_list(const std::string& op_name) { return false; } +#ifdef PADDLE_WITH_XPU_KP +bool is_xpu_kp_support_op(const std::string& op_name, + const pOpKernelType& type) { + auto& ops = get_kl1_ops(); + auto v = get_xpu_version(type.place_.device); + if (v == pten::backends::xpu::XPUVersion::XPU2) { + ops = get_kp_ops(); + } + + if (ops.find(op_name) != ops.end() && + ops[op_name].find(type) != ops[op_name].end()) { + return true; + } + return false; +} + +bool is_in_xpu_kpwhite_list(const std::string& op_name) { + static bool inited = false; + static std::unordered_set xpu_kpwhite_list; + static std::mutex s_mtx; + if (!inited) { + std::lock_guard guard(s_mtx); + if (!inited) { + if (std::getenv("XPU_KPWHITE_LIST") != nullptr) { + std::string ops(std::getenv("XPU_KPWHITE_LIST")); + tokenize(ops, ',', &xpu_kpwhite_list); + } + inited = true; + VLOG(3) << "XPU kpwhite List: "; + for (auto iter = xpu_kpwhite_list.begin(); iter != xpu_kpwhite_list.end(); + ++iter) { + VLOG(3) << *iter << " "; + } + } + } + if (xpu_kpwhite_list.find(op_name) != xpu_kpwhite_list.end()) { + return true; + } + return false; +} +#endif + std::vector get_xpu_op_support_type( const std::string& op_name, pten::backends::xpu::XPUVersion version) { std::vector res; @@ -101,7 +144,6 @@ XPUOpListMap get_xpu_op_list(pten::backends::xpu::XPUVersion version) { } return res; } - } // namespace platform } // namespace paddle #endif diff --git a/paddle/fluid/platform/device/xpu/xpu_op_list.h b/paddle/fluid/platform/device/xpu/xpu_op_list.h index 4c3eb097a14..a51dfac1892 100644 --- a/paddle/fluid/platform/device/xpu/xpu_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu_op_list.h @@ -27,6 +27,12 @@ using XPUOpListMap = bool is_xpu_support_op(const std::string& op_name, const pOpKernelType& type); bool is_in_xpu_black_list(const 
std::string& op_name); +#ifdef PADDLE_WITH_XPU_KP +bool is_xpu_kp_support_op(const std::string& op_name, + const pOpKernelType& type); +bool is_in_xpu_kpwhite_list(const std::string& op_name); +#endif + std::vector get_xpu_op_support_type( const std::string& op_name, pten::backends::xpu::XPUVersion version); XPUOpListMap get_xpu_op_list(pten::backends::xpu::XPUVersion version); diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index fdd9883c2c9..1e674258334 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -10,8 +10,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/device_context.h" +#include #include #include +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/stream/cuda_stream.h" +#include "paddle/pten/backends/gpu/gpu_context.h" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h" @@ -149,16 +153,17 @@ inline void EmplaceDeviceContext( cuda_ctx, platform::errors::InvalidArgument( "Failed to dynamic_cast dev_ctx into CUDADeviceContext.")); - dev_ctx->SetDeviceAllocator( - memory::allocation::AllocatorFacade::Instance() - .GetAllocator(p, cuda_ctx->context()->RawStream()) - .get()); + // Note: A trick method to init context, why GetAllocator interface + // needs a stream parameter? + dev_ctx->SetAllocator(memory::allocation::AllocatorFacade::Instance() + .GetAllocator(p, cuda_ctx->stream()) + .get()); + cuda_ctx->PartialInitWithAllocator(); #endif } else { - dev_ctx->SetDeviceAllocator( - memory::allocation::AllocatorFacade::Instance() - .GetAllocator(p) - .get()); + dev_ctx->SetAllocator(memory::allocation::AllocatorFacade::Instance() + .GetAllocator(p) + .get()); } dev_ctx->SetHostAllocator( memory::allocation::AllocatorFacade::Instance() @@ -251,14 +256,18 @@ DeviceContextPool::DeviceContextPool( } } -CPUDeviceContext::CPUDeviceContext() : pten::CPUContext() {} +CPUDeviceContext::CPUDeviceContext() : pten::CPUContext() { + pten::CPUContext::Init(); +} -CPUDeviceContext::CPUDeviceContext(CPUPlace place) : pten::CPUContext() {} +CPUDeviceContext::CPUDeviceContext(CPUPlace place) : pten::CPUContext(place) { + pten::CPUContext::Init(); +} #ifdef PADDLE_WITH_IPU IPUDeviceContext::IPUDeviceContext(IPUPlace place) : place_(place) {} -Place IPUDeviceContext::GetPlace() const { return place_; } +const Place& IPUDeviceContext::GetPlace() const { return place_; } void IPUDeviceContext::Wait() const { /*! \brief Wait for all operations completion in the stream. 
*/ @@ -268,11 +277,14 @@ IPUDeviceContext::~IPUDeviceContext() {} #endif #ifdef PADDLE_WITH_XPU -XPUDeviceContext::XPUDeviceContext() : pten::XPUContext() {} +XPUDeviceContext::XPUDeviceContext() : pten::XPUContext() { + pten::XPUContext::Init(); +} XPUDeviceContext::~XPUDeviceContext() {} XPUDeviceContext::XPUDeviceContext(XPUPlace place) : pten::XPUContext(place) { + pten::XPUContext::Init(); LOG_FIRST_N(WARNING, 1) << "Please NOTE: xpu device: " << static_cast(place.device); } @@ -302,7 +314,7 @@ void NPUDeviceContext::Wait() const { aclrtStream NPUDeviceContext::stream() const { return stream_->raw_stream(); } -Place NPUDeviceContext::GetPlace() const { return place_; } +const Place& NPUDeviceContext::GetPlace() const { return place_; } aclrtContext NPUDeviceContext::context() const { return context_; } @@ -319,7 +331,7 @@ Eigen::DefaultDevice* NPUPinnedDeviceContext::eigen_device() const { return eigen_device_.get(); } -Place NPUPinnedDeviceContext::GetPlace() const { return place_; } +const Place& NPUPinnedDeviceContext::GetPlace() const { return place_; } #endif @@ -470,102 +482,28 @@ CUDAContext::~CUDAContext() { #endif } -CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) { - CUDADeviceGuard guard(place_.device); - compute_capability_ = GetGPUComputeCapability(place_.device); - multi_process_ = GetGPUMultiProcessors(place_.device); - max_threads_per_mp_ = GetGPUMaxThreadsPerMultiProcessor(place_.device); - max_grid_dim_size_ = GetGpuMaxGridDimSize(place_.device); - max_threads_per_block_ = GetGPUMaxThreadsPerBlock(place_.device); - - driver_version_ = GetGPUDriverVersion(place_.device); - runtime_version_ = GetGPURuntimeVersion(place_.device); - - LOG_FIRST_N(WARNING, 1) << "Please NOTE: device: " - << static_cast(place_.device) - << ", GPU Compute Capability: " - << compute_capability_ / 10 << "." - << compute_capability_ % 10 - << ", Driver API Version: " << driver_version_ / 1000 - << "." << (driver_version_ % 100) / 10 - << ", Runtime API Version: " - << runtime_version_ / 1000 << "." - << (runtime_version_ % 100) / 10; -#ifdef PADDLE_WITH_HIP - size_t version_major, version_minor, version_patch; - PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenGetVersion( - &version_major, &version_minor, &version_patch)); - LOG_FIRST_N(WARNING, 1) << "device: " << static_cast(place_.device) - << ", MIOpen Version: " << version_major << "." - << version_minor << "." << version_patch; -#else - size_t cudnn_dso_ver = dynload::cudnnGetVersion(); - LOG_FIRST_N(WARNING, 1) << "device: " << static_cast(place_.device) - << ", cuDNN Version: " << cudnn_dso_ver / 1000 << "." - << (cudnn_dso_ver % 1000) / 100 << "."; -#endif - { - // Check CUDA/CUDNN version compatiblity - auto local_cuda_version = - (driver_version_ / 1000) * 10 + (driver_version_ % 100) / 10; -#ifdef PADDLE_WITH_HIP - auto compile_cuda_version = (HIP_VERSION / 100) * 10 + (HIP_VERSION % 10); -#else - auto compile_cuda_version = - (CUDA_VERSION / 1000) * 10 + (CUDA_VERSION % 100) / 10; -#endif - if (local_cuda_version < compile_cuda_version) { - LOG_FIRST_N(WARNING, 1) - << "WARNING: device: " << static_cast(place_.device) - << ". The installed Paddle is compiled with CUDA " - << compile_cuda_version / 10 << "." << compile_cuda_version % 10 - << ", but CUDA runtime version in your machine is " - << local_cuda_version / 10 << "." << local_cuda_version % 10 - << ", which may cause serious incompatible bug. 
" - << "Please recompile or reinstall Paddle with compatible CUDA " - "version."; - } - } - default_ctx_.reset(new CUDAContext(place_)); -} - -CUDADeviceContext::~CUDADeviceContext() { - SetDeviceId(place_.device); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - if (nccl_comm_) { - PADDLE_ENFORCE_GPU_SUCCESS(dynload::ncclCommDestroy(nccl_comm_)); - } -#endif -} - -Place CUDADeviceContext::GetPlace() const { return place_; } - -void CUDADeviceContext::Wait() const { context()->Stream()->Wait(); } - -int CUDADeviceContext::GetComputeCapability() const { - return compute_capability_; -} - -int CUDADeviceContext::GetMaxPhysicalThreadCount() const { - return multi_process_ * max_threads_per_mp_; +CUDADeviceContext::CUDADeviceContext(CUDAPlace place) + : pten::GPUContext(place) { + pten::GPUContext::PartialInitWithoutAllocator(); + cuda_stream_.reset( + new stream::CUDAStream(pten::GPUContext::stream(), this->GetPlace())); } -int CUDADeviceContext::GetSMCount() const { return multi_process_; } - -int CUDADeviceContext::GetMaxThreadsPerBlock() const { - return max_threads_per_block_; -} +CUDADeviceContext::~CUDADeviceContext() = default; Eigen::GpuDevice* CUDADeviceContext::eigen_device() const { - return context()->EigenDevice().get(); -} - -bool CUDADeviceContext::tensor_core_available() const { - return context()->CublasTensorCoreHandle() != nullptr; + if (thread_ctx_.count(this)) { + return context()->EigenDevice().get(); + } + return pten::GPUContext::eigen_device(); } -dim3 CUDADeviceContext::GetCUDAMaxGridDimSize() const { - return max_grid_dim_size_; +void CUDADeviceContext::Wait() const { + if (thread_ctx_.count(this)) { + context()->Stream()->Wait(); + return; + } + pten::GPUContext::Wait(); } #ifdef PADDLE_WITH_HIP @@ -573,33 +511,96 @@ miopenHandle_t CUDADeviceContext::cudnn_handle() const { #else cudnnHandle_t CUDADeviceContext::cudnn_handle() const { #endif - return context()->CudnnHandle(); + if (thread_ctx_.count(this)) { + return context()->CudnnHandle(); + } + return pten::GPUContext::cudnn_handle(); } #ifdef PADDLE_WITH_HIP rocblas_handle CUDADeviceContext::cublas_handle() const { - return context()->CublasHandle()->GetCublasHandle(); + if (thread_ctx_.count(this)) { + return context()->CublasHandle()->GetCublasHandle(); + } + return pten::GPUContext::cublas_handle(); } #else cublasHandle_t CUDADeviceContext::cublas_handle() const { - return context()->CublasHandle()->GetCublasHandle(); + if (thread_ctx_.count(this)) { + return context()->CublasHandle()->GetCublasHandle(); + } + return pten::GPUContext::cublas_handle(); } cusparseHandle_t CUDADeviceContext::cusparse_handle() const { - return context()->CusparseHandle()->GetCusparseHandle(); + if (thread_ctx_.count(this)) { + return context()->CusparseHandle()->GetCusparseHandle(); + } + return pten::GPUContext::cusparse_handle(); +} +cusolverDnHandle_t CUDADeviceContext::cusolver_dn_handle() const { + if (thread_ctx_.count(this)) { + return context()->CusolverDnHandle(); + } + return pten::GPUContext::cusolver_dn_handle(); } #endif +void CUDADeviceContext::RecordEvent( + gpuEvent_t ev, const std::function& callback) const { + if (thread_ctx_.count(this)) { + context()->Stream()->RecordEvent(ev, callback); + return; + } + pten::GPUContext::RecordEvent(ev, callback); +} + +void CUDADeviceContext::AddStreamCallback( + const std::function& callback) const { + if (thread_ctx_.count(this)) { + context()->Stream()->AddCallback(callback); + return; + } + pten::GPUContext::AddStreamCallback(callback); +} + +void 
CUDADeviceContext::WaitStreamCallback() const { + if (thread_ctx_.count(this)) { + context()->Stream()->WaitCallback(); + return; + } + pten::GPUContext::WaitStreamCallback(); +} + CudnnWorkspaceHandle CUDADeviceContext::cudnn_workspace_handle() const { return CudnnWorkspaceHandle(*this, &cudnn_handle_mtx_); } -#ifndef PADDLE_WITH_HIP -cusolverDnHandle_t CUDADeviceContext::cusolver_dn_handle() const { - return context()->CusolverDnHandle(); +gpuStream_t CUDADeviceContext::stream() const { + if (thread_ctx_.count(this)) { + return context()->RawStream(); + } + return pten::GPUContext::stream(); } -#endif -gpuStream_t CUDADeviceContext::stream() const { return context()->RawStream(); } +std::shared_ptr CUDADeviceContext::context() const { + if (!thread_ctx_.count(this)) { + PADDLE_THROW(platform::errors::PermissionDenied( + "CUDADeviceContext call context() failed, make sure in the " + "thread_local semantic.")); + } + return thread_ctx_.at(this); +} + +stream::CUDAStream* CUDADeviceContext::GetCudaStream() const { + return cuda_stream_.get(); +} + +stream::CUDAStream* CUDADeviceContext::SetCudaStream( + stream::CUDAStream* new_stream_ptr) { + auto* old_stream_ptr = cuda_stream_.release(); + cuda_stream_.reset(new_stream_ptr); + return old_stream_ptr; +} CUDAPinnedDeviceContext::CUDAPinnedDeviceContext() { eigen_device_.reset(new Eigen::DefaultDevice()); @@ -614,7 +615,7 @@ Eigen::DefaultDevice* CUDAPinnedDeviceContext::eigen_device() const { return eigen_device_.get(); } -Place CUDAPinnedDeviceContext::GetPlace() const { return place_; } +const Place& CUDAPinnedDeviceContext::GetPlace() const { return place_; } #endif #ifdef PADDLE_WITH_MKLDNN diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 17b22907b15..4d469e92c04 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include #include // NOLINT #include #include // NOLINT @@ -18,7 +19,9 @@ limitations under the License. */ #include #include +#include "paddle/fluid/platform/device/gpu/gpu_types.h" #include "paddle/pten/backends/cpu/cpu_context.h" +#include "paddle/pten/backends/gpu/gpu_decls.h" #include "paddle/pten/core/device_context.h" #include "paddle/fluid/memory/malloc.h" @@ -28,6 +31,7 @@ limitations under the License. */ #include "paddle/fluid/platform/dynload/cudnn.h" #include "paddle/fluid/platform/dynload/cusolver.h" #include "paddle/fluid/platform/dynload/cusparse.h" +#include "paddle/pten/backends/gpu/gpu_context.h" #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) #include "paddle/fluid/platform/dynload/nccl.h" #endif @@ -38,6 +42,7 @@ limitations under the License. */ #include "paddle/fluid/platform/device/gpu/gpu_helper.h" // NOLINT #include "paddle/fluid/platform/dynload/miopen.h" #include "paddle/fluid/platform/dynload/rocblas.h" +#include "paddle/pten/backends/gpu/gpu_context.h" // NOLINT #if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/dynload/rccl.h" #endif @@ -145,7 +150,7 @@ class IPUDeviceContext : public DeviceContext { explicit IPUDeviceContext(IPUPlace place); virtual ~IPUDeviceContext(); Eigen::DefaultDevice* eigen_device() const { return nullptr; } - Place GetPlace() const override; + const Place& GetPlace() const override; /*! \brief Wait for all operations completion in the stream. 
*/ void Wait() const override; @@ -187,7 +192,7 @@ class NPUDeviceContext : public DeviceContext { explicit NPUDeviceContext(NPUPlace place); virtual ~NPUDeviceContext(); Eigen::DefaultDevice* eigen_device() const { return nullptr; } - Place GetPlace() const override; + const Place& GetPlace() const override; aclrtContext context() const; /*! \brief Wait for all operations completion in the stream. */ @@ -247,7 +252,7 @@ class NPUPinnedDeviceContext : public DeviceContext { NPUPinnedDeviceContext(); explicit NPUPinnedDeviceContext(NPUPinnedPlace place); - Place GetPlace() const override; + const Place& GetPlace() const override; Eigen::DefaultDevice* eigen_device() const; @@ -326,20 +331,20 @@ class CUDAContext { #endif /*! \brief Call cublas function safely. */ - template - inline void CublasCall(Callback&& callback) const { + inline void CublasCall( + const std::function& callback) const { if (cublas_tf32_tensor_core_handle_) { - cublas_tf32_tensor_core_handle_->Call(std::forward(callback)); + cublas_tf32_tensor_core_handle_->Call(callback); } else { - cublas_handle_->Call(std::forward(callback)); + cublas_handle_->Call(callback); } } #ifndef PADDLE_WITH_HIP /*! \brief Call cusparse function safely. */ - template - inline void CusparseCall(Callback&& callback) const { - cusparse_handle_->Call(std::forward(callback)); + inline void CusparseCall( + const std::function& callback) const { + cusparse_handle_->Call(callback); } #endif @@ -348,12 +353,12 @@ class CUDAContext { /*! \brief Call cublas function with Tensor Core safely. If Tensor Core is not available, use DEFAULT_MATH instead. */ - template - inline void TensorCoreCublasCallIfAvailable(Callback&& callback) const { + inline void TensorCoreCublasCallIfAvailable( + const std::function& callback) const { if (cublas_tensor_core_handle_) { - cublas_tensor_core_handle_->Call(std::forward(callback)); + cublas_tensor_core_handle_->Call(callback); } else { - cublas_handle_->Call(std::forward(callback)); + cublas_handle_->Call(callback); } } @@ -491,7 +496,7 @@ class CUDAContext { DISABLE_COPY_AND_ASSIGN(CUDAContext); }; -class CUDADeviceContext : public DeviceContext { +class CUDADeviceContext : public pten::GPUContext { public: explicit CUDADeviceContext(CUDAPlace place); virtual ~CUDADeviceContext(); @@ -499,49 +504,40 @@ class CUDADeviceContext : public DeviceContext { /*! \brief Wait for all operations completion in the stream. */ void Wait() const override; - /*! \brief Return place in the device context. */ - Place GetPlace() const override; - - /*! \brief Return compute capability in the device context. */ - int GetComputeCapability() const; - - /*! \brief Return the max physical thread count in the device context */ - int GetMaxPhysicalThreadCount() const; - - /*! \brief Return the SM count in the device context */ - int GetSMCount() const; - - /*! \brief Return the Max thread num of block in the device context */ - int GetMaxThreadsPerBlock() const; - - /*! \brief Return the max grid dim size in the device context */ - dim3 GetCUDAMaxGridDimSize() const; - /*! \brief Return eigen device in the device context. */ Eigen::GpuDevice* eigen_device() const; /*! \brief Call cublas function safely. */ - template - inline void CublasCall(Callback&& callback) const { + inline void CublasCall( + const std::function& callback) const { + if (!thread_ctx_.count(this)) { + pten::GPUContext::CublasCall(callback); + return; + } return context()->CublasCall(callback); } #ifndef PADDLE_WITH_HIP /*! \brief Call cusparse function safely. 
*/ - template - inline void CusparseCall(Callback&& callback) const { - return context()->CusparseCall(callback); + inline void CusparseCall( + const std::function& callback) const { + if (!thread_ctx_.count(this)) { + pten::GPUContext::CusparseCall(callback); + return; + } + context()->CusparseCall(callback); } #endif - /*! \brief Check whether tensor core is supported */ - bool tensor_core_available() const; - /*! \brief Call cublas function with Tensor Core safely. If Tensor Core is not available, use DEFAULT_MATH instead. */ - template - inline void TensorCoreCublasCallIfAvailable(Callback&& callback) const { - return context()->TensorCoreCublasCallIfAvailable(callback); + inline void TensorCoreCublasCallIfAvailable( + const std::function& callback) const { + if (!thread_ctx_.count(this)) { + pten::GPUContext::TensorCoreCublasCallIfAvailable(callback); + return; + } + context()->TensorCoreCublasCallIfAvailable(callback); } /*! \brief Return cudnn handle in the device context. */ @@ -559,6 +555,10 @@ class CUDADeviceContext : public DeviceContext { cusparseHandle_t cusparse_handle() const; #endif +#ifndef PADDLE_WITH_HIP + cusolverDnHandle_t cusolver_dn_handle() const; +#endif + /*! \brief Return a cudnn workspace handle to call multiple cudnn * functions without interrupting by other threads. * Once the first cudnn function is called by the handle, a lock @@ -568,60 +568,33 @@ class CUDADeviceContext : public DeviceContext { * sequential cudnn function calls. */ CudnnWorkspaceHandle cudnn_workspace_handle() const; -#ifndef PADDLE_WITH_HIP - cusolverDnHandle_t cusolver_dn_handle() const; -#endif - /*! \brief Return cuda stream in the device context. */ gpuStream_t stream() const; -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - /*! \brief Return nccl communicators. */ - ncclComm_t nccl_comm() const { return nccl_comm_; } - - /*! \brief Set nccl communicators. */ - void set_nccl_comm(ncclComm_t comm) { nccl_comm_ = comm; } -#endif - - template - void RecordEvent(gpuEvent_t ev, Callback callback) const { - return context()->Stream()->RecordEvent(ev, callback); - } - - template - void AddStreamCallback(Callback&& callback) const { - return context()->Stream()->AddCallback(callback); - } + void RecordEvent(gpuEvent_t ev, const std::function& callback) const; - void WaitStreamCallback() const { - return context()->Stream()->WaitCallback(); - } + void AddStreamCallback(const std::function& callback) const; - void ResetDefaultContext(const stream::Priority& priority) { - default_ctx_.reset(new CUDAContext(place_, priority)); - } + void WaitStreamCallback() const; void ResetThreadContext(const stream::Priority& priority) { std::lock_guard guard(ctx_mtx_); - thread_ctx_[this].reset(new CUDAContext(place_, priority)); + thread_ctx_[this].reset(new CUDAContext(this->GetPlace(), priority)); } - std::shared_ptr context() const { - if (!thread_ctx_.count(this)) { - return default_ctx_; - } - return thread_ctx_.at(this); - } + std::shared_ptr context() const; // Note: Can only be used under thread_local semantics. void SetThreadLocalStream(const gpuStream_t stream) { thread_ctx_.at(this)->SetStream(stream); } - private: - CUDAPlace place_; - std::shared_ptr default_ctx_; + // NOTE: Just for compatibility with the past, please delete if there is an + // elegant way. 
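// The re-based CUDADeviceContext keeps the old thread-local CUDAContext path alive:
// each accessor first checks thread_ctx_ for a context installed via
// ResetThreadContext() and otherwise falls back to the pten::GPUContext base class.
// A minimal sketch of that dispatch, using only members declared in this class
// (illustrative only; the allocator setup needed before real use appears in the
// test changes further below):
//
//   CUDADeviceContext ctx(CUDAPlace(0));
//   gpuStream_t s = ctx.stream();                  // served by pten::GPUContext
//   ctx.ResetThreadContext(stream::Priority::kNormal);
//   ctx.Wait();                                    // now routed through the
//                                                  // thread-local CUDAContext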
+ stream::CUDAStream* GetCudaStream() const; + stream::CUDAStream* SetCudaStream(stream::CUDAStream*); + private: // The thread_local static variable will be released before the // global static variable, so avoid using it in dtor. static thread_local std::unordered_map cuda_stream_; DISABLE_COPY_AND_ASSIGN(CUDADeviceContext); }; @@ -711,7 +671,7 @@ class CUDAPinnedDeviceContext : public DeviceContext { CUDAPinnedDeviceContext(); explicit CUDAPinnedDeviceContext(CUDAPinnedPlace place); - Place GetPlace() const override; + const Place& GetPlace() const override; Eigen::DefaultDevice* eigen_device() const; diff --git a/paddle/fluid/platform/device_context_test.cu b/paddle/fluid/platform/device_context_test.cu index cf617a478eb..851c756b665 100644 --- a/paddle/fluid/platform/device_context_test.cu +++ b/paddle/fluid/platform/device_context_test.cu @@ -17,6 +17,7 @@ limitations under the License. */ #include "glog/logging.h" #include "gtest/gtest.h" +#include "paddle/fluid/memory/allocation/allocator_facade.h" TEST(Device, Init) { using paddle::platform::DeviceContext; @@ -26,6 +27,20 @@ TEST(Device, Init) { int count = paddle::platform::GetGPUDeviceCount(); for (int i = 0; i < count; i++) { CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i)); + device_context->SetAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(CUDAPlace(i), device_context->stream()) + .get()); + device_context->SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + device_context->SetZeroAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetZeroAllocator(CUDAPlace(i)) + .get()); + device_context->PartialInitWithAllocator(); + Eigen::GpuDevice* gpu_device = device_context->eigen_device(); ASSERT_NE(nullptr, gpu_device); delete device_context; @@ -39,6 +54,19 @@ TEST(Device, CUDADeviceContext) { int count = paddle::platform::GetGPUDeviceCount(); for (int i = 0; i < count; i++) { CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i)); + device_context->SetAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(CUDAPlace(i), device_context->stream()) + .get()); + device_context->SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + device_context->SetZeroAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetZeroAllocator(CUDAPlace(i)) + .get()); + device_context->PartialInitWithAllocator(); Eigen::GpuDevice* gpu_device = device_context->eigen_device(); ASSERT_NE(nullptr, gpu_device); #ifdef PADDLE_WITH_HIP diff --git a/paddle/fluid/platform/device_event_gpu.cc b/paddle/fluid/platform/device_event_gpu.cc index 0a6b3917fbc..a811a5b9c13 100644 --- a/paddle/fluid/platform/device_event_gpu.cc +++ b/paddle/fluid/platform/device_event_gpu.cc @@ -53,7 +53,7 @@ void DeviceEventRecordCUDA(DeviceEvent* event, const DeviceContext* context) { platform::errors::PreconditionNotMet( "Failed to dynamic_cast context into CUDADeviceContext.")); - wrapper->inner_event_.Record(*cuda_dev_ctx->context()->Stream()); + wrapper->inner_event_.Record(cuda_dev_ctx->stream()); } bool DeviceEventQueryCUDA(const DeviceEvent* event) { @@ -82,8 +82,7 @@ void DeviceEventCUDAWaitCUDA(const DeviceEvent* event, platform::errors::PreconditionNotMet( "Failed to dynamic_cast context into CUDADeviceContext.")); // calling cudaStreamWaitEvent(stream, event, 0) - 
cuda_dev_ctx->context()->Stream()->WaitEvent( - wrapper->inner_event_.GetRawCudaEvent()); + cuda_dev_ctx->WaitEvent(wrapper->inner_event_.GetRawCudaEvent()); } void DeviceEventCPUWaitCUDA(const DeviceEvent* event, diff --git a/paddle/fluid/platform/device_event_test.cc b/paddle/fluid/platform/device_event_test.cc index a56d94b892e..96e89f9257d 100644 --- a/paddle/fluid/platform/device_event_test.cc +++ b/paddle/fluid/platform/device_event_test.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/platform/device_event.h" #include "glog/logging.h" #include "gtest/gtest.h" +#include "paddle/fluid/platform/place.h" using ::paddle::platform::kCUDA; using ::paddle::platform::kCPU; @@ -38,9 +39,11 @@ TEST(DeviceEvent, CUDA) { // case 1. test for event_creator DeviceEvent event(place); ASSERT_NE(event.GetEvent().get(), nullptr); + bool status = event.Query(); + ASSERT_EQ(status, true); // case 2. test for event_recorder event.Record(context); - bool status = event.Query(); + status = event.Query(); ASSERT_EQ(status, false); // case 3. test for event_finisher event.Finish(); diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index c751ee1e69b..7de50554319 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -539,7 +539,7 @@ inline void retry_sleep(unsigned milliseconds) { ::paddle::platform::details::ExternalApiType< \ __CUDA_STATUS_TYPE__>::kSuccess; \ while (UNLIKELY(__cond__ != __success_type__) && retry_count < 5) { \ - retry_sleep(FLAGS_gpu_allocator_retry_time); \ + paddle::platform::retry_sleep(FLAGS_gpu_allocator_retry_time); \ __cond__ = (COND); \ ++retry_count; \ } \ @@ -727,7 +727,7 @@ inline void retry_sleep(unsigned millisecond) { ::paddle::platform::details::ExternalApiType< \ __CUDA_STATUS_TYPE__>::kSuccess; \ while (UNLIKELY(__cond__ != __success_type__) && retry_count < 5) { \ - retry_sleep(FLAGS_gpu_allocator_retry_time); \ + ::paddle::platform::retry_sleep(FLAGS_gpu_allocator_retry_time); \ __cond__ = (COND); \ ++retry_count; \ } \ diff --git a/paddle/fluid/platform/event.h b/paddle/fluid/platform/event.h index da5080cc86f..86df34acd76 100644 --- a/paddle/fluid/platform/event.h +++ b/paddle/fluid/platform/event.h @@ -152,11 +152,11 @@ class CudaEvent { #endif } - void Record(const paddle::platform::stream::CUDAStream &stream) { + void Record(gpuStream_t stream) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, stream.raw_stream())); + PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, stream)); #else - PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, stream.raw_stream())); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, stream)); #endif } diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index 4a6bfe67ba5..b969ba971b6 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -688,28 +688,16 @@ PADDLE_DEFINE_EXPORTED_bool( "It controls whether to apply IR pass to program when using Fleet APIs"); /** - * Pt kernel related FLAG - * Name: FLAGS_run_pten_kernel - * Since Version: 2.3.0 - * Value Range: bool, default=false - * Example: FLAGS_run_pten_kernel=true would use the pt kernel to compute in the - * Op. 
- * Note: - */ -PADDLE_DEFINE_EXPORTED_bool(run_pten_kernel, true, - "It controls whether to use pten kernel"); - -/** - * Pt kernel related FLAG + * KP kernel related FLAG * Name: FLAGS_run_kp_kernel * Since Version: 2.3.0 * Value Range: bool, default=false - * Example: FLAGS_run_kp_kernel=true would use the kp kernel to compute in - * the Op for XPU2. + * Example: FLAGS_run_kp_kernel=true would use the kp kernel to compute in the + * Op. * Note: */ -PADDLE_DEFINE_EXPORTED_bool(run_kp_kernel, true, - "It controls whether to use kp kernel for xpu2"); +PADDLE_DEFINE_EXPORTED_bool(run_kp_kernel, false, + "It controls whether to run PaddlePaddle using KP"); /** * Distributed related FLAG diff --git a/paddle/fluid/platform/float16_test.cu b/paddle/fluid/platform/float16_test.cu index 7a047d790ab..cc5c11778aa 100644 --- a/paddle/fluid/platform/float16_test.cu +++ b/paddle/fluid/platform/float16_test.cu @@ -328,6 +328,10 @@ TEST(float16, lod_tensor_on_gpu) { // CPU LoDTensor to GPU LoDTensor CUDAPlace gpu_place(0); CUDADeviceContext gpu_ctx(gpu_place); + gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(gpu_place, gpu_ctx.stream()) + .get()); + gpu_ctx.PartialInitWithAllocator(); framework::TensorCopy(src_tensor, gpu_place, gpu_ctx, &gpu_tensor); // GPU LoDTensor to CPU LoDTensor diff --git a/paddle/fluid/platform/for_range.h b/paddle/fluid/platform/for_range.h index 1d4be3801dd..aef36a89315 100644 --- a/paddle/fluid/platform/for_range.h +++ b/paddle/fluid/platform/for_range.h @@ -16,6 +16,7 @@ limitations under the License. */ #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/pten/backends/cpu/cpu_context.h" +#include "paddle/pten/backends/gpu/gpu_context.h" namespace paddle { namespace platform { @@ -72,6 +73,7 @@ __global__ static void ForRangeElemwiseOp(Function func, size_t limit) { } } +// NOTE: After the pten kernel is migrated, it needs to be deleted. template <> struct ForRange { ForRange(const CUDADeviceContext& dev_ctx, size_t limit) @@ -106,6 +108,40 @@ struct ForRange { size_t limit_; }; +template <> +struct ForRange { + ForRange(const pten::GPUContext& dev_ctx, size_t limit) + : dev_ctx_(dev_ctx), limit_(static_cast(limit)) {} + + template + inline void operator()(Function func) const { +#ifdef __HIPCC__ + // HIP will throw core dump when threads > 256 + constexpr int num_threads = 256; +#elif WITH_NV_JETSON + // JETSON_NANO will throw core dump when threads > 128 + int num_thread = 256; + platform::ChangeThreadNum(dev_ctx_, &num_thread, 128); + const int num_threads = num_thread; +#else + constexpr int num_threads = 1024; +#endif + size_t block_size = limit_ <= num_threads ? limit_ : num_threads; + size_t grid_size = (limit_ + num_threads - 1) / num_threads; + + if (grid_size == 1) { + ForRangeElemwiseOpGridIsOne<<<1, block_size, 0, dev_ctx_.stream()>>>( + func); + } else { + ForRangeElemwiseOp<<>>( + func, limit_); + } + } + + const pten::GPUContext& dev_ctx_; + size_t limit_; +}; + #endif } // namespace platform diff --git a/paddle/fluid/platform/os_info.cc b/paddle/fluid/platform/os_info.cc index 92d218504ea..58d37783d05 100644 --- a/paddle/fluid/platform/os_info.cc +++ b/paddle/fluid/platform/os_info.cc @@ -14,7 +14,6 @@ limitations under the License. */ #include "paddle/fluid/platform/os_info.h" #include -#include #include #include #include @@ -27,90 +26,14 @@ limitations under the License. 
*/ #else #include #endif +#include "paddle/fluid/framework/new_executor/workqueue/thread_data_registry.h" #include "paddle/fluid/platform/macros.h" // import DISABLE_COPY_AND_ASSIGN namespace paddle { namespace platform { namespace internal { -static uint64_t main_tid = - std::hash()(std::this_thread::get_id()); - -template -class ThreadDataRegistry { - class ThreadDataHolder; - - public: - // Singleton - static ThreadDataRegistry& GetInstance() { - static ThreadDataRegistry instance; - return instance; - } - - const T& GetCurrentThreadData() { return CurrentThreadData(); } - - void SetCurrentThreadData(const T& val) { - std::lock_guard lock(lock_); - CurrentThreadData() = val; - } - - // Returns current snapshot of all threads. Make sure there is no thread - // create/destory when using it. - template ::value>> - std::unordered_map GetAllThreadDataByValue() { - std::unordered_map data_copy; - std::lock_guard lock(lock_); - data_copy.reserve(tid_map_.size()); - for (auto& kv : tid_map_) { - data_copy.emplace(kv.first, kv.second->GetData()); - } - return std::move(data_copy); - } - - void RegisterData(uint64_t tid, ThreadDataHolder* tls_obj) { - std::lock_guard lock(lock_); - tid_map_[tid] = tls_obj; - } - - void UnregisterData(uint64_t tid) { - if (tid == main_tid) { - return; - } - std::lock_guard lock(lock_); - tid_map_.erase(tid); - } - - private: - class ThreadDataHolder { - public: - ThreadDataHolder() { - tid_ = std::hash()(std::this_thread::get_id()); - ThreadDataRegistry::GetInstance().RegisterData(tid_, this); - } - - ~ThreadDataHolder() { - ThreadDataRegistry::GetInstance().UnregisterData(tid_); - } - - T& GetData() { return data_; } - - private: - uint64_t tid_; - T data_; - }; - - ThreadDataRegistry() = default; - - DISABLE_COPY_AND_ASSIGN(ThreadDataRegistry); - - T& CurrentThreadData() { - static thread_local ThreadDataHolder thread_data; - return thread_data.GetData(); - } - - std::mutex lock_; - std::unordered_map tid_map_; // not owned -}; +using framework::ThreadDataRegistry; class InternalThreadId { public: diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index c4beac93ef1..8fecf444dc4 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -21,7 +21,9 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device_tracer.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/common_event.h" #include "paddle/fluid/platform/profiler/host_event_recorder.h" +#include "paddle/fluid/platform/profiler/host_tracer.h" #include "paddle/fluid/platform/profiler_helper.h" #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/dynload/nvtx.h" @@ -64,7 +66,8 @@ double Event::CudaElapsedMs(const Event &e) const { #endif } -RecordEvent::RecordEvent(const char *name, const EventRole role) { +RecordEvent::RecordEvent(const char *name, const EventRole role, + uint32_t level) { #ifndef _WIN32 #ifdef PADDLE_WITH_CUDA if (g_enable_nvprof_hook) { @@ -73,16 +76,21 @@ RecordEvent::RecordEvent(const char *name, const EventRole role) { } #endif #endif - if (UNLIKELY(FLAGS_enable_host_event_recorder_hook == false)) { + if (FLAGS_enable_host_event_recorder_hook == false) { OriginalConstruct(name, role, "none"); return; } + if (UNLIKELY(HostTraceLevel::GetInstance().NeedTrace(level) == false)) { + return; + } + is_enabled_ = true; shallow_copy_name_ = name; role_ = role; start_ns_ = PosixInNsec(); } -RecordEvent::RecordEvent(const std::string &name, const EventRole role) { +RecordEvent::RecordEvent(const std::string &name, const EventRole role, + uint32_t level) { #ifndef _WIN32 #ifdef PADDLE_WITH_CUDA if (g_enable_nvprof_hook) { @@ -91,17 +99,21 @@ RecordEvent::RecordEvent(const std::string &name, const EventRole role) { } #endif #endif - if (UNLIKELY(FLAGS_enable_host_event_recorder_hook == false)) { + if (FLAGS_enable_host_event_recorder_hook == false) { OriginalConstruct(name, role, "none"); return; } + if (UNLIKELY(HostTraceLevel::GetInstance().NeedTrace(level) == false)) { + return; + } + is_enabled_ = true; name_ = new std::string(name); role_ = role; start_ns_ = PosixInNsec(); } RecordEvent::RecordEvent(const std::string &name, const EventRole role, - const std::string &attr) { + const std::string &attr, uint32_t level) { #ifndef _WIN32 #ifdef PADDLE_WITH_CUDA if (g_enable_nvprof_hook) { @@ -110,10 +122,14 @@ RecordEvent::RecordEvent(const std::string &name, const EventRole role, } #endif #endif - if (UNLIKELY(FLAGS_enable_host_event_recorder_hook == false)) { + if (FLAGS_enable_host_event_recorder_hook == false) { OriginalConstruct(name, role, attr); return; } + if (UNLIKELY(HostTraceLevel::GetInstance().NeedTrace(level) == false)) { + return; + } + is_enabled_ = true; name_ = new std::string(name); start_ns_ = PosixInNsec(); attr_ = new std::string(attr); @@ -138,10 +154,6 @@ void RecordEvent::OriginalConstruct(const std::string &name, } void RecordEvent::End() { - if (UNLIKELY(finished_)) { - return; - } - finished_ = true; #ifndef _WIN32 #ifdef PADDLE_WITH_CUDA if (g_enable_nvprof_hook && is_pushed_) { @@ -150,21 +162,25 @@ void RecordEvent::End() { #endif #endif uint64_t end_ns = PosixInNsec(); - if (LIKELY(FLAGS_enable_host_event_recorder_hook)) { + if (LIKELY(FLAGS_enable_host_event_recorder_hook && is_enabled_)) { if (LIKELY(shallow_copy_name_ != nullptr)) { HostEventRecorder::GetInstance().RecordEvent(shallow_copy_name_, - start_ns_, end_ns, role_); + start_ns_, end_ns, role_, + TracerEventType::NumTypes); } else if (name_ != nullptr) { if (attr_ == nullptr) { - HostEventRecorder::GetInstance().RecordEvent(*name_, start_ns_, end_ns, - role_); + HostEventRecorder::GetInstance().RecordEvent( + *name_, start_ns_, end_ns, role_, TracerEventType::NumTypes); } else { - 
HostEventRecorder::GetInstance().RecordEvent(*name_, start_ns_, end_ns, - role_, *attr_); + HostEventRecorder::GetInstance().RecordEvent( + *name_, start_ns_, end_ns, role_, TracerEventType::NumTypes, + *attr_); delete attr_; } delete name_; } + // use this flag to avoid double End(); + is_enabled_ = false; return; } @@ -179,15 +195,18 @@ void RecordEvent::End() { PopEvent(*name_, role_); delete name_; delete attr_; + // use this flag to avoid double End(); + is_enabled_ = false; } -RecordInstantEvent::RecordInstantEvent(const char *name, const EventRole role) { - if (UNLIKELY(FLAGS_enable_host_event_recorder_hook == false)) { +RecordInstantEvent::RecordInstantEvent(const char *name, TracerEventType type, + uint32_t level) { + if (UNLIKELY(HostTraceLevel::GetInstance().NeedTrace(level) == false)) { return; } auto start_end_ns = PosixInNsec(); HostEventRecorder::GetInstance().RecordEvent(name, start_end_ns, start_end_ns, - role); + EventRole::kOrdinary, type); } void MemEvenRecorder::PushMemRecord(const void *ptr, const Place &place, @@ -281,8 +300,8 @@ void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes, void Mark(const std::string &name) { if (FLAGS_enable_host_event_recorder_hook) { - HostEventRecorder::GetInstance().RecordEvent(name, 0, 0, - EventRole::kOrdinary); + HostEventRecorder::GetInstance().RecordEvent( + name, 0, 0, EventRole::kOrdinary, TracerEventType::NumTypes); return; } GetEventList().Record(EventType::kMark, name, g_thread_id); diff --git a/paddle/fluid/platform/profiler/CMakeLists.txt b/paddle/fluid/platform/profiler/CMakeLists.txt index de22183df60..e25e4f3f56c 100644 --- a/paddle/fluid/platform/profiler/CMakeLists.txt +++ b/paddle/fluid/platform/profiler/CMakeLists.txt @@ -1 +1,3 @@ -cc_library(host_event_recorder SRCS host_event_recorder.cc DEPS os_info) +cc_library(host_tracer SRCS host_tracer.cc DEPS enforce) +cc_library(new_profiler SRCS profiler.cc DEPS host_tracer) +cc_test(new_profiler_test SRCS profiler_test.cc DEPS new_profiler) diff --git a/paddle/fluid/platform/profiler/common_event.h b/paddle/fluid/platform/profiler/common_event.h new file mode 100644 index 00000000000..cfdc3be110a --- /dev/null +++ b/paddle/fluid/platform/profiler/common_event.h @@ -0,0 +1,66 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
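// With the changes above, host-side tracing is gated by a per-call trace level:
// RecordEvent/RecordInstantEvent take a `level` argument and are recorded only
// while HostTraceLevel::GetInstance().NeedTrace(level) holds, i.e. while a
// HostTracer configured with trace_level >= level is running. A small sketch of
// the intended call pattern (illustrative; the default level of 1 matches the
// declarations in event_tracing.h):
//
//   {
//     platform::RecordEvent evt("my_op::forward", EventRole::kOrdinary,
//                               /*level=*/1);  // kept only if level <= trace_level
//     // ... traced region ...
//   }                                          // End() runs when evt leaves scope
//   platform::RecordInstantEvent("my_op::launched",
//                                TracerEventType::UserDefined, /*level=*/2);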
+ +#pragma once + +#include +#include +#include +#include "paddle/fluid/platform/event.h" // import EventRole, TODO(TIEXING): remove later +#include "paddle/fluid/platform/profiler/trace_event.h" + +namespace paddle { +namespace platform { + +struct CommonEvent { + public: + CommonEvent(const char *name, uint64_t start_ns, uint64_t end_ns, + EventRole role, TracerEventType type) + : name(name), + start_ns(start_ns), + end_ns(end_ns), + role(role), + type(type) {} + + CommonEvent(std::function arena_allocator, + const std::string &name_str, uint64_t start_ns, uint64_t end_ns, + EventRole role, TracerEventType type, const std::string &attr_str) + : start_ns(start_ns), end_ns(end_ns), role(role), type(type) { + auto buf = static_cast(arena_allocator(name_str.length() + 1)); + strncpy(buf, name_str.c_str(), name_str.length() + 1); + name = buf; + buf = static_cast(arena_allocator(attr_str.length() + 1)); + strncpy(buf, attr_str.c_str(), attr_str.length() + 1); + attr = buf; + } + + CommonEvent(std::function arena_allocator, + const std::string &name_str, uint64_t start_ns, uint64_t end_ns, + EventRole role, TracerEventType type) + : start_ns(start_ns), end_ns(end_ns), role(role), type(type) { + auto buf = static_cast(arena_allocator(name_str.length() + 1)); + strncpy(buf, name_str.c_str(), name_str.length() + 1); + name = buf; + } + + const char *name = nullptr; // not owned, designed for performance + uint64_t start_ns = 0; + uint64_t end_ns = 0; + EventRole role = EventRole::kOrdinary; + TracerEventType type = TracerEventType::NumTypes; + const char *attr = nullptr; // not owned, designed for performance +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/profiler/event_tracing.h b/paddle/fluid/platform/profiler/event_tracing.h index f68b4b5162a..2532077bcc3 100644 --- a/paddle/fluid/platform/profiler/event_tracing.h +++ b/paddle/fluid/platform/profiler/event_tracing.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include "paddle/fluid/platform/event.h" +#include "paddle/fluid/platform/profiler/trace_event.h" namespace paddle { namespace platform { @@ -24,8 +25,8 @@ namespace platform { // associated with it. For example, thread starts working. // Chrome Trace Viewer Format: Instant Event struct RecordInstantEvent { - explicit RecordInstantEvent(const char* name, - const EventRole role = EventRole::kOrdinary); + explicit RecordInstantEvent(const char* name, TracerEventType type, + uint32_t level = 1); }; // CPU event tracing. A trace starts when an object of this clas is created and @@ -34,13 +35,15 @@ struct RecordInstantEvent { class RecordEvent { public: explicit RecordEvent(const std::string& name, - const EventRole role = EventRole::kOrdinary); + const EventRole role = EventRole::kOrdinary, + uint32_t level = 1); explicit RecordEvent(const char* name, - const EventRole role = EventRole::kOrdinary); + const EventRole role = EventRole::kOrdinary, + uint32_t level = 1); RecordEvent(const std::string& name, const EventRole role, - const std::string& attr); + const std::string& attr, uint32_t level = 1); // Stop event tracing explicitly before the object goes out of scope. // Sometimes it's inconvenient to use RAII diff --git a/paddle/fluid/platform/profiler/host_event_recorder.h b/paddle/fluid/platform/profiler/host_event_recorder.h index 071f0d65bd0..9c810dc184c 100644 --- a/paddle/fluid/platform/profiler/host_event_recorder.h +++ b/paddle/fluid/platform/profiler/host_event_recorder.h @@ -14,51 +14,17 @@ limitations under the License. 
*/ #pragma once -#include -#include #include #include -#include #include -#include "paddle/fluid/platform/event.h" +#include "paddle/fluid/framework/new_executor/workqueue/thread_data_registry.h" +#include "paddle/fluid/platform/macros.h" +#include "paddle/fluid/platform/os_info.h" +#include "paddle/fluid/platform/profiler/common_event.h" namespace paddle { namespace platform { -struct CommonEvent { - public: - CommonEvent(const char *name, uint64_t start_ns, uint64_t end_ns, - EventRole role) - : name(name), start_ns(start_ns), end_ns(end_ns), role(role) {} - - CommonEvent(std::function &arena_allocator, - const std::string &name_str, uint64_t start_ns, uint64_t end_ns, - EventRole role, const std::string &attr_str) - : start_ns(start_ns), end_ns(end_ns), role(role) { - auto buf = static_cast(arena_allocator(name_str.length() + 1)); - strncpy(buf, name_str.c_str(), name_str.length() + 1); - name = buf; - buf = static_cast(arena_allocator(attr_str.length() + 1)); - strncpy(buf, attr_str.c_str(), attr_str.length() + 1); - attr = buf; - } - - CommonEvent(const std::function &arena_allocator, - const std::string &name_str, uint64_t start_ns, uint64_t end_ns, - EventRole role) - : start_ns(start_ns), end_ns(end_ns), role(role) { - auto buf = static_cast(arena_allocator(name_str.length() + 1)); - strncpy(buf, name_str.c_str(), name_str.length() + 1); - name = buf; - } - - const char *name = nullptr; // not owned, designed for performance - uint64_t start_ns = 0; - uint64_t end_ns = 0; - EventRole role = EventRole::kOrdinary; - const char *attr = nullptr; // not owned, designed for performance -}; - template struct ContainsStdString : std::conditional_t< @@ -223,7 +189,8 @@ struct ThreadEventSection { class ThreadEventRecorder { public: - ThreadEventRecorder(); + ThreadEventRecorder() { thread_id_ = GetCurrentThreadSysId(); } + DISABLE_COPY_AND_ASSIGN(ThreadEventRecorder); public: @@ -261,34 +228,43 @@ class HostEventRecorder { return instance; } + // thread-safe // If your string argument has a longer lifetime than the Event, // use 'const char*'. e.g.: string literal, op name, etc. // Do your best to avoid using 'std::string' as the argument type. // It will cause deep-copy to harm performance. template void RecordEvent(Args &&... args) { - GetThreadLocalRecorder().RecordEvent(std::forward(args)...); + GetThreadLocalRecorder()->RecordEvent(std::forward(args)...); } + // thread-unsafe, make sure make sure there is no running tracing. 
// Poor performance, call it at the ending - HostEventSection GatherEvents(); - - void RegisterThreadRecorder(uint64_t tid, ThreadEventRecorder *recorder) { - const std::lock_guard guard(thread_recorders_lock_); - thread_recorders_[tid] = recorder; + HostEventSection GatherEvents() { + auto thr_recorders = + ThreadEventRecorderRegistry::GetInstance().GetAllThreadDataByRef(); + HostEventSection host_sec; + host_sec.process_id = GetProcessId(); + host_sec.thr_sections.reserve(thr_recorders.size()); + for (auto &kv : thr_recorders) { + auto &thr_recorder = kv.second.get(); + host_sec.thr_sections.emplace_back( + std::move(thr_recorder.GatherEvents())); + } + return host_sec; } private: + using ThreadEventRecorderRegistry = + framework::ThreadDataRegistry; + HostEventRecorder() = default; DISABLE_COPY_AND_ASSIGN(HostEventRecorder); - ThreadEventRecorder &GetThreadLocalRecorder() { - static thread_local ThreadEventRecorder tls_recorder; - return tls_recorder; + ThreadEventRecorder *GetThreadLocalRecorder() { + return ThreadEventRecorderRegistry::GetInstance() + .GetMutableCurrentThreadData(); } - - std::mutex thread_recorders_lock_; - std::unordered_map thread_recorders_; }; } // namespace platform diff --git a/paddle/fluid/platform/profiler/host_tracer.cc b/paddle/fluid/platform/profiler/host_tracer.cc new file mode 100644 index 00000000000..80f9a5d9af1 --- /dev/null +++ b/paddle/fluid/platform/profiler/host_tracer.cc @@ -0,0 +1,71 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/platform/profiler/host_tracer.h" +#include "glog/logging.h" +#include "paddle/fluid/platform/profiler/common_event.h" +#include "paddle/fluid/platform/profiler/host_event_recorder.h" + +namespace paddle { +namespace platform { + +namespace { + +void ProcessHostEvents(const HostEventSection& host_events, + TraceEventCollector* collector) { + for (const auto& thr_sec : host_events.thr_sections) { + uint64_t tid = thr_sec.thread_id; + for (const auto& evt : thr_sec.events) { + HostTraceEvent event; + event.name = evt.name; + event.type = evt.type; + event.start_ns = evt.start_ns; + event.end_ns = evt.end_ns; + event.process_id = host_events.process_id; + event.thread_id = tid; + collector->AddHostEvent(std::move(event)); + } + } +} + +} // namespace + +void HostTracer::StartTracing() { + PADDLE_ENFORCE_EQ( + state_ == TracerState::READY || state_ == TracerState::STOPED, true, + platform::errors::PreconditionNotMet("TracerState must be READY")); + HostEventRecorder::GetInstance().GatherEvents(); + HostTraceLevel::GetInstance().SetLevel(trace_level_); + state_ = TracerState::STARTED; +} + +void HostTracer::StopTracing() { + PADDLE_ENFORCE_EQ( + state_, TracerState::STARTED, + platform::errors::PreconditionNotMet("TracerState must be STARTED")); + HostTraceLevel::GetInstance().SetLevel(HostTraceLevel::kDisabled); + state_ = TracerState::STOPED; +} + +void HostTracer::CollectTraceData(TraceEventCollector* collector) { + PADDLE_ENFORCE_EQ( + state_, TracerState::STOPED, + platform::errors::PreconditionNotMet("TracerState must be STOPED")); + HostEventSection host_events = + HostEventRecorder::GetInstance().GatherEvents(); + ProcessHostEvents(host_events, collector); +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/profiler/host_tracer.h b/paddle/fluid/platform/profiler/host_tracer.h new file mode 100644 index 00000000000..c73b5eca15f --- /dev/null +++ b/paddle/fluid/platform/profiler/host_tracer.h @@ -0,0 +1,63 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/platform/profiler/tracer_base.h" + +namespace paddle { +namespace platform { + +class HostTraceLevel { + public: + static constexpr int64_t kDisabled = -1; + + static HostTraceLevel& GetInstance() { + static HostTraceLevel instance; + return instance; + } + + bool NeedTrace(uint32_t level) { + return trace_level_ >= static_cast(level); + } + + void SetLevel(int64_t trace_level) { trace_level_ = trace_level; } + + private: + // Verbose trace level, works like VLOG(level) + int trace_level_ = kDisabled; +}; + +struct HostTracerOptions { + uint32_t trace_level = 0; +}; + +class HostTracer : public TracerBase { + public: + explicit HostTracer(const HostTracerOptions& options) { + trace_level_ = options.trace_level; + } + + void StartTracing() override; + + void StopTracing() override; + + void CollectTraceData(TraceEventCollector* collector) override; + + private: + uint32_t trace_level_; +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/profiler/profiler.cc b/paddle/fluid/platform/profiler/profiler.cc new file mode 100644 index 00000000000..e9f0eb98d53 --- /dev/null +++ b/paddle/fluid/platform/profiler/profiler.cc @@ -0,0 +1,76 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/platform/profiler/profiler.h" +#include "glog/logging.h" +#ifdef PADDLE_WITH_CUDA +#include +#endif +#ifdef PADDLE_WITH_HIP +#include +#endif +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#endif +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/profiler/host_tracer.h" + +namespace paddle { +namespace platform { + +void SynchronizeAllDevice(); + +std::atomic Profiler::alive_{false}; + +std::unique_ptr Profiler::Create(const ProfilerOptions& options) { + if (alive_.exchange(true)) { + return nullptr; + } + return std::unique_ptr(new Profiler(options)); +} + +Profiler::Profiler(const ProfilerOptions& options) { + options_ = options; + HostTracerOptions host_tracer_options; + host_tracer_options.trace_level = options.trace_level; + tracers_.emplace_back(new HostTracer(host_tracer_options), true); +} + +Profiler::~Profiler() { alive_.store(false); } + +void Profiler::Prepare() { + for (auto& tracer : tracers_) { + tracer.Get().PrepareTracing(); + } +} + +void Profiler::Start() { + SynchronizeAllDevice(); + for (auto& tracer : tracers_) { + tracer.Get().StartTracing(); + } +} + +TraceEventCollector Profiler::Stop() { + SynchronizeAllDevice(); + TraceEventCollector collector; + for (auto& tracer : tracers_) { + tracer.Get().StopTracing(); + tracer.Get().CollectTraceData(&collector); + } + return collector; +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/profiler/profiler.h b/paddle/fluid/platform/profiler/profiler.h new file mode 100644 index 00000000000..1324d81f959 --- /dev/null +++ b/paddle/fluid/platform/profiler/profiler.h @@ -0,0 +1,74 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include +#include +#include +#include "paddle/fluid/platform/macros.h" +#include "paddle/fluid/platform/profiler/trace_event_collector.h" +#include "paddle/fluid/platform/profiler/tracer_base.h" + +namespace paddle { +namespace platform { + +struct ProfilerOptions { + uint32_t trace_level = 0; +}; + +class Profiler { + public: + static std::unique_ptr Create(const ProfilerOptions& options); + + void Prepare(); + + void Start(); + + TraceEventCollector Stop(); + + ~Profiler(); + + private: + class TracerHolder { + public: + TracerHolder(TracerBase* tracer, bool owned) + : tracer(tracer), owned(owned) {} + ~TracerHolder() { + if (owned) { + delete tracer; + } + } + + TracerBase& Get() { return *tracer; } + + private: + TracerBase* tracer; + bool owned; + }; + + explicit Profiler(const ProfilerOptions& options); + + DISABLE_COPY_AND_ASSIGN(Profiler); + + static std::atomic alive_; + ProfilerOptions options_; + uint64_t start_ns_ = UINT64_MAX; + std::list tracers_; +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/profiler/profiler_test.cc b/paddle/fluid/platform/profiler/profiler_test.cc new file mode 100644 index 00000000000..414987d2f10 --- /dev/null +++ b/paddle/fluid/platform/profiler/profiler_test.cc @@ -0,0 +1,52 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include "glog/logging.h" +#include "gtest/gtest.h" +#ifdef PADDLE_WITH_CUDA +#include +#endif +#ifdef PADDLE_WITH_HIP +#include +#endif +#include "paddle/fluid/platform/profiler/event_tracing.h" +#include "paddle/fluid/platform/profiler/profiler.h" + +TEST(ProfilerTest, TestHostTracer) { + using paddle::platform::ProfilerOptions; + using paddle::platform::Profiler; + using paddle::platform::RecordInstantEvent; + using paddle::platform::TracerEventType; + ProfilerOptions options; + options.trace_level = 2; + auto profiler = Profiler::Create(options); + EXPECT_TRUE(profiler); + profiler->Prepare(); + profiler->Start(); + { + RecordInstantEvent("TestTraceLevel_record1", TracerEventType::UserDefined, + 2); + RecordInstantEvent("TestTraceLevel_record2", TracerEventType::UserDefined, + 3); + } + auto collector = profiler->Stop(); + std::set host_events; + for (const auto evt : collector.HostEvents()) { + host_events.insert(evt.name); + } + EXPECT_EQ(host_events.count("TestTraceLevel_record1"), 1u); + EXPECT_EQ(host_events.count("TestTraceLevel_record2"), 0u); +} diff --git a/paddle/fluid/platform/profiler/trace_event.h b/paddle/fluid/platform/profiler/trace_event.h index e676942c458..1f146adf4f7 100644 --- a/paddle/fluid/platform/profiler/trace_event.h +++ b/paddle/fluid/platform/profiler/trace_event.h @@ -76,26 +76,28 @@ struct KernelEventInfo { uint64_t completed; }; +static constexpr size_t kMemKindMaxLen = 50; + struct MemcpyEventInfo { // The number of bytes transferred by the memory copy. uint64_t num_bytes; // The kind of the memory copy. 
// Each kind represents the source and destination targets of a memory copy. // Targets are host, device, and array. Refer to CUpti_ActivityMemcpyKind - std::string copy_kind; + // std::string copy_kind; // The source memory kind read by the memory copy. // Each kind represents the type of the memory accessed by a memory // operation/copy. Refer to CUpti_ActivityMemoryKind - std::string src_kind; + char src_kind[kMemKindMaxLen]; // The destination memory kind read by the memory copy. - std::string dst_kind; + char dst_kind[kMemKindMaxLen]; }; struct MemsetEventInfo { // The number of bytes being set by the memory set. uint64_t num_bytes; // The memory kind of the memory set. Refer to CUpti_ActivityMemoryKind - std::string memory_kind; + char memory_kind[kMemKindMaxLen]; // the value being assigned to memory by the memory set. uint32_t value; }; diff --git a/paddle/fluid/platform/profiler/trace_event_collector.h b/paddle/fluid/platform/profiler/trace_event_collector.h index eabafb73542..30b32220d9f 100644 --- a/paddle/fluid/platform/profiler/trace_event_collector.h +++ b/paddle/fluid/platform/profiler/trace_event_collector.h @@ -15,50 +15,37 @@ limitations under the License. */ #pragma once #include +#include "paddle/fluid/platform/profiler/trace_event.h" namespace paddle { namespace platform { -struct HostRecord { - std::string name; - uint64_t start_ns; - uint64_t end_ns; - uint64_t process_id; - uint64_t thread_id; -}; +class TraceEventCollector { + public: + void AddHostEvent(HostTraceEvent&& event) { host_events_.push_back(event); } -struct RuntimeRecord { - std::string name; - uint64_t start_ns; - uint64_t end_ns; - uint64_t process_id; - uint64_t thread_id; - uint32_t correlation_id; -}; + void AddRuntimeEvent(RuntimeTraceEvent&& event) { + runtime_events_.push_back(event); + } -struct DeviceRecord { - std::string name; - uint64_t start_ns; - uint64_t end_ns; - uint32_t correlation_id; -}; + void AddDeviceEvent(DeviceTraceEvent&& event) { + device_events_.push_back(event); + } -class TraceEventCollector { - public: - void AddHostRecord(HostRecord&& record) { host_records_.push_back(record); } + const std::list& HostEvents() const { return host_events_; } - void AddRuntimeRecord(RuntimeRecord&& record) { - runtime_records_.push_back(record); + const std::list& RuntimeEvents() const { + return runtime_events_; } - void AddDeviceRecord(DeviceRecord&& record) { - device_records_.push_back(record); + const std::list& DeviceEvents() const { + return device_events_; } private: - std::list host_records_; - std::list runtime_records_; - std::list device_records_; + std::list host_events_; + std::list runtime_events_; + std::list device_events_; }; } // namespace platform diff --git a/paddle/fluid/platform/stream/cuda_stream.cc b/paddle/fluid/platform/stream/cuda_stream.cc index 5697bbee0bb..e3e735d03ab 100644 --- a/paddle/fluid/platform/stream/cuda_stream.cc +++ b/paddle/fluid/platform/stream/cuda_stream.cc @@ -14,6 +14,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/stream/cuda_stream.h" #include "paddle/fluid/platform/cuda_device_guard.h" +#include "paddle/fluid/platform/device/gpu/gpu_types.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" @@ -116,11 +117,8 @@ CUDAStream* get_current_stream(int deviceId) { platform::Place device = CUDAPlace(deviceId); - auto stream = static_cast(pool.Get(device)) - ->context() - ->Stream() - .get(); - return stream; + return static_cast(pool.Get(device)) + ->GetCudaStream(); #else PADDLE_THROW(platform::errors::Unavailable( "Paddle is not compiled with CUDA. Cannot visit cuda current stream.")); @@ -133,12 +131,12 @@ CUDAStream* set_current_stream(CUDAStream* stream) { auto& device = stream->GetPlace(); auto& pool = platform::DeviceContextPool::Instance(); return static_cast(pool.Get(device)) - ->context() - ->SetStream(stream); + ->SetCudaStream(stream); #else PADDLE_THROW(platform::errors::Unavailable( - "Paddle is not compiled with CUDA. Cannot visit cuda current stream.")); - return nullptr; + "Paddle is not compiled with CUDA. Cannot visit cuda current" + "stream.")); + return CUDAStream(nullptr); #endif } } // namespace stream diff --git a/paddle/fluid/platform/stream/cuda_stream.h b/paddle/fluid/platform/stream/cuda_stream.h index 0683cf4b042..2b54f0861f4 100644 --- a/paddle/fluid/platform/stream/cuda_stream.h +++ b/paddle/fluid/platform/stream/cuda_stream.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include +#include #include #include "paddle/fluid/platform/device/gpu/gpu_info.h" @@ -51,24 +52,28 @@ class CUDAStream final { const StreamFlag& flag = StreamFlag::kDefaultFlag) { Init(place, priority, flag); } + explicit CUDAStream(gpuStream_t stream, const Place& place) + : place_(place), stream_(stream) { + owned_stream_ = false; + callback_manager_.reset(new StreamCallbackManager(stream_)); + } virtual ~CUDAStream() { Destroy(); } bool Init(const Place& place, const Priority& priority = Priority::kNormal, const StreamFlag& flag = StreamFlag::kDefaultFlag); - template - void AddCallback(Callback&& callback) const { + void AddCallback(std::function callback) const { callback_manager_->AddCallback(callback); } - template #ifdef PADDLE_WITH_HIP - void RecordEvent(hipEvent_t ev, Callback callback) const { + void RecordEvent(hipEvent_t ev, const std::function& callback) const { callback(); PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(ev, stream_)); } #else - void RecordEvent(cudaEvent_t ev, Callback callback) const { + void RecordEvent(cudaEvent_t ev, + const std::function& callback) const { callback(); PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(ev, stream_)); } @@ -149,6 +154,7 @@ class CUDAStream final { }; CUDAStream* get_current_stream(int deviceId); +// NOTE: There is a problem with the interface and needs to be fixed CUDAStream* set_current_stream(CUDAStream* stream); } // namespace stream diff --git a/paddle/fluid/platform/transform_test.cu b/paddle/fluid/platform/transform_test.cu index 32ec113d1f5..e26fd511aa9 100644 --- a/paddle/fluid/platform/transform_test.cu +++ b/paddle/fluid/platform/transform_test.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include +#include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/platform/transform.h" @@ -57,6 +58,10 @@ TEST(Transform, CPUUnary) { TEST(Transform, GPUUnary) { CUDAPlace gpu0(0); CUDADeviceContext ctx(gpu0); + ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(gpu0, ctx.stream()) + .get()); + ctx.PartialInitWithAllocator(); float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4}; auto gpu_allocation = Alloc(gpu0, sizeof(float) * 4); float* gpu_buf = static_cast(gpu_allocation->ptr()); @@ -84,6 +89,10 @@ TEST(Transform, GPUBinary) { int buf[4] = {1, 2, 3, 4}; CUDAPlace gpu0(0); CUDADeviceContext ctx(gpu0); + ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(gpu0, ctx.stream()) + .get()); + ctx.PartialInitWithAllocator(); auto gpu_allocation = Alloc(gpu0, sizeof(buf)); int* gpu_buf = static_cast(gpu_allocation->ptr()); Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf), ctx.stream()); diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index a6e155f70e6..be773d312a7 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -1,4 +1,4 @@ -set(PYBIND_DEPS init pybind python proto_desc memory executor fleet_wrapper box_wrapper prune +set(PYBIND_DEPS init pybind python proto_desc memory executor fleet_wrapper box_wrapper metrics prune feed_fetch_method pass generate_pass pass_builder parallel_executor profiler layer tracer engine scope_pool analysis_predictor imperative_profiler imperative_flag save_load_util dlpack_tensor device_context gloo_wrapper infer_io_utils heter_wrapper generator op_version_registry ps_gpu_wrapper custom_operator @@ -63,6 +63,7 @@ set(PYBIND_SRCS ps_gpu_wrapper_py.cc gloo_wrapper_py.cc box_helper_py.cc + metrics_py.cc data_set_py.cc imperative.cc ir.cc diff --git a/paddle/fluid/pybind/cuda_streams_py.cc b/paddle/fluid/pybind/cuda_streams_py.cc index 21571e17a2b..64c145c94f9 100644 --- a/paddle/fluid/pybind/cuda_streams_py.cc +++ b/paddle/fluid/pybind/cuda_streams_py.cc @@ -34,7 +34,7 @@ void BindCudaStream(py::module *m_ptr) { return paddle::platform::stream::get_current_stream(deviceId); #else PADDLE_THROW(platform::errors::Unavailable( - "Paddle is not compiled with CUDA. Cannot visit cuda current " + "Paddle is not compiled with CUDA. Cannot visit cuda current" "stream.")); #endif }, @@ -119,7 +119,7 @@ void BindCudaStream(py::module *m_ptr) { [](paddle::platform::stream::CUDAStream &self, paddle::platform::stream::CUDAStream &stream) { paddle::platform::CudaEvent event; - event.Record(stream); + event.Record(stream.raw_stream()); self.WaitEvent(event.GetRawCudaEvent()); }, @@ -179,7 +179,7 @@ void BindCudaStream(py::module *m_ptr) { if (event == nullptr) { event = new paddle::platform::CudaEvent(); } - event->Record(self); + event->Record(self.raw_stream()); return event; }, R"DOC( @@ -321,7 +321,7 @@ void BindCudaStream(py::module *m_ptr) { if (stream == nullptr) { stream = paddle::platform::stream::get_current_stream(-1); } - self.Record(*stream); + self.Record(stream->raw_stream()); }, R"DOC( Records the event in the given stream. 
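// CudaEvent::Record() now takes a raw gpuStream_t instead of a
// stream::CUDAStream reference, which is why the pybind call sites above pass
// stream->raw_stream(). A minimal sketch of recording and waiting on an event
// with the new signature (illustrative; uses only APIs touched in this patch):
//
//   auto* stream = platform::stream::get_current_stream(/*deviceId=*/0);
//   platform::CudaEvent event;
//   event.Record(stream->raw_stream());   // the old overload took a CUDAStream&
//   stream->WaitEvent(event.GetRawCudaEvent());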
diff --git a/paddle/fluid/pybind/data_set_py.cc b/paddle/fluid/pybind/data_set_py.cc index 562047a0c0c..5e2274cb651 100644 --- a/paddle/fluid/pybind/data_set_py.cc +++ b/paddle/fluid/pybind/data_set_py.cc @@ -271,6 +271,8 @@ void BindDataset(py::module *m) { py::call_guard()) .def("set_merge_by_sid", &framework::Dataset::SetMergeBySid, py::call_guard()) + .def("set_shuffle_by_uid", &framework::Dataset::SetShuffleByUid, + py::call_guard()) .def("preprocess_instance", &framework::Dataset::PreprocessInstance, py::call_guard()) .def("postprocess_instance", &framework::Dataset::PostprocessInstance, diff --git a/paddle/fluid/pybind/eager.cc b/paddle/fluid/pybind/eager.cc index 0aacbe5e325..6ace8159426 100644 --- a/paddle/fluid/pybind/eager.cc +++ b/paddle/fluid/pybind/eager.cc @@ -24,7 +24,7 @@ limitations under the License. */ #include "paddle/fluid/pybind/eager.h" #include "paddle/fluid/pybind/eager_utils.h" #include "paddle/pten/common/data_type.h" -#include "paddle/pten/core/convert_utils.h" +#include "paddle/pten/core/compat/convert_utils.h" #include "paddle/pten/core/dense_tensor.h" #include "pybind11/detail/internals.h" #include "pybind11/numpy.h" diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc index 44fc9593839..a5167e9ebae 100644 --- a/paddle/fluid/pybind/eager_functions.cc +++ b/paddle/fluid/pybind/eager_functions.cc @@ -32,7 +32,7 @@ limitations under the License. */ #include "paddle/pten/api/lib/utils/storage.h" #include "paddle/pten/api/lib/utils/tensor_utils.h" #include "paddle/pten/common/data_type.h" -#include "paddle/pten/core/convert_utils.h" +#include "paddle/pten/core/compat/convert_utils.h" #include "paddle/pten/core/dense_tensor.h" namespace paddle { diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index 4835d8873af..414c60adf03 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -29,7 +29,7 @@ limitations under the License. */ #include "paddle/fluid/pybind/exception.h" #include "paddle/pten/api/include/api.h" #include "paddle/pten/common/data_type.h" -#include "paddle/pten/core/convert_utils.h" +#include "paddle/pten/core/compat/convert_utils.h" #include "paddle/pten/core/dense_tensor.h" namespace paddle { namespace pybind { diff --git a/paddle/fluid/pybind/eager_properties.cc b/paddle/fluid/pybind/eager_properties.cc index 5f1d809168a..d8dac3c6287 100644 --- a/paddle/fluid/pybind/eager_properties.cc +++ b/paddle/fluid/pybind/eager_properties.cc @@ -26,7 +26,7 @@ limitations under the License. */ #include "paddle/fluid/pybind/eager_utils.h" #include "paddle/fluid/pybind/exception.h" #include "paddle/pten/common/data_type.h" -#include "paddle/pten/core/convert_utils.h" +#include "paddle/pten/core/compat/convert_utils.h" #include "paddle/pten/core/dense_tensor.h" #pragma GCC diagnostic ignored "-Wwrite-strings" diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index 1e0697246e9..0f21bd4ae37 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -24,7 +24,7 @@ limitations under the License. 
*/ #include "paddle/fluid/pybind/op_function_common.h" #include "paddle/fluid/pybind/tensor_py.h" #include "paddle/pten/common/data_type.h" -#include "paddle/pten/core/convert_utils.h" +#include "paddle/pten/core/compat/convert_utils.h" #include "paddle/pten/core/dense_tensor.h" namespace paddle { diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index b9f11923d18..61b8f1fe010 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -2305,9 +2305,9 @@ void BindImperative(py::module *m_ptr) { auto outs_map = ConvertToNameVarBaseMap(outs); { py::gil_scoped_release release; - self.TraceOp(type, std::move(ins_map), std::move(outs_map), - std::move(attrs), place, trace_backward, - inplace_map); + self.TraceOp( + type, std::move(ins_map), std::move(outs_map), + std::move(attrs), place, trace_backward, inplace_map); } }) .def("trace", @@ -2320,9 +2320,9 @@ void BindImperative(py::module *m_ptr) { auto outs_map = ConvertToNameVarBaseMap(outs); { py::gil_scoped_release release; - self.TraceOp(type, std::move(ins_map), std::move(outs_map), - std::move(attrs), place, trace_backward, - inplace_map); + self.TraceOp( + type, std::move(ins_map), std::move(outs_map), + std::move(attrs), place, trace_backward, inplace_map); } }) .def("trace", @@ -2335,9 +2335,9 @@ void BindImperative(py::module *m_ptr) { auto outs_map = ConvertToNameVarBaseMap(outs); { py::gil_scoped_release release; - self.TraceOp(type, std::move(ins_map), std::move(outs_map), - std::move(attrs), place, trace_backward, - inplace_map); + self.TraceOp( + type, std::move(ins_map), std::move(outs_map), + std::move(attrs), place, trace_backward, inplace_map); } }) .def("trace", @@ -2350,9 +2350,9 @@ void BindImperative(py::module *m_ptr) { auto outs_map = ConvertToNameVarBaseMap(outs); { py::gil_scoped_release release; - self.TraceOp(type, std::move(ins_map), std::move(outs_map), - std::move(attrs), place, trace_backward, - inplace_map); + self.TraceOp( + type, std::move(ins_map), std::move(outs_map), + std::move(attrs), place, trace_backward, inplace_map); } }) .def("trace", @@ -2365,9 +2365,9 @@ void BindImperative(py::module *m_ptr) { auto outs_map = ConvertToNameVarBaseMap(outs); { py::gil_scoped_release release; - self.TraceOp(type, std::move(ins_map), std::move(outs_map), - std::move(attrs), place, trace_backward, - inplace_map); + self.TraceOp( + type, std::move(ins_map), std::move(outs_map), + std::move(attrs), place, trace_backward, inplace_map); } }); diff --git a/paddle/fluid/pybind/metrics_py.cc b/paddle/fluid/pybind/metrics_py.cc new file mode 100644 index 00000000000..79ab416eb50 --- /dev/null +++ b/paddle/fluid/pybind/metrics_py.cc @@ -0,0 +1,55 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include + +#ifdef _POSIX_C_SOURCE +#undef _POSIX_C_SOURCE +#endif + +#ifdef _XOPEN_SOURCE +#undef _XOPEN_SOURCE +#endif + +#include +#include +#include + +#include "google/protobuf/text_format.h" +#include "paddle/fluid/framework/fleet/metrics.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/io.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/variant.h" +#include "paddle/fluid/pybind/metrics_py.h" + +namespace py = pybind11; + +#if defined(PADDLE_WITH_PSLIB) +namespace paddle { +namespace pybind { +void BindMetrics(py::module* m) { + py::class_>(*m, + "Metric") + .def(py::init([]() { return framework::Metric::SetInstance(); })) + .def("init_metric", &framework::Metric::InitMetric, + py::call_guard()) + .def("flip_phase", &framework::Metric::FlipPhase, + py::call_guard()) + .def("get_metric_msg", &framework::Metric::GetMetricMsg, + py::call_guard()) + .def("get_wuauc_metric_msg", &framework::Metric::GetWuAucMetricMsg, + py::call_guard()) + .def("get_metric_name_list", &framework::Metric::GetMetricNameList, + py::call_guard()); +} // end Metrics +} // end namespace pybind +} // end namespace paddle +#endif diff --git a/paddle/fluid/pybind/metrics_py.h b/paddle/fluid/pybind/metrics_py.h new file mode 100644 index 00000000000..fc48e3b0f25 --- /dev/null +++ b/paddle/fluid/pybind/metrics_py.h @@ -0,0 +1,28 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" + +namespace py = pybind11; + +#if defined(PADDLE_WITH_PSLIB) +namespace paddle { +namespace pybind { +void BindMetrics(py::module* m); +} // namespace pybind +} // namespace paddle +#endif diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 3e94ccfd497..9a535f5fb04 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -75,6 +75,7 @@ limitations under the License. */ #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/pybind/cuda_streams_py.h" +#include "paddle/pten/core/compat/convert_utils.h" #include "paddle/pten/core/lod_utils.h" #ifndef PADDLE_ON_INFERENCE #include "paddle/fluid/pybind/eager.h" @@ -100,6 +101,7 @@ limitations under the License. 
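[Editor's note] The new metrics_py.cc above exposes the PSLIB metrics singleton to Python through pybind11, binding each method with py::call_guard<py::gil_scoped_release> and returning the shared instance from the Python-side constructor. The sketch below shows that singleton-backed binding pattern in a self-contained form; the Metric class here, the module name metrics_demo, and the method bodies are illustrative placeholders rather than Paddle's framework::Metric API, and the sketch assumes pybind11 is available.

// Requires pybind11; built as a Python extension module.
#include <memory>
#include <string>
#include <pybind11/pybind11.h>

namespace py = pybind11;

// Illustrative singleton mirroring how BindMetrics exposes a shared instance:
// the Python-side constructor returns the process-wide C++ object.
class Metric {
 public:
  static std::shared_ptr<Metric> SetInstance() {
    static std::shared_ptr<Metric> inst(new Metric());
    return inst;
  }
  void InitMetric(const std::string& name) { name_ = name; }
  std::string GetMetricMsg() const { return "metric: " + name_; }

 private:
  Metric() = default;
  std::string name_;
};

PYBIND11_MODULE(metrics_demo, m) {
  py::class_<Metric, std::shared_ptr<Metric>>(m, "Metric")
      .def(py::init([]() { return Metric::SetInstance(); }))
      .def("init_metric", &Metric::InitMetric,
           py::call_guard<py::gil_scoped_release>())
      .def("get_metric_msg", &Metric::GetMetricMsg,
           py::call_guard<py::gil_scoped_release>());
}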
*/ #include "paddle/fluid/pybind/imperative.h" #include "paddle/fluid/pybind/inference_api.h" #include "paddle/fluid/pybind/ir.h" +#include "paddle/fluid/pybind/metrics_py.h" #include "paddle/fluid/pybind/ps_gpu_wrapper_py.h" #include "paddle/fluid/pybind/pybind_boost_headers.h" @@ -714,21 +716,61 @@ PYBIND11_MODULE(core_noavx, m) { m.def("_get_use_default_grad_op_desc_maker_ops", [] { return OpInfoMap::Instance().GetUseDefaultGradOpDescMakerOps(); }); - m.def("_get_all_register_op_kernels", [] { - auto &all_kernels = paddle::framework::OperatorWithKernel::AllOpKernels(); - std::unordered_map> all_kernels_info; - for (auto &kernel_pair : all_kernels) { - auto op_type = kernel_pair.first; - std::vector kernel_types; - for (auto &info_pair : kernel_pair.second) { - paddle::framework::OpKernelType kernel_type = info_pair.first; - kernel_types.push_back( - paddle::framework::KernelTypeToString(kernel_type)); - } - all_kernels_info.emplace(op_type, kernel_types); - } - return all_kernels_info; - }); + m.def( + "_get_all_register_op_kernels", + [](const std::string &lib) { + std::unordered_map> + all_kernels_info; + if (lib == "fluid" || lib == "all") { + auto &all_kernels = + paddle::framework::OperatorWithKernel::AllOpKernels(); + + for (auto &kernel_pair : all_kernels) { + auto op_type = kernel_pair.first; + std::vector kernel_types; + for (auto &info_pair : kernel_pair.second) { + paddle::framework::OpKernelType kernel_type = info_pair.first; + kernel_types.emplace_back( + paddle::framework::KernelTypeToString(kernel_type)); + } + all_kernels_info.emplace(op_type, kernel_types); + } + } + if (lib == "pten" || lib == "all") { + auto pten_kernels = pten::KernelFactory::Instance().kernels(); + for (auto &kernel_pair : pten_kernels) { + auto op_type = pten::TransToFluidOpName(kernel_pair.first); + std::vector kernel_types; + for (auto &info_pair : kernel_pair.second) { + framework::OpKernelType kernel_type = + framework::TransPtenKernelKeyToOpKernelType(info_pair.first); + auto kernel_type_str = framework::KernelTypeToString(kernel_type); + if (all_kernels_info.count(op_type)) { + if (std::find(all_kernels_info[op_type].begin(), + all_kernels_info[op_type].end(), + kernel_type_str) == + all_kernels_info[op_type].end()) { + all_kernels_info[op_type].emplace_back(kernel_type_str); + } + } else { + kernel_types.emplace_back(kernel_type_str); + } + } + if (!kernel_types.empty()) { + all_kernels_info.emplace(op_type, kernel_types); + } + } + } + + return all_kernels_info; + }, + py::arg("lib") = "all", + R"DOC( + Return the registered kernels in paddle. + + Args: + lib[string]: the libarary, could be 'pten', 'fluid' and 'all'. + )DOC"); // NOTE(zjl): ctest would load environment variables at the beginning even // though we have not `import paddle.fluid as fluid`. 
So we add this API @@ -972,7 +1014,8 @@ PYBIND11_MODULE(core_noavx, m) { PADDLE_ENFORCE_EQ( CheckLoD(new_offset_lod, -1), true, platform::errors::InvalidArgument( - "The provided recursive_sequence_lengths info is invalid, " + "The provided recursive_sequence_lengths info is " + "invalid, " "the LoD converted by recursive_sequence_lengths is %s", new_lod)); new (&instance) framework::Tensor(new_offset_lod); @@ -1034,7 +1077,8 @@ PYBIND11_MODULE(core_noavx, m) { PADDLE_ENFORCE_EQ( CheckLoD(new_offset_lod, vectorize(self.dims()).front()), true, platform::errors::InvalidArgument( - "The provided recursive_sequence_lengths info is invalid, " + "The provided recursive_sequence_lengths info is " + "invalid, " "the LoD converted by recursive_sequence_lengths is " "%s", new_lod)); @@ -1552,7 +1596,20 @@ All parameter, weight, gradient are variables in Paddle. .def_static("create", [](paddle::platform::CPUPlace& place) -> paddle::platform::DeviceContext* { - return new paddle::platform::CPUDeviceContext(); + auto* context = new paddle::platform::CPUDeviceContext(); + context->SetAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place) + .get()); + context->SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + context->SetZeroAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetZeroAllocator(place) + .get()); + return context; }) .def_static("create", [](paddle::platform::XPUPlace& place) @@ -1563,7 +1620,20 @@ All parameter, weight, gradient are variables in Paddle. "Cannot use XPUPlace in CPU/GPU version, " "Please recompile or reinstall Paddle with XPU support.")); #else - return new paddle::platform::XPUDeviceContext(place); + auto* context = new paddle::platform::XPUDeviceContext(place); + context->SetAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place) + .get()); + context->SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + context->SetZeroAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetZeroAllocator(place) + .get()); + return context; #endif }) .def_static("create", @@ -1599,7 +1669,21 @@ All parameter, weight, gradient are variables in Paddle. "Cannot use CUDAPlace in CPU only version, " "Please recompile or reinstall Paddle with CUDA support.")); #else - return new paddle::platform::CUDADeviceContext(place); + auto* context = new paddle::platform::CUDADeviceContext(place); + context->SetAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place, context->stream()) + .get()); + context->SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + context->SetZeroAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetZeroAllocator(place) + .get()); + context->PartialInitWithAllocator(); + return context; #endif }) .def_static("create", @@ -2109,20 +2193,20 @@ All parameter, weight, gradient are variables in Paddle. 
.def("__str__", string::to_string); py::class_(m, "Operator") - .def_static( - "create", - [](py::bytes protobin) { - proto::OpDesc desc; - PADDLE_ENFORCE_EQ(desc.ParsePartialFromString(protobin), true, - platform::errors::InvalidArgument( - "Cannot parse user input to OpDesc")); - PADDLE_ENFORCE_EQ( - desc.IsInitialized(), true, - platform::errors::InvalidArgument( - "The provided OpDesc is not initialized, the reason is: %s", - desc.InitializationErrorString())); - return OpRegistry::CreateOp(desc); - }) + .def_static("create", + [](py::bytes protobin) { + proto::OpDesc desc; + PADDLE_ENFORCE_EQ(desc.ParsePartialFromString(protobin), + true, + platform::errors::InvalidArgument( + "Cannot parse user input to OpDesc")); + PADDLE_ENFORCE_EQ(desc.IsInitialized(), true, + platform::errors::InvalidArgument( + "The provided OpDesc is not " + "initialized, the reason is: %s", + desc.InitializationErrorString())); + return OpRegistry::CreateOp(desc); + }) .def("run", [](OperatorBase &self, const Scope &scope, const platform::CPUPlace &place) { @@ -2704,9 +2788,9 @@ All parameter, weight, gradient are variables in Paddle. m.def("register_pass", [](const std::string &pass_type, py::object callable) { PADDLE_ENFORCE_EQ( framework::ir::PassRegistry::Instance().Has(pass_type), false, - platform::errors::AlreadyExists( - "Pass '%s' is registered more than once. Please use another name.", - pass_type)); + platform::errors::AlreadyExists("Pass '%s' is registered more than " + "once. Please use another name.", + pass_type)); callable.inc_ref(); framework::ir::PassRegistry::Instance().Insert(pass_type, [pass_type, callable]() { @@ -3678,6 +3762,7 @@ All parameter, weight, gradient are variables in Paddle. #if defined(PADDLE_WITH_PSLIB) && !defined(PADDLE_WITH_HETERPS) BindHeterWrapper(&m); + BindMetrics(&m); #endif #ifdef PADDLE_WITH_HETERPS BindPSGPUWrapper(&m); diff --git a/paddle/pten/api/CMakeLists.txt b/paddle/pten/api/CMakeLists.txt index 0491363eda7..a993cb3ff80 100644 --- a/paddle/pten/api/CMakeLists.txt +++ b/paddle/pten/api/CMakeLists.txt @@ -1,3 +1,2 @@ add_subdirectory(lib) - -cc_library(pten_api SRCS all.cc DEPS pten_function_api pten_bw_function_api utils_api) +cc_library(pten_api SRCS all.cc DEPS pten_function_api pten_bw_function_api manual_api sparse_api) diff --git a/paddle/pten/api/all.h b/paddle/pten/api/all.h index 5744b18c4d2..a327bd998cb 100644 --- a/paddle/pten/api/all.h +++ b/paddle/pten/api/all.h @@ -26,8 +26,9 @@ limitations under the License. */ // new pten apis #include "paddle/pten/api/include/api.h" +#include "paddle/pten/api/include/manual_api.h" +#include "paddle/pten/api/include/sparse_api.h" #include "paddle/pten/api/include/tensor.h" -#include "paddle/pten/api/include/utils.h" // pten common headers #include "paddle/pten/common/backend.h" diff --git a/paddle/pten/api/ext/op_meta_info.h b/paddle/pten/api/ext/op_meta_info.h index ac37d724698..a8f12bad187 100644 --- a/paddle/pten/api/ext/op_meta_info.h +++ b/paddle/pten/api/ext/op_meta_info.h @@ -17,6 +17,7 @@ limitations under the License. 
*/ #include #include #include +#include #include #include "paddle/pten/api/ext/dll_decl.h" @@ -76,37 +77,66 @@ inline std::string Vec(const std::string& t_name) { return result; } +PADDLE_API void AssignTensorImpl(const Tensor& src, Tensor* dst); + +////////////////////// Kernel Context //////////////////////// + +class PADDLE_API CustomOpKernelContext { + public: + CustomOpKernelContext() = default; + + void EmplaceBackInput(Tensor&& input); + void EmplaceBackInputs(std::vector&& inputs); + void EmplaceBackOutput(Tensor&& output); + void EmplaceBackOutputs(std::vector&& outputs); + void EmplaceBackAttr(paddle::any attr); + + const std::pair& InputRangeAt(size_t idx) const; + const std::pair& OutputRangeAt(size_t idx) const; + + const Tensor& InputAt(size_t idx) const; + std::vector InputsBetween(size_t start, size_t end) const; + + Tensor* MutableOutputAt(size_t idx); + std::vector MutableOutputBetweeen(size_t start, size_t end); + std::vector* AllMutableOutput(); + + template + AttrType AttrAt(size_t idx) const { + try { + return paddle::any_cast(attrs_.at(idx)); + } catch (paddle::bad_any_cast&) { + PD_THROW("Attribute cast error in Custom Op Kernel Context."); + } + } + + private: + // TODO(chenweihang): replaced be SmallVector + std::vector inputs_; + std::vector outputs_; + std::vector attrs_; + + std::vector> input_range_; + std::vector> output_range_; +}; + ////////////////////// Kernel Function (PD_KERNEL) //////////////////////// // Record Op kernel core function -using KernelFunc = - std::vector (*)(const std::vector& inputs, - const std::vector>& vec_inputs, - const std::vector& attrs); - -#define PD_SPECIALIZE_ComputeCallHelper(attr_type) \ - template \ - struct ComputeCallHelper { \ - template \ - static Return Compute(const std::vector& inputs, \ - const std::vector>& vec_inputs, \ - const std::vector& attrs, \ - const PreviousArgs&... pargs) { \ - try { \ - attr_type arg = paddle::any_cast(attrs[attr_idx]); \ - return ComputeCallHelper::template Compute( \ - inputs, vec_inputs, attrs, pargs..., arg); \ - } catch (paddle::bad_any_cast&) { \ - PD_THROW( \ - "Attribute cast error in custom operator. Expected " #attr_type \ - " value."); \ - } \ - } \ +using KernelFunc = void (*)(CustomOpKernelContext*); + +#define PD_SPECIALIZE_ComputeCallHelper(attr_type) \ + template \ + struct ComputeCallHelper { \ + template \ + static void Compute(CustomOpKernelContext* ctx, \ + const PreviousArgs&... pargs) { \ + attr_type arg = ctx->AttrAt(attr_idx); \ + ComputeCallHelper< \ + Tail...>::template Compute(ctx, \ + pargs..., \ + arg); \ + } \ } template @@ -117,11 +147,8 @@ struct KernelFuncImpl; template struct KernelFuncImpl { - static Return Compute(const std::vector& inputs, - const std::vector>& vec_inputs, - const std::vector& attrs) { - return ComputeCallHelper>::template Compute<0, 0, 0>( - inputs, vec_inputs, attrs); + static void Compute(CustomOpKernelContext* ctx) { + ComputeCallHelper>::template Compute<0, 0, 0>(ctx); } private: @@ -130,37 +157,29 @@ struct KernelFuncImpl { template struct ComputeCallHelper { - template - static Return Compute(const std::vector& inputs, - const std::vector>& vec_inputs, - const std::vector& attrs, - const PreviousArgs&... pargs) { - const Tensor& arg = inputs[in_idx]; - return ComputeCallHelper::template Compute( - inputs, vec_inputs, attrs, pargs..., arg); + template + static void Compute(CustomOpKernelContext* ctx, + const PreviousArgs&... 
pargs) { + auto& range = ctx->InputRangeAt(in_idx); + auto& arg = ctx->InputAt(range.first); + ComputeCallHelper< + Tail...>::template Compute(ctx, + pargs..., + arg); } }; template struct ComputeCallHelper&, Tail...> { - template - static Return Compute(const std::vector& inputs, - const std::vector>& vec_inputs, - const std::vector& attrs, - const PreviousArgs&... pargs) { - const std::vector& arg = vec_inputs[vec_in_idx]; - return ComputeCallHelper::template Compute( - inputs, vec_inputs, attrs, pargs..., arg); + template + static void Compute(CustomOpKernelContext* ctx, + const PreviousArgs&... pargs) { + auto& range = ctx->InputRangeAt(in_idx); + auto arg = ctx->InputsBetween(range.first, range.second); + ComputeCallHelper< + Tail...>::template Compute(ctx, + pargs..., + arg); } }; @@ -194,15 +213,76 @@ struct KernelFuncImpl { PD_SPECIALIZE_ComputeCallHelper(std::vector); PD_SPECIALIZE_ComputeCallHelper(std::vector); + template + struct ComputeCallHelper { + template + static void Compute(CustomOpKernelContext* ctx, + const PreviousArgs&... pargs) { + auto& range = ctx->OutputRangeAt(out_idx); + auto* arg = ctx->MutableOutputAt(range.first); + ComputeCallHelper< + Tail...>::template Compute(ctx, + pargs..., + arg); + } + }; + + // TODO(chenweihang): What is the appropriate output form? + // std::vector*? or std::vector? or std::vector* + template + struct ComputeCallHelper, Tail...> { + template + static void Compute(CustomOpKernelContext* ctx, + const PreviousArgs&... pargs) { + auto& range = ctx->OutputRangeAt(out_idx); + auto arg = ctx->MutableOutputBetweeen(range.first, range.second); + ComputeCallHelper< + Tail...>::template Compute(ctx, + pargs..., + arg); + } + }; + + template + struct ComputeReturnHelper; + + // For compatibility with the original custom op form + template + struct ComputeReturnHelper> { + static void Compute(CustomOpKernelContext* ctx, const Args&... args) { + static_assert(out_idx == 0, + "If return std::vector in Custom OpKernel, " + "you cannot pass output by kernel funciton argument."); + auto outs = impl_fn(args...); + auto* orig_outs = ctx->AllMutableOutput(); + PD_CHECK(orig_outs->size() == outs.size(), + "The number of element in custom operator outputs is wrong, " + "expected contains ", + orig_outs->size(), + " Tensors, but actually contains ", + outs.size(), + " Tensors."); + for (size_t i = 0; i < outs.size(); ++i) { + AssignTensorImpl(outs.at(i), &(orig_outs->at(i))); + } + } + }; + + template + struct ComputeReturnHelper { + static void Compute(CustomOpKernelContext* ctx, const Args&... args) { + static_assert(out_idx > 0, "Custom OpKernel has no output."); + impl_fn(args...); + } + }; + // end: base template template struct ComputeCallHelper> { - template - static Return Compute(const std::vector& inputs, - const std::vector>& vec_inputs, - const std::vector& attrs, - const Args&... args) { - return impl_fn(args...); + template + static void Compute(CustomOpKernelContext* ctx, + const PreviousArgs&... pargs) { + ComputeReturnHelper::Compute(ctx, pargs...); } }; }; diff --git a/paddle/pten/api/include/kernel_signature.h b/paddle/pten/api/include/kernel_signature.h deleted file mode 100644 index 863adbea36a..00000000000 --- a/paddle/pten/api/include/kernel_signature.h +++ /dev/null @@ -1,128 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
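[Editor's note] The op_meta_info.h rework above replaces the (inputs, vec_inputs, attrs) kernel signature with a single CustomOpKernelContext that stores flattened inputs and outputs plus (start, end) ranges per argument. The self-contained sketch below restates that layout and shows how a kernel would read back a single tensor, a tensor list, and an attribute; Tensor, KernelContext, and std::any (standing in for paddle::any) are simplified illustrations, not the real API.

#include <any>      // stand-in for paddle::any
#include <cstddef>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

struct Tensor {
  std::string name;
};

// Simplified mirror of CustomOpKernelContext: arguments are flattened into
// one vector, and input_range_ remembers which slice belongs to which arg.
class KernelContext {
 public:
  void EmplaceBackInput(Tensor&& t) {
    size_t index = inputs_.size();
    inputs_.emplace_back(std::move(t));
    input_range_.emplace_back(index, index + 1);
  }
  void EmplaceBackInputs(std::vector<Tensor>&& ts) {
    size_t index = inputs_.size();
    input_range_.emplace_back(index, index + ts.size());
    for (auto& t : ts) inputs_.emplace_back(std::move(t));
  }
  void EmplaceBackAttr(std::any attr) { attrs_.emplace_back(std::move(attr)); }

  const std::pair<size_t, size_t>& InputRangeAt(size_t i) const {
    return input_range_.at(i);
  }
  const Tensor& InputAt(size_t i) const { return inputs_.at(i); }
  std::vector<Tensor> InputsBetween(size_t start, size_t end) const {
    return std::vector<Tensor>(inputs_.begin() + start, inputs_.begin() + end);
  }
  template <typename T>
  T AttrAt(size_t i) const {
    return std::any_cast<T>(attrs_.at(i));
  }

 private:
  std::vector<Tensor> inputs_;
  std::vector<std::any> attrs_;
  std::vector<std::pair<size_t, size_t>> input_range_;
};

int main() {
  KernelContext ctx;
  ctx.EmplaceBackInput({"x"});              // argument 0: single tensor
  ctx.EmplaceBackInputs({{"y0"}, {"y1"}});  // argument 1: tensor list
  ctx.EmplaceBackAttr(std::any(3));         // argument 2: int attribute

  const auto& r0 = ctx.InputRangeAt(0);
  const auto& r1 = ctx.InputRangeAt(1);
  std::cout << ctx.InputAt(r0.first).name << "\n";            // x
  std::cout << ctx.InputsBetween(r1.first, r1.second).size()  // 2
            << " " << ctx.AttrAt<int>(2) << "\n";             // 2 3
  return 0;
}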
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/fluid/platform/device_context.h" -#include "paddle/pten/common/scalar.h" -#include "paddle/pten/common/scalar_array.h" -#include "paddle/pten/core/dense_tensor.h" - -// This header is used to cast kernel function from void* to original form of -// function Currnetly. -// It may be generated automatically in the future. - -namespace pten { - -using DeviceContext = paddle::platform::DeviceContext; - -using add_kernel = void (*)(const DeviceContext&, - const DenseTensor&, - const DenseTensor&, - DenseTensor*); - -using cast_kernel = void (*)(const DeviceContext&, - const DenseTensor&, - DataType, - DenseTensor*); - -using concat_kernel = void (*)(const DeviceContext&, - const std::vector&, - const Scalar&, - DenseTensor*); - -using divide_kernel = void (*)(const DeviceContext&, - const DenseTensor&, - const DenseTensor&, - DenseTensor*); - -using dot_kernel = void (*)(const DeviceContext&, - const DenseTensor&, - const DenseTensor&, - DenseTensor*); - -using flatten_kernel = - void (*)(const DeviceContext&, const DenseTensor&, int, int, DenseTensor*); - -using empty_kernel = void (*)(const DeviceContext&, - const ScalarArray&, - DenseTensor*); - -using empty_like_kernel = void (*)(const DeviceContext&, DenseTensor*); -using full_kernel = void (*)(const DeviceContext&, - const ScalarArray&, - const Scalar&, - DenseTensor*); - -using full_like_kernel = void (*)(const DeviceContext&, - const Scalar&, - DenseTensor*); - -using matmul_kernel = void (*)(const DeviceContext&, - const DenseTensor&, - const DenseTensor&, - bool, - bool, - DenseTensor*); - -using mean_kernel = void (*)(const DeviceContext&, - const DenseTensor&, - const std::vector&, - bool, - DenseTensor*); - -using multiply_kernel = void (*)(const DeviceContext&, - const DenseTensor&, - const DenseTensor&, - DenseTensor*); - -using reshape_kernel = void (*)(const DeviceContext&, - const DenseTensor&, - const ScalarArray&, - DenseTensor*); - -using scale_kernel = void (*)(const DeviceContext&, - const DenseTensor&, - const Scalar&, - float, - bool, - DenseTensor*); - -using sum_kernel = void (*)(const DeviceContext&, - const DenseTensor&, - const std::vector&, - DataType, - bool, - DenseTensor*); - -using subtract_kernel = void (*)(const DeviceContext&, - const DenseTensor&, - const DenseTensor&, - DenseTensor*); - -using conj_kernel = void (*)(const DeviceContext&, - const DenseTensor&, - DenseTensor*); - -/* -------------- Grad Kernel ----------------- */ -using matmul_grad_kernel = void (*)(const DeviceContext&, - const DenseTensor&, - const DenseTensor&, - const DenseTensor&, - bool, - bool, - DenseTensor*, - DenseTensor*); - -} // namespace pten diff --git a/paddle/pten/api/include/utils.h b/paddle/pten/api/include/manual_api.h similarity index 88% rename from paddle/pten/api/include/utils.h rename to paddle/pten/api/include/manual_api.h index b8b955090b9..3bd7e60154d 100644 --- a/paddle/pten/api/include/utils.h +++ b/paddle/pten/api/include/manual_api.h @@ -17,6 +17,11 @@ limitations under the License. 
*/ #include "paddle/pten/api/include/tensor.h" #include "paddle/pten/common/backend.h" +/** + * This file stores some special APIs that are implemented manually + * or difficult to automatically generated. + */ + namespace paddle { namespace experimental { diff --git a/paddle/fluid/operators/digamma_op.cu b/paddle/pten/api/include/sparse_api.h similarity index 51% rename from paddle/fluid/operators/digamma_op.cu rename to paddle/pten/api/include/sparse_api.h index 5f2f59ba520..22e511e62ab 100644 --- a/paddle/fluid/operators/digamma_op.cu +++ b/paddle/pten/api/include/sparse_api.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,15 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/digamma_op.h" +#pragma once -namespace ops = paddle::operators; +#include "paddle/pten/api/include/tensor.h" +#include "paddle/pten/common/backend.h" -REGISTER_OP_CUDA_KERNEL( - digamma, ops::DigammaKernel, - ops::DigammaKernel); +namespace paddle { +namespace experimental { +namespace sparse { -REGISTER_OP_CUDA_KERNEL( - digamma_grad, - ops::DigammaGradKernel, - ops::DigammaGradKernel); +PADDLE_API Tensor to_sparse_coo(const Tensor& x, + Backend backend, + const int64_t sparse_dim); + +} // namespace sparse +} // namespace experimental +} // namespace paddle diff --git a/paddle/pten/api/lib/CMakeLists.txt b/paddle/pten/api/lib/CMakeLists.txt index 3fe4baca773..0a55d52a265 100644 --- a/paddle/pten/api/lib/CMakeLists.txt +++ b/paddle/pten/api/lib/CMakeLists.txt @@ -3,11 +3,11 @@ add_subdirectory(utils) cc_library(ext_compat_utils SRCS ext_compat_utils.cc DEPS place) if (WITH_GPU) - nv_library(pten_tensor SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils pten_enforce utils_api pten_function_api) + nv_library(pten_tensor SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils pten_enforce manual_api pten_function_api) elseif (WITH_ROCM) - hip_library(pten_tensor SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils pten_enforce utils_api pten_function_api) + hip_library(pten_tensor SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils pten_enforce manual_api pten_function_api) else() - cc_library(pten_tensor SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils pten_enforce utils_api pten_function_api) + cc_library(pten_tensor SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils pten_enforce manual_api pten_function_api) endif() cc_library(kernel_dispatch SRCS kernel_dispatch.cc DEPS pten_tensor pten_context kernel_factory) @@ -15,6 +15,9 @@ cc_library(kernel_dispatch SRCS kernel_dispatch.cc DEPS pten_tensor pten_context cc_library(op_meta_info SRCS op_meta_info.cc DEPS pten_tensor) cc_library(op_kernel_info SRCS op_kernel_info.cc DEPS pten_tensor) + +set(api_gen_utils ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/gen_utils.py) + # forward api file set(api_gen_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/api_gen.py) set(api_yaml_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/api.yaml) @@ -46,7 +49,7 @@ add_custom_command( COMMAND ${CMAKE_COMMAND} -E 
copy_if_different ${api_header_file_tmp} ${api_header_file} COMMAND ${CMAKE_COMMAND} -E copy_if_different ${api_source_file_tmp} ${api_source_file} COMMENT "copy_if_different ${api_header_file} ${api_source_file}" - DEPENDS ${api_yaml_file} ${api_gen_file} + DEPENDS ${api_yaml_file} ${api_gen_file} ${api_gen_utils} VERBATIM) # generate backward api @@ -59,9 +62,11 @@ add_custom_command( COMMAND ${CMAKE_COMMAND} -E copy_if_different ${bw_api_header_file_tmp} ${bw_api_header_file} COMMAND ${CMAKE_COMMAND} -E copy_if_different ${bw_api_source_file_tmp} ${bw_api_source_file} COMMENT "copy_if_different ${bw_api_header_file} ${bw_api_source_file}" - DEPENDS ${bw_api_yaml_file} ${bw_api_gen_file} + DEPENDS ${bw_api_yaml_file} ${bw_api_gen_file} ${api_gen_utils} VERBATIM) -cc_library(utils_api SRCS utils.cc DEPS pten_tensor pten kernel_dispatch) -cc_library(pten_function_api SRCS ${api_source_file} DEPS pten_tensor pten kernel_dispatch) -cc_library(pten_bw_function_api SRCS ${bw_api_source_file} DEPS pten_tensor pten kernel_dispatch backward_infermeta pten_function_api) +cc_library(pten_data_transform SRCS data_transform.cc DEPS pten_tensor transfer_layout_kernel cast_kernel data_device_transform) +cc_library(manual_api SRCS manual_api.cc DEPS pten_tensor pten kernel_dispatch) +cc_library(sparse_api SRCS sparse_api.cc DEPS pten_tensor pten kernel_dispatch pten_data_transform) +cc_library(pten_function_api SRCS ${api_source_file} DEPS pten_tensor pten kernel_dispatch pten_data_transform) +cc_library(pten_bw_function_api SRCS ${bw_api_source_file} DEPS pten_tensor pten kernel_dispatch backward_infermeta pten_data_transform pten_function_api) diff --git a/paddle/pten/api/lib/api_declare.h b/paddle/pten/api/lib/api_declare.h index 0023170714f..998e01e41ea 100644 --- a/paddle/pten/api/lib/api_declare.h +++ b/paddle/pten/api/lib/api_declare.h @@ -19,3 +19,4 @@ limitations under the License. */ PT_DECLARE_API(Math); PT_DECLARE_API(Utils); +PT_DECLARE_API(SparseApi); diff --git a/paddle/pten/api/lib/api_utils.h b/paddle/pten/api/lib/api_utils.h index f3e7d74db1e..3d18cc611cf 100644 --- a/paddle/pten/api/lib/api_utils.h +++ b/paddle/pten/api/lib/api_utils.h @@ -16,8 +16,9 @@ limitations under the License. 
*/ #include "paddle/pten/api/include/tensor.h" #include "paddle/pten/api/lib/utils/storage.h" -#include "paddle/pten/core/convert_utils.h" +#include "paddle/pten/core/compat/convert_utils.h" #include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/core/meta_tensor.h" namespace paddle { namespace experimental { @@ -44,44 +45,38 @@ inline std::unique_ptr> TensorToDenseTensor( /* ----------------- for infer_meta --------------------- */ -inline const pten::DenseTensorMeta& GetDenseTensorMeta( - const pten::DenseTensor& tensor) { - return tensor.meta(); +inline pten::MetaTensor MakeMetaTensor(const pten::DenseTensor& tensor) { + return pten::MetaTensor(tensor); } -inline std::vector GetDenseTensorMeta( +inline std::vector MakeMetaTensor( const std::vector& tensors) { - std::vector metas; - metas.reserve(tensors.size()); + std::vector meta_tensors; + meta_tensors.reserve(tensors.size()); for (const auto& t : tensors) { - metas.push_back(t.meta()); + meta_tensors.emplace_back(t); } - return metas; + return meta_tensors; } /* ------------------ for output ----------------------- */ -inline pten::DenseTensor* SetKernelOutput(const pten::DenseTensorMeta& meta, - Backend backend, - Tensor* out) { +inline pten::DenseTensor* SetKernelOutput(Backend backend, Tensor* out) { auto dense_tensor = std::make_shared( pten::make_intrusive(pten::TransToFluidPlace(backend)), - meta); + pten::DenseTensorMeta()); out->set_impl(dense_tensor); return dense_tensor.get(); } inline std::vector SetKernelOutput( - const std::vector& metas, - Backend backend, - std::vector* out) { - size_t n = metas.size(); - out->reserve(n); - std::vector results(n); - for (size_t i = 0; i < n; ++i) { + size_t out_size, Backend backend, std::vector* out) { + out->reserve(out_size); + std::vector results(out_size); + for (size_t i = 0; i < out_size; ++i) { auto tensor_ptr = std::make_shared( pten::make_intrusive(pten::TransToFluidPlace(backend)), - metas[i]); + pten::DenseTensorMeta()); results[i] = tensor_ptr.get(); out->emplace_back(); out->back().set_impl(tensor_ptr); diff --git a/paddle/pten/api/lib/data_transform.cc b/paddle/pten/api/lib/data_transform.cc new file mode 100644 index 00000000000..bbef8247368 --- /dev/null +++ b/paddle/pten/api/lib/data_transform.cc @@ -0,0 +1,232 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/pten/api/lib/data_transform.h" + +#include "paddle/pten/api/ext/dispatch.h" +#include "paddle/pten/api/lib/kernel_dispatch.h" +#include "paddle/pten/backends/all_context.h" +#include "paddle/pten/kernels/cast_kernel.h" +#include "paddle/pten/kernels/transfer_layout_kernel.h" + +#include "paddle/fluid/framework/data_device_transform.h" + +namespace paddle { +namespace experimental { + +inline bool NeedTransformDataType(const DataType& input, + const DataType& target, + const TransformFlag& transform_flag) { + return input != target && + (transform_flag.need_trans_data_type() || + target == DataType::COMPLEX64 || target == DataType::COMPLEX128); +} + +inline bool NeedTransformPlace(const paddle::platform::Place& input, + const Backend& target, + const TransformFlag& transform_flag) { + bool ret = transform_flag.need_trans_backend() && + target != Backend::ALL_BACKEND && + !platform::is_same_place(input, pten::TransToFluidPlace(target)); + return ret; +} + +inline bool NeedTransformLayout(const DataLayout& input, + const DataLayout& target, + const TransformFlag& transform_flag) { + bool ret = transform_flag.need_trans_layout() && + (input != DataLayout::ALL_LAYOUT && + target != DataLayout::ALL_LAYOUT && input != target); + return ret; +} + +inline pten::DenseTensor TransDataLayout(const pten::DenseTensor& tensor, + DataLayout layout) { + auto& pool = paddle::platform::DeviceContextPool::Instance(); + VLOG(3) << "DataLayoutTransform src_layout: " << tensor.layout() + << " dst_layout: " << layout; + if (platform::is_cpu_place(tensor.place())) { + auto* dev_ctx = static_cast(pool.Get(tensor.place())); + return pten::TransferLayout(*dev_ctx, tensor, layout); + } else { + PADDLE_THROW(pten::errors::PreconditionNotMet( + "Unsupported data layout cast from CPU to GPU.")); + } +} + +template +pten::DenseTensor CastDateType(const Context& dev_ctx, + const pten::DenseTensor& tensor, + DataType dtype) { + switch (tensor.dtype()) { + case DataType::FLOAT32: + return pten::Cast(dev_ctx, tensor, dtype); + case DataType::FLOAT64: + return pten::Cast(dev_ctx, tensor, dtype); + case DataType::INT32: + return pten::Cast(dev_ctx, tensor, dtype); + case DataType::INT64: + return pten::Cast(dev_ctx, tensor, dtype); + case DataType::FLOAT16: + return pten::Cast(dev_ctx, tensor, dtype); + case DataType::BFLOAT16: + return pten::Cast(dev_ctx, tensor, dtype); + case DataType::BOOL: + return pten::Cast(dev_ctx, tensor, dtype); + case DataType::INT16: + return pten::Cast(dev_ctx, tensor, dtype); + case DataType::UINT8: + return pten::Cast(dev_ctx, tensor, dtype); + default: + PADDLE_THROW(pten::errors::Unimplemented( + "Data type (%s) is not supported when casting data type.", + tensor.dtype())); + } +} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +pten::DenseTensor CastDateType(const pten::GPUContext& dev_ctx, + const pten::DenseTensor& tensor, + DataType dtype) { + switch (tensor.dtype()) { + case DataType::FLOAT32: + return pten::Cast(dev_ctx, tensor, dtype); + case DataType::FLOAT64: + return pten::Cast(dev_ctx, tensor, dtype); + case DataType::INT32: + return pten::Cast(dev_ctx, tensor, dtype); + case DataType::INT64: + return pten::Cast(dev_ctx, tensor, dtype); + case DataType::FLOAT16: + return pten::Cast(dev_ctx, tensor, dtype); + case DataType::BOOL: + return pten::Cast(dev_ctx, tensor, dtype); + case DataType::INT16: + return pten::Cast(dev_ctx, tensor, dtype); + case DataType::UINT8: + return pten::Cast(dev_ctx, tensor, dtype); + default: + 
PADDLE_THROW(pten::errors::Unimplemented( + "Data type (%s) is not supported when casting data type.", + tensor.dtype())); + } +} +#endif + +inline pten::DenseTensor TransDataType(const pten::DenseTensor& tensor, + DataType dtype) { + auto& pool = paddle::platform::DeviceContextPool::Instance(); + + VLOG(3) << "DataTypeTransform src_dtype: " << tensor.dtype() + << " dst_dtype: " << dtype; + + pten::DenseTensor out( + pten::make_intrusive(tensor.place()), + {dtype, tensor.dims(), tensor.layout()}); + + if (platform::is_cpu_place(tensor.place())) { + auto* dev_ctx = static_cast(pool.Get(tensor.place())); + return CastDateType(*dev_ctx, tensor, dtype); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + } else if (platform::is_gpu_place(tensor.place())) { + auto* dev_ctx = static_cast(pool.Get(tensor.place())); + return CastDateType(*dev_ctx, tensor, dtype); +#endif + } else { + PADDLE_THROW(pten::errors::Unimplemented( + "Place type is not supported when casting data type.")); + } + return out; +} + +pten::DenseTensor TransformData(const pten::DenseTensor& tensor, + const pten::TensorArgDef& target_args_def, + const TransformFlag& transform_flag) { + pten::DenseTensor out = tensor; + if (NeedTransformLayout( + tensor.layout(), target_args_def.layout, transform_flag)) { + out = TransDataLayout(out, target_args_def.layout); + } + + if (NeedTransformDataType( + tensor.dtype(), target_args_def.dtype, transform_flag)) { + out = TransDataType(out, target_args_def.dtype); + } + + if (NeedTransformPlace( + out.place(), target_args_def.backend, transform_flag)) { + pten::DenseTensor result( + pten::make_intrusive( + pten::TransToFluidPlace(target_args_def.backend)), + {out.dtype(), out.dims(), out.layout()}); + framework::TransDataDevice( + out, pten::TransToFluidPlace(target_args_def.backend), &result); + out = result; + } + return out; +} + +std::shared_ptr PrepareData( + const Tensor& input, + const pten::TensorArgDef& target_args_def, + const TransformFlag& transform_flag) { + const auto& tensor_in = input.impl(); + if (!transform_flag.NeedTransform() || !tensor_in->initialized() || + (!NeedTransformPlace( + tensor_in->place(), target_args_def.backend, transform_flag) && + !NeedTransformDataType( + tensor_in->dtype(), target_args_def.dtype, transform_flag) && + !NeedTransformLayout( + tensor_in->layout(), target_args_def.layout, transform_flag))) { + return std::dynamic_pointer_cast(tensor_in); + } + + pten::DenseTensor out = + TransformData(*(static_cast(tensor_in.get())), + target_args_def, + transform_flag); + return std::make_shared(out); +} + +std::unique_ptr> PrepareData( + const std::vector& inputs, + const pten::TensorArgDef& target_args_def, + const TransformFlag& transform_flag) { + auto pt_tensors = std::make_unique>(); + pt_tensors->reserve(inputs.size()); + + for (const auto& input : inputs) { + const auto& tensor_in = input.impl(); + if (!transform_flag.NeedTransform() || !tensor_in->initialized() || + (!NeedTransformPlace( + tensor_in->place(), target_args_def.backend, transform_flag) && + !NeedTransformDataType( + tensor_in->dtype(), target_args_def.dtype, transform_flag) && + !NeedTransformLayout( + tensor_in->layout(), target_args_def.layout, transform_flag))) { + pt_tensors->emplace_back( + *std::dynamic_pointer_cast(tensor_in)); + } else { + pt_tensors->emplace_back( + TransformData(*(static_cast(tensor_in.get())), + target_args_def, + transform_flag)); + } + } + + return std::move(pt_tensors); +} + +} // namespace experimental +} // namespace paddle diff --git 
a/paddle/pten/api/lib/data_transform.h b/paddle/pten/api/lib/data_transform.h new file mode 100644 index 00000000000..59f83d6e4f3 --- /dev/null +++ b/paddle/pten/api/lib/data_transform.h @@ -0,0 +1,75 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/pten/api/include/tensor.h" +#include "paddle/pten/core/kernel_factory.h" + +namespace paddle { +namespace experimental { + +class TransformFlag { + public: + TransformFlag(bool stop_transform = false, + bool trans_dtype = false, + bool trans_backend = true, + bool trans_layout = true) + : stop_transform_(stop_transform), + trans_data_type_(trans_dtype), + trans_backend_(trans_backend), + trans_layout_(trans_layout) {} + + bool NeedTransform() const { + return !stop_transform_ && + (trans_data_type_ || trans_backend_ || trans_layout_); + } + + bool need_trans_data_type() const { + return !stop_transform_ && trans_data_type_; + } + + bool need_trans_backend() const { return !stop_transform_ && trans_backend_; } + + bool need_trans_layout() const { return !stop_transform_ && trans_layout_; } + + private: + // This is the highest priority in flags, + // and can be setted by api[data_transform->skip_transform] in the yaml file. + bool stop_transform_ = false; + + // trans_data_type_ can be setted by api[data_transform->support_trans_dtype] + // in the yaml file. + // trans_data_type_ only affect the non complex types, + // the complex is always transferd, except stop_transform_ is true. + bool trans_data_type_ = false; + + // trans_backend_ and trans_layout_ are true defalutly, + // and they can only be setted by global flag. + bool trans_backend_ = true; + bool trans_layout_ = true; +}; + +std::shared_ptr PrepareData( + const Tensor& input, + const pten::TensorArgDef& target_args_def, + const TransformFlag& transform_flag); + +std::unique_ptr> PrepareData( + const std::vector& inputs, + const pten::TensorArgDef& target_args_def, + const TransformFlag& transform_flag); + +} // namespace experimental +} // namespace paddle diff --git a/paddle/pten/api/lib/kernel_dispatch.cc b/paddle/pten/api/lib/kernel_dispatch.cc index 7930869632a..831d01b7cf5 100644 --- a/paddle/pten/api/lib/kernel_dispatch.cc +++ b/paddle/pten/api/lib/kernel_dispatch.cc @@ -14,7 +14,7 @@ limitations under the License. */ #include "paddle/pten/api/lib/kernel_dispatch.h" -#include "paddle/pten/core/convert_utils.h" +#include "paddle/pten/core/compat/convert_utils.h" namespace paddle { namespace experimental { diff --git a/paddle/pten/api/lib/utils.cc b/paddle/pten/api/lib/manual_api.cc similarity index 91% rename from paddle/pten/api/lib/utils.cc rename to paddle/pten/api/lib/manual_api.cc index aacbfb15ed7..5b697c3ff76 100644 --- a/paddle/pten/api/lib/utils.cc +++ b/paddle/pten/api/lib/manual_api.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
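[Editor's note] data_transform.h above introduces TransformFlag, whose accessors gate the backend, layout, and dtype transforms applied in PrepareData: stop_transform_ overrides everything, the dtype transform is off by default, and the backend and layout transforms are on by default. The small program below restates those decision rules so they can be exercised in isolation; it is a sketch, not the Paddle class itself.

#include <iostream>

class TransformFlag {
 public:
  TransformFlag(bool stop_transform = false, bool trans_dtype = false,
                bool trans_backend = true, bool trans_layout = true)
      : stop_transform_(stop_transform),
        trans_data_type_(trans_dtype),
        trans_backend_(trans_backend),
        trans_layout_(trans_layout) {}

  // Any transform at all? stop_transform_ short-circuits the whole pipeline.
  bool NeedTransform() const {
    return !stop_transform_ &&
           (trans_data_type_ || trans_backend_ || trans_layout_);
  }
  bool need_trans_data_type() const { return !stop_transform_ && trans_data_type_; }
  bool need_trans_backend() const { return !stop_transform_ && trans_backend_; }
  bool need_trans_layout() const { return !stop_transform_ && trans_layout_; }

 private:
  bool stop_transform_;
  bool trans_data_type_;
  bool trans_backend_;
  bool trans_layout_;
};

int main() {
  TransformFlag defaults;                        // backend/layout transforms enabled
  TransformFlag skip(/*stop_transform=*/true);   // everything disabled
  std::cout << defaults.NeedTransform() << " "   // 1
            << defaults.need_trans_data_type() << " "  // 0 (dtype off by default)
            << skip.NeedTransform() << "\n";     // 0
  return 0;
}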
*/ -#include "paddle/pten/api/include/utils.h" +#include "paddle/pten/api/include/manual_api.h" #include @@ -57,20 +57,19 @@ PADDLE_API Tensor copy_to(const Tensor& x, Backend backend, bool blocking) { kernel_context.EmplaceBackInput(dense_x.get()); kernel_context.EmplaceBackAttr(blocking); - // 4. InferMeta - auto out_meta = UnchangedInferMeta(dense_x->meta()); - - // 5. Prepare outputs + // 4. Prepare outputs & InferMeta auto dense_out = std::make_shared( pten::make_intrusive( pten::TransToFluidPlace(backend)), - std::move(out_meta)); + pten::DenseTensorMeta()); + pten::MetaTensor meta_out(dense_out.get()); + pten::UnchangedInferMeta(*dense_x, &meta_out); dense_out->mutable_data(pten::TransToFluidPlace(backend)); kernel_context.EmplaceBackOutput(dense_out.get()); Tensor out; out.set_impl(dense_out); - // 6. Call kernel + // 5. Call kernel kernel(&kernel_context); return out; diff --git a/paddle/pten/api/lib/op_meta_info.cc b/paddle/pten/api/lib/op_meta_info.cc index 82d465b4c21..649960e1e1c 100644 --- a/paddle/pten/api/lib/op_meta_info.cc +++ b/paddle/pten/api/lib/op_meta_info.cc @@ -19,10 +19,102 @@ limitations under the License. */ #include #include "paddle/fluid/framework/custom_operator.h" +#include "paddle/pten/core/dense_tensor.h" #include "paddle/pten/core/enforce.h" namespace paddle { +PADDLE_API void AssignTensorImpl(const Tensor& src, Tensor* dst) { + PADDLE_ENFORCE_EQ(src.is_dense_tensor() && dst->is_dense_tensor(), + true, + pten::errors::Unavailable( + "Now only supported DenseTensor in Custom Operator.")); + PADDLE_ENFORCE_EQ( + src.initialized(), + true, + pten::errors::Unavailable( + "The Custom OpKernel calculate output is not initialized.")); + PADDLE_ENFORCE_EQ(dst->defined(), + true, + pten::errors::Unavailable( + "The Custom OpKernel origin output is not defined.")); + auto& dense_src = static_cast(*src.impl()); + auto* dense_dst = static_cast(dst->impl().get()); + *dense_dst = dense_src; +} + +////////////////////// Kernel Context ////////////////////// + +void CustomOpKernelContext::EmplaceBackInput(Tensor&& input) { + size_t index = inputs_.size(); + inputs_.emplace_back(input); + input_range_.emplace_back(std::make_pair(index, index + 1)); +} + +void CustomOpKernelContext::EmplaceBackInputs(std::vector&& inputs) { + size_t index = inputs_.size(); + input_range_.emplace_back(std::make_pair(index, index + inputs.size())); + inputs_.insert(inputs_.end(), + std::make_move_iterator(inputs.begin()), + std::make_move_iterator(inputs.end())); +} + +void CustomOpKernelContext::EmplaceBackOutput(Tensor&& output) { + size_t index = outputs_.size(); + outputs_.emplace_back(output); + output_range_.emplace_back(std::make_pair(index, index + 1)); +} + +void CustomOpKernelContext::EmplaceBackOutputs(std::vector&& outputs) { + size_t index = outputs_.size(); + output_range_.emplace_back(std::make_pair(index, index + outputs.size())); + outputs_.insert(outputs_.end(), + std::make_move_iterator(outputs.begin()), + std::make_move_iterator(outputs.end())); +} + +void CustomOpKernelContext::EmplaceBackAttr(paddle::any attr) { + attrs_.emplace_back(std::move(attr)); +} + +const Tensor& CustomOpKernelContext::InputAt(size_t idx) const { + return inputs_.at(idx); +} + +std::vector CustomOpKernelContext::InputsBetween(size_t start, + size_t end) const { + std::vector rlt; + for (size_t i = start; i < end; ++i) { + rlt.emplace_back(inputs_.at(i)); + } + return rlt; +} + +Tensor* CustomOpKernelContext::MutableOutputAt(size_t idx) { + return &(outputs_.at(idx)); +} +std::vector 
CustomOpKernelContext::MutableOutputBetweeen(size_t start, + size_t end) { + std::vector rlt; + for (size_t i = start; i < end; ++i) { + rlt.emplace_back(&(outputs_.at(i))); + } + return rlt; +} + +std::vector* CustomOpKernelContext::AllMutableOutput() { + return &outputs_; +} + +const std::pair& CustomOpKernelContext::InputRangeAt( + size_t idx) const { + return input_range_.at(idx); +} +const std::pair& CustomOpKernelContext::OutputRangeAt( + size_t idx) const { + return output_range_.at(idx); +} + ////////////////////// Op Meta Info ////////////////////// OpMetaInfo& OpMetaInfo::Inputs(std::vector&& inputs) { diff --git a/paddle/pten/api/lib/sparse_api.cc b/paddle/pten/api/lib/sparse_api.cc new file mode 100644 index 00000000000..d763bb7e8d6 --- /dev/null +++ b/paddle/pten/api/lib/sparse_api.cc @@ -0,0 +1,102 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/pten/api/include/sparse_api.h" + +#include +#include "glog/logging.h" +#include "paddle/pten/api/lib/api_registry.h" +#include "paddle/pten/api/lib/kernel_dispatch.h" +#include "paddle/pten/api/lib/utils/storage.h" +#include "paddle/pten/core/kernel_registry.h" +#include "paddle/pten/infermeta/unary.h" + +PT_DECLARE_KERNEL(dense_to_sparse_coo, CPU, ALL_LAYOUT); + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PT_DECLARE_KERNEL(dense_to_sparse_coo, GPU, ALL_LAYOUT); +#endif + +namespace paddle { +namespace experimental { +namespace sparse { + +PADDLE_API Tensor to_sparse_coo(const Tensor& x, + Backend backend, + const int64_t sparse_dim) { + if (x.layout() == pten::DataLayout::SPARSE_COO) { + return x; + } + // 1. Get kernel signature and kernel + auto kernel_key_set = ParseKernelKeyByInputArgs(x); + kernel_key_set.backend_set = kernel_key_set.backend_set | BackendSet(backend); + auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey(); + std::string kernel_name = "dense_to_sparse_coo"; + if (x.layout() == pten::DataLayout::SPARSE_CSR) { + kernel_name = "sparse_csr_to_coo"; + } + + auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError( + kernel_name, kernel_key); + + VLOG(6) << "to API kernel key: " << kernel_key; + VLOG(6) << "to API kernel: " << kernel; + + // 2. Get Device Context + auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend()); + auto kernel_context = pten::KernelContext(dev_ctx); + + // 3. Auto data transform + if (x.layout() == pten::DataLayout::SPARSE_CSR) { + auto input = std::dynamic_pointer_cast(x.impl()); + kernel_context.EmplaceBackInput(input.get()); + } else { + auto input = std::dynamic_pointer_cast(x.impl()); + kernel_context.EmplaceBackInput(input.get()); + kernel_context.EmplaceBackAttr(sparse_dim); + } + + // 4. InferMeta + auto indices_meta = pten::DenseTensorMeta( + pten::DataType::INT64, {-1}, pten::DataLayout::NCHW); + auto elements_meta = pten::DenseTensorMeta(x.dtype(), {-1}, x.layout()); + + // 5. 
Prepare outputs + // create empty SparseCooTensor + pten::DenseTensor non_zero_indices( + pten::make_intrusive( + pten::TransToFluidPlace(backend)), + std::move(indices_meta)); + pten::DenseTensor non_zero_elements( + pten::make_intrusive( + pten::TransToFluidPlace(backend)), + std::move(elements_meta)); + auto coo = std::make_shared( + non_zero_indices, non_zero_elements, x.dims()); + + kernel_context.EmplaceBackOutput(coo.get()); + Tensor out; + out.set_impl(coo); + + // 6. Call kernel + kernel(&kernel_context); + + return out; +} + +} // namespace sparse +} // namespace experimental +} // namespace paddle + +PT_REGISTER_API(SparseApi); diff --git a/paddle/pten/api/lib/tensor.cc b/paddle/pten/api/lib/tensor.cc index 02fd918d799..ef84e2a8c81 100644 --- a/paddle/pten/api/lib/tensor.cc +++ b/paddle/pten/api/lib/tensor.cc @@ -19,15 +19,15 @@ limitations under the License. */ #include #include "glog/logging.h" -#include "paddle/pten/api/include/utils.h" +#include "paddle/pten/api/include/manual_api.h" #include "paddle/pten/api/lib/ext_compat_utils.h" #include "paddle/pten/api/lib/utils/allocator.h" #include "paddle/pten/api/lib/utils/storage.h" -#include "paddle/pten/core/compat_utils.h" -#include "paddle/pten/core/convert_utils.h" +#include "paddle/pten/core/compat/convert_utils.h" #include "paddle/pten/core/dense_tensor.h" #include "paddle/pten/core/tensor_base.h" #include "paddle/pten/core/tensor_meta.h" +#include "paddle/pten/core/tensor_utils.h" /** * [ Why still include the fluid headers? ] @@ -77,7 +77,7 @@ Tensor::Tensor(const PlaceType &place) std::move(pten::make_intrusive( ConvertExtPlaceToInnerPlace(place))), std::move(pten::DenseTensorMeta(pten::DataType::UNDEFINED, - framework::make_ddim({}), + pten::framework::make_ddim({}), pten::DataLayout::NCHW))))), place_{place} {} @@ -86,7 +86,7 @@ Tensor::Tensor(const PlaceType &place, const std::vector &shape) std::move(pten::make_intrusive( ConvertExtPlaceToInnerPlace(place))), std::move(pten::DenseTensorMeta(pten::DataType::UNDEFINED, - framework::make_ddim(shape), + pten::framework::make_ddim(shape), pten::DataLayout::NCHW))))), place_{place} {} @@ -113,7 +113,7 @@ void Tensor::reshape(const std::vector &shape) { "the tensor to remain constant."; if (is_dense_tensor()) { std::dynamic_pointer_cast(impl_)->set_meta( - pten::DenseTensorMeta(dtype(), framework::make_ddim(shape))); + pten::DenseTensorMeta(dtype(), pten::framework::make_ddim(shape))); } else { PADDLE_THROW(pten::errors::Unimplemented( "Only support reshape operation on DenseTensor now.")); @@ -270,7 +270,7 @@ Tensor::data(); Tensor Tensor::slice(int64_t begin_idx, int64_t end_idx) const { if (is_dense_tensor()) { return Tensor(std::make_shared( - std::move(pten::CompatibleDenseTensorUtils::Slice( + std::move(pten::DenseTensorUtils::Slice( *(std::dynamic_pointer_cast(impl_).get()), begin_idx, end_idx)))); diff --git a/paddle/pten/api/lib/utils/storage.h b/paddle/pten/api/lib/utils/storage.h index a02e5d46a65..4e311135eeb 100644 --- a/paddle/pten/api/lib/utils/storage.h +++ b/paddle/pten/api/lib/utils/storage.h @@ -22,7 +22,7 @@ namespace experimental { class ExternalStorage : public pten::Storage { public: - ExternalStorage(void* ptr, size_t size, const paddle::platform::Place& place); + ExternalStorage(void* ptr, size_t size, const pten::Place& place); ExternalStorage(const pten::intrusive_ptr& root, size_t delta, size_t size); @@ -52,7 +52,7 @@ class ExternalStorage : public pten::Storage { } size_t size() const noexcept override { return size_; } - const 
paddle::platform::Place& place() const override { + const pten::Place& place() const override { PADDLE_ENFORCE_NOT_NULL( data_, paddle::platform::errors::Unavailable( @@ -78,9 +78,7 @@ class SharedStorage : public pten::Storage { // In order to be compatible with the original Tensor design and execution // system, we need to allow the uninitialized SharedStorage to exist, // and it can be removed after the compatibility phase is over in the future - explicit SharedStorage(const paddle::platform::Place& place) { - place_ = place; - } + explicit SharedStorage(const pten::Place& place) { place_ = place; } void Realloc(size_t n) override { this->Clear(); @@ -106,14 +104,14 @@ class SharedStorage : public pten::Storage { std::shared_ptr&& move_data_shared() override { size_ = 0; - place_ = Place(); + place_ = pten::Place(); return std::move(data_); } size_t size() const noexcept override { return data_ ? data_->size() : size_; } - const paddle::platform::Place& place() const override { + const pten::Place& place() const override { return data_ ? data_->place() : place_; } bool OwnsMemory() const noexcept override { return false; } @@ -130,15 +128,13 @@ class SharedStorage : public pten::Storage { } // Temporary method: For compatible with fluid Tensor and improve performance - void ResetAllocationPlace(const paddle::platform::Place& place) { - place_ = place; - } + void ResetAllocationPlace(const pten::Place& place) { place_ = place; } // Temporary method: For compatible with fluid Tensor and improve performance void Reset() { this->Clear(); } private: - Place place_; + pten::Place place_; int64_t size_{0}; }; diff --git a/paddle/pten/api/lib/utils/tensor_utils.cc b/paddle/pten/api/lib/utils/tensor_utils.cc index d1441331755..8fdfc29540b 100644 --- a/paddle/pten/api/lib/utils/tensor_utils.cc +++ b/paddle/pten/api/lib/utils/tensor_utils.cc @@ -17,7 +17,7 @@ limitations under the License. 
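Both storage classes in this header now carry a pten::Place and may wrap memory they do not own (OwnsMemory() returns false for the shared variant). A stripped-down analogue of that non-owning pattern, with placeholder types rather than the Paddle classes:

#include <cstddef>

struct Place { int device_id = -1; };  // stand-in for pten::Place

// Records pointer, size and place, but never frees the memory it was given;
// the caller that produced the allocation keeps ownership.
class NonOwningStorage {
 public:
  NonOwningStorage(void* ptr, std::size_t size, Place place)
      : ptr_(ptr), size_(size), place_(place) {}
  void* data() const { return ptr_; }
  std::size_t size() const { return size_; }
  const Place& place() const { return place_; }
  bool OwnsMemory() const { return false; }

 private:
  void* ptr_;
  std::size_t size_;
  Place place_;
};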
*/ #include #include -#include "paddle/pten/core/compat_utils.h" +#include "paddle/pten/core/tensor_utils.h" namespace paddle { namespace experimental { @@ -198,72 +198,10 @@ pten::ScalarArray MakePtenScalarArrayFromVarList( return {vector_data}; } -void SharesStorageBase(pten::DenseTensor* src, paddle::framework::Tensor* dst) { - PADDLE_ENFORCE_NOT_NULL( - src, - platform::errors::InvalidArgument( - "The source DenseTensor is nullptr when move allocation.")); - PADDLE_ENFORCE_NOT_NULL( - dst, - platform::errors::InvalidArgument( - "The destination Tensor is nullptr when move allocation.")); - dst->Resize(src->dims()); - dst->ResetHolderWithType(src->Holder(), - pten::TransToProtoVarType(src->dtype())); - dst->set_offset(src->meta().offset); -} - -void SharesStorage(pten::DenseTensor* src, paddle::framework::Tensor* dst) { - SharesStorageBase(src, static_cast(dst)); - SetLoD(dst->mutable_lod(), src->lod()); -} - -static bool IsSameAllocation(const std::shared_ptr& a, - const std::shared_ptr& b) { - return a->ptr() == b->ptr() && a->size() == b->size() && - platform::is_same_place(a->place(), b->place()); -} - -void MakeVariableFromPtenTensor(pten::DenseTensor* src, - framework::Variable* variable) { - if (variable->IsType()) { - auto* tensor = variable->GetMutable(); - - auto dtype = pten::TransToProtoVarType(src->dtype()); - tensor->Resize(src->dims()); - SetLoD(tensor->mutable_lod(), src->lod()); - - if (!tensor->IsInitialized() || - (tensor->IsInitialized() && - !IsSameAllocation(tensor->Holder(), src->Holder()))) { - tensor->ResetHolderWithType(std::move(src->Holder()), dtype); - } else { - // Even the pten tensor and Variable have the same Alloctation (both have - // the same pointer address, same size and same place) - // but there is possible that they do not have the same data_type. - // so, here we set the variable's type with the pten tensor dtype. - tensor->set_type(dtype); - } - - } else if (variable->IsType()) { - auto* tensor = variable->GetMutable(); - auto dtype = pten::TransToProtoVarType(src->dtype()); - - if (!tensor->value().IsInitialized()) { - tensor->mutable_value()->ResetHolderWithType(std::move(src->Holder()), - dtype); - } - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported shared input `%s` type now when call pt kernel.", - framework::ToTypeName(variable->Type()))); - } -} - void ResetTensorByArgDef(pten::DenseTensor* dst, const pten::TensorArgDef& arg_def) { VLOG(5) << "ResetTensor by TensorArgDef."; - auto* meta = pten::CompatibleDenseTensorUtils::GetMutableMeta(dst); + auto* meta = pten::DenseTensorUtils::GetMutableMeta(dst); meta->dtype = arg_def.dtype; meta->layout = arg_def.layout; } diff --git a/paddle/pten/api/lib/utils/tensor_utils.h b/paddle/pten/api/lib/utils/tensor_utils.h index 32d65eded6e..1ffcc7d4d5b 100644 --- a/paddle/pten/api/lib/utils/tensor_utils.h +++ b/paddle/pten/api/lib/utils/tensor_utils.h @@ -23,7 +23,7 @@ limitations under the License. 
*/ #include "paddle/pten/api/lib/utils/storage.h" #include "paddle/pten/common/scalar.h" #include "paddle/pten/common/scalar_array.h" -#include "paddle/pten/core/convert_utils.h" +#include "paddle/pten/core/compat/convert_utils.h" #include "paddle/pten/core/dense_tensor.h" #include "paddle/pten/core/kernel_factory.h" @@ -45,11 +45,6 @@ pten::ScalarArray MakePtenScalarArrayFromVar( pten::ScalarArray MakePtenScalarArrayFromVarList( const std::vector& variable_list); -void SharesStorage(pten::DenseTensor* src, paddle::framework::Tensor* dst); - -void MakeVariableFromPtenTensor(pten::DenseTensor* src, - framework::Variable* variable); - void ResetTensorByArgDef(pten::DenseTensor* dst, const pten::TensorArgDef& arg_def); diff --git a/paddle/pten/backends/CMakeLists.txt b/paddle/pten/backends/CMakeLists.txt index e9f222d642e..cc935289203 100644 --- a/paddle/pten/backends/CMakeLists.txt +++ b/paddle/pten/backends/CMakeLists.txt @@ -2,6 +2,10 @@ add_subdirectory(dynload) add_subdirectory(cpu) +if(WITH_GPU OR WITH_ROCM) + add_subdirectory(gpu) +endif() + if(WITH_XPU) add_subdirectory(xpu) endif() @@ -11,3 +15,7 @@ cc_library(pten_context SRCS all_context.cc DEPS device_context cpu_context) if(WITH_XPU) add_dependencies(pten_context xpu_context) endif() + +if(WITH_GPU) + add_dependencies(pten_context gpu_context) +endif() diff --git a/paddle/pten/backends/cpu/cpu_context.cc b/paddle/pten/backends/cpu/cpu_context.cc index efce128596b..4029c286a5b 100644 --- a/paddle/pten/backends/cpu/cpu_context.cc +++ b/paddle/pten/backends/cpu/cpu_context.cc @@ -15,75 +15,59 @@ #include "paddle/pten/backends/cpu/cpu_context.h" #include "paddle/pten/api/ext/exception.h" +#include "paddle/pten/common/place.h" // NOTE: The paddle framework should add WITH_EIGEN option to support compile // without eigen. +#include "paddle/pten/core/device_context.h" #include "unsupported/Eigen/CXX11/Tensor" namespace pten { -struct CPUContext::CPUImpl { - CPUImpl() { device_ = new Eigen::DefaultDevice(); } +struct CPUContext::Impl { + Impl() : place_(CPUPlace()) {} - // Users need to manage external resources. 
- explicit CPUImpl(const CPUContextResource& ctx_res) : res_(ctx_res) { - device_ = res_.device; - } + explicit Impl(const Place& place) : place_(place) {} - ~CPUImpl() { - if (res_.device == nullptr && device_ != nullptr) { - delete device_; - device_ = nullptr; + ~Impl() { + if (owned_) { + delete eigen_device_; } } - Eigen::DefaultDevice* GetEigenDevice() const { - PD_CHECK(device_ != nullptr, "the eigen_device is nullptr."); - return device_; + void Init() { + owned_ = true; + eigen_device_ = new Eigen::DefaultDevice(); } - void SetEigenDevice(Eigen::DefaultDevice* device) { - if (device == nullptr) { - return; - } - res_.device = device; - device_ = device; + Eigen::DefaultDevice* GetEigenDevice() const { + PD_CHECK(eigen_device_ != nullptr, "the cpu eigen_device is nullptr."); + return eigen_device_; } - Place GetPlace() const { return place_; } - - Eigen::DefaultDevice* device_{nullptr}; - CPUContextResource res_; - CPUPlace place_; + bool owned_{false}; + Eigen::DefaultDevice* eigen_device_{nullptr}; + Place place_; }; -CPUContext::CPUContext() : DeviceContext() { - cpu_impl_ = std::make_unique(); -} - -CPUContext::CPUContext(const CPUContext& other) : DeviceContext() { - cpu_impl_ = std::make_unique(); - cpu_impl_->SetEigenDevice(other.eigen_device()); -} +CPUContext::CPUContext() + : DeviceContext(), impl_(std::make_unique()) {} -CPUContext::CPUContext(CPUContext&& other) : DeviceContext() { - cpu_impl_ = std::move(other.cpu_impl_); -} +CPUContext::CPUContext(const Place& place) + : DeviceContext(), impl_(std::make_unique(place)) {} CPUContext::~CPUContext() = default; -CPUContext::CPUContext(const CPUContextResource& ctx_res) : DeviceContext() { - cpu_impl_ = std::make_unique(ctx_res); -} +void CPUContext::Init() { impl_->Init(); } Eigen::DefaultDevice* CPUContext::eigen_device() const { - return cpu_impl_->GetEigenDevice(); + return impl_->GetEigenDevice(); } +const Place& CPUContext::GetPlace() const { return impl_->place_; } + void CPUContext::SetEigenDevice(Eigen::DefaultDevice* device) { - cpu_impl_->SetEigenDevice(device); + impl_->eigen_device_ = device; } -Place CPUContext::GetPlace() const { return cpu_impl_->GetPlace(); } - } // namespace pten diff --git a/paddle/pten/backends/cpu/cpu_context.h b/paddle/pten/backends/cpu/cpu_context.h index 059588dc712..dca87a786b9 100644 --- a/paddle/pten/backends/cpu/cpu_context.h +++ b/paddle/pten/backends/cpu/cpu_context.h @@ -24,37 +24,29 @@ limitations under the License. */ namespace pten { -struct CPUContextResource { - Eigen::DefaultDevice* device{nullptr}; -}; - class CPUContext : public DeviceContext { public: - // NOTE: DeviceContext hold resources. Used in training scenarios. CPUContext(); - - // NOTE: Share the same underlying resources, please ensure that resources are - // not released. - CPUContext(const CPUContext&); - - CPUContext(CPUContext&&); - - ~CPUContext(); - + explicit CPUContext(const Place&); + virtual ~CPUContext(); Eigen::DefaultDevice* eigen_device() const; - - // TODO(wilber): Whether the interface should be preserved. - Place GetPlace() const override; + const Place& GetPlace() const override; public: - // NOTE: External users manage resources. Used in inference scenarios. - explicit CPUContext(const CPUContextResource& ctx_res); + // NOTE: DeviceContext hold resources. Used in training scenarios. + // The interface used by the training scene, DeviceContext will initialize + // all resources and delete them when destructing. + void Init(); + protected: + // NOTE: External users manage resources. 
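The refactored CPUContext separates two lifecycles: the training path calls Init(), which creates and owns the Eigen device, while the inference path calls SetEigenDevice() with an externally managed pointer that the context must not delete. A minimal sketch of that ownership convention, with placeholder types and not the Paddle class itself:

#include <memory>

struct Device {};  // stand-in for Eigen::DefaultDevice

class Context {
 public:
  // Training path: the context allocates the device and frees it on destruction.
  void Init() {
    owned_ = true;
    device_ = new Device();
  }
  // Inference path: the caller keeps ownership; the context only borrows.
  void SetDevice(Device* external) {
    owned_ = false;
    device_ = external;
  }
  ~Context() {
    if (owned_) delete device_;
  }
  Device* device() const { return device_; }

 private:
  bool owned_{false};
  Device* device_{nullptr};
};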
Used in inference scenarios. + // The Set interface is for inference only, DeviceContext will mark the + // resource as external, and will not delete any resource when destructing. void SetEigenDevice(Eigen::DefaultDevice* device); private: - struct CPUImpl; - std::unique_ptr cpu_impl_; + struct Impl; + std::unique_ptr impl_; }; } // namespace pten diff --git a/paddle/pten/backends/gpu/CMakeLists.txt b/paddle/pten/backends/gpu/CMakeLists.txt new file mode 100644 index 00000000000..09591f79ae8 --- /dev/null +++ b/paddle/pten/backends/gpu/CMakeLists.txt @@ -0,0 +1,9 @@ +if(WITH_GPU) + add_subdirectory(cuda) + nv_library(pten_gpu_info SRCS gpu_info.cc DEPS pten_cuda_info gflags glog enforce pten_dynload_cuda) +elseif(WITH_ROCM) + add_subdirectory(rocm) + hip_library(pten_gpu_info SRCS gpu_info.cc DEPS pten_rocm_info gflags glog enforce pten_dynload_cuda) +endif() + +cc_library(gpu_context SRCS gpu_context.cc DEPS pten_device_context pten_gpu_info eigen3) diff --git a/paddle/pten/backends/gpu/cuda/CMakeLists.txt b/paddle/pten/backends/gpu/cuda/CMakeLists.txt new file mode 100644 index 00000000000..7eb1983a793 --- /dev/null +++ b/paddle/pten/backends/gpu/cuda/CMakeLists.txt @@ -0,0 +1 @@ +nv_library(pten_cuda_info SRCS cuda_info.cc DEPS gflags glog enforce pten_dynload_cuda) diff --git a/paddle/pten/backends/gpu/cuda/cuda_helper.h b/paddle/pten/backends/gpu/cuda/cuda_helper.h new file mode 100644 index 00000000000..baa1e43ce3c --- /dev/null +++ b/paddle/pten/backends/gpu/cuda/cuda_helper.h @@ -0,0 +1,72 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +namespace pten { +namespace backends { +namespace gpu { + +/* + * Summary: Grid stride looping macro in CUDA kernel + * + * [ Why need this macro? ] + * + * The original looping in CUDA kernel is: + * + * `for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + * i += blockDim.x * gridDim.x)` + * + * This for condition is risky. The value of `blockIdx.x * blockDim.x` + * may be large, such as over 1GB, the first iteration is no problem here, + * but when `i += blockDim.x * gridDim.x` is executed, the value of i + * will greater than INT_MAX and overflow becomes negative value, at + * this time, the cycle condition `i < (n)` is still satisfied, so it + * will cause illegal access to cuda memory. + * + * Here is a real example in ERINE, it will trigger above error. + * The related data are: + * - blockIdx.x = 2172938 + * - blockDim.x = 512 + * - blockIdx.x * blockDim.x = 1112543864 + * - INT_MAX = 2147483647 + * + * So we polish the for condition as follow, the int64_t __index__ will + * prevent overflow in the loop increment. 
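The comment in cuda_helper.h motivates the 64-bit loop index: on a large grid, blockIdx.x * blockDim.x can exceed INT_MAX, so a 32-bit index wraps negative while the i < n test still passes, producing out-of-bounds accesses. A minimal grid-stride kernel written directly with an int64_t index (illustrative, not using the CUDA_KERNEL_LOOP_TYPE macro defined in this header):

#include <cstdint>

__global__ void ScaleKernel(float* data, float factor, int64_t n) {
  // Use a 64-bit index: blockIdx.x * blockDim.x may not fit in 32 bits on huge grids.
  for (int64_t i = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
       i < n;
       i += static_cast<int64_t>(blockDim.x) * gridDim.x) {
    data[i] *= factor;
  }
}

// Launch sketch: ScaleKernel<<<num_blocks, 512>>>(d_data, 2.0f, n);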
+ * + * Parameters: + * - i: loop index + * - num: total element numbers + * + * Examples: + * template + * __global__ void Scale(T* logit_grad, const T* loss_grad, const int num, + * const int d, const int remain) { + * CUDA_KERNEL_LOOP(index, num) { + * int idx_n = index / d; + * int idx_remain = index % remain; + * logit_grad[index] *= loss_grad[idx_n * remain + idx_remain]; + * } + * } + * +*/ + +#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \ + int64_t __index__ = blockIdx.x * blockDim.x + threadIdx.x; \ + for (index_type i = __index__; __index__ < (num); \ + __index__ += blockDim.x * gridDim.x, i = __index__) + +} // namespace gpu +} // namespace backends +} // namespace pten diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_info.cc b/paddle/pten/backends/gpu/cuda/cuda_info.cc similarity index 71% rename from paddle/fluid/platform/device/gpu/cuda/cuda_info.cc rename to paddle/pten/backends/gpu/cuda/cuda_info.cc index 6109ed65543..55766facac8 100644 --- a/paddle/fluid/platform/device/gpu/cuda/cuda_info.cc +++ b/paddle/pten/backends/gpu/cuda/cuda_info.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,20 +12,19 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/pten/backends/gpu/gpu_info.h" + +// TODO(pten): remove fluid headers. #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/lock_guard_ptr.h" -#include "paddle/fluid/platform/macros.h" -#include "paddle/fluid/platform/monitor.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/fluid/string/split.h" static std::once_flag g_device_props_size_init_flag; static std::vector> g_device_props_init_flags; -static std::vector g_device_props; +static std::vector g_device_props; + +namespace pten { +namespace backends { +namespace gpu { -namespace paddle { -namespace platform { int DnnVersion() { if (!dynload::HasCUDNN()) return -1; return dynload::cudnnGetVersion(); @@ -75,11 +74,13 @@ int GetGPUDeviceCount() { } int GetGPUComputeCapability(int id) { - PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), - platform::errors::InvalidArgument( + PADDLE_ENFORCE_LT(id, + GetGPUDeviceCount(), + paddle::platform::errors::InvalidArgument( "Device id must be less than GPU count, " "but received id is: %d. GPU count is: %d.", - id, GetGPUDeviceCount())); + id, + GetGPUDeviceCount())); int major, minor; auto major_error_code = cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, id); @@ -92,22 +93,26 @@ int GetGPUComputeCapability(int id) { } int GetGPURuntimeVersion(int id) { - PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), - platform::errors::InvalidArgument( + PADDLE_ENFORCE_LT(id, + GetGPUDeviceCount(), + paddle::platform::errors::InvalidArgument( "Device id must be less than GPU count, " "but received id is: %d. 
GPU count is: %d.", - id, GetGPUDeviceCount())); + id, + GetGPUDeviceCount())); int runtime_version = 0; PADDLE_ENFORCE_GPU_SUCCESS(cudaRuntimeGetVersion(&runtime_version)); return runtime_version; } int GetGPUDriverVersion(int id) { - PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), - platform::errors::InvalidArgument( + PADDLE_ENFORCE_LT(id, + GetGPUDeviceCount(), + paddle::platform::errors::InvalidArgument( "Device id must be less than GPU count, " "but received id is: %d. GPU count is: %d.", - id, GetGPUDeviceCount())); + id, + GetGPUDeviceCount())); int driver_version = 0; PADDLE_ENFORCE_GPU_SUCCESS(cudaDriverGetVersion(&driver_version)); return driver_version; @@ -120,11 +125,13 @@ bool TensorCoreAvailable() { } int GetGPUMultiProcessors(int id) { - PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), - platform::errors::InvalidArgument( + PADDLE_ENFORCE_LT(id, + GetGPUDeviceCount(), + paddle::platform::errors::InvalidArgument( "Device id must be less than GPU count, " "but received id is: %d. GPU count is: %d.", - id, GetGPUDeviceCount())); + id, + GetGPUDeviceCount())); int count; PADDLE_ENFORCE_GPU_SUCCESS( cudaDeviceGetAttribute(&count, cudaDevAttrMultiProcessorCount, id)); @@ -132,11 +139,13 @@ int GetGPUMultiProcessors(int id) { } int GetGPUMaxThreadsPerMultiProcessor(int id) { - PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), - platform::errors::InvalidArgument( + PADDLE_ENFORCE_LT(id, + GetGPUDeviceCount(), + paddle::platform::errors::InvalidArgument( "Device id must be less than GPU count, " "but received id is: %d. GPU count is: %d.", - id, GetGPUDeviceCount())); + id, + GetGPUDeviceCount())); int count; PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceGetAttribute( &count, cudaDevAttrMaxThreadsPerMultiProcessor, id)); @@ -145,11 +154,13 @@ int GetGPUMaxThreadsPerMultiProcessor(int id) { } int GetGPUMaxThreadsPerBlock(int id) { - PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), - platform::errors::InvalidArgument( + PADDLE_ENFORCE_LT(id, + GetGPUDeviceCount(), + paddle::platform::errors::InvalidArgument( "Device id must be less than GPU count, " "but received id is: %d. GPU count is: %d.", - id, GetGPUDeviceCount())); + id, + GetGPUDeviceCount())); int count; PADDLE_ENFORCE_GPU_SUCCESS( cudaDeviceGetAttribute(&count, cudaDevAttrMaxThreadsPerBlock, id)); @@ -162,32 +173,34 @@ int GetCurrentDeviceId() { return device_id; } -dim3 GetGpuMaxGridDimSize(int id) { - PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), - platform::errors::InvalidArgument( +std::array GetGpuMaxGridDimSize(int id) { + PADDLE_ENFORCE_LT(id, + GetGPUDeviceCount(), + paddle::platform::errors::InvalidArgument( "Device id must be less than GPU count, " "but received id is: %d. 
GPU count is: %d.", - id, GetGPUDeviceCount())); - dim3 ret; + id, + GetGPUDeviceCount())); + std::array ret; int size; auto error_code_x = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimX, id); PADDLE_ENFORCE_GPU_SUCCESS(error_code_x); - ret.x = size; + ret[0] = size; auto error_code_y = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimY, id); PADDLE_ENFORCE_GPU_SUCCESS(error_code_y); - ret.y = size; + ret[1] = size; auto error_code_z = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimZ, id); PADDLE_ENFORCE_GPU_SUCCESS(error_code_z); - ret.z = size; + ret[2] = size; return ret; } const gpuDeviceProp &GetDeviceProperties(int id) { std::call_once(g_device_props_size_init_flag, [&] { int gpu_num = 0; - gpu_num = platform::GetGPUDeviceCount(); + gpu_num = GetGPUDeviceCount(); g_device_props_init_flags.resize(gpu_num); g_device_props.resize(gpu_num); for (int i = 0; i < gpu_num; ++i) { @@ -196,16 +209,17 @@ const gpuDeviceProp &GetDeviceProperties(int id) { }); if (id == -1) { - id = platform::GetCurrentDeviceId(); + id = GetCurrentDeviceId(); } if (id < 0 || id >= static_cast(g_device_props.size())) { - PADDLE_THROW(platform::errors::OutOfRange( + PADDLE_THROW(paddle::platform::errors::OutOfRange( "The device id %d is out of range [0, %d), where %d is the number of " "devices on this machine. Because the device id should be greater than " "or equal to zero and smaller than the number of gpus. Please input " "appropriate device again!", - id, static_cast(g_device_props.size()), + id, + static_cast(g_device_props.size()), static_cast(g_device_props.size()))); } @@ -219,32 +233,43 @@ const gpuDeviceProp &GetDeviceProperties(int id) { void SetDeviceId(int id) { // TODO(qijun): find a better way to cache the cuda device count - PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), - platform::errors::InvalidArgument( + PADDLE_ENFORCE_LT(id, + GetGPUDeviceCount(), + paddle::platform::errors::InvalidArgument( "Device id must be less than GPU count, " "but received id is: %d. 
GPU count is: %d.", - id, GetGPUDeviceCount())); + id, + GetGPUDeviceCount())); PADDLE_RETRY_CUDA_SUCCESS(cudaSetDevice(id)); } -void GpuMemcpyAsync(void *dst, const void *src, size_t count, - gpuMemcpyKind kind, gpuStream_t stream) { +void GpuMemcpyAsync(void *dst, + const void *src, + size_t count, + gpuMemcpyKind kind, + gpuStream_t stream) { PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(dst, src, count, kind, stream)); } -void GpuMemcpySync(void *dst, const void *src, size_t count, +void GpuMemcpySync(void *dst, + const void *src, + size_t count, gpuMemcpyKind kind) { PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpy(dst, src, count, kind)); } -void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src, - int src_device, size_t count, gpuStream_t stream) { +void GpuMemcpyPeerAsync(void *dst, + int dst_device, + const void *src, + int src_device, + size_t count, + gpuStream_t stream) { PADDLE_ENFORCE_GPU_SUCCESS( cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream)); } -void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src, - int src_device, size_t count) { +void GpuMemcpyPeerSync( + void *dst, int dst_device, const void *src, int src_device, size_t count) { PADDLE_ENFORCE_GPU_SUCCESS( cudaMemcpyPeer(dst, dst_device, src, src_device, count)); } @@ -264,5 +289,7 @@ void GpuDestroyStream(gpuStream_t stream) { void GpuDeviceSync() { PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); } gpuError_t GpuGetLastError() { return cudaGetLastError(); } -} // namespace platform -} // namespace paddle + +} // namespace gpu +} // namespace backends +} // namespace pten diff --git a/paddle/pten/backends/gpu/forwards.h b/paddle/pten/backends/gpu/forwards.h new file mode 100644 index 00000000000..d0787159e1e --- /dev/null +++ b/paddle/pten/backends/gpu/forwards.h @@ -0,0 +1,111 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +// Forward-declares CUDA API types used in platform-agnostic wrapper headers. +#pragma once + +/// Forward declaration of Eigen types. +namespace Eigen { +struct GpuDevice; +} // namespace Eigen + +/// Forward declaration of CUDA types. + +// Forward declaration of CUDA runtime types. +using cudaStream_t = struct CUstream_st *; +using cudaEvent_t = struct CUevent_st *; + +// Forward declaration of cuDNN types. 
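GetGpuMaxGridDimSize now returns a std::array of the three grid-dimension limits instead of a dim3, keeping the wrapper free of CUDA vector types. A standalone sketch of the same query against the CUDA runtime, with error handling reduced to a simple check:

#include <array>
#include <cstdio>
#include <cuda_runtime.h>

std::array<int, 3> MaxGridDimSize(int device_id) {
  std::array<int, 3> dims{};
  const cudaDeviceAttr attrs[3] = {cudaDevAttrMaxGridDimX,
                                   cudaDevAttrMaxGridDimY,
                                   cudaDevAttrMaxGridDimZ};
  for (int i = 0; i < 3; ++i) {
    const cudaError_t err = cudaDeviceGetAttribute(&dims[i], attrs[i], device_id);
    if (err != cudaSuccess) {
      std::fprintf(stderr, "cudaDeviceGetAttribute failed: %s\n",
                   cudaGetErrorString(err));
      break;
    }
  }
  return dims;
}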
+using cudnnHandle_t = struct cudnnContext *; +using cudnnTensorDescriptor_t = struct cudnnTensorStruct *; +using cudnnConvolutionDescriptor_t = struct cudnnConvolutionStruct *; +using cudnnPoolingDescriptor_t = struct cudnnPoolingStruct *; +using cudnnFilterDescriptor_t = struct cudnnFilterStruct *; +using cudnnLRNDescriptor_t = struct cudnnLRNStruct *; +using cudnnActivationDescriptor_t = struct cudnnActivationStruct *; +using cudnnSpatialTransformerDescriptor_t = + struct cudnnSpatialTransformerStruct *; +using cudnnOpTensorDescriptor_t = struct cudnnOpTensorStruct *; +using cudnnReduceTensorDescriptor_t = struct cudnnReduceTensorStruct *; +using cudnnCTCLossDescriptor_t = struct cudnnCTCLossStruct *; +using cudnnTensorTransformDescriptor_t = struct cudnnTensorTransformStruct *; +using cudnnDropoutDescriptor_t = struct cudnnDropoutStruct *; +using cudnnRNNDescriptor_t = struct cudnnRNNStruct *; +using cudnnPersistentRNNPlan_t = struct cudnnPersistentRNNPlan *; +using cudnnRNNDataDescriptor_t = struct cudnnRNNDataStruct *; +using cudnnAlgorithmDescriptor_t = struct cudnnAlgorithmStruct *; +using cudnnAlgorithmPerformance_t = struct cudnnAlgorithmPerformanceStruct *; +using cudnnSeqDataDescriptor_t = struct cudnnSeqDataStruct *; +using cudnnAttnDescriptor_t = struct cudnnAttnStruct *; +using cudnnFusedOpsConstParamPack_t = struct cudnnFusedOpsConstParamStruct *; +using cudnnFusedOpsVariantParamPack_t = + struct cudnnFusedOpsVariantParamStruct *; +using cudnnFusedOpsPlan_t = struct cudnnFusedOpsPlanStruct *; + +// Forward declaration of cuBLAS types. +using cublasHandle_t = struct cublasContext *; + +// Forward declaration of cuSOLVER types. +using cusolverDnHandle_t = struct cusolverDnContext *; + +// Forward declaration of cuSparse types. +using cusparseHandle_t = struct cusparseContext *; + +// Forward declaration of cuFFT types. +using cufftHandle = int; + +// Forward declaration of NCCL types. +using ncclComm_t = struct ncclComm *; + +/// Forward declaration of ROCM types. +#include + +using hipDevice_t = int; +using hipCtx_t = struct ihipCtx_t *; +using hipModule_t = struct ihipModule_t *; +using hipStream_t = struct ihipStream_t *; +using hipEvent_t = struct ihipEvent_t *; +using hipFunction_t = struct ihipModuleSymbol_t *; + +// Forward declaration of MIOpen types. +using miopenHandle_t = struct miopenHandle *; +using miopenAcceleratorQueue_t = hipStream_t; +using miopenFusionOpDescriptor_t = struct miopenFusionOpDescriptor *; +using miopenTensorDescriptor_t = struct miopenTensorDescriptor *; +using miopenConvolutionDescriptor_t = struct miopenConvolutionDescriptor *; +using miopenPoolingDescriptor_t = struct miopenPoolingDescriptor *; +using miopenLRNDescriptor_t = struct miopenLRNDescriptor *; +using miopenActivationDescriptor_t = struct miopenActivationDescriptor *; +using miopenRNNDescriptor_t = struct miopenRNNDescriptor *; +using miopenCTCLossDescriptor_t = struct miopenCTCLossDescriptor *; +using miopenDropoutDescriptor_t = struct miopenDropoutDescriptor *; +using miopenFusionPlanDescriptor_t = struct miopenFusionPlanDescriptor *; +using miopenOperatorDescriptor_t = struct miopenOperatorDescriptor *; +using miopenOperatorArgs_t = struct miopenOperatorArgs *; +using miopenAllocatorFunction = void *(*)(void *context, size_t sizeBytes); +// using miopenDeallocatorFunction = void *(*)(void *context, void *memory); +// struct miopenConvAlgoPerf_t; +// struct miopenConvSolution_t; + +// Forward declaration of rocBLAS types. 
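forwards.h relies on each vendor handle being a pointer to an incomplete struct, so platform-agnostic headers can name the handles without including cudnn.h, cublas_v2.h, or the ROCm equivalents. The same trick in miniature, using a made-up library name rather than a real CUDA type:

// wrapper.h -- illustrative only; "fooContext" is a fictional opaque type.
using fooHandle_t = struct fooContext*;  // incomplete type, no heavy header needed

class FooGuard {
 public:
  explicit FooGuard(fooHandle_t h) : handle_(h) {}
  fooHandle_t get() const { return handle_; }

 private:
  fooHandle_t handle_;  // stored and passed around without knowing its layout
};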
+using rocblas_handle = struct _rocblas_handle *; + +// Forward declaration of hipfft types. +using hipfftHandle = struct hipfftHandle_t *; + +// Forward declaration of rocSOLVER types. +using rocsolver_handle = rocblas_handle; + +// Forward declaration of rocSparse types. +using rocsparse_handle = struct _rocsparse_handle *; diff --git a/paddle/pten/backends/gpu/gpu_context.cc b/paddle/pten/backends/gpu/gpu_context.cc new file mode 100644 index 00000000000..1e707c46cc9 --- /dev/null +++ b/paddle/pten/backends/gpu/gpu_context.cc @@ -0,0 +1,899 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include "paddle/pten/backends/gpu/gpu_context.h" +#include +#include +#include +#include +#include + +#include "paddle/pten/api/ext/exception.h" + +#include "paddle/pten/backends/gpu/gpu_decls.h" +#include "paddle/pten/backends/gpu/gpu_info.h" +#include "paddle/pten/common/float16.h" +#include "paddle/pten/common/place.h" +#include "paddle/pten/core/allocator.h" + +#ifdef PADDLE_WITH_CUDA +#include "paddle/pten/backends/dynload/cublas.h" +#include "paddle/pten/backends/dynload/cudnn.h" +#include "paddle/pten/backends/dynload/cusolver.h" +#include "paddle/pten/backends/dynload/cusparse.h" +#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) +#include "paddle/pten/backends/dynload/nccl.h" +#endif // !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) +#endif // PADDLE_WITH_CUDA + +#ifdef PADDLE_WITH_HIP +#include "paddle/pten/backends/dynload/miopen.h" +#include "paddle/pten/backends/dynload/rocblas.h" +#if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL) +#include "paddle/pten/backends/dynload/rccl.h" +#endif // !defined(__APPLE__) && defined(PADDLE_WITH_RCCL) +#endif // PADDLE_WITH_HIP + +// NOTE: The paddle framework should add WITH_EIGEN option to support compile +// without eigen. +#include "unsupported/Eigen/CXX11/Tensor" + +// TODO(pten): remove fluid header. 
+#include "paddle/fluid/platform/enforce.h" + +namespace pten { + +namespace internal { + +class EigenGpuStreamDevice : public Eigen::StreamInterface { + public: + EigenGpuStreamDevice() : scratch_(nullptr), semaphore_(nullptr) { + Eigen::initializeDeviceProp(); + } + ~EigenGpuStreamDevice() override {} + + void Reinitialize(gpuStream_t cuda_stream, + Allocator* allocator, + GPUPlace place) { + stream_ = cuda_stream; + place_ = place; + allocator_ = allocator; + device_prop_ = &Eigen::m_deviceProperties[place.device]; + } + + const gpuStream_t& stream() const override { return stream_; } + + const gpuDeviceProp& deviceProperties() const override { + return *device_prop_; + } + + void* allocate(size_t num_bytes) const override { + if (UNLIKELY(num_bytes == 0)) { + return nullptr; + } + auto buf = allocator_->Allocate(num_bytes); + VLOG(4) << "Eigen allocated at " << buf->ptr() << " requested " + << num_bytes; + void* retv = buf->ptr(); + { + std::lock_guard lock(mtx_); + allocations_.emplace(retv, std::move(buf)); + } + return retv; + } + + void deallocate(void* buffer) const override { + if (LIKELY(buffer)) { + std::lock_guard lock(mtx_); + allocations_.erase(buffer); + } + } + + void* scratchpad() const override { + if (scratch_ == NULL) { + scratch_ = allocate(Eigen::kGpuScratchSize + sizeof(unsigned int)); + } + return scratch_; + } + + unsigned int* semaphore() const override { + if (semaphore_ == NULL) { + char* scratch = static_cast(scratchpad()) + Eigen::kGpuScratchSize; + semaphore_ = reinterpret_cast(scratch); +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + hipMemsetAsync(semaphore_, 0, sizeof(unsigned int), stream_)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), stream_)); +#endif + } + return semaphore_; + } + + private: + GPUPlace place_; + gpuStream_t stream_; // not owned; + Allocator* allocator_; // not owned; + const gpuDeviceProp* device_prop_; // not owned; + mutable void* scratch_; + mutable unsigned int* semaphore_; + mutable std::mutex mtx_; // to protect allocations_ + mutable std::unordered_map allocations_; +}; + +#ifdef PADDLE_WITH_HIP +static void StreamCallbackFunc(gpuStream_t stream, + gpuError_t status, + void* user_data) +#endif +#ifdef PADDLE_WITH_CUDA +#if CUDA_VERSION >= 10000 + static void CUDART_CB StreamCallbackFunc(void* user_data) +#else + static void CUDART_CB + StreamCallbackFunc(cudaStream_t stream, cudaError_t status, void* user_data) +#endif +#endif +{ + std::unique_ptr> func( + reinterpret_cast*>(user_data)); + (*func)(); +} + +} // namespace internal + +class DnnWorkspaceHandle { + public: + explicit inline DnnWorkspaceHandle(Allocator* allocator) + : allocator_(allocator) {} + + inline void RunFunc(const std::function& cudnn_func, + size_t required_workspace_bytes) { + if (required_workspace_bytes > WorkspaceSize()) { + ReallocWorkspace(required_workspace_bytes); + } + VLOG(2) << "Cudnn workspace size at RunFunc: " + << static_cast(WorkspaceSize()) / (1 << 20) << " MB"; + { + std::lock_guard guard(mtx_); + cudnn_func(allocation_ ? allocation_->ptr() : nullptr); + } + } + + /*! \brief Thread which call RunFuncSync() would release gpu memory after + * running the function. 
Currently this function is only used when cudnn + * exhaustive searching and callers have to guarantee that the input function + * is host blocking */ + inline void RunFuncSync(const std::function& cudnn_func, + size_t required_workspace_bytes) { + RunFunc(cudnn_func, required_workspace_bytes); + ResetWorkspace(); + } + + inline size_t WorkspaceSize() { + if (allocation_ == nullptr) { + return 0; + } + return allocation_->size(); + } + + void ResetWorkspace() { allocation_ = nullptr; } + + void ReallocWorkspace(size_t required_workspace_bytes) { + if (required_workspace_bytes <= WorkspaceSize()) return; + // reset allocation first before re-allocate to save memory + allocation_.reset(); + allocation_ = allocator_->Allocate(required_workspace_bytes); + } + + private: + Allocator::AllocationPtr allocation_{nullptr}; + Allocator* allocator_{nullptr}; + std::mutex mtx_; +}; + +struct GPUContext::Impl { + void Init() { + owned_ = true; + backends::gpu::GPUDeviceGuard guard(place_.device); + InitGpuProperties(); + InitStream(); + InitEigenDevice(); + InitBlasHandle(); + InitDNNHandle(); + InitSolverHandle(); + InitSparseHandle(); + InitDnnWorkspace(); + } + + void PartialInitWithoutAllocator() { + owned_ = true; + backends::gpu::GPUDeviceGuard guard(place_.device); + InitGpuProperties(); + InitStream(); + InitBlasHandle(); + InitDNNHandle(); + InitSolverHandle(); + InitSparseHandle(); + } + + void PartialInitWithAllocator() { + owned_ = true; + backends::gpu::GPUDeviceGuard guard(place_.device); + InitEigenDevice(); + InitDnnWorkspace(); + } + + Impl() : place_(GPUPlace()) {} + + explicit Impl(const GPUPlace& place) : place_(place) {} + + ~Impl() { + backends::gpu::GPUDeviceGuard guard(place_.device); + DestoryInternalWorkspace(); + DestoryInternalEigenDevice(); + DestroyInternalSparseHandle(); + DestroyInternalSolverHandle(); + DestroyInternalDnnHandle(); +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + if (nccl_comm_) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::ncclCommDestroy(nccl_comm_)); + } +#endif + DestroyInternalBlasHandle(); + DestoryInternalStream(); + } + + const Place& GetPlace() const { return place_; } + + bool IsTensorCoreAvailable() const { + return blas_tensor_core_handle_ != nullptr; + } + + void InitGpuProperties() { + backends::gpu::GPUDeviceGuard guard(place_.GetDeviceId()); + compute_capability_ = + backends::gpu::GetGPUComputeCapability(place_.GetDeviceId()); + multi_process_ = backends::gpu::GetGPUMultiProcessors(place_.GetDeviceId()); + max_threads_per_mp_ = + backends::gpu::GetGPUMaxThreadsPerMultiProcessor(place_.GetDeviceId()); + max_grid_dim_size_ = + backends::gpu::GetGpuMaxGridDimSize(place_.GetDeviceId()); + max_threads_per_block_ = + backends::gpu::GetGPUMaxThreadsPerBlock(place_.GetDeviceId()); + driver_version_ = backends::gpu::GetGPUDriverVersion(place_.GetDeviceId()); + runtime_version_ = + backends::gpu::GetGPURuntimeVersion(place_.GetDeviceId()); + + // TODO(wilber): glog may be replaced in the future? + LOG_FIRST_N(WARNING, 1) + << "Please NOTE: device: " << static_cast(place_.device) + << ", GPU Compute Capability: " << compute_capability_ / 10 << "." + << compute_capability_ % 10 + << ", Driver API Version: " << driver_version_ / 1000 << "." + << (driver_version_ % 100) / 10 + << ", Runtime API Version: " << runtime_version_ / 1000 << "." 
+ << (runtime_version_ % 100) / 10; +#ifdef PADDLE_WITH_HIP + size_t miopen_major, miopen_minor, miopen_patch; + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::miopenGetVersion(&miopen_major, &miopen_minor, &miopen_patch)); + auto cudnn_dso_ver = + (miopen_major * 1000 + miopen_minor * 10 + miopen_patch) / 10; + auto compile_miopen_version = MIOPEN_VERSION / 10; + if (cudnn_dso_ver < static_cast(compile_miopen_version)) { + LOG_FIRST_N(WARNING, 1) + << "WARNING: device: " << static_cast(place_.device) + << ". The installed Paddle is compiled with MIOPEN " + << compile_miopen_version / 100 << "." << compile_miopen_version % 100 + << ", but MIOPEN version in your machine is " << cudnn_dso_ver / 100 + << "." << cudnn_dso_ver % 100 + << ", which may cause serious incompatible bug. " + << "Please recompile or reinstall Paddle with compatible MIOPEN " + "version."; + } +#else + size_t cudnn_dso_ver = dynload::cudnnGetVersion(); + LOG_FIRST_N(WARNING, 1) << "device: " << static_cast(place_.device) + << ", cuDNN Version: " << cudnn_dso_ver / 1000 + << "." << (cudnn_dso_ver % 1000) / 100 << "."; + + // Check CUDA/CUDNN version compatiblity + auto local_cuda_version = + (driver_version_ / 1000) * 10 + (driver_version_ % 100) / 10; + auto compile_cuda_version = + (CUDA_VERSION / 1000) * 10 + (CUDA_VERSION % 100) / 10; + if (local_cuda_version < compile_cuda_version) { + LOG_FIRST_N(WARNING, 1) + << "WARNING: device: " << static_cast(place_.device) + << ". The installed Paddle is compiled with CUDA " + << compile_cuda_version / 10 << "." << compile_cuda_version % 10 + << ", but CUDA runtime version in your machine is " + << local_cuda_version / 10 << "." << local_cuda_version % 10 + << ", which may cause serious incompatible bug. " + << "Please recompile or reinstall Paddle with compatible CUDA " + "version."; + } +#endif + } + + void InitDnnWorkspace() { + PD_CHECK(allocator_ != nullptr, + "the device allocator for gpu context is nullptr."); + workspace_ = new DnnWorkspaceHandle(allocator_); + } + + void DestoryInternalWorkspace() { + if (owned_ && workspace_ != nullptr) { + delete workspace_; + stream_ = nullptr; + } + } + + DnnWorkspaceHandle* GetDnnWorkspace() { + PD_CHECK(workspace_ != nullptr, "the gpu cudnn workspace is nullptr."); + return workspace_; + } + + void InitStream() { +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + hipStreamCreateWithPriority(&stream_, hipStreamDefault, 0)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + cudaStreamCreateWithPriority(&stream_, cudaStreamDefault, 0)); +#endif + } + + void DestoryInternalStream() { + if (owned_ && stream_ != nullptr) { +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(stream_)); +#else + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(stream_)); +#endif + } + stream_ = nullptr; + } + + void SetStream(gpuStream_t stream) { stream_ = stream; } + + gpuStream_t GetStream() const { + PD_CHECK(stream_ != nullptr, "the gpu stream is nullptr."); + return stream_; + } + + void InitEigenDevice() { + PD_CHECK(allocator_ != nullptr, + "the allocator for eigen device is nullptr."); + eigen_stream_.reset(new internal::EigenGpuStreamDevice()); + eigen_stream_->Reinitialize(stream_, allocator_, place_); + eigen_device_ = new Eigen::GpuDevice(eigen_stream_.get()); + } + + void DestoryInternalEigenDevice() { + if (owned_ && eigen_device_ != nullptr) { + delete eigen_device_; + eigen_device_ = nullptr; + } + } + + void SetEigenDevice(Eigen::GpuDevice* device) { eigen_device_ = device; } + + Eigen::GpuDevice* eigen_device() const { + 
PD_CHECK(eigen_device_ != nullptr, "the gpu eigen_device is nullptr."); + return eigen_device_; + } + + void InitBlasHandle() { +#ifdef PADDLE_WITH_HIP + pten::dynload::rocblas_create_handle(&blas_handle_); + pten::dynload::rocblas_set_stream(blas_handle_, stream_); +#else // PADDLE_WITH_CUDA + PADDLE_RETRY_CUDA_SUCCESS(pten::dynload::cublasCreate(&blas_handle_)); + PADDLE_RETRY_CUDA_SUCCESS( + pten::dynload::cublasSetStream(blas_handle_, stream_)); +#if CUDA_VERSION >= 9000 + PADDLE_RETRY_CUDA_SUCCESS( + pten::dynload::cublasCreate(&blas_tensor_core_handle_)); + PADDLE_RETRY_CUDA_SUCCESS( + pten::dynload::cublasSetStream(blas_tensor_core_handle_, stream_)); + PADDLE_RETRY_CUDA_SUCCESS(pten::dynload::cublasSetMathMode( + blas_tensor_core_handle_, CUBLAS_TENSOR_OP_MATH)); +#if CUDA_VERSION >= 11000 + PADDLE_RETRY_CUDA_SUCCESS( + pten::dynload::cublasCreate(&blas_tf32_tensor_core_handle_)); + PADDLE_RETRY_CUDA_SUCCESS( + pten::dynload::cublasSetStream(blas_tf32_tensor_core_handle_, stream_)); + PADDLE_RETRY_CUDA_SUCCESS(pten::dynload::cublasSetMathMode( + blas_tf32_tensor_core_handle_, CUBLAS_TF32_TENSOR_OP_MATH)); +#endif // CUDA_VERSION >= 11000 +#endif // CUDA_VERSION >= 9000 +#endif // PADDLE_WITH_HIP + } + + void DestroyInternalBlasHandle() { +#ifdef PADDLE_WITH_HIP + if (owned_ && blas_handle_ != nullptr) { + pten::dynload::rocblas_destroy_handle(blas_handle_); + blas_handle_ = nullptr; + } +#else + if (owned_ && blas_handle_ != nullptr) { + pten::dynload::cublasDestroy(blas_handle_); + blas_handle_ = nullptr; + } + if (owned_ && blas_tensor_core_handle_ != nullptr) { + pten::dynload::cublasDestroy(blas_tensor_core_handle_); + blas_tensor_core_handle_ = nullptr; + } + if (owned_ && blas_tf32_tensor_core_handle_ != nullptr) { + pten::dynload::cublasDestroy(blas_tf32_tensor_core_handle_); + blas_tf32_tensor_core_handle_ = nullptr; + } +#endif // PADDLE_WITH_HIP + } + + blasHandle_t GetBlasHandle() const { + PD_CHECK(blas_handle_ != nullptr, "the gpu blas handle is nullptr."); + return blas_handle_; + } + + void SetBlasHandle(blasHandle_t blas) { blas_handle_ = blas; } + + void InitDNNHandle() { + if (pten::dynload::HasCUDNN()) { +#ifdef PADDLE_WITH_HIP + size_t miopen_major, miopen_minor, miopen_patch; + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenGetVersion( + &miopen_major, &miopen_minor, &miopen_patch)); + auto local_miopen_version = + (miopen_major * 1000 + miopen_minor * 10 + miopen_patch) / 10; + auto compile_miopen_version = MIOPEN_VERSION / 10; + if (local_miopen_version < static_cast(compile_miopen_version)) { + LOG_FIRST_N(WARNING, 1) + << "WARNING: device: " << place_.device + << ". The installed Paddle is compiled with MIOPEN " + << compile_miopen_version / 100 << "." + << compile_miopen_version % 100 + << ", but MIOPEN version in your machine is " + << local_miopen_version / 100 << "." << local_miopen_version % 100 + << ", which may cause serious incompatible bug. " + << "Please recompile or reinstall Paddle with compatible MIOPEN " + "version."; + } + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenCreate(&dnn_handle_)); + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::miopenSetStream(dnn_handle_, stream_)); +#else + auto local_cudnn_version = pten::dynload::cudnnGetVersion() / 100; + auto compile_cudnn_version = CUDNN_VERSION / 100; + if (local_cudnn_version < static_cast(compile_cudnn_version)) { + LOG_FIRST_N(WARNING, 1) + << "WARNING: device: " << place_.device + << ". The installed Paddle is compiled with CUDNN " + << compile_cudnn_version / 10 << "." 
<< compile_cudnn_version % 10 + << ", but CUDNN version in your machine is " + << local_cudnn_version / 10 << "." << local_cudnn_version % 10 + << ", which may cause serious incompatible bug. " + << "Please recompile or reinstall Paddle with compatible CUDNN " + "version."; + } + PADDLE_RETRY_CUDA_SUCCESS(pten::dynload::cudnnCreate(&dnn_handle_)); + PADDLE_RETRY_CUDA_SUCCESS( + pten::dynload::cudnnSetStream(dnn_handle_, stream_)); +#endif + } else { + dnn_handle_ = nullptr; + } + } + + dnnHandle_t GetDnnHandle() { + PD_CHECK(dnn_handle_ != nullptr, "the gpu dnn handle is nullptr."); + return dnn_handle_; + } + + void DestroyInternalDnnHandle() { +#ifdef PADDLE_WITH_HIP + if (owned_ && dnn_handle_ != nullptr) { + PADDLE_ENFORCE_GPU_SUCCESS(pten::dynload::miopenDestroy(dnn_handle_)); + dnn_handle_ = nullptr; + } +#else + if (owned_ && dnn_handle_ != nullptr) { + PADDLE_ENFORCE_GPU_SUCCESS(pten::dynload::cudnnDestroy(dnn_handle_)); + dnn_handle_ = nullptr; + } +#endif // PADDLE_WITH_HIP + } + + void SetDnnHandle(dnnHandle_t handle) { dnn_handle_ = handle; } + + void InitSolverHandle() { +#ifndef PADDLE_WITH_HIP + PADDLE_RETRY_CUDA_SUCCESS(pten::dynload::cusolverDnCreate(&solver_handle_)); + PADDLE_RETRY_CUDA_SUCCESS( + pten::dynload::cusolverDnSetStream(solver_handle_, stream_)); +#endif + } + + void DestroyInternalSolverHandle() { +#ifndef PADDLE_WITH_HIP + if (owned_ && solver_handle_ != nullptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + pten::dynload::cusolverDnDestroy(solver_handle_)); + solver_handle_ = nullptr; + } +#endif + } + + solverHandle_t GetSolverHandle() const { + PD_CHECK(solver_handle_ != nullptr, "the gpu solver handle is nullptr."); + return solver_handle_; + } + + void SetSolverHandle(solverHandle_t handle) { solver_handle_ = handle; } + + void InitSparseHandle() { +// ROCM is not yet supported +#if defined(PADDLE_WITH_CUDA) +// The generic APIs is supported from CUDA10.1 +#if CUDA_VERSION >= 10010 + PADDLE_RETRY_CUDA_SUCCESS(dynload::cusparseCreate(&sparse_handle_)); + PADDLE_RETRY_CUDA_SUCCESS( + dynload::cusparseSetStream(sparse_handle_, stream_)); +#endif +#endif + } + + void DestroyInternalSparseHandle() { +#ifdef PADDLE_WITH_CUDA +#if CUDA_VERSION >= 10010 + if (owned_ && sparse_handle_ != nullptr) { + PADDLE_RETRY_CUDA_SUCCESS(dynload::cusparseDestroy(sparse_handle_)); + sparse_handle_ = nullptr; + } +#endif +#endif + } + + sparseHandle_t GetSparseHandle() const { + PD_CHECK(sparse_handle_ != nullptr, "the gpu sparse handle is nullptr."); + return sparse_handle_; + } + + void SetSparseHandle(sparseHandle_t handle) { sparse_handle_ = handle; } + + void Wait() const { +#ifdef PADDLE_WITH_HIP + hipError_t e_sync = hipSuccess; +#if !defined(_WIN32) + e_sync = hipStreamSynchronize(stream_); +#else + while (e_sync = hipStreamQuery(stream_)) { + if (e_sync == hipErrorNotReady) continue; + break; + } +#endif // !defined(_WIN32) +#else // PADDLE_WITH_HIP + cudaError_t e_sync = cudaSuccess; +#if !defined(_WIN32) + e_sync = cudaStreamSynchronize(stream_); +#else + while (e_sync = cudaStreamQuery(stream_)) { + if (e_sync == cudaErrorNotReady) continue; + break; + } +#endif // !defined(_WIN32) +#endif // PADDLE_WITH_HIP + + PADDLE_ENFORCE_GPU_SUCCESS(e_sync); + } + + void WaitEvent(gpuEvent_t ev) const { +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(stream_, ev, 0)); +#else + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(stream_, ev, 0)); +#endif + } + + ncclComm_t GetNcclComm() const { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + // 
PD_CHECK(nccl_comm_ != nullptr, "the gpu nccl_comm is nullptr."); + return nccl_comm_; +#endif + return nullptr; + } + + void SetNcclComm(ncclComm_t comm) { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + nccl_comm_ = comm; +#endif + } + + inline void CublasCall( + const std::function& callback) const { + if (blas_tf32_tensor_core_handle_ != nullptr) { + std::lock_guard guard(blas_tf32_mtx_); + callback(blas_tf32_tensor_core_handle_); + } else { + std::lock_guard guard(blas_mtx_); + callback(blas_handle_); + } + } + + inline void TensorCoreCublasCallIfAvailable( + const std::function& callback) const { + if (blas_tensor_core_handle_ != nullptr) { + std::lock_guard guard(blas_tensor_core_mtx_); + callback(blas_tensor_core_handle_); + } else { + std::lock_guard guard(blas_mtx_); + callback(blas_handle_); + } + } + + inline void CusparseCall( + const std::function& callback) const { + std::lock_guard guard(sparse_mtx_); + callback(sparse_handle_); + } + + void RecordEvent(gpuEvent_t ev, const std::function& callback) const { + callback(); + RecordEvent(ev); + } + + void RecordEvent(gpuEvent_t ev) const { +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(ev, stream_)); +#else + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(ev, stream_)); +#endif + } + + void AddStreamCallback(const std::function& callback) const { + // TODO(wilber): Do we need ThreadPool? + auto* func = new std::function([this, callback] { + std::lock_guard lock(stream_call_back_mtx_); + last_future_ = std::async(std::launch::deferred, [&]() { callback(); }); + }); + +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + hipStreamAddCallback(stream_, internal::StreamCallbackFunc, func, 0)); +#endif +#ifdef PADDLE_WITH_CUDA +#if CUDA_VERSION >= 10000 + PADDLE_ENFORCE_GPU_SUCCESS( + cudaLaunchHostFunc(stream_, internal::StreamCallbackFunc, func)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + cudaStreamAddCallback(stream_, internal::StreamCallbackFunc, func, 0)); +#endif +#endif + } + + void WaitStreamCallback() const { +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_CUDA) + pten::backends::gpu::GpuStreamSync(stream_); +#endif + { + std::lock_guard lock(stream_call_back_mtx_); + if (last_future_.valid()) { + last_future_.wait(); + } + } + } + + bool owned_{false}; + Place place_; + int compute_capability_; + int runtime_version_; + int driver_version_; + int multi_process_; + int max_threads_per_mp_; + int max_threads_per_block_; + std::array max_grid_dim_size_; + + gpuStream_t stream_{nullptr}; + Eigen::GpuDevice* eigen_device_{nullptr}; + blasHandle_t blas_handle_{nullptr}; + blasHandle_t blas_tensor_core_handle_{nullptr}; + blasHandle_t blas_tf32_tensor_core_handle_{nullptr}; + dnnHandle_t dnn_handle_{nullptr}; + solverHandle_t solver_handle_{nullptr}; + sparseHandle_t sparse_handle_{nullptr}; + DnnWorkspaceHandle* workspace_{nullptr}; + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + // NCCL communicator (single process version) for NCCL collective operations. + // NCCL collective operations provides fast collectives over multiple GPUs + // both within and across nodes. + // But, this collectives is used for collectives over multiple GPUs within + // nodes. + + // NOTE: Distributed communicator, distributed framework manages its + // resources. 
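AddStreamCallback above queues a host-side std::function behind all work already submitted to the stream (cudaLaunchHostFunc on CUDA 10+, cudaStreamAddCallback on older CUDA, hipStreamAddCallback on ROCm), and WaitStreamCallback synchronizes the stream before waiting on the deferred future. A bare-bones CUDA-only sketch of the enqueue half, assuming a valid stream:

#include <cstdio>
#include <functional>
#include <cuda_runtime.h>

namespace {
// Trampoline: the CUDA runtime invokes this on the host once prior stream work
// has completed, then we run and free the captured callback.
void CUDART_CB HostTrampoline(void* user_data) {
  auto* fn = static_cast<std::function<void()>*>(user_data);
  (*fn)();
  delete fn;
}
}  // namespace

void AddHostCallback(cudaStream_t stream, std::function<void()> callback) {
  auto* fn = new std::function<void()>(std::move(callback));
  const cudaError_t err = cudaLaunchHostFunc(stream, HostTrampoline, fn);
  if (err != cudaSuccess) {
    std::fprintf(stderr, "cudaLaunchHostFunc failed: %s\n", cudaGetErrorString(err));
    delete fn;
  }
}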
+ ncclComm_t nccl_comm_{nullptr}; +#endif + + mutable std::mutex blas_mtx_; + mutable std::mutex blas_tensor_core_mtx_; + mutable std::mutex blas_tf32_mtx_; + mutable std::mutex sparse_mtx_; + mutable std::mutex stream_call_back_mtx_; + mutable std::future last_future_; + + Allocator* allocator_{nullptr}; // external resource. + // A internal resouce to initinalize eigen_device. + std::unique_ptr eigen_stream_{nullptr}; +}; + +GPUContext::GPUContext() : DeviceContext(), impl_(std::make_unique()) {} + +GPUContext::GPUContext(const GPUPlace& place) + : DeviceContext(), impl_(std::make_unique(place)) {} + +GPUContext::~GPUContext() = default; + +const Place& GPUContext::GetPlace() const { return impl_->GetPlace(); } + +gpuStream_t GPUContext::stream() const { return impl_->GetStream(); } + +dnnHandle_t GPUContext::cudnn_handle() const { return impl_->GetDnnHandle(); } + +blasHandle_t GPUContext::cublas_handle() const { + return impl_->GetBlasHandle(); +} + +solverHandle_t GPUContext::cusolver_dn_handle() const { + return impl_->GetSolverHandle(); +} + +sparseHandle_t GPUContext::cusparse_handle() const { + return impl_->GetSparseHandle(); +} + +void GPUContext::Wait() const { impl_->Wait(); } + +void GPUContext::WaitEvent(gpuEvent_t ev) const { impl_->WaitEvent(ev); } + +bool GPUContext::tensor_core_available() const { + return impl_->IsTensorCoreAvailable(); +} + +int GPUContext::GetComputeCapability() const { + return impl_->compute_capability_; +} + +int GPUContext::GetMaxPhysicalThreadCount() const { + return impl_->multi_process_ * impl_->max_threads_per_mp_; +} + +int GPUContext::GetSMCount() const { return impl_->multi_process_; } + +int GPUContext::GetMaxThreadsPerBlock() const { + return impl_->max_threads_per_block_; +} + +std::array GPUContext::GetCUDAMaxGridDimSize() const { + return impl_->max_grid_dim_size_; +} + +Eigen::GpuDevice* GPUContext::eigen_device() const { + return impl_->eigen_device(); +} + +DnnWorkspaceHandle* GPUContext::cudnn_workspace_handle() { + return impl_->GetDnnWorkspace(); +} + +void GPUContext::CublasCall( + const std::function& callback) const { + impl_->CublasCall(callback); +} + +void GPUContext::TensorCoreCublasCallIfAvailable( + const std::function& callback) const { + impl_->TensorCoreCublasCallIfAvailable(callback); +} + +void GPUContext::CusparseCall( + const std::function& callback) const { + impl_->CusparseCall(callback); +} + +void GPUContext::RecordEvent(gpuEvent_t ev, + const std::function& callback) const { + impl_->RecordEvent(ev, callback); +} + +void GPUContext::RecordEvent(gpuEvent_t ev) const { impl_->RecordEvent(ev); } + +void GPUContext::AddStreamCallback( + const std::function& callback) const { + impl_->AddStreamCallback(callback); +} + +void GPUContext::WaitStreamCallback() const { impl_->WaitStreamCallback(); } + +ncclComm_t GPUContext::nccl_comm() const { return impl_->GetNcclComm(); } + +void GPUContext::set_nccl_comm(ncclComm_t comm) { impl_->SetNcclComm(comm); } + +void GPUContext::Init() { + impl_->allocator_ = const_cast(&this->GetAllocator()); + impl_->Init(); +} + +void GPUContext::SetStream(gpuStream_t stream) { impl_->SetStream(stream); } + +void GPUContext::SetEigenDevice(Eigen::GpuDevice* device) { + impl_->SetEigenDevice(device); +} + +void GPUContext::SetBlasHandle(blasHandle_t blas) { + impl_->SetBlasHandle(blas); +} + +void GPUContext::SetDnnHandle(dnnHandle_t handle) { + impl_->SetDnnHandle(handle); +} + +void GPUContext::SetSolverHandle(solverHandle_t handle) { + impl_->SetSolverHandle(handle); +} + +void 
GPUContext::SetSparseHandle(sparseHandle_t handle) { + impl_->SetSparseHandle(handle); +} + +void GPUContext::SetDnnWorkspaceHandle(DnnWorkspaceHandle* handle) { + impl_->workspace_ = handle; +} + +void GPUContext::PartialInitWithoutAllocator() { + impl_->PartialInitWithoutAllocator(); +} + +void GPUContext::PartialInitWithAllocator() { + impl_->allocator_ = const_cast(&this->GetAllocator()); + impl_->PartialInitWithAllocator(); +} + +void GPUContext::SetComputeCapability(int val) { + impl_->compute_capability_ = val; +} + +void GPUContext::SetMaxThreadsPerMultiProcessor(int val) { + impl_->max_threads_per_mp_ = val; +} + +void GPUContext::SetMultiProcessors(int val) { impl_->multi_process_ = val; } + +void GPUContext::SetMaxThreadsPerBlock(int val) { + impl_->max_threads_per_block_ = val; +} + +void GPUContext::SetMaxGridDimSize(const std::array& val) { + impl_->max_grid_dim_size_ = val; +} + +void GPUContext::SetDriverVersion(int val) { impl_->driver_version_ = val; } + +void GPUContext::SetRuntimeVersion(int val) { impl_->runtime_version_ = val; } + +} // namespace pten diff --git a/paddle/pten/backends/gpu/gpu_context.h b/paddle/pten/backends/gpu/gpu_context.h index 1adfd155ce3..2a2be0e44b4 100644 --- a/paddle/pten/backends/gpu/gpu_context.h +++ b/paddle/pten/backends/gpu/gpu_context.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,13 +14,162 @@ limitations under the License. */ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - -// See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/platform/device_context.h" +#include +#include +#include "paddle/pten/backends/gpu/forwards.h" +#include "paddle/pten/backends/gpu/gpu_decls.h" +#include "paddle/pten/backends/gpu/gpu_helper.h" +#include "paddle/pten/common/place.h" +#include "paddle/pten/core/device_context.h" namespace pten { -using GPUContext = paddle::platform::CUDADeviceContext; -} // namespace pten -#endif +class DnnWorkspaceHandle; + +class GPUContext : public DeviceContext { + public: + GPUContext(); + + explicit GPUContext(const GPUPlace& place); + + virtual ~GPUContext(); + + /*! \brief Return place in the device context. */ + const Place& GetPlace() const override; + + /*! \brief Return gpu stream in the device context. */ + gpuStream_t stream() const; + + /*! \brief Return cudnn handle in the device context. */ + dnnHandle_t cudnn_handle() const; + + /*! \brief Return cublas handle in the device context. */ + blasHandle_t cublas_handle() const; + + /*! \brief Return cusolver handle in the device context. */ + solverHandle_t cusolver_dn_handle() const; + + /*! \brief Return cusparse handle in the device context. */ + sparseHandle_t cusparse_handle() const; + + /*! \brief Wait for all operations completion in the stream. */ + void Wait() const override; + + /*! \brief Wait for event in the stream. */ + void WaitEvent(gpuEvent_t ev) const; + + /*! \brief Check whether tensor core is supported */ + bool tensor_core_available() const; + + /*! \brief Return compute capability in the device context. */ + int GetComputeCapability() const; + + /*! \brief Return the max physical thread count in the device context */ + int GetMaxPhysicalThreadCount() const; + + /*! \brief Return the SM count in the device context */ + int GetSMCount() const; + + /*! 
\brief Return the Max thread num of block in the device context */ + int GetMaxThreadsPerBlock() const; + + /*! \brief Return the max grid dim size in the device context */ + std::array GetCUDAMaxGridDimSize() const; + + /*! \brief Return eigen device in the device context. */ + Eigen::GpuDevice* eigen_device() const; + + /*! \brief Return a cudnn workspace handle to call multiple cudnn + * functions without interrupting by other threads. + * Once the first cudnn function is called by the handle, a lock + * would be acquired to prevent other threads from accessing the + * workspace. Once the handle is destructed, the lock would be released. + */ + DnnWorkspaceHandle* cudnn_workspace_handle(); + + public: + /*! \brief Call cublas function safely. */ + void CublasCall(const std::function&) const; + + /*! \brief Call cublas function with Tensor Core safely. If + Tensor Core is not available, use DEFAULT_MATH instead. */ + void TensorCoreCublasCallIfAvailable( + const std::function&) const; + + /*! \brief Call cusparse function safely. */ + void CusparseCall(const std::function&) const; + + void RecordEvent(gpuEvent_t ev, const std::function& callback) const; + + void RecordEvent(gpuEvent_t ev) const; + + void AddStreamCallback(const std::function& callback) const; + + void WaitStreamCallback() const; + + public: + /*! \brief Return nccl communicators. */ + ncclComm_t nccl_comm() const; + + /*! \brief Set nccl communicators. */ + void set_nccl_comm(ncclComm_t comm); + + public: + // NOTE: DeviceContext hold resources. Used in training scenarios. + // The interface used by the training scene, DeviceContext will initialize + // all resources and delete them when destructing. + // Note that you must set the Allocator before calling Init function. + void Init(); + + // TODO(wilber): Why does the GetAllocator interface require a stream + // parameter? + // The temporary trick method bypasses this problem, and the following + // interfaces + // need to be deleted later. + + // Note that this is a trick implementation, which can be used to partially + // initialize when the SetAllocator interface is not called. + void PartialInitWithoutAllocator(); + // Note that this is a trick implementation that can be used to initialize + // resources that require an Allocator when the SetAllocator interface is + // called. + void PartialInitWithAllocator(); + + protected: + // NOTE: External users manage resources. Used in inference scenarios. + // The Set interface is for inference only, DeviceContext will mark the + // resource as external, and will not delete any resource when destructing. + void SetStream(gpuStream_t); + + void SetEigenDevice(Eigen::GpuDevice*); + + void SetBlasHandle(blasHandle_t); + + void SetDnnHandle(dnnHandle_t); + + void SetSolverHandle(solverHandle_t); + + void SetSparseHandle(sparseHandle_t); + + void SetDnnWorkspaceHandle(DnnWorkspaceHandle*); + + void SetComputeCapability(int val); + + void SetMaxThreadsPerMultiProcessor(int val); + + void SetMultiProcessors(int val); + + void SetMaxThreadsPerBlock(int val); + + void SetMaxGridDimSize(const std::array& val); + + void SetDriverVersion(int val); + + void SetRuntimeVersion(int val); + + private: + struct Impl; + std::unique_ptr impl_; +}; + +} // namespace pten diff --git a/paddle/pten/backends/gpu/gpu_decls.h b/paddle/pten/backends/gpu/gpu_decls.h new file mode 100644 index 00000000000..747244c4407 --- /dev/null +++ b/paddle/pten/backends/gpu/gpu_decls.h @@ -0,0 +1,70 @@ +// Copyright (c) 2022 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/pten/backends/gpu/forwards.h" + +namespace pten { + +#ifdef PADDLE_WITH_HIP +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ + using GPU_TYPE = ROCM_TYPE; + +#else // PADDLE_WITH_CDUA + +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ + using GPU_TYPE = CUDA_TYPE; +#endif + +DECLARE_TYPE_FOR_GPU(gpuStream_t, cudaStream_t, hipStream_t); +DECLARE_TYPE_FOR_GPU(gpuEvent_t, cudaEvent_t, hipEvent_t); + +DECLARE_TYPE_FOR_GPU(dnnActivationDescriptor, + cudnnActivationStruct, + miopenActivationDescriptor); +DECLARE_TYPE_FOR_GPU(dnnTensorDescriptor, + cudnnTensorStruct, + miopenTensorDescriptor); +DECLARE_TYPE_FOR_GPU(dnnFilterDescriptor, + cudnnFilterStruct, + miopenTensorDescriptor); +DECLARE_TYPE_FOR_GPU(dnnFilterDescriptor_t, + cudnnFilterDescriptor_t, + miopenTensorDescriptor_t); +DECLARE_TYPE_FOR_GPU(dnnConvolutionDescriptor, + cudnnConvolutionStruct, + miopenConvolutionDescriptor); +DECLARE_TYPE_FOR_GPU(dnnConvolutionDescriptor_t, + cudnnConvolutionDescriptor_t, + miopenConvolutionDescriptor_t); +DECLARE_TYPE_FOR_GPU(dnnPoolingDescriptor_t, + cudnnPoolingDescriptor_t, + miopenPoolingDescriptor_t); +DECLARE_TYPE_FOR_GPU(dnnDropoutDescriptor_t, + cudnnDropoutDescriptor_t, + miopenDropoutDescriptor_t); +DECLARE_TYPE_FOR_GPU(dnnHandle_t, cudnnHandle_t, miopenHandle_t); + +DECLARE_TYPE_FOR_GPU(blasHandle_t, cublasHandle_t, rocblas_handle); + +DECLARE_TYPE_FOR_GPU(solverHandle_t, cusolverDnHandle_t, rocsolver_handle); + +DECLARE_TYPE_FOR_GPU(sparseHandle_t, cusparseHandle_t, rocsparse_handle); + +#undef DECLARE_TYPE_FOR_GPU + +using CUDAGraphID = unsigned long long; // NOLINT + +} // namespace pten diff --git a/paddle/fluid/eager/accumulation/gradient_accumulation.h b/paddle/pten/backends/gpu/gpu_helper.h similarity index 69% rename from paddle/fluid/eager/accumulation/gradient_accumulation.h rename to paddle/pten/backends/gpu/gpu_helper.h index 725410dac72..e9254115ed5 100644 --- a/paddle/fluid/eager/accumulation/gradient_accumulation.h +++ b/paddle/pten/backends/gpu/gpu_helper.h @@ -13,11 +13,14 @@ // limitations under the License. 
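// A hedged sketch of the two GPUContext initialization paths declared in
// gpu_context.h above. The allocator variable and the ExternalGPUContext
// subclass are illustrative names, and SetAllocator is assumed to come from
// the DeviceContext base, as the Init() note implies:
//
//   // Training: the context owns every resource.
//   pten::GPUContext ctx(pten::GPUPlace(0));
//   ctx.SetAllocator(&allocator);   // must be set before Init()
//   ctx.Init();                     // creates and later destroys stream/handles
//
// Inference hands in externally owned resources instead; because the Set*
// methods are protected, an embedder reaches them through a thin subclass:
class ExternalGPUContext : public pten::GPUContext {
 public:
  using pten::GPUContext::GPUContext;
  void BindExternal(pten::gpuStream_t stream, pten::blasHandle_t blas) {
    SetStream(stream);      // marked external, not released on destruction
    SetBlasHandle(blas);
  }
};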
#pragma once -#include "paddle/fluid/eager/eager_tensor.h" -#include "paddle/pten/api/all.h" -namespace egr { -// Accumulation API -void TensorAdd(const egr::EagerTensor& src, egr::EagerTensor* dst); -void VariableAdd(const egr::EagerTensor& src, egr::EagerTensor* dst); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -} // namespace egr +#ifdef PADDLE_WITH_HIP +#include "paddle/pten/backends/gpu/rocm/rocm_helper.h" +#else +#include "paddle/pten/backends/gpu/cuda/cuda_helper.h" +#endif + +#define CUDA_KERNEL_LOOP(i, num) CUDA_KERNEL_LOOP_TYPE(i, num, int) + +#endif diff --git a/paddle/pten/backends/gpu/gpu_info.cc b/paddle/pten/backends/gpu/gpu_info.cc new file mode 100644 index 00000000000..d1b8d4a3626 --- /dev/null +++ b/paddle/pten/backends/gpu/gpu_info.cc @@ -0,0 +1,60 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/pten/backends/gpu/gpu_info.h" + +#include + +#include "gflags/gflags.h" + +DECLARE_string(selected_gpus); + +namespace pten { +namespace backends { +namespace gpu { + +static inline std::vector Split(std::string const& original, + char separator) { + std::vector results; + std::string token; + std::istringstream is(original); + while (std::getline(is, token, separator)) { + if (!token.empty()) { + results.push_back(token); + } + } + return results; +} + +//! Get a list of device ids from environment variable or use all. +std::vector GetSelectedDevices() { + // use user specified GPUs in single-node multi-process mode. + std::vector devices; + if (!FLAGS_selected_gpus.empty()) { + auto devices_str = Split(FLAGS_selected_gpus, ','); + for (auto id : devices_str) { + devices.push_back(atoi(id.c_str())); + } + } else { + int count = GetGPUDeviceCount(); + for (int i = 0; i < count; ++i) { + devices.push_back(i); + } + } + return devices; +} + +} // namespace gpu +} // namespace backends +} // namespace pten diff --git a/paddle/pten/backends/gpu/gpu_info.h b/paddle/pten/backends/gpu/gpu_info.h new file mode 100644 index 00000000000..59add6166d1 --- /dev/null +++ b/paddle/pten/backends/gpu/gpu_info.h @@ -0,0 +1,132 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + +#include +#include +#include +#include + +#include "paddle/pten/backends/gpu/gpu_types.h" + +namespace pten { +namespace backends { +namespace gpu { + +//! Get the version of dnn +int DnnVersion(); + +//! 
Get the total number of GPU devices in system. +int GetGPUDeviceCount(); + +//! Get the compute capability of the ith GPU (format: major * 10 + minor) +int GetGPUComputeCapability(int id); + +//! Get the runtime version of the ith GPU +int GetGPURuntimeVersion(int id); + +//! Get the driver version of the ith GPU +int GetGPUDriverVersion(int id); + +//! Wheter the current device support TensorCore +bool TensorCoreAvailable(); + +//! Get the MultiProcessors of the ith GPU. +int GetGPUMultiProcessors(int id); + +//! Get the MaxThreads of each MultiProcessor of the ith GPU. +int GetGPUMaxThreadsPerMultiProcessor(int id); + +//! Get the MaxThreads of each block of the ith GPU. +int GetGPUMaxThreadsPerBlock(int id); + +//! Get the current GPU device id in system. +int GetCurrentDeviceId(); + +//! Get the maximum GridDim size for GPU buddy allocator. +std::array GetGpuMaxGridDimSize(int); + +//! Get a list of device ids from environment variable or use all. +std::vector GetSelectedDevices(); + +//! Get the properties of the ith GPU device. +const gpuDeviceProp &GetDeviceProperties(int id); + +//! Set the GPU device id for next execution. +void SetDeviceId(int device_id); + +//! Copy memory from address src to dst asynchronously. +void GpuMemcpyAsync(void *dst, + const void *src, + size_t count, + gpuMemcpyKind kind, + gpuStream_t stream); + +//! Copy memory from address src to dst synchronously. +void GpuMemcpySync(void *dst, + const void *src, + size_t count, + gpuMemcpyKind kind); + +//! Copy memory from one device to another device asynchronously. +void GpuMemcpyPeerAsync(void *dst, + int dst_device, + const void *src, + int src_device, + size_t count, + gpuStream_t stream); + +//! Copy memory from one device to another device synchronously. +void GpuMemcpyPeerSync( + void *dst, int dst_device, const void *src, int src_device, size_t count); + +//! Set memory dst with value count size asynchronously +void GpuMemsetAsync(void *dst, int value, size_t count, gpuStream_t stream); + +//! Blocks until stream has completed all operations. +void GpuStreamSync(gpuStream_t stream); + +void GpuDestroyStream(gpuStream_t stream); + +// ! Blocks until device has completed all operations. +void GpuDeviceSync(); + +gpuError_t GpuGetLastError(); + +class GPUDeviceGuard { + public: + explicit inline GPUDeviceGuard(int dev_id) { + int prev_id = GetCurrentDeviceId(); + if (prev_id != dev_id) { + prev_id_ = prev_id; + SetDeviceId(dev_id); + } + } + inline ~GPUDeviceGuard() { + if (prev_id_ != -1) { + SetDeviceId(prev_id_); + } + } + GPUDeviceGuard(const GPUDeviceGuard &o) = delete; + GPUDeviceGuard &operator=(const GPUDeviceGuard &o) = delete; + + private: + int prev_id_{-1}; +}; + +} // namespace gpu +} // namespace backends +} // namespace pten + +#endif diff --git a/paddle/pten/backends/gpu/gpu_launch_config.h b/paddle/pten/backends/gpu/gpu_launch_config.h new file mode 100644 index 00000000000..c166b3d2f80 --- /dev/null +++ b/paddle/pten/backends/gpu/gpu_launch_config.h @@ -0,0 +1,181 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
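// A hedged sketch exercising the device-management helpers declared in
// gpu_info.h above: switch to a target device, issue an asynchronous
// host-to-device copy, and wait for it. CopyToDevice is an illustrative
// wrapper, and cudaMemcpyHostToDevice assumes a CUDA build (a ROCm build
// would pass hipMemcpyHostToDevice):
void CopyToDevice(int dev_id, void* dst, const void* src, size_t bytes,
                  pten::gpuStream_t stream) {
  pten::backends::gpu::GPUDeviceGuard guard(dev_id);  // RAII: restores the previous device
  pten::backends::gpu::GpuMemcpyAsync(dst, src, bytes, cudaMemcpyHostToDevice,
                                      stream);
  pten::backends::gpu::GpuStreamSync(stream);  // block until the copy completes
}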
+// See the License for the specific language governing permissions and +// limitations under the License. + +// Used for compute gpu launch parameter config + +#pragma once + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + +#ifdef PADDLE_WITH_CUDA +#include +#else +#include +#endif + +#include +#include +#include +#include +#include "paddle/pten/backends/gpu/gpu_context.h" + +#ifdef __HIPCC__ +// HIP results in error or nan if > 256 +#define PREDEFINED_BLOCK_SIZE 256 +#else +/* CUDA performs better as thread_per_block + num is between [64, 512] */ +#define PREDEFINED_BLOCK_SIZE 512 +#endif + +namespace pten { +namespace backends { +namespace gpu { + +inline int DivUp(int a, int b) { return (a + b - 1) / b; } + +/* https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 + for round integer value into next highest power of 2. */ +static inline int RoundToPowerOfTwo(int n) { + n--; + n |= (n >> 1); + n |= (n >> 2); + n |= (n >> 4); + n |= (n >> 8); + n |= (n >> 16); +#ifdef __HIPCC__ + return std::min(256, std::max(32, (n + 1))); +#else + return std::min(1024, std::max(32, (n + 1))); +#endif +} + +#ifdef WITH_NV_JETSON +// The number of threads cannot be assigned 1024 in some cases when the device +// is nano or tx2 . +inline void ChangeThreadNum(const pten::CUDAContext& context, + int* num_thread, + int alternative_num_thread = 512) { + if (context.GetComputeCapability() == 53 || + context.GetComputeCapability() == 62) { + *num_thread = alternative_num_thread; + } +} +#endif + +struct GpuLaunchConfig { + public: + GpuLaunchConfig() {} + + size_t GetThreadNum() const { return GetBlockSize() * GetGridSize(); } + + size_t GetGridSize() const { + return block_per_grid.x * block_per_grid.y * block_per_grid.z; + } + + size_t GetBlockSize() const { + return thread_per_block.x * thread_per_block.y * thread_per_block.z; + } + + int compute_capability = 0; + dim3 thread_per_block = dim3(1, 1, 1); + dim3 block_per_grid = dim3(1, 1, 1); +}; + +/* According to NVIDIA, if number of threads per block is 64/128/256/512, + * cuda performs better. And number of blocks should be greater (at least + * 2x~4x) than number of SMs. Hence, SM count is took into account within + * this function to determine the right number of threads per block. */ +inline GpuLaunchConfig GetGpuLaunchConfig1D(const pten::GPUContext& context, + int64_t numel, + int vec_size = 1) { + PADDLE_ENFORCE_GT(numel, + 0, + paddle::platform::errors::InvalidArgument( + "element quantity should be greater than 0," + " but received value is: %d.", + numel)); + // Get compute_capability + const int capability = context.GetComputeCapability(); + /* If thread number per block is 64/128/256/512, cuda performs better.*/ + int limit_threads = + std::min(PREDEFINED_BLOCK_SIZE, context.GetMaxThreadsPerBlock()); +#ifdef WITH_NV_JETSON + if (capability == 53 || capability == 62) { + limit_threads = 512; + } +#endif + int threads = limit_threads; + int sm_count = context.GetSMCount(); + int active_threads_num = numel / vec_size; + if (active_threads_num / (sm_count << 1) < limit_threads) { + // Round up threads number into an exponential multiple of 2, while number + // of acitve blocks is about twice of SM, to acquire better performance. + threads = RoundToPowerOfTwo(active_threads_num / (sm_count << 1)); + } else if (active_threads_num / (sm_count << 2) < limit_threads) { + // Round up threads number into an exponential multiple of 2, while number + // of acitve blocks is about 4 times of SM, to acquire better performance. 
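// (A worked trace with assumed numbers: numel = 100000, vec_size = 1,
//  sm_count = 80 and limit_threads = 512. Then active_threads_num /
//  (sm_count << 1) = 625, which is not < 512, so the branch above is
//  skipped; active_threads_num / (sm_count << 2) = 312 < 512, so this
//  branch rounds 312 up to the next power of two, giving threads = 512 on
//  a CUDA build (the HIP cap above would clamp it to 256) and, further
//  down, blocks = DivUp(100000, 512) = 196.)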
+ threads = RoundToPowerOfTwo(active_threads_num / (sm_count << 2)); + } + // Number of threads per block shall be larger than 64. + threads = std::max(64, threads); + int blocks = DivUp(DivUp(numel, vec_size), threads); + + GpuLaunchConfig config; + config.thread_per_block.x = threads; + config.block_per_grid.x = blocks; + config.compute_capability = capability; + return config; +} + +inline GpuLaunchConfig GetGpuLaunchConfig2D(const pten::GPUContext& context, + int x_dim, + int y_dim) { + PADDLE_ENFORCE_GT(x_dim, + 0, + paddle::platform::errors::InvalidArgument( + "x dim number should greater than 0," + " but received value is: %d", + x_dim)); + PADDLE_ENFORCE_GT(y_dim, + 0, + paddle::platform::errors::InvalidArgument( + "y dim number should greater than 0," + " but received value is: %d", + y_dim)); + + const int kThreadsPerBlock = 256; + int block_cols = (std::min)(x_dim, kThreadsPerBlock); + int block_rows = (std::max)(kThreadsPerBlock / block_cols, 1); + + int max_physical_threads = context.GetMaxPhysicalThreadCount(); + const int max_blocks = (std::max)(max_physical_threads / kThreadsPerBlock, 1); + + GpuLaunchConfig config; + // Noticed, block size is not align to 32, if needed do it yourself. + config.thread_per_block = dim3(block_cols, block_rows, 1); + + int grid_x = (std::min)(DivUp(x_dim, block_cols), max_blocks); + int grid_y = + (std::min)(max_blocks / grid_x, (std::max)(y_dim / block_rows, 1)); + + config.block_per_grid = dim3(grid_x, grid_y, 1); + return config; +} + +} // namespace gpu +} // namespace backends +} // namespace pten + +#endif diff --git a/paddle/pten/backends/gpu/gpu_types.h b/paddle/pten/backends/gpu/gpu_types.h new file mode 100644 index 00000000000..72353294b29 --- /dev/null +++ b/paddle/pten/backends/gpu/gpu_types.h @@ -0,0 +1,73 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
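// A hedged usage sketch for the 1-D launch helper above: ask the context for
// a launch shape, then hand it to a grid-stride kernel. FillOnes and
// LaunchFillOnes are illustrative names; GetGpuLaunchConfig1D and
// GpuLaunchConfig come from gpu_launch_config.h above, and
// CUDA_KERNEL_LOOP_TYPE is the grid-stride macro from the gpu_helper.h /
// rocm_helper.h headers in this patch:
__global__ void FillOnes(float* out, int64_t n) {
  CUDA_KERNEL_LOOP_TYPE(i, n, int64_t) { out[i] = 1.0f; }  // grid-stride loop
}

void LaunchFillOnes(const pten::GPUContext& ctx, float* out, int64_t n) {
  auto cfg = pten::backends::gpu::GetGpuLaunchConfig1D(ctx, n);
  FillOnes<<<cfg.block_per_grid, cfg.thread_per_block, 0, ctx.stream()>>>(out,
                                                                          n);
}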
+ +#pragma once + +#include "paddle/pten/backends/gpu/forwards.h" +#include "paddle/pten/backends/gpu/gpu_decls.h" + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + +#ifdef PADDLE_WITH_HIP +#include "paddle/pten/backends/dynload/miopen.h" +#include "paddle/pten/backends/dynload/rocblas.h" +#else // PADDLE_WITH_CUDA +#include "paddle/pten/backends/dynload/cublas.h" +#include "paddle/pten/backends/dynload/cudnn.h" +#endif + +namespace pten { + +#ifdef PADDLE_WITH_HIP +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ + using GPU_TYPE = ROCM_TYPE; + +#else // PADDLE_WITH_CDUA + +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ + using GPU_TYPE = CUDA_TYPE; +#endif + +DECLARE_TYPE_FOR_GPU(gpuError_t, cudaError_t, hipError_t); +DECLARE_TYPE_FOR_GPU(gpuMemcpyKind, cudaMemcpyKind, hipMemcpyKind); +DECLARE_TYPE_FOR_GPU(gpuDeviceProp, cudaDeviceProp, hipDeviceProp_t); +DECLARE_TYPE_FOR_GPU(dnnDataType_t, cudnnDataType_t, miopenDataType_t); +DECLARE_TYPE_FOR_GPU(dnnPoolingMode_t, cudnnPoolingMode_t, miopenPoolingMode_t); +DECLARE_TYPE_FOR_GPU(dnnTensorFormat_t, + cudnnTensorFormat_t, + miopenTensorFormat_t); +DECLARE_TYPE_FOR_GPU(dnnActivationMode_t, + cudnnActivationMode_t, + miopenActivationMode_t); + +#undef DECLARE_TYPE_FOR_GPU + +#ifdef PADDLE_WITH_HIP +#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV) \ + constexpr auto GPU_CV = ROCM_CV; +#else // PADDLE_WITH_CUDA +#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV) \ + constexpr auto GPU_CV = CUDA_CV; +#endif + +DECLARE_CONSTANT_FOR_GPU(gpuErrorOutOfMemory, + cudaErrorMemoryAllocation, + hipErrorOutOfMemory); +DECLARE_CONSTANT_FOR_GPU(gpuErrorNotReady, cudaErrorNotReady, hipErrorNotReady); +DECLARE_CONSTANT_FOR_GPU(gpuSuccess, cudaSuccess, hipSuccess); + +#undef DECLARE_CONSTANT_FOR_GPU +} // namespace pten + +#endif // defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) diff --git a/paddle/pten/backends/gpu/rocm/CMakeLists.txt b/paddle/pten/backends/gpu/rocm/CMakeLists.txt new file mode 100644 index 00000000000..181f92cbfc3 --- /dev/null +++ b/paddle/pten/backends/gpu/rocm/CMakeLists.txt @@ -0,0 +1 @@ +hip_library(pten_rocm_info SRCS rocm_info.cc DEPS gflags glog enforce pten_dynload_cuda) diff --git a/paddle/pten/backends/gpu/rocm/rocm_helper.h b/paddle/pten/backends/gpu/rocm/rocm_helper.h new file mode 100644 index 00000000000..3a16bfc5286 --- /dev/null +++ b/paddle/pten/backends/gpu/rocm/rocm_helper.h @@ -0,0 +1,72 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +namespace pten { +namespace backends { +namespace gpu { + +/* + * Summary: Grid stride looping macro in CUDA kernel + * + * [ Why need this macro? ] + * + * The original looping in CUDA kernel is: + * + * `for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + * i += blockDim.x * gridDim.x)` + * + * This for condition is risky. 
The value of `blockIdx.x * blockDim.x` + * may be large, such as over 1GB, the first iteration is no problem here, + * but when `i += blockDim.x * gridDim.x` is executed, the value of i + * will greater than INT_MAX and overflow becomes negative value, at + * this time, the cycle condition `i < (n)` is still satisfied, so it + * will cause illegal access to cuda memory. + * + * Here is a real example in ERINE, it will trigger above error. + * The related data are: + * - blockIdx.x = 2172938 + * - blockDim.x = 512 + * - blockIdx.x * blockDim.x = 1112543864 + * - INT_MAX = 2147483647 + * + * So we polish the for condition as follow, the int64_t __index__ will + * prevent overflow in the loop increment. + * + * Parameters: + * - i: loop index + * - num: total element numbers + * + * Examples: + * template + * __global__ void Scale(T* logit_grad, const T* loss_grad, const int num, + * const int d, const int remain) { + * CUDA_KERNEL_LOOP(index, num) { + * int idx_n = index / d; + * int idx_remain = index % remain; + * logit_grad[index] *= loss_grad[idx_n * remain + idx_remain]; + * } + * } + * +*/ + +#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \ + int64_t __index__ = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; \ + for (index_type i = __index__; __index__ < (num); \ + __index__ += hipBlockDim_x * hipGridDim_x, i = __index__) + +} // namespace gpu +} // namespace backends +} // namespace pten diff --git a/paddle/fluid/platform/device/gpu/rocm/rocm_info.cc b/paddle/pten/backends/gpu/rocm/rocm_info.cc similarity index 72% rename from paddle/fluid/platform/device/gpu/rocm/rocm_info.cc rename to paddle/pten/backends/gpu/rocm/rocm_info.cc index 06dba8ce423..095acdf0769 100644 --- a/paddle/fluid/platform/device/gpu/rocm/rocm_info.cc +++ b/paddle/pten/backends/gpu/rocm/rocm_info.cc @@ -12,20 +12,20 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include +#include "paddle/pten/backends/gpu/gpu_info.h" + +// TODO(pten): remove fluid headers. #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/lock_guard_ptr.h" -#include "paddle/fluid/platform/macros.h" -#include "paddle/fluid/platform/monitor.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/fluid/string/split.h" static std::once_flag g_device_props_size_init_flag; static std::vector> g_device_props_init_flags; static std::vector g_device_props; -namespace paddle { -namespace platform { +namespace pten { +namespace backends { +namespace gpu { + int DnnVersion() { if (!dynload::HasCUDNN()) return -1; size_t version_major, version_minor, version_patch; @@ -78,11 +78,13 @@ int GetGPUDeviceCount() { } int GetGPUComputeCapability(int id) { - PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), - platform::errors::InvalidArgument( + PADDLE_ENFORCE_LT(id, + GetGPUDeviceCount(), + paddle::platform::errors::InvalidArgument( "Device id must be less than GPU count, " "but received id is: %d. 
GPU count is: %d.", - id, GetGPUDeviceCount())); + id, + GetGPUDeviceCount())); int major, minor; auto major_error_code = hipDeviceGetAttribute( &major, hipDeviceAttributeComputeCapabilityMajor, id); @@ -95,22 +97,26 @@ int GetGPUComputeCapability(int id) { } int GetGPURuntimeVersion(int id) { - PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), - platform::errors::InvalidArgument( + PADDLE_ENFORCE_LT(id, + GetGPUDeviceCount(), + paddle::platform::errors::InvalidArgument( "Device id must be less than GPU count, " "but received id is: %d. GPU count is: %d.", - id, GetGPUDeviceCount())); + id, + GetGPUDeviceCount())); int runtime_version = 0; PADDLE_ENFORCE_GPU_SUCCESS(hipRuntimeGetVersion(&runtime_version)); return runtime_version; } int GetGPUDriverVersion(int id) { - PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), - platform::errors::InvalidArgument( + PADDLE_ENFORCE_LT(id, + GetGPUDeviceCount(), + paddle::platform::errors::InvalidArgument( "Device id must be less than GPU count, " "but received id is: %d. GPU count is: %d.", - id, GetGPUDeviceCount())); + id, + GetGPUDeviceCount())); int driver_version = 0; PADDLE_ENFORCE_GPU_SUCCESS(hipDriverGetVersion(&driver_version)); return driver_version; @@ -119,11 +125,13 @@ int GetGPUDriverVersion(int id) { bool TensorCoreAvailable() { return false; } int GetGPUMultiProcessors(int id) { - PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), - platform::errors::InvalidArgument( + PADDLE_ENFORCE_LT(id, + GetGPUDeviceCount(), + paddle::platform::errors::InvalidArgument( "Device id must be less than GPU count, " "but received id is: %d. GPU count is: %d.", - id, GetGPUDeviceCount())); + id, + GetGPUDeviceCount())); int count; PADDLE_ENFORCE_GPU_SUCCESS( hipDeviceGetAttribute(&count, hipDeviceAttributeMultiprocessorCount, id)); @@ -131,11 +139,13 @@ int GetGPUMultiProcessors(int id) { } int GetGPUMaxThreadsPerMultiProcessor(int id) { - PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), - platform::errors::InvalidArgument( + PADDLE_ENFORCE_LT(id, + GetGPUDeviceCount(), + paddle::platform::errors::InvalidArgument( "Device id must be less than GPU count, " "but received id is: %d. GPU count is: %d.", - id, GetGPUDeviceCount())); + id, + GetGPUDeviceCount())); int count; PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceGetAttribute( &count, hipDeviceAttributeMaxThreadsPerMultiProcessor, id)); @@ -144,11 +154,13 @@ int GetGPUMaxThreadsPerMultiProcessor(int id) { } int GetGPUMaxThreadsPerBlock(int id) { - PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), - platform::errors::InvalidArgument( + PADDLE_ENFORCE_LT(id, + GetGPUDeviceCount(), + paddle::platform::errors::InvalidArgument( "Device id must be less than GPU count, " "but received id is: %d. GPU count is: %d.", - id, GetGPUDeviceCount())); + id, + GetGPUDeviceCount())); int count; PADDLE_ENFORCE_GPU_SUCCESS( hipDeviceGetAttribute(&count, hipDeviceAttributeMaxThreadsPerBlock, id)); @@ -161,35 +173,37 @@ int GetCurrentDeviceId() { return device_id; } -dim3 GetGpuMaxGridDimSize(int id) { - PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), - platform::errors::InvalidArgument( +std::array GetGpuMaxGridDimSize(int id) { + PADDLE_ENFORCE_LT(id, + GetGPUDeviceCount(), + paddle::platform::errors::InvalidArgument( "Device id must be less than GPU count, " "but received id is: %d. 
GPU count is: %d.", - id, GetGPUDeviceCount())); - dim3 ret; + id, + GetGPUDeviceCount())); + std::array ret; int size; auto error_code_x = hipDeviceGetAttribute(&size, hipDeviceAttributeMaxGridDimX, id); PADDLE_ENFORCE_GPU_SUCCESS(error_code_x); - ret.x = size; + ret[0] = size; auto error_code_y = hipDeviceGetAttribute(&size, hipDeviceAttributeMaxGridDimY, id); PADDLE_ENFORCE_GPU_SUCCESS(error_code_y); - ret.y = size; + ret[1] = size; auto error_code_z = hipDeviceGetAttribute(&size, hipDeviceAttributeMaxGridDimZ, id); PADDLE_ENFORCE_GPU_SUCCESS(error_code_z); - ret.z = size; + ret[2] = size; return ret; } const gpuDeviceProp &GetDeviceProperties(int id) { std::call_once(g_device_props_size_init_flag, [&] { int gpu_num = 0; - gpu_num = platform::GetGPUDeviceCount(); + gpu_num = GetGPUDeviceCount(); g_device_props_init_flags.resize(gpu_num); g_device_props.resize(gpu_num); for (int i = 0; i < gpu_num; ++i) { @@ -198,16 +212,17 @@ const gpuDeviceProp &GetDeviceProperties(int id) { }); if (id == -1) { - id = platform::GetCurrentDeviceId(); + id = GetCurrentDeviceId(); } if (id < 0 || id >= static_cast(g_device_props.size())) { - PADDLE_THROW(platform::errors::OutOfRange( + PADDLE_THROW(paddle::platform::errors::OutOfRange( "The device id %d is out of range [0, %d), where %d is the number of " "devices on this machine. Because the device id should be greater than " "or equal to zero and smaller than the number of gpus. Please input " "appropriate device again!", - id, static_cast(g_device_props.size()), + id, + static_cast(g_device_props.size()), static_cast(g_device_props.size()))); } @@ -220,32 +235,43 @@ const gpuDeviceProp &GetDeviceProperties(int id) { void SetDeviceId(int id) { // TODO(qijun): find a better way to cache the cuda device count - PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), - platform::errors::InvalidArgument( + PADDLE_ENFORCE_LT(id, + GetGPUDeviceCount(), + paddle::platform::errors::InvalidArgument( "Device id must be less than GPU count, " "but received id is: %d. 
GPU count is: %d.", - id, GetGPUDeviceCount())); + id, + GetGPUDeviceCount())); PADDLE_RETRY_CUDA_SUCCESS(hipSetDevice(id)); } -void GpuMemcpyAsync(void *dst, const void *src, size_t count, - gpuMemcpyKind kind, gpuStream_t stream) { +void GpuMemcpyAsync(void *dst, + const void *src, + size_t count, + gpuMemcpyKind kind, + gpuStream_t stream) { PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync(dst, src, count, kind, stream)); } -void GpuMemcpySync(void *dst, const void *src, size_t count, +void GpuMemcpySync(void *dst, + const void *src, + size_t count, gpuMemcpyKind kind) { PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpy(dst, src, count, kind)); } -void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src, - int src_device, size_t count, gpuStream_t stream) { +void GpuMemcpyPeerAsync(void *dst, + int dst_device, + const void *src, + int src_device, + size_t count, + gpuStream_t stream) { PADDLE_ENFORCE_GPU_SUCCESS( hipMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream)); } -void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src, - int src_device, size_t count) { +void GpuMemcpyPeerSync( + void *dst, int dst_device, const void *src, int src_device, size_t count) { PADDLE_ENFORCE_GPU_SUCCESS( hipMemcpyPeer(dst, dst_device, src, src_device, count)); } @@ -265,5 +291,7 @@ void GpuDestroyStream(gpuStream_t stream) { void GpuDeviceSync() { PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); } gpuError_t GpuGetLastError() { return hipGetLastError(); } -} // namespace platform -} // namespace paddle + +} // namespace gpu +} // namespace backends +} // namespace pten diff --git a/paddle/pten/backends/xpu/xpu_context.cc b/paddle/pten/backends/xpu/xpu_context.cc index af4478662a5..22e82b220d2 100644 --- a/paddle/pten/backends/xpu/xpu_context.cc +++ b/paddle/pten/backends/xpu/xpu_context.cc @@ -13,8 +13,11 @@ // limitations under the License. #include "paddle/pten/backends/xpu/xpu_context.h" + #include + #include "paddle/pten/api/ext/exception.h" +#include "paddle/pten/common/place.h" #include "xpu/runtime.h" #include "xpu/runtime_ex.h" @@ -24,12 +27,11 @@ namespace xpu = baidu::xpu::api; namespace pten { -struct XPUContext::XPUImpl { - void SetL3Cache() { +struct XPUContext::Impl { + void SetL3Cache(int l3_size = 14155776) { const int MAX_XPU_NUM = 16; static void* l3ptrs[MAX_XPU_NUM] = {nullptr}; - int l3_size = 13.5 * 1024 * 1024; if (std::getenv("XPU_PADDLE_L3_SIZE") != nullptr) { l3_size = atoi(std::getenv("XPU_PADDLE_L3_SIZE")); } @@ -52,48 +54,28 @@ struct XPUContext::XPUImpl { } } - XPUImpl() { - context_ = xpu::create_context(); - xpu_version_ = backends::xpu::get_xpu_version(place_.device); - } - - explicit XPUImpl(XPUPlace place) : place_(place) { - backends::xpu::XPUDeviceGuard guard(place_.GetDeviceId()); - - LOG_FIRST_N(WARNING, 1) << "Please NOTE: xpu device: " - << static_cast(place_.device); - - context_ = xpu::create_context(); - xpu_version_ = backends::xpu::get_xpu_version(place_.device); - SetL3Cache(); - } + Impl() : place_(XPUPlace()) {} - // Users need to manage external resources. 
- explicit XPUImpl(const XPUContextResource& ctx_res, - const XPUPlace& place = XPUPlace(0)) - : res_(ctx_res), place_(place) { - context_ = res_.context; - xpu_version_ = backends::xpu::get_xpu_version(place_.device); - SetL3Cache(); - } + explicit Impl(const Place& place) : place_(place) {} - ~XPUImpl() { - if (res_.context == nullptr && context_ != nullptr) { + ~Impl() { + if (owned_ && context_ != nullptr) { xpu::destroy_context(context_); context_ = nullptr; } } - Place GetPlace() const { return place_; } - - backends::xpu::XPUVersion GetXpuVersion() const { return xpu_version_; } + const Place& GetPlace() const { return place_; } xpu::Context* GetXContext() const { PD_CHECK(context_ != nullptr, "the xpu context is nullptr."); return context_; } - xpu::BKCLContext_t GetBkclContext() const { return bkcl_context_; } + xpu::BKCLContext_t GetBkclContext() const { + PD_CHECK(bkcl_context_ != nullptr, "the xpu bkcl_context is nullptr."); + return bkcl_context_; + } void Wait() const { backends::xpu::SetXPUDeviceId(place_.GetDeviceId()); @@ -101,53 +83,41 @@ struct XPUContext::XPUImpl { xpu_wait(context_->xpu_stream); } - void SetXContext(xpu::Context* context) { - if (context == nullptr) { - return; - } - res_.context = context; - context_ = context; + void Init() { + owned_ = true; + backends::xpu::XPUDeviceGuard guard(place_.GetDeviceId()); + LOG_FIRST_N(WARNING, 1) << "Please NOTE: xpu device: " + << static_cast(place_.device); + context_ = xpu::create_context(); + xpu_version_ = backends::xpu::get_xpu_version(place_.device); + SetL3Cache(); } + void SetXContext(xpu::Context* context) { context_ = context; } + void SetBkclContext(xpu::BKCLContext_t context) { bkcl_context_ = context; } - XPUContextResource res_; - XPUPlace place_; + bool owned_{false}; + Place place_; backends::xpu::XPUVersion xpu_version_; xpu::Context* context_{nullptr}; + // NOTE: Distributed communicator, distributed framework manages its // resources, XPUContext only holds references. 
xpu::BKCLContext_t bkcl_context_{nullptr}; }; -XPUContext::XPUContext() : DeviceContext() { - impl_ = std::make_unique(); -} +XPUContext::XPUContext() : DeviceContext(), impl_(std::make_unique()) {} -XPUContext::XPUContext(const XPUPlace& place) { - impl_ = std::make_unique(place); -} - -XPUContext::XPUContext(const XPUContext& other) : DeviceContext() { - impl_ = std::make_unique(); - impl_->SetXContext(other.x_context()); - impl_->SetBkclContext(other.bkcl_context()); -} - -XPUContext::XPUContext(XPUContext&& other) : DeviceContext() { - impl_ = std::move(other.impl_); -} +XPUContext::XPUContext(const XPUPlace& place) + : DeviceContext(), impl_(std::make_unique(place)) {} XPUContext::~XPUContext() = default; -XPUContext::XPUContext(const XPUContextResource& ctx_res) : DeviceContext() { - impl_ = std::make_unique(ctx_res); -} - -Place XPUContext::GetPlace() const { return impl_->GetPlace(); } +const Place& XPUContext::GetPlace() const { return impl_->GetPlace(); } backends::xpu::XPUVersion XPUContext::xpu_version() const { - return impl_->GetXpuVersion(); + return impl_->xpu_version_; } xpu::Context* XPUContext::x_context() const { return impl_->GetXContext(); } @@ -158,12 +128,16 @@ xpu::BKCLContext_t XPUContext::bkcl_context() const { void XPUContext::Wait() const { impl_->Wait(); } -void XPUContext::set_x_context(xpu::Context* context) { +void XPUContext::SetXContext(xpu::Context* context) { impl_->SetXContext(context); } -void XPUContext::set_bkcl_context(xpu::BKCLContext_t context) { +void XPUContext::SetL3Cache(int l3_size) { impl_->SetL3Cache(l3_size); } + +void XPUContext::SetBkclContext(xpu::BKCLContext_t context) { impl_->SetBkclContext(context); } +void XPUContext::Init() { impl_->Init(); } + } // namespace pten diff --git a/paddle/pten/backends/xpu/xpu_context.h b/paddle/pten/backends/xpu/xpu_context.h index 4ae5786211d..440d06a60fc 100644 --- a/paddle/pten/backends/xpu/xpu_context.h +++ b/paddle/pten/backends/xpu/xpu_context.h @@ -26,26 +26,15 @@ namespace xpu = baidu::xpu::api; namespace pten { -struct XPUContextResource { - xpu::Context* context{nullptr}; -}; - class XPUContext : public DeviceContext { public: - // NOTE: DeviceContext hold resources. Used in training scenarios. XPUContext(); explicit XPUContext(const XPUPlace&); - // NOTE: Share the same underlying resources, please ensure that resources are - // not released. - XPUContext(const XPUContext&); - - XPUContext(XPUContext&&); - virtual ~XPUContext(); - Place GetPlace() const override; + const Place& GetPlace() const override; backends::xpu::XPUVersion xpu_version() const; @@ -53,21 +42,28 @@ class XPUContext : public DeviceContext { // Return bkcl context. xpu::BKCLContext_t bkcl_context() const; + void SetBkclContext(xpu::BKCLContext_t context); // Wait for all operations completion in the stream. void Wait() const override; public: - // NOTE: External users manage resources. Used in inference scenarios. - explicit XPUContext(const XPUContextResource&); + // NOTE: DeviceContext hold resources. Used in training scenarios. + // The interface used by the training scene, DeviceContext will initialize + // all resources and delete them when destructing. + void Init(); - void set_x_context(xpu::Context*); + public: + // NOTE: External users manage resources. Used in inference scenarios. + // The Set interface is for inference only, DeviceContext will mark the + // resource as external, and will not delete any resource when destructing. 
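// For example, an inference engine that owns its own xpu::Context could wire
// it in roughly like this (an illustrative sketch; external_ctx and
// external_bkcl are assumed to be created and destroyed by the caller):
//   pten::XPUContext ctx(pten::XPUPlace(0));
//   ctx.SetXContext(external_ctx);      // referenced only, never destroyed here
//   ctx.SetBkclContext(external_bkcl);  // optional distributed communicator
// Training code instead calls ctx.Init() and lets the context own its
// resources.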
+ void SetXContext(xpu::Context*); - void set_bkcl_context(xpu::BKCLContext_t context); + void SetL3Cache(int l3_size = 14155776); private: - struct XPUImpl; - std::unique_ptr impl_; + struct Impl; + std::unique_ptr impl_; }; } // namespace pten diff --git a/paddle/pten/core/CMakeLists.txt b/paddle/pten/core/CMakeLists.txt index ab6a9931973..46cd2a96414 100644 --- a/paddle/pten/core/CMakeLists.txt +++ b/paddle/pten/core/CMakeLists.txt @@ -1,14 +1,6 @@ -# utils used for compatible for fluid op system +# compatible utils used for fluid op system add_subdirectory(compat) -if(WITH_GPU) - cc_library(convert_utils SRCS convert_utils.cc DEPS data_type place gpu_info) -elseif(WITH_ROCM) - cc_library(convert_utils SRCS convert_utils.cc DEPS data_type place gpu_info) -else() - cc_library(convert_utils SRCS convert_utils.cc DEPS data_type place) -endif() - cc_library(errors SRCS errors.cc) set(pten_enforce_deps errors flags) if(WITH_GPU) @@ -19,21 +11,20 @@ cc_library(pten_enforce INTERFACE SRCS enforce.cc DEPS ${pten_enforce_deps}) cc_library(kernel_factory SRCS kernel_factory.cc DEPS pten_enforce convert_utils) cc_library(kernel_context SRCS kernel_context.cc DEPS pten_enforce pten_context) -cc_library(tensor_base SRCS tensor_base.cc allocator.cc storage.cc DEPS pten_enforce) +cc_library(ddim SRCS ddim.cc DEPS pten_enforce) +cc_library(tensor_base SRCS tensor_base.cc allocator.cc DEPS pten_enforce) cc_library(tensor_meta SRCS tensor_meta.cc DEPS pten_enforce mixed_vector) cc_library(lod_utils SRCS lod_utils.cc DEPS pten_enforce mixed_vector) + +cc_library(pten_device_context SRCS device_context.cc DEPS tensor_base) +cc_library(dense_tensor SRCS dense_tensor.cc dense_tensor_impl.cc DEPS convert_utils tensor_meta tensor_base) cc_library(sparse_coo_tensor SRCS sparse_coo_tensor.cc DEPS tensor_meta tensor_base) cc_library(sparse_csr_tensor SRCS sparse_csr_tensor.cc DEPS dense_tensor tensor_base) -cc_library(dense_tensor SRCS dense_tensor.cc dense_tensor_impl.cc DEPS convert_utils tensor_meta tensor_base) -cc_library(pten_device_context SRCS device_context.cc DEPS tensor_base ) cc_library(meta_tensor SRCS meta_tensor.cc DEPS tensor_base tensor_meta dense_tensor) cc_library(infermeta_utils SRCS infermeta_utils.cc DEPS meta_tensor) cc_library(selected_rows SRCS selected_rows.cc DEPS dense_tensor mixed_vector pten_enforce ddim) -cc_test(unroll_array_ops_test SRCS unroll_array_ops_test.cc) -cc_library(ddim SRCS ddim.cc DEPS eigen3 boost enforce) - # Will remove once we implemented MKLDNN_Tensor if(WITH_MKLDNN) add_dependencies(dense_tensor mkldnn) diff --git a/paddle/pten/core/allocator.h b/paddle/pten/core/allocator.h index 75d42c4fd15..098e5698dee 100644 --- a/paddle/pten/core/allocator.h +++ b/paddle/pten/core/allocator.h @@ -16,7 +16,9 @@ limitations under the License. */ #include #include -#include "paddle/fluid/platform/place.h" +#include + +#include "paddle/pten/common/place.h" namespace pten { @@ -26,7 +28,7 @@ namespace pten { /// support being inherited. 
class Allocation { public: - using Place = paddle::platform::Place; + using Place = pten::Place; using DeleterFnPtr = void (*)(Allocation*); Allocation() = default; diff --git a/paddle/pten/core/compat/CMakeLists.txt b/paddle/pten/core/compat/CMakeLists.txt index 0c081edb81c..6d1529d94fd 100644 --- a/paddle/pten/core/compat/CMakeLists.txt +++ b/paddle/pten/core/compat/CMakeLists.txt @@ -1,2 +1,13 @@ cc_library(arg_map_context SRCS arg_map_context.cc DEPS pten_enforce) -cc_library(op_utils SRCS op_utils.cc DEPS arg_map_context enforce convert_utils) +cc_library(op_utils SRCS op_utils.cc DEPS arg_map_context enforce) +if(WITH_GPU) + cc_library(convert_utils SRCS convert_utils.cc DEPS data_type place op_utils gpu_info) +elseif(WITH_ROCM) + cc_library(convert_utils SRCS convert_utils.cc DEPS data_type place op_utils gpu_info) +elseif(WITH_XPU) + cc_library(convert_utils SRCS convert_utils.cc DEPS data_type place op_utils xpu_info) +elseif(WITH_ASCEND_CL) + cc_library(convert_utils SRCS convert_utils.cc DEPS data_type place op_utils npu_info) +else() + cc_library(convert_utils SRCS convert_utils.cc DEPS data_type place op_utils) +endif() diff --git a/paddle/pten/core/compat/arg_map_context.cc b/paddle/pten/core/compat/arg_map_context.cc index 73fa0b300cf..b6b2bf66270 100644 --- a/paddle/pten/core/compat/arg_map_context.cc +++ b/paddle/pten/core/compat/arg_map_context.cc @@ -14,8 +14,8 @@ limitations under the License. */ #include "paddle/pten/core/compat/arg_map_context.h" -#include "paddle/fluid/string/string_helper.h" #include "paddle/pten/core/enforce.h" +#include "paddle/utils/string/string_helper.h" namespace pten { std::ostream& operator<<(std::ostream& os, KernelSignature signature) { diff --git a/paddle/pten/core/compat/arg_map_context.h b/paddle/pten/core/compat/arg_map_context.h index 835799ec546..6898dd36d63 100644 --- a/paddle/pten/core/compat/arg_map_context.h +++ b/paddle/pten/core/compat/arg_map_context.h @@ -75,6 +75,9 @@ class ArgumentMappingContext { virtual bool IsDenseTensorInput(const std::string& name) const = 0; virtual bool IsSelectedRowsInput(const std::string& name) const = 0; + + virtual bool IsDenseTensorOutput(const std::string& name) const = 0; + virtual bool IsSelectedRowsOutput(const std::string& name) const = 0; }; } // namespace pten diff --git a/paddle/pten/core/convert_utils.cc b/paddle/pten/core/compat/convert_utils.cc similarity index 81% rename from paddle/pten/core/convert_utils.cc rename to paddle/pten/core/compat/convert_utils.cc index 70184e31f7d..355a67601dd 100644 --- a/paddle/pten/core/convert_utils.cc +++ b/paddle/pten/core/compat/convert_utils.cc @@ -11,10 +11,14 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/pten/core/convert_utils.h" -#include "paddle/pten/core/kernel_alias_name.h" + +#include "paddle/pten/core/compat/convert_utils.h" +#include "paddle/pten/core/compat/op_utils.h" + // See Note [ Why still include the fluid headers? 
] #include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/npu/npu_info.h" +#include "paddle/fluid/platform/device/xpu/xpu_info.h" namespace pten { @@ -63,15 +67,18 @@ paddle::experimental::DataType TransToPtenDataType( } } -paddle::platform::Place TransToFluidPlace(const Backend& backend) { - // TODO(chenweihang): add other trans cases later +paddle::platform::Place TransToFluidPlace(const Backend& backend, + bool set_device_id) { + // NOTE(zhiqiu): GetCurrentDeviceId not always success, and device id is not + // always needed. + // So, add set_device_id parameter here. switch (backend) { case pten::Backend::CPU: return paddle::platform::CPUPlace(); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) case pten::Backend::GPU: return paddle::platform::CUDAPlace( - paddle::platform::GetCurrentDeviceId()); + set_device_id ? paddle::platform::GetCurrentDeviceId() : 0); #endif #ifdef PADDLE_WITH_MKLDNN case pten::Backend::MKLDNN: @@ -80,7 +87,17 @@ paddle::platform::Place TransToFluidPlace(const Backend& backend) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) case pten::Backend::CUDNN: return paddle::platform::CUDAPlace( - paddle::platform::GetCurrentDeviceId()); + set_device_id ? paddle::platform::GetCurrentDeviceId() : 0); +#endif +#if defined(PADDLE_WITH_XPU) + case pten::Backend::XPU: + return paddle::platform::XPUPlace( + set_device_id ? paddle::platform::GetXPUCurrentDeviceId() : 0); +#endif +#if defined(PADDLE_WITH_ASCEND_CL) + case pten::Backend::NPU: + return paddle::platform::NPUPlace( + set_device_id ? paddle::platform::GetCurrentNPUDeviceId() : 0); #endif default: PADDLE_THROW(paddle::platform::errors::Unimplemented( @@ -126,26 +143,6 @@ paddle::framework::proto::VarType::Type TransToProtoVarType( } } -paddle::framework::LoD TransToFluidLoD(const pten::LoD& lod) { - paddle::framework::LoD out; - out.reserve(lod.size()); - - for (auto& elem : lod) { - out.emplace_back(elem); - } - return out; -} - -pten::LoD TransToPtenLoD(const paddle::framework::LoD& lod) { - pten::LoD out; - out.reserve(lod.size()); - - for (auto& elem : lod) { - out.emplace_back(elem); - } - return out; -} - size_t DataTypeSize(DataType dtype) { switch (dtype) { case DataType::UNDEFINED: @@ -237,12 +234,21 @@ std::string DataType2String(DataType dtype) { } } -const std::string& TransToPtenKernelName(const std::string& fluid_op_name) { - if (kernel_alias_name_map.find(fluid_op_name) != - kernel_alias_name_map.end()) { - return kernel_alias_name_map.at(fluid_op_name); +std::string TransToPtenKernelName(const std::string& fluid_op_name) { + return OpUtilsMap::Instance().GetBaseKernelName(fluid_op_name); +} + +const std::string& TransToFluidOpName(const std::string& pten_kernel_name) { + auto& base_kernel_name_map = OpUtilsMap::Instance().base_kernel_name_map(); + auto it = std::find_if(base_kernel_name_map.begin(), + base_kernel_name_map.end(), + [&pten_kernel_name](const auto& pair) { + return pair.second == pten_kernel_name; + }); + if (it != base_kernel_name_map.end()) { + return it->first; } - return fluid_op_name; + return pten_kernel_name; } } // namespace pten diff --git a/paddle/pten/core/convert_utils.h b/paddle/pten/core/compat/convert_utils.h similarity index 74% rename from paddle/pten/core/convert_utils.h rename to paddle/pten/core/compat/convert_utils.h index 9e33d37c4a8..1d241c5ad40 100644 --- a/paddle/pten/core/convert_utils.h +++ b/paddle/pten/core/compat/convert_utils.h @@ -17,33 +17,28 @@ limitations under the License. 
*/ #include "paddle/pten/common/backend.h" #include "paddle/pten/common/data_type.h" #include "paddle/pten/common/layout.h" +#include "paddle/pten/common/place.h" #include "paddle/pten/core/tensor_meta.h" // See Note [ Why still include the fluid headers? ] #include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/platform/place.h" // TODO(chenweihang): this file may need to be removed namespace pten { -using DataType = paddle::experimental::DataType; -using DataLayout = paddle::experimental::DataLayout; +std::string TransToPtenKernelName(const std::string& fluid_op_name); +const std::string& TransToFluidOpName(const std::string& pten_kernel_name); -const std::string& TransToPtenKernelName(const std::string& fluid_op_name); - -Backend TransToPtenBackend(const paddle::platform::Place& place); +Backend TransToPtenBackend(const pten::Place& place); DataType TransToPtenDataType( const paddle::framework::proto::VarType::Type& dtype); -paddle::platform::Place TransToFluidPlace(const Backend& backend); +paddle::platform::Place TransToFluidPlace(const Backend& backend, + bool set_device_id = true); paddle::framework::proto::VarType::Type TransToProtoVarType( const DataType& dtype); -paddle::framework::LoD TransToFluidLoD(const pten::LoD& lod); -pten::LoD TransToPtenLoD(const paddle::framework::LoD& lod); - size_t DataTypeSize(DataType dtype); DataType String2DataType(const std::string& str); std::string DataType2String(DataType dtype); diff --git a/paddle/pten/core/compat/op_utils.h b/paddle/pten/core/compat/op_utils.h index 505ef13891a..93090616366 100644 --- a/paddle/pten/core/compat/op_utils.h +++ b/paddle/pten/core/compat/op_utils.h @@ -14,18 +14,34 @@ limitations under the License. */ #pragma once -#include +#include +#include #include "paddle/pten/core/compat/arg_map_context.h" +#include "paddle/pten/core/enforce.h" #include "paddle/pten/core/infermeta_utils.h" -#include "paddle/pten/core/kernel_def.h" #include "paddle/pten/core/macros.h" +#include "paddle/pten/core/type_defs.h" #include "paddle/utils/flat_hash_map.h" -#include "paddle/fluid/platform/enforce.h" - namespace pten { +/** + * Some fluid ops are no longer used under the corresponding official API + * system of 2.0. These names need to correspond to the official API names + * after 2.0, and can no longer be occupied by the previously abandoned ops. + * They are marked here uniformly. 
+ */ +const std::unordered_set deprecated_op_names({"flatten", + "flatten_grad", + "matmul", + "matmul_grad", + "matmul_grad_grad", + "mean", + "reshape", + "reshape_grad", + "sum"}); + class DefaultKernelSignatureMap { public: static DefaultKernelSignatureMap& Instance(); @@ -37,7 +53,7 @@ class DefaultKernelSignatureMap { PADDLE_ENFORCE_NE( it, map_.end(), - paddle::platform::errors::NotFound( + pten::errors::NotFound( "Operator `%s`'s kernel signature is not registered.", op_type)); return it->second; } @@ -46,7 +62,7 @@ class DefaultKernelSignatureMap { PADDLE_ENFORCE_NE( Has(op_type), true, - paddle::platform::errors::AlreadyExists( + pten::errors::AlreadyExists( "Operator (%s)'s Kernel Siginature has been registered.", op_type)); map_.insert({std::move(op_type), std::move(signature)}); } @@ -64,32 +80,37 @@ class OpUtilsMap { static OpUtilsMap& Instance(); bool Contains(const std::string& op_type) const { - return name_map_.count(op_type) || arg_mapping_fn_map_.count(op_type); + return base_kernel_name_map_.count(op_type) || + arg_mapping_fn_map_.count(op_type); } - void InsertApiName(std::string op_type, std::string api_name) { + void InsertBaseKernelName(std::string op_type, std::string base_kernel_name) { PADDLE_ENFORCE_EQ( - name_map_.count(op_type), + base_kernel_name_map_.count(op_type), 0UL, - paddle::platform::errors::AlreadyExists( + pten::errors::AlreadyExists( "Operator (%s)'s api name has been registered.", op_type)); - name_map_.insert({std::move(op_type), std::move(api_name)}); + base_kernel_name_map_.insert( + {std::move(op_type), std::move(base_kernel_name)}); } void InsertArgumentMappingFn(std::string op_type, ArgumentMappingFn fn) { PADDLE_ENFORCE_EQ( arg_mapping_fn_map_.count(op_type), 0UL, - paddle::platform::errors::AlreadyExists( + pten::errors::AlreadyExists( "Operator (%s)'s argu,emt mapping function has been registered.", op_type)); arg_mapping_fn_map_.insert({std::move(op_type), std::move(fn)}); } - std::string GetApiName(const std::string& op_type) const { - auto it = name_map_.find(op_type); - if (it == name_map_.end()) { + std::string GetBaseKernelName(const std::string& op_type) const { + if (deprecated_op_names.find(op_type) != deprecated_op_names.end()) { return "deprecated"; + } + auto it = base_kernel_name_map_.find(op_type); + if (it == base_kernel_name_map_.end()) { + return op_type; } else { return it->second; } @@ -108,18 +129,23 @@ class OpUtilsMap { } } + const paddle::flat_hash_map& base_kernel_name_map() + const { + return base_kernel_name_map_; + } + private: OpUtilsMap() = default; - paddle::flat_hash_map name_map_; + paddle::flat_hash_map base_kernel_name_map_; paddle::flat_hash_map arg_mapping_fn_map_; DISABLE_COPY_AND_ASSIGN(OpUtilsMap); }; -struct ApiNameRegistrar { - ApiNameRegistrar(const char* op_type, const char* api_name) { - OpUtilsMap::Instance().InsertApiName(op_type, api_name); +struct BaseKernelNameRegistrar { + BaseKernelNameRegistrar(const char* op_type, const char* base_kernel_name) { + OpUtilsMap::Instance().InsertBaseKernelName(op_type, base_kernel_name); } }; @@ -131,21 +157,21 @@ struct ArgumentMappingFnRegistrar { } }; -#define PT_REGISTER_API_NAME(op_type, api_name) \ - PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ - pt_register_api_name_ns_check_##op_type, \ - "PT_REGISTER_API_NAME must be called in global namespace."); \ - static const ::pten::ApiNameRegistrar __registrar_api_name_for_##op_type( \ - #op_type, #api_name); \ - int TouchApiNameSymbol_##op_type() { return 0; } - -#define PT_DECLARE_API_NAME(op_type) \ - 
PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ - pt_declare_ai_name_ns_check_##op_type, \ - "PT_DECLARE_API_NAME must be called in global namespace."); \ - extern int TouchApiNameSymbol_##op_type(); \ - UNUSED static int __declare_api_name_symbol_for_##op_type = \ - TouchApiNameSymbol_##op_type() +#define PT_REGISTER_BASE_KERNEL_NAME(op_type, base_kernel_name) \ + PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ + pt_register_base_kernel_name_ns_check_##op_type, \ + "PT_REGISTER_BASE_KERNEL_NAME must be called in global namespace."); \ + static const ::pten::BaseKernelNameRegistrar \ + __registrar_base_kernel_name_for_##op_type(#op_type, #base_kernel_name); \ + int TouchBaseKernelNameSymbol_##op_type() { return 0; } + +#define PT_DECLARE_BASE_KERNEL_NAME(op_type) \ + PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ + pt_declare_ai_name_ns_check_##op_type, \ + "PT_DECLARE_BASE_KERNEL_NAME must be called in global namespace."); \ + extern int TouchBaseKernelNameSymbol_##op_type(); \ + UNUSED static int __declare_base_kernel_name_symbol_for_##op_type = \ + TouchBaseKernelNameSymbol_##op_type() #define PT_REGISTER_ARG_MAPPING_FN(op_type, arg_mapping_fn) \ PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ diff --git a/paddle/pten/core/compat/type_defs.h b/paddle/pten/core/compat/type_defs.h new file mode 100644 index 00000000000..eb5459b1b6e --- /dev/null +++ b/paddle/pten/core/compat/type_defs.h @@ -0,0 +1,96 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include + +namespace egr { +class EagerTensor; +} +namespace paddle { +namespace framework { +// The order should be as same as framework.proto +// NOTE(xiongkun): we extract from framework/typedef.h to ensure we can transfer +// enforce.h +class BlockDesc; +using Attribute = boost::variant, + std::vector, + std::vector, + bool, + std::vector, + BlockDesc*, + int64_t, + std::vector, + std::vector, + std::vector>; +using AttributeMap = std::unordered_map; +} // namespace framework + +namespace imperative { + +class VariableWrapper; +class SavedVariableWrapperList; +class VarBase; +class OpBase; +class GradOpNode; +class Tracer; + +using WeakNameVarBaseMap = + std::map>>; + +namespace details { +template +struct NameVarMapTrait {}; + +template <> +struct NameVarMapTrait { + using Type = std::map>>; +}; + +template <> +struct NameVarMapTrait { + using Type = std::map; +}; + +template <> +struct NameVarMapTrait { + using Type = + std::map>>; +}; + +} // namespace details + +template +using NameVarMap = typename details::NameVarMapTrait::Type; + +using NameVarBaseMap = NameVarMap; +using NameVariableWrapperMap = NameVarMap; +using NameTensorMap = NameVarMap; + +using VariableWrapperList = std::vector>; + +} // namespace imperative +} // namespace paddle diff --git a/paddle/pten/core/ddim.cc b/paddle/pten/core/ddim.cc index 663f92a5bf8..1846b7cf7fc 100644 --- a/paddle/pten/core/ddim.cc +++ b/paddle/pten/core/ddim.cc @@ -13,10 +13,10 @@ // limitations under the License. #include "paddle/pten/core/ddim.h" + #include namespace pten { -namespace platform = paddle::platform; namespace framework { DDim make_ddim(std::initializer_list dims) { @@ -84,7 +84,7 @@ DDim slice_ddim(const DDim& dim, int begin, int end) { PADDLE_ENFORCE_EQ( (begin >= 0 && end <= dim.size()), true, - platform::errors::InvalidArgument( + pten::errors::InvalidArgument( "[begin(%d), end(%d)) must be inside [0, %d) in ddim slice.", begin, end, @@ -111,30 +111,30 @@ std::ostream& operator<<(std::ostream& os, const DDim& ddim) { } DDim flatten_to_3d(const DDim& src, int num_row_dims, int num_col_dims) { - PADDLE_ENFORCE_GE(src.size(), - 3, - platform::errors::InvalidArgument( - "The rank of src dim should be at least 3 " - "in flatten_to_3d, but received %d.", - src.size())); - PADDLE_ENFORCE_EQ((num_row_dims >= 1 && num_row_dims < src.size()), - true, - platform::errors::InvalidArgument( - "The num_row_dims should be inside [1, %d] " - "in flatten_to_3d, but received %d.", - src.size() - 1, - num_row_dims)); - PADDLE_ENFORCE_EQ((num_col_dims >= 2 && num_col_dims <= src.size()), - true, - platform::errors::InvalidArgument( - "The num_col_dims should be inside [2, %d] " - "in flatten_to_3d, but received %d.", - src.size(), - num_col_dims)); + PADDLE_ENFORCE_GE( + src.size(), + 3, + pten::errors::InvalidArgument("The rank of src dim should be at least 3 " + "in flatten_to_3d, but received %d.", + src.size())); + PADDLE_ENFORCE_EQ( + (num_row_dims >= 1 && num_row_dims < src.size()), + true, + pten::errors::InvalidArgument("The num_row_dims should be inside [1, %d] " + "in flatten_to_3d, but received %d.", + src.size() - 1, + num_row_dims)); + PADDLE_ENFORCE_EQ( + (num_col_dims >= 2 && num_col_dims <= src.size()), + true, + pten::errors::InvalidArgument("The num_col_dims should be inside [2, %d] " + "in flatten_to_3d, but received %d.", + src.size(), + num_col_dims)); PADDLE_ENFORCE_GE( num_col_dims, num_row_dims, - platform::errors::InvalidArgument( + 
pten::errors::InvalidArgument( "The num_row_dims should be less than num_col_dims in flatten_to_3d," "but received num_row_dims = %d, num_col_dims = %d.", num_row_dims, @@ -181,7 +181,7 @@ DDim DDim::reshape(const std::vector& shape) const { if (shape[i] == copy_dim_val) { PADDLE_ENFORCE_LT(static_cast(i), in_dims.size(), - platform::errors::InvalidArgument( + pten::errors::InvalidArgument( "Index %d of shape under which the value of 0 " "is stored, must be lower than the number of " "old dimensions. But received shape[%d] = 0, " @@ -205,22 +205,22 @@ DDim DDim::transpose(const std::vector& axis) const { auto axis_set = std::set(axis.begin(), axis.end()); PADDLE_ENFORCE_EQ(axis_set.size(), axis_size, - platform::errors::InvalidArgument( + pten::errors::InvalidArgument( "In an axis array, elements must be unique.")); PADDLE_ENFORCE_EQ( in_rank, axis_size, - platform::errors::InvalidArgument("The input dimension's size " - "should be equal to the axis's size. " - "But received dimension is %d, " - "axis's size is %d", - in_rank, - axis_size)); + pten::errors::InvalidArgument("The input dimension's size " + "should be equal to the axis's size. " + "But received dimension is %d, " + "axis's size is %d", + in_rank, + axis_size)); PADDLE_ENFORCE_LT(*std::max_element(axis.begin(), axis.end()), axis_size, - platform::errors::InvalidArgument( + pten::errors::InvalidArgument( "Axis values must be ranging from 0 to (dims - 1).")); DDim out_dims(in_dims); @@ -231,4 +231,4 @@ DDim DDim::transpose(const std::vector& axis) const { } } // namespace framework -} // namespace pten \ No newline at end of file +} // namespace pten diff --git a/paddle/pten/core/ddim.h b/paddle/pten/core/ddim.h index 148c32481c0..71ee732be27 100644 --- a/paddle/pten/core/ddim.h +++ b/paddle/pten/core/ddim.h @@ -17,10 +17,10 @@ #include #include -#include "paddle/pten/core/dim.h" +#include "paddle/pten/core/enforce.h" +#include "paddle/pten/core/utils/dim.h" namespace pten { -namespace platform = paddle::platform; namespace framework { #define PADDLE_VISIT_DDIM_BASE(rank, callback) \ @@ -42,7 +42,7 @@ namespace framework { PADDLE_VISIT_DDIM_BASE(8, callback); \ PADDLE_VISIT_DDIM_BASE(9, callback); \ default: \ - PADDLE_THROW(platform::errors::Unimplemented( \ + PADDLE_THROW(pten::errors::Unimplemented( \ "Invalid dimension to be accessed. Now only supports access to " \ "dimension 0 to 9, but received dimension is %d.", \ rank)); \ @@ -98,14 +98,14 @@ class DDim { int64_t& at(int idx) { PADDLE_ENFORCE_GE(idx, 0, - platform::errors::InvalidArgument( + pten::errors::InvalidArgument( "Invalid DDim index to be accessed. The valid index " "is between 0 and %d, but received index is %d.", rank_, idx)); PADDLE_ENFORCE_LT(idx, rank_, - platform::errors::InvalidArgument( + pten::errors::InvalidArgument( "Invalid DDim index to be accessed. The valid index " "is between 0 and %d, but received index is %d.", rank_, @@ -116,14 +116,14 @@ class DDim { int64_t at(int idx) const { PADDLE_ENFORCE_GE(idx, 0, - platform::errors::InvalidArgument( + pten::errors::InvalidArgument( "Invalid DDim index to be accessed. The valid index " "is between 0 and %d, but received index is %d.", rank_, idx)); PADDLE_ENFORCE_LT(idx, rank_, - platform::errors::InvalidArgument( + pten::errors::InvalidArgument( "Invalid DDim index to be accessed. 
The valid index " "is between 0 and %d, but received index is %d.", rank_, diff --git a/paddle/pten/core/dense_tensor.cc b/paddle/pten/core/dense_tensor.cc index 7373ba79c0a..36d56212e21 100644 --- a/paddle/pten/core/dense_tensor.cc +++ b/paddle/pten/core/dense_tensor.cc @@ -14,13 +14,13 @@ limitations under the License. */ #include "paddle/pten/core/dense_tensor.h" -// See Note [ Why still include the fluid headers? ] #include "paddle/pten/common/bfloat16.h" #include "paddle/pten/common/complex.h" #include "paddle/pten/common/float16.h" +#include "paddle/pten/core/compat/convert_utils.h" -#include "paddle/pten/api/lib/utils/storage.h" -#include "paddle/pten/core/convert_utils.h" +// See Note [ Why still include the fluid headers? ] +#include "paddle/fluid/memory/malloc.h" namespace pten { diff --git a/paddle/pten/core/dense_tensor.h b/paddle/pten/core/dense_tensor.h index fbecbcf0a1f..8d10753b58d 100644 --- a/paddle/pten/core/dense_tensor.h +++ b/paddle/pten/core/dense_tensor.h @@ -14,15 +14,15 @@ limitations under the License. */ #pragma once -#include "paddle/fluid/framework/data_layout.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/platform/stream/stream.h" - #include "paddle/pten/core/allocator.h" #include "paddle/pten/core/storage.h" #include "paddle/pten/core/tensor_base.h" #include "paddle/pten/core/tensor_meta.h" +// See Note [ Why still include the fluid headers? ] +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/platform/stream/stream.h" + /* @jim19930609: Move to MKLDNN_Tensor in the future */ #ifdef PADDLE_WITH_MKLDNN @@ -31,7 +31,7 @@ limitations under the License. */ namespace pten { -class CompatibleDenseTensorUtils; +class DenseTensorUtils; /// \brief The Dense tensor store values in a contiguous sequential block /// of memory where all values are represented. Tensors or multi-dimensional @@ -120,8 +120,8 @@ class DenseTensor : public TensorBase, /// \return Whether the metadata is valid. bool valid() const noexcept override { return meta_.valid(); } - /// \brief Test whether the storage is allocated. - /// return Whether the storage is allocated. + /// \brief Test whether the allocation is allocated. + /// return Whether the allocation is allocated. bool initialized() const override { return holder_ && holder_->ptr(); } /// \brief Allocate memory with requested size from allocator. @@ -130,12 +130,12 @@ class DenseTensor : public TensorBase, DataType dtype, size_t requested_size = 0) override; - /// \brief Check if storage is shared with other objects. - /// \return Whether the storage is shared with other objects. + /// \brief Check if allocation is shared with other objects. + /// \return Whether the allocation is shared with other objects. bool IsSharedWith(const DenseTensor& b) const; /// \brief Change the shape information in the metadata. If the new size is - /// larger than the original value, the storage area will be reallocated. + /// larger than the original value, the allocation area will be reallocated. /// \param dims The new dims of the dense tensor. /// \param lod The new lod of the dense tensor. // void Resize(const DDim& dims); @@ -147,9 +147,10 @@ class DenseTensor : public TensorBase, /// \param lod The new lod of the dense tensor. void ResetLoD(const LoD& lod); - /// \brief Returns the actual storage size occupied by tensor, may be larger + /// \brief Returns the actual allocation size occupied by tensor, may be + /// larger /// than its shape dims. 
- /// \return The actual storage size occupied by tensor. + /// \return The actual allocation size occupied by tensor. size_t capacity() const { return holder_->size(); } /// \brief Get the const data pointer value of type T. @@ -162,7 +163,7 @@ class DenseTensor : public TensorBase, const void* data() const; private: - friend class CompatibleDenseTensorUtils; + friend class DenseTensorUtils; protected: DenseTensorMeta meta_; diff --git a/paddle/pten/core/dense_tensor_impl.cc b/paddle/pten/core/dense_tensor_impl.cc index f825d3619b9..85714373472 100644 --- a/paddle/pten/core/dense_tensor_impl.cc +++ b/paddle/pten/core/dense_tensor_impl.cc @@ -14,13 +14,12 @@ limitations under the License. */ #include "paddle/pten/core/dense_tensor.h" -// See Note [ Why still include the fluid headers? ] #include "paddle/pten/common/bfloat16.h" #include "paddle/pten/common/complex.h" #include "paddle/pten/common/float16.h" #include "paddle/pten/api/lib/utils/storage.h" -#include "paddle/pten/core/convert_utils.h" +#include "paddle/pten/core/compat/convert_utils.h" namespace pten { /* --------------------------- */ diff --git a/paddle/pten/core/device_context.cc b/paddle/pten/core/device_context.cc index d6e01c5c6e6..2a11b1bef9d 100644 --- a/paddle/pten/core/device_context.cc +++ b/paddle/pten/core/device_context.cc @@ -23,7 +23,7 @@ struct DeviceContext::Impl { Impl() = default; ~Impl() = default; - void SetDeviceAllocator(const Allocator* allocator) { + void SetAllocator(const Allocator* allocator) { PADDLE_ENFORCE_NOT_NULL( allocator, pten::errors::InvalidArgument( @@ -47,7 +47,7 @@ struct DeviceContext::Impl { zero_allocator_ = allocator; } - const Allocator& GetDeviceAllocator() const { + const Allocator& GetAllocator() const { PADDLE_ENFORCE_NOT_NULL( device_allocator_, pten::errors::InvalidArgument("Required device_allocator_ shall not be " @@ -124,7 +124,7 @@ DeviceContext::DeviceContext() { impl_ = std::make_unique(); } DeviceContext::DeviceContext(const DeviceContext& other) { impl_->SetHostAllocator(&other.GetHostAllocator()); - impl_->SetDeviceAllocator(&other.GetDeviceAllocator()); + impl_->SetAllocator(&other.GetAllocator()); impl_->SetZeroAllocator(&other.GetZeroAllocator()); } @@ -134,12 +134,12 @@ DeviceContext::DeviceContext(DeviceContext&& other) { DeviceContext::~DeviceContext() = default; -void DeviceContext::SetDeviceAllocator(const Allocator* allocator) { - impl_->SetDeviceAllocator(allocator); +void DeviceContext::SetAllocator(const Allocator* allocator) { + impl_->SetAllocator(allocator); } -const Allocator& DeviceContext::GetDeviceAllocator() const { - return impl_->GetDeviceAllocator(); +const Allocator& DeviceContext::GetAllocator() const { + return impl_->GetAllocator(); } void DeviceContext::SetHostAllocator(const Allocator* allocator) { diff --git a/paddle/pten/core/device_context.h b/paddle/pten/core/device_context.h index 30be5cd22dd..68c16dc3a19 100644 --- a/paddle/pten/core/device_context.h +++ b/paddle/pten/core/device_context.h @@ -60,7 +60,7 @@ class DeviceContext { * * @param allocator */ - void SetDeviceAllocator(const Allocator*); + void SetAllocator(const Allocator*); /** * @brief Set the host Allocator object. @@ -81,7 +81,7 @@ class DeviceContext { * * @return Allocator */ - const Allocator& GetDeviceAllocator() const; + const Allocator& GetAllocator() const; /** * @brief Get the const device-related Allocator object. @@ -114,7 +114,7 @@ class DeviceContext { // TODO(wilber): Just for the convenience of migrating the code, it will be // modified or removed later. 
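For context, a minimal usage sketch of the renamed allocator accessors on pten::DeviceContext (illustrative only, not part of the patch; `device_alloc` and `host_alloc` stand for whatever concrete pten::Allocator instances the embedding framework provides):

    #include "paddle/pten/core/device_context.h"

    // Wires caller-provided allocators into a context under the new names.
    void ConfigureContext(pten::DeviceContext* ctx,
                          const pten::Allocator* device_alloc,
                          const pten::Allocator* host_alloc) {
      ctx->SetAllocator(device_alloc);       // formerly SetDeviceAllocator
      ctx->SetHostAllocator(host_alloc);
      const pten::Allocator& a = ctx->GetAllocator();  // formerly GetDeviceAllocator
      (void)a;  // tensors are then allocated through `a` as before
    }
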
- virtual Place GetPlace() const = 0; + virtual const Place& GetPlace() const = 0; // TODO(wilber): The fluid framework uses wait() in many places, how to delete // this API interface. virtual void Wait() const {} diff --git a/paddle/pten/core/enforce.h b/paddle/pten/core/enforce.h index 97433f1a6d5..2c52d044d26 100644 --- a/paddle/pten/core/enforce.h +++ b/paddle/pten/core/enforce.h @@ -49,7 +49,7 @@ limitations under the License. */ #include "paddle/utils/string/to_string.h" // Note: these headers for simplify demangle type string -#include "paddle/pten/core/type_defs.h" +#include "paddle/pten/core/compat/type_defs.h" namespace pten { class ErrorSummary; diff --git a/paddle/pten/core/infermeta_utils.h b/paddle/pten/core/infermeta_utils.h index 47f55f85ac2..fecfab7153f 100644 --- a/paddle/pten/core/infermeta_utils.h +++ b/paddle/pten/core/infermeta_utils.h @@ -18,24 +18,14 @@ limitations under the License. */ #include #include "paddle/pten/core/enforce.h" -#include "paddle/pten/core/kernel_def.h" #include "paddle/pten/core/macros.h" #include "paddle/pten/core/meta_tensor.h" +#include "paddle/pten/core/type_defs.h" #include "paddle/utils/flat_hash_map.h" #include "paddle/utils/small_vector.h" namespace pten { -// TODO(chenweihang): add other flags if needed -struct MetaConfig { - bool is_runtime{true}; - - MetaConfig() = default; - - // supporting implicit construction is easier to use - MetaConfig(bool is_runtime) : is_runtime(is_runtime) {} // NOLINT -}; - class InferMetaContext { public: InferMetaContext() = default; diff --git a/paddle/pten/core/kernel_alias_name.h b/paddle/pten/core/kernel_alias_name.h deleted file mode 100644 index cfe3f757974..00000000000 --- a/paddle/pten/core/kernel_alias_name.h +++ /dev/null @@ -1,53 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -// TODO(yuanrisheng): this file may need to be removed -#pragma once - -namespace pten { - -// the key is kernel_name in fluid, the value is the kernel_name in pten -// the key is sorted by key's alphabet -const std::unordered_map kernel_alias_name_map = { - {"elementwise_add", "add_raw"}, - {"elementwise_add_grad", "add_grad"}, - {"elementwise_div", "divide_raw"}, - {"elementwise_mul", "muliply_raw"}, - {"elementwise_sub", "subtract_raw"}, - {"elementwise_sub_grad", "subtract_grad"}, - {"fill_any_like", "full_like"}, - {"fill_constant", "full"}, - {"flatten_contiguous_range", "flatten"}, - {"flatten_contiguous_range_grad", "flatten_grad"}, - {"matmul_v2", "matmul"}, - {"matmul_v2_grad", "matmul_grad"}, - {"matmul_v2_grad_grad", "matmul_double_grad"}, - {"matmul_v2_triple_grad", "matmul_triple_grad"}, - {"reduce_mean", "mean_raw"}, - {"reduce_sum", "sum_raw"}, - {"reshape2", "reshape"}, - {"reshape2_grad", "reshape_grad"}, - {"reshape2_grad_grad", "reshape_double_grad"}, - // fluid kernel "mean/reshape/matmul/flatten/sum" should be deprecated - {"flatten", "deprecated"}, - {"flatten_grad", "deprecated"}, - {"matmul", "deprecated"}, - {"matmul_grad", "deprecated"}, - {"matmul_grad_grad", "deprecated"}, - {"mean", "deprecated"}, - {"reshape", "deprecated"}, - {"reshape_grad", "deprecated"}, - {"sum", "deprecated"}}; - -} // namespace pten diff --git a/paddle/pten/core/kernel_context.h b/paddle/pten/core/kernel_context.h index def1019e204..876c98e3bcf 100644 --- a/paddle/pten/core/kernel_context.h +++ b/paddle/pten/core/kernel_context.h @@ -17,20 +17,16 @@ #include #include -#include "paddle/pten/core/compat_utils.h" +#include "paddle/pten/core/device_context.h" +#include "paddle/pten/core/enforce.h" #include "paddle/pten/core/tensor_base.h" +#include "paddle/pten/core/tensor_utils.h" #include "paddle/utils/any.h" #include "paddle/utils/small_vector.h" -// See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/platform/device_context.h" -#include "paddle/pten/core/enforce.h" - namespace pten { -using DeviceContext = paddle::platform::DeviceContext; -using DataType = paddle::experimental::DataType; -using DataLayout = paddle::experimental::DataLayout; +using DeviceContext = pten::DeviceContext; /** * Note: KernelContext doesn't manage the life if DeviceContext and Tensor diff --git a/paddle/pten/core/kernel_def.h b/paddle/pten/core/kernel_def.h deleted file mode 100644 index 9b91720d86f..00000000000 --- a/paddle/pten/core/kernel_def.h +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
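For illustration, a rough sketch of how entries of the deleted alias table are expressed after this patch, via the base-kernel-name registration macro and lookup (illustrative only, not part of the diff; the header path is assumed to be the compat op_utils header added by this patch, and the reshape2 mapping is taken from the removed map above):

    #include "paddle/pten/core/compat/op_utils.h"  // path assumed

    // Formerly {"reshape2", "reshape"} in kernel_alias_name_map; the macro
    // must be invoked at global namespace scope.
    PT_REGISTER_BASE_KERNEL_NAME(reshape2, reshape);

    void LookupSketch() {
      // Registered ops resolve to their base kernel name.
      auto n = pten::OpUtilsMap::Instance().GetBaseKernelName("reshape2");  // "reshape"
      // Ops listed in deprecated_op_names short-circuit to "deprecated".
      auto d = pten::OpUtilsMap::Instance().GetBaseKernelName("matmul");    // "deprecated"
      // An op with no explicit mapping falls back to its own name.
      auto s = pten::OpUtilsMap::Instance().GetBaseKernelName("softmax");   // "softmax"
      (void)n; (void)d; (void)s;
    }
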
- -#pragma once - -#include - -namespace pten { - -class Kernel; -class KernelKey; -class KernelArgsDef; -class KernelContext; -class KernelSignature; -class ArgumentMappingContext; -class InferMetaContext; - -using KernelFn = std::function; -using KernelArgsDefFn = void (*)(Kernel* kernel); -using KernelArgsParseFn = void (*)(const KernelKey& default_key, - KernelArgsDef* args_def); - -using ArgumentMappingFn = - std::function; -using InferMetaFn = void (*)(InferMetaContext* ctx); - -} // namespace pten diff --git a/paddle/pten/core/kernel_factory.cc b/paddle/pten/core/kernel_factory.cc index 06049b237d5..22899fbe84d 100644 --- a/paddle/pten/core/kernel_factory.cc +++ b/paddle/pten/core/kernel_factory.cc @@ -50,11 +50,11 @@ Kernel KernelFactory::SelectKernel(const std::string& kernel_name, return kernel_iter->second; } -paddle::flat_hash_map -KernelFactory::SelectKernelMap(const std::string& kernel_name) const { +KernelKeyMap KernelFactory::SelectKernelMap( + const std::string& kernel_name) const { auto iter = kernels_.find(kernel_name); if (iter == kernels_.end()) { - return paddle::flat_hash_map(); + return KernelKeyMap(); } return iter->second; } @@ -70,9 +70,9 @@ const Kernel& KernelFactory::SelectKernelOrThrowError( auto kernel_iter = iter->second.find(kernel_key); // TODO(chenweihang): polish refind impl here if (kernel_iter == iter->second.end() && - kernel_key.layout() != pten::DataLayout::ANY) { + kernel_key.layout() != pten::DataLayout::ALL_LAYOUT) { pten::KernelKey any_layout_kernel_key( - kernel_key.backend(), pten::DataLayout::ANY, kernel_key.dtype()); + kernel_key.backend(), pten::DataLayout::ALL_LAYOUT, kernel_key.dtype()); kernel_iter = iter->second.find(any_layout_kernel_key); } PADDLE_ENFORCE_NE( diff --git a/paddle/pten/core/kernel_factory.h b/paddle/pten/core/kernel_factory.h index b21c71f3fa1..7d64429d3f2 100644 --- a/paddle/pten/core/kernel_factory.h +++ b/paddle/pten/core/kernel_factory.h @@ -23,11 +23,9 @@ #include "paddle/pten/common/backend.h" #include "paddle/pten/common/data_type.h" #include "paddle/pten/common/layout.h" -#include "paddle/pten/core/convert_utils.h" -#include "paddle/pten/core/kernel_def.h" - -// See Note [ Why still include the fluid headers? ] +#include "paddle/pten/core/compat/convert_utils.h" #include "paddle/pten/core/enforce.h" +#include "paddle/pten/core/type_defs.h" #include "paddle/utils/flat_hash_map.h" #include "paddle/utils/small_vector.h" @@ -198,6 +196,10 @@ class Kernel { KernelArgsDef args_def_; }; +using KernelKeyMap = paddle::flat_hash_map; + +using KernelNameMap = paddle::flat_hash_map; + /** * Note: Each Computation need a basic kernel map that named by kernel_name. 
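As a usage note, a small sketch of how a caller selects a kernel under the renamed map aliases and the new ALL_LAYOUT fallback (illustrative only; assumes the `scale` kernel used as the example in the surrounding comment is registered):

    #include "paddle/pten/core/kernel_factory.h"

    void SelectScaleKernel() {
      pten::KernelKey key(pten::Backend::CPU, pten::DataLayout::NCHW,
                          pten::DataType::FLOAT32);
      // If no NCHW kernel is registered, the factory retries the lookup with
      // DataLayout::ALL_LAYOUT (the fallback added above) before throwing.
      const pten::Kernel& k =
          pten::KernelFactory::Instance().SelectKernelOrThrowError("scale", key);
      (void)k;

      // All keys registered under one kernel name are exposed as a KernelKeyMap.
      pten::KernelKeyMap keys =
          pten::KernelFactory::Instance().SelectKernelMap("scale");
      (void)keys;
    }
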
* Such as for scale op, KernelMap contains a `scale` kernel map, @@ -206,11 +208,6 @@ class Kernel { */ class KernelFactory { public: - using KernelKeyMap = - paddle::flat_hash_map; - - using KernelNameMap = paddle::flat_hash_map; - static KernelFactory& Instance(); KernelNameMap& kernels() { return kernels_; } diff --git a/paddle/pten/core/kernel_registry.h b/paddle/pten/core/kernel_registry.h index 800c01f6916..a0ff340b000 100644 --- a/paddle/pten/core/kernel_registry.h +++ b/paddle/pten/core/kernel_registry.h @@ -21,10 +21,10 @@ #include #include -#include "paddle/pten/core/kernel_def.h" #include "paddle/pten/core/kernel_factory.h" #include "paddle/pten/core/kernel_utils.h" #include "paddle/pten/core/macros.h" +#include "paddle/pten/core/type_defs.h" #include "paddle/pten/core/enforce.h" @@ -74,6 +74,9 @@ struct KernelArgsParseFunctor { std::type_index(typeid(const std::vector&))) { args_def->AppendInput( default_key.backend(), default_tensor_layout, default_key.dtype()); + } else if (arg_type == std::type_index(typeid(const SelectedRows&))) { + args_def->AppendInput( + default_key.backend(), default_tensor_layout, default_key.dtype()); } else if (arg_type == std::type_index(typeid(DenseTensor*))) { args_def->AppendOutput( default_key.backend(), default_tensor_layout, default_key.dtype()); @@ -81,6 +84,9 @@ struct KernelArgsParseFunctor { std::type_index(typeid(std::vector))) { args_def->AppendOutput( default_key.backend(), default_tensor_layout, default_key.dtype()); + } else if (arg_type == std::type_index(typeid(SelectedRows*))) { + args_def->AppendOutput( + default_key.backend(), default_tensor_layout, default_key.dtype()); } else { // Attribute deal with // TODO(chenweihang): now here allow any types of attribute, maybe @@ -130,6 +136,13 @@ struct KernelRegistrar { for (size_t dtype = static_cast(DataType::BOOL); dtype != static_cast(DataType::NUM_DATA_TYPES); dtype++) { + // NOTE(zhiqiu): why skip these types, because fluid kernel has no kernel + // of these type. + if (dtype == static_cast(DataType::UINT32) || + dtype == static_cast(DataType::UINT64) || + dtype == static_cast(DataType::UINT16)) { + continue; + } ConstructKernel(kernel_name_cstr, backend, layout, diff --git a/paddle/pten/core/kernel_utils.h b/paddle/pten/core/kernel_utils.h index d48572db5a2..d05e3c28873 100644 --- a/paddle/pten/core/kernel_utils.h +++ b/paddle/pten/core/kernel_utils.h @@ -18,13 +18,12 @@ #include "paddle/pten/common/scalar.h" #include "paddle/pten/common/scalar_array.h" #include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/core/enforce.h" #include "paddle/pten/core/kernel_context.h" -#include "paddle/pten/core/kernel_def.h" +#include "paddle/pten/core/selected_rows.h" #include "paddle/pten/core/sparse_coo_tensor.h" #include "paddle/pten/core/sparse_csr_tensor.h" - -// See Note [ Why still include the fluid headers? 
] -#include "paddle/pten/core/enforce.h" +#include "paddle/pten/core/type_defs.h" namespace pten { @@ -215,6 +214,7 @@ struct KernelImpl { PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(DenseTensor); PT_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(DenseTensor); PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(DenseTensor); + PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(SelectedRows); PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(SparseCooTensor); PT_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(SparseCooTensor); @@ -223,8 +223,6 @@ struct KernelImpl { PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(SparseCsrTensor); PT_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(SparseCsrTensor); PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(SparseCsrTensor); - // TODO(chenweihang): adapt SelectedRows - // PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(SelectedRowsTensor); /* Attribute Helpers */ @@ -236,6 +234,7 @@ struct KernelImpl { PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(paddle::platform::float16); PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const Scalar&); PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(DataType); + PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(DataLayout); PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector&); PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const ScalarArray&); PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector&); @@ -244,14 +243,13 @@ struct KernelImpl { PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(DenseTensor); PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(DenseTensor); + PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(SelectedRows); PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(SparseCooTensor); PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(SparseCooTensor); PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(SparseCsrTensor); PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(SparseCsrTensor); - // TODO(chenweihang): adapt SelectedRows - // PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(SelectedRowsTensor); /* End case */ template diff --git a/paddle/pten/core/lod_utils.cc b/paddle/pten/core/lod_utils.cc index ad5ea6d39d3..83b90c4305f 100644 --- a/paddle/pten/core/lod_utils.cc +++ b/paddle/pten/core/lod_utils.cc @@ -14,14 +14,14 @@ #include "paddle/pten/core/lod_utils.h" -#include "paddle/fluid/platform/enforce.h" +#include "paddle/pten/core/enforce.h" namespace pten { void AppendLoD(LoD *lod, const LoD &lod_length) { PADDLE_ENFORCE( lod->empty() || lod->size() == lod_length.size(), - paddle::platform::errors::InvalidArgument( + pten::errors::InvalidArgument( "The input LoD length should be equal to the appended LoD size, but " "received input LoD length is %d, actual LoD size is %d.", lod_length.size(), diff --git a/paddle/pten/core/meta_tensor.cc b/paddle/pten/core/meta_tensor.cc index a8229b568a6..d205ee1ca40 100644 --- a/paddle/pten/core/meta_tensor.cc +++ b/paddle/pten/core/meta_tensor.cc @@ -14,10 +14,9 @@ limitations under the License. 
*/ #include "paddle/pten/core/meta_tensor.h" -#include "paddle/pten/core/compat_utils.h" #include "paddle/pten/core/dense_tensor.h" - -#include "paddle/fluid/platform/enforce.h" +#include "paddle/pten/core/enforce.h" +#include "paddle/pten/core/tensor_utils.h" namespace pten { @@ -31,46 +30,42 @@ DataLayout MetaTensor::layout() const { return tensor_->layout(); } void MetaTensor::set_dims(const DDim& dims) { if (pten::DenseTensor::classof(tensor_)) { - CompatibleDenseTensorUtils::GetMutableMeta( - static_cast(tensor_)) - ->dims = dims; + DenseTensorUtils::GetMutableMeta(static_cast(tensor_))->dims = + dims; } else { - PADDLE_THROW(paddle::platform::errors::Unimplemented( + PADDLE_THROW(pten::errors::Unimplemented( "Unsupported setting dims for `%s`.", tensor_->type_info().name())); } } void MetaTensor::set_dtype(DataType dtype) { if (pten::DenseTensor::classof(tensor_)) { - CompatibleDenseTensorUtils::GetMutableMeta( - static_cast(tensor_)) + DenseTensorUtils::GetMutableMeta(static_cast(tensor_)) ->dtype = dtype; } else { - PADDLE_THROW(paddle::platform::errors::Unimplemented( + PADDLE_THROW(pten::errors::Unimplemented( "Unsupported settting dtype for `%s`.", tensor_->type_info().name())); } } void MetaTensor::set_layout(DataLayout layout) { if (pten::DenseTensor::classof(tensor_)) { - CompatibleDenseTensorUtils::GetMutableMeta( - static_cast(tensor_)) + DenseTensorUtils::GetMutableMeta(static_cast(tensor_)) ->layout = layout; } else { - PADDLE_THROW(paddle::platform::errors::Unimplemented( + PADDLE_THROW(pten::errors::Unimplemented( "Unsupported settting layout for `%s`.", tensor_->type_info().name())); } } void MetaTensor::share_lod(const MetaTensor& meta_tensor) { if (pten::DenseTensor::classof(tensor_)) { - CompatibleDenseTensorUtils::GetMutableMeta( - static_cast(tensor_)) - ->lod = meta_tensor.lod(); + DenseTensorUtils::GetMutableMeta(static_cast(tensor_))->lod = + meta_tensor.lod(); } else { - PADDLE_THROW(paddle::platform::errors::Unimplemented( - "Unsupported share lod inplace for `%s`.", - tensor_->type_info().name())); + PADDLE_THROW( + pten::errors::Unimplemented("Unsupported sharing lod inplace for `%s`.", + tensor_->type_info().name())); } } @@ -78,8 +73,20 @@ const LoD& MetaTensor::lod() const { if (pten::DenseTensor::classof(tensor_)) { return static_cast(tensor_)->lod(); } else { - PADDLE_THROW(paddle::platform::errors::Unimplemented( - "Unsupported setting dims for `%s`.", tensor_->type_info().name())); + PADDLE_THROW(pten::errors::Unimplemented("Unsupported getting lod of `%s`.", + tensor_->type_info().name())); + } +} + +void MetaTensor::share_meta(const MetaTensor& meta_tensor) { + if (pten::DenseTensor::classof(tensor_)) { + set_dims(meta_tensor.dims()); + set_dtype(meta_tensor.dtype()); + set_layout(meta_tensor.layout()); + share_lod(meta_tensor); + } else { + PADDLE_THROW(pten::errors::Unimplemented( + "Unsupported sharing meta for `%s`.", tensor_->type_info().name())); } } diff --git a/paddle/pten/core/meta_tensor.h b/paddle/pten/core/meta_tensor.h index 1435e1c3912..6ccb698fe18 100644 --- a/paddle/pten/core/meta_tensor.h +++ b/paddle/pten/core/meta_tensor.h @@ -16,20 +16,33 @@ limitations under the License. */ #include "paddle/pten/common/data_type.h" #include "paddle/pten/common/layout.h" +#include "paddle/pten/core/ddim.h" #include "paddle/pten/core/macros.h" #include "paddle/pten/core/tensor_base.h" #include "paddle/pten/core/tensor_meta.h" -// See Note [ Why still include the fluid headers? 
] -#include "paddle/fluid/framework/ddim.h" - namespace pten { +// TODO(chenweihang): add other flags if needed +struct MetaConfig { + bool is_runtime{true}; + + MetaConfig() = default; + + // supporting implicit construction is easier to use + MetaConfig(bool is_runtime) : is_runtime(is_runtime) {} // NOLINT +}; + class MetaTensor { public: - explicit MetaTensor(TensorBase* tensor) : tensor_(tensor) {} - MetaTensor() = default; + + // supporting implicit construction is easier to use + MetaTensor(TensorBase* tensor) : tensor_(tensor) {} // NOLINT + MetaTensor(const TensorBase& tensor) // NOLINT + : tensor_(const_cast(&tensor)) {} + MetaTensor(TensorBase& tensor) : tensor_(&tensor) {} // NOLINT + MetaTensor(const MetaTensor&) = default; MetaTensor(MetaTensor&&) = default; MetaTensor& operator=(const MetaTensor&) = delete; @@ -44,7 +57,9 @@ class MetaTensor { virtual void set_dims(const DDim& dims); virtual void set_dtype(DataType dtype); virtual void set_layout(DataLayout layout); + virtual void share_lod(const MetaTensor& meta_tensor); + virtual void share_meta(const MetaTensor& meta_tensor); private: // Because the lod in compiletime and runtime is different, diff --git a/paddle/pten/core/sparse_csr_tensor.cc b/paddle/pten/core/sparse_csr_tensor.cc index 4376c05958e..de12d53fdee 100644 --- a/paddle/pten/core/sparse_csr_tensor.cc +++ b/paddle/pten/core/sparse_csr_tensor.cc @@ -26,10 +26,6 @@ inline void check_shape(const DDim& dims) { #define Check(non_zero_crows, non_zero_cols, non_zero_elements, dims) \ { \ check_shape(dims); \ - PADDLE_ENFORCE_EQ(dims.size(), \ - 2, \ - paddle::platform::errors::InvalidArgument( \ - "the SparseCsrTensor only support 2-D Tensor.")); \ PADDLE_ENFORCE_EQ( \ non_zero_cols.place(), \ non_zero_crows.place(), \ @@ -50,7 +46,12 @@ SparseCsrTensor::SparseCsrTensor(const DenseTensor& non_zero_crows, non_zero_cols_(non_zero_cols), non_zero_elements_(non_zero_elements), dims_(dims) { - Check(non_zero_crows_, non_zero_cols_, non_zero_elements_, dims_); + if (non_zero_crows.initialized()) { + Check(non_zero_crows_, non_zero_cols_, non_zero_elements_, dims_); + } else { + // create a empty tensor + check_shape(dims); + } } SparseCsrTensor::SparseCsrTensor(const SparseCsrTensor& other) diff --git a/paddle/pten/core/storage.h b/paddle/pten/core/storage.h index fc8b5dfaab7..6e7c47c2055 100644 --- a/paddle/pten/core/storage.h +++ b/paddle/pten/core/storage.h @@ -17,14 +17,12 @@ limitations under the License. */ #include #include "boost/intrusive_ptr.hpp" +#include "paddle/pten/common/place.h" +#include "paddle/pten/core/allocator.h" #include "paddle/pten/core/utils/intrusive_ptr.h" #include "paddle/pten/core/utils/intrusive_ref_counter.h" #include "paddle/pten/core/utils/type_info.h" -#include "paddle/fluid/memory/memory.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/pten/core/allocator.h" - namespace pten { /// \brief The interface of contiguous storage used for the dense tensor. @@ -32,7 +30,6 @@ namespace pten { /// all default copy operations to ensure the integrity of the package. 
class Storage : public intrusive_ref_counter { public: - using Place = paddle::platform::Place; Storage() = default; Storage(const Storage&) = delete; @@ -43,11 +40,11 @@ class Storage : public intrusive_ref_counter { /* --------- shared_ptr -------- */ // Initialize a Storage with unique Allocation - explicit Storage(std::shared_ptr&& data) + explicit Storage(std::shared_ptr&& data) : data_(std::move(data)) {} // Initialize a Storage shareing Allocation with another storage - explicit Storage(const std::shared_ptr& data) + explicit Storage(const std::shared_ptr& data) : data_(data) {} void* data() const { @@ -56,17 +53,15 @@ class Storage : public intrusive_ref_counter { : nullptr; } - const std::shared_ptr& data_shared() const { - return data_; - } + const std::shared_ptr& data_shared() const { return data_; } virtual void set_data_shared( - const std::shared_ptr& holder) = 0; + const std::shared_ptr& holder) = 0; - virtual std::shared_ptr&& move_data_shared() = 0; + virtual std::shared_ptr&& move_data_shared() = 0; virtual void ReallocShared(size_t n) { - PADDLE_THROW(paddle::platform::errors::Unimplemented( + PADDLE_THROW(pten::errors::Unimplemented( "ReallocShared has not been overrided by the current Storage")); } /* --------- shared_ptr -------- */ @@ -81,13 +76,11 @@ class Storage : public intrusive_ref_counter { virtual void Realloc(size_t n) = 0; protected: - std::shared_ptr data_; + std::shared_ptr data_; }; class TensorStorage : public Storage { public: - using Place = paddle::platform::Place; - explicit TensorStorage(Allocator* a) : alloc_(a) {} TensorStorage(Allocator* a, size_t size) @@ -110,7 +103,7 @@ class TensorStorage : public Storage { const Place& place() const override { if (!data_) { - PADDLE_THROW(paddle::platform::errors::Unimplemented( + PADDLE_THROW(pten::errors::Unimplemented( "Unable to visit place: either data_ or alloc_ has to be initialized " "first.")); } @@ -120,13 +113,13 @@ class TensorStorage : public Storage { bool OwnsMemory() const noexcept override { return true; } void set_data_shared( - const std::shared_ptr& holder) override { + const std::shared_ptr& holder) override { CHECK(holder); data_ = holder; size_ = holder->size(); } - std::shared_ptr&& move_data_shared() override { + std::shared_ptr&& move_data_shared() override { size_ = 0; return std::move(data_); } diff --git a/paddle/pten/core/tensor_base.h b/paddle/pten/core/tensor_base.h index 7a5e42da490..5c0f74df492 100644 --- a/paddle/pten/core/tensor_base.h +++ b/paddle/pten/core/tensor_base.h @@ -14,23 +14,19 @@ limitations under the License. */ #pragma once -#include "paddle/fluid/platform/place.h" #include "paddle/pten/common/backend.h" #include "paddle/pten/common/data_type.h" #include "paddle/pten/common/layout.h" +#include "paddle/pten/common/place.h" #include "paddle/pten/core/allocator.h" #include "paddle/pten/core/ddim.h" -#include "paddle/pten/core/storage.h" #include "paddle/pten/core/utils/type_registry.h" namespace pten { class TensorBase { public: - using DataType = paddle::experimental::DataType; - using DataLayout = paddle::experimental::DataLayout; using DDim = pten::framework::DDim; - using Place = paddle::platform::Place; virtual ~TensorBase() = default; diff --git a/paddle/pten/core/tensor_meta.h b/paddle/pten/core/tensor_meta.h index ac3f17267c4..60b2be3c175 100644 --- a/paddle/pten/core/tensor_meta.h +++ b/paddle/pten/core/tensor_meta.h @@ -19,8 +19,6 @@ limitations under the License. 
*/ #include "paddle/pten/common/backend.h" #include "paddle/pten/common/data_type.h" #include "paddle/pten/common/layout.h" - -// See Note [ Why still include the fluid headers? ] #include "paddle/pten/core/ddim.h" // Note: mixed_vector include many header now, LoD will be diff --git a/paddle/pten/core/compat_utils.h b/paddle/pten/core/tensor_utils.h similarity index 81% rename from paddle/pten/core/compat_utils.h rename to paddle/pten/core/tensor_utils.h index 46e53e3997c..00dcdc65b84 100644 --- a/paddle/pten/core/compat_utils.h +++ b/paddle/pten/core/tensor_utils.h @@ -14,33 +14,17 @@ limitations under the License. */ #pragma once -#include "paddle/pten/api/lib/utils/storage.h" #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/core/storage.h" #include "paddle/pten/core/tensor_meta.h" namespace pten { -/** - * In order to meet some adaptation requirements of the compatible state, - * these class is added to provide some tool functions. - * - * These utility functions may be deleted in the future, It is not recommended - * to be widely used in the framework - */ - -class CompatibleDenseTensorUtils { +class DenseTensorUtils { public: static DenseTensorMeta* GetMutableMeta(DenseTensor* tensor) { return &(tensor->meta_); } - // only can deal with SharedStorage now - static void ClearStorage(DenseTensor* tensor) { - // use static_cast to improve performance, replace by dynamic_cast later - tensor->MoveMemoryHolder(); - } - static DenseTensor Slice(const DenseTensor& tensor, int64_t begin_idx, int64_t end_idx) { diff --git a/paddle/pten/core/type_defs.h b/paddle/pten/core/type_defs.h index eb5459b1b6e..9b91720d86f 100644 --- a/paddle/pten/core/type_defs.h +++ b/paddle/pten/core/type_defs.h @@ -1,96 +1,38 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
#pragma once -#include -#include -#include -#include -#include -#include - -#include - -namespace egr { -class EagerTensor; -} -namespace paddle { -namespace framework { -// The order should be as same as framework.proto -// NOTE(xiongkun): we extract from framework/typedef.h to ensure we can transfer -// enforce.h -class BlockDesc; -using Attribute = boost::variant, - std::vector, - std::vector, - bool, - std::vector, - BlockDesc*, - int64_t, - std::vector, - std::vector, - std::vector>; -using AttributeMap = std::unordered_map; -} // namespace framework - -namespace imperative { - -class VariableWrapper; -class SavedVariableWrapperList; -class VarBase; -class OpBase; -class GradOpNode; -class Tracer; - -using WeakNameVarBaseMap = - std::map>>; - -namespace details { -template -struct NameVarMapTrait {}; - -template <> -struct NameVarMapTrait { - using Type = std::map>>; -}; - -template <> -struct NameVarMapTrait { - using Type = std::map; -}; - -template <> -struct NameVarMapTrait { - using Type = - std::map>>; -}; +#include -} // namespace details +namespace pten { -template -using NameVarMap = typename details::NameVarMapTrait::Type; +class Kernel; +class KernelKey; +class KernelArgsDef; +class KernelContext; +class KernelSignature; +class ArgumentMappingContext; +class InferMetaContext; -using NameVarBaseMap = NameVarMap; -using NameVariableWrapperMap = NameVarMap; -using NameTensorMap = NameVarMap; +using KernelFn = std::function; +using KernelArgsDefFn = void (*)(Kernel* kernel); +using KernelArgsParseFn = void (*)(const KernelKey& default_key, + KernelArgsDef* args_def); -using VariableWrapperList = std::vector>; +using ArgumentMappingFn = + std::function; +using InferMetaFn = void (*)(InferMetaContext* ctx); -} // namespace imperative -} // namespace paddle +} // namespace pten diff --git a/paddle/pten/core/array.h b/paddle/pten/core/utils/array.h similarity index 88% rename from paddle/pten/core/array.h rename to paddle/pten/core/utils/array.h index 86d222d2d57..cd43dc7b420 100644 --- a/paddle/pten/core/array.h +++ b/paddle/pten/core/utils/array.h @@ -15,12 +15,11 @@ #pragma once #include -#include "paddle/pten/core/unroll_array_ops.h" -// TODO(paddle-dev): Need to modify into pten/core/enforce.h -#include "paddle/fluid/platform/enforce.h" + +#include "paddle/pten/core/enforce.h" +#include "paddle/pten/core/utils/unroll_array_ops.h" namespace pten { -namespace platform = paddle::platform; namespace framework { template @@ -58,7 +57,7 @@ class Array { HOSTDEVICE inline T &at(size_t i) { #if !defined(__CUDA_ARCH__) && !defined(__HIPCC__) PADDLE_ENFORCE_LT( - i, N, platform::errors::OutOfRange("Array index out of bounds.")); + i, N, pten::errors::OutOfRange("Array index out of bounds.")); #endif return (*this)[i]; } @@ -66,7 +65,7 @@ class Array { HOSTDEVICE inline const T &at(size_t i) const { #if !defined(__CUDA_ARCH__) && !defined(__HIPCC__) PADDLE_ENFORCE_LT( - i, N, platform::errors::OutOfRange("Array index out of bounds.")); + i, N, pten::errors::OutOfRange("Array index out of bounds.")); #endif return (*this)[i]; } @@ -114,7 +113,7 @@ class Array { static T obj(); return obj; #else - PADDLE_THROW(platform::errors::Unavailable("Array has no element.")); + PADDLE_THROW(pten::errors::Unavailable("Array has no element.")); #endif } @@ -128,7 +127,7 @@ class Array { static const T obj(); return obj; #else - PADDLE_THROW(platform::errors::Unavailable("Array has no element.")); + PADDLE_THROW(pten::errors::Unavailable("Array has no element.")); #endif } diff --git 
a/paddle/pten/core/dim.h b/paddle/pten/core/utils/dim.h similarity index 98% rename from paddle/pten/core/dim.h rename to paddle/pten/core/utils/dim.h index 8dd984891a8..a24d6e40dfc 100644 --- a/paddle/pten/core/dim.h +++ b/paddle/pten/core/utils/dim.h @@ -20,8 +20,8 @@ #include #include -#include "paddle/pten/core/array.h" #include "paddle/pten/core/hostdevice.h" +#include "paddle/pten/core/utils/array.h" namespace pten { namespace framework { diff --git a/paddle/pten/core/utils/rw_lock.h b/paddle/pten/core/utils/rw_lock.h index 7bd190c901b..6a2429ef30d 100644 --- a/paddle/pten/core/utils/rw_lock.h +++ b/paddle/pten/core/utils/rw_lock.h @@ -20,8 +20,7 @@ limitations under the License. */ #include // NOLINT #endif // !_WIN32 -// See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/platform/enforce.h" +#include "paddle/pten/core/enforce.h" namespace pten { @@ -32,24 +31,23 @@ struct RWLock { ~RWLock() { pthread_rwlock_destroy(&lock_); } inline void RDLock() { - PADDLE_ENFORCE_EQ(pthread_rwlock_rdlock(&lock_), - 0, - paddle::platform::errors::External( - "The pthread failed to acquire read lock.")); + PADDLE_ENFORCE_EQ( + pthread_rwlock_rdlock(&lock_), + 0, + pten::errors::External("The pthread failed to acquire read lock.")); } inline void WRLock() { - PADDLE_ENFORCE_EQ(pthread_rwlock_wrlock(&lock_), - 0, - paddle::platform::errors::External( - "The pthread failed to acquire write lock.")); + PADDLE_ENFORCE_EQ( + pthread_rwlock_wrlock(&lock_), + 0, + pten::errors::External("The pthread failed to acquire write lock.")); } inline void UNLock() { - PADDLE_ENFORCE_EQ( - pthread_rwlock_unlock(&lock_), - 0, - paddle::platform::errors::External("The pthread failed to unlock.")); + PADDLE_ENFORCE_EQ(pthread_rwlock_unlock(&lock_), + 0, + pten::errors::External("The pthread failed to unlock.")); } private: diff --git a/paddle/pten/core/unroll_array_ops.h b/paddle/pten/core/utils/unroll_array_ops.h similarity index 100% rename from paddle/pten/core/unroll_array_ops.h rename to paddle/pten/core/utils/unroll_array_ops.h diff --git a/paddle/pten/infermeta/CMakeLists.txt b/paddle/pten/infermeta/CMakeLists.txt index 2216d38708b..c077e7b4c55 100644 --- a/paddle/pten/infermeta/CMakeLists.txt +++ b/paddle/pten/infermeta/CMakeLists.txt @@ -1,2 +1,2 @@ -cc_library(infermeta SRCS nullary.cc unary.cc binary.cc multiary.cc DEPS convert_utils infermeta_utils) -cc_library(backward_infermeta SRCS backward.cc DEPS convert_utils) +cc_library(infermeta SRCS nullary.cc unary.cc binary.cc multiary.cc DEPS convert_utils meta_tensor infermeta_utils) +cc_library(backward_infermeta SRCS backward.cc DEPS meta_tensor convert_utils) diff --git a/paddle/pten/infermeta/backward.cc b/paddle/pten/infermeta/backward.cc index 5a66e8cd2ec..b7bb17bdd1c 100644 --- a/paddle/pten/infermeta/backward.cc +++ b/paddle/pten/infermeta/backward.cc @@ -16,13 +16,15 @@ limitations under the License. 
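For completeness, a tiny reader/writer sketch against the relocated rw_lock.h above (illustrative only; lock failures now report through pten::errors::External as shown in the hunk):

    #include "paddle/pten/core/utils/rw_lock.h"

    static pten::RWLock g_lock;
    static int g_value = 0;

    int ReadValue() {
      g_lock.RDLock();   // shared lock
      int v = g_value;
      g_lock.UNLock();
      return v;
    }

    void WriteValue(int v) {
      g_lock.WRLock();   // exclusive lock
      g_value = v;
      g_lock.UNLock();
    }
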
*/ namespace pten { -std::tuple MatmulGradInferMeta( - const DenseTensorMeta& x_meta, - const DenseTensorMeta& y_meta, - const DenseTensorMeta& out_grad_meta, - bool transpose_x, - bool transpose_y) { - return std::make_tuple(x_meta, y_meta); +void MatmulGradInferMeta(const MetaTensor& x, + const MetaTensor& y, + const MetaTensor& out_grad_meta, + bool transpose_x, + bool transpose_y, + MetaTensor* dx, + MetaTensor* dy) { + dx->share_meta(x); + dy->share_meta(y); } } // namespace pten diff --git a/paddle/pten/infermeta/backward.h b/paddle/pten/infermeta/backward.h index 03bdb3a962a..d6b96861412 100644 --- a/paddle/pten/infermeta/backward.h +++ b/paddle/pten/infermeta/backward.h @@ -15,15 +15,17 @@ limitations under the License. */ #pragma once #include -#include "paddle/pten/core/tensor_meta.h" + +#include "paddle/pten/core/meta_tensor.h" namespace pten { -std::tuple MatmulGradInferMeta( - const DenseTensorMeta& x_meta, - const DenseTensorMeta& y_meta, - const DenseTensorMeta& out_grad_meta, - bool transpose_x, - bool transpose_y); +void MatmulGradInferMeta(const MetaTensor& x, + const MetaTensor& y, + const MetaTensor& out_grad_meta, + bool transpose_x, + bool transpose_y, + MetaTensor* dx, + MetaTensor* dy); } // namespace pten diff --git a/paddle/pten/infermeta/binary.cc b/paddle/pten/infermeta/binary.cc index cb605db78d9..02d78b5caa7 100644 --- a/paddle/pten/infermeta/binary.cc +++ b/paddle/pten/infermeta/binary.cc @@ -12,15 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -// See Note [ Why still include the fluid headers? ] #include "paddle/pten/infermeta/binary.h" #include "paddle/pten/kernels/funcs/common_shape.h" namespace pten { -DenseTensorMeta DotInferMeta(const DenseTensorMeta& x_meta, - const DenseTensorMeta& y_meta) { - auto x_dims = x_meta.dims; +void DotInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) { + auto x_dims = x.dims(); auto x_rank = static_cast(x_dims.size()); PADDLE_ENFORCE_EQ(true, 1 == x_rank || 2 == x_rank, @@ -29,10 +27,10 @@ DenseTensorMeta DotInferMeta(const DenseTensorMeta& x_meta, "should be 1 or 2", x_dims.to_str())); - auto y_dims = y_meta.dims; + auto y_dims = y.dims(); PADDLE_ENFORCE_EQ( true, - x_rank == (size_t)y_dims.size(), + x_rank == static_cast(y_dims.size()), paddle::platform::errors::PreconditionNotMet( "ShapeError: The shape of input tensor Y: %s should match with " "input tenosr X: %s", @@ -56,25 +54,27 @@ DenseTensorMeta DotInferMeta(const DenseTensorMeta& x_meta, y_dims.to_str())); x_dims[x_dims.size() - 1] = 1; - DenseTensorMeta return_meta(x_meta.dtype, x_dims, x_meta.layout); - return return_meta; + out->set_dims(x_dims); + out->set_dtype(x.dtype()); + out->set_layout(x.layout()); } -DenseTensorMeta MatmulInferMeta(const DenseTensorMeta& x_meta, - const DenseTensorMeta& y_meta, - bool trans_x, - bool trans_y) { - std::vector dims_x = pten::framework::vectorize(x_meta.dims); - std::vector dims_y = pten::framework::vectorize(y_meta.dims); +void MatmulInferMeta(const MetaTensor& x, + const MetaTensor& y, + bool trans_x, + bool trans_y, + MetaTensor* out) { + std::vector dims_x = pten::framework::vectorize(x.dims()); + std::vector dims_y = pten::framework::vectorize(y.dims()); auto ndims_x = dims_x.size(); auto ndims_y = dims_y.size(); PADDLE_ENFORCE_GT(ndims_x, - 0, + 0UL, paddle::platform::errors::InvalidArgument( "The Input(x) dims size must be greater than 0," " but reviced dims 
size is 0. ")); PADDLE_ENFORCE_GT(ndims_y, - 0, + 0UL, paddle::platform::errors::InvalidArgument( "The Input(y) dims size must be greater than 0," " but reviced dims size is 0. ")); @@ -127,21 +127,24 @@ DenseTensorMeta MatmulInferMeta(const DenseTensorMeta& x_meta, auto ddim_out = pten::framework::make_ddim(new_dims); - return {x_meta.dtype, ddim_out, x_meta.layout}; + out->set_dims(ddim_out); + out->set_dtype(x.dtype()); + out->set_layout(x.layout()); } -DenseTensorMeta ElementwiseInferMeta(const DenseTensorMeta& x_meta, - const DenseTensorMeta& y_meta) { - return ElementwiseRawInferMeta(x_meta, y_meta, -1); +void ElementwiseInferMeta(const MetaTensor& x, + const MetaTensor& y, + MetaTensor* out) { + return ElementwiseRawInferMeta(x, y, -1, std::move(out)); } -DenseTensorMeta ElementwiseRawInferMeta(const DenseTensorMeta& x_meta, - const DenseTensorMeta& y_meta, - int axis) { - DenseTensorMeta return_meta(x_meta.dtype, x_meta.dims, x_meta.layout); - if (x_meta.dims != y_meta.dims) { - auto x_dims = x_meta.dims; - auto y_dims = y_meta.dims; +void ElementwiseRawInferMeta(const MetaTensor& x, + const MetaTensor& y, + int axis, + MetaTensor* out) { + if (x.dims() != y.dims()) { + auto x_dims = x.dims(); + auto y_dims = y.dims(); int max_dim = std::max(x_dims.size(), y_dims.size()); if (x_dims.size() == y_dims.size()) { PADDLE_ENFORCE_EQ((axis == -1) || (axis == 0), @@ -174,10 +177,15 @@ DenseTensorMeta ElementwiseRawInferMeta(const DenseTensorMeta& x_meta, out_dims_array.data(), max_dim, axis); - return_meta.dims = pten::framework::make_ddim(out_dims_array); + auto out_dims = pten::framework::make_ddim(out_dims_array); + out->set_dims(out_dims); + } else { + out->set_dims(x.dims()); } - return_meta.lod = x_meta.lod; - return return_meta; + + out->set_dtype(x.dtype()); + out->set_layout(x.layout()); + out->share_lod(x); } } // namespace pten diff --git a/paddle/pten/infermeta/binary.h b/paddle/pten/infermeta/binary.h index 658211e48ac..99208246496 100644 --- a/paddle/pten/infermeta/binary.h +++ b/paddle/pten/infermeta/binary.h @@ -14,38 +14,35 @@ limitations under the License. */ #pragma once -// See Note [ Why still include the fluid headers? ] -#include "paddle/pten/core/tensor_meta.h" +#include "paddle/pten/core/meta_tensor.h" namespace pten { // Common InferMeta Functions for binary operators, The format like: // -// 1. DenseTensorMeta [OpName]InferMeta(const DenseTensorMeta& x_meta, ...) -// {} -// 2. std::pair [OpName]InferMeta(const -// DenseTensorMeta& -// x_meta, ...) {} -// 3. std::tuple -// [OpName]InferMeta(const -// DenseTensorMeta& x_meta, ...) -// NOTE: The name "InferMeta" may be not appropriate. "InferMeta" may be good. -// Because functions in this file -// not only can infer shape, but alse need infer lod or other useful data. - -DenseTensorMeta DotInferMeta(const DenseTensorMeta& x_meta, - const DenseTensorMeta& y_meta); - -DenseTensorMeta MatmulInferMeta(const DenseTensorMeta& x_meta, - const DenseTensorMeta& y_meta, - bool trans_x, - bool trans_y); - -DenseTensorMeta ElementwiseInferMeta(const DenseTensorMeta& x_meta, - const DenseTensorMeta& y_meta); - -DenseTensorMeta ElementwiseRawInferMeta(const DenseTensorMeta& x_meta, - const DenseTensorMeta& y_meta, - int axis); - +// 1. void [FunctionDesc|OpName]InferMeta(const MetaTensor& x, +// const MetaTensor& y, +// ..., +// MetaTensor* out) {} +// +// NOTE: The name "InferShape" may be not appropriate. "InferMeta" may be good. 
+// Because functions in this file not only can infer shape, but also need +// infer lod or other useful data. + +void DotInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out); + +void MatmulInferMeta(const MetaTensor& x, + const MetaTensor& y, + bool trans_x, + bool trans_y, + MetaTensor* out); + +void ElementwiseInferMeta(const MetaTensor& x, + const MetaTensor& y, + MetaTensor* out); + +void ElementwiseRawInferMeta(const MetaTensor& x_meta, + const MetaTensor& y_meta, + int axis, + MetaTensor* out); } // namespace pten diff --git a/paddle/pten/infermeta/multiary.cc b/paddle/pten/infermeta/multiary.cc index ecd0396a286..869e87df5d8 100644 --- a/paddle/pten/infermeta/multiary.cc +++ b/paddle/pten/infermeta/multiary.cc @@ -18,18 +18,19 @@ limitations under the License. */ #include "paddle/pten/kernels/funcs/concat_funcs.h" namespace pten { -DenseTensorMeta ConcatInferMeta(const std::vector& x_meta, - const Scalar& axis_scalar, - bool is_runtime) { - PADDLE_ENFORCE_GE(x_meta.size(), - 0, +void ConcatInferMeta(const std::vector& x, + const Scalar& axis_scalar, + MetaTensor* out, + MetaConfig config) { + PADDLE_ENFORCE_GE(x.size(), + 0UL, paddle::platform::errors::InvalidArgument( "The size of input meta vector should be greater" "than 0.")); int axis = axis_scalar.to(); // 1. calculate axis - int rank = x_meta[0].dims.size(); + int rank = x.at(0).dims().size(); PADDLE_ENFORCE_EQ( axis >= -rank && axis < rank, true, @@ -44,13 +45,15 @@ DenseTensorMeta ConcatInferMeta(const std::vector& x_meta, // 2. calculate out dims std::vector x_dims; - for (auto meta : x_meta) { - x_dims.push_back(meta.dims); + for (auto& x_t : x) { + x_dims.push_back(x_t.dims()); } pten::DDim out_dim = - pten::funcs::ComputeAndCheckShape(is_runtime, x_dims, axis); + pten::funcs::ComputeAndCheckShape(config.is_runtime, x_dims, axis); - return {x_meta[0].dtype, out_dim, x_meta[0].layout}; + out->set_dims(out_dim); + out->set_dtype(x.at(0).dtype()); + out->set_layout(x.at(0).layout()); } } // namespace pten diff --git a/paddle/pten/infermeta/multiary.h b/paddle/pten/infermeta/multiary.h index f8d5468e50d..c251699da4a 100644 --- a/paddle/pten/infermeta/multiary.h +++ b/paddle/pten/infermeta/multiary.h @@ -15,12 +15,12 @@ limitations under the License. */ #pragma once #include "paddle/pten/common/scalar.h" -#include "paddle/pten/core/tensor_meta.h" +#include "paddle/pten/core/meta_tensor.h" namespace pten { -// TODO(chentianyu03) use std::vector as InferMeta inputs -DenseTensorMeta ConcatInferMeta(const std::vector& x_meta, - const Scalar& axis_scalar, - bool is_runtime); +void ConcatInferMeta(const std::vector& x, + const Scalar& axis_scalar, + MetaTensor* out, + MetaConfig config = MetaConfig()); } // namespace pten diff --git a/paddle/pten/infermeta/nullary.cc b/paddle/pten/infermeta/nullary.cc index 19e11f049fe..fd9b2a8f717 100644 --- a/paddle/pten/infermeta/nullary.cc +++ b/paddle/pten/infermeta/nullary.cc @@ -12,23 +12,25 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -// See Note [ Why still include the fluid headers? 
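Following the signature convention described in the comment above, a minimal sketch of how one of these functions is now driven from caller code (illustrative only; the MetaTensor is just a non-owning view over the output tensor):

    #include "paddle/pten/core/dense_tensor.h"
    #include "paddle/pten/infermeta/binary.h"

    void InferMatmulOutMeta(const pten::DenseTensor& x,
                            const pten::DenseTensor& y,
                            pten::DenseTensor* out) {
      pten::MetaTensor meta_out(out);
      // Writes the result dims/dtype/layout directly into out's metadata,
      // instead of returning a DenseTensorMeta as the old interface did.
      pten::MatmulInferMeta(x, y, /*trans_x=*/false, /*trans_y=*/false, &meta_out);
    }
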
] #include "paddle/pten/infermeta/nullary.h" namespace pten { -DenseTensorMeta CreateInferMeta(const std::vector& shape, - DataType dtype, - DataLayout layout) { - const auto& out_dims = pten::framework::make_ddim(shape); - return {dtype, out_dims, layout}; +void CreateInferMeta(const std::vector& shape, + DataType dtype, + DataLayout layout, + MetaTensor* out) { + auto out_dims = pten::framework::make_ddim(shape); + out->set_dims(out_dims); + out->set_dtype(dtype); + out->set_layout(layout); } -DenseTensorMeta CreateInferMeta(const ScalarArray& shape, - DataType dtype, - DataLayout layout) { - const auto& out_dims = pten::framework::make_ddim(shape.GetData()); - return {dtype, out_dims, layout}; +void CreateInferMeta(const ScalarArray& shape, + DataType dtype, + DataLayout layout, + MetaTensor* out) { + CreateInferMeta(shape.GetData(), dtype, layout, out); } } // namespace pten diff --git a/paddle/pten/infermeta/nullary.h b/paddle/pten/infermeta/nullary.h index 721a39bb3ac..f0b6aad26be 100644 --- a/paddle/pten/infermeta/nullary.h +++ b/paddle/pten/infermeta/nullary.h @@ -15,24 +15,27 @@ limitations under the License. */ #pragma once #include "paddle/pten/common/scalar_array.h" -#include "paddle/pten/core/tensor_meta.h" +#include "paddle/pten/core/meta_tensor.h" namespace pten { // Common InferMeta Functions for 0-nary operators(no input tensor), The format // like: // -// 1. DenseTensorMeta [OpName]InferMeta( ...) -// NOTE: The name "InferMeta" may be not appropriate. "InferMeta" may be good. -// Because functions in this file -// not only can infer shape, but alse need infer lod or other useful data. - -DenseTensorMeta CreateInferMeta(const std::vector& shape, - DataType dtype, - DataLayout layout); - -DenseTensorMeta CreateInferMeta(const ScalarArray& shape, - DataType dtype, - DataLayout layout); +// 1. void [FunctionDesc|OpName]InferMeta(..., MetaTensor* out) +// +// NOTE: The name "InferShape" may be not appropriate. "InferMeta" may be good. +// Because functions in this file not only can infer shape, but also need +// infer lod or other useful data. + +void CreateInferMeta(const std::vector& shape, + DataType dtype, + DataLayout layout, + MetaTensor* out); + +void CreateInferMeta(const ScalarArray& shape, + DataType dtype, + DataLayout layout, + MetaTensor* out); } // namespace pten diff --git a/paddle/pten/infermeta/unary.cc b/paddle/pten/infermeta/unary.cc index 3f6b559f560..ae1461fe8e7 100644 --- a/paddle/pten/infermeta/unary.cc +++ b/paddle/pten/infermeta/unary.cc @@ -16,31 +16,20 @@ limitations under the License. 
*/ #include +#include "paddle/pten/common/data_type.h" #include "paddle/pten/core/infermeta_utils.h" namespace pten { -void UnchangedInferMetaNew(MetaConfig config, - const MetaTensor& x, - MetaTensor* out) { - out->set_dims(x.dims()); - out->share_lod(x); -} - -DenseTensorMeta UnchangedInferMeta(const DenseTensorMeta& x_meta) { - return x_meta; +void UnchangedInferMeta(const MetaTensor& x, MetaTensor* out) { + out->share_meta(x); } -DenseTensorMeta ReductionInferMeta(const DenseTensorMeta& x_meta) { - const auto& out_dims = pten::framework::make_ddim({1}); - DenseTensorMeta return_meta(x_meta.dtype, out_dims, x_meta.layout); - return return_meta; -} - -DenseTensorMeta FlattenInferMeta(const DenseTensorMeta& x_meta, - int start_axis, - int stop_axis) { - auto& x_dims = x_meta.dims; +void FlattenInferMeta(const MetaTensor& x, + int start_axis, + int stop_axis, + MetaTensor* out) { + auto x_dims = x.dims(); int in_dims_size = x_dims.size(); if (start_axis < 0) { start_axis = start_axis + in_dims_size; @@ -73,29 +62,30 @@ DenseTensorMeta FlattenInferMeta(const DenseTensorMeta& x_meta, out_shape.push_back(x_dims[i]); } const auto& out_dims = pten::framework::make_ddim(out_shape); - DenseTensorMeta return_meta(x_meta.dtype, out_dims, x_meta.layout); + out->set_dims(out_dims); + out->set_dtype(x.dtype()); + out->set_layout(x.layout()); - if (x_dims[0] == return_meta.dims[0]) { + if (x_dims[0] == out_dims[0]) { // Only pass LoD when the first dimension of output and Input(X) // are the same. - return_meta.lod = x_meta.lod; + out->share_lod(x); } - - return return_meta; } -DenseTensorMeta CastInferMeta(const DenseTensorMeta& x_meta, - const DataType out_dtype) { - DenseTensorMeta out_meta(out_dtype, x_meta.dims, x_meta.layout); - return out_meta; +void CastInferMeta(const MetaTensor& x, DataType out_dtype, MetaTensor* out) { + out->set_dims(x.dims()); + out->set_dtype(out_dtype); + out->set_layout(x.layout()); } -DenseTensorMeta CreateLikeInferMeta(const DenseTensorMeta& x_meta, - DataType dtype, - DataLayout layout) { - return {dtype == DataType::UNDEFINED ? x_meta.dtype : dtype, - x_meta.dims, - layout == DataLayout::UNDEFINED ? x_meta.layout : layout}; +void CreateLikeInferMeta(const MetaTensor& x, + DataType dtype, + DataLayout layout, + MetaTensor* out) { + out->set_dims(x.dims()); + out->set_dtype(dtype == DataType::UNDEFINED ? x.dtype() : dtype); + out->set_layout(layout == DataLayout::UNDEFINED ? x.layout() : layout); } static pten::framework::DDim ValidateShape( @@ -218,46 +208,51 @@ static pten::framework::DDim ValidateShape( return pten::framework::make_ddim(output_shape); } -DenseTensorMeta InferMetaFromVecValue(const DenseTensorMeta& x_meta, - const std::vector& shape) { +void InferMetaFromVecValue(const MetaTensor& x, + const std::vector& shape, + MetaTensor* out) { PADDLE_ENFORCE_EQ(!shape.empty(), true, paddle::platform::errors::InvalidArgument( "The parameter 'shape' in ReshapeOp must be set. " "But received 'shape' is empty.")); - auto x_dims = x_meta.dims; + auto x_dims = x.dims(); auto out_dims = ValidateShape(shape, x_dims); - DenseTensorMeta return_meta(x_meta.dtype, out_dims, x_meta.layout); - if (x_dims[0] == return_meta.dims[0]) { + out->set_dims(out_dims); + out->set_dtype(x.dtype()); + out->set_layout(x.layout()); + if (x_dims[0] == out_dims[0]) { // Only pass LoD when the first dimension of output and Input(X) // are the same. 
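FlattenInferMeta above builds out_shape by multiplying every dimension between start_axis and stop_axis into one axis and keeping the rest untouched. A minimal standalone sketch of just that shape computation follows; FlattenShape is an illustrative name, negative axes wrap as in the hunk, and start_axis <= stop_axis is assumed (the real function also validates the axis range).

#include <cstdint>
#include <cstdio>
#include <vector>

// Standalone sketch: fold dims[start_axis..stop_axis] into a single dimension.
static std::vector<int64_t> FlattenShape(const std::vector<int64_t>& dims,
                                         int start_axis, int stop_axis) {
  const int rank = static_cast<int>(dims.size());
  if (start_axis < 0) start_axis += rank;  // negative axes count from the end
  if (stop_axis < 0) stop_axis += rank;
  std::vector<int64_t> out;
  int64_t folded = 1;
  for (int i = 0; i < rank; ++i) {
    if (i < start_axis || i > stop_axis) {
      out.push_back(dims[i]);              // axes outside the range are kept
    } else {
      folded *= dims[i];                   // axes inside the range collapse
      if (i == stop_axis) out.push_back(folded);
    }
  }
  return out;
}

int main() {
  // (2, 3, 4, 5) flattened over axes 1..2 -> (2, 12, 5)
  for (int64_t d : FlattenShape({2, 3, 4, 5}, 1, 2)) {
    std::printf("%lld ", static_cast<long long>(d));
  }
  std::printf("\n");
  return 0;
}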
- return_meta.lod = x_meta.lod; + out->share_lod(x); } - return return_meta; } -DenseTensorMeta ReshapeInferMeta(const DenseTensorMeta& x_meta, - const ScalarArray& shape) { - return InferMetaFromVecValue(x_meta, shape.GetData()); +void ReshapeInferMeta(const MetaTensor& x, + const ScalarArray& shape, + MetaTensor* out) { + InferMetaFromVecValue(x, shape.GetData(), out); } /* Why not use ReduceInferMeta directly? Because we need make InferMetaFunction's args follow the design of api.yaml */ -DenseTensorMeta SumInferMeta(const DenseTensorMeta& x_meta, - const std::vector& axis, - DataType dtype, - bool keep_dim) { - return ReduceInferMeta(x_meta, axis, keep_dim, dtype); +void SumInferMeta(const MetaTensor& x, + const std::vector& axis, + DataType dtype, + bool keep_dim, + MetaTensor* out) { + ReduceInferMeta(x, axis, keep_dim, dtype, std::move(out)); } -DenseTensorMeta ReduceInferMeta(const DenseTensorMeta& x_meta, - const std::vector& axis, - bool keep_dim, - DataType dtype) { +void ReduceInferMeta(const MetaTensor& x, + const std::vector& axis, + bool keep_dim, + DataType dtype, + MetaTensor* out) { bool reduce_all = true; std::set dims_set(axis.begin(), axis.end()); - for (int64_t i = 0; i < x_meta.dims.size(); ++i) { + for (int64_t i = 0; i < x.dims().size(); ++i) { if (dims_set.find(i) == dims_set.end()) { reduce_all = false; break; @@ -266,19 +261,19 @@ DenseTensorMeta ReduceInferMeta(const DenseTensorMeta& x_meta, std::vector out_dim_vector; if (keep_dim) { - for (int64_t i = 0; i < x_meta.dims.size(); ++i) { + for (int64_t i = 0; i < x.dims().size(); ++i) { if (reduce_all || dims_set.find(i) != dims_set.end()) { out_dim_vector.push_back(1); } else { - out_dim_vector.push_back(x_meta.dims.at(i)); + out_dim_vector.push_back(x.dims().at(i)); } } } else { - for (int64_t i = 0; i < x_meta.dims.size(); ++i) { + for (int64_t i = 0; i < x.dims().size(); ++i) { if (reduce_all || dims_set.find(i) != dims_set.end()) { continue; } else { - out_dim_vector.push_back(x_meta.dims.at(i)); + out_dim_vector.push_back(x.dims().at(i)); } } @@ -292,18 +287,34 @@ DenseTensorMeta ReduceInferMeta(const DenseTensorMeta& x_meta, if (dtype != DataType::UNDEFINED) { out_dtype = dtype; } else { - if (x_meta.dtype == DataType::BOOL || x_meta.dtype == DataType::INT32 || - x_meta.dtype == DataType::INT64) { + if (x.dtype() == DataType::BOOL || x.dtype() == DataType::INT32 || + x.dtype() == DataType::INT64) { out_dtype = DataType::INT64; } else { - out_dtype = x_meta.dtype; + out_dtype = x.dtype(); } } - DenseTensorMeta return_meta(out_dtype, out_dim, x_meta.layout); - return return_meta; + out->set_dims(out_dim); + out->set_dtype(out_dtype); + out->set_layout(x.layout()); +} + +void ReduceInferMeta(const MetaTensor& x, + const std::vector& axis, + bool keep_dim, + MetaTensor* out) { + ReduceInferMeta(x, axis, keep_dim, DataType::UNDEFINED, out); +} + +void TransferLayoutInferMeta(const MetaTensor& x, + DataLayout layout, + MetaTensor* out) { + out->set_dims(x.dims()); + out->set_dtype(x.dtype()); + out->set_layout(layout); } } // namespace pten -PT_REGISTER_INFER_META_FN(sign, pten::UnchangedInferMetaNew); +PT_REGISTER_INFER_META_FN(sign, pten::UnchangedInferMeta); diff --git a/paddle/pten/infermeta/unary.h b/paddle/pten/infermeta/unary.h index 670c70de84c..65c6380695c 100644 --- a/paddle/pten/infermeta/unary.h +++ b/paddle/pten/infermeta/unary.h @@ -16,9 +16,7 @@ limitations under the License. */ // See Note [ Why still include the fluid headers? 
] #include "paddle/pten/common/scalar_array.h" -#include "paddle/pten/core/infermeta_utils.h" #include "paddle/pten/core/meta_tensor.h" -#include "paddle/pten/core/tensor_meta.h" namespace pten { @@ -26,45 +24,54 @@ class MetaConfig; // Common InferMeta Functions for unary operators, The format like: // -// void [OpName]InferMeta(const MetaTensor& x, ..., MetaTensor* out) {} +// void [FunctionDesc|OpName]InferMeta(const MetaTensor& x, ..., MetaTensor* +// out) {} // // NOTE: The name "InferShape" may be not appropriate. "InferMeta" may be good. // Because functions in this file not only can infer shape, but also need // infer lod or other useful data. -// TODO(chenweihang): update all InferMeta function format in next pr, -// now add UnchangedInferMetaNew for test new format -void UnchangedInferMetaNew(MetaConfig config, - const MetaTensor& x, - MetaTensor* out); +void UnchangedInferMeta(const MetaTensor& x, MetaTensor* out); + +void FlattenInferMeta(const MetaTensor& x, + int start_axis, + int stop_axis, + MetaTensor* out); -DenseTensorMeta UnchangedInferMeta(const DenseTensorMeta& x_meta); +void CastInferMeta(const MetaTensor& x, DataType out_dtype, MetaTensor* out); -DenseTensorMeta ReductionInferMeta(const DenseTensorMeta& x_meta); +void CreateLikeInferMeta(const MetaTensor& x, + DataType dtype, + DataLayout layout, + MetaTensor* out); + +void InferMetaFromVecValue(const MetaTensor& x, + const std::vector& shape, + MetaTensor* out); -DenseTensorMeta FlattenInferMeta(const DenseTensorMeta& x_meta, - int start_axis, - int stop_axis); -DenseTensorMeta CastInferMeta(const DenseTensorMeta& x_meta, - const DataType out_dtype); +void ReshapeInferMeta(const MetaTensor& x, + const ScalarArray& shape, + MetaTensor* out); -DenseTensorMeta CreateLikeInferMeta(const DenseTensorMeta& x_meta, - DataType dtype, - DataLayout layout); +void ReduceInferMeta(const MetaTensor& x, + const std::vector& axis, + bool keep_dim, + DataType dtype, + MetaTensor* out); -DenseTensorMeta InferMetaFromVecValue(const DenseTensorMeta& x_meta, - const std::vector& shape); +void ReduceInferMeta(const MetaTensor& x, + const std::vector& axis, + bool keep_dim, + MetaTensor* out); -DenseTensorMeta ReshapeInferMeta(const DenseTensorMeta& x_meta, - const ScalarArray& shape); +void SumInferMeta(const MetaTensor& x, + const std::vector& axis, + DataType dtype, + bool keep_dim, + MetaTensor* out); -DenseTensorMeta ReduceInferMeta(const DenseTensorMeta& x_meta, - const std::vector& axis, - bool keep_dim, - DataType dtype = DataType::UNDEFINED); +void TransferLayoutInferMeta(const MetaTensor& x, + DataLayout layout, + MetaTensor* out); -DenseTensorMeta SumInferMeta(const DenseTensorMeta& x_meta, - const std::vector& axis, - DataType dtype, - bool keep_dim); } // namespace pten diff --git a/paddle/pten/kernels/CMakeLists.txt b/paddle/pten/kernels/CMakeLists.txt index e14c2f6b6c4..a9b81ad4eb2 100644 --- a/paddle/pten/kernels/CMakeLists.txt +++ b/paddle/pten/kernels/CMakeLists.txt @@ -9,7 +9,7 @@ add_subdirectory(funcs) # pten depends all pten kernel targets set_property(GLOBAL PROPERTY PTEN_KERNELS "") -set(COMMON_KERNEL_DEPS dense_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils) +set(COMMON_KERNEL_DEPS dense_tensor sparse_coo_tensor sparse_csr_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils) set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas math_function) # remove this dep after removing fluid deps on tensor creation set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} 
pten_api_utils) @@ -23,5 +23,6 @@ endif() # auto build kernel targets by cmake register_kernels(EXCLUDES math_kernel DEPS ${COMMON_KERNEL_DEPS}) kernel_library(math_kernel DEPS ${MATH_KERNEL_DEPS}) +add_subdirectory(sparse) copy_if_different(${kernel_declare_file} ${kernel_declare_file_final}) diff --git a/paddle/pten/kernels/cast_kernel.h b/paddle/pten/kernels/cast_kernel.h index 8fdce9cda6f..a7f84619345 100644 --- a/paddle/pten/kernels/cast_kernel.h +++ b/paddle/pten/kernels/cast_kernel.h @@ -29,8 +29,9 @@ template DenseTensor Cast(const Context& dev_ctx, const DenseTensor& x, DataType out_dtype) { - auto out_meta = CastInferMeta(x.meta(), out_dtype); - auto dense_out = pten::Empty(dev_ctx, std::move(out_meta)); + auto dense_out = pten::Empty(dev_ctx); + MetaTensor meta_out(&dense_out); + CastInferMeta(x, out_dtype, &meta_out); CastKernel(dev_ctx, x, out_dtype, &dense_out); return dense_out; } diff --git a/paddle/pten/kernels/complex_kernel.h b/paddle/pten/kernels/complex_kernel.h index ff27144eb49..ab1cb59872a 100644 --- a/paddle/pten/kernels/complex_kernel.h +++ b/paddle/pten/kernels/complex_kernel.h @@ -32,8 +32,9 @@ template >::value, bool> = true> DenseTensor Conj(const Context& dev_ctx, const DenseTensor& x) { - auto out_meta = UnchangedInferMeta(x.meta()); - auto dense_out = pten::Empty(dev_ctx, std::move(out_meta)); + auto dense_out = pten::Empty(dev_ctx); + MetaTensor meta_out(&dense_out); + UnchangedInferMeta(x, &meta_out); ConjKernel(dev_ctx, x, &dense_out); return dense_out; } diff --git a/paddle/pten/kernels/concat_kernel.h b/paddle/pten/kernels/concat_kernel.h index 310b9ba8c0c..8c9103145f6 100644 --- a/paddle/pten/kernels/concat_kernel.h +++ b/paddle/pten/kernels/concat_kernel.h @@ -30,14 +30,16 @@ template DenseTensor Concat(const Context& dev_ctx, const std::vector& x, const Scalar& axis) { - std::vector x_meta; - for (auto t : x) { - x_meta.push_back(t.meta()); + std::vector meta_x; + for (const auto& t : x) { + meta_x.emplace_back(t); } - auto out_meta = ConcatInferMeta(x_meta, axis.to(), true); - auto dense_out = pten::Empty(dev_ctx, std::move(out_meta)); + auto dense_out = pten::Empty(dev_ctx); + MetaTensor meta_out(&dense_out); + ConcatInferMeta(meta_x, axis.to(), &meta_out, /*is_runtime=*/true); ConcatKernel(dev_ctx, x, axis, &dense_out); return dense_out; } + } // namespace pten diff --git a/paddle/pten/kernels/cpu/copy_kernel.cc b/paddle/pten/kernels/cpu/copy_kernel.cc index 0892e3974fe..6a5d33b91aa 100644 --- a/paddle/pten/kernels/cpu/copy_kernel.cc +++ b/paddle/pten/kernels/cpu/copy_kernel.cc @@ -16,7 +16,7 @@ limitations under the License. */ #include "paddle/pten/backends/cpu/cpu_context.h" #include "paddle/pten/common/data_type.h" -#include "paddle/pten/core/convert_utils.h" +#include "paddle/pten/core/compat/convert_utils.h" #include "paddle/pten/core/kernel_registry.h" // See Note [ Why still include the fluid headers? ] diff --git a/paddle/pten/kernels/cpu/digamma_grad_kernel.cc b/paddle/pten/kernels/cpu/digamma_grad_kernel.cc new file mode 100644 index 00000000000..47be4302e84 --- /dev/null +++ b/paddle/pten/kernels/cpu/digamma_grad_kernel.cc @@ -0,0 +1,23 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/pten/kernels/digamma_grad_kernel.h" +#include "paddle/pten/backends/cpu/cpu_context.h" +#include "paddle/pten/common/scalar.h" +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/core/kernel_registry.h" +#include "paddle/pten/kernels/impl/digamma_grad_kernel_impl.h" + +PT_REGISTER_KERNEL( + digamma_grad, CPU, ALL_LAYOUT, pten::DigammaGradKernel, float, double) {} diff --git a/paddle/pten/kernels/cpu/digamma_kernel.cc b/paddle/pten/kernels/cpu/digamma_kernel.cc new file mode 100644 index 00000000000..6766c3b2999 --- /dev/null +++ b/paddle/pten/kernels/cpu/digamma_kernel.cc @@ -0,0 +1,23 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/pten/kernels/digamma_kernel.h" +#include "paddle/pten/backends/cpu/cpu_context.h" +#include "paddle/pten/common/scalar.h" +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/core/kernel_registry.h" +#include "paddle/pten/kernels/impl/digamma_kernel_impl.h" + +PT_REGISTER_KERNEL( + digamma, CPU, ALL_LAYOUT, pten::DigammaKernel, float, double) {} diff --git a/paddle/pten/kernels/cpu/scale_kernel.cc b/paddle/pten/kernels/cpu/scale_kernel.cc index 4f999ac4d17..4c248e6a014 100644 --- a/paddle/pten/kernels/cpu/scale_kernel.cc +++ b/paddle/pten/kernels/cpu/scale_kernel.cc @@ -57,7 +57,7 @@ PT_REGISTER_KERNEL(scale, pten::ScaleKernel, float, double, - paddle::platform::bfloat16, + pten::dtype::bfloat16, uint8_t, int8_t, int16_t, diff --git a/paddle/pten/kernels/digamma_grad_kernel.h b/paddle/pten/kernels/digamma_grad_kernel.h new file mode 100644 index 00000000000..ef3d084e0a0 --- /dev/null +++ b/paddle/pten/kernels/digamma_grad_kernel.h @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
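A short note on the math behind the digamma kernels added in this patch: the derivative of the digamma function is the trigamma function, \frac{d}{dx}\,\psi(x) = \psi_1(x) (polygamma of order 1), so the backward pass is just the elementwise product x_grad = out_grad * polygamma(1, x). That is exactly what the DigammaGradFunctor further below computes via Eigen::numext::polygamma.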
+ +#pragma once + +#include "paddle/pten/core/dense_tensor.h" + +namespace pten { + +template +void DigammaGradKernel(const Context& ctx, + const DenseTensor& out_grad, + const DenseTensor& x, + DenseTensor* x_grad); + +} // namepsace pten diff --git a/paddle/pten/kernels/digamma_kernel.h b/paddle/pten/kernels/digamma_kernel.h new file mode 100644 index 00000000000..af7a2893127 --- /dev/null +++ b/paddle/pten/kernels/digamma_kernel.h @@ -0,0 +1,24 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/pten/core/dense_tensor.h" + +namespace pten { + +template +void DigammaKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out); + +} // namepsace pten diff --git a/paddle/pten/kernels/dot_kernel.h b/paddle/pten/kernels/dot_kernel.h index 47f1c89109e..67f6ca3517f 100644 --- a/paddle/pten/kernels/dot_kernel.h +++ b/paddle/pten/kernels/dot_kernel.h @@ -29,8 +29,9 @@ template DenseTensor Dot(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y) { - auto out_meta = DotInferMeta(x.meta(), y.meta()); - auto dense_out = pten::Empty(dev_ctx, std::move(out_meta)); + auto dense_out = pten::Empty(dev_ctx); + MetaTensor meta_out(&dense_out); + DotInferMeta(x, y, &meta_out); DotKernel(dev_ctx, x, y, &dense_out); return dense_out; } diff --git a/paddle/pten/kernels/empty_kernel.h b/paddle/pten/kernels/empty_kernel.h index d283ef5c1e4..8a7da8fbd56 100644 --- a/paddle/pten/kernels/empty_kernel.h +++ b/paddle/pten/kernels/empty_kernel.h @@ -55,8 +55,9 @@ DenseTensor Empty(const Context& dev_ctx, DataType dtype = DataType::FLOAT32, Backend backend = Backend::CPU, // Is backend needed here? DataLayout layout = DataLayout::NCHW) { - auto out_meta = CreateInferMeta(shape, dtype, layout); - auto dense_out = Empty(dev_ctx, std::move(out_meta)); + auto dense_out = Empty(dev_ctx); + MetaTensor meta_out(&dense_out); + CreateInferMeta(shape, dtype, layout, &meta_out); EmptyKernel(dev_ctx, shape, &dense_out); return dense_out; } @@ -68,8 +69,9 @@ DenseTensor EmptyLike( DataType dtype = DataType::UNDEFINED, Backend backend = Backend::UNDEFINED, // Is backend needed here? 
DataLayout layout = DataLayout::UNDEFINED) { - auto out_meta = CreateLikeInferMeta(x.meta(), dtype, layout); - auto dense_out = Empty(dev_ctx, std::move(out_meta)); + auto dense_out = Empty(dev_ctx); + MetaTensor meta_out(&dense_out); + CreateLikeInferMeta(x, dtype, layout, &meta_out); EmptyLikeKernel(dev_ctx, &dense_out); return dense_out; } diff --git a/paddle/pten/kernels/flatten_kernel.h b/paddle/pten/kernels/flatten_kernel.h index c974fda1ed3..38d8786c7fc 100644 --- a/paddle/pten/kernels/flatten_kernel.h +++ b/paddle/pten/kernels/flatten_kernel.h @@ -40,8 +40,9 @@ DenseTensor Flatten(const Context& dev_ctx, const DenseTensor& x, int start_axis, int stop_axis) { - auto out_meta = FlattenInferMeta(x.meta(), start_axis, stop_axis); - auto dense_out = Empty(dev_ctx, std::move(out_meta)); + auto dense_out = Empty(dev_ctx); + MetaTensor meta_out(&dense_out); + FlattenInferMeta(x, start_axis, stop_axis, &meta_out); FlattenKernel(dev_ctx, x, start_axis, stop_axis, &dense_out); return dense_out; } diff --git a/paddle/pten/kernels/full_kernel.h b/paddle/pten/kernels/full_kernel.h index bc484fb4edf..030eb4b1c76 100644 --- a/paddle/pten/kernels/full_kernel.h +++ b/paddle/pten/kernels/full_kernel.h @@ -41,8 +41,9 @@ DenseTensor Full(const Context& dev_ctx, DataType dtype = DataType::FLOAT32, Backend backend = Backend::CPU, // Is backend needed here? DataLayout layout = DataLayout::NCHW) { - auto out_meta = CreateInferMeta(shape, dtype, layout); - auto dense_out = Empty(dev_ctx, std::move(out_meta)); + auto dense_out = Empty(dev_ctx); + MetaTensor meta_out(&dense_out); + CreateInferMeta(shape, dtype, layout, &meta_out); FullKernel(dev_ctx, shape, val, &dense_out); return dense_out; } @@ -55,8 +56,9 @@ DenseTensor FullLike( DataType dtype = DataType::UNDEFINED, Backend backend = Backend::UNDEFINED, // Is backend needed here? DataLayout layout = DataLayout::UNDEFINED) { - auto out_meta = CreateLikeInferMeta(x.meta(), dtype, layout); - auto dense_out = Empty(dev_ctx, std::move(out_meta)); + auto dense_out = Empty(dev_ctx); + MetaTensor meta_out(&dense_out); + CreateLikeInferMeta(x, dtype, layout, &meta_out); FullLikeKernel(dev_ctx, val, &dense_out); return dense_out; } diff --git a/paddle/pten/kernels/funcs/concat_funcs.h b/paddle/pten/kernels/funcs/concat_funcs.h index 8455b809692..88fdad3a6da 100644 --- a/paddle/pten/kernels/funcs/concat_funcs.h +++ b/paddle/pten/kernels/funcs/concat_funcs.h @@ -35,7 +35,7 @@ static inline int64_t ComputeAxis(int64_t axis, int64_t rank) { } static inline pten::DDim ComputeAndCheckShape( - const bool is_runtime, + bool is_runtime, const std::vector& inputs_dims, const size_t axis) { const size_t n = inputs_dims.size(); diff --git a/paddle/pten/kernels/funcs/cuda_kernel_config.h b/paddle/pten/kernels/funcs/cuda_kernel_config.h index 6c7c6d11713..483e58eedb4 100644 --- a/paddle/pten/kernels/funcs/cuda_kernel_config.h +++ b/paddle/pten/kernels/funcs/cuda_kernel_config.h @@ -16,6 +16,7 @@ limitations under the License. */ #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/pten/backends/gpu/gpu_context.h" #ifdef __HIPCC__ #define ELEMENTWISE_BLOCK_SIZE 256 @@ -31,7 +32,7 @@ namespace funcs { * 2x~4x) than number of SMs. Hence, SM count is took into account within * this function to determine the right number of threads per block. 
*/ -inline int GetThreadsConfig(const paddle::platform::CUDADeviceContext &ctx, +inline int GetThreadsConfig(const pten::GPUContext &ctx, int64_t numel, int vec_size) { int threads = ELEMENTWISE_BLOCK_SIZE; diff --git a/paddle/pten/kernels/funcs/elementwise_base.h b/paddle/pten/kernels/funcs/elementwise_base.h index 3f1651eeb27..d102fd63718 100644 --- a/paddle/pten/kernels/funcs/elementwise_base.h +++ b/paddle/pten/kernels/funcs/elementwise_base.h @@ -23,8 +23,8 @@ limitations under the License. */ #if defined(__NVCC__) || defined(__HIPCC__) #include "paddle/fluid/platform/aligned_vector.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/function_traits.h" +#include "paddle/pten/backends/gpu/gpu_launch_config.h" #include "paddle/pten/kernels/primitive/kernel_primitives.h" namespace kps = pten::kps; @@ -646,7 +646,8 @@ void ElementwiseCudaKernel(const KPDevice &ctx, VecSize><<>>( ins_data, outs_data, numel, main_offset, func); #else - auto gpu_config = GetGpuLaunchConfig1D(ctx, numel, VecSize); + auto gpu_config = + pten::backends::gpu::GetGpuLaunchConfig1D(ctx, numel, VecSize); int main_offset = (numel / (VecSize * gpu_config.GetBlockSize())) * VecSize * gpu_config.GetBlockSize(); auto stream = ctx.stream(); diff --git a/paddle/pten/kernels/funcs/transpose.cu b/paddle/pten/kernels/funcs/transpose.cu index a7b7184487c..09baa2c6e02 100644 --- a/paddle/pten/kernels/funcs/transpose.cu +++ b/paddle/pten/kernels/funcs/transpose.cu @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/pten/backends/gpu/gpu_context.h" #include "paddle/pten/core/ddim.h" diff --git a/paddle/pten/kernels/gpu/concat_and_split.h b/paddle/pten/kernels/gpu/concat_and_split.h index 66b21b5f513..47022666564 100644 --- a/paddle/pten/kernels/gpu/concat_and_split.h +++ b/paddle/pten/kernels/gpu/concat_and_split.h @@ -237,12 +237,11 @@ __global__ void SplitKernel(const T* input_data, SplitKernelDetail(input_data, in_row, in_col, fixed_out_col, outputs_data); } -static inline void GetBlockDims( - const paddle::platform::CUDADeviceContext& context, - int64_t num_rows, - int64_t num_cols, - dim3* block_dims, - dim3* grid_dims) { +static inline void GetBlockDims(const pten::GPUContext& context, + int64_t num_rows, + int64_t num_cols, + dim3* block_dims, + dim3* grid_dims) { // Set the thread block and grid according to CurrentDeviceId const int kThreadsPerBlock = 1024; int block_cols = kThreadsPerBlock; diff --git a/paddle/pten/kernels/gpu/copy_kernel.cu b/paddle/pten/kernels/gpu/copy_kernel.cu index d2578723158..d48a9fb1d77 100644 --- a/paddle/pten/kernels/gpu/copy_kernel.cu +++ b/paddle/pten/kernels/gpu/copy_kernel.cu @@ -16,7 +16,7 @@ limitations under the License. */ #include "paddle/pten/backends/gpu/gpu_context.h" #include "paddle/pten/common/data_type.h" -#include "paddle/pten/core/convert_utils.h" +#include "paddle/pten/core/compat/convert_utils.h" #include "paddle/pten/core/kernel_registry.h" // See Note [ Why still include the fluid headers? ] @@ -87,9 +87,7 @@ void Copy(const Context& dev_ctx, ctx_gpu_place)); auto stream = blocking ? 
nullptr - : reinterpret_cast( - dev_ctx) - .stream(); + : reinterpret_cast(dev_ctx).stream(); paddle::memory::Copy( dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream); } else if (paddle::platform::is_cpu_place(src_place) && // NOLINT @@ -113,9 +111,7 @@ void Copy(const Context& dev_ctx, ctx_gpu_place)); auto stream = blocking ? nullptr - : reinterpret_cast( - dev_ctx) - .stream(); + : reinterpret_cast(dev_ctx).stream(); paddle::memory::Copy( dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, stream); } else if (paddle::platform::is_gpu_place(src_place) && // NOLINT @@ -140,9 +136,7 @@ void Copy(const Context& dev_ctx, ctx_gpu_place.device)); auto stream = blocking ? nullptr - : reinterpret_cast( - dev_ctx) - .stream(); + : reinterpret_cast(dev_ctx).stream(); paddle::memory::Copy( dst_cuda_pinned_place, dst_ptr, src_gpu_place, src_ptr, size, stream); } else if (paddle::platform::is_cuda_pinned_place(src_place) && // NOLINT @@ -167,9 +161,7 @@ void Copy(const Context& dev_ctx, ctx_gpu_place.device)); auto stream = blocking ? nullptr - : reinterpret_cast( - dev_ctx) - .stream(); + : reinterpret_cast(dev_ctx).stream(); paddle::memory::Copy( dst_gpu_place, dst_ptr, src_cuda_pinned_place, src_ptr, size, stream); } else if (paddle::platform::is_gpu_place(src_place) && // NOLINT @@ -185,9 +177,7 @@ void Copy(const Context& dev_ctx, ctx_place)); auto stream = blocking ? nullptr - : reinterpret_cast( - dev_ctx) - .stream(); + : reinterpret_cast(dev_ctx).stream(); if (paddle::platform::is_same_place(src_place, dst_place)) { paddle::memory::Copy( dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream); diff --git a/paddle/pten/kernels/gpu/digamma_grad_kernel.cu b/paddle/pten/kernels/gpu/digamma_grad_kernel.cu new file mode 100644 index 00000000000..b87ea5a5cc0 --- /dev/null +++ b/paddle/pten/kernels/gpu/digamma_grad_kernel.cu @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/pten/backends/gpu/gpu_context.h" +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/core/kernel_registry.h" +#include "paddle/pten/kernels/digamma_grad_kernel.h" +#include "paddle/pten/kernels/impl/digamma_grad_kernel_impl.h" + +PT_REGISTER_KERNEL( + digamma_grad, GPU, ALL_LAYOUT, pten::DigammaGradKernel, float, double) {} diff --git a/paddle/pten/kernels/gpu/digamma_kernel.cu b/paddle/pten/kernels/gpu/digamma_kernel.cu new file mode 100644 index 00000000000..8b847c1a476 --- /dev/null +++ b/paddle/pten/kernels/gpu/digamma_kernel.cu @@ -0,0 +1,23 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/pten/backends/gpu/gpu_context.h" +#include "paddle/pten/common/scalar.h" +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/core/kernel_registry.h" +#include "paddle/pten/kernels/digamma_kernel.h" +#include "paddle/pten/kernels/impl/digamma_kernel_impl.h" + +PT_REGISTER_KERNEL( + digamma, GPU, ALL_LAYOUT, pten::DigammaKernel, float, double) {} diff --git a/paddle/pten/kernels/gpu/elementwise.h b/paddle/pten/kernels/gpu/elementwise.h index f988f5abdb1..947f969b077 100644 --- a/paddle/pten/kernels/gpu/elementwise.h +++ b/paddle/pten/kernels/gpu/elementwise.h @@ -2020,7 +2020,7 @@ void default_elementwise_add_grad(const GPUContext &ctx, T, kps::AddFunctor, kps::IdentityFunctor>( - dout, dx, kps::IdentityFunctor(), reduce_dims, stream); + ctx, dout, dx, kps::IdentityFunctor(), reduce_dims, stream); } } // dy @@ -2038,7 +2038,7 @@ void default_elementwise_add_grad(const GPUContext &ctx, T, kps::AddFunctor, kps::IdentityFunctor>( - dout, dy, kps::IdentityFunctor(), reduce_dims, stream); + ctx, dout, dy, kps::IdentityFunctor(), reduce_dims, stream); } } } @@ -2137,7 +2137,7 @@ void default_elementwise_sub_grad(const GPUContext &ctx, T, kps::AddFunctor, kps::IdentityFunctor>( - dout, dx, kps::IdentityFunctor(), reduce_dims, stream); + ctx, dout, dx, kps::IdentityFunctor(), reduce_dims, stream); } } // dy @@ -2161,7 +2161,7 @@ void default_elementwise_sub_grad(const GPUContext &ctx, T, kps::AddFunctor, kps::InverseFunctor>( - dout, dy, kps::InverseFunctor(), reduce_dims, stream); + ctx, dout, dy, kps::InverseFunctor(), reduce_dims, stream); } } } diff --git a/paddle/pten/kernels/gpu/math_kernel.cu b/paddle/pten/kernels/gpu/math_kernel.cu index 37104c46a49..1a549087e42 100644 --- a/paddle/pten/kernels/gpu/math_kernel.cu +++ b/paddle/pten/kernels/gpu/math_kernel.cu @@ -29,7 +29,7 @@ namespace cub = hipcub; #include "paddle/pten/common/complex.h" #include "paddle/pten/common/float16.h" -#include "paddle/pten/core/convert_utils.h" +#include "paddle/pten/core/compat/convert_utils.h" #include "paddle/pten/core/enforce.h" #include "paddle/pten/core/kernel_registry.h" diff --git a/paddle/pten/kernels/gpu/reduce.h b/paddle/pten/kernels/gpu/reduce.h index d864c76ea19..7a76a988dee 100644 --- a/paddle/pten/kernels/gpu/reduce.h +++ b/paddle/pten/kernels/gpu/reduce.h @@ -38,14 +38,13 @@ namespace cub = hipcub; #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/fast_divmod.h" #include "paddle/fluid/string/string_helper.h" -#include "paddle/pten/core/array.h" -#include "paddle/pten/core/enforce.h" -#include "paddle/pten/kernels/primitive/kernel_primitives.h" - #include "paddle/pten/api/ext/dispatch.h" #include "paddle/pten/backends/gpu/gpu_context.h" #include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/core/enforce.h" +#include "paddle/pten/core/utils/array.h" #include "paddle/pten/kernels/funcs/elementwise_base.h" +#include "paddle/pten/kernels/primitive/kernel_primitives.h" // Reduce split or not, Whether to use ReduceHigherDim #define REDUCE_SPLIT_BOUNDARY 512 @@ -1065,7 +1064,8 @@ template class ReduceOp, 
typename TransformOp> -void TensorReduceFunctorImpl(const pten::DenseTensor& x, +void TensorReduceFunctorImpl(const pten::GPUContext& dev_ctx, + const pten::DenseTensor& x, pten::DenseTensor* y, const TransformOp& transform, const std::vector& origin_reduce_dims, @@ -1089,13 +1089,11 @@ void TensorReduceFunctorImpl(const pten::DenseTensor& x, auto x_data = x.data(); auto y_data = y->data(); - auto* dev_ctx = static_cast( - paddle::platform::DeviceContextPool::Instance().Get(x.place())); if (config.reduce_num == 1) { std::vector inputs = {&x}; std::vector outputs = {y}; funcs::LaunchSameDimsElementwiseCudaKernel( - *dev_ctx, inputs, &outputs, transform); + dev_ctx, inputs, &outputs, transform); return; } @@ -1245,13 +1243,23 @@ void Reduce(const GPUContext& dev_ctx, data_t, ReduceOp, TransformOp>( - x, out, TransformOp(reduce_num), reduce_dims, stream); + dev_ctx, + x, + out, + TransformOp(reduce_num), + reduce_dims, + stream); })); } else { using MPType = typename kps::details::MPTypeTrait::Type; pten::kernels:: TensorReduceFunctorImpl>( - x, out, TransformOp(reduce_num), reduce_dims, stream); + dev_ctx, + x, + out, + TransformOp(reduce_num), + reduce_dims, + stream); } } } // namespace pten diff --git a/paddle/pten/kernels/gpu/scale_kernel.cu b/paddle/pten/kernels/gpu/scale_kernel.cu index 5aba001267a..6cf84acd9dc 100644 --- a/paddle/pten/kernels/gpu/scale_kernel.cu +++ b/paddle/pten/kernels/gpu/scale_kernel.cu @@ -72,7 +72,7 @@ PT_REGISTER_KERNEL(scale, pten::ScaleKernel, float, double, - paddle::platform::float16, + pten::dtype::float16, uint8_t, int8_t, int16_t, diff --git a/paddle/pten/kernels/impl/digamma_grad_kernel_impl.h b/paddle/pten/kernels/impl/digamma_grad_kernel_impl.h new file mode 100644 index 00000000000..f919fe234c3 --- /dev/null +++ b/paddle/pten/kernels/impl/digamma_grad_kernel_impl.h @@ -0,0 +1,55 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/core/dense_tensor.h" + +namespace pten { + +template +struct DigammaGradFunctor { + DigammaGradFunctor(const T* dout, const T* x, T* output, int64_t numel) + : dout_(dout), x_(x), output_(output), numel_(numel) {} + + HOSTDEVICE void operator()(int64_t idx) const { + output_[idx] = dout_[idx] * Eigen::numext::polygamma(T(1), x_[idx]); + } + + private: + const T* dout_; + const T* x_; + T* output_; + int64_t numel_; +}; + +template +void DigammaGradKernel(const Context& ctx, + const DenseTensor& out_grad, + const DenseTensor& x, + DenseTensor* x_grad) { + x_grad->mutable_data(ctx.GetPlace()); + + auto* dout_data = out_grad.data(); + auto* x_data = x.data(); + auto* dx_data = x_grad->data(); + auto numel = out_grad.numel(); + paddle::platform::ForRange for_range(ctx, numel); + DigammaGradFunctor functor(dout_data, x_data, dx_data, numel); + for_range(functor); +} + +} // namespace pten diff --git a/paddle/pten/kernels/impl/digamma_kernel_impl.h b/paddle/pten/kernels/impl/digamma_kernel_impl.h new file mode 100644 index 00000000000..209b3e95376 --- /dev/null +++ b/paddle/pten/kernels/impl/digamma_kernel_impl.h @@ -0,0 +1,49 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
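The digamma kernels above follow a single pattern: the per-element math lives in a small index functor that is handed to paddle::platform::ForRange, so one functor serves both the CPU loop and the GPU launch. The sketch below is a CPU-only stand-in for that dispatch, using a trivial squaring functor in place of the polygamma call; ForRangeCPU and SquareFunctor are illustrative names, not Paddle types.

#include <cstdint>
#include <cstdio>
#include <vector>

// Standalone sketch of the ForRange-style dispatch: call func(i) for every
// i in [0, limit). The real helper also has a device specialization that maps
// the same functor onto a CUDA grid; this stand-in stays on the CPU.
template <typename Functor>
void ForRangeCPU(int64_t limit, Functor func) {
  for (int64_t i = 0; i < limit; ++i) {
    func(i);
  }
}

// Same shape as DigammaGradFunctor above, but with a trivial operation so the
// sketch is self-contained.
struct SquareFunctor {
  SquareFunctor(const double* in, double* out) : in_(in), out_(out) {}
  void operator()(int64_t idx) const { out_[idx] = in_[idx] * in_[idx]; }

 private:
  const double* in_;
  double* out_;
};

int main() {
  std::vector<double> x = {1.0, 2.0, 3.0};
  std::vector<double> y(x.size());
  ForRangeCPU(static_cast<int64_t>(x.size()), SquareFunctor(x.data(), y.data()));
  for (double v : y) {
    std::printf("%g ", v);  // prints: 1 4 9
  }
  std::printf("\n");
  return 0;
}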
+ +#pragma once + +#include +#include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/core/dense_tensor.h" + +namespace pten { + +template +struct DigammaFunctor { + DigammaFunctor(const T* input, T* output, int64_t numel) + : input_(input), output_(output), numel_(numel) {} + + HOSTDEVICE void operator()(int64_t idx) const { + output_[idx] = Eigen::numext::digamma(input_[idx]); + } + + private: + const T* input_; + T* output_; + int64_t numel_; +}; + +template +void DigammaKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) { + out->mutable_data(ctx.GetPlace()); + auto* x_data = x.data(); + auto* out_data = out->data(); + auto numel = x.numel(); + paddle::platform::ForRange for_range(ctx, numel); + DigammaFunctor functor(x_data, out_data, numel); + for_range(functor); +} + +} // namespace pten diff --git a/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h b/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h index 87785a2b477..226cfd89b13 100644 --- a/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h +++ b/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h @@ -60,9 +60,11 @@ struct ReduceSumForMatmulGrad { DenseTensor* output, const std::vector& reduce_dims) { auto stream = dev_ctx.stream(); - kernels:: - TensorReduceFunctorImpl>( - input, output, kps::IdentityFunctor(), reduce_dims, stream); + kernels::TensorReduceFunctorImpl>( + dev_ctx, input, output, kps::IdentityFunctor(), reduce_dims, stream); } }; #endif diff --git a/paddle/pten/kernels/math_kernel.h b/paddle/pten/kernels/math_kernel.h index afef5866931..eb39b618eb6 100644 --- a/paddle/pten/kernels/math_kernel.h +++ b/paddle/pten/kernels/math_kernel.h @@ -14,7 +14,6 @@ limitations under the License. */ #pragma once -#include "paddle/pten/api/lib/utils/storage.h" #include "paddle/pten/core/dense_tensor.h" #include "paddle/pten/infermeta/binary.h" #include "paddle/pten/infermeta/unary.h" @@ -110,8 +109,9 @@ template DenseTensor Add(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y) { - auto out_meta = ElementwiseRawInferMeta(x.meta(), y.meta(), -1); - auto dense_out = pten::Empty(dev_ctx, std::move(out_meta)); + auto dense_out = pten::Empty(dev_ctx); + MetaTensor meta_out(&dense_out); + ElementwiseInferMeta(x, y, &meta_out); AddKernel(dev_ctx, x, y, &dense_out); return dense_out; } @@ -120,8 +120,9 @@ template DenseTensor Subtract(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y) { - auto out_meta = ElementwiseRawInferMeta(x.meta(), y.meta(), -1); - auto dense_out = pten::Empty(dev_ctx, std::move(out_meta)); + auto dense_out = pten::Empty(dev_ctx); + MetaTensor meta_out(&dense_out); + ElementwiseInferMeta(x, y, &meta_out); SubtractKernel(dev_ctx, x, y, &dense_out); return dense_out; } @@ -130,8 +131,9 @@ template DenseTensor Divide(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y) { - auto out_meta = ElementwiseRawInferMeta(x.meta(), y.meta(), -1); - auto dense_out = pten::Empty(dev_ctx, std::move(out_meta)); + auto dense_out = pten::Empty(dev_ctx); + MetaTensor meta_out(&dense_out); + ElementwiseInferMeta(x, y, &meta_out); DivideKernel(dev_ctx, x, y, &dense_out); return dense_out; } @@ -140,8 +142,9 @@ template DenseTensor Multiply(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y) { - auto out_meta = ElementwiseRawInferMeta(x.meta(), y.meta(), -1); - auto dense_out = pten::Empty(dev_ctx, std::move(out_meta)); + auto dense_out = pten::Empty(dev_ctx); + MetaTensor meta_out(&dense_out); + ElementwiseInferMeta(x, y, &meta_out); 
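The dense convenience wrappers in these kernel headers now share one idiom: create an empty DenseTensor, wrap it in a MetaTensor, let the InferMeta function write dims/dtype/layout through that wrapper, and only then run the compute kernel. The toy sketch below shows the shape of that out-parameter idiom with standalone types; ToyTensor, ToyMetaTensor, UnchangedInferMetaToy, and ReluKernelToy are all illustrative stand-ins, not pten classes.

#include <cstdint>
#include <cstdio>
#include <string>
#include <utility>
#include <vector>

// Toy output tensor: meta (dims/dtype) plus a data buffer.
struct ToyTensor {
  std::vector<int64_t> dims;
  std::string dtype;
  std::vector<float> data;
};

// Plays the MetaTensor role: a thin mutator view over the output's meta.
struct ToyMetaTensor {
  explicit ToyMetaTensor(ToyTensor* t) : t_(t) {}
  void set_dims(std::vector<int64_t> d) { t_->dims = std::move(d); }
  void set_dtype(std::string dt) { t_->dtype = std::move(dt); }

 private:
  ToyTensor* t_;
};

// InferMeta step: fills shape/dtype of the output, never touches data.
void UnchangedInferMetaToy(const ToyTensor& x, ToyMetaTensor* out) {
  out->set_dims(x.dims);
  out->set_dtype(x.dtype);
}

// Kernel step: allocates according to the inferred meta and computes values.
void ReluKernelToy(const ToyTensor& x, ToyTensor* out) {
  out->data.resize(x.data.size());
  for (size_t i = 0; i < x.data.size(); ++i) {
    out->data[i] = x.data[i] > 0.f ? x.data[i] : 0.f;
  }
}

int main() {
  ToyTensor x{{3}, "float32", {-1.f, 0.5f, 2.f}};
  ToyTensor out;                        // starts empty, like pten::Empty
  ToyMetaTensor meta_out(&out);
  UnchangedInferMetaToy(x, &meta_out);  // 1) infer meta into the output
  ReluKernelToy(x, &out);               // 2) then run the compute kernel
  std::printf("dims[0]=%lld dtype=%s\n",
              static_cast<long long>(out.dims[0]), out.dtype.c_str());
  return 0;
}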
MultiplyKernel(dev_ctx, x, y, &dense_out); return dense_out; } @@ -151,8 +154,9 @@ DenseTensor Mean(const Context& dev_ctx, const DenseTensor& x, const std::vector& axis, bool keep_dim) { - auto out_meta = ReduceInferMeta(x.meta(), axis, keep_dim); - auto dense_out = pten::Empty(dev_ctx, std::move(out_meta)); + auto dense_out = pten::Empty(dev_ctx); + MetaTensor meta_out(&dense_out); + ReduceInferMeta(x, axis, keep_dim, x.dtype(), &meta_out); MeanKernel(dev_ctx, x, axis, keep_dim, &dense_out); return dense_out; } @@ -163,9 +167,9 @@ DenseTensor Sum(const Context& dev_ctx, const std::vector& axis, DataType dtype, bool keep_dim) { - auto out_meta = SumInferMeta(x.meta(), axis, dtype, keep_dim); - auto dense_out = pten::Empty(dev_ctx, std::move(out_meta)); - + auto dense_out = pten::Empty(dev_ctx); + MetaTensor meta_out(&dense_out); + SumInferMeta(x, axis, dtype, keep_dim, &meta_out); SumKernel(dev_ctx, x, axis, dtype, keep_dim, &dense_out); return dense_out; } diff --git a/paddle/pten/kernels/matmul_kernel.h b/paddle/pten/kernels/matmul_kernel.h index f9cb2c3801c..e6b9302cadd 100644 --- a/paddle/pten/kernels/matmul_kernel.h +++ b/paddle/pten/kernels/matmul_kernel.h @@ -35,8 +35,9 @@ DenseTensor Matmul(const Context& dev_ctx, const DenseTensor& y, bool transpose_x, bool transpose_y) { - auto out_meta = MatmulInferMeta(x.meta(), y.meta(), transpose_x, transpose_y); - auto dense_out = Empty(dev_ctx, std::move(out_meta)); + auto dense_out = Empty(dev_ctx); + MetaTensor meta_out(&dense_out); + MatmulInferMeta(x, y, transpose_x, transpose_y, &meta_out); MatmulKernel(dev_ctx, x, y, transpose_x, transpose_y, &dense_out); return dense_out; } diff --git a/paddle/pten/kernels/reshape_kernel.cc b/paddle/pten/kernels/reshape_kernel.cc index a76dfb09a0e..c52d251582b 100644 --- a/paddle/pten/kernels/reshape_kernel.cc +++ b/paddle/pten/kernels/reshape_kernel.cc @@ -26,15 +26,18 @@ void ReshapeKernel(const Context& dev_ctx, const DenseTensor& x, const ScalarArray& shape, DenseTensor* out) { - auto out_meta = InferMetaFromVecValue(x.meta(), shape.GetData()); + MetaTensor meta_out(out); + InferMetaFromVecValue(x, shape.GetData(), &meta_out); if (x.initialized() && x.Holder() == out->Holder()) { - out->ResizeAndAllocate(out_meta.dims); + dev_ctx.Alloc(out); return; } - out->set_meta(out_meta); dev_ctx.Alloc(out); + // TODO(chenweihang): the output dims are overwrite after copying, + // here we need to use copy method that only copy data + auto dims = out->dims(); pten::Copy(dev_ctx, x, false, out); - out->Resize(out_meta.dims); + out->Resize(dims); out->ResetLoD(x.lod()); } diff --git a/paddle/pten/kernels/reshape_kernel.h b/paddle/pten/kernels/reshape_kernel.h index 293f6cd2baf..a5672ad6e5b 100644 --- a/paddle/pten/kernels/reshape_kernel.h +++ b/paddle/pten/kernels/reshape_kernel.h @@ -38,8 +38,9 @@ template DenseTensor Reshape(const Context& dev_ctx, const DenseTensor& x, const std::vector& shape) { - auto out_meta = InferMetaFromVecValue(x.meta(), shape); - auto dense_out = Empty(dev_ctx, std::move(out_meta)); + auto dense_out = Empty(dev_ctx); + MetaTensor meta_out(&dense_out); + InferMetaFromVecValue(x, shape, &meta_out); ReshapeKernel(dev_ctx, x, ScalarArray(shape), &dense_out); return dense_out; } diff --git a/paddle/pten/kernels/scale_kernel.h b/paddle/pten/kernels/scale_kernel.h index 1cd11f0b878..357bc70b40d 100644 --- a/paddle/pten/kernels/scale_kernel.h +++ b/paddle/pten/kernels/scale_kernel.h @@ -16,6 +16,7 @@ limitations under the License. 
*/ #include "paddle/pten/common/scalar.h" #include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/core/selected_rows.h" #include "paddle/pten/infermeta/unary.h" #include "paddle/pten/kernels/empty_kernel.h" namespace pten { @@ -28,14 +29,23 @@ void ScaleKernel(const Context& dev_ctx, bool bias_after_scale, DenseTensor* out); +template +void ScaleSR(const Context& dev_ctx, + const SelectedRows& x, + const Scalar& scale, + float bias, + bool bias_after_scale, + SelectedRows* out); + template DenseTensor Scale(const Context& dev_ctx, const DenseTensor& x, const Scalar& scale, float bias, bool bias_after_scale) { - auto out_meta = UnchangedInferMeta(x.meta()); - auto dense_out = pten::Empty(dev_ctx, std::move(out_meta)); + auto dense_out = pten::Empty(dev_ctx); + MetaTensor meta_out(&dense_out); + UnchangedInferMeta(x, &meta_out); ScaleKernel( dev_ctx, x, scale, bias, bias_after_scale, &dense_out); return dense_out; diff --git a/paddle/pten/kernels/selected_rows/scale_kernel.cc b/paddle/pten/kernels/selected_rows/scale_kernel.cc new file mode 100644 index 00000000000..8b29f1d6c53 --- /dev/null +++ b/paddle/pten/kernels/selected_rows/scale_kernel.cc @@ -0,0 +1,68 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/pten/kernels/scale_kernel.h" + +#include "paddle/pten/backends/cpu/cpu_context.h" +#include "paddle/pten/backends/gpu/gpu_context.h" +#include "paddle/pten/core/kernel_registry.h" + +// See Note [ Why still include the fluid headers? 
] +#include "paddle/pten/common/bfloat16.h" +namespace pten { + +template +void ScaleSR(const Context& dev_ctx, + const SelectedRows& x, + const Scalar& scale, + float bias, + bool bias_after_scale, + SelectedRows* out) { + if (x.value().data() != out->value().data()) { + out->set_rows(x.rows()); + out->set_height(x.height()); + } + pten::ScaleKernel( + dev_ctx, x.value(), scale, bias, bias_after_scale, out->mutable_value()); +} + +} // namespace pten + +PT_REGISTER_KERNEL(scale_sr, + CPU, + ALL_LAYOUT, + pten::ScaleSR, + float, + double, + pten::dtype::bfloat16, + uint8_t, + int8_t, + int16_t, + int, + int64_t) {} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PT_REGISTER_KERNEL(scale_sr, + GPU, + ALL_LAYOUT, + pten::ScaleSR, + float, + double, + pten::dtype::float16, + uint8_t, + int8_t, + int16_t, + int, + int64_t) {} +#endif diff --git a/paddle/pten/kernels/sign_kernel.h b/paddle/pten/kernels/sign_kernel.h index 304b640d2af..4161c76e471 100644 --- a/paddle/pten/kernels/sign_kernel.h +++ b/paddle/pten/kernels/sign_kernel.h @@ -25,8 +25,9 @@ void SignKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out); template DenseTensor Sign(const Context& dev_ctx, const DenseTensor& x) { - auto out_meta = UnchangedInferMeta(x.meta()); - auto dense_out = pten::Empty(dev_ctx, std::move(out_meta)); + auto dense_out = pten::Empty(dev_ctx); + MetaTensor meta_out(&dense_out); + UnchangedInferMeta(x, &meta_out); SignKernel(dev_ctx, x, &dense_out); return dense_out; } diff --git a/paddle/pten/kernels/sparse/CMakeLists.txt b/paddle/pten/kernels/sparse/CMakeLists.txt new file mode 100644 index 00000000000..3e4a968b7a8 --- /dev/null +++ b/paddle/pten/kernels/sparse/CMakeLists.txt @@ -0,0 +1,3 @@ + +set(SPARSE_KERNEL_DEPS dense_tensor sparse_coo_tensor sparse_csr_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils) +register_kernels(DEPS ${SPARSE_KERNEL_DEPS} SUB_DIR "sparse_kernel") diff --git a/paddle/pten/kernels/sparse/cpu/sparse_utils_kernel.cc b/paddle/pten/kernels/sparse/cpu/sparse_utils_kernel.cc new file mode 100644 index 00000000000..d3aac6ee7d2 --- /dev/null +++ b/paddle/pten/kernels/sparse/cpu/sparse_utils_kernel.cc @@ -0,0 +1,187 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
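For context on the ScaleSR kernel above: a SelectedRows value is essentially a set of row indices, a logical height, and a dense value block, so scaling only transforms the values while the row bookkeeping is copied through when input and output are distinct. Below is a standalone sketch of that idea with plain containers; ToySelectedRows and ScaleToySelectedRows are illustrative names, and the real kernel detects the in-place case by comparing value buffers rather than object addresses.

#include <cstdint>
#include <cstdio>
#include <vector>

// Toy stand-in for a SelectedRows-style container: `rows` says which rows of a
// conceptually tall tensor of `height` rows are actually stored in `values`.
struct ToySelectedRows {
  std::vector<int64_t> rows;
  int64_t height = 0;
  std::vector<float> values;  // rows.size() x width, stored densely
};

// Scale only the payload; copy the sparsity bookkeeping when not in-place.
void ScaleToySelectedRows(const ToySelectedRows& x, float scale, float bias,
                          bool bias_after_scale, ToySelectedRows* out) {
  if (out != &x) {
    out->rows = x.rows;
    out->height = x.height;
  }
  out->values.resize(x.values.size());
  for (size_t i = 0; i < x.values.size(); ++i) {
    out->values[i] = bias_after_scale ? x.values[i] * scale + bias
                                      : (x.values[i] + bias) * scale;
  }
}

int main() {
  ToySelectedRows x{{0, 4}, 8, {1.f, 2.f, 3.f, 4.f}};
  ToySelectedRows out;
  ScaleToySelectedRows(x, 2.f, 1.f, /*bias_after_scale=*/true, &out);
  std::printf("%g %g %g %g\n", out.values[0], out.values[1], out.values[2],
              out.values[3]);  // prints: 3 5 7 9
  return 0;
}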
*/ + +#include "paddle/pten/kernels/sparse/sparse_utils_kernel.h" +#include "paddle/pten/api/lib/utils/allocator.h" +#include "paddle/pten/backends/gpu/gpu_context.h" +#include "paddle/pten/core/kernel_registry.h" +#include "paddle/pten/core/tensor_meta.h" + +namespace pten { +namespace sparse { + +template +inline bool IsZero(const T* data, const size_t n) { + const T zero = static_cast(0); + for (size_t i = 0; i < n; i++) { + if (data[i] != zero) { + return false; + } + } + return true; +} + +// TODO(zhangkaihuo): implement a kernel to count the number of non-zero +// elements in tensor +template +inline int64_t GetNonZeroNum(const DenseTensor& dense, + const int64_t sparse_dim) { + const auto& dims = dense.dims(); + PADDLE_ENFORCE_GE( + dims.size(), + sparse_dim, + paddle::platform::errors::InvalidArgument( + "sparse_dim(%d) should be less than or equal to dense.dim(%d)", + sparse_dim, + dims.size())); + + auto dims_2d = flatten_to_2d(dims, sparse_dim); + const int rows = dims_2d[0]; + const int cols = dims_2d[1]; + + const T* data = dense.data(); + int64_t non_zero_num = 0; + for (int64_t i = 0; i < rows; i++) { + if (!IsZero(data + i * cols, cols)) { + non_zero_num = non_zero_num + 1; + } + } + return non_zero_num; +} + +template +void DenseToSparseCooKernel(const Context& dev_ctx, + const DenseTensor& x, + const int64_t sparse_dim, + SparseCooTensor* out) { + const T* x_data = x.data(); + const auto& x_dims = x.dims(); + + int64_t non_zero_num = GetNonZeroNum(x, sparse_dim); + + const auto place = dev_ctx.GetPlace(); + const auto values_dims = InferDenseDims(x_dims, sparse_dim, non_zero_num); + DenseTensorMeta indices_meta(DataType::INT64, + {sparse_dim, static_cast(non_zero_num)}, + DataLayout::NCHW); + DenseTensorMeta values_meta(x.meta().dtype, values_dims, x.meta().layout); + pten::DenseTensor indices = + pten::Empty(dev_ctx, std::move(indices_meta)); + pten::DenseTensor values = + pten::Empty(dev_ctx, std::move(values_meta)); + int64_t* indices_data = indices.mutable_data(place); + T* values_data = values.mutable_data(place); + + auto dims_2d = flatten_to_2d(x_dims, sparse_dim); + const int rows = dims_2d[0]; + const int cols = dims_2d[1]; + + int index = 0; + for (int i = 0; i < rows; i++) { + if (!IsZero(x_data + i * cols, cols)) { + int64_t sparse_index = i; + for (int64_t j = sparse_dim - 1; j >= 0; j--) { + indices_data[j * non_zero_num + index] = sparse_index % x_dims[j]; + sparse_index /= x_dims[j]; + } + memcpy(values_data + index * cols, x_data + i * cols, cols * sizeof(T)); + ++index; + } + } + out->SetMember(indices, values, x_dims, true); +} + +template +void SparseCsrToCooKernel(const Context& dev_ctx, + const SparseCsrTensor& x, + SparseCooTensor* out) { + const DDim& x_dims = x.dims(); + const int64_t non_zero_num = x.non_zero_cols().numel(); + const auto& csr_crows = x.non_zero_crows(); + const auto& csr_cols = x.non_zero_cols(); + const auto& csr_values = x.non_zero_elements(); + const int64_t* csr_crows_data = csr_crows.data(); + const int64_t* csr_cols_data = csr_cols.data(); + const T* csr_values_data = csr_values.data(); + + int64_t sparse_dim = 2; + if (x_dims.size() == 3) { + sparse_dim = 3; + } + const auto place = dev_ctx.GetPlace(); + DenseTensorMeta indices_meta( + DataType::INT64, {sparse_dim, non_zero_num}, DataLayout::NCHW); + DenseTensorMeta values_meta(x.dtype(), {non_zero_num}, x.layout()); + pten::DenseTensor indices = + pten::Empty(dev_ctx, std::move(indices_meta)); + pten::DenseTensor values = + pten::Empty(dev_ctx, std::move(values_meta)); + 
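The CPU DenseToSparseCooKernel above views the input as (rows, cols), where rows collapses the first sparse_dim axes, keeps the rows that contain any non-zero, and decodes each kept row index back into per-axis coordinates with repeated modulo/divide. The standalone sketch below reproduces that index arithmetic on plain vectors; DenseToCoo and ToyCoo are illustrative names, not the Paddle kernel.

#include <cstdint>
#include <cstdio>
#include <vector>

struct ToyCoo {
  std::vector<int64_t> indices;  // sparse_dim x nnz, row-major
  std::vector<float> values;     // nnz x cols
};

// Standalone sketch of the dense -> COO conversion above.
ToyCoo DenseToCoo(const std::vector<float>& data,
                  const std::vector<int64_t>& dims, int64_t sparse_dim) {
  int64_t rows = 1, cols = 1;
  for (int64_t i = 0; i < sparse_dim; ++i) rows *= dims[i];
  for (int64_t i = sparse_dim; i < static_cast<int64_t>(dims.size()); ++i) {
    cols *= dims[i];
  }

  // 1. keep only the collapsed rows that hold at least one non-zero
  std::vector<int64_t> kept;
  for (int64_t r = 0; r < rows; ++r) {
    for (int64_t c = 0; c < cols; ++c) {
      if (data[r * cols + c] != 0.f) {
        kept.push_back(r);
        break;
      }
    }
  }

  // 2. decode each kept row id into coordinates and copy its values
  ToyCoo out;
  const int64_t nnz = static_cast<int64_t>(kept.size());
  out.indices.assign(sparse_dim * nnz, 0);
  out.values.reserve(nnz * cols);
  for (int64_t k = 0; k < nnz; ++k) {
    int64_t rem = kept[k];
    for (int64_t j = sparse_dim - 1; j >= 0; --j) {
      out.indices[j * nnz + k] = rem % dims[j];
      rem /= dims[j];
    }
    for (int64_t c = 0; c < cols; ++c) {
      out.values.push_back(data[kept[k] * cols + c]);
    }
  }
  return out;
}

int main() {
  // 3x2 dense input with sparse_dim = 1: rows 0 and 2 are non-zero.
  ToyCoo coo = DenseToCoo({1, 0, 0, 0, 0, 5}, {3, 2}, 1);
  std::printf("nnz rows: %lld\n",
              static_cast<long long>(coo.values.size() / 2));
  return 0;
}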
int64_t* coo_indices = indices.mutable_data(place); + int64_t* batch_ptr = x_dims.size() == 2 ? nullptr : coo_indices; + int64_t* coo_rows_data = + x_dims.size() == 2 ? coo_indices : batch_ptr + non_zero_num; + int64_t* coo_cols_data = coo_rows_data + non_zero_num; + T* coo_values_data = values.mutable_data(place); + + int batch = x_dims.size() == 2 ? 1 : x_dims[0]; + int rows = x_dims.size() == 2 ? x_dims[0] : x_dims[1]; + + int index = 0; + for (int b = 0; b < batch; b++) { + for (int i = 0; i < rows; i++) { + for (int j = csr_crows_data[b * (rows + 1) + i]; + j < csr_crows_data[b * (rows + 1) + i + 1]; + j++) { + coo_rows_data[index] = i; + if (batch_ptr) { + batch_ptr[index] = b; + } + ++index; + } + } + } + + memcpy(coo_cols_data, csr_cols_data, sizeof(int64_t) * non_zero_num); + memcpy(coo_values_data, csr_values_data, sizeof(T) * non_zero_num); + out->SetMember(indices, values, x_dims, true); +} + +} // namespace sparse +} // namespace pten + +PT_REGISTER_KERNEL(dense_to_sparse_coo, + CPU, + ALL_LAYOUT, + pten::sparse::DenseToSparseCooKernel, + float, + double, + paddle::float16, + uint8_t, + int8_t, + int16_t, + int, + int64_t) {} + +PT_REGISTER_KERNEL(sparse_csr_to_coo, + CPU, + ALL_LAYOUT, + pten::sparse::SparseCsrToCooKernel, + float, + double, + paddle::float16, + uint8_t, + int8_t, + int16_t, + int, + int64_t) {} diff --git a/paddle/pten/kernels/sparse/gpu/sparse_utils_kernel.cu b/paddle/pten/kernels/sparse/gpu/sparse_utils_kernel.cu new file mode 100644 index 00000000000..eb9fa7a1696 --- /dev/null +++ b/paddle/pten/kernels/sparse/gpu/sparse_utils_kernel.cu @@ -0,0 +1,360 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include "paddle/pten/backends/gpu/gpu_context.h" +#include "paddle/pten/core/kernel_registry.h" +#include "paddle/pten/core/tensor_meta.h" +#include "paddle/pten/kernels/sparse/sparse_utils_kernel.h" + +namespace pten { +namespace sparse { + +template +inline __device__ bool DevIsZero(const T* data, const int64_t cols) { + const T zero = static_cast(0); + // TODO(zhangkaihuo): check the data is zero or not in parallen when cols > 1 + for (int64_t i = 0; i < cols; i++) { + if (data[i] != zero) { + return false; + } + } + return true; +} + +template +__global__ void GetNonZeroNums(const T* dense_data, + const int rows, + const int cols, + int* non_zero_num, + int* temp_indexs) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + __shared__ int counter; + if (threadIdx.x == 0) counter = 0; + __syncthreads(); + + for (int i = tid; i < rows; i += gridDim.x * blockDim.x) { + int index = -1; + // TODO(zhangkaihuo): when cols=1, vectorization can be used + if (!DevIsZero(dense_data + i * cols, cols)) { + // use reductions? 
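The CPU SparseCsrToCooKernel above mainly expands the compressed row pointer (crows) back into one explicit row index per stored element; the column indices and values carry over unchanged. The sketch below shows that expansion for a single batch; CrowsToRows is an illustrative name, not the Paddle code.

#include <cstdint>
#include <cstdio>
#include <vector>

// Standalone sketch: entry j belongs to row i iff crows[i] <= j < crows[i+1],
// so walking the row pointer emits one explicit row index per element.
std::vector<int64_t> CrowsToRows(const std::vector<int64_t>& crows) {
  std::vector<int64_t> rows;
  const int64_t num_rows = static_cast<int64_t>(crows.size()) - 1;
  for (int64_t i = 0; i < num_rows; ++i) {
    for (int64_t j = crows[i]; j < crows[i + 1]; ++j) {
      rows.push_back(i);
    }
  }
  return rows;  // these become the COO row coordinates
}

int main() {
  // crows = {0, 2, 2, 3}: row 0 holds two elements, row 1 none, row 2 one.
  for (int64_t r : CrowsToRows({0, 2, 2, 3})) {
    std::printf("%lld ", static_cast<long long>(r));
  }
  std::printf("\n");  // prints: 0 0 2
  return 0;
}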
+ atomicAdd(&counter, 1); + index = i; + } + temp_indexs[i] = index; + } + __syncthreads(); + if (threadIdx.x == 0) { + atomicAdd(non_zero_num, counter); + } +} + +template +__global__ void GetNonZeroElementsAndIndices(const T* dense_data, + const int64_t sparse_dim, + const int64_t cols, + const int64_t* x_dims, + const int non_zero_num, + const int* indexs, + int64_t* indices, + T* sparse_data) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + for (int i = tid; i < non_zero_num; i += gridDim.x * blockDim.x) { + int64_t sparse_index = indexs[i]; + int64_t x_index = sparse_index; + for (int64_t j = sparse_dim - 1; j >= 0; j--) { + indices[j * non_zero_num + i] = sparse_index % x_dims[j]; + sparse_index /= x_dims[j]; + } + + for (int j = 0; j < cols; j++) { + sparse_data[i * cols + j] = dense_data[x_index * cols + j]; + } + } +} + +template +void GetGpuLaunchConfig1D(const Context& dev_ctx, + const int64_t n, + int* grid_size, + int* block_size) { + const int MAX_BLOCK_DIM = dev_ctx.GetMaxThreadsPerBlock(); + const int MAX_GRID_DIM = dev_ctx.GetMaxPhysicalThreadCount() / MAX_BLOCK_DIM; + *block_size = (n >= MAX_BLOCK_DIM) ? MAX_BLOCK_DIM + : (1 << static_cast(std::log2(n))); + *grid_size = n / *block_size; + *grid_size = (*grid_size >= MAX_GRID_DIM) ? MAX_GRID_DIM : *grid_size; +} + +template +void DenseToSparseCooKernel(const Context& dev_ctx, + const DenseTensor& x, + const int64_t sparse_dim, + SparseCooTensor* out) { + const T* x_data = x.data(); + const auto& x_dims = x.dims(); + auto dims_2d = flatten_to_2d(x_dims, sparse_dim); + const int rows = dims_2d[0]; + const int cols = dims_2d[1]; + auto nums_meta = + pten::DenseTensorMeta(DataType::INT32, {1}, pten::DataLayout::NCHW); + DenseTensor nums = + pten::Empty(dev_ctx, std::move(nums_meta)); + auto x_dims_meta = + pten::DenseTensorMeta(DataType::INT64, + {static_cast(x_dims.size())}, + pten::DataLayout::NCHW); + DenseTensor d_x_dims = + pten::Empty(dev_ctx, std::move(x_dims_meta)); + + const auto place = dev_ctx.GetPlace(); + + // 1. get numbers of non zero elements, and get the index of non zero elements + int* nums_ptr = nums.mutable_data(place); +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + hipMemsetAsync(nums_ptr, 0, sizeof(int), dev_ctx.stream())); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + cudaMemsetAsync(nums_ptr, 0, sizeof(int), dev_ctx.stream())); +#endif + int grid_size = 1, block_size = 1; + GetGpuLaunchConfig1D(dev_ctx, rows, &grid_size, &block_size); + + auto temp_indexs_meta = + pten::DenseTensorMeta(DataType::INT32, {rows}, pten::DataLayout::NCHW); + DenseTensor temp_indexs = + pten::Empty(dev_ctx, std::move(temp_indexs_meta)); + int* temp_indexs_ptr = temp_indexs.mutable_data(place); + GetNonZeroNums<<>>( + x_data, rows, cols, nums_ptr, temp_indexs_ptr); +#ifdef PADDLE_WITH_HIP + thrust::remove(thrust::hip::par.on(dev_ctx.stream()), +#else + thrust::remove(thrust::cuda::par.on(dev_ctx.stream()), +#endif + temp_indexs_ptr, + temp_indexs_ptr + rows, + -1); + + // 2. 
copy non_zero_num to host, copy x_dims to device + int non_zero_num = 0; +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync(&non_zero_num, + nums_ptr, + sizeof(int), + hipMemcpyDeviceToHost, + dev_ctx.stream())); +#else + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(&non_zero_num, + nums_ptr, + sizeof(int), + cudaMemcpyDeviceToHost, + dev_ctx.stream())); +#endif + +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + hipMemcpyAsync(d_x_dims.mutable_data(place), + x_dims.Get(), + x_dims.size() * sizeof(x_dims[0]), + hipMemcpyHostToDevice, + dev_ctx.stream())); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + cudaMemcpyAsync(d_x_dims.mutable_data(place), + x_dims.Get(), + x_dims.size() * sizeof(x_dims[0]), + cudaMemcpyHostToDevice, + dev_ctx.stream())); +#endif + + dev_ctx.Wait(); // wait the copy + + const auto values_dims = InferDenseDims(x_dims, sparse_dim, non_zero_num); + DenseTensorMeta indices_meta(DataType::INT64, + {sparse_dim, static_cast(non_zero_num)}, + DataLayout::NCHW); + DenseTensorMeta values_meta(x.meta().dtype, values_dims, x.meta().layout); + pten::DenseTensor indices( + pten::make_intrusive( + dev_ctx.GetPlace()), + std::move(indices_meta)); + pten::DenseTensor values( + pten::make_intrusive( + dev_ctx.GetPlace()), + std::move(values_meta)); + int64_t* indices_data = indices.mutable_data(place); + T* sparse_data = values.mutable_data(place); + + // 3. calc indices by indexs and get values by indexs + GetGpuLaunchConfig1D(dev_ctx, non_zero_num, &grid_size, &block_size); + GetNonZeroElementsAndIndices<<>>( + x_data, + sparse_dim, + cols, + d_x_dims.data(), + non_zero_num, + temp_indexs_ptr, + indices_data, + sparse_data); + out->SetMember(indices, values, x_dims, true); +} + +__global__ void GetBatchSizes(const int64_t* crows, + const int rows, + const int batchs, + int* batch_sizes) { + const int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < batchs) { + batch_sizes[tid] = crows[tid * (rows + 1) + rows]; + } +} + +__global__ void ConvertCsrCrowsToCooRows(const int64_t* crows_ptr, + const int* crows_offsets, + int64_t* rows_ptr, + int64_t* batch_ptr, + const int rows) { + const int b = blockIdx.y; + const int64_t offset = crows_offsets ? crows_offsets[b] : 0; + const int tid = threadIdx.x + blockIdx.x * blockDim.x; + for (int i = tid; i < rows; i += gridDim.x * blockDim.x) { + for (int j = crows_ptr[b * (rows + 1) + i]; + j < crows_ptr[b * (rows + 1) + i + 1]; + j++) { + rows_ptr[offset + j] = i; + if (batch_ptr) { + batch_ptr[offset + j] = b; + } + } + } +} + +template +void SparseCsrToCooKernel(const Context& dev_ctx, + const SparseCsrTensor& x, + SparseCooTensor* out) { + const DDim& x_dims = x.dims(); + const int64_t non_zero_num = x.non_zero_cols().numel(); + const auto& csr_crows = x.non_zero_crows(); + const auto& csr_cols = x.non_zero_cols(); + const auto& csr_values = x.non_zero_elements(); + const int64_t* csr_crows_data = csr_crows.data(); + const int64_t* csr_cols_data = csr_cols.data(); + const T* csr_values_data = csr_values.data(); + + int64_t sparse_dim = 2; + if (x_dims.size() == 3) { + sparse_dim = 3; + } + int batchs = x_dims.size() == 2 ? 1 : x_dims[0]; + int rows = x_dims.size() == 2 ? 
x_dims[0] : x_dims[1]; + + const auto place = dev_ctx.GetPlace(); + DenseTensorMeta indices_meta( + DataType::INT64, {sparse_dim, non_zero_num}, DataLayout::NCHW); + DenseTensorMeta values_meta(x.dtype(), {non_zero_num}, x.layout()); + DenseTensorMeta offsets_meta(DataType::INT32, {batchs}, DataLayout::NCHW); + DenseTensor indices = + pten::Empty(dev_ctx, std::move(indices_meta)); + DenseTensor values = pten::Empty(dev_ctx, std::move(values_meta)); + DenseTensor offsets = + pten::Empty(dev_ctx, std::move(offsets_meta)); + int64_t* coo_indices = indices.mutable_data(place); + int64_t* batch_ptr = x_dims.size() == 2 ? nullptr : coo_indices; + int64_t* coo_rows_data = + x_dims.size() == 2 ? coo_indices : batch_ptr + non_zero_num; + int64_t* coo_cols_data = coo_rows_data + non_zero_num; + int* offsets_ptr = batchs == 1 ? nullptr : offsets.mutable_data(place); + T* coo_values_data = values.mutable_data(place); + + int grid_size = 1, block_size = 1; + if (batchs > 1) { + GetGpuLaunchConfig1D(dev_ctx, batchs, &grid_size, &block_size); + GetBatchSizes<<>>( + csr_crows_data, rows, batchs, offsets_ptr); + +#ifdef PADDLE_WITH_HIP + thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()), +#else + thrust::exclusive_scan(thrust::cuda::par.on(dev_ctx.stream()), +#endif + offsets_ptr, + offsets_ptr + batchs, + offsets_ptr); + } + + GetGpuLaunchConfig1D(dev_ctx, rows, &grid_size, &block_size); + dim3 grids(grid_size, batchs, 1); + ConvertCsrCrowsToCooRows<<>>( + csr_crows_data, offsets_ptr, coo_rows_data, batch_ptr, rows); + +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync(coo_cols_data, + csr_cols_data, + sizeof(int64_t) * non_zero_num, + hipMemcpyDeviceToDevice, + dev_ctx.stream())); + PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync(coo_values_data, + csr_values_data, + sizeof(T) * non_zero_num, + hipMemcpyDeviceToDevice, + dev_ctx.stream())); +#else + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(coo_cols_data, + csr_cols_data, + sizeof(int64_t) * non_zero_num, + cudaMemcpyDeviceToDevice, + dev_ctx.stream())); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(coo_values_data, + csr_values_data, + sizeof(T) * non_zero_num, + cudaMemcpyDeviceToDevice, + dev_ctx.stream())); +#endif + + out->SetMember(indices, values, x_dims, true); +} + +} // namespace sparse +} // namespace pten + +PT_REGISTER_KERNEL(dense_to_sparse_coo, + GPU, + ALL_LAYOUT, + pten::sparse::DenseToSparseCooKernel, + float, + double, + pten::dtype::float16, + uint8_t, + int8_t, + int16_t, + int, + int64_t) {} + +PT_REGISTER_KERNEL(sparse_csr_to_coo, + GPU, + ALL_LAYOUT, + pten::sparse::SparseCsrToCooKernel, + float, + double, + pten::dtype::float16, + uint8_t, + int8_t, + int16_t, + int, + int64_t) {} diff --git a/paddle/pten/kernels/sparse/sparse_utils_kernel.h b/paddle/pten/kernels/sparse/sparse_utils_kernel.h new file mode 100644 index 00000000000..c353caedf31 --- /dev/null +++ b/paddle/pten/kernels/sparse/sparse_utils_kernel.h @@ -0,0 +1,76 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/core/sparse_coo_tensor.h" +#include "paddle/pten/core/sparse_csr_tensor.h" +#include "paddle/pten/kernels/empty_kernel.h" + +namespace pten { +namespace sparse { + +inline const DDim InferDenseDims(const DDim& x_dims, + const int64_t sparse_dim, + const int64_t non_zero_num) { + auto dense_dim = x_dims.size() - sparse_dim; + DDim values_dims; + if (dense_dim) { + std::vector dense_dim_vec(dense_dim + 1); + dense_dim_vec[0] = non_zero_num; + memcpy(&dense_dim_vec[1], + x_dims.Get() + sparse_dim, + dense_dim * sizeof(x_dims[0])); + values_dims = pten::framework::make_ddim(dense_dim_vec); + } else { + values_dims = pten::framework::make_ddim({non_zero_num}); + } + return values_dims; +} + +template +void DenseToSparseCooKernel(const Context& dev_ctx, + const DenseTensor& x, + const int64_t sparse_dim, + SparseCooTensor* out); + +template +SparseCooTensor DenseToSparseCoo(const Context& dev_ctx, + const DenseTensor& x, + const int64_t sparse_dim) { + DenseTensor indices = pten::Empty(dev_ctx); + DenseTensor values = pten::Empty(dev_ctx); + SparseCooTensor coo(indices, values, x.dims()); + DenseToSparseCooKernel(dev_ctx, x, sparse_dim, &coo); + return coo; +} + +template +void SparseCsrToCooKernel(const Context& dev_ctx, + const SparseCsrTensor& x, + SparseCooTensor* out); + +template +SparseCooTensor SparseCsrToCoo(const Context& dev_ctx, + const SparseCsrTensor& x) { + DenseTensor indices = pten::Empty(dev_ctx); + DenseTensor values = pten::Empty(dev_ctx); + SparseCooTensor coo(indices, values, x.dims()); + SparseCsrToCooKernel(dev_ctx, x, &coo); + return coo; +} + +} // namespace sparse +} // namespace pten diff --git a/paddle/pten/kernels/transfer_layout_kernel.cc b/paddle/pten/kernels/transfer_layout_kernel.cc new file mode 100644 index 00000000000..c21ab7c304d --- /dev/null +++ b/paddle/pten/kernels/transfer_layout_kernel.cc @@ -0,0 +1,77 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/pten/kernels/transfer_layout_kernel.h" + +#include "paddle/pten/api/ext/dispatch.h" +#include "paddle/pten/backends/all_context.h" +#include "paddle/pten/core/kernel_registry.h" +#include "paddle/pten/kernels/funcs/transpose.h" + +namespace pten { + +std::vector GetAxis(const DataLayout& from, const DataLayout& to) { + PADDLE_ENFORCE_NE( + from, + to, + pten::errors::InvalidArgument( + "Layout transform should transform between different layout.")); + if (from == DataLayout::NCHW && to == DataLayout::NHWC) { + return {0, 2, 3, 1}; + } else if (from == DataLayout::NHWC && to == DataLayout::NCHW) { + return {0, 3, 1, 2}; + } else { + PADDLE_THROW( + pten::errors::InvalidArgument("Unsupported layout transform.")); + } +} + +template +void CastDataLayout(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& axis, + DenseTensor* out) { + math::Transpose trans4; + trans4(dev_ctx, x, out, axis); +} + +template +void TransferLayoutKernel(const Context& dev_ctx, + const DenseTensor& x, + DataLayout dst_layout, + DenseTensor* out) { + auto src_dim = x.dims(); + + auto axis = GetAxis(x.layout(), dst_layout); + + std::vector dst_dim; + dst_dim.resize(axis.size()); + for (size_t i = 0; i < axis.size(); i++) { + dst_dim[i] = src_dim[axis[i]]; + } + + out->ResizeAndAllocate(framework::make_ddim(dst_dim)); + + PD_VISIT_ALL_TYPES(x.dtype(), "CastDataLayout", ([&] { + CastDataLayout(dev_ctx, x, axis, out); + })); +} + +} // namespace pten + +PT_REGISTER_GENERAL_KERNEL(pten_transfer_layout, + CPU, + ALL_LAYOUT, + pten::TransferLayoutKernel, + ALL_DTYPE) {} diff --git a/paddle/pten/kernels/transfer_layout_kernel.h b/paddle/pten/kernels/transfer_layout_kernel.h new file mode 100644 index 00000000000..24854842e8b --- /dev/null +++ b/paddle/pten/kernels/transfer_layout_kernel.h @@ -0,0 +1,43 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/infermeta/unary.h" +#include "paddle/pten/kernels/empty_kernel.h" + +namespace pten { + +template +void TransferLayoutKernel(const Context& dev_ctx, + const DenseTensor& x, + DataLayout dst_layout, + DenseTensor* out); + +template +DenseTensor TransferLayout(const Context& dev_ctx, + const DenseTensor& x, + DataLayout dst_layout) { + pten::DenseTensor dense_out( + pten::make_intrusive( + dev_ctx.GetPlace()), + {x.dtype(), x.dims(), dst_layout}); + MetaTensor meta_out(&dense_out); + TransferLayoutInferMeta(x, dst_layout, &meta_out); + TransferLayoutKernel(dev_ctx, x, dst_layout, &dense_out); + return dense_out; +} + +} // namespace pten diff --git a/paddle/pten/kernels/xpu/cast_kernel.cc b/paddle/pten/kernels/xpu/cast_kernel.cc index fc1ba021e22..027aa4f5b45 100644 --- a/paddle/pten/kernels/xpu/cast_kernel.cc +++ b/paddle/pten/kernels/xpu/cast_kernel.cc @@ -14,11 +14,13 @@ #include "paddle/pten/kernels/cast_kernel.h" -#include "paddle/fluid/platform/device/xpu/xpu_header.h" #include "paddle/pten/backends/xpu/xpu_context.h" +#include "paddle/pten/common/float16.h" +#include "paddle/pten/core/enforce.h" #include "paddle/pten/core/kernel_registry.h" -#include "paddle/pten/core/enforce.h" +// See Note [ Why still include the fluid headers? ] +#include "paddle/fluid/platform/device/xpu/xpu_header.h" namespace pten { @@ -28,7 +30,7 @@ void CastKernel(const Context& dev_ctx, DataType out_dtype, DenseTensor* out) { using XPUInTDType = typename XPUTypeTrait::Type; - using float16 = typename XPUTypeTrait::Type; + using float16 = typename XPUTypeTrait::Type; auto* in_data = x.data(); auto numel = x.numel(); @@ -47,7 +49,7 @@ void CastKernel(const Context& dev_ctx, dev_ctx.x_context(), reinterpret_cast(in_data), reinterpret_cast( - out->mutable_data(dev_ctx.GetPlace())), + out->mutable_data(dev_ctx.GetPlace())), numel); break; case pten::DataType::INT64: @@ -72,7 +74,7 @@ void CastKernel(const Context& dev_ctx, numel); break; default: - PADDLE_THROW(platform::errors::Unavailable( + PADDLE_THROW(pten::errors::Unavailable( "Not supported cast %d -> %d", x.dtype(), out_dtype)); } @@ -90,7 +92,7 @@ PT_REGISTER_KERNEL(cast, pten::CastKernel, int32_t, float, - pten::platform::float16, + pten::dtype::float16, int64_t, bool) { kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); diff --git a/paddle/pten/kernels/xpu/copy_kernel.cc b/paddle/pten/kernels/xpu/copy_kernel.cc index 56ad19f0cc3..f27705ca112 100644 --- a/paddle/pten/kernels/xpu/copy_kernel.cc +++ b/paddle/pten/kernels/xpu/copy_kernel.cc @@ -16,7 +16,7 @@ limitations under the License. */ #include "paddle/pten/backends/xpu/xpu_context.h" #include "paddle/pten/common/data_type.h" -#include "paddle/pten/core/convert_utils.h" +#include "paddle/pten/core/compat/convert_utils.h" #include "paddle/pten/core/kernel_registry.h" // See Note [ Why still include the fluid headers? 
] diff --git a/paddle/pten/kernels/xpu/full_kernel.cc b/paddle/pten/kernels/xpu/full_kernel.cc index 71d2b8e3add..cf6befac023 100644 --- a/paddle/pten/kernels/xpu/full_kernel.cc +++ b/paddle/pten/kernels/xpu/full_kernel.cc @@ -16,9 +16,15 @@ #include "paddle/pten/api/ext/dispatch.h" #include "paddle/pten/backends/xpu/xpu_context.h" +#include "paddle/pten/common/bfloat16.h" +#include "paddle/pten/common/complex.h" +#include "paddle/pten/common/float16.h" #include "paddle/pten/common/scalar.h" #include "paddle/pten/core/kernel_registry.h" +// See Note [ Why still include the fluid headers? ] +#include "paddle/fluid/memory/memcpy.h" + namespace pten { template @@ -64,7 +70,7 @@ void FullLikeKernel(const Context& dev_ctx, using XPUInTDType = typename XPUTypeTrait::Type; using CommonType = typename std::common_type< float, - typename std::conditional::value, + typename std::conditional::value, float, T>::type>::type; @@ -118,10 +124,10 @@ PT_REGISTER_KERNEL(full, int, int64_t, bool, - pten::platform::float16, - pten::platform::bfloat16, - pten::platform::complex, - pten::platform::complex) {} + pten::dtype::float16, + pten::dtype::bfloat16, + pten::dtype::complex, + pten::dtype::complex) {} PT_REGISTER_KERNEL(full_like, XPU, @@ -130,4 +136,4 @@ PT_REGISTER_KERNEL(full_like, float, int, int64_t, - pten::platform::float16) {} + pten::dtype::float16) {} diff --git a/paddle/pten/kernels/xpu/scale_kernel.cc b/paddle/pten/kernels/xpu/scale_kernel.cc index 116cd63f876..25fb0860446 100644 --- a/paddle/pten/kernels/xpu/scale_kernel.cc +++ b/paddle/pten/kernels/xpu/scale_kernel.cc @@ -18,7 +18,7 @@ #include "paddle/pten/backends/xpu/xpu_context.h" #include "paddle/pten/common/data_type.h" #include "paddle/pten/common/float16.h" -#include "paddle/pten/core/convert_utils.h" +#include "paddle/pten/core/compat/convert_utils.h" #include "paddle/pten/core/kernel_registry.h" namespace pten { diff --git a/paddle/pten/ops/compat/cast_sig.cc b/paddle/pten/ops/compat/cast_sig.cc new file mode 100644 index 00000000000..e05ca88aaf3 --- /dev/null +++ b/paddle/pten/ops/compat/cast_sig.cc @@ -0,0 +1,25 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/pten/core/compat/op_utils.h" + +namespace pten { + +KernelSignature CastOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("cast", {"X"}, {"out_dtype"}, {"Out"}); +} + +} // namespace pten + +PT_REGISTER_ARG_MAPPING_FN(cast, pten::CastOpArgumentMapping); diff --git a/paddle/pten/ops/compat/concat_sig.cc b/paddle/pten/ops/compat/concat_sig.cc new file mode 100644 index 00000000000..1352cc7eaca --- /dev/null +++ b/paddle/pten/ops/compat/concat_sig.cc @@ -0,0 +1,28 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/pten/core/compat/op_utils.h" + +namespace pten { + +KernelSignature ConcatOpArgumentMapping(const ArgumentMappingContext& ctx) { + if (ctx.HasInput("AxisTensor")) { + return KernelSignature("concat", {"X"}, {"AxisTensor"}, {"Out"}); + } + return KernelSignature("concat", {"X"}, {"axis"}, {"Out"}); +} + +} // namespace pten + +PT_REGISTER_ARG_MAPPING_FN(concat, pten::ConcatOpArgumentMapping); diff --git a/paddle/pten/ops/compat/elementwise_sig.cc b/paddle/pten/ops/compat/elementwise_sig.cc new file mode 100644 index 00000000000..4c14a5d139e --- /dev/null +++ b/paddle/pten/ops/compat/elementwise_sig.cc @@ -0,0 +1,83 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/pten/core/compat/op_utils.h" + +namespace pten { + +KernelSignature ElementwiseAddOpArgumentMapping( + const ArgumentMappingContext& ctx) { + int axis = paddle::any_cast(ctx.Attr("axis")); + if (ctx.IsDenseTensorInput("X")) { + if (axis == -1) { + return KernelSignature("add", {"X", "Y"}, {}, {"Out"}); + } + return KernelSignature("add_raw", {"X", "Y"}, {"axis"}, {"Out"}); + } + return KernelSignature("unregistered", {}, {}, {}); +} + +KernelSignature ElementwiseSubOpArgumentMapping( + const ArgumentMappingContext& ctx) { + int axis = paddle::any_cast(ctx.Attr("axis")); + if (ctx.IsDenseTensorInput("X")) { + if (axis == -1) { + return KernelSignature("subtract", {"X", "Y"}, {}, {"Out"}); + } + return KernelSignature("subtract_raw", {"X", "Y"}, {"axis"}, {"Out"}); + } + return KernelSignature("unregistered", {}, {}, {}); +} + +KernelSignature ElementwiseMulOpArgumentMapping( + const ArgumentMappingContext& ctx) { + int axis = paddle::any_cast(ctx.Attr("axis")); + if (ctx.IsDenseTensorInput("X")) { + if (axis == -1) { + return KernelSignature("multiply", {"X", "Y"}, {}, {"Out"}); + } + return KernelSignature("multiply_raw", {"X", "Y"}, {"axis"}, {"Out"}); + } + return KernelSignature("unregistered", {}, {}, {}); +} + +KernelSignature ElementwiseDivOpArgumentMapping( + const ArgumentMappingContext& ctx) { + int axis = paddle::any_cast(ctx.Attr("axis")); + if (ctx.IsDenseTensorInput("X")) { + if (axis == -1) { + return KernelSignature("divide", {"X", "Y"}, {}, {"Out"}); + } + return KernelSignature("divide_raw", {"X", "Y"}, {"axis"}, {"Out"}); + } + return KernelSignature("unregistered", {}, {}, {}); +} + +} // namespace pten + +PT_REGISTER_BASE_KERNEL_NAME(elementwise_add, add_raw); +PT_REGISTER_BASE_KERNEL_NAME(elementwise_sub, subtract_raw); +PT_REGISTER_BASE_KERNEL_NAME(elementwise_mul, muliply_raw); +PT_REGISTER_BASE_KERNEL_NAME(elementwise_div, 
divide_raw); +PT_REGISTER_BASE_KERNEL_NAME(elementwise_add_grad, add_grad); +PT_REGISTER_BASE_KERNEL_NAME(elementwise_sub_grad, subtract_grad); + +PT_REGISTER_ARG_MAPPING_FN(elementwise_add, + pten::ElementwiseAddOpArgumentMapping); +PT_REGISTER_ARG_MAPPING_FN(elementwise_sub, + pten::ElementwiseSubOpArgumentMapping); +PT_REGISTER_ARG_MAPPING_FN(elementwise_mul, + pten::ElementwiseMulOpArgumentMapping); +PT_REGISTER_ARG_MAPPING_FN(elementwise_div, + pten::ElementwiseDivOpArgumentMapping); diff --git a/paddle/pten/ops/compat/empty_sig.cc b/paddle/pten/ops/compat/empty_sig.cc new file mode 100644 index 00000000000..c74f6106981 --- /dev/null +++ b/paddle/pten/ops/compat/empty_sig.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/pten/core/compat/op_utils.h" + +namespace pten { + +KernelSignature EmptyOpArgumentMapping(const ArgumentMappingContext& ctx) { + if (ctx.HasInput("ShapeTensor")) { + return KernelSignature("empty", {}, {"ShapeTensor"}, {"Out"}); + } else if (ctx.InputSize("ShapeTensorList") > 0) { + return KernelSignature("empty", {}, {"ShapeTensorList"}, {"Out"}); + } else { + return KernelSignature("empty", {}, {"shape"}, {"Out"}); + } +} + +} // namespace pten + +PT_REGISTER_ARG_MAPPING_FN(empty, pten::EmptyOpArgumentMapping); diff --git a/paddle/pten/ops/compat/fill_any_like_sig.cc b/paddle/pten/ops/compat/fill_any_like_sig.cc new file mode 100644 index 00000000000..81065d0c8ae --- /dev/null +++ b/paddle/pten/ops/compat/fill_any_like_sig.cc @@ -0,0 +1,28 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/pten/core/compat/op_utils.h" + +namespace pten { + +KernelSignature FillAnyLikeOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("full_like", {}, {"value"}, {"Out"}); +} + +} // namespace pten + +PT_REGISTER_BASE_KERNEL_NAME(fill_any_like, full_like); + +PT_REGISTER_ARG_MAPPING_FN(fill_any_like, pten::FillAnyLikeOpArgumentMapping); diff --git a/paddle/pten/ops/compat/fill_constant_sig.cc b/paddle/pten/ops/compat/fill_constant_sig.cc new file mode 100644 index 00000000000..73dee270f70 --- /dev/null +++ b/paddle/pten/ops/compat/fill_constant_sig.cc @@ -0,0 +1,73 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/pten/core/compat/op_utils.h" + +namespace pten { + +// we have to return every specific KernelSignature for infrt now +KernelSignature FillConstantOpArgumentMapping( + const ArgumentMappingContext& ctx) { + if (ctx.IsDenseTensorOutput("Out")) { + if (ctx.HasInput("ShapeTensor")) { + if (ctx.HasInput("ValueTensor")) { + return KernelSignature( + "full", {}, {"ShapeTensor", "ValueTensor"}, {"Out"}); + } else { + const auto& str_value = + paddle::any_cast(ctx.Attr("str_value")); + if (str_value.empty()) { + return KernelSignature("full", {}, {"ShapeTensor", "value"}, {"Out"}); + } else { + return KernelSignature( + "full", {}, {"ShapeTensor", "str_value"}, {"Out"}); + } + } + } else if (ctx.InputSize("ShapeTensorList") > 0) { + if (ctx.HasInput("ValueTensor")) { + return KernelSignature( + "full", {}, {"ShapeTensorList", "ValueTensor"}, {"Out"}); + } else { + const auto& str_value = + paddle::any_cast(ctx.Attr("str_value")); + if (str_value.empty()) { + return KernelSignature( + "full", {}, {"ShapeTensorList", "value"}, {"Out"}); + } else { + return KernelSignature( + "full", {}, {"ShapeTensorList", "str_value"}, {"Out"}); + } + } + } else { + if (ctx.HasInput("ValueTensor")) { + return KernelSignature("full", {}, {"shape", "ValueTensor"}, {"Out"}); + } else { + const auto& str_value = + paddle::any_cast(ctx.Attr("str_value")); + if (str_value.empty()) { + return KernelSignature("full", {}, {"shape", "value"}, {"Out"}); + } else { + return KernelSignature("full", {}, {"shape", "str_value"}, {"Out"}); + } + } + } + } + return KernelSignature("unregistered", {}, {}, {}); +} + +} // namespace pten + +PT_REGISTER_BASE_KERNEL_NAME(fill_constant, full); + +PT_REGISTER_ARG_MAPPING_FN(fill_constant, pten::FillConstantOpArgumentMapping); diff --git a/paddle/pten/ops/compat/flatten_sig.cc b/paddle/pten/ops/compat/flatten_sig.cc new file mode 100644 index 00000000000..1ef2977bf88 --- /dev/null +++ b/paddle/pten/ops/compat/flatten_sig.cc @@ -0,0 +1,37 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/pten/core/compat/op_utils.h" + +namespace pten { + +KernelSignature FlattenOpArgumentMapping(const ArgumentMappingContext& ctx) { + if (ctx.HasOutput("XShape")) { + return KernelSignature("flatten_with_xshape", + {"X"}, + {"start_axis", "stop_axis"}, + {"Out", "XShape"}); + } else { + return KernelSignature( + "flatten", {"X"}, {"start_axis", "stop_axis"}, {"Out"}); + } +} + +} // namespace pten + +PT_REGISTER_BASE_KERNEL_NAME(flatten_contiguous_range, flatten); +PT_REGISTER_BASE_KERNEL_NAME(flatten_contiguous_range_grad, flatten_grad); + +PT_REGISTER_ARG_MAPPING_FN(flatten_contiguous_range, + pten::FlattenOpArgumentMapping); diff --git a/paddle/pten/ops/compat/matmul_sig.cc b/paddle/pten/ops/compat/matmul_sig.cc new file mode 100644 index 00000000000..67ef91b429e --- /dev/null +++ b/paddle/pten/ops/compat/matmul_sig.cc @@ -0,0 +1,22 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/pten/core/compat/op_utils.h" + +namespace pten {} // namespace pten + +PT_REGISTER_BASE_KERNEL_NAME(matmul_v2, matmul); +PT_REGISTER_BASE_KERNEL_NAME(matmul_v2_grad, matmul_grad); +PT_REGISTER_BASE_KERNEL_NAME(matmul_v2_grad_grad, matmul_double_grad); +PT_REGISTER_BASE_KERNEL_NAME(matmul_v2_triple_grad, matmul_triple_grad); diff --git a/paddle/pten/ops/compat/reduce_sig.cc b/paddle/pten/ops/compat/reduce_sig.cc new file mode 100644 index 00000000000..a8a2b517d3e --- /dev/null +++ b/paddle/pten/ops/compat/reduce_sig.cc @@ -0,0 +1,52 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/pten/core/compat/op_utils.h" + +namespace pten { + +KernelSignature ReduceSumOpArgumentMapping(const ArgumentMappingContext& ctx) { + bool reduce_all = paddle::any_cast(ctx.Attr("reduce_all")); + if (ctx.IsDenseTensorInput("X")) { + if (!reduce_all) { + return KernelSignature( + "sum", {"X"}, {"dim", "keep_dim", "out_dtype"}, {"Out"}); + } + return KernelSignature("sum_raw", + {"X"}, + {"dim", "keep_dim", "reduce_all", "out_dtype"}, + {"Out"}); + } + return KernelSignature("unregistered", {}, {}, {}); +} + +KernelSignature ReduceMeanOpArgumentMapping(const ArgumentMappingContext& ctx) { + bool reduce_all = paddle::any_cast(ctx.Attr("reduce_all")); + if (ctx.IsDenseTensorInput("X")) { + if (!reduce_all) { + return KernelSignature("mean", {"X"}, {"dim", "keep_dim"}, {"Out"}); + } + return KernelSignature( + "mean_raw", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"}); + } + return KernelSignature("unregistered", {}, {}, {}); +} + +} // namespace pten + +PT_REGISTER_BASE_KERNEL_NAME(reduce_sum, sum_raw); +PT_REGISTER_BASE_KERNEL_NAME(reduce_mean, mean_raw); + +PT_REGISTER_ARG_MAPPING_FN(reduce_sum, pten::ReduceSumOpArgumentMapping); +PT_REGISTER_ARG_MAPPING_FN(reduce_mean, pten::ReduceMeanOpArgumentMapping); diff --git a/paddle/pten/ops/compat/reshape_sig.cc b/paddle/pten/ops/compat/reshape_sig.cc new file mode 100644 index 00000000000..031b6875867 --- /dev/null +++ b/paddle/pten/ops/compat/reshape_sig.cc @@ -0,0 +1,35 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/pten/core/compat/op_utils.h" + +namespace pten { + +KernelSignature ReshapeOpArgumentMapping(const ArgumentMappingContext& ctx) { + if (ctx.InputSize("ShapeTensor") > 0) { + return KernelSignature("reshape", {"X"}, {"ShapeTensor"}, {"Out"}); + } else if (ctx.HasInput("Shape")) { + return KernelSignature("reshape", {"X"}, {"Shape"}, {"Out"}); + } else { + return KernelSignature("reshape", {"X"}, {"shape"}, {"Out"}); + } +} + +} // namespace pten + +PT_REGISTER_BASE_KERNEL_NAME(reshape2, reshape); +PT_REGISTER_BASE_KERNEL_NAME(reshape2_grad, reshape_grad); +PT_REGISTER_BASE_KERNEL_NAME(reshape2_grad_grad, reshape_double_grad); + +PT_REGISTER_ARG_MAPPING_FN(reshape2, pten::ReshapeOpArgumentMapping); diff --git a/paddle/pten/tests/api/CMakeLists.txt b/paddle/pten/tests/api/CMakeLists.txt index e9faa22c4eb..b8491ab7f5e 100644 --- a/paddle/pten/tests/api/CMakeLists.txt +++ b/paddle/pten/tests/api/CMakeLists.txt @@ -1,7 +1,7 @@ if(WITH_ROCM) - hip_test(test_pten_tensor SRCS test_pten_tensor.cc DEPS pten_tensor pten_function_api utils_api glog) + hip_test(test_pten_tensor SRCS test_pten_tensor.cc DEPS pten_tensor pten_function_api manual_api glog) else() - cc_test(test_pten_tensor SRCS test_pten_tensor.cc DEPS pten_tensor pten_function_api utils_api glog) + cc_test(test_pten_tensor SRCS test_pten_tensor.cc DEPS pten_tensor pten_function_api manual_api glog) endif() cc_test(test_pten_exception SRCS test_pten_exception.cc DEPS gtest) @@ -22,3 +22,6 @@ cc_test(test_scale_api SRCS test_scale_api.cc DEPS pten_tensor pten_api pten_api cc_test(test_scale_benchmark SRCS test_scale_benchmark.cc DEPS pten_tensor pten_api pten_api_utils) cc_test(test_conj_api SRCS test_conj_api.cc DEPS pten_tensor pten_api pten_api_utils) cc_test(test_concat_api SRCS test_concat_api.cc DEPS pten_tensor pten_api pten_api_utils) + +cc_test(test_data_transform SRCS test_data_transform.cc DEPS pten_tensor pten_api pten_api_utils) +cc_test(test_sparse_utils_api SRCS test_sparse_utils_api.cc DEPS pten_tensor pten_api pten_api_utils) diff --git a/paddle/pten/tests/api/scale_api.h b/paddle/pten/tests/api/scale_api.h index 0ba1d6a0e3f..b3b8b8f77c1 100644 --- a/paddle/pten/tests/api/scale_api.h +++ b/paddle/pten/tests/api/scale_api.h @@ -23,6 +23,7 @@ #include "paddle/pten/common/scalar.h" #include "paddle/pten/common/scalar_array.h" #include "paddle/pten/core/kernel_registry.h" +#include "paddle/pten/core/meta_tensor.h" #include "paddle/pten/infermeta/unary.h" #include "paddle/pten/kernels/scale_kernel.h" @@ -68,11 +69,12 @@ PADDLE_API Tensor scale_kernel_context(const Tensor& x, kernel_context.EmplaceBackAttr(bias); kernel_context.EmplaceBackAttr(bias_after_scale); - auto out_meta = pten::UnchangedInferMeta(dense_x->meta()); auto dense_out = std::make_shared( pten::make_intrusive( pten::TransToFluidPlace(kernel_backend)), - std::move(out_meta)); + pten::DenseTensorMeta()); + pten::MetaTensor meta_out(dense_out.get()); + pten::UnchangedInferMeta(*dense_x, &meta_out); kernel_context.EmplaceBackOutput(dense_out.get()); Tensor out; @@ -101,7 +103,7 @@ static void ScaleCPU(DataType kernel_dtype, break; } case pten::DataType::BFLOAT16: { - pten::ScaleKernel( + pten::ScaleKernel( dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); break; } @@ -234,11 +236,12 @@ Tensor scale_switch_case(const Tensor& x, auto dense_x = std::dynamic_pointer_cast(x.impl()); - auto out_meta = pten::UnchangedInferMeta(dense_x->meta()); auto dense_out = std::make_shared( pten::make_intrusive( 
pten::TransToFluidPlace(kernel_backend)), - std::move(out_meta)); + pten::DenseTensorMeta()); + pten::MetaTensor meta_out(dense_out.get()); + pten::UnchangedInferMeta(*dense_x, &meta_out); Tensor out; out.set_impl(dense_out); diff --git a/paddle/pten/tests/api/test_cast_api.cc b/paddle/pten/tests/api/test_cast_api.cc index b87bebacab7..ad41759e150 100644 --- a/paddle/pten/tests/api/test_cast_api.cc +++ b/paddle/pten/tests/api/test_cast_api.cc @@ -35,7 +35,7 @@ TEST(API, cast) { auto dense_x = std::make_shared( alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, - framework::make_ddim({3, 4}), + pten::framework::make_ddim({3, 4}), pten::DataLayout::NCHW)); auto* dense_x_data = dense_x->mutable_data(paddle::platform::CPUPlace()); diff --git a/paddle/pten/tests/api/test_concat_api.cc b/paddle/pten/tests/api/test_concat_api.cc index c003e89f6c0..ec56861e806 100644 --- a/paddle/pten/tests/api/test_concat_api.cc +++ b/paddle/pten/tests/api/test_concat_api.cc @@ -24,8 +24,7 @@ limitations under the License. */ namespace paddle { namespace tests { -namespace framework = paddle::framework; -using DDim = paddle::framework::DDim; +using DDim = pten::framework::DDim; // TODO(chentianyu03): Remove this test after the API is used in the dygraph TEST(API, concat) { @@ -35,7 +34,7 @@ TEST(API, concat) { auto dense_x = std::make_shared( alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, - framework::make_ddim({3, 10}), + pten::framework::make_ddim({3, 10}), pten::DataLayout::NCHW)); auto* dense_x_data = dense_x->mutable_data(paddle::platform::CPUPlace()); @@ -43,7 +42,7 @@ TEST(API, concat) { auto dense_y = std::make_shared( alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, - framework::make_ddim({3, 10}), + pten::framework::make_ddim({3, 10}), pten::DataLayout::NCHW)); auto* dense_y_data = dense_y->mutable_data(paddle::platform::CPUPlace()); diff --git a/paddle/pten/tests/api/test_conj_api.cc b/paddle/pten/tests/api/test_conj_api.cc index 0273737347e..773ada1afcc 100644 --- a/paddle/pten/tests/api/test_conj_api.cc +++ b/paddle/pten/tests/api/test_conj_api.cc @@ -35,7 +35,7 @@ TEST(API, conj) { auto dense_x = std::make_shared( alloc.get(), pten::DenseTensorMeta(pten::DataType::COMPLEX64, - framework::make_ddim({3, 10}), + pten::framework::make_ddim({3, 10}), pten::DataLayout::NCHW)); auto* dense_x_data = dense_x->mutable_data(paddle::platform::CPUPlace()); diff --git a/paddle/pten/tests/api/test_data_transform.cc b/paddle/pten/tests/api/test_data_transform.cc new file mode 100644 index 00000000000..ce3d19b8845 --- /dev/null +++ b/paddle/pten/tests/api/test_data_transform.cc @@ -0,0 +1,100 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include + +#include "paddle/pten/api/include/api.h" +#include "paddle/pten/api/include/manual_api.h" +#include "paddle/pten/common/complex.h" +#include "paddle/pten/core/compat/convert_utils.h" +#include "paddle/pten/core/dense_tensor.h" + +namespace paddle { +namespace tests { + +// TODO(chenweihang): Remove this test after the API is used in the dygraph +TEST(API, data_transform_same_place) { + // 1. create tensor + auto x = paddle::experimental::full({3, 3}, + 1.0, + experimental::DataType::COMPLEX128, + experimental::Backend::CPU); + + auto y = paddle::experimental::full( + {3, 3}, 2.0, experimental::DataType::FLOAT32, experimental::Backend::CPU); + + std::vector> sum(9, 6.0); + + // 2. test API + auto out = paddle::experimental::matmul(x, y, false, false); + + // 3. check result + ASSERT_EQ(out.dims().size(), 2); + ASSERT_EQ(out.dims()[0], 3); + ASSERT_EQ(out.dims()[1], 3); + ASSERT_EQ(out.numel(), 9); + ASSERT_EQ(out.type(), pten::DataType::COMPLEX128); + ASSERT_EQ(out.layout(), pten::DataLayout::NCHW); + ASSERT_EQ(out.initialized(), true); + + auto dense_out = std::dynamic_pointer_cast(out.impl()); + + for (size_t i = 0; i < 9; i++) { + ASSERT_NEAR(sum[i].real, + dense_out->data>()[i].real, + 1e-6f); + ASSERT_NEAR(sum[i].imag, + dense_out->data>()[i].imag, + 1e-6f); + } +} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +TEST(Tensor, data_transform_diff_place) { + // 1. create tensor + auto x = paddle::experimental::full( + {3, 3}, 1.0, experimental::DataType::FLOAT64, experimental::Backend::CPU); + + auto y = paddle::experimental::full( + {3, 3}, 2.0, experimental::DataType::FLOAT64, experimental::Backend::GPU); + + std::vector sum(9, 6.0); + + // 2. test API + auto out = paddle::experimental::matmul(x, y, false, false); + + // 3. 
check result + ASSERT_EQ(out.dims().size(), 2); + ASSERT_EQ(out.dims()[0], 3); + ASSERT_EQ(out.dims()[1], 3); + ASSERT_EQ(out.numel(), 9); + ASSERT_EQ(out.dtype(), pten::DataType::FLOAT64); + ASSERT_EQ(out.layout(), pten::DataLayout::NCHW); + ASSERT_EQ(out.initialized(), true); + ASSERT_EQ(out.impl()->place(), + pten::TransToFluidPlace(experimental::Backend::GPU)); + + auto ref_out = experimental::copy_to(out, experimental::Backend::CPU, true); + + auto dense_out = std::dynamic_pointer_cast(ref_out.impl()); + for (size_t i = 0; i < 9; i++) { + ASSERT_NEAR(sum[i], dense_out->data()[i], 1e-6f); + } +} + +#endif + +} // namespace tests +} // namespace paddle diff --git a/paddle/pten/tests/api/test_dot_api.cc b/paddle/pten/tests/api/test_dot_api.cc index 6de8943a467..9c35e251192 100644 --- a/paddle/pten/tests/api/test_dot_api.cc +++ b/paddle/pten/tests/api/test_dot_api.cc @@ -35,7 +35,7 @@ TEST(API, dot) { auto dense_x = std::make_shared( alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, - framework::make_ddim({3, 10}), + pten::framework::make_ddim({3, 10}), pten::DataLayout::NCHW)); auto* dense_x_data = dense_x->mutable_data(paddle::platform::CPUPlace()); @@ -43,7 +43,7 @@ TEST(API, dot) { auto dense_y = std::make_shared( alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, - framework::make_ddim({3, 10}), + pten::framework::make_ddim({3, 10}), pten::DataLayout::NCHW)); auto* dense_y_data = dense_y->mutable_data(paddle::platform::CPUPlace()); diff --git a/paddle/pten/tests/api/test_elementwise_api.cc b/paddle/pten/tests/api/test_elementwise_api.cc index df1c6278d96..2a6cf461ee5 100644 --- a/paddle/pten/tests/api/test_elementwise_api.cc +++ b/paddle/pten/tests/api/test_elementwise_api.cc @@ -35,7 +35,7 @@ TEST(API, add) { auto dense_x = std::make_shared( alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, - framework::make_ddim({3, 10}), + pten::framework::make_ddim({3, 10}), pten::DataLayout::NCHW)); auto* dense_x_data = dense_x->mutable_data(paddle::platform::CPUPlace()); @@ -43,7 +43,7 @@ TEST(API, add) { auto dense_y = std::make_shared( alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, - framework::make_ddim({10}), + pten::framework::make_ddim({10}), pten::DataLayout::NCHW)); auto* dense_y_data = dense_y->mutable_data(paddle::platform::CPUPlace()); @@ -91,7 +91,7 @@ TEST(API, subtract) { auto dense_x = std::make_shared( alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, - framework::make_ddim({3, 10}), + pten::framework::make_ddim({3, 10}), pten::DataLayout::NCHW)); auto* dense_x_data = dense_x->mutable_data(paddle::platform::CPUPlace()); @@ -99,7 +99,7 @@ TEST(API, subtract) { auto dense_y = std::make_shared( alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, - framework::make_ddim({10}), + pten::framework::make_ddim({10}), pten::DataLayout::NCHW)); auto* dense_y_data = dense_y->mutable_data(paddle::platform::CPUPlace()); @@ -147,7 +147,7 @@ TEST(API, divide) { auto dense_x = std::make_shared( alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, - framework::make_ddim({3, 10}), + pten::framework::make_ddim({3, 10}), pten::DataLayout::NCHW)); auto* dense_x_data = dense_x->mutable_data(paddle::platform::CPUPlace()); @@ -155,7 +155,7 @@ TEST(API, divide) { auto dense_y = std::make_shared( alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, - framework::make_ddim({10}), + pten::framework::make_ddim({10}), pten::DataLayout::NCHW)); auto* dense_y_data = dense_y->mutable_data(paddle::platform::CPUPlace()); @@ -203,7 +203,7 @@ 
TEST(API, multiply) { auto dense_x = std::make_shared( alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, - framework::make_ddim({3, 10}), + pten::framework::make_ddim({3, 10}), pten::DataLayout::NCHW)); auto* dense_x_data = dense_x->mutable_data(paddle::platform::CPUPlace()); @@ -211,7 +211,7 @@ TEST(API, multiply) { auto dense_y = std::make_shared( alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, - framework::make_ddim({10}), + pten::framework::make_ddim({10}), pten::DataLayout::NCHW)); auto* dense_y_data = dense_y->mutable_data(paddle::platform::CPUPlace()); diff --git a/paddle/pten/tests/api/test_empty_api.cc b/paddle/pten/tests/api/test_empty_api.cc index 72f9100f7b3..4ce97d6c3ab 100644 --- a/paddle/pten/tests/api/test_empty_api.cc +++ b/paddle/pten/tests/api/test_empty_api.cc @@ -35,7 +35,7 @@ TEST(API, empty_like) { auto dense_x = std::make_shared( alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, - framework::make_ddim({3, 2}), + pten::framework::make_ddim({3, 2}), pten::DataLayout::NCHW)); paddle::experimental::Tensor x(dense_x); @@ -59,7 +59,7 @@ TEST(API, empty1) { auto dense_shape = std::make_shared( alloc.get(), pten::DenseTensorMeta(pten::DataType::INT64, - framework::make_ddim({2}), + pten::framework::make_ddim({2}), pten::DataLayout::NCHW)); auto* shape_data = dense_shape->mutable_data(paddle::platform::CPUPlace()); @@ -86,7 +86,7 @@ TEST(API, empty2) { auto dense_scalar = std::make_shared( alloc.get(), pten::DenseTensorMeta(pten::DataType::INT32, - framework::make_ddim({1}), + pten::framework::make_ddim({1}), pten::DataLayout::NCHW)); dense_scalar->mutable_data(paddle::platform::CPUPlace())[0] = 2; diff --git a/paddle/pten/tests/api/test_fill_api.cc b/paddle/pten/tests/api/test_fill_api.cc index 4b78d142aef..dd80b935340 100644 --- a/paddle/pten/tests/api/test_fill_api.cc +++ b/paddle/pten/tests/api/test_fill_api.cc @@ -35,7 +35,7 @@ TEST(API, full_like) { auto dense_x = std::make_shared( alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, - framework::make_ddim({3, 2}), + pten::framework::make_ddim({3, 2}), pten::DataLayout::NCHW)); auto* dense_x_data = dense_x->mutable_data(paddle::platform::CPUPlace()); @@ -71,7 +71,7 @@ TEST(API, zeros_like) { auto dense_x = std::make_shared( alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, - framework::make_ddim({3, 2}), + pten::framework::make_ddim({3, 2}), pten::DataLayout::NCHW)); auto* dense_x_data = dense_x->mutable_data(paddle::platform::CPUPlace()); @@ -105,7 +105,7 @@ TEST(API, ones_like) { auto dense_x = std::make_shared( alloc.get(), pten::DenseTensorMeta(pten::DataType::INT32, - framework::make_ddim({3, 2}), + pten::framework::make_ddim({3, 2}), pten::DataLayout::NCHW)); auto* dense_x_data = dense_x->mutable_data(paddle::platform::CPUPlace()); @@ -140,7 +140,7 @@ TEST(API, full1) { auto dense_shape = std::make_shared( alloc.get(), pten::DenseTensorMeta(pten::DataType::INT64, - framework::make_ddim({2}), + pten::framework::make_ddim({2}), pten::DataLayout::NCHW)); auto* shape_data = dense_shape->mutable_data(paddle::platform::CPUPlace()); @@ -150,7 +150,7 @@ TEST(API, full1) { auto dense_scalar = std::make_shared( alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, - framework::make_ddim({1}), + pten::framework::make_ddim({1}), pten::DataLayout::NCHW)); dense_scalar->mutable_data(paddle::platform::CPUPlace())[0] = 1.0; @@ -187,7 +187,7 @@ TEST(API, full2) { auto dense_scalar = std::make_shared( alloc.get(), pten::DenseTensorMeta(pten::DataType::INT32, - 
framework::make_ddim({1}), + pten::framework::make_ddim({1}), pten::DataLayout::NCHW)); dense_scalar->mutable_data(paddle::platform::CPUPlace())[0] = 2; diff --git a/paddle/pten/tests/api/test_flatten_api.cc b/paddle/pten/tests/api/test_flatten_api.cc index f3b80f7db57..7f04e6f3fc5 100644 --- a/paddle/pten/tests/api/test_flatten_api.cc +++ b/paddle/pten/tests/api/test_flatten_api.cc @@ -35,7 +35,7 @@ TEST(API, flatten) { auto dense_x = std::make_shared( alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, - framework::make_ddim({3, 2, 2, 3}), + pten::framework::make_ddim({3, 2, 2, 3}), pten::DataLayout::NCHW)); auto* dense_x_data = dense_x->mutable_data(paddle::platform::CPUPlace()); diff --git a/paddle/pten/tests/api/test_matmul_api.cc b/paddle/pten/tests/api/test_matmul_api.cc index 7342916c514..4d3adf86d16 100644 --- a/paddle/pten/tests/api/test_matmul_api.cc +++ b/paddle/pten/tests/api/test_matmul_api.cc @@ -18,6 +18,7 @@ limitations under the License. */ #include "paddle/pten/api/include/api.h" #include "paddle/pten/api/lib/utils/allocator.h" +#include "paddle/pten/backends/gpu/gpu_context.h" #include "paddle/pten/core/dense_tensor.h" #include "paddle/pten/core/kernel_registry.h" #include "paddle/pten/kernels/copy_kernel.h" @@ -35,7 +36,7 @@ TEST(API, matmul_cpu) { auto dense_x = std::make_shared( alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, - framework::make_ddim({3, 3}), + pten::framework::make_ddim({3, 3}), pten::DataLayout::NCHW)); auto* dense_x_data = @@ -44,7 +45,7 @@ TEST(API, matmul_cpu) { auto dense_y = std::make_shared( alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, - framework::make_ddim({3, 3}), + pten::framework::make_ddim({3, 3}), pten::DataLayout::NCHW)); auto* dense_y_data = dense_y->mutable_data(paddle::platform::CPUPlace()); @@ -86,7 +87,7 @@ TEST(API, matmul_cuda) { auto ref_x = std::make_shared( alloc_cpu.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, - framework::make_ddim({3, 3}), + pten::framework::make_ddim({3, 3}), pten::DataLayout::NCHW)); auto* ref_x_data = ref_x->mutable_data(paddle::platform::CPUPlace()); @@ -94,7 +95,7 @@ TEST(API, matmul_cuda) { auto ref_y = std::make_shared( alloc_cpu.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, - framework::make_ddim({3, 3}), + pten::framework::make_ddim({3, 3}), pten::DataLayout::NCHW)); auto* ref_y_data = ref_y->mutable_data(paddle::platform::CPUPlace()); @@ -111,18 +112,18 @@ TEST(API, matmul_cuda) { auto dense_x = std::make_shared( alloc_cuda.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, - framework::make_ddim({3, 3}), + pten::framework::make_ddim({3, 3}), pten::DataLayout::NCHW)); auto dense_y = std::make_shared( alloc_cuda.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, - framework::make_ddim({3, 3}), + pten::framework::make_ddim({3, 3}), pten::DataLayout::NCHW)); auto& pool = paddle::platform::DeviceContextPool::Instance(); auto place = paddle::platform::CUDAPlace(); - auto* dev_ctx = pool.GetByPlace(place); + auto* dev_ctx = static_cast(pool.GetByPlace(place)); pten::Copy(*dev_ctx, *ref_x.get(), false, dense_x.get()); pten::Copy(*dev_ctx, *ref_y.get(), false, dense_y.get()); diff --git a/paddle/pten/tests/api/test_mean_api.cc b/paddle/pten/tests/api/test_mean_api.cc index 046db05ca2b..68b4933c549 100644 --- a/paddle/pten/tests/api/test_mean_api.cc +++ b/paddle/pten/tests/api/test_mean_api.cc @@ -35,7 +35,7 @@ TEST(API, mean) { auto dense_x = std::make_shared( alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, - framework::make_ddim({3, 
4}), + pten::framework::make_ddim({3, 4}), pten::DataLayout::NCHW)); auto* dense_x_data = dense_x->mutable_data(paddle::platform::CPUPlace()); diff --git a/paddle/pten/tests/api/test_reshape_api.cc b/paddle/pten/tests/api/test_reshape_api.cc index 1f0d734a7ec..27e47a9183f 100644 --- a/paddle/pten/tests/api/test_reshape_api.cc +++ b/paddle/pten/tests/api/test_reshape_api.cc @@ -35,7 +35,7 @@ TEST(API, reshape) { auto dense_x = std::make_shared( alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, - framework::make_ddim({3, 2, 2, 3}), + pten::framework::make_ddim({3, 2, 2, 3}), pten::DataLayout::NCHW)); auto* dense_x_data = dense_x->mutable_data(paddle::platform::CPUPlace()); diff --git a/paddle/pten/tests/api/test_sparse_utils_api.cc b/paddle/pten/tests/api/test_sparse_utils_api.cc new file mode 100644 index 00000000000..1ec025faedc --- /dev/null +++ b/paddle/pten/tests/api/test_sparse_utils_api.cc @@ -0,0 +1,105 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include "paddle/pten/api/include/api.h" + +#include "paddle/pten/api/include/sparse_api.h" + +#include "paddle/pten/api/lib/utils/allocator.h" +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/core/kernel_registry.h" +#include "paddle/pten/core/sparse_coo_tensor.h" + +TEST(API, to_sparse_coo) { + const auto alloc = std::make_shared( + paddle::platform::CPUPlace()); + + auto dense_x = std::make_shared( + alloc.get(), + pten::DenseTensorMeta(pten::DataType::FLOAT32, + pten::framework::make_ddim({3, 3}), + pten::DataLayout::NCHW)); + + pten::CPUPlace cpu; + const int64_t sparse_dim = 2; + auto* dense_x_data = dense_x->mutable_data(cpu); + float dense_data[3][3] = {{0.0, 1.0, 0.0}, {2.0, 0.0, 3.0}, {3.2, 0.0, 0.0}}; + std::vector non_zero_data = {1.0, 2.0, 3.0, 3.2}; + std::vector indices_data = {0, 1, 1, 2, 1, 0, 2, 0}; + std::vector cols_data = {1, 0, 2, 0}; + std::vector crows_data = {0, 1, 3, 4}; + const int64_t non_zero_num = 4; + + std::copy(&dense_data[0][0], &dense_data[0][0] + 9, dense_x_data); + + pten::CPUContext dev_ctx_cpu; + dev_ctx_cpu.Init(); + + // 1. test dense_to_sparse_coo + paddle::experimental::Tensor x(dense_x); + auto out = paddle::experimental::sparse::to_sparse_coo( + x, pten::Backend::CPU, sparse_dim); + auto coo = std::dynamic_pointer_cast(out.impl()); + ASSERT_EQ(coo->nnz(), non_zero_num); + int cmp_indices = memcmp(coo->non_zero_indices().data(), + indices_data.data(), + indices_data.size() * sizeof(int64_t)); + ASSERT_EQ(cmp_indices, 0); + int cmp_elements = memcmp(coo->non_zero_elements().data(), + non_zero_data.data(), + non_zero_data.size() * sizeof(float)); + ASSERT_EQ(cmp_elements, 0); + + // 1. 
test sparse_csr_to_coo + auto dense_dims = pten::framework::make_ddim({3, 3}); + pten::DenseTensorMeta crows_meta( + pten::DataType::INT64, {dense_dims[0] + 1}, pten::DataLayout::NCHW); + pten::DenseTensorMeta cols_meta( + pten::DataType::INT64, {non_zero_num}, pten::DataLayout::NCHW); + pten::DenseTensorMeta values_meta( + pten::DataType::FLOAT32, {non_zero_num}, pten::DataLayout::NCHW); + + pten::CPUPlace place; + pten::DenseTensor crows(alloc.get(), crows_meta); + pten::DenseTensor cols(alloc.get(), cols_meta); + pten::DenseTensor values(alloc.get(), values_meta); + memcpy(crows.mutable_data(place), + crows_data.data(), + crows_data.size() * sizeof(int64_t)); + memcpy(cols.mutable_data(place), + cols_data.data(), + cols_data.size() * sizeof(int64_t)); + memcpy(values.mutable_data(place), + non_zero_data.data(), + non_zero_data.size() * sizeof(float)); + auto csr = + std::make_shared(crows, cols, values, dense_dims); + paddle::experimental::Tensor csr_x(csr); + auto out2 = paddle::experimental::sparse::to_sparse_coo( + csr_x, pten::Backend::CPU, sparse_dim); + + auto coo2 = std::dynamic_pointer_cast(out.impl()); + ASSERT_EQ(coo2->nnz(), non_zero_num); + int cmp_indices2 = memcmp(coo2->non_zero_indices().data(), + indices_data.data(), + indices_data.size() * sizeof(int64_t)); + ASSERT_EQ(cmp_indices2, 0); + int cmp_elements2 = memcmp(coo2->non_zero_elements().data(), + non_zero_data.data(), + non_zero_data.size() * sizeof(float)); + ASSERT_EQ(cmp_elements2, 0); +} diff --git a/paddle/pten/tests/api/test_sum_api.cc b/paddle/pten/tests/api/test_sum_api.cc index 385d18aa784..33be9e54eeb 100644 --- a/paddle/pten/tests/api/test_sum_api.cc +++ b/paddle/pten/tests/api/test_sum_api.cc @@ -35,7 +35,7 @@ TEST(API, sum) { auto dense_x = std::make_shared( alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, - framework::make_ddim({3, 4}), + pten::framework::make_ddim({3, 4}), pten::DataLayout::NCHW)); auto* dense_x_data = dense_x->mutable_data(paddle::platform::CPUPlace()); diff --git a/paddle/pten/tests/api/test_to_api.cc b/paddle/pten/tests/api/test_to_api.cc index 11636e1c014..641c9e186d9 100644 --- a/paddle/pten/tests/api/test_to_api.cc +++ b/paddle/pten/tests/api/test_to_api.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include #include -#include "paddle/pten/api/include/utils.h" +#include "paddle/pten/api/include/manual_api.h" #include "paddle/pten/api/lib/utils/allocator.h" #include "paddle/pten/core/dense_tensor.h" @@ -33,7 +33,7 @@ paddle::experimental::Tensor CreateInputTensor() { auto dense_x = std::make_shared( alloc.get(), pten::DenseTensorMeta(pten::DataType::INT64, - framework::make_ddim({3, 4}), + pten::framework::make_ddim({3, 4}), pten::DataLayout::NCHW)); auto* dense_x_data = dense_x->mutable_data(paddle::platform::CPUPlace()); diff --git a/paddle/pten/tests/core/CMakeLists.txt b/paddle/pten/tests/core/CMakeLists.txt index 1404b9921f3..d9c8c86a240 100644 --- a/paddle/pten/tests/core/CMakeLists.txt +++ b/paddle/pten/tests/core/CMakeLists.txt @@ -23,3 +23,5 @@ endif() if (NOT WIN32) cc_test(test_rw_lock SRCS test_rw_lock.cc) endif (NOT WIN32) + +cc_test(unroll_array_ops_test SRCS unroll_array_ops_test.cc) diff --git a/paddle/pten/tests/core/test_convert_utils.cc b/paddle/pten/tests/core/test_convert_utils.cc index 51fba7cbe06..cc7ac6e7e59 100644 --- a/paddle/pten/tests/core/test_convert_utils.cc +++ b/paddle/pten/tests/core/test_convert_utils.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "gtest/gtest.h" -#include "paddle/pten/core/convert_utils.h" +#include "paddle/pten/core/compat/convert_utils.h" namespace pten { namespace tests { diff --git a/paddle/pten/tests/core/test_device_context.cc b/paddle/pten/tests/core/test_device_context.cc index a44d0d32156..4fa8faabcfc 100644 --- a/paddle/pten/tests/core/test_device_context.cc +++ b/paddle/pten/tests/core/test_device_context.cc @@ -25,43 +25,29 @@ limitations under the License. */ namespace pten { namespace tests { +class InferenceCPUContext : public CPUContext { + public: + void SetEigenDevice(Eigen::DefaultDevice* eigen_device) { + CPUContext::SetEigenDevice(eigen_device); + } +}; + TEST(DeviceContext, cpu_context) { std::cout << "test training scenarios" << std::endl; { pten::CPUContext ctx; - CHECK(ctx.eigen_device() != nullptr); + ctx.Init(); + EXPECT_TRUE(ctx.eigen_device() != nullptr); } std::cout << "test inference scenarios" << std::endl; Eigen::DefaultDevice* device = new Eigen::DefaultDevice(); { - pten::CPUContextResource ctx_res{device}; - pten::CPUContext ctx(ctx_res); - CHECK(ctx.eigen_device() != nullptr); - } - { - pten::CPUContextResource ctx_res{nullptr}; - pten::CPUContext ctx(ctx_res); + InferenceCPUContext ctx; ctx.SetEigenDevice(device); - CHECK(ctx.eigen_device() != nullptr); + EXPECT_TRUE(ctx.eigen_device() != nullptr); } delete device; - - std::cout << "test copy constructor" << std::endl; - { - pten::CPUContext ctx1; - pten::CPUContext ctx2(ctx1); - CHECK_EQ(ctx1.eigen_device(), ctx2.eigen_device()); - } - - std::cout << "test move constructor" << std::endl; - { - pten::CPUContext ctx1 = pten::CPUContext(); - auto* eigen_device1 = ctx1.eigen_device(); - pten::CPUContext ctx2(std::move(ctx1)); - auto* eigen_device2 = ctx2.eigen_device(); - CHECK_EQ(eigen_device1, eigen_device2); - } } } // namespace tests diff --git a/paddle/pten/tests/core/test_dim.cu b/paddle/pten/tests/core/test_dim.cu index 1c4a9c163f9..fec1029814e 100644 --- a/paddle/pten/tests/core/test_dim.cu +++ b/paddle/pten/tests/core/test_dim.cu @@ -16,7 +16,7 @@ #include #include "gtest/gtest.h" -#include "paddle/pten/core/dim.h" +#include "paddle/pten/core/utils/dim.h" namespace pten { namespace tests { @@ -100,4 +100,4 @@ TEST(Dim, Print) { } } // namespace tests -} // namespace pten \ No newline at end of file +} // namespace pten diff --git a/paddle/pten/tests/core/test_meta_fn_utils.cc b/paddle/pten/tests/core/test_meta_fn_utils.cc index e25fdd3a204..c7d092c468f 100644 --- a/paddle/pten/tests/core/test_meta_fn_utils.cc +++ b/paddle/pten/tests/core/test_meta_fn_utils.cc @@ -29,7 +29,7 @@ TEST(MetaFunctionMap, InferMetaFnExists) { pten::MetaTensor meta_x(&dense_x); pten::DenseTensor dense_out1; pten::MetaTensor meta_out(&dense_out1); - pten::UnchangedInferMetaNew(/*is_runtime=*/true, meta_x, &meta_out); + pten::UnchangedInferMeta(meta_x, &meta_out); auto shared_meat_x = std::make_shared(&dense_x); pten::DenseTensor dense_out2; diff --git a/paddle/pten/core/unroll_array_ops_test.cc b/paddle/pten/tests/core/unroll_array_ops_test.cc similarity index 96% rename from paddle/pten/core/unroll_array_ops_test.cc rename to paddle/pten/tests/core/unroll_array_ops_test.cc index f32d94be759..aee6cc6f404 100644 --- a/paddle/pten/core/unroll_array_ops_test.cc +++ b/paddle/pten/tests/core/unroll_array_ops_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/pten/core/unroll_array_ops.h" +#include "paddle/pten/core/utils/unroll_array_ops.h" #include #include @@ -79,4 +79,4 @@ TEST(unroll_ops, product) { } } // namespace framework -} // namespace pten \ No newline at end of file +} // namespace pten diff --git a/paddle/pten/tests/kernels/CMakeLists.txt b/paddle/pten/tests/kernels/CMakeLists.txt index 407e5c097ae..e2063241689 100644 --- a/paddle/pten/tests/kernels/CMakeLists.txt +++ b/paddle/pten/tests/kernels/CMakeLists.txt @@ -11,3 +11,4 @@ cc_test(test_reshape_dev_api SRCS test_reshape_dev_api.cc DEPS pten pten_api_uti cc_test(test_sum_dev_api SRCS test_sum_dev_api.cc DEPS pten pten_api_utils) cc_test(test_conj_dev_api SRCS test_conj_dev_api.cc DEPS pten pten_api_utils) cc_test(test_concat_dev_api SRCS test_concat_dev_api.cc DEPS pten pten_api_utils) +cc_test(test_sparse_utils_dev_api SRCS test_sparse_utils_dev_api.cc DEPS pten pten_api_utils) diff --git a/paddle/pten/tests/kernels/test_cast_dev_api.cc b/paddle/pten/tests/kernels/test_cast_dev_api.cc index 33d27ca5b1c..d6aac6d7db8 100644 --- a/paddle/pten/tests/kernels/test_cast_dev_api.cc +++ b/paddle/pten/tests/kernels/test_cast_dev_api.cc @@ -35,10 +35,11 @@ TEST(DEV_API, cast) { // 1. create tensor const auto alloc = std::make_unique( paddle::platform::CPUPlace()); - pten::DenseTensor dense_x(alloc.get(), - pten::DenseTensorMeta(pten::DataType::FLOAT32, - framework::make_ddim({3, 4}), - pten::DataLayout::NCHW)); + pten::DenseTensor dense_x( + alloc.get(), + pten::DenseTensorMeta(pten::DataType::FLOAT32, + pten::framework::make_ddim({3, 4}), + pten::DataLayout::NCHW)); auto* dense_x_data = dense_x.mutable_data(paddle::platform::CPUPlace()); @@ -49,10 +50,10 @@ TEST(DEV_API, cast) { } pten::CPUContext dev_ctx; - dev_ctx.SetDeviceAllocator( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(paddle::platform::CPUPlace()) - .get()); + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx.Init(); pten::DataType out_dtype = pten::DataType::FLOAT64; // 2. test API diff --git a/paddle/pten/tests/kernels/test_concat_dev_api.cc b/paddle/pten/tests/kernels/test_concat_dev_api.cc index eb546e992e9..5e78545f6d6 100644 --- a/paddle/pten/tests/kernels/test_concat_dev_api.cc +++ b/paddle/pten/tests/kernels/test_concat_dev_api.cc @@ -32,17 +32,19 @@ TEST(DEV_API, concat) { // 1. create tensor const auto alloc = std::make_unique( paddle::platform::CPUPlace()); - pten::DenseTensor dense_x(alloc.get(), - pten::DenseTensorMeta(pten::DataType::FLOAT32, - framework::make_ddim({3, 10}), - pten::DataLayout::NCHW)); + pten::DenseTensor dense_x( + alloc.get(), + pten::DenseTensorMeta(pten::DataType::FLOAT32, + pten::framework::make_ddim({3, 10}), + pten::DataLayout::NCHW)); auto* dense_x_data = dense_x.mutable_data(paddle::platform::CPUPlace()); - pten::DenseTensor dense_y(alloc.get(), - pten::DenseTensorMeta(pten::DataType::FLOAT32, - framework::make_ddim({3, 10}), - pten::DataLayout::NCHW)); + pten::DenseTensor dense_y( + alloc.get(), + pten::DenseTensorMeta(pten::DataType::FLOAT32, + pten::framework::make_ddim({3, 10}), + pten::DataLayout::NCHW)); auto* dense_y_data = dense_y.mutable_data(paddle::platform::CPUPlace()); @@ -57,10 +59,10 @@ TEST(DEV_API, concat) { // 2. 
test API pten::CPUContext dev_ctx; - dev_ctx.SetDeviceAllocator( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(paddle::platform::CPUPlace()) - .get()); + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx.Init(); auto out = pten::Concat(dev_ctx, inputs, 0); // 3. check result diff --git a/paddle/pten/tests/kernels/test_conj_dev_api.cc b/paddle/pten/tests/kernels/test_conj_dev_api.cc index e43769dfb2b..cac8370f67f 100644 --- a/paddle/pten/tests/kernels/test_conj_dev_api.cc +++ b/paddle/pten/tests/kernels/test_conj_dev_api.cc @@ -33,10 +33,11 @@ TEST(DEV_API, conj) { // 1. create tensor const auto alloc = std::make_unique( paddle::platform::CPUPlace()); - pten::DenseTensor dense_x(alloc.get(), - pten::DenseTensorMeta(pten::DataType::COMPLEX64, - framework::make_ddim({3, 4}), - pten::DataLayout::NCHW)); + pten::DenseTensor dense_x( + alloc.get(), + pten::DenseTensorMeta(pten::DataType::COMPLEX64, + pten::framework::make_ddim({3, 4}), + pten::DataLayout::NCHW)); auto* dense_x_data = dense_x.mutable_data(paddle::platform::CPUPlace()); @@ -45,10 +46,10 @@ TEST(DEV_API, conj) { } pten::CPUContext dev_ctx; - dev_ctx.SetDeviceAllocator( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(paddle::platform::CPUPlace()) - .get()); + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx.Init(); // 2. test API auto out = pten::Conj(dev_ctx, dense_x); diff --git a/paddle/pten/tests/kernels/test_copy_dev_api.cc b/paddle/pten/tests/kernels/test_copy_dev_api.cc index 29f68513fa7..cd589142aed 100644 --- a/paddle/pten/tests/kernels/test_copy_dev_api.cc +++ b/paddle/pten/tests/kernels/test_copy_dev_api.cc @@ -38,7 +38,7 @@ TEST(DEV_API, copy) { auto dense_src = std::make_shared( alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, - framework::make_ddim({2, 3}), + pten::framework::make_ddim({2, 3}), pten::DataLayout::NCHW)); auto* dense_x_data = dense_src->mutable_data(paddle::platform::CPUPlace()); @@ -46,7 +46,7 @@ TEST(DEV_API, copy) { auto dense_dst = std::make_shared( alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, - framework::make_ddim({2, 3}), + pten::framework::make_ddim({2, 3}), pten::DataLayout::NCHW)); for (size_t i = 0; i < 2; ++i) { @@ -58,10 +58,10 @@ TEST(DEV_API, copy) { std::cout << typeid(a).name() << std::endl; // 2. test API pten::CPUContext dev_ctx; - dev_ctx.SetDeviceAllocator( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(paddle::platform::CPUPlace()) - .get()); + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx.Init(); pten::Copy(dev_ctx, *(dense_src.get()), false, dense_dst.get()); // 3. check result diff --git a/paddle/pten/tests/kernels/test_creation_dev_api.cc b/paddle/pten/tests/kernels/test_creation_dev_api.cc index 8b37c41d0b5..0e87fe554a2 100644 --- a/paddle/pten/tests/kernels/test_creation_dev_api.cc +++ b/paddle/pten/tests/kernels/test_creation_dev_api.cc @@ -33,10 +33,10 @@ using DDim = pten::framework::DDim; TEST(DEV_API, empty) { // 1. 
create input pten::CPUContext dev_ctx; - dev_ctx.SetDeviceAllocator( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(paddle::platform::CPUPlace()) - .get()); + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx.Init(); // 2. test API auto out = pten::Empty(dev_ctx, {3, 2}, pten::DataType::INT32); @@ -53,20 +53,21 @@ TEST(DEV_API, empty_like) { // 1. create tensor const auto alloc = std::make_unique( paddle::platform::CPUPlace()); - pten::DenseTensor dense_x(alloc.get(), - pten::DenseTensorMeta(pten::DataType::FLOAT32, - framework::make_ddim({3, 2}), - pten::DataLayout::NCHW)); + pten::DenseTensor dense_x( + alloc.get(), + pten::DenseTensorMeta(pten::DataType::FLOAT32, + pten::framework::make_ddim({3, 2}), + pten::DataLayout::NCHW)); auto* dense_x_data = dense_x.mutable_data(paddle::platform::CPUPlace()); dense_x_data[0] = 0; // 2. test API pten::CPUContext dev_ctx; - dev_ctx.SetDeviceAllocator( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(paddle::platform::CPUPlace()) - .get()); + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx.Init(); auto out = pten::EmptyLike(dev_ctx, dense_x); // 3. check result @@ -83,10 +84,10 @@ TEST(DEV_API, full) { // 2. test API pten::CPUContext dev_ctx; - dev_ctx.SetDeviceAllocator( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(paddle::platform::CPUPlace()) - .get()); + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx.Init(); auto out = pten::Full(dev_ctx, {3, 2}, val, pten::DataType::FLOAT32); // 3. check result @@ -106,20 +107,21 @@ TEST(DEV_API, full_like) { // 1. create tensor const auto alloc = std::make_unique( paddle::platform::CPUPlace()); - pten::DenseTensor dense_x(alloc.get(), - pten::DenseTensorMeta(pten::DataType::FLOAT32, - framework::make_ddim({3, 2}), - pten::DataLayout::NCHW)); + pten::DenseTensor dense_x( + alloc.get(), + pten::DenseTensorMeta(pten::DataType::FLOAT32, + pten::framework::make_ddim({3, 2}), + pten::DataLayout::NCHW)); auto* dense_x_data = dense_x.mutable_data(paddle::platform::CPUPlace()); dense_x_data[0] = 0; float val = 1.0; pten::CPUContext dev_ctx; - dev_ctx.SetDeviceAllocator( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(paddle::platform::CPUPlace()) - .get()); + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx.Init(); // 2. test API auto out = pten::FullLike(dev_ctx, dense_x, val); diff --git a/paddle/pten/tests/kernels/test_dot_dev_api.cc b/paddle/pten/tests/kernels/test_dot_dev_api.cc index c1f7d6aaba3..051f05c7805 100644 --- a/paddle/pten/tests/kernels/test_dot_dev_api.cc +++ b/paddle/pten/tests/kernels/test_dot_dev_api.cc @@ -33,17 +33,19 @@ TEST(DEV_API, dot) { // 1. 
create tensor const auto alloc = std::make_unique( paddle::platform::CPUPlace()); - pten::DenseTensor dense_x(alloc.get(), - pten::DenseTensorMeta(pten::DataType::FLOAT32, - framework::make_ddim({3, 10}), - pten::DataLayout::NCHW)); + pten::DenseTensor dense_x( + alloc.get(), + pten::DenseTensorMeta(pten::DataType::FLOAT32, + pten::framework::make_ddim({3, 10}), + pten::DataLayout::NCHW)); auto* dense_x_data = dense_x.mutable_data(paddle::platform::CPUPlace()); - pten::DenseTensor dense_y(alloc.get(), - pten::DenseTensorMeta(pten::DataType::FLOAT32, - framework::make_ddim({3, 10}), - pten::DataLayout::NCHW)); + pten::DenseTensor dense_y( + alloc.get(), + pten::DenseTensorMeta(pten::DataType::FLOAT32, + pten::framework::make_ddim({3, 10}), + pten::DataLayout::NCHW)); auto* dense_y_data = dense_y.mutable_data(paddle::platform::CPUPlace()); @@ -58,10 +60,10 @@ TEST(DEV_API, dot) { // 2. test API pten::CPUContext dev_ctx; - dev_ctx.SetDeviceAllocator( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(paddle::platform::CPUPlace()) - .get()); + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx.Init(); auto out = pten::Dot(dev_ctx, dense_x, dense_y); // 3. check result diff --git a/paddle/pten/tests/kernels/test_elementwise_dev_api.cc b/paddle/pten/tests/kernels/test_elementwise_dev_api.cc index 9d4c86f0267..20dd5ddb4a3 100644 --- a/paddle/pten/tests/kernels/test_elementwise_dev_api.cc +++ b/paddle/pten/tests/kernels/test_elementwise_dev_api.cc @@ -33,17 +33,19 @@ TEST(DEV_API, add) { // 1. create tensor const auto alloc = std::make_unique( paddle::platform::CPUPlace()); - pten::DenseTensor dense_x(alloc.get(), - pten::DenseTensorMeta(pten::DataType::FLOAT32, - framework::make_ddim({3, 10}), - pten::DataLayout::NCHW)); + pten::DenseTensor dense_x( + alloc.get(), + pten::DenseTensorMeta(pten::DataType::FLOAT32, + pten::framework::make_ddim({3, 10}), + pten::DataLayout::NCHW)); auto* dense_x_data = dense_x.mutable_data(paddle::platform::CPUPlace()); - pten::DenseTensor dense_y(alloc.get(), - pten::DenseTensorMeta(pten::DataType::FLOAT32, - framework::make_ddim({10}), - pten::DataLayout::NCHW)); + pten::DenseTensor dense_y( + alloc.get(), + pten::DenseTensorMeta(pten::DataType::FLOAT32, + pten::framework::make_ddim({10}), + pten::DataLayout::NCHW)); auto* dense_y_data = dense_y.mutable_data(paddle::platform::CPUPlace()); @@ -60,10 +62,10 @@ TEST(DEV_API, add) { // 2. test API pten::CPUContext dev_ctx; - dev_ctx.SetDeviceAllocator( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(paddle::platform::CPUPlace()) - .get()); + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx.Init(); auto dense_out = pten::Add(dev_ctx, dense_x, dense_y); // 3. check result @@ -85,17 +87,19 @@ TEST(DEV_API, subtract) { // 1. 
create tensor const auto alloc = std::make_unique( paddle::platform::CPUPlace()); - pten::DenseTensor dense_x(alloc.get(), - pten::DenseTensorMeta(pten::DataType::FLOAT32, - framework::make_ddim({3, 10}), - pten::DataLayout::NCHW)); + pten::DenseTensor dense_x( + alloc.get(), + pten::DenseTensorMeta(pten::DataType::FLOAT32, + pten::framework::make_ddim({3, 10}), + pten::DataLayout::NCHW)); auto* dense_x_data = dense_x.mutable_data(paddle::platform::CPUPlace()); - pten::DenseTensor dense_y(alloc.get(), - pten::DenseTensorMeta(pten::DataType::FLOAT32, - framework::make_ddim({10}), - pten::DataLayout::NCHW)); + pten::DenseTensor dense_y( + alloc.get(), + pten::DenseTensorMeta(pten::DataType::FLOAT32, + pten::framework::make_ddim({10}), + pten::DataLayout::NCHW)); auto* dense_y_data = dense_y.mutable_data(paddle::platform::CPUPlace()); @@ -112,10 +116,10 @@ TEST(DEV_API, subtract) { // 2. test API pten::CPUContext dev_ctx; - dev_ctx.SetDeviceAllocator( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(paddle::platform::CPUPlace()) - .get()); + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx.Init(); auto dense_out = pten::Subtract(dev_ctx, dense_x, dense_y); // 3. check result @@ -137,17 +141,19 @@ TEST(DEV_API, divide) { // 1. create tensor const auto alloc = std::make_unique( paddle::platform::CPUPlace()); - pten::DenseTensor dense_x(alloc.get(), - pten::DenseTensorMeta(pten::DataType::FLOAT32, - framework::make_ddim({3, 10}), - pten::DataLayout::NCHW)); + pten::DenseTensor dense_x( + alloc.get(), + pten::DenseTensorMeta(pten::DataType::FLOAT32, + pten::framework::make_ddim({3, 10}), + pten::DataLayout::NCHW)); auto* dense_x_data = dense_x.mutable_data(paddle::platform::CPUPlace()); - pten::DenseTensor dense_y(alloc.get(), - pten::DenseTensorMeta(pten::DataType::FLOAT32, - framework::make_ddim({10}), - pten::DataLayout::NCHW)); + pten::DenseTensor dense_y( + alloc.get(), + pten::DenseTensorMeta(pten::DataType::FLOAT32, + pten::framework::make_ddim({10}), + pten::DataLayout::NCHW)); auto* dense_y_data = dense_y.mutable_data(paddle::platform::CPUPlace()); @@ -164,10 +170,10 @@ TEST(DEV_API, divide) { // 2. test API pten::CPUContext dev_ctx; - dev_ctx.SetDeviceAllocator( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(paddle::platform::CPUPlace()) - .get()); + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx.Init(); auto dense_out = pten::Divide(dev_ctx, dense_x, dense_y); // 3. check result @@ -189,17 +195,19 @@ TEST(DEV_API, multiply) { // 1. 
create tensor const auto alloc = std::make_unique( paddle::platform::CPUPlace()); - pten::DenseTensor dense_x(alloc.get(), - pten::DenseTensorMeta(pten::DataType::FLOAT32, - framework::make_ddim({3, 10}), - pten::DataLayout::NCHW)); + pten::DenseTensor dense_x( + alloc.get(), + pten::DenseTensorMeta(pten::DataType::FLOAT32, + pten::framework::make_ddim({3, 10}), + pten::DataLayout::NCHW)); auto* dense_x_data = dense_x.mutable_data(paddle::platform::CPUPlace()); - pten::DenseTensor dense_y(alloc.get(), - pten::DenseTensorMeta(pten::DataType::FLOAT32, - framework::make_ddim({10}), - pten::DataLayout::NCHW)); + pten::DenseTensor dense_y( + alloc.get(), + pten::DenseTensorMeta(pten::DataType::FLOAT32, + pten::framework::make_ddim({10}), + pten::DataLayout::NCHW)); auto* dense_y_data = dense_y.mutable_data(paddle::platform::CPUPlace()); @@ -216,10 +224,10 @@ TEST(DEV_API, multiply) { // 2. test API pten::CPUContext dev_ctx; - dev_ctx.SetDeviceAllocator( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(paddle::platform::CPUPlace()) - .get()); + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx.Init(); auto dense_out = pten::Multiply(dev_ctx, dense_x, dense_y); // 3. check result diff --git a/paddle/pten/tests/kernels/test_flatten_dev_api.cc b/paddle/pten/tests/kernels/test_flatten_dev_api.cc index 2ebf10916be..f9dc1df818e 100644 --- a/paddle/pten/tests/kernels/test_flatten_dev_api.cc +++ b/paddle/pten/tests/kernels/test_flatten_dev_api.cc @@ -46,7 +46,7 @@ TEST(DEV_API, flatten) { pten::DenseTensor dense_x( alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, - framework::make_ddim({3, 2, 2, 3}), + pten::framework::make_ddim({3, 2, 2, 3}), pten::DataLayout::NCHW)); auto* dense_x_data = dense_x.mutable_data(paddle::platform::CPUPlace()); @@ -56,10 +56,10 @@ TEST(DEV_API, flatten) { } int start_axis = 1, stop_axis = 2; pten::CPUContext dev_ctx; - dev_ctx.SetDeviceAllocator( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(paddle::platform::CPUPlace()) - .get()); + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx.Init(); // 2. test API auto out = pten::Flatten(dev_ctx, dense_x, start_axis, stop_axis); diff --git a/paddle/pten/tests/kernels/test_matmul_dev_api.cc b/paddle/pten/tests/kernels/test_matmul_dev_api.cc index 87c91b10081..b1c0b40eae7 100644 --- a/paddle/pten/tests/kernels/test_matmul_dev_api.cc +++ b/paddle/pten/tests/kernels/test_matmul_dev_api.cc @@ -34,7 +34,7 @@ TEST(DEV_API, dot) { paddle::platform::CPUPlace()); DenseTensor dense_x(alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, - framework::make_ddim({3, 3}), + pten::framework::make_ddim({3, 3}), pten::DataLayout::NCHW)); auto* dense_x_data = @@ -42,7 +42,7 @@ TEST(DEV_API, dot) { DenseTensor dense_y(alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, - framework::make_ddim({3, 3}), + pten::framework::make_ddim({3, 3}), pten::DataLayout::NCHW)); auto* dense_y_data = dense_y.mutable_data(paddle::platform::CPUPlace()); @@ -55,10 +55,10 @@ TEST(DEV_API, dot) { // 2. 
test API pten::CPUContext dev_ctx; - dev_ctx.SetDeviceAllocator( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(paddle::platform::CPUPlace()) - .get()); + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx.Init(); auto out = Matmul(dev_ctx, dense_x, dense_y, false, false); // 3. check result diff --git a/paddle/pten/tests/kernels/test_mean_dev_api.cc b/paddle/pten/tests/kernels/test_mean_dev_api.cc index 3abf54d26af..4b84a131150 100644 --- a/paddle/pten/tests/kernels/test_mean_dev_api.cc +++ b/paddle/pten/tests/kernels/test_mean_dev_api.cc @@ -32,10 +32,11 @@ TEST(DEV_API, mean) { // 1. create tensor const auto alloc = std::make_unique( paddle::platform::CPUPlace()); - pten::DenseTensor dense_x(alloc.get(), - pten::DenseTensorMeta(pten::DataType::FLOAT32, - framework::make_ddim({3, 4}), - pten::DataLayout::NCHW)); + pten::DenseTensor dense_x( + alloc.get(), + pten::DenseTensorMeta(pten::DataType::FLOAT32, + pten::framework::make_ddim({3, 4}), + pten::DataLayout::NCHW)); auto* dense_x_data = dense_x.mutable_data(paddle::platform::CPUPlace()); @@ -48,10 +49,10 @@ TEST(DEV_API, mean) { // 2. test API pten::CPUContext dev_ctx; - dev_ctx.SetDeviceAllocator( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(paddle::platform::CPUPlace()) - .get()); + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx.Init(); auto out = pten::Mean(dev_ctx, dense_x, dims, false); // 3. check result diff --git a/paddle/pten/tests/kernels/test_reshape_dev_api.cc b/paddle/pten/tests/kernels/test_reshape_dev_api.cc index fe9b09c2555..58004e718ea 100644 --- a/paddle/pten/tests/kernels/test_reshape_dev_api.cc +++ b/paddle/pten/tests/kernels/test_reshape_dev_api.cc @@ -36,7 +36,7 @@ TEST(DEV_API, reshape) { pten::DenseTensor dense_x( alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, - framework::make_ddim({3, 2, 2, 3}), + pten::framework::make_ddim({3, 2, 2, 3}), pten::DataLayout::NCHW)); auto* dense_x_data = dense_x.mutable_data(paddle::platform::CPUPlace()); @@ -48,10 +48,10 @@ TEST(DEV_API, reshape) { // 2. test API pten::CPUContext dev_ctx; - dev_ctx.SetDeviceAllocator( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(paddle::platform::CPUPlace()) - .get()); + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx.Init(); auto out = pten::Reshape(dev_ctx, dense_x, shape); // 3. check result std::vector expect_shape = {12, 3}; diff --git a/paddle/pten/tests/kernels/test_scale_dev_api.cc b/paddle/pten/tests/kernels/test_scale_dev_api.cc index 80f12950094..ccad71711d6 100644 --- a/paddle/pten/tests/kernels/test_scale_dev_api.cc +++ b/paddle/pten/tests/kernels/test_scale_dev_api.cc @@ -32,10 +32,11 @@ TEST(DEV_API, scale) { // 1. create tensor const auto alloc = std::make_unique( paddle::platform::CPUPlace()); - pten::DenseTensor dense_x(alloc.get(), - pten::DenseTensorMeta(pten::DataType::FLOAT32, - framework::make_ddim({3, 4}), - pten::DataLayout::NCHW)); + pten::DenseTensor dense_x( + alloc.get(), + pten::DenseTensorMeta(pten::DataType::FLOAT32, + pten::framework::make_ddim({3, 4}), + pten::DataLayout::NCHW)); auto* dense_x_data = dense_x.mutable_data(paddle::platform::CPUPlace()); @@ -48,10 +49,11 @@ TEST(DEV_API, scale) { // 2. 
test API pten::CPUContext dev_ctx; - dev_ctx.SetDeviceAllocator( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(paddle::platform::CPUPlace()) - .get()); + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx.Init(); + auto out = pten::Scale(dev_ctx, dense_x, scale, bias, bias_after_scale); @@ -70,10 +72,11 @@ TEST(DEV_API, scale_host) { // 1. create tensor const auto alloc = std::make_unique( paddle::platform::CPUPlace()); - pten::DenseTensor dense_x(alloc.get(), - pten::DenseTensorMeta(pten::DataType::FLOAT32, - framework::make_ddim({3, 4}), - pten::DataLayout::NCHW)); + pten::DenseTensor dense_x( + alloc.get(), + pten::DenseTensorMeta(pten::DataType::FLOAT32, + pten::framework::make_ddim({3, 4}), + pten::DataLayout::NCHW)); auto* dense_x_data = dense_x.mutable_data(paddle::platform::CPUPlace()); for (size_t i = 0; i < 12; ++i) { @@ -82,7 +85,7 @@ TEST(DEV_API, scale_host) { pten::DenseTensor scale(alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, - framework::make_ddim({1}), + pten::framework::make_ddim({1}), pten::DataLayout::NCHW)); scale.data()[0] = 2; float bias = 1; @@ -90,10 +93,11 @@ TEST(DEV_API, scale_host) { // 2. test API pten::CPUContext dev_ctx; - dev_ctx.SetDeviceAllocator( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(paddle::platform::CPUPlace()) - .get()); + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx.Init(); + auto out = pten::Scale(dev_ctx, dense_x, scale, bias, bias_after_scale); diff --git a/paddle/pten/tests/kernels/test_sparse_utils_dev_api.cc b/paddle/pten/tests/kernels/test_sparse_utils_dev_api.cc new file mode 100644 index 00000000000..967609e9a8c --- /dev/null +++ b/paddle/pten/tests/kernels/test_sparse_utils_dev_api.cc @@ -0,0 +1,386 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF NCHW KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include + +#include "paddle/pten/backends/gpu/gpu_context.h" +#include "paddle/pten/common/place.h" +#include "paddle/pten/kernels/copy_kernel.h" +#include "paddle/pten/kernels/sparse/sparse_utils_kernel.h" + +#include "paddle/pten/api/lib/utils/allocator.h" +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/core/kernel_registry.h" + +#include "paddle/fluid/memory/allocation/allocator_facade.h" + +namespace pten { +namespace tests { + +template +inline void CheckResult( + const DeviceContext* dev_ctx, + const SparseCooTensor& coo, + const std::vector non_zero_elements, + const std::vector& non_zero_indices, + const int64_t non_zero_num, + const std::shared_ptr& alloc) { + const DenseTensor real_indices = coo.non_zero_indices(); + const DenseTensor real_elements = coo.non_zero_elements(); + ASSERT_EQ(coo.nnz(), non_zero_num); + +#if defined(PADDLE_WITH_CUDA) + if (coo.place() == pten::GPUPlace()) { + const auto* dev_ctx_cuda = static_cast(dev_ctx); + DenseTensor indices( + alloc.get(), + DenseTensorMeta( + DataType::INT64, real_indices.dims(), real_indices.layout())); + + DenseTensor elements(alloc.get(), + DenseTensorMeta(real_elements.dtype(), + real_elements.dims(), + real_elements.layout())); + pten::Copy(*dev_ctx_cuda, real_indices, true, &indices); + pten::Copy(*dev_ctx_cuda, real_elements, true, &elements); + + int cmp_indices = memcmp(indices.data(), + non_zero_indices.data(), + non_zero_indices.size() * sizeof(IndicesT)); + ASSERT_EQ(cmp_indices, 0); + int cmp_elements = memcmp(elements.data(), + non_zero_elements.data(), + non_zero_elements.size() * sizeof(ValueT)); + ASSERT_EQ(cmp_elements, 0); + } else { +#endif + int cmp_indices = memcmp(real_indices.data(), + non_zero_indices.data(), + non_zero_indices.size() * sizeof(IndicesT)); + ASSERT_EQ(cmp_indices, 0); + int cmp_elements = memcmp(real_elements.data(), + non_zero_elements.data(), + non_zero_elements.size() * sizeof(ValueT)); + ASSERT_EQ(cmp_elements, 0); +#if defined(PADDLE_WITH_CUDA) + } +#endif +} + +template +void TestDenseToSparseCoo(const DenseTensor& dense_x, + const int64_t sparse_dim, + const std::vector& non_zero_data, + const std::vector& indices_data, + const int64_t non_zero_num) { + const auto alloc = std::make_shared( + paddle::platform::CPUPlace()); + + pten::CPUContext dev_ctx_cpu; + dev_ctx_cpu.Init(); + + // 1. test cpu + auto cpu_sparse_out = + sparse::DenseToSparseCoo(dev_ctx_cpu, dense_x, sparse_dim); + CheckResult(&dev_ctx_cpu, + cpu_sparse_out, + non_zero_data, + indices_data, + non_zero_num, + alloc); + +// 2. 
test cuda +#if defined(PADDLE_WITH_CUDA) + // paddle::platform::DeviceContextPool& pool = + // paddle::platform::DeviceContextPool::Instance(); + // auto* dev_ctx_cuda = pool.GetByPlace(paddle::platform::CUDAPlace()); + pten::GPUContext dev_ctx_gpu; + dev_ctx_gpu.PartialInitWithoutAllocator(); + dev_ctx_gpu.SetAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(dev_ctx_gpu.GetPlace(), dev_ctx_gpu.stream()) + .get()); + dev_ctx_gpu.SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(pten::CPUPlace()) + .get()); + dev_ctx_gpu.PartialInitWithAllocator(); + + const auto cuda_alloc = + std::make_shared( + paddle::platform::CUDAPlace()); + DenseTensor d_dense_x( + cuda_alloc.get(), + DenseTensorMeta(dense_x.dtype(), dense_x.dims(), dense_x.layout())); + + pten::Copy(dev_ctx_gpu, dense_x, true, &d_dense_x); + auto sparse_out = + sparse::DenseToSparseCoo(dev_ctx_gpu, d_dense_x, sparse_dim); + CheckResult(&dev_ctx_gpu, + sparse_out, + non_zero_data, + indices_data, + non_zero_num, + alloc); +#endif +} + +TEST(DEV_API, to_sparse_coo) { + const auto alloc = std::make_shared( + paddle::platform::CPUPlace()); + + std::default_random_engine random(time(NULL)); + std::uniform_real_distribution dis(0.0, 1.0); + std::uniform_int_distribution dis_int(4, 64); + const int rows = dis_int(random), cols = dis_int(random); + DenseTensor dense_x( + alloc.get(), + DenseTensorMeta(DataType::FLOAT32, {rows, cols}, DataLayout::NCHW)); + + pten::CPUPlace cpu; + auto* dense_x_data = dense_x.mutable_data(cpu); + std::vector dense_data(rows * cols); + std::vector non_zero_data; + std::vector rows_data, cols_data; + const int64_t sparse_dim = 2; + + const float zero_rate = dis(random); + + int64_t non_zero_num = 0; + for (int i = 0; i < rows; i++) { + for (int j = 0; j < cols; j++) { + bool iszero = dis(random) < zero_rate; + if (iszero) { + dense_data[i * cols + j] = 0.0; + } else { + float data = dis(random); + dense_data[i * cols + j] = data; + non_zero_data.push_back(data); + rows_data.push_back(i); + cols_data.push_back(j); + non_zero_num += 1; + } + } + } + + std::copy( + dense_data.data(), dense_data.data() + dense_data.size(), dense_x_data); + + std::vector indices_data(non_zero_num * 2); + memcpy(&indices_data[0], &rows_data[0], non_zero_num * sizeof(int64_t)); + memcpy(&indices_data[non_zero_num], + &cols_data[0], + non_zero_num * sizeof(int64_t)); + + TestDenseToSparseCoo( + dense_x, sparse_dim, non_zero_data, indices_data, non_zero_num); +} + +TEST(DEV_API, to_sparse_coo_hybird) { + const auto alloc = std::make_shared( + paddle::platform::CPUPlace()); + + DenseTensor dense_x( + alloc.get(), + DenseTensorMeta(DataType::FLOAT32, {3, 3}, DataLayout::NCHW)); + + pten::CPUPlace cpu; + const int64_t sparse_dim = 1; // the non zero element is a vector + auto* dense_x_data = dense_x.mutable_data(cpu); + float dense_data[3][3] = {{0.0, 1.0, 0.0}, {0.0, 0.0, 0.0}, {3.2, 0.0, 0.0}}; + std::vector non_zero_data = { + /*element0(*/ 0.0, 1.0, 0.0 /*)*/, /*element1(*/ 3.2, 0.0, 0.0 /*)*/}; + std::vector indices_data = {0, 2}; + const int64_t non_zero_num = 2; + + std::copy(&dense_data[0][0], &dense_data[0][0] + 9, dense_x_data); + TestDenseToSparseCoo( + dense_x, sparse_dim, non_zero_data, indices_data, non_zero_num); +} + +TEST(DEV_API, to_sparse_coo_fp16) { + const auto alloc = std::make_shared( + paddle::platform::CPUPlace()); + + DenseTensor dense_x( + alloc.get(), + DenseTensorMeta(DataType::FLOAT16, {3, 3}, DataLayout::NCHW)); + + pten::CPUPlace 
cpu; + const int64_t sparse_dim = 2; + const int64_t non_zero_num = 2; + auto* dense_x_data = dense_x.mutable_data(cpu); + float dense_data[3][3] = {{0.0, 1.0, 0.0}, {0.0, 0.0, 0.0}, {3.2, 0.0, 0.0}}; + std::vector data = {1.0, 3.2}; + std::vector non_zero_data(non_zero_num); + for (int i = 0; i < non_zero_num; i++) { + non_zero_data[i] = static_cast(data[i]); + } + std::vector indices_data = {0, 2, 1, 0}; + + std::copy(&dense_data[0][0], &dense_data[0][0] + 9, dense_x_data); + TestDenseToSparseCoo( + dense_x, sparse_dim, non_zero_data, indices_data, non_zero_num); +} + +TEST(DEV_API, to_sparse_coo_batch) { + const auto alloc = std::make_shared( + paddle::platform::CPUPlace()); + + DenseTensor dense_x( + alloc.get(), + DenseTensorMeta(DataType::FLOAT32, {2, 3, 3}, DataLayout::NCHW)); + + pten::CPUPlace cpu; + const int64_t sparse_dim = 3; + const int64_t non_zero_num = 4; + auto* dense_x_data = dense_x.mutable_data(cpu); + float dense_data[2][3][3] = { + {{0.0, 1.0, 0.0}, {0.0, 0.0, 0.0}, {2.0, 0.0, 0.0}}, + {{0.0, 0.0, 0.0}, {0.0, 3.0, 0.0}, {4.0, 0.0, 0.0}}}; + std::vector non_zero_data = {1.0, 2.0, 3.0, 4.0}; + std::vector indices_data = {0, 0, 1, 1, 0, 2, 1, 2, 1, 0, 1, 0}; + /* + 0, 0, 1, 1, + 0, 2, 1, 2, + 1, 0, 1, 0 + */ + + std::copy(&dense_data[0][0][0], &dense_data[0][0][0] + 18, dense_x_data); + TestDenseToSparseCoo( + dense_x, sparse_dim, non_zero_data, indices_data, non_zero_num); +} + +template +void TestSparseCsrToCoo(const DDim& dense_dims, + const std::vector& non_zero_data, + const std::vector& crows_data, + const std::vector& cols_data, + const std::vector& indices_data, + const int64_t non_zero_num) { + int batchs = 1; + int rows = dense_dims[0]; + if (dense_dims.size() == 3) { + batchs = dense_dims[0]; + rows = dense_dims[1]; + } + pten::DenseTensorMeta crows_meta( + DataType::INT64, {batchs * (rows + 1)}, DataLayout::NCHW); + pten::DenseTensorMeta cols_meta( + DataType::INT64, {non_zero_num}, DataLayout::NCHW); + pten::DenseTensorMeta values_meta( + paddle::experimental::CppTypeToDataType::Type(), + {non_zero_num}, + DataLayout::NCHW); + const auto alloc = std::make_shared( + paddle::platform::CPUPlace()); + pten::CPUPlace place; + pten::DenseTensor crows(alloc.get(), crows_meta); + pten::DenseTensor cols(alloc.get(), cols_meta); + pten::DenseTensor values(alloc.get(), values_meta); + memcpy(crows.mutable_data(place), + crows_data.data(), + crows_data.size() * sizeof(int64_t)); + memcpy(cols.mutable_data(place), + cols_data.data(), + cols_data.size() * sizeof(int64_t)); + memcpy(values.mutable_data(place), + non_zero_data.data(), + non_zero_data.size() * sizeof(T)); + pten::SparseCsrTensor csr(crows, cols, values, dense_dims); + + // 1. test cpu + pten::CPUContext dev_ctx_cpu; + auto cpu_sparse_out = sparse::SparseCsrToCoo(dev_ctx_cpu, csr); + CheckResult(&dev_ctx_cpu, + cpu_sparse_out, + non_zero_data, + indices_data, + non_zero_num, + alloc); +// 2. 
test cuda +#if defined(PADDLE_WITH_CUDA) + pten::GPUContext dev_ctx_gpu; + dev_ctx_gpu.PartialInitWithoutAllocator(); + dev_ctx_gpu.SetAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(dev_ctx_gpu.GetPlace(), dev_ctx_gpu.stream()) + .get()); + dev_ctx_gpu.SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(pten::CPUPlace()) + .get()); + dev_ctx_gpu.PartialInitWithAllocator(); + + const auto cuda_alloc = + std::make_shared( + paddle::platform::CUDAPlace()); + // auto& pool = paddle::platform::DeviceContextPool::Instance(); + // auto* dev_ctx_cuda = pool.GetByPlace(paddle::platform::CUDAPlace()); + pten::DenseTensor d_crows(cuda_alloc.get(), crows_meta); + pten::DenseTensor d_cols(cuda_alloc.get(), cols_meta); + pten::DenseTensor d_values(cuda_alloc.get(), values_meta); + pten::Copy(dev_ctx_gpu, crows, true, &d_crows); + pten::Copy(dev_ctx_gpu, cols, true, &d_cols); + pten::Copy(dev_ctx_gpu, values, true, &d_values); + pten::SparseCsrTensor d_csr(d_crows, d_cols, d_values, dense_dims); + auto cuda_sparse_out = sparse::SparseCsrToCoo(dev_ctx_gpu, d_csr); + CheckResult(&dev_ctx_gpu, + cuda_sparse_out, + non_zero_data, + indices_data, + non_zero_num, + alloc); +#endif +} + +TEST(DEV_API, sparse_csr_to_coo) { + DDim dense_dims = framework::make_ddim({3, 3}); + std::vector non_zero_data = {1.0, 2.0, 3.0, 3.2}; + std::vector indices_data = {0, 1, 1, 2, 1, 0, 2, 0}; + std::vector cols_data = {1, 0, 2, 0}; + std::vector crows_data = {0, 1, 3, 4}; + const int64_t non_zero_num = 4; + TestSparseCsrToCoo(dense_dims, + non_zero_data, + crows_data, + cols_data, + indices_data, + non_zero_num); +} + +TEST(DEV_API, sparse_csr_to_coo_batch_and_fp16) { + DDim dense_dims = framework::make_ddim({2, 3, 3}); + std::vector non_zero_data = {1.0, 2.0, 3.0, 3.2, 1.0, 2.0, 3.0, 3.2}; + std::vector cols_data = {1, 0, 2, 0, 1, 0, 2, 0}; + std::vector crows_data = {0, 1, 3, 4, 0, 1, 3, 4}; + std::vector indices_data = {0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 2, + 0, 1, 1, 2, 1, 0, 2, 0, 1, 0, 2, 0}; + const int64_t non_zero_num = 8; + using float16 = pten::dtype::float16; + std::vector non_zero_data_fp16(non_zero_num); + for (int64_t i = 0; i < non_zero_num; i++) { + non_zero_data_fp16[i] = static_cast(non_zero_data[i]); + } + TestSparseCsrToCoo(dense_dims, + non_zero_data_fp16, + crows_data, + cols_data, + indices_data, + non_zero_num); +} + +} // namespace tests +} // namespace pten diff --git a/paddle/pten/tests/kernels/test_sum_dev_api.cc b/paddle/pten/tests/kernels/test_sum_dev_api.cc index 9b48d8908ff..b05d40692af 100644 --- a/paddle/pten/tests/kernels/test_sum_dev_api.cc +++ b/paddle/pten/tests/kernels/test_sum_dev_api.cc @@ -31,10 +31,11 @@ TEST(DEV_API, sum) { // 1. 
create tensor const auto alloc = std::make_unique( paddle::platform::CPUPlace()); - pten::DenseTensor dense_x(alloc.get(), - pten::DenseTensorMeta(pten::DataType::FLOAT32, - framework::make_ddim({3, 4}), - pten::DataLayout::NCHW)); + pten::DenseTensor dense_x( + alloc.get(), + pten::DenseTensorMeta(pten::DataType::FLOAT32, + pten::framework::make_ddim({3, 4}), + pten::DataLayout::NCHW)); auto* dense_x_data = dense_x.mutable_data(paddle::platform::CPUPlace()); @@ -46,10 +47,11 @@ TEST(DEV_API, sum) { std::vector axis = {0, 1}; pten::CPUContext dev_ctx; - dev_ctx.SetDeviceAllocator( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(paddle::platform::CPUPlace()) - .get()); + dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx.Init(); + // 2. test API auto out = pten::Sum(dev_ctx, dense_x, axis, pten::DataType::FLOAT32, false); diff --git a/python/paddle/autograd/functional.py b/python/paddle/autograd/functional.py index b9cceafebaa..c663d37e7f2 100644 --- a/python/paddle/autograd/functional.py +++ b/python/paddle/autograd/functional.py @@ -909,12 +909,32 @@ def vhp(func, inputs, v=None, create_graph=False, allow_unused=False): class Jacobian(object): r""" - Object that represents the Jacobian matrix of a muli-input multi-output - function. + Computes the Jacobian matrix of function `func`, which may take as input + single or multiple tensor typed arguments and output a single tensor or + multiple tensors. + + In case `func` is multi-input and multi-output, i.e., + + func: Callable[[Tensor, ...], [Tensor, ...]] + + `func` is treated as a vector valued function with all its inputs flattened + into a single one dimensional tensor, or a two dimensional tensor with the + first dimension retained as the batching dimension. The same rule applies to + the function outputs. + + Once the Jacobian J is constructed, there are four ways to retrieve the + partial derivatives. + + - J[:], retrieving the full matrix. + + - J[:, j], retrieving the partial derivatives w.r.t. the j'th input + variable. + + - J[i, :], retrieving the partial derivatives w.r.t. the i'th output + variable. - The Jacobian values are lazily evaluated if accessed through indices. - In contrast, slicing access would trigger evaluating the full matrix - if it's not already computed. + - J[i, j], retrieving the partial derivatives w.r.t. the i'th output + variable and the j'th input variable. Examples: .. code-block:: python @@ -984,7 +1004,10 @@ class Jacobian(object): return x.reshape(to) def flatten_all(self, xs): - return paddle.concat([self.flatten(x) for x in xs], axis=-1) + if isinstance(xs, (list, tuple)): + return paddle.concat([self.flatten(x) for x in xs], axis=-1) + else: + return self.flatten(xs) def shape(self): return (self.ydim, self.xdim) @@ -995,23 +1018,23 @@ class Jacobian(object): else: i, j = tup, None - if isinstance(i, slice): - slicing = True - else: - slicing = False + full = isinstance(i, slice) - if slicing: + if full: if 'full' not in self.jacobian: rows = [ self.flatten_all(gradients(self.y[..., i], self.xs)) for i in range(self.ydim) ] - self.jacobian['full'] = paddle.stack(rows) - return self.jacobian['full'][i] + self.jacobian['full'] = full_jacobian = paddle.stack(rows) + else: + full_jacobian = self.jacobian['full'] + + return full_jacobian[i] if j is None else full_jacobian[i][..., j] assert 0 <= i < self.ydim, f"Jacobian index i={i} is not valid." 
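A minimal usage sketch of the indexing behaviour documented in the Jacobian docstring above, assuming the class is importable from paddle.autograd.functional and constructed as Jacobian(func, inputs), as the Hessian helper later in this file does; the function and data below are invented for illustration only.

import paddle
from paddle.autograd.functional import Jacobian

def func(x):
    # single-input, single-output function mapping a [1, 3] tensor to a [1, 3] tensor
    w = paddle.to_tensor([[1.0, 0.0, 0.0], [0.0, 2.0, 0.0], [0.0, 0.0, 3.0]])
    return paddle.matmul(x, w)

x = paddle.to_tensor([[1.0, 2.0, 3.0]], stop_gradient=False)
J = Jacobian(func, x)
full = J[:]      # full Jacobian; evaluated and cached on the first slicing access
row = J[0, :]    # partial derivatives of the 0th output w.r.t. all inputs
entry = J[0, 1]  # partial derivative of the 0th output w.r.t. the 1st input

Note that the flatten_all change above is what lets a single tensor (rather than a list or tuple of tensors) be passed as the input here.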
- assert (j is None) or ( - 0 <= j < self.xdim), f"Jacobian index j={j} is not valid." + assert j is None or isinstance(j, slice) or (0 <= j < self.xdim), ( + f"Jacobian index j={j} is not valid.") if 'full' in self.jacobian: JJ = self.jacobian['full'] else: @@ -1024,3 +1047,17 @@ class Jacobian(object): return JJ[i] else: return JJ[i][..., j] + + +class Hessian(object): + def __init__(self, func, inputs, batch=False): + f_x = lambda xs: Jacobian(func, xs, batch=batch)[0] + self.symbolic = Jacobian(f_x, inputs, batch=batch) + self.xs = inputs + self.batch = batch + + def __getitem__(self, tup): + return self.symbolic[tup] + + def shape(self): + return self.symbolic.shape() diff --git a/python/paddle/distributed/auto_parallel/completion.py b/python/paddle/distributed/auto_parallel/completion.py index 54491f9e6c1..45ea9a3c9dd 100644 --- a/python/paddle/distributed/auto_parallel/completion.py +++ b/python/paddle/distributed/auto_parallel/completion.py @@ -325,10 +325,8 @@ class Completer: def complete_forward_annotation(self, serial_main_program): """ Complete annotation for the partial annotated serial_main_program. - Arguments: serial_main_program: partial annotated serial_main_program. - Returns: serial_main_program: completed annotated serial_main_program. """ @@ -443,6 +441,33 @@ class Completer: dist_op_context.grad_op_id_to_op_id[grad_op.desc.id()]) assert forward_op is not None + if grad_op.type == "concat" and forward_op.type == "split": + forward_op_dist_attr = dist_context.get_op_dist_attr_for_program( + forward_op) + output_var = vars[grad_op.desc.output('Out')[0]] + split_input_var_name = forward_op.input("X")[0] + ref_dims_mapping = forward_op_dist_attr.get_input_dims_mapping( + split_input_var_name) + ref_mesh = forward_op_dist_attr.process_mesh + + grad_op_dist_attr = OperatorDistributedAttribute() + for input_name in grad_op.input_arg_names: + grad_op_dist_attr.set_input_dims_mapping( + input_name, ref_dims_mapping) + + output_var_dist_attr = TensorDistributedAttribute() + output_var_dist_attr.dims_mapping = ref_dims_mapping + output_var_dist_attr.process_mesh = ref_mesh + dist_context.set_tensor_dist_attr_for_program( + output_var, output_var_dist_attr) + + grad_op_dist_attr.set_output_dims_mapping(output_var.name, + ref_dims_mapping) + grad_op_dist_attr.process_mesh = ref_mesh + dist_context.set_op_dist_attr_for_program(grad_op, + grad_op_dist_attr) + continue + # op dist attr forward_op_dist_attr = self._dist_context.get_op_dist_attr_for_program( forward_op) diff --git a/python/paddle/distributed/auto_parallel/operators/__init__.py b/python/paddle/distributed/auto_parallel/operators/__init__.py index ea743df8d64..9f84df2d896 100644 --- a/python/paddle/distributed/auto_parallel/operators/__init__.py +++ b/python/paddle/distributed/auto_parallel/operators/__init__.py @@ -26,3 +26,4 @@ from . import dist_default from . import dist_eltwise from . import dist_check_finite_and_unscale from . import dist_update_loss_scaling +from . import dist_split diff --git a/python/paddle/distributed/auto_parallel/operators/dist_split.py b/python/paddle/distributed/auto_parallel/operators/dist_split.py new file mode 100644 index 00000000000..289da80e1a7 --- /dev/null +++ b/python/paddle/distributed/auto_parallel/operators/dist_split.py @@ -0,0 +1,115 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .common import DistributedOperatorImplContainer +from .common import DistributedOperatorImpl +from .common import register_distributed_operator_impl_container +from .common import register_distributed_operator_impl +from ..utils import is_dim_shard +from ..utils import is_valid_list_index +from ..utils import compute_compatible_dim_mapping +from ..utils import compute_compatible_dims_mapping +from ..utils import compute_compatible_and_update_dim_mapping +from .dist_default import DistributedDefaultImpl0 + + +class DistributedSplit(DistributedOperatorImplContainer): + def __init__(self, op_type): + super(DistributedSplit, self).__init__(op_type) + + +register_distributed_operator_impl_container(DistributedSplit("split")) + + +class DistributedSplitImpl(DistributedOperatorImpl): + def __init__(self, name): + super(DistributedSplitImpl, self).__init__(name) + self._forward_implemented = True + self._backward_implemented = True + + def is_input_compatible(self, dist_op): + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + x_name = op_desc.input('X')[0] + axis = op_desc.attr('axis') + x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name) + + if is_dim_shard(x_dims_mapping[axis]): + return False + + return True + + def is_output_compatible(self, dist_op): + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + out_names = op_desc.output('Out') + axis = op_desc.attr('axis') + for out_name in out_names: + out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name) + if is_dim_shard(out_dims_mapping[axis]): + return False + + return True + + def is_compatible(self, dist_op): + if (not self.is_input_compatible(dist_op)) or \ + (not self.is_output_compatible(dist_op)): + return False + + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + x_name = op_desc.input('X')[0] + axis = op_desc.attr('axis') + out_names = op_desc.output('Out') + x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name) + for out_name in out_names: + out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name) + if x_dims_mapping != out_dims_mapping: + return False + + return True + + def update_dims_mapping(self, dist_op): + changed = False + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + x_name = op_desc.input('X')[0] + out_names = op_desc.output('Out') + x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name) + + for out_name in out_names: + out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name) + for i in range(len(x_dims_mapping)): + dim_changed = compute_compatible_and_update_dim_mapping( + [x_dims_mapping, out_dims_mapping], [i, i]) + if dim_changed: + changed = True + + return changed + + def is_auto_compatible(self, dist_op): + raise NotImplementedError( + "Auto Search is not supported by dist split yet.") + + @staticmethod + def forward(ctx, *args, **kwargs): + DistributedDefaultImpl0.forward(ctx, *args, **kwargs) + + @staticmethod + def backward(ctx, *args, **kwargs): + DistributedDefaultImpl0.backward(ctx, *args, **kwargs) + + +register_distributed_operator_impl("split", + 
DistributedSplitImpl("replicate_in_axis")) diff --git a/python/paddle/distributed/auto_parallel/utils.py b/python/paddle/distributed/auto_parallel/utils.py index f81291fa64f..75e0ae251ef 100644 --- a/python/paddle/distributed/auto_parallel/utils.py +++ b/python/paddle/distributed/auto_parallel/utils.py @@ -1002,7 +1002,7 @@ def set_grad_var_shape(program, dist_context): if op.type in ["check_finite_and_unscale", "update_loss_scaling"]: break - if op.type in ["sum"]: + if op.type in ["sum", "concat"]: continue if int(op.attr('op_role')) == int(OpRole.Backward): op_dist_attr = dist_context.get_op_dist_attr_for_program(op) diff --git a/python/paddle/distributed/fleet/dataset/dataset.py b/python/paddle/distributed/fleet/dataset/dataset.py index 84d1f21d154..235f4ece62d 100644 --- a/python/paddle/distributed/fleet/dataset/dataset.py +++ b/python/paddle/distributed/fleet/dataset/dataset.py @@ -141,6 +141,23 @@ class DatasetBase(object): def _set_input_type(self, input_type): self.proto_desc.input_type = input_type + def _set_uid_slot(self, uid_slot): + """ + Set user slot name. + + Examples: + .. code-block:: python + + import paddle + dataset = paddle.distributed.fleet.DatasetBase() + dataset._set_uid_slot('6048') + + Args: + set_uid_slot(string): user slot name + """ + multi_slot = self.proto_desc.multi_slot_desc + multi_slot.uid_slot = uid_slot + def _set_use_var(self, var_list): """ Set Variables which you will use. @@ -738,6 +755,23 @@ class InMemoryDataset(DatasetBase): self.merge_by_lineid = True self.parse_ins_id = True + def _set_shuffle_by_uid(self, enable_shuffle_uid): + """ + Set if Dataset need to shuffle by uid. + + Args: + set_shuffle_by_uid(bool): if shuffle according to uid or not + + Examples: + .. code-block:: python + + import paddle + paddle.enable_static() + dataset = paddle.distributed.InMemoryDataset() + dataset._set_shuffle_by_uid(True) + """ + self.dataset.set_shuffle_by_uid(enable_shuffle_uid) + def _set_generate_unique_feasigns(self, generate_uni_feasigns, shard_num): self.dataset.set_generate_unique_feasigns(generate_uni_feasigns) self.gen_uni_feasigns = generate_uni_feasigns diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py index fc5b93c6e25..ea17f96f7a1 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py @@ -70,7 +70,7 @@ class ShardingOptimizerStage2(Optimizer): device="gpu", **kw): - # super().__init__(optim._learning_rate, params, kw) + super().__init__(optim._learning_rate, params, kw) # Segmentation information self._dtype_rank_params = OrderedDict( @@ -363,6 +363,10 @@ class ShardingOptimizerStage2(Optimizer): # Synchronize all the updated shards in between the ranks self._broadcast_params() + def minimize(self): + raise RuntimeError( + "optimizer.minimize() not support now, please use optimizer.step()") + def _clear_cache(self): self.__segment_params.clear() self._dtype_rank_params.clear() diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py index 9d7bd937411..484cd223949 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py @@ 
-506,7 +506,13 @@ class ShardingStage3(nn.Layer): else: opt_step() + def _opt_minimize(self): + raise RuntimeError( + "optimizer.minimize() not support now, please use optimizer.step()" + ) + self._optim.step = MethodType(_opt_step, self._optim) + self._optim.minimize = MethodType(_opt_minimize, self._optim) def _redefine_opt_clear(self): clear_func = self._clear_gradients diff --git a/python/paddle/distributed/fleet/runtime/the_one_ps.py b/python/paddle/distributed/fleet/runtime/the_one_ps.py index c561c250678..cc81f8b3e9e 100644 --- a/python/paddle/distributed/fleet/runtime/the_one_ps.py +++ b/python/paddle/distributed/fleet/runtime/the_one_ps.py @@ -943,7 +943,7 @@ class TheOnePSRuntime(RuntimeBase): ctx.origin_varnames()[0]] if self.compiled_strategy.is_geo_mode(): - table.table_class = "SparseGeoTable" + table.table_class = "MemorySparseGeoTable" else: all_table_proto = self.context[ "user_defined_strategy"].sparse_table_configs @@ -1306,6 +1306,7 @@ class TheOnePSRuntime(RuntimeBase): is_dense=True, split_dense_table=self.role_maker._is_heter_parameter_server_mode, use_origin_program=True) + # TODO(zhaocaibei123): for GEO: should call GeoCommunicator::RecvDense self._communicator.pull_dense(denses) generate_vars = self.context[ diff --git a/python/paddle/distributed/metric/__init__.py b/python/paddle/distributed/metric/__init__.py new file mode 100644 index 00000000000..a5b0f4cb49d --- /dev/null +++ b/python/paddle/distributed/metric/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .metrics import init_metric # noqa: F401 +from .metrics import print_auc # noqa: F401 diff --git a/python/paddle/distributed/metric/metrics.py b/python/paddle/distributed/metric/metrics.py new file mode 100644 index 00000000000..5685b6f053e --- /dev/null +++ b/python/paddle/distributed/metric/metrics.py @@ -0,0 +1,138 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
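+
+# A minimal, illustrative sketch of the YAML layout that ``init_metric``
+# below expects (assumed from the keys this module reads; the file name,
+# metric names and values are hypothetical, not a shipped config):
+#
+#   monitors:
+#     - method: AucCalculator
+#       name: join_auc
+#       label: click
+#       target: ctr_pred
+#       phase: JOINING
+#     - method: WuAucCalculator
+#       name: day_wuauc
+#       label: click
+#       target: ctr_pred
+#       uid: "6048"
+#       phase: JOINING
+#
+# Each entry's ``method`` selects which branch of ``init_metric`` is taken;
+# any method not listed there falls through to the default call.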
+ +import sys +import yaml +import paddle.fluid as fluid +import logging +from paddle.distributed.utils import get_logger + +__all__ = [] +logger = get_logger(logging.INFO, name="metrics") + + +# read metric config from yaml and init MetricMsg in fleet_wrapper +def init_metric(metric_ptr, + metric_yaml_path, + cmatch_rank_var="", + mask_var="", + uid_var="", + phase=-1, + cmatch_rank_group="", + ignore_rank=False, + bucket_size=1000000): + yaml_fobj = open(metric_yaml_path) + if sys.version.startswith('2.7.13'): + content = yaml.load(yaml_fobj) + else: + content = yaml.load(yaml_fobj, Loader=yaml.FullLoader) + + print("yaml metric config: \n") + print(content) + + metric_runner_list = content['monitors'] + if not metric_runner_list: + metric_runner_list = [] + + for metric_runner in metric_runner_list: + is_join = metric_runner['phase'] == 'JOINING' + phase = 1 if is_join else 0 + + if metric_runner['method'] == 'AucCalculator': + metric_ptr.init_metric( + metric_runner['method'], metric_runner['name'], + metric_runner['label'], metric_runner['target'], + cmatch_rank_var, mask_var, uid_var, phase, cmatch_rank_group, + ignore_rank, bucket_size) + elif metric_runner['method'] == 'MultiTaskAucCalculator': + metric_ptr.init_metric( + metric_runner['method'], metric_runner['name'], + metric_runner['label'], metric_runner['target'], + metric_runner['cmatch_var'], mask_var, uid_var, phase, + metric_runner['cmatch_group'], ignore_rank, bucket_size) + elif metric_runner['method'] == 'CmatchRankAucCalculator': + metric_ptr.init_metric( + metric_runner['method'], metric_runner['name'], + metric_runner['label'], metric_runner['target'], + metric_runner['cmatch_var'], mask_var, uid_var, phase, + metric_runner['cmatch_group'], metric_runner['ignore_rank'], + bucket_size) + elif metric_runner['method'] == 'MaskAucCalculator': + metric_ptr.init_metric( + metric_runner['method'], metric_runner['name'], + metric_runner['label'], metric_runner['target'], + cmatch_rank_var, metric_runner['mask'], uid_var, phase, + cmatch_rank_group, ignore_rank, bucket_size) + elif metric_runner['method'] == 'CmatchRankMaskAucCalculator': + metric_ptr.init_metric( + metric_runner['method'], metric_runner['name'], + metric_runner['label'], metric_runner['target'], + metric_runner['cmatch_var'], metric_runner['mask'], uid_var, + phase, metric_runner['cmatch_group'], + metric_runner['ignore_rank'], bucket_size) + elif metric_runner['method'] == 'WuAucCalculator': + metric_ptr.init_metric( + metric_runner['method'], metric_runner['name'], + metric_runner['label'], metric_runner['target'], + cmatch_rank_var, mask_var, metric_runner['uid'], phase, + cmatch_rank_group, ignore_rank, bucket_size) + else: + metric_ptr.init_metric( + metric_runner['method'], metric_runner['name'], + metric_runner['label'], metric_runner['target'], + cmatch_rank_var, mask_var, phase, cmatch_rank_group, + ignore_rank, bucket_size) + + +def print_metric(metric_ptr, name): + """ + print the metric value. 
Print directly in back-end + """ + if name.find("wuauc") != -1: + metric = metric_ptr.get_wuauc_metric_msg(name) + monitor_msg = "%s: User Count=%.0f INS Count=%.0f UAUC=%.6f WUAUC=%.6f "\ + % (name, metric[0], metric[1], metric[4], metric[5]) + else: + metric = metric_ptr.get_metric_msg(name) + monitor_msg = "%s: AUC=%.6f BUCKET_ERROR=%.6f MAE=%.6f RMSE=%.6f "\ + "Actual CTR=%.6f Predicted CTR=%.6f COPC=%.6f INS Count=%.0f"\ + % (name, metric[0], metric[1], metric[2], metric[3], metric[4], + metric[5], metric[6], metric[7]) + # logger.info(monitor_msg) + return monitor_msg + + +def print_auc(metric_ptr, is_day, phase="all"): + """ + print metric according to stage and phase + """ + if is_day is True: + stage = "day" + stage_num = -1 + else: + stage = "pass" + stage_num = 1 if phase == "join" else 0 + metric_results = [] + + name_list = metric_ptr.get_metric_name_list(stage_num) + if phase == "all": + for name in name_list: + if name.find(stage) != -1: + metric_results.append(print_metric(metric_ptr, name=name)) + else: + for name in name_list: + if name.find(stage) != -1 and name.find(phase) != -1: + metric_results.append(print_metric(metric_ptr, name=name)) + + return metric_results diff --git a/python/paddle/distributed/passes/cpp_pass.py b/python/paddle/distributed/passes/cpp_pass.py index 97243b805cd..da409cfe333 100644 --- a/python/paddle/distributed/passes/cpp_pass.py +++ b/python/paddle/distributed/passes/cpp_pass.py @@ -52,3 +52,16 @@ class FuseBatchNormAddActPass(CPPPassWrapper): def _type(self): return PassType.FUSION_OPT + + +@register_pass("fuse_relu_depthwise_conv") +class FuseReluDepthwiseConvPass(CPPPassWrapper): + def __init__(self): + super(FuseReluDepthwiseConvPass, self).__init__() + + @property + def cpp_name(self): + return "fuse_relu_depthwise_conv_pass" + + def _type(self): + return PassType.FUSION_OPT diff --git a/python/paddle/distribution/__init__.py b/python/paddle/distribution/__init__.py index 6bc9048a1dd..334b0d4a3a5 100644 --- a/python/paddle/distribution/__init__.py +++ b/python/paddle/distribution/__init__.py @@ -18,6 +18,7 @@ from .dirichlet import Dirichlet from .distribution import Distribution from .exponential_family import ExponentialFamily from .kl import kl_divergence, register_kl +from .multinomial import Multinomial from .normal import Normal from .uniform import Uniform @@ -27,8 +28,9 @@ __all__ = [ # noqa 'Dirichlet', 'Distribution', 'ExponentialFamily', + 'Multinomial', 'Normal', 'Uniform', 'kl_divergence', - 'register_kl' + 'register_kl', ] diff --git a/python/paddle/distribution/beta.py b/python/paddle/distribution/beta.py index 057dff2866b..82f3dced107 100644 --- a/python/paddle/distribution/beta.py +++ b/python/paddle/distribution/beta.py @@ -21,7 +21,14 @@ from .exponential_family import ExponentialFamily class Beta(ExponentialFamily): r""" - Beta distribution parameterized by alpha and beta + Beta distribution parameterized by alpha and beta. + + In probability theory and statistics, the beta distribution is a family of + continuous probability distributions defined on the interval [0, 1] + parameterized by two positive shape parameters, denoted by alpha and beta, + that appear as exponents of the random variable and control the shape of + the distribution. The generalization to multiple variables is called a + Dirichlet distribution. The probability density function (pdf) is @@ -37,8 +44,14 @@ class Beta(ExponentialFamily): Args: - alpha (float|Tensor): alpha parameter of beta distribution, positive(>0). 
- beta (float|Tensor): beta parameter of beta distribution, positive(>0). + alpha (float|Tensor): Alpha parameter. It supports broadcast semantics. + The value of alpha must be positive. When the parameter is a tensor, + it represents multiple independent distribution with + a batch_shape(refer to ``Distribution`` ). + beta (float|Tensor): Beta parameter. It supports broadcast semantics. + The value of beta must be positive(>0). When the parameter is tensor, + it represent multiple independent distribution with + a batch_shape(refer to ``Distribution`` ). Examples: @@ -86,56 +99,56 @@ class Beta(ExponentialFamily): @property def mean(self): - """mean of beta distribution. + """Mean of beta distribution. """ return self.alpha / (self.alpha + self.beta) @property def variance(self): - """variance of beat distribution + """Variance of beat distribution """ sum = self.alpha + self.beta return self.alpha * self.beta / (sum.pow(2) * (sum + 1)) def prob(self, value): - """probability density funciotn evaluated at value + """Probability density funciotn evaluated at value Args: - value (Tensor): value to be evaluated. + value (Tensor): Value to be evaluated. Returns: - Tensor: probability. + Tensor: Probability. """ return paddle.exp(self.log_prob(value)) def log_prob(self, value): - """log probability density funciton evaluated at value + """Log probability density funciton evaluated at value Args: - value (Tensor): value to be evaluated + value (Tensor): Value to be evaluated Returns: - Tensor: log probability. + Tensor: Log probability. """ return self._dirichlet.log_prob(paddle.stack([value, 1.0 - value], -1)) def sample(self, shape=()): - """sample from beta distribution with sample shape. + """Sample from beta distribution with sample shape. Args: - shape (Sequence[int], optional): sample shape. + shape (Sequence[int], optional): Sample shape. Returns: - sampled data with shape `sample_shape` + `batch_shape` + `event_shape`. + Sampled data with shape `sample_shape` + `batch_shape` + `event_shape`. """ shape = shape if isinstance(shape, tuple) else tuple(shape) - return paddle.squeeze(self._dirichlet.sample(shape)[..., 0]) + return paddle.squeeze(self._dirichlet.sample(shape)[..., 0], axis=-1) def entropy(self): - """entropy of dirichlet distribution + """Entropy of dirichlet distribution Returns: - Tensor: entropy. + Tensor: Entropy. """ return self._dirichlet.entropy() diff --git a/python/paddle/distribution/categorical.py b/python/paddle/distribution/categorical.py index 151e060e29b..8cce6a54a3b 100644 --- a/python/paddle/distribution/categorical.py +++ b/python/paddle/distribution/categorical.py @@ -1,11 +1,11 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -16,6 +16,7 @@ import math import warnings import numpy as np +import paddle from paddle import _C_ops from ..fluid import core @@ -123,7 +124,7 @@ class Categorical(Distribution): Returns: Tensor: A tensor with prepended dimensions shape. - + Examples: .. 
code-block:: python @@ -153,14 +154,22 @@ class Categorical(Distribution): logits_shape = list(self.logits.shape) if len(logits_shape) > 1: sample_shape = shape + logits_shape[:-1] - logits = nn.reshape(self.logits, - [np.prod(logits_shape[:-1]), logits_shape[-1]]) + logits = paddle.reshape( + self.logits, [np.prod(logits_shape[:-1]), logits_shape[-1]]) else: sample_shape = shape logits = self.logits - sample_index = multinomial(logits, num_samples, True) - return nn.reshape(sample_index, sample_shape, name=name) + sample_index = multinomial( + self._logits_to_probs(logits), num_samples, True) + + # multinomial sample shape is (logits.shape[:-1], num_samples), need to + # tanspose to (num_samples, logits.shape[:-1]) + permute = list(range(sample_index.dim())) + permute.insert(0, permute.pop(-1)) + sample_index = sample_index.transpose(permute) + + return paddle.reshape(sample_index, sample_shape, name=name) def kl_divergence(self, other): """The KL-divergence between two Categorical distributions. @@ -170,7 +179,7 @@ class Categorical(Distribution): Returns: Tensor: kl-divergence between two Categorical distributions. - + Examples: .. code-block:: python @@ -200,19 +209,20 @@ class Categorical(Distribution): if not in_dygraph_mode(): check_type(other, 'other', Categorical, 'kl_divergence') - logits = self.logits - nn.reduce_max(self.logits, dim=-1, keep_dim=True) - other_logits = other.logits - nn.reduce_max( - other.logits, dim=-1, keep_dim=True) + logits = self.logits - \ + paddle.max(self.logits, axis=-1, keepdim=True) + other_logits = other.logits - paddle.max( + other.logits, axis=-1, keepdim=True) e_logits = ops.exp(logits) other_e_logits = ops.exp(other_logits) - z = nn.reduce_sum(e_logits, dim=-1, keep_dim=True) - other_z = nn.reduce_sum(other_e_logits, dim=-1, keep_dim=True) + z = paddle.sum(e_logits, axis=-1, keepdim=True) + other_z = paddle.sum(other_e_logits, axis=-1, keepdim=True) prob = e_logits / z - kl = nn.reduce_sum( - prob * (logits - nn.log(z) - other_logits + nn.log(other_z)), - dim=-1, - keep_dim=True, - name=name) + kl = paddle.sum(prob * ( + logits - paddle.log(z) - other_logits + paddle.log(other_z)), + axis=-1, + keepdim=True, + name=name) return kl @@ -221,7 +231,7 @@ class Categorical(Distribution): Returns: Tensor: Shannon entropy of Categorical distribution. The data type is float32. - + Examples: .. code-block:: python @@ -241,14 +251,14 @@ class Categorical(Distribution): """ name = self.name + '_entropy' - logits = self.logits - nn.reduce_max(self.logits, dim=-1, keep_dim=True) + logits = self.logits - \ + paddle.max(self.logits, axis=-1, keepdim=True) e_logits = ops.exp(logits) - z = nn.reduce_sum(e_logits, dim=-1, keep_dim=True) + z = paddle.sum(e_logits, axis=-1, keepdim=True) prob = e_logits / z - neg_entropy = nn.reduce_sum( - prob * (logits - nn.log(z)), dim=-1, keep_dim=True) - entropy = nn.scale(neg_entropy, scale=-1.0, name=name) + neg_entropy = paddle.sum(prob * (logits - paddle.log(z)), axis=-1) + entropy = paddle.scale(neg_entropy, scale=-1.0, name=name) return entropy def probs(self, value): @@ -266,7 +276,7 @@ class Categorical(Distribution): Returns: Tensor: probability according to the category index. - + Examples: .. 
code-block:: python @@ -288,33 +298,33 @@ class Categorical(Distribution): """ name = self.name + '_probs' - dist_sum = nn.reduce_sum(self.logits, dim=-1, keep_dim=True) + dist_sum = paddle.sum(self.logits, axis=-1, keepdim=True) prob = self.logits / dist_sum shape = list(prob.shape) value_shape = list(value.shape) if len(shape) == 1: num_value_in_one_dist = np.prod(value_shape) - index_value = nn.reshape(value, [num_value_in_one_dist, 1]) + index_value = paddle.reshape(value, [num_value_in_one_dist, 1]) index = index_value else: num_dist = np.prod(shape[:-1]) num_value_in_one_dist = value_shape[-1] - prob = nn.reshape(prob, [num_dist, shape[-1]]) + prob = paddle.reshape(prob, [num_dist, shape[-1]]) if len(value_shape) == 1: value = nn.expand(value, [num_dist]) value_shape = shape[:-1] + value_shape - index_value = nn.reshape(value, [num_dist, -1, 1]) + index_value = paddle.reshape(value, [num_dist, -1, 1]) if shape[:-1] != value_shape[:-1]: raise ValueError( "shape of value {} must match shape of logits {}".format( str(value_shape[:-1]), str(shape[:-1]))) - index_prefix = nn.unsqueeze( + index_prefix = paddle.unsqueeze( arange( - num_dist, dtype=index_value.dtype), axes=-1) + num_dist, dtype=index_value.dtype), axis=-1) index_prefix = nn.expand(index_prefix, [1, num_value_in_one_dist]) - index_prefix = nn.unsqueeze(index_prefix, axes=-1) + index_prefix = paddle.unsqueeze(index_prefix, axis=-1) if index_value.dtype != index_prefix.dtype: tensor.cast(index_prefix, dtype=index_value.dtype) @@ -322,7 +332,7 @@ class Categorical(Distribution): # value is the category index to search for the corresponding probability. select_prob = gather_nd(prob, index) - return nn.reshape(select_prob, value_shape, name=name) + return paddle.reshape(select_prob, value_shape, name=name) def log_prob(self, value): """Log probabilities of the given category. Refer to ``probs`` method. @@ -332,7 +342,7 @@ class Categorical(Distribution): Returns: Tensor: Log probability. - + Examples: .. code-block:: python @@ -354,4 +364,4 @@ class Categorical(Distribution): """ name = self.name + '_log_prob' - return nn.log(self.probs(value), name=name) + return paddle.log(self.probs(value), name=name) diff --git a/python/paddle/distribution/dirichlet.py b/python/paddle/distribution/dirichlet.py index 2ef38a5a52d..99f2e3b5a5f 100644 --- a/python/paddle/distribution/dirichlet.py +++ b/python/paddle/distribution/dirichlet.py @@ -22,23 +22,37 @@ from .exponential_family import ExponentialFamily class Dirichlet(ExponentialFamily): r""" - Dirichlet distribution with parameter concentration + Dirichlet distribution with parameter "concentration". The Dirichlet distribution is defined over the `(k-1)-simplex` using a positive, lenght-k vector concentration(`k > 1`). The Dirichlet is identically the Beta distribution when `k = 2`. + For independent and identically distributed continuous random variable + :math:`\boldsymbol X \in R_k` , and support + :math:`\boldsymbol X \in (0,1), ||\boldsymbol X|| = 1` , The probability density function (pdf) is .. math:: + + f(\boldsymbol X; \boldsymbol \alpha) = \frac{1}{B(\boldsymbol \alpha)} \prod_{i=1}^{k}x_i^{\alpha_i-1} - f(x_1,...,x_k; \alpha_1,...,\alpha_k) = \frac{1}{B(\alpha)} \prod_{i=1}^{k}x_i^{\alpha_i-1} + where :math:`\boldsymbol \alpha = {\alpha_1,...,\alpha_k}, k \ge 2` is + parameter, the normalizing constant is the multivariate beta function. - The normalizing constant is the multivariate beta function. + .. 
math:: + + B(\boldsymbol \alpha) = \frac{\prod_{i=1}^{k} \Gamma(\alpha_i)}{\Gamma(\alpha_0)} + + :math:`\alpha_0=\sum_{i=1}^{k} \alpha_i` is the sum of parameters, + :math:`\Gamma(\alpha)` is gamma function. Args: - concentration (Tensor): concentration parameter of dirichlet - distribution + concentration (Tensor): "Concentration" parameter of dirichlet + distribution, also called :math:`\alpha`. When it's over one + dimension, the last axis denotes the parameter of distribution, + ``event_shape=concentration.shape[-1:]`` , axes other than last are + condsider batch dimensions with ``batch_shape=concentration.shape[:-1]`` . Examples: @@ -68,59 +82,59 @@ class Dirichlet(ExponentialFamily): @property def mean(self): - """mean of Dirichelt distribution. + """Mean of Dirichelt distribution. Returns: - mean value of distribution. + Mean value of distribution. """ return self.concentration / self.concentration.sum(-1, keepdim=True) @property def variance(self): - """variance of Dirichlet distribution. + """Variance of Dirichlet distribution. Returns: - variance value of distribution. + Variance value of distribution. """ concentration0 = self.concentration.sum(-1, keepdim=True) return (self.concentration * (concentration0 - self.concentration)) / ( concentration0.pow(2) * (concentration0 + 1)) def sample(self, shape=()): - """sample from dirichlet distribution. + """Sample from dirichlet distribution. Args: - shape (Sequence[int], optional): sample shape. Defaults to empty tuple. + shape (Sequence[int], optional): Sample shape. Defaults to empty tuple. """ shape = shape if isinstance(shape, tuple) else tuple(shape) return _dirichlet(self.concentration.expand(self._extend_shape(shape))) def prob(self, value): - """Probability density function(pdf) evaluated at value. + """Probability density function(PDF) evaluated at value. Args: - value (Tensor): value to be evaluated. + value (Tensor): Value to be evaluated. Returns: - pdf evaluated at value. + PDF evaluated at value. """ return paddle.exp(self.log_prob(value)) def log_prob(self, value): - """log of probability densitiy function. + """Log of probability densitiy function. Args: - value (Tensor): value to be evaluated. + value (Tensor): Value to be evaluated. """ return ((paddle.log(value) * (self.concentration - 1.0) ).sum(-1) + paddle.lgamma(self.concentration.sum(-1)) - paddle.lgamma(self.concentration).sum(-1)) def entropy(self): - """entropy of Dirichlet distribution. + """Entropy of Dirichlet distribution. Returns: - entropy of distribution. + Entropy of distribution. """ concentration0 = self.concentration.sum(-1) k = self.concentration.shape[-1] diff --git a/python/paddle/distribution/distribution.py b/python/paddle/distribution/distribution.py index 9ba35cc4d3d..53de5254983 100644 --- a/python/paddle/distribution/distribution.py +++ b/python/paddle/distribution/distribution.py @@ -25,6 +25,7 @@ import math import warnings import numpy as np +import paddle from paddle import _C_ops from ..fluid import core @@ -102,7 +103,13 @@ class Distribution(object): raise NotImplementedError def probs(self, value): - """Probability density/mass function.""" + """Probability density/mass function. + + .. note:: + + This method will be deprecated in the future, please use `prob` + instead. 
+ """ raise NotImplementedError def _extend_shape(self, sample_shape): @@ -212,3 +219,22 @@ class Distribution(object): ) return tensor.cast(value, dtype=param.dtype) return value + + def _probs_to_logits(self, probs, is_binary=False): + r""" + Converts probabilities into logits. For the binary, probs denotes the + probability of occurrence of the event indexed by `1`. For the + multi-dimensional, values of last axis denote the probabilities of + occurrence of each of the events. + """ + return (paddle.log(probs) - paddle.log1p(-probs)) \ + if is_binary else paddle.log(probs) + + def _logits_to_probs(self, logits, is_binary=False): + r""" + Converts logits into probabilities. For the binary, each value denotes + log odds, whereas for the multi-dimensional case, the values along the + last dimension denote the log probabilities of the events. + """ + return paddle.nn.functional.sigmoid(logits) \ + if is_binary else paddle.nn.functional.softmax(logits, axis=-1) diff --git a/python/paddle/distribution/exponential_family.py b/python/paddle/distribution/exponential_family.py index 0ce743efe85..cea8c5970dd 100644 --- a/python/paddle/distribution/exponential_family.py +++ b/python/paddle/distribution/exponential_family.py @@ -33,6 +33,8 @@ class ExponentialFamily(Distribution): where :math:`\theta` denotes the natural parameters, :math:`t(x)` denotes the sufficient statistic, :math:`F(\theta)` is the log normalizer function for a given family and :math:`k(x)` is the carrier measure. + + Distribution belongs to exponential family referring to https://en.wikipedia.org/wiki/Exponential_family """ @property diff --git a/python/paddle/distribution/kl.py b/python/paddle/distribution/kl.py index ff6a8cde456..50a76abce56 100644 --- a/python/paddle/distribution/kl.py +++ b/python/paddle/distribution/kl.py @@ -43,10 +43,7 @@ def kl_divergence(p, q): q (Distribution): ``Distribution`` object. Returns: - Tensor: batchwise KL-divergence between distribution p and q. - - Raises: - NotImplementedError: can't find register function for KL(p||Q). + Tensor: Batchwise KL-divergence between distribution p and q. Examples: @@ -68,9 +65,15 @@ def kl_divergence(p, q): def register_kl(cls_p, cls_q): """Decorator for register a KL divergence implemention function. + The ``kl_divergence(p, q)`` function will search concrete implemention + functions registered by ``register_kl``, according to multi-dispatch pattern. + If an implemention function is found, it will return the result, otherwise, + it will raise ``NotImplementError`` exception. Users can register + implemention funciton by the decorator. + Args: - cls_p(Distribution): subclass derived from ``Distribution``. - cls_q(Distribution): subclass derived from ``Distribution``. + cls_p(Distribution): Subclass derived from ``Distribution``. + cls_q(Distribution): Subclass derived from ``Distribution``. Examples: .. 
code-block:: python @@ -93,7 +96,7 @@ def register_kl(cls_p, cls_q): def _dispatch(cls_p, cls_q): - """multiple dispatch into concrete implement function""" + """Multiple dispatch into concrete implement function""" # find all matched super class pair of p and q matchs = [(super_p, super_q) for super_p, super_q in _REGISTER_TABLE @@ -167,8 +170,7 @@ def _kl_uniform_uniform(p, q): @register_kl(ExponentialFamily, ExponentialFamily) def _kl_expfamily_expfamily(p, q): - """compute kl-divergence using `Bregman divergences` - https://www.lix.polytechnique.fr/~nielsen/EntropyEF-ICIP2010.pdf + """Compute kl-divergence using `Bregman divergences `_ """ if not type(p) == type(q): raise NotImplementedError @@ -205,5 +207,5 @@ def _kl_expfamily_expfamily(p, q): def _sum_rightmost(value, n): - """sum value along rightmost n dim""" + """Sum elements along rightmost n dim""" return value.sum(list(range(-n, 0))) if n > 0 else value diff --git a/python/paddle/distribution/multinomial.py b/python/paddle/distribution/multinomial.py new file mode 100644 index 00000000000..c4110040fd1 --- /dev/null +++ b/python/paddle/distribution/multinomial.py @@ -0,0 +1,184 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections + +import paddle +from paddle.distribution import categorical, distribution + + +class Multinomial(distribution.Distribution): + r""" + Multinomial distribution parameterized by :attr:`total_count` and + :attr:`probs`. + + In probability theory, the multinomial distribution is a generalization of + the binomial distribution, it models the probability of counts for each side + of a k-sided die rolled n times. When k is 2 and n is 1, the multinomial is + the bernoulli distribution, when k is 2 and n is grater than 1, it is the + binomial distribution, when k is grater than 2 and n is 1, it is the + categorical distribution. + + The probability mass function (PMF) for multinomial is + + .. math:: + + f(x_1, ..., x_k; n, p_1,...,p_k) = \frac{n!}{x_1!...x_k!}p_1^{x_1}...p_k^{x_k} + + where, :math:`n` is number of trials, k is the number of categories, + :math:`p_i` denote probability of a trial falling into each category, + :math:`{\textstyle \sum_{i=1}^{k}p_i=1}, p_i \ge 0`, and :math:`x_i` denote + count of each category. + + Args: + total_count (int): Number of trials. + probs (Tensor): Probability of a trial falling into each category. Last + axis of probs indexes over categories, other axes index over batches. + Probs value should between [0, 1], and sum to 1 along last axis. If + the value over 1, it will be normalized to sum to 1 along the last + axis. + + Examples: + + .. 
code-block:: python + + import paddle + + multinomial = paddle.distribution.Multinomial(10, paddle.to_tensor([0.2, 0.3, 0.5])) + print(multinomial.sample((2, 3))) + # Tensor(shape=[2, 3, 3], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [[[1., 4., 5.], + # [0., 2., 8.], + # [2., 4., 4.]], + + # [[1., 6., 3.], + # [3., 3., 4.], + # [3., 4., 3.]]]) + """ + + def __init__(self, total_count, probs): + if not isinstance(total_count, int) or total_count < 1: + raise ValueError( + 'input parameter total_count must be int type and grater than zero.' + ) + + if probs.dim() < 1: + raise ValueError( + 'probs parameter shoule not be none and over one dimension') + + self.probs = probs / probs.sum(-1, keepdim=True) + self.total_count = total_count + self._categorical = categorical.Categorical( + logits=self._probs_to_logits(probs)) + + super(Multinomial, self).__init__(probs.shape[:-1], probs.shape[-1:]) + + @property + def mean(self): + """mean of multinomial distribuion. + + Returns: + Tensor: mean value. + """ + return self.probs * self.total_count + + @property + def variance(self): + """variance of multinomial distribution. + + Returns: + Tensor: variance value. + """ + return self.total_count * self.probs * (1 - self.probs) + + def prob(self, value): + """probability mass function evaluated at value. + + Args: + value (Tensor): value to be evaluated. + + Returns: + Tensor: probability of value. + """ + return paddle.exp(self.log_prob(value)) + + def log_prob(self, value): + """probability mass function evaluated at value + + Args: + value (Tensor): value to be evaluated. + + Returns: + Tensor: probability of value. + """ + if paddle.is_integer(value): + value = paddle.cast(value, self.probs.dtype) + + logits, value = paddle.broadcast_tensors( + [paddle.log(self.probs), value]) + logits[(value == 0) & (paddle.isinf(logits))] = 0 + + return (paddle.lgamma(value.sum(-1) + 1) - + paddle.lgamma(value + 1).sum(-1) + (value * logits).sum(-1)) + + def sample(self, shape=()): + """draw sample data from multinomial distribution + + Args: + sample_shape (tuple, optional): [description]. Defaults to (). 
+ """ + if not isinstance(shape, collections.Iterable): + raise TypeError('sample shape must be Iterable object.') + + samples = self._categorical.sample([self.total_count, ] + list(shape)) + return paddle.nn.functional.one_hot( + samples, self.probs.shape[-1]).cast(self.probs.dtype).sum(0) + + def entropy(self): + """entropy of multinomial distribution + + Returns: + Tensor: entropy value + """ + n = paddle.full( + shape=[1], fill_value=self.total_count, dtype=self.probs.dtype) + support = paddle.arange( + self.total_count + 1, dtype=self.probs.dtype).reshape((-1, ) + ( + 1, ) * len(self.probs.shape))[1:] + + binomial_pmf = paddle.exp(self._binomial_logpmf(n, support)) + + return ((n * self._categorical.entropy() - paddle.lgamma(n + 1)) + ( + (binomial_pmf * paddle.lgamma(support + 1)).sum([0, -1]))) + + def _binomial_logpmf(self, count, value): + logits = self._probs_to_logits(self.probs, is_binary=True) + + factor_n = paddle.lgamma(count + 1) + factor_k = paddle.lgamma(value + 1) + factor_nmk = paddle.lgamma(count - value + 1) + + norm = (count * _clip_by_zero(logits) + count * + paddle.log1p(paddle.exp(-paddle.abs(logits))) - factor_n) + + return value * logits - factor_k - factor_nmk - norm + + +def _binomial_support(count, dtype): + return paddle.arange(count + 1, dtype=dtype) + + +def _clip_by_zero(x): + # like clip(x, min=0) but grad at 0 is 0.5 + return (x.clip(min=0) + x - x.clip(max=0)) / 2 diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index 4805994b7aa..1637b33723b 100755 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -1132,10 +1132,10 @@ def _append_backward_ops_(block, # So rename here before _addup_repetitive_outputs_. if program._appending_grad_times > 1: for op_desc in grad_op_desc: - if not _is_grad_op_(op): - for name in op_desc.input_arg_names(): - if name in rename_var_map: - op_desc._rename_input(name, rename_var_map[name]) + forward_op_inputs = op.desc.input_arg_names() + for name in op_desc.input_arg_names(): + if name in rename_var_map and name not in forward_op_inputs: + op_desc._rename_input(name, rename_var_map[name]) for name in op_desc.output_arg_names(): if "@GRAD" not in name: continue diff --git a/python/paddle/fluid/incubate/fleet/base/role_maker.py b/python/paddle/fluid/incubate/fleet/base/role_maker.py index 3761a5a4a4a..90387337faa 100644 --- a/python/paddle/fluid/incubate/fleet/base/role_maker.py +++ b/python/paddle/fluid/incubate/fleet/base/role_maker.py @@ -598,6 +598,7 @@ class GeneralRoleMaker(RoleMakerBase): self._hdfs_path = kwargs.get("path", "").rstrip("/") self._init_timeout_seconds = kwargs.get("init_timeout_seconds", 3600) self._run_timeout_seconds = kwargs.get("run_timeout_seconds", 9999999) + self._use_metric = kwargs.get("use_metric", False) ip_port = kwargs.get("http_ip_port", "") self._use_ps_gpu = kwargs.get("use_ps_gpu", False) self._http_ip_port = [] @@ -668,7 +669,7 @@ class GeneralRoleMaker(RoleMakerBase): self._hdfs_name, self._hdfs_ugi) gloo.init() self._node_type_comm = gloo - if self._use_ps_gpu: + if self._use_ps_gpu or self._use_metric: Gloo_strategy = fluid.core.GlooParallelStrategy() Gloo_strategy.rank = current_id Gloo_strategy.rank_num = len(worker_endpoints) diff --git a/python/paddle/fluid/layers/loss.py b/python/paddle/fluid/layers/loss.py index 3db4a894d1a..07ed02181e8 100644 --- a/python/paddle/fluid/layers/loss.py +++ b/python/paddle/fluid/layers/loss.py @@ -1287,7 +1287,7 @@ def softmax_with_cross_entropy(logits, loss = 
helper.create_variable_for_type_inference(dtype=logits.dtype) outputs = {'Softmax': softmax, 'Loss': loss} - if core.is_compiled_with_npu(): + if core.is_compiled_with_npu() or core.is_compiled_with_mlu(): backprop = helper.create_variable_for_type_inference(dtype=logits.dtype) outputs['Backprop'] = backprop helper.append_op( diff --git a/python/paddle/fluid/tests/custom_op/custom_relu_op.cc b/python/paddle/fluid/tests/custom_op/custom_relu_op.cc index c5ec3191c1b..c89990be34c 100644 --- a/python/paddle/fluid/tests/custom_op/custom_relu_op.cc +++ b/python/paddle/fluid/tests/custom_op/custom_relu_op.cc @@ -151,3 +151,63 @@ PD_BUILD_GRAD_OP(custom_relu_no_x_in_backward) .Outputs({paddle::Grad("X")}) .SetKernelFn(PD_KERNEL(ReluBackwardWithoutX)) .SetInferShapeFn(PD_INFER_SHAPE(ReluBackwardWithoutXInferShape)); + +void relu_cpu_forward_out(const paddle::Tensor& x, paddle::Tensor* out) { + PD_DISPATCH_FLOATING_TYPES( + x.type(), "relu_cpu_forward", ([&] { + relu_cpu_forward_kernel( + x.data(), out->mutable_data(x.place()), x.size()); + })); +} + +void relu_cpu_backward_out(const paddle::Tensor& x, + const paddle::Tensor& out, + const paddle::Tensor& grad_out, + paddle::Tensor* grad_x) { + PD_DISPATCH_FLOATING_TYPES(out.type(), "relu_cpu_backward", ([&] { + relu_cpu_backward_kernel( + grad_out.data(), + out.data(), + grad_x->mutable_data(x.place()), + out.size()); + })); +} + +void relu_cuda_forward_out(const paddle::Tensor& x, paddle::Tensor* out); +void relu_cuda_backward_out(const paddle::Tensor& x, + const paddle::Tensor& out, + const paddle::Tensor& grad_out, + paddle::Tensor* grad_x); + +void ReluForwardOut(const paddle::Tensor& x, paddle::Tensor* out) { + if (x.place() == paddle::PlaceType::kCPU) { + return relu_cpu_forward_out(x, out); + } else if (x.place() == paddle::PlaceType::kGPU) { + return relu_cuda_forward_out(x, out); + } else { + PD_THROW("Not implemented."); + } +} + +void ReluBackwardOut(const paddle::Tensor& x, + const paddle::Tensor& out, + const paddle::Tensor& grad_out, + paddle::Tensor* grad_x) { + if (x.place() == paddle::PlaceType::kCPU) { + return relu_cpu_backward_out(x, out, grad_out, grad_x); + } else if (x.place() == paddle::PlaceType::kGPU) { + return relu_cuda_backward_out(x, out, grad_out, grad_x); + } else { + PD_THROW("Not implemented."); + } +} + +PD_BUILD_OP(custom_relu_out) + .Inputs({"X"}) + .Outputs({"Out"}) + .SetKernelFn(PD_KERNEL(ReluForwardOut)); + +PD_BUILD_GRAD_OP(custom_relu_out) + .Inputs({"X", "Out", paddle::Grad("Out")}) + .Outputs({paddle::Grad("X")}) + .SetKernelFn(PD_KERNEL(ReluBackwardOut)); diff --git a/python/paddle/fluid/tests/custom_op/custom_relu_op.cu b/python/paddle/fluid/tests/custom_op/custom_relu_op.cu index 637deeb9056..33c5ede299b 100644 --- a/python/paddle/fluid/tests/custom_op/custom_relu_op.cu +++ b/python/paddle/fluid/tests/custom_op/custom_relu_op.cu @@ -89,3 +89,31 @@ std::vector relu_cuda_backward_without_x( return {grad_x}; } + +void relu_cuda_forward_out(const paddle::Tensor& x, paddle::Tensor* out) { + int numel = x.size(); + int block = 512; + int grid = (numel + block - 1) / block; + PD_DISPATCH_FLOATING_AND_HALF_TYPES( + x.type(), "relu_cuda_forward_kernel", ([&] { + relu_cuda_forward_kernel<<>>( + x.data(), out->mutable_data(x.place()), numel); + })); +} + +void relu_cuda_backward_out(const paddle::Tensor& x, + const paddle::Tensor& out, + const paddle::Tensor& grad_out, + paddle::Tensor* grad_x) { + int numel = out.size(); + int block = 512; + int grid = (numel + block - 1) / block; + 
PD_DISPATCH_FLOATING_AND_HALF_TYPES( + out.type(), "relu_cuda_backward_kernel", ([&] { + relu_cuda_backward_kernel<<>>( + grad_out.data(), + out.data(), + grad_x->mutable_data(x.place()), + numel); + })); +} diff --git a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py index 16458841f44..407eb342ba9 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py @@ -50,7 +50,8 @@ class TestJITLoad(unittest.TestCase): def setUp(self): self.custom_ops = [ custom_module.custom_relu, custom_module.custom_relu_dup, - custom_module.custom_relu_no_x_in_backward + custom_module.custom_relu_no_x_in_backward, + custom_module.custom_relu_out ] self.dtypes = ['float32', 'float64'] if paddle.is_compiled_with_cuda(): diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 2e35277d70c..511aa7e06c4 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -960,7 +960,7 @@ else() set_tests_properties(test_layer_norm_op PROPERTIES TIMEOUT 150) set_tests_properties(test_pool3d_op PROPERTIES TIMEOUT 150) endif() -set_tests_properties(test_imperative_selected_rows_to_lod_tensor PROPERTIES TIMEOUT 120) +set_tests_properties(test_imperative_selected_rows_to_lod_tensor PROPERTIES TIMEOUT 200) set_tests_properties(test_index_select_op PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_ssa_graph_inference_feed_partial_data PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_executor_crf PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/unittests/autograd/test_jacobian_static.py b/python/paddle/fluid/tests/unittests/autograd/test_autograd_static.py similarity index 53% rename from python/paddle/fluid/tests/unittests/autograd/test_jacobian_static.py rename to python/paddle/fluid/tests/unittests/autograd/test_autograd_static.py index 28fc6932b07..60dc9d06b8a 100644 --- a/python/paddle/fluid/tests/unittests/autograd/test_jacobian_static.py +++ b/python/paddle/fluid/tests/unittests/autograd/test_autograd_static.py @@ -66,107 +66,38 @@ def approx_jacobian(f, xs, dtype, eps=1e-5, batch=False): ds = eps * np.eye(xdim, dtype=dtype) - fprimes_by_x = [(0.5 / eps) * (_f(x + d) - _f(x - d)) for d in ds] + fprimes_by_x = [(0.5 * (_f(x + d) - _f(x - d)) / eps) for d in ds] fprimes_by_y = np.stack(fprimes_by_x, axis=-1) return np.transpose(fprimes_by_y, [1, 0, 2]) if batch else fprimes_by_y -class TestJacobianFloat32(unittest.TestCase): - @classmethod - def setUpClass(self): - paddle.enable_static() - if fluid.core.is_compiled_with_cuda(): - self.place = fluid.CUDAPlace(0) - else: - self.place = fluid.CPUPlace() - self.np_dtype = np.float32 - self.A = np.array([[1., 2.]]).astype('float32') - self.B = np.array([[1., 2.], [2., 1.]]).astype('float32') - self.C = np.array([[2., 2.], [2., 1.]]).astype('float32') - self.D = np.array( - [[[2., 2.], [2., 1.]], [[1., 2.], [2., 1.]]]).astype('float32') - self.E = np.array( - [[[3., 4.], [2., 3.]], [[2., 1.], [1., 3.]]]).astype('float32') - self.eps = 1e-4 - self.rtol = 1e-2 - self.atol = 1e-2 - - def run_test(self, pd_f, np_f, inps, dtype, batch=False): - def make_tensors(inps): - if isinstance(inps, list): - xs = [ - paddle.static.data( - f'x{i}', inp.shape, dtype=inp.dtype) - for i, inp in enumerate(inps) - ] - else: - xs = paddle.static.data( - name='x', shape=inps.shape, 
dtype=inps.dtype) - return xs - - main = fluid.Program() - startup = fluid.Program() - with fluid.program_guard(main, startup): - xs = make_tensors(inps) - JJ = paddle.autograd.functional.Jacobian(pd_f, xs, batch=batch) - nrow, ncol = JJ.shape() - full_jacobian = JJ[:] - exe = fluid.Executor(self.place) - exe.run(startup) - if isinstance(inps, list): - feeds = {f'x{i}': x for i, x in enumerate(inps)} - else: - feeds = {'x': inps} - pd_jacobians = exe.run(main, feed=feeds, fetch_list=[full_jacobian])[0] - np_jacobians = approx_jacobian(np_f, inps, dtype, self.eps, batch=batch) - self.assertTrue( - np.allclose(pd_jacobians, np_jacobians, self.rtol, self.atol)) - - def test_square(self): - def pd_f(x): - return paddle.multiply(x, x) - - def np_f(x): - return np.multiply(x, x) - - self.run_test(pd_f, np_f, self.A, np.dtype('float32')) - - def test_mul(self): - def pd_f(xs): - x, y = xs - return paddle.multiply(x, y) - - def np_f(xs): - x, y = xs - return np.multiply(x, y) - - self.run_test(pd_f, np_f, [self.B, self.C], np.dtype('float32')) - - def test_matmul(self): - def pd_f(xs): - x, y = xs - return paddle.matmul(x, y) - - def np_f(xs): - x, y = xs - return np.matmul(x, y) +def make_tensors(inps): + if isinstance(inps, list): + xs = [ + paddle.static.data( + f'x{i}', inp.shape, dtype=inp.dtype) + for i, inp in enumerate(inps) + ] + else: + xs = paddle.static.data(name='x', shape=inps.shape, dtype=inps.dtype) + return xs - self.run_test(pd_f, np_f, [self.B, self.C], np.dtype('float32')) - def test_batch_matmul(self): - def pd_f(xs): - x, y = xs - return paddle.matmul(x, y) +all_data_shapes = { + 'A': [[1., 2.]], + 'B': [[1., 2.], [2., 1.]], + 'C': [[2., 2.], [2., 1.]], + 'D': [[[2., 2.], [2., 1.]], [[1., 2.], [2., 1.]]], + 'E': [[[3., 4.], [2., 3.]], [[2., 1.], [1., 3.]]], +} - def np_f(xs): - x, y = xs - return np.matmul(x, y) - self.run_test( - pd_f, np_f, [self.D, self.E], np.dtype('float32'), batch=True) +def prepare_data(test, input_shapes, dtype): + for name, shape in input_shapes.items(): + setattr(test, name, np.array(shape, dtype=dtype)) -class TestJacobianFloat64(unittest.TestCase): +class TestJacobianFloat32(unittest.TestCase): @classmethod def setUpClass(self): paddle.enable_static() @@ -174,31 +105,13 @@ class TestJacobianFloat64(unittest.TestCase): self.place = fluid.CUDAPlace(0) else: self.place = fluid.CPUPlace() - self.np_dtype = np.float32 - self.A = np.array([[1., 2.]]).astype('float64') - self.B = np.array([[1., 2.], [2., 1.]]).astype('float64') - self.C = np.array([[2., 2.], [2., 1.]]).astype('float64') - self.D = np.array( - [[[2., 2.], [2., 1.]], [[1., 2.], [2., 1.]]]).astype('float64') - self.E = np.array( - [[[3., 4.], [2., 3.]], [[2., 1.], [1., 3.]]]).astype('float64') - self.eps = 1e-7 - self.rtol = 1e-6 - self.atol = 1e-6 - - def run_test_by_fullmatrix(self, pd_f, np_f, inps, dtype, batch=False): - def make_tensors(inps): - if isinstance(inps, list): - xs = [ - paddle.static.data( - f'x{i}', inp.shape, dtype=inp.dtype) - for i, inp in enumerate(inps) - ] - else: - xs = paddle.static.data( - name='x', shape=inps.shape, dtype=inps.dtype) - return xs + self.dtype = 'float32' + prepare_data(self, all_data_shapes, self.dtype) + self.eps = 1e-4 + self.rtol = 1e-2 + self.atol = 1e-2 + def run_test_by_fullmatrix(self, pd_f, np_f, inps, batch=False): main = fluid.Program() startup = fluid.Program() with fluid.program_guard(main, startup): @@ -213,23 +126,12 @@ class TestJacobianFloat64(unittest.TestCase): else: feeds = {'x': inps} pd_jacobians = exe.run(main, feed=feeds, 
fetch_list=[full_jacobian])[0] - np_jacobians = approx_jacobian(np_f, inps, dtype, self.eps, batch=batch) + np_jacobians = approx_jacobian( + np_f, inps, self.dtype, self.eps, batch=batch) self.assertTrue( np.allclose(pd_jacobians, np_jacobians, self.rtol, self.atol)) - def run_test_by_rows(self, pd_f, np_f, inps, dtype, batch=False): - def make_tensors(inps): - if isinstance(inps, list): - xs = [ - paddle.static.data( - f'x{i}', inp.shape, dtype=inp.dtype) - for i, inp in enumerate(inps) - ] - else: - xs = paddle.static.data( - name='x', shape=inps.shape, dtype=inps.dtype) - return xs - + def run_test_by_rows(self, pd_f, np_f, inps, batch=False): main = fluid.Program() startup = fluid.Program() with fluid.program_guard(main, startup): @@ -244,24 +146,12 @@ class TestJacobianFloat64(unittest.TestCase): else: feeds = {'x': inps} pd_jac = exe.run(main, feed=feeds, fetch_list=[rows]) - np_jac = approx_jacobian(np_f, inps, dtype, self.eps, batch=batch) + np_jac = approx_jacobian(np_f, inps, self.dtype, self.eps, batch=batch) for i in range(nrow): self.assertTrue( np.allclose(pd_jac[i], np_jac[i], self.rtol, self.atol)) - def run_test_by_entries(self, pd_f, np_f, inps, dtype, batch=False): - def make_tensors(inps): - if isinstance(inps, list): - xs = [ - paddle.static.data( - f'x{i}', inp.shape, dtype=inp.dtype) - for i, inp in enumerate(inps) - ] - else: - xs = paddle.static.data( - name='x', shape=inps.shape, dtype=inps.dtype) - return xs - + def run_test_by_entries(self, pd_f, np_f, inps, batch=False): main = fluid.Program() startup = fluid.Program() with fluid.program_guard(main, startup): @@ -276,7 +166,7 @@ class TestJacobianFloat64(unittest.TestCase): else: feeds = {'x': inps} pd_entries = exe.run(main, feed=feeds, fetch_list=[entries]) - np_jac = approx_jacobian(np_f, inps, dtype, self.eps, batch=batch) + np_jac = approx_jacobian(np_f, inps, self.dtype, self.eps, batch=batch) np_entries = [ np_jac[i, ..., j] for i in range(nrow) for j in range(ncol) ] @@ -291,9 +181,9 @@ class TestJacobianFloat64(unittest.TestCase): def np_f(x): return np.multiply(x, x) - self.run_test_by_fullmatrix(pd_f, np_f, self.A, np.dtype('float64')) - self.run_test_by_rows(pd_f, np_f, self.A, np.dtype('float64')) - self.run_test_by_entries(pd_f, np_f, self.A, np.dtype('float64')) + self.run_test_by_fullmatrix(pd_f, np_f, self.A) + self.run_test_by_rows(pd_f, np_f, self.A) + self.run_test_by_entries(pd_f, np_f, self.A) def test_mul(self): def pd_f(xs): @@ -304,11 +194,12 @@ class TestJacobianFloat64(unittest.TestCase): x, y = xs return np.multiply(x, y) - self.run_test_by_fullmatrix(pd_f, np_f, [self.B, self.C], - np.dtype('float64')) - self.run_test_by_rows(pd_f, np_f, [self.B, self.C], np.dtype('float64')) - self.run_test_by_entries(pd_f, np_f, [self.B, self.C], - np.dtype('float64')) + self.run_test_by_fullmatrix( + pd_f, + np_f, + [self.B, self.C], ) + self.run_test_by_rows(pd_f, np_f, [self.B, self.C]) + self.run_test_by_entries(pd_f, np_f, [self.B, self.C]) def test_matmul(self): def pd_f(xs): @@ -319,11 +210,9 @@ class TestJacobianFloat64(unittest.TestCase): x, y = xs return np.matmul(x, y) - self.run_test_by_fullmatrix(pd_f, np_f, [self.B, self.C], - np.dtype('float64')) - self.run_test_by_rows(pd_f, np_f, [self.B, self.C], np.dtype('float64')) - self.run_test_by_entries(pd_f, np_f, [self.B, self.C], - np.dtype('float64')) + self.run_test_by_fullmatrix(pd_f, np_f, [self.B, self.C]) + self.run_test_by_rows(pd_f, np_f, [self.B, self.C]) + self.run_test_by_entries(pd_f, np_f, [self.B, self.C]) def 
test_batch_matmul(self): def pd_f(xs): @@ -334,12 +223,85 @@ class TestJacobianFloat64(unittest.TestCase): x, y = xs return np.matmul(x, y) - self.run_test_by_fullmatrix( - pd_f, np_f, [self.D, self.E], np.dtype('float64'), batch=True) - self.run_test_by_rows( - pd_f, np_f, [self.D, self.E], np.dtype('float64'), batch=True) - self.run_test_by_entries( - pd_f, np_f, [self.D, self.E], np.dtype('float64'), batch=True) + self.run_test_by_fullmatrix(pd_f, np_f, [self.D, self.E], batch=True) + self.run_test_by_rows(pd_f, np_f, [self.D, self.E], batch=True) + self.run_test_by_entries(pd_f, np_f, [self.D, self.E], batch=True) + + +class TestJacobianFloat64(TestJacobianFloat32): + @classmethod + def setUpClass(self): + paddle.enable_static() + if fluid.core.is_compiled_with_cuda(): + self.place = fluid.CUDAPlace(0) + else: + self.place = fluid.CPUPlace() + self.dtype = 'float64' + prepare_data(self, all_data_shapes, self.dtype) + self.eps = 1e-7 + self.rtol = 1e-6 + self.atol = 1e-6 + + +class TestHessianFloat64(unittest.TestCase): + @classmethod + def setUpClass(self): + paddle.enable_static() + if fluid.core.is_compiled_with_cuda(): + self.place = fluid.CUDAPlace(0) + else: + self.place = fluid.CPUPlace() + self.dtype = 'float64' + prepare_data(self, all_data_shapes, self.dtype) + self.eps = 1e-7 + self.rtol = 1e-6 + self.atol = 1e-6 + + def run_test_by_fullmatrix(self, pd_f, inps, np_hess, batch=False): + main = fluid.Program() + startup = fluid.Program() + with fluid.program_guard(main, startup): + xs = make_tensors(inps) + HH = paddle.autograd.functional.Hessian(pd_f, xs, batch=batch) + nrow, ncol = HH.shape() + full_hessian = HH[:] + exe = fluid.Executor(self.place) + exe.run(startup) + if isinstance(inps, list): + feeds = {f'x{i}': x for i, x in enumerate(inps)} + else: + feeds = {'x': inps} + pd_hess = exe.run(main, feed=feeds, fetch_list=[full_hessian])[0] + self.assertTrue(np.allclose(pd_hess, np_hess, self.rtol, self.atol)) + + def test_square(self): + def pd_f(x): + """Input is a square matrix.""" + return paddle.matmul(x, x.T) + + def np_hess(x): + dim = x.shape[0] + f_xx_upperleft = 2 * np.eye(dim, dtype=self.dtype) + f_xx = np.zeros([dim * dim, dim * dim], dtype=self.dtype) + f_xx[:dim, :dim] = f_xx_upperleft + return f_xx + + self.run_test_by_fullmatrix(pd_f, self.B, np_hess(self.B)) + + def test_batch_square(self): + def pd_f(x): + """Input is a square matrix.""" + return paddle.matmul(x, paddle.transpose(x, [0, 2, 1])) + + def np_hess(x): + bat, dim, _ = x.shape + f_xx_upperleft = 2 * np.eye(dim, dtype=self.dtype) + f_xx = np.zeros([bat, dim * dim, dim * dim], dtype=self.dtype) + f_xx[..., :dim, :dim] = f_xx_upperleft + return f_xx + + self.run_test_by_fullmatrix( + pd_f, self.E, np_hess(self.E), batch=True) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_relu_depthwise_conv_pass.py b/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_relu_depthwise_conv_pass.py new file mode 100644 index 00000000000..c07744c882e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_relu_depthwise_conv_pass.py @@ -0,0 +1,92 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import paddle +import paddle.distributed.fleet as fleet +import numpy as np +import paddle.nn as nn +from paddle.distributed.passes import new_pass, PassManager +import unittest +from dist_pass_test_base import DistPassTestBase + + +class ReluDepthwiseConvNet(nn.Layer): + def __init__(self): + super(ReluDepthwiseConvNet, self).__init__() + + self.conv1 = nn.Conv2D(3, 9, (3, 3)) + self.relu = nn.ReLU() + self.conv2 = nn.Conv2D(9, 27, (3, 3), groups=9) + + def forward(self, x): + out = self.conv1(x) + out = self.relu(out) + out = self.conv2(out) + out = paddle.flatten(out, 1) + return out + + +class TestFuseReluDepthwiseConvPass(DistPassTestBase): + def init(self): + self.atol = 1e-4 + self.rtol = 1e-4 + + def get_model(self, place, batch_size=32, image_shape=[3, 224, 224]): + image = paddle.static.data( + shape=[batch_size] + image_shape, dtype='float32', name='image') + + model = ReluDepthwiseConvNet() + pred_out = model(image) + loss = paddle.mean(pred_out) + optimizer = paddle.optimizer.Adam(learning_rate=1e-3) + + dist_strategy = fleet.DistributedStrategy() + dist_strategy.fuse_all_reduce_ops = False + dist_strategy.without_graph_optimization = True + fleet.init(is_collective=True, strategy=dist_strategy) + optimizer = fleet.distributed_optimizer(optimizer) + optimizer.minimize(loss) + + rank = paddle.distributed.get_rank() + + def reader(): + seed = int(os.environ.get("SEED", 0)) + np.random.seed(seed + rank) + for _ in range(10): + image_np = np.random.random(size=image.shape).astype('float32') + yield image_np, + + main_program = paddle.static.default_main_program() + startup_program = paddle.static.default_startup_program() + return main_program, startup_program, [image], [loss], reader + + def apply_passes(self, main_prog, startup_prog): + pass_manager = PassManager([new_pass("fuse_relu_depthwise_conv")]) + pass_manager.apply([main_prog], [startup_prog]) + print(pass_manager.names) + + op_type = [] + for op in main_prog.global_block().ops: + if op.type == "depthwise_conv2d": + self.assertTrue(op.desc.attr("fuse_relu_before_depthwise_conv")) + op_type.append(op.type) + self.assertTrue("depthwise_conv2d" in op_type) + + def test_relu_depthwise_conv(self): + self.check_main() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/distribution/test_distribution_categorical.py b/python/paddle/fluid/tests/unittests/distribution/test_distribution_categorical.py index 5f3f5e2a930..d92ec52edae 100644 --- a/python/paddle/fluid/tests/unittests/distribution/test_distribution_categorical.py +++ b/python/paddle/fluid/tests/unittests/distribution/test_distribution_categorical.py @@ -33,7 +33,7 @@ class CategoricalNumpy(DistributionNumpy): e_logits = np.exp(logits) z = np.sum(e_logits, axis=-1, keepdims=True) prob = e_logits / z - return -1. * np.sum(prob * (logits - np.log(z)), axis=-1, keepdims=True) + return -1. 
* np.sum(prob * (logits - np.log(z)), axis=-1) def kl_divergence(self, other): logits = self.logits - np.max(self.logits, axis=-1, keepdims=True) diff --git a/python/paddle/fluid/tests/unittests/distribution/test_distribution_multinomial.py b/python/paddle/fluid/tests/unittests/distribution/test_distribution_multinomial.py new file mode 100644 index 00000000000..bff723dfa29 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/distribution/test_distribution_multinomial.py @@ -0,0 +1,139 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +import paddle +import scipy.stats + +import config + + +@config.place(config.DEVICES) +@config.parameterize((config.TEST_CASE_NAME, 'total_count', 'probs'), [ + ('one-dim', 10, config.xrand((3, ))), + ('multi-dim', 9, config.xrand((10, 20))), + ('prob-sum-one', 10, np.array([0.5, 0.2, 0.3])), + ('prob-sum-non-one', 10, np.array([2., 3., 5.])), +]) +class TestMultinomial(unittest.TestCase): + def setUp(self): + self._dist = paddle.distribution.Multinomial( + total_count=self.total_count, probs=paddle.to_tensor(self.probs)) + + def test_mean(self): + mean = self._dist.mean + self.assertEqual(mean.numpy().dtype, self.probs.dtype) + np.testing.assert_allclose( + mean, + self._np_mean(), + rtol=config.RTOL.get(str(self.probs.dtype)), + atol=config.ATOL.get(str(self.probs.dtype))) + + def test_variance(self): + var = self._dist.variance + self.assertEqual(var.numpy().dtype, self.probs.dtype) + np.testing.assert_allclose( + var, + self._np_variance(), + rtol=config.RTOL.get(str(self.probs.dtype)), + atol=config.ATOL.get(str(self.probs.dtype))) + + def test_entropy(self): + entropy = self._dist.entropy() + self.assertEqual(entropy.numpy().dtype, self.probs.dtype) + np.testing.assert_allclose( + entropy, + self._np_entropy(), + rtol=config.RTOL.get(str(self.probs.dtype)), + atol=config.ATOL.get(str(self.probs.dtype))) + + def test_sample(self): + sample_shape = () + samples = self._dist.sample(sample_shape) + self.assertEqual(samples.numpy().dtype, self.probs.dtype) + self.assertEqual( + tuple(samples.shape), + sample_shape + self._dist.batch_shape + self._dist.event_shape) + + sample_shape = (6, ) + samples = self._dist.sample(sample_shape) + self.assertEqual(samples.numpy().dtype, self.probs.dtype) + self.assertEqual( + tuple(samples.shape), + sample_shape + self._dist.batch_shape + self._dist.event_shape) + self.assertTrue( + np.all(samples.sum(-1).numpy() == self._dist.total_count)) + + sample_shape = (5000, ) + samples = self._dist.sample(sample_shape) + sample_mean = samples.mean(axis=0) + # Tolerance value 0.2 is empirical value which is consistent with + # TensorFlow + np.testing.assert_allclose( + sample_mean, self._dist.mean, atol=0, rtol=0.20) + + def _np_variance(self): + probs = self.probs / self.probs.sum(-1, keepdims=True) + return self.total_count * probs * (1 - probs) + + def _np_mean(self): + probs = self.probs / self.probs.sum(-1, 
keepdims=True) + return self.total_count * probs + + def _np_entropy(self): + probs = self.probs / self.probs.sum(-1, keepdims=True) + return scipy.stats.multinomial.entropy(self.total_count, probs) + + +@config.place(config.DEVICES) +@config.parameterize( + (config.TEST_CASE_NAME, 'total_count', 'probs', 'value'), + [ + ('value-float', 10, np.array([0.2, 0.3, 0.5]), np.array([2., 3., 5.])), + ('value-int', 10, np.array([0.2, 0.3, 0.5]), np.array([2, 3, 5])), + ('value-multi-dim', 10, np.array([[0.3, 0.7], [0.5, 0.5]]), + np.array([[4., 6], [8, 2]])), + # ('value-sum-non-n', 10, np.array([0.5, 0.2, 0.3]), np.array([4,5,2])), + ]) +class TestMultinomialPmf(unittest.TestCase): + def setUp(self): + self._dist = paddle.distribution.Multinomial( + total_count=self.total_count, probs=paddle.to_tensor(self.probs)) + + def test_prob(self): + np.testing.assert_allclose( + self._dist.prob(paddle.to_tensor(self.value)), + scipy.stats.multinomial.pmf(self.value, self.total_count, + self.probs), + rtol=config.RTOL.get(str(self.probs.dtype)), + atol=config.ATOL.get(str(self.probs.dtype))) + + +@config.place(config.DEVICES) +@config.parameterize((config.TEST_CASE_NAME, 'total_count', 'probs'), [ + ('total_count_le_one', 0, np.array([0.3, 0.7])), + ('total_count_float', np.array([0.3, 0.7])), + ('probs_zero_dim', np.array(0)), +]) +class TestMultinomialException(unittest.TestCase): + def TestInit(self): + with self.assertRaises(ValueError): + paddle.distribution.Multinomial(self.total_count, + paddle.to_tensor(self.probs)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/distribution/test_distribution_multinomial_static.py b/python/paddle/fluid/tests/unittests/distribution/test_distribution_multinomial_static.py new file mode 100644 index 00000000000..2eb5b9769df --- /dev/null +++ b/python/paddle/fluid/tests/unittests/distribution/test_distribution_multinomial_static.py @@ -0,0 +1,168 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np +import paddle +import scipy.stats + +import config + +paddle.enable_static() + + +@config.place(config.DEVICES) +@config.parameterize((config.TEST_CASE_NAME, 'total_count', 'probs'), [ + ('one-dim', 5, config.xrand((3, ))), + ('multi-dim', 9, config.xrand((2, 3))), + ('prob-sum-one', 5, np.array([0.5, 0.2, 0.3])), + ('prob-sum-non-one', 5, np.array([2., 3., 5.])), +]) +class TestMultinomial(unittest.TestCase): + def setUp(self): + startup_program = paddle.static.Program() + main_program = paddle.static.Program() + executor = paddle.static.Executor(self.place) + with paddle.static.program_guard(main_program, startup_program): + probs = paddle.static.data('probs', self.probs.shape, + self.probs.dtype) + dist = paddle.distribution.Multinomial(self.total_count, probs) + mean = dist.mean + var = dist.variance + entropy = dist.entropy() + mini_samples = dist.sample(shape=(6, )) + large_samples = dist.sample(shape=(5000, )) + fetch_list = [mean, var, entropy, mini_samples, large_samples] + feed = {'probs': self.probs} + + executor.run(startup_program) + [ + self.mean, self.var, self.entropy, self.mini_samples, + self.large_samples + ] = executor.run(main_program, feed=feed, fetch_list=fetch_list) + + def test_mean(self): + self.assertEqual(str(self.mean.dtype).split('.')[-1], self.probs.dtype) + np.testing.assert_allclose( + self.mean, + self._np_mean(), + rtol=config.RTOL.get(str(self.probs.dtype)), + atol=config.ATOL.get(str(self.probs.dtype))) + + def test_variance(self): + self.assertEqual(str(self.var.dtype).split('.')[-1], self.probs.dtype) + np.testing.assert_allclose( + self.var, + self._np_variance(), + rtol=config.RTOL.get(str(self.probs.dtype)), + atol=config.ATOL.get(str(self.probs.dtype))) + + def test_entropy(self): + self.assertEqual( + str(self.entropy.dtype).split('.')[-1], self.probs.dtype) + np.testing.assert_allclose( + self.entropy, + self._np_entropy(), + rtol=config.RTOL.get(str(self.probs.dtype)), + atol=config.ATOL.get(str(self.probs.dtype))) + + def test_sample(self): + self.assertEqual( + str(self.mini_samples.dtype).split('.')[-1], self.probs.dtype) + self.assertTrue(np.all(self.mini_samples.sum(-1) == self.total_count)) + + sample_mean = self.large_samples.mean(axis=0) + np.testing.assert_allclose(sample_mean, self.mean, atol=0, rtol=0.20) + + def _np_variance(self): + probs = self.probs / self.probs.sum(-1, keepdims=True) + return self.total_count * probs * (1 - probs) + + def _np_mean(self): + probs = self.probs / self.probs.sum(-1, keepdims=True) + return self.total_count * probs + + def _np_entropy(self): + probs = self.probs / self.probs.sum(-1, keepdims=True) + return scipy.stats.multinomial.entropy(self.total_count, probs) + + +@config.place(config.DEVICES) +@config.parameterize( + (config.TEST_CASE_NAME, 'total_count', 'probs', 'value'), + [ + ('value-float', 5, np.array([0.2, 0.3, 0.5]), np.array([1., 1., 3.])), + ('value-int', 5, np.array([0.2, 0.3, 0.5]), np.array([2, 2, 1])), + ('value-multi-dim', 5, np.array([[0.3, 0.7], [0.5, 0.5]]), + np.array([[1., 4.], [2., 3.]])), + # ('value-sum-non-n', 10, np.array([0.5, 0.2, 0.3]), np.array([4,5,2])), + ]) +class TestMultinomialPmf(unittest.TestCase): + def setUp(self): + startup_program = paddle.static.Program() + main_program = paddle.static.Program() + executor = paddle.static.Executor(self.place) + + with paddle.static.program_guard(main_program, startup_program): + probs = paddle.static.data('probs', self.probs.shape, + self.probs.dtype) + value = 
paddle.static.data('value', self.value.shape, + self.value.dtype) + dist = paddle.distribution.Multinomial(self.total_count, probs) + pmf = dist.prob(value) + feed = {'probs': self.probs, 'value': self.value} + fetch_list = [pmf] + + executor.run(startup_program) + [self.pmf] = executor.run(main_program, + feed=feed, + fetch_list=fetch_list) + + def test_prob(self): + np.testing.assert_allclose( + self.pmf, + scipy.stats.multinomial.pmf(self.value, self.total_count, + self.probs), + rtol=config.RTOL.get(str(self.probs.dtype)), + atol=config.ATOL.get(str(self.probs.dtype))) + + +@config.place(config.DEVICES) +@config.parameterize((config.TEST_CASE_NAME, 'total_count', 'probs'), [ + ('total_count_le_one', 0, np.array([0.3, 0.7])), + ('total_count_float', np.array([0.3, 0.7])), + ('probs_zero_dim', np.array(0)), +]) +class TestMultinomialException(unittest.TestCase): + def setUp(self): + startup_program = paddle.static.Program() + self.main_program = paddle.static.Program() + self.executor = paddle.static.Executor(self.place) + + with paddle.static.program_guard(main_program, startup_program): + probs = paddle.static.data('probs', self.probs.shape, + self.probs.dtype) + dist = paddle.distribution.Multinomial(self.total_count, probs) + self.feed = {'probs': self.probs} + + executor.run(startup_program) + + def TestInit(self): + with self.assertRaises(ValueError): + self.executor.run(self.main_program, feed=self.feed, fetch=[]) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_optimizer_stage2.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_optimizer_stage2.py index 6a9005b8ce6..705831d50f1 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_sharding_optimizer_stage2.py +++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_optimizer_stage2.py @@ -124,8 +124,17 @@ def train_mlp(): avg_loss.backward() oss_optimizer.step() - # oss_optimizer clear cache - oss_optimizer._clear_cache() + # oss_optimizer clear cache + oss_optimizer._clear_cache() + + # check optimizer.minimize() error + try: + oss_optimizer.minimize() + except: + print( + "====== Find sharding_stage2_optimizer.minimize() error ======" + ) + return if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py index ddd31bc057f..9b218bf1302 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py +++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py @@ -83,7 +83,8 @@ def train_mlp(model, accumulate_grad=False, batch_size=100, opt_group=False, - recompute=False): + recompute=False, + test_minimize=False): group = paddle.distributed.new_group([0, 1]) if opt_group: optimizer = optimizer_setting( @@ -113,6 +114,15 @@ def train_mlp(model, accumulate_grads=batch_size == 20, sync_comm=recompute) + # check optimizer.minimize() error + if test_minimize: + try: + optimizer.minimize() + except: + print( + "====== Find sharding_stage3_optimizer.minimize() error ======") + return + train_reader = paddle.batch( reader_decorator(), batch_size=batch_size, drop_last=True) @@ -160,8 +170,8 @@ def train_mlp(model, def test_stage2_stage3(): - mlp, mlp1, mlp2, mlp3, mlp4, mlp5, mlp6, mlp7, mlp8 = MLP(), MLP(), MLP( - ), MLP(), MLP(), MLP(), MLP(), MLP(), MLP() + mlp, mlp1, mlp2, mlp3, mlp4, mlp5, mlp6, mlp7, mlp8, mlp9 = MLP(), MLP( + ), MLP(), MLP(), MLP(), MLP(), MLP(), MLP(), MLP(), MLP() state_dict = mlp.state_dict() 
mlp1.set_state_dict(state_dict) mlp2.set_state_dict(state_dict) @@ -171,6 +181,8 @@ def test_stage2_stage3(): mlp6.set_state_dict(state_dict) mlp7.set_state_dict(state_dict) mlp8.set_state_dict(state_dict) + mlp9.set_state_dict(state_dict) + # fp32 stage2_params = train_mlp( mlp1, sharding_stage=2, use_pure_fp16=False, opt_group=False) @@ -229,7 +241,14 @@ def test_stage2_stage3(): for i in range(len(stage3_params)): np.testing.assert_allclose( stage3_params[i].numpy(), stage3_params_re[i].numpy(), rtol=1e-6) - return + + # check optimizer.minimize() error + train_mlp( + mlp9, + sharding_stage=3, + use_pure_fp16=False, + opt_group=False, + test_minimize=True) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/mlu/test_softmax_with_cross_entropy_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_softmax_with_cross_entropy_op_mlu.py new file mode 100644 index 00000000000..e626b6a0937 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_softmax_with_cross_entropy_op_mlu.py @@ -0,0 +1,161 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid +from test_softmax_op import stable_softmax +from test_softmax_with_cross_entropy_op import cross_entropy + +paddle.enable_static() +SEED = 2021 + + +class TestSoftmaxWithCrossEntropyOp(OpTest): + def set_mlu(self): + self.__class__.use_mlu = True + + def init_dtype(self): + self.dtype = np.float32 + + def initParams(self): + self.set_mlu() + self.op_type = "softmax_with_cross_entropy" + self.numeric_stable_mode = False + self.place = paddle.device.MLUPlace(0) + self.soft_label = False + self.init_dtype() + self.axis = -1 + self.ignore_index = -1 + self.shape = [41, 37] + np.random.seed(SEED) + + def setUp(self): + self.initParams() + + logits = getattr( + self, "logits", + np.random.uniform(0.1, 1.0, self.shape).astype(self.dtype)) + softmax = np.apply_along_axis(stable_softmax, self.axis, logits) + + if self.soft_label: + labels = np.random.uniform(0.1, 1.0, self.shape).astype(self.dtype) + labels /= np.sum(labels, axis=self.axis, keepdims=True) + else: + axis_dim = self.shape[self.axis] + self.shape[self.axis] = 1 + labels = np.random.randint(0, axis_dim, self.shape, dtype="int64") + + loss = cross_entropy(softmax, labels, self.soft_label, self.axis, + self.ignore_index) + + one_hot_label = np.eye(axis_dim)[labels.reshape(-1)] + + self.inputs = {"Logits": logits, "Label": labels} + self.outputs = { + "Backprop": (softmax - one_hot_label).astype(self.dtype), + "Softmax": softmax.astype(self.dtype), + "Loss": loss.astype(self.dtype) + } + self.attrs = { + "numeric_stable_mode": self.numeric_stable_mode, + "soft_label": self.soft_label, + "ignore_index": self.ignore_index, + } + + if self.axis != -1: + self.attrs['axis'] = self.axis + + def test_check_output(self): + 
self.check_output_with_place(self.place) + + def test_check_grad(self): + if self.dtype == np.float16: + return + # fp32 has low precision, cpu and mlu both need to relax the max_relative_error if using fp32 + self.check_grad_with_place( + self.place, ['Logits'], + 'Loss', + numeric_grad_delta=0.001, + max_relative_error=0.5) + + +class TestPowNet(unittest.TestCase): + def _test(self, run_mlu=True): + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = SEED + startup_prog.random_seed = SEED + np.random.seed(SEED) + + a_np = np.random.random(size=(32, 32)).astype('float32') + b_np = np.random.random(size=(32, 32)).astype('float32') + label_np = np.random.randint(2, size=(32, 1)).astype('int64') + + with paddle.static.program_guard(main_prog, startup_prog): + a = paddle.static.data(name="a", shape=[32, 32], dtype='float32') + b = paddle.static.data(name="b", shape=[32, 32], dtype='float32') + label = paddle.static.data( + name="label", shape=[32, 1], dtype='int64') + + sum = paddle.add(a, b) + z = paddle.pow(sum, 2.0) + + fc_1 = fluid.layers.fc(input=z, size=128) + prediction = fluid.layers.fc(input=fc_1, size=2) + + cost = fluid.layers.softmax_with_cross_entropy(prediction, label) + loss = fluid.layers.reduce_mean(cost) + sgd = fluid.optimizer.SGD(learning_rate=0.01) + sgd.minimize(loss) + + if run_mlu: + place = paddle.device.MLUPlace(0) + else: + place = paddle.CPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + print("Start run on {}".format(place)) + for epoch in range(100): + + pred_res, loss_res = exe.run( + main_prog, + feed={"a": a_np, + "b": b_np, + "label": label_np}, + fetch_list=[prediction, loss]) + if epoch % 10 == 0: + print("Epoch {} | Prediction[0]: {}, Loss: {}".format( + epoch, pred_res[0], loss_res)) + + return pred_res, loss_res + + def test_mlu(self): + cpu_pred, cpu_loss = self._test(False) + mlu_pred, mlu_loss = self._test(True) + + self.assertTrue(np.allclose(mlu_pred, cpu_pred)) + self.assertTrue(np.allclose(mlu_loss, cpu_loss)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index e05acdd6b42..754d7bd54b9 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -30,6 +30,7 @@ from copy import copy import paddle import paddle.fluid as fluid import paddle.fluid.core as core +from paddle.fluid.framework import _in_eager_mode from paddle.fluid.framework import _test_eager_guard from paddle.fluid.backward import append_backward from paddle.fluid.op import Operator @@ -1831,11 +1832,21 @@ class OpTest(unittest.TestCase): for no_grad_val in no_grad_set: del (inputs[no_grad_val]) - grad_inputs = paddle.grad( - outputs=fluid.layers.utils.flatten(outputs), - inputs=fluid.layers.utils.flatten(inputs), - grad_outputs=grad_outputs) - return [grad.numpy() for grad in grad_inputs] + if _in_eager_mode(): + core.eager.run_backward( + fluid.layers.utils.flatten(outputs), grad_outputs, + False) + grad_inputs = [] + for inputs_list in inputs.values(): + for inp in inputs_list: + grad_inputs.append(inp.grad.numpy()) + return grad_inputs + else: + grad_inputs = paddle.grad( + outputs=fluid.layers.utils.flatten(outputs), + inputs=fluid.layers.utils.flatten(inputs), + grad_outputs=grad_outputs) + return [grad.numpy() for grad in grad_inputs] @staticmethod def _numpy_to_lod_tensor(np_value, lod, place): diff --git 
a/python/paddle/fluid/tests/unittests/test_dataset.py b/python/paddle/fluid/tests/unittests/test_dataset.py index fcdac1d6241..6a32a68db1b 100644 --- a/python/paddle/fluid/tests/unittests/test_dataset.py +++ b/python/paddle/fluid/tests/unittests/test_dataset.py @@ -70,6 +70,14 @@ class TestDataset(unittest.TestCase): self.assertTrue(dataset.parse_content) self.assertEqual(dataset.trainer_num, 1) + def test_shuffle_by_uid(self): + """ + Testcase for shuffle_by_uid. + """ + dataset = paddle.distributed.InMemoryDataset() + dataset._set_uid_slot('6048') + dataset._set_shuffle_by_uid(True) + def test_run_with_dump(self): """ Testcase for InMemoryDataset from create to run. diff --git a/python/paddle/fluid/tests/unittests/test_diag_v2.py b/python/paddle/fluid/tests/unittests/test_diag_v2.py index 1478cd888c4..0371fa05428 100644 --- a/python/paddle/fluid/tests/unittests/test_diag_v2.py +++ b/python/paddle/fluid/tests/unittests/test_diag_v2.py @@ -21,6 +21,7 @@ import paddle import paddle.fluid as fluid from paddle.fluid import core from paddle.fluid import Program, program_guard +from paddle.fluid.framework import _test_eager_guard class TestDiagV2Op(OpTest): @@ -239,6 +240,9 @@ class TestDiagV2API(unittest.TestCase): def test_cpu(self): paddle.disable_static(place=paddle.fluid.CPUPlace()) self.run_imperative() + with _test_eager_guard(): + self.run_imperative() + paddle.enable_static() with fluid.program_guard(fluid.Program()): @@ -250,6 +254,8 @@ class TestDiagV2API(unittest.TestCase): paddle.disable_static(place=paddle.fluid.CUDAPlace(0)) self.run_imperative() + with _test_eager_guard(): + self.run_imperative() paddle.enable_static() with fluid.program_guard(fluid.Program()): diff --git a/python/paddle/fluid/tests/unittests/test_diagonal_op.py b/python/paddle/fluid/tests/unittests/test_diagonal_op.py index 5617716ecb6..4dab7c0df40 100644 --- a/python/paddle/fluid/tests/unittests/test_diagonal_op.py +++ b/python/paddle/fluid/tests/unittests/test_diagonal_op.py @@ -22,6 +22,7 @@ import paddle.nn.functional as F import paddle.fluid as fluid import paddle.fluid.core as core import paddle.tensor as tensor +from paddle.fluid.framework import _test_eager_guard paddle.enable_static() @@ -33,10 +34,10 @@ class TestDiagonalOp(OpTest): self.outputs = {'Out': self.target} def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def test_check_grad(self): - self.check_grad(['Input'], 'Out') + self.check_grad(['Input'], 'Out', check_eager=True) def init_config(self): self.case = np.random.randn(10, 5, 2).astype('float64') @@ -79,7 +80,8 @@ class TestDiagonalOpCase2(TestDiagonalOp): ['Input'], 'Out', user_defined_grads=[self.grad_x], - user_defined_grad_outputs=[self.grad_out]) + user_defined_grad_outputs=[self.grad_out], + check_eager=True) class TestDiagonalOpCase3(TestDiagonalOp): @@ -122,6 +124,10 @@ class TestDiagonalAPI(unittest.TestCase): self.assertEqual(np.allclose(out.numpy(), out_ref, rtol=1e-08), True) paddle.enable_static() + def test_api_eager_dygraph(self): + with _test_eager_guard(): + self.test_api_dygraph() + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_digamma_op.py b/python/paddle/fluid/tests/unittests/test_digamma_op.py index 503094779a3..3cb31b888f4 100644 --- a/python/paddle/fluid/tests/unittests/test_digamma_op.py +++ b/python/paddle/fluid/tests/unittests/test_digamma_op.py @@ -20,6 +20,7 @@ import paddle import paddle.fluid as fluid import paddle.static as static from op_test import OpTest +from 
paddle.fluid.framework import _test_eager_guard class TestDigammaOp(OpTest): @@ -94,6 +95,10 @@ class TestDigammaAPI(unittest.TestCase): res = paddle.digamma(input_t).numpy() self.assertEqual(np.allclose(res, sc_res, rtol=1e-05), True) + def test_in_eager_dynamic_mode(self): + with _test_eager_guard(): + self.test_in_dynamic_mode() + def test_name_argument(self): with static.program_guard(static.Program()): x = static.data(name="x", shape=self._shape, dtype=self.dtypes[0]) @@ -114,6 +119,13 @@ class TestDigammaAPI(unittest.TestCase): input_t = paddle.to_tensor(input) res = paddle.digamma(input_t) + with self.assertRaises(RuntimeError): + with fluid.dygraph.guard(): + with _test_eager_guard(): + input = np.random.random(self._shape).astype("int32") + input_t = paddle.to_tensor(input) + res = paddle.digamma(input_t) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_get_all_registered_op_kernels.py b/python/paddle/fluid/tests/unittests/test_get_all_registered_op_kernels.py new file mode 100644 index 00000000000..815598d9017 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_get_all_registered_op_kernels.py @@ -0,0 +1,43 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function +import unittest +from paddle.fluid import core +from paddle import compat as cpt + + +class TestGetAllRegisteredOpKernels(unittest.TestCase): + # reshape kernel is in fluid while not in pten + def test_pten_kernels(self): + self.assertTrue(core._get_all_register_op_kernels('pten')['sign']) + with self.assertRaises(KeyError): + core._get_all_register_op_kernels('pten')['reshape'] + + # sign kernel is removed from fluid and added into pten + def test_fluid_kernels(self): + self.assertTrue(core._get_all_register_op_kernels('fluid')['reshape']) + with self.assertRaises(KeyError): + core._get_all_register_op_kernels('fluid')['sign'] + + def test_all_kernels(self): + self.assertTrue(core._get_all_register_op_kernels('all')['reshape']) + self.assertTrue(core._get_all_register_op_kernels('all')['sign']) + + self.assertTrue(core._get_all_register_op_kernels()['reshape']) + self.assertTrue(core._get_all_register_op_kernels()['sign']) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_trunc_op.py b/python/paddle/fluid/tests/unittests/test_trunc_op.py index b4482b402ea..08a35db3ac4 100644 --- a/python/paddle/fluid/tests/unittests/test_trunc_op.py +++ b/python/paddle/fluid/tests/unittests/test_trunc_op.py @@ -21,6 +21,7 @@ import paddle import paddle.fluid.core as core import paddle.fluid as fluid from paddle.fluid import Program, program_guard +from paddle.fluid.framework import _test_eager_guard paddle.enable_static() @@ -78,6 +79,10 @@ class TestTruncAPI(unittest.TestCase): self.assertEqual(np.allclose(out.numpy(), out_ref, rtol=1e-08), True) paddle.enable_static() + def test_api_eager_dygraph(self): + with _test_eager_guard(): + self.test_api_dygraph() + def test_errors(self): with paddle.static.program_guard(paddle.static.Program()): x = paddle.fluid.data('X', [20, 20], 'bool') diff --git a/python/paddle/fluid/tests/unittests/xpu/test_softmax_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_softmax_op_xpu.py index f0f0e3d86df..aa56a463b90 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_softmax_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_softmax_op_xpu.py @@ -18,6 +18,8 @@ import sys import unittest sys.path.append("..") from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper + paddle.enable_static() np.random.seed(10) @@ -40,63 +42,49 @@ def ref_softmax(x, axis=None, dtype=None): return np.apply_along_axis(stable_softmax, axis, x_t) -class TestXPUSoftmaxOp(XPUOpTest): - def setUp(self): - self.op_type = "softmax" - self.shape = [2, 3, 4, 5] - self.axis = -1 - self.set_attrs() - self.init_type() - - x = np.random.uniform(-1, 1, self.shape).astype(self.dtype) - out = np.apply_along_axis(stable_softmax, self.axis, x) - - self.inputs = {'X': x} - self.outputs = {'Out': out} - self.attrs = {'axis': self.axis, 'use_xpu': True} - - def init_type(self): - self.dtype = np.float16 - - def set_attrs(self): - pass - - def test_check_output(self): - self.check_output_with_place(paddle.XPUPlace(0), atol=1e-4) - - def test_check_grad(self): - self.check_grad_with_place(paddle.XPUPlace(0), ['X'], 'Out') - - -# class TestXPUSoftmaxAxis3(TestXPUSoftmaxOp): -# def set_attrs(self): -# self.axis = 3 - -# class TestXPUSoftmax2D(TestXPUSoftmaxOp): -# def set_attrs(self): -# self.shape = [10, 12] - -# class TestXPUSoftmax3D(TestXPUSoftmaxOp): -# def set_attrs(self): -# self.shape = [4, 5, 6] - -# class 
TestXPUSoftmaxAxis3FP16(TestXPUSoftmaxOp): -# def set_attrs(self): -# self.axis = 3 -# def init_type(self): -# self.dtype = np.float16 - -# class TestXPUSoftmax2DFP16(TestXPUSoftmaxOp): -# def set_attrs(self): -# self.shape = [10, 12] -# def init_type(self): -# self.dtype = np.float16 - -# class TestXPUSoftmax3DFP16(TestXPUSoftmaxOp): -# def set_attrs(self): -# self.shape = [4, 5, 6] -# def init_type(self): -# self.dtype = np.float16 +class XPUTestSoftmaxOp(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'softmax' + self.use_dynamic_create_class = True + + def dynamic_create_class(self): + base_class = self.TestSoftmaxOp + classes = [] + shapes = [[2, 3, 4, 5], [7, 1], [63, 18], [2, 38512], [3, 4095]] + axis = [-1, 0, 1] + for shape in shapes: + for axi in axis: + class_name = 'XPUTestSoftmax_' + \ + str(shape) + "_" + str(axi) + attr_dict = {'shape': shape, 'axis': axi} + classes.append([class_name, attr_dict]) + return base_class, classes + + class TestSoftmaxOp(XPUOpTest): + def setUp(self): + self.op_type = "softmax" + if not hasattr(self, 'shape'): + self.shape = [1, 7] + self.axis = -1 + self.dtype = np.float32 + + x = np.random.uniform(-1, 1, self.shape).astype(self.dtype) + out = np.apply_along_axis(stable_softmax, self.axis, x) + + self.inputs = {'X': x} + self.outputs = {'Out': out} + self.attrs = {'axis': self.axis, 'use_xpu': True} + + def test_check_output(self): + self.check_output_with_place(paddle.XPUPlace(0), atol=1e-4) + + def test_check_grad(self): + self.check_grad_with_place(paddle.XPUPlace(0), ['X'], 'Out') + + +support_types = get_xpu_op_support_types('softmax') +for stype in support_types: + create_test_class(globals(), XPUTestSoftmaxOp, stype) if __name__ == "__main__": unittest.main() diff --git a/python/paddle/incubate/nn/layer/fused_transformer.py b/python/paddle/incubate/nn/layer/fused_transformer.py index ca14c551754..d38e8d1193b 100644 --- a/python/paddle/incubate/nn/layer/fused_transformer.py +++ b/python/paddle/incubate/nn/layer/fused_transformer.py @@ -75,7 +75,7 @@ class FusedMultiHeadAttention(Layer): embed_dim, num_heads, dropout_rate=0.5, - attn_dropout_rate=None, + attn_dropout_rate=0.5, kdim=None, vdim=None, normalize_before=False, diff --git a/python/paddle/nn/quant/quant_layers.py b/python/paddle/nn/quant/quant_layers.py index 36f4c222877..7ad43da6ed5 100644 --- a/python/paddle/nn/quant/quant_layers.py +++ b/python/paddle/nn/quant/quant_layers.py @@ -67,7 +67,7 @@ class FakeQuantAbsMax(layers.Layer): if quant_on_weight: scale_attr = ParamAttr( name=self._scale_name, - initializer=Constant(0.0), + initializer=Constant(0.001), trainable=False) self._scale = self.create_parameter( shape=[1], attr=scale_attr, dtype=self._dtype) diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index 989da2db718..47dc02705f8 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -91,7 +91,7 @@ class Optimizer(object): loss = paddle.mean(out) adam = paddle.optimizer.Adam(learning_rate=0.1, parameters=linear.parameters()) - out.backward() + loss.backward() adam.step() adam.clear_grad() @@ -114,7 +114,7 @@ class Optimizer(object): 'learning_rate': 0.1 }], weight_decay=0.01) - out.backward() + loss.backward() sgd.step() sgd.clear_grad() @@ -1153,7 +1153,7 @@ class Optimizer(object): adam = paddle.optimizer.Adam(learning_rate=0.1, parameters=linear.parameters(), weight_decay=0.01) - out.backward() + loss.backward() adam.minimize(loss) adam.clear_grad() diff --git 
a/python/paddle/tensor/search.py b/python/paddle/tensor/search.py index e15d2d49d54..2a2e7d000a1 100644 --- a/python/paddle/tensor/search.py +++ b/python/paddle/tensor/search.py @@ -585,26 +585,49 @@ def where(condition, x=None, y=None, name=None): condition_shape = list(condition.shape) x_shape = list(x.shape) y_shape = list(y.shape) + if x_shape == y_shape and condition_shape == x_shape: - if in_dygraph_mode(): - return _C_ops.where(condition, x, y) - else: - helper = LayerHelper("where", **locals()) - out = helper.create_variable_for_type_inference(dtype=x.dtype) - - helper.append_op( - type='where', - inputs={'Condition': condition, - 'X': x, - 'Y': y}, - outputs={'Out': [out]}) + broadcast_condition = condition + broadcast_x = x + broadcast_y = y + else: + if core.is_compiled_with_xpu(): + cond_int = layers.cast(condition, x.dtype) + cond_not_int = layers.cast(layers.logical_not(condition), x.dtype) + out1 = layers.elementwise_mul(x, cond_int) + out2 = layers.elementwise_mul(y, cond_not_int) + out = layers.elementwise_add(out1, out2) return out + + zeros_like_x = layers.zeros_like(x) + zeros_like_y = layers.zeros_like(y) + zeros_like_condition = layers.zeros_like(condition) + zeros_like_condition = layers.cast(zeros_like_condition, x.dtype) + cast_cond = layers.cast(condition, x.dtype) + + broadcast_zeros = layers.elementwise_add(zeros_like_x, zeros_like_y) + broadcast_zeros = layers.elementwise_add(broadcast_zeros, + zeros_like_condition) + broadcast_x = layers.elementwise_add(x, broadcast_zeros) + broadcast_y = layers.elementwise_add(y, broadcast_zeros) + broadcast_condition = layers.elementwise_add(cast_cond, broadcast_zeros) + broadcast_condition = layers.cast(broadcast_condition, 'bool') + + if in_dygraph_mode(): + return _C_ops.where(broadcast_condition, broadcast_x, broadcast_y) else: - cond_int = layers.cast(condition, x.dtype) - cond_not_int = layers.cast(layers.logical_not(condition), x.dtype) - out1 = layers.elementwise_mul(x, cond_int) - out2 = layers.elementwise_mul(y, cond_not_int) - out = layers.elementwise_add(out1, out2) + helper = LayerHelper("where", **locals()) + out = helper.create_variable_for_type_inference(dtype=x.dtype) + + helper.append_op( + type='where', + inputs={ + 'Condition': broadcast_condition, + 'X': broadcast_x, + 'Y': broadcast_y + }, + outputs={'Out': [out]}) + return out diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index e5ccd6b0405..7768cb926e4 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -23,7 +23,7 @@ output : Tensor infer_meta : func : ConcatInferMeta - param : [x, axis, true] + param : [x, axis] kernel : func : concat diff --git a/python/paddle/utils/code_gen/api_gen.py b/python/paddle/utils/code_gen/api_gen.py index 09182768f24..9a772ad126c 100644 --- a/python/paddle/utils/code_gen/api_gen.py +++ b/python/paddle/utils/code_gen/api_gen.py @@ -27,6 +27,7 @@ class API: # args: # inputs: # names : [], list of input names + # input_info : {input_name : type} # attrs: # names : [], list of attribute names # attr_info : { attr_name : (type, default_values)} @@ -57,6 +58,18 @@ class API: if 'param' not in self.infer_meta: self.infer_meta['param'] = None + self.data_transform = { + 'skip_transform': [], + 'support_trans_dtype': [] + } + if 'data_transform' in api_item_yaml: + if 'skip_transform' in api_item_yaml['data_transform']: + self.data_transform['skip_transform'] = api_item_yaml[ + 'data_transform']['skip_transform'] + if 'support_trans_dtype' 
in api_item_yaml['data_transform']: + self.data_transform['support_trans_dtype'] = api_item_yaml[ + 'data_transform']['support_trans_dtype'] + def gene_api_declaration(self): return f""" PADDLE_API {self.return_type} {self.api}({self.args['args_declare']}); @@ -64,13 +77,15 @@ PADDLE_API {self.return_type} {self.api}({self.args['args_declare']}); def gene_output(self, output_type_list): kernel_output = "" + output_names = [] output_create = "" if len(output_type_list) == 1: kernel_output = 'dense_out' + output_names.append('dense_out') output_create = f""" {self.return_type} out; - auto dense_out = SetKernelOutput(out_meta, kernel_backend, &out);""" + auto dense_out = SetKernelOutput(kernel_backend, &out);""" elif len(output_type_list) > 1: output_create = f""" @@ -78,8 +93,9 @@ PADDLE_API {self.return_type} {self.api}({self.args['args_declare']}); for i in range(len(output_type_list)): kernel_output = kernel_output + f'dense_out_{i}, ' + output_names.append(f'dense_out_{i}') output_create = output_create + f""" - auto dense_out_{i} = SetKernelOutput(std::get<{i}>(out_meta), kernel_backend, &std::get<{i}>(out));""" + auto dense_out_{i} = SetKernelOutput(kernel_backend, &std::get<{i}>(out));""" kernel_output = kernel_output[:-2] else: @@ -87,24 +103,25 @@ PADDLE_API {self.return_type} {self.api}({self.args['args_declare']}); "{} : Output error: the output should not be empty.".format( self.api)) - return kernel_output, output_create + return kernel_output, output_names, output_create def gene_api_code(self): if self.is_base_api: - input_tensors, kernel_args = gen_utils.get_kernel_args( - self.args['inputs']['names'], self.args['attrs'], - self.kernel['param']) - outputs_args, output_create = self.gene_output(self.out_type_list) + input_tensors, kernel_args, kernel_signature = gen_utils.get_kernel_args( + self.args['inputs'], self.args['attrs'], self.out_type_list, + self.kernel['param'], self.data_transform) + outputs_args, output_names, output_create = self.gene_output( + self.out_type_list) return f""" PADDLE_API {self.return_type} {self.api}({self.args["args_define"]}) {{ {gen_utils.gene_kernel_select(self.api, self.args['inputs']['names'], self.args['attrs'], self.kernel)} auto* dev_ctx = GetDeviceContextByBackend(kernel_backend); {input_tensors} -{gen_utils.gene_infer_meta(self.args['inputs']['names'], self.args['attrs']['names'], self.infer_meta)} {output_create} - - auto* kernel_fn = kernel.GetVariadicKernelFn(); +{gen_utils.gene_infer_meta(self.args['inputs']['names'], self.args['attrs']['names'], output_names, self.infer_meta)} + using kernel_signature = {kernel_signature}; + auto* kernel_fn = kernel.GetVariadicKernelFn(); (*kernel_fn)({kernel_args}, {outputs_args}); return out; @@ -136,9 +153,9 @@ def source_include(header_file_path): #include "glog/logging.h" -#include "paddle/pten/api/include/kernel_signature.h" #include "paddle/pten/api/lib/api_registry.h" #include "paddle/pten/api/lib/api_utils.h" +#include "paddle/pten/api/lib/data_transform.h" #include "paddle/pten/api/lib/kernel_dispatch.h" #include "paddle/pten/api/lib/utils/storage.h" #include "paddle/pten/core/kernel_registry.h" diff --git a/python/paddle/utils/code_gen/backward_api_gen.py b/python/paddle/utils/code_gen/backward_api_gen.py index d55759b51c2..53207a089fb 100644 --- a/python/paddle/utils/code_gen/backward_api_gen.py +++ b/python/paddle/utils/code_gen/backward_api_gen.py @@ -50,6 +50,20 @@ class BackwardAPI: 'param']) == 0: self.infer_meta['param'] = None + self.data_transform = { + 'skip_transform': [], + 
'support_trans_dtype': [] + } + if 'data_transform' in backward_item_yaml: + if 'skip_transform' in backward_item_yaml['data_transform']: + self.data_transform['skip_transform'] = backward_item_yaml[ + 'data_transform']['skip_transform'] + if 'support_trans_dtype' in backward_item_yaml[ + 'data_transform']: + self.data_transform[ + 'support_trans_dtype'] = backward_item_yaml[ + 'data_transform']['support_trans_dtype'] + def parse_forward_config(self, forward_config): # api_name (const Tensor& input, ... , int attr, ...) -> Tensor(out) result = re.search( @@ -105,24 +119,32 @@ class BackwardAPI: def gene_output(self, output_type_list): kernel_output = "" + output_names = [] output_create = "" if len(output_type_list) == 1: - return_type = output_type_list[0] kernel_output = 'dense_out' + output_names.append('dense_out') output_create = f""" {self.return_type} out; - auto dense_out = SetKernelOutput(out_meta, kernel_backend, &out);""" + auto dense_out = SetKernelOutput(kernel_backend, &out);""" elif len(output_type_list) > 1: output_create = f""" - {self.return_type} out;""" + {self.return_type} out({len(output_type_list)});""" for i, out_type_item in enumerate(output_type_list): kernel_output = kernel_output + f'dense_out_{i}, ' - get_out_code = f'&out[{i}][0]' if out_type_item == 'Tensor' else f'&out[{i}]' + output_names.append(f'dense_out_{i}') + if out_type_item == 'Tensor': + get_out_code = f'&out[{i}][0]' + output_create = output_create + f""" + out[{i}].emplace_back();""" + + else: + get_out_code = f'&out[{i}]' output_create = output_create + f""" - auto dense_out_{i} = SetKernelOutput(std::get<{i}>(out_meta), kernel_backend, {get_out_code});""" + auto dense_out_{i} = SetKernelOutput(kernel_backend, {get_out_code});""" kernel_output = kernel_output[:-2] else: @@ -130,14 +152,14 @@ class BackwardAPI: "{} : Output error: the output should not be empty.".format( self.backward_api)) - return kernel_output, output_create + return kernel_output, output_names, output_create def gene_api_code(self): if self.is_base_api: - input_tensors, kernel_args = gen_utils.get_kernel_args( - self.args['inputs']['names'], self.args['attrs'], - self.kernel['param']) - outputs_args, output_create = self.gene_output( + input_tensors, kernel_args, kernel_signature = gen_utils.get_kernel_args( + self.args['inputs'], self.args['attrs'], self.output_type_list, + self.kernel['param'], self.data_transform) + outputs_args, output_names, output_create = self.gene_output( self.output_type_list) return f""" // {self.return_comment} @@ -146,10 +168,11 @@ class BackwardAPI: auto* dev_ctx = GetDeviceContextByBackend(kernel_backend); {input_tensors} -{gen_utils.gene_infer_meta(self.args['inputs']['names'], self.args['attrs']['names'], self.infer_meta)} {output_create} +{gen_utils.gene_infer_meta(self.args['inputs']['names'], self.args['attrs']['names'], output_names, self.infer_meta)} - auto* kernel_fn = kernel.GetVariadicKernelFn(); + using kernel_signature = {kernel_signature}; + auto* kernel_fn = kernel.GetVariadicKernelFn(); (*kernel_fn)({kernel_args}, {outputs_args}); return out; @@ -197,9 +220,9 @@ def source_include(header_file_path): #include "glog/logging.h" -#include "paddle/pten/api/include/kernel_signature.h" #include "paddle/pten/api/lib/api_registry.h" #include "paddle/pten/api/lib/api_utils.h" +#include "paddle/pten/api/lib/data_transform.h" #include "paddle/pten/api/lib/kernel_dispatch.h" #include "paddle/pten/api/lib/utils/storage.h" #include "paddle/pten/core/kernel_registry.h" diff --git 
a/python/paddle/utils/code_gen/gen_utils.py b/python/paddle/utils/code_gen/gen_utils.py index bdc29420558..5ce5d96429e 100644 --- a/python/paddle/utils/code_gen/gen_utils.py +++ b/python/paddle/utils/code_gen/gen_utils.py @@ -15,6 +15,7 @@ import re PREFIX_TENSOR_NAME = 'dense_' +PREFIX_META_TENSOR_NAME = 'meta_' def parse_args(api_name, args_str): @@ -265,13 +266,21 @@ def gene_kernel_select(api, input_names, attrs, kernel) -> str: return kernel_select_code -def gene_infer_meta(input_names, attr_names, infer_meta) -> str: - infer_meta_params = infer_meta['param'] if infer_meta[ - 'param'] is not None else input_names + attr_names +def gene_infer_meta(input_names, attr_names, output_names, infer_meta) -> str: + infer_meta_params = infer_meta['param'] + output_names if infer_meta[ + 'param'] is not None else input_names + attr_names + output_names + # generate meta tensors + meta_tensor_code = "" param_code = "" for param in infer_meta_params: if param in input_names: - param_code = param_code + "GetDenseTensorMeta(*" + PREFIX_TENSOR_NAME + param + "), " + param_code = param_code + "MakeMetaTensor(*" + PREFIX_TENSOR_NAME + param + "), " + elif param in output_names: + meta_tensor_code = meta_tensor_code + " pten::MetaTensor " + param.replace( + PREFIX_TENSOR_NAME, + PREFIX_META_TENSOR_NAME) + "(" + param + ");\n" + param_code = param_code + "&" + param.replace( + PREFIX_TENSOR_NAME, PREFIX_META_TENSOR_NAME) + ", " elif param in attr_names: param_code = param_code + param + ", " elif isinstance(param, str): @@ -282,12 +291,26 @@ def gene_infer_meta(input_names, attr_names, infer_meta) -> str: param_code = param_code + str(param) + ", " param_code = param_code[:-2] - return f""" - auto out_meta = pten::{infer_meta['func']}({param_code}); + return f"""{meta_tensor_code} + pten::{infer_meta['func']}({param_code}); """ -def get_kernel_args(input_names, attrs, kernel_param): +def get_kernel_args(inputs, attrs, out_type_list, kernel_param, data_transform): + input_trans_map = { + 'const Tensor&': 'const pten::DenseTensor&', + 'const Tensor &': 'const pten::DenseTensor&', + 'const std::vector&': 'const std::vector&', + 'const std::vector &': 'const std::vector&' + } + out_trans_map = { + 'Tensor': 'pten::DenseTensor*', + 'std::vector': 'std::vector&' + } + input_names = inputs['names'] + input_infos = inputs['input_info'] + kernel_args_type_list = ['const platform::DeviceContext&'] + input_tensor_code = "" for input_name in input_names: # set input code @@ -298,19 +321,46 @@ def get_kernel_args(input_names, attrs, kernel_param): if kernel_param is None: kernel_param = input_names + attr_names + input_tensor_code = "" + for i, input_name in enumerate(input_names): + # set input code + if input_name in kernel_param: + trans_flag = "{}" + if input_name in data_transform['skip_transform']: + trans_flag = "{true}" + elif input_name in data_transform['support_trans_dtype']: + trans_flag = "{false, true}" + input_tensor_code = input_tensor_code + f""" + auto {PREFIX_TENSOR_NAME}{input_name} = PrepareData({input_name}, kernel.InputAt({i}), {trans_flag});""" + + else: + input_tensor_code = input_tensor_code + f""" + auto {PREFIX_TENSOR_NAME}{input_name} = TensorToDenseTensor({input_name});""" + kernel_args = "*dev_ctx, " for param in kernel_param: if param in input_names: kernel_args = kernel_args + "*" + PREFIX_TENSOR_NAME + param + ", " + kernel_args_type_list.append(input_trans_map[input_infos[param]]) elif param in attr_names: # set attr for kernel_context if 'ScalarArray' in 
attrs['attr_info'][param][0]: + kernel_args_type_list.append('const pten::ScalarArray&') param = 'pten::ScalarArray(' + param + ')' elif 'Scalar' in attrs['attr_info'][param][0]: + kernel_args_type_list.append('const pten::Scalar&') param = 'pten::Scalar(' + param + ')' + else: + kernel_args_type_list.append(attrs['attr_info'][param][0]) kernel_args = kernel_args + param + ", " elif isinstance(param, bool): kernel_args = kernel_args + str(param).lower() + ", " else: kernel_args = kernel_args + str(param) + ", " - return input_tensor_code, kernel_args[:-2] + + for out_type in out_type_list: + kernel_args_type_list.append(out_trans_map[out_type]) + + kernel_signature = "void(*)(" + ", ".join(kernel_args_type_list) + ")" + + return input_tensor_code, kernel_args[:-2], kernel_signature diff --git a/python/setup.py.in b/python/setup.py.in index d1c0157c2b3..d5b237d2a3f 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -269,6 +269,7 @@ packages=['paddle', 'paddle.dataset', 'paddle.reader', 'paddle.distributed', + 'paddle.distributed.metric', 'paddle.incubate', 'paddle.incubate.optimizer', 'paddle.incubate.checkpoint', @@ -566,13 +567,15 @@ def find_files(pattern, root, recursive=False): break headers = ( + # paddle level api headers list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle')) + list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/pten/api')) + # pten unify api header list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/pten/api/ext')) + # custom op api list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/pten/api/include')) + # pten api list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/pten/common')) + # pten common headers - # For paddle uew custom op, only copy data type headers from `paddle/fluid/platform` - # to `paddle/pten/api/ext`, + # pten level api headers (low level api) + list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/pten/core', recursive=True)) + # pten core headers + # utila api headers ['@PADDLE_SOURCE_DIR@/paddle/utils/any.h'] + ['@PADDLE_SOURCE_DIR@/paddle/utils/small_vector.h']) @@ -619,8 +622,6 @@ class InstallHeaders(Command): elif 'third_party' not in header: # paddle headers install_dir = re.sub('@PADDLE_SOURCE_DIR@/', '', header) - if 'fluid' in install_dir: - install_dir = "paddle/pten/common/" else: # third_party install_dir = re.sub('${THIRD_PARTY_PATH}', 'third_party', header) diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh index 503c763c08c..09f22c33a84 100644 --- a/tools/check_file_diff_approvals.sh +++ b/tools/check_file_diff_approvals.sh @@ -229,18 +229,6 @@ if [ "${HAS_MODIFIED_ALLOCATION}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then check_approval 1 6888866 39303645 fi -HAS_MODIFIED_TENSOR=`git diff --name-only upstream/$BRANCH | grep "paddle/fluid/framework/tensor" || true` -if [ "${HAS_MODIFIED_TENSOR}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then - echo_line="You must be approved by jim19930609 or chenwhql for paddle/fluid/framework/tensor. It is being modularized and refactored. Thanks!\n" - check_approval 1 22561442 22334008 - fi - -HAS_MODIFIED_LOD_TENSOR=`git diff --name-only upstream/$BRANCH | grep "paddle/fluid/framework/lod_tensor" || true` -if [ "${HAS_MODIFIED_LOD_TENSOR}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then - echo_line="You must be approved by jim19930609 or chenwhql for paddle/fluid/framework/lod_tensor. It is being modularized and refactored. 
Thanks!\n" - check_approval 1 22561442 22334008 - fi - ALL_PADDLE_ENFORCE=`git diff -U0 upstream/$BRANCH |grep "^+" |grep -zoE "PADDLE_ENFORCE\(.[^,\);]+.[^;]*\);\s" || true` if [ "${ALL_PADDLE_ENFORCE}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then echo_line="PADDLE_ENFORCE is not recommended. Please use PADDLE_ENFORCE_EQ/NE/GT/GE/LT/LE or PADDLE_ENFORCE_NOT_NULL or PADDLE_ENFORCE_GPU_SUCCESS instead, see [ https://github.com/PaddlePaddle/Paddle/wiki/PADDLE_ENFORCE-Rewriting-Specification ] for details.\nYou must have one RD (chenwhql (Recommend) , luotao1 (Recommend) or lanxianghit) approval for the usage (either add or delete) of PADDLE_ENFORCE.\n${ALL_PADDLE_ENFORCE}\n" @@ -336,8 +324,8 @@ if [ "${NEW_OP_TEST_ADDED}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then CHECK_WHOLE=$CHECK_OUTPUT$CHECK_OUTPUT_WITH_PLACE$CHECK_GRAD$CHECK_GRAD_CHECK if [ "${CHECK_WHOLE}" != "" ] ; then CHECK_OP=${CHECK_WHOLE//+/'\n+'} - echo_line="Please use the default precision parameters of 'atol, rtol, eps, max_relative_error'. If you don't use the default value, you must have one RD (Xreki (Recommend), fuyinno4 (Recommend for kunlun), zhiqiu or qili93 (Recommend for NPU) , luotao1, lanxianghit or phlrain) approval for the usage of other values. The detailed information is in the link: https://github.cor/PaddlePaddle/Paddle/wiki/OP-test-accuracy-requirements. The error line is ${CHECK_OP}\n" - check_approval 1 6836917 47554610 12538138 43953930 35824027 6888866 16605440 + echo_line="Please use the default precision parameters of 'atol, rtol, eps, max_relative_error'. If you don't use the default value, you must have one RD (Xreki (Recommend), fuyinno4, QingshuChen(Recommend for kunlun), zhiqiu or qili93 (Recommend for NPU) , luotao1, lanxianghit or phlrain) approval for the usage of other values. The detailed information is in the link: https://github.cor/PaddlePaddle/Paddle/wiki/OP-test-accuracy-requirements. The error line is ${CHECK_OP}\n" + check_approval 1 6836917 47554610 12538138 43953930 35824027 6888866 16605440 2002279 fi fi -- GitLab