From da50a0093e76dda388976e27a51f060348dd7be6 Mon Sep 17 00:00:00 2001
From: YuanRisheng <yuanrisheng@baidu.com>
Date: Fri, 26 May 2023 17:12:32 +0800
Subject: [PATCH] [PHI Decoupling]Create PHI shared lib (#53735)

* create phi so

* fix ci bugs

* fix py3 bugs

* add file

* fix py3 bugs

* fix windows bugs

* perfect so

* fix py3 bugs

* delete all static target in phi

* fix windows bugs

* fix py3 bugs

* fix ci bugs

* fix windows bugs

* fix bugs: gflags can't be linked by dynamic and static lib

* fix bug where 3rd-party libraries cannot be loaded

* fix ci bugs

* fix compile bugs

* fix py3 bugs

* fix conflict

* fix xpu bugs

* fix mac compile bugs

* fix psgpu bugs

* fix inference failed

* deal with conflict

* fix LIBRARY_PATH bug

* fix windows bugs

* fix onednn error

* fix windows compile bugs

* fix windows compile bugs

* fix test_cuda_graph_static_mode_error aborted

* fix windows bugs

* fix mac-python3 error

* fix hip compile bugs

* change mode to static

* change to static mode

* fix ci bugs

* fix py3 bugs

* fix windows bugs

* fix bugs

* add static flag

* add PADDLE_API

* change position of PADDLE_API

* fix windows bugs

* change mode to dynamic lib

* fix windows static bugs

* deal with conflict

* fix windows unit bug

* fix coverage

* deal with conflict

* fix windows-inference

* fix py3 bugs

* fix bugs when compile type_info

* fix compile bugs

* fix py3 bugs

* fix windows bugs

* fix windows openblas

* fix xpu bugs

* fix enforce_test in windows

* update code according to review comments

* fix windows cmake bug

* fix windows bugs

* fix windows bugs

* delete cinn unittest

* fix cinn bugs

---------

Co-authored-by: lzydev <1528794076@qq.com>
---
 cmake/cblas.cmake                             |   1 -
 cmake/configure.cmake                         |  13 +
 cmake/external/warpctc.cmake                  |   3 +-
 cmake/generic.cmake                           |  48 +--
 cmake/inference_lib.cmake                     |   7 +
 cmake/operators.cmake                         |   7 +-
 cmake/phi.cmake                               |  31 ++
 paddle/fluid/dialect/CMakeLists.txt           |   2 +-
 .../distributed/auto_parallel/CMakeLists.txt  |   3 +-
 .../auto_parallel/test/CMakeLists.txt         |   8 +-
 .../distributed/collective/CMakeLists.txt     |  31 +-
 .../distributed/fleet_executor/CMakeLists.txt |   6 +-
 .../distributed/ps/service/CMakeLists.txt     |  11 +-
 .../fluid/distributed/ps/table/CMakeLists.txt |   4 +-
 paddle/fluid/distributed/rpc/CMakeLists.txt   |   2 +-
 paddle/fluid/distributed/test/CMakeLists.txt  |   2 +-
 paddle/fluid/eager/CMakeLists.txt             |  15 +-
 .../fluid/eager/accumulation/CMakeLists.txt   |   2 +-
 .../eager_generated/backwards/CMakeLists.txt  |   2 +-
 .../eager_generated/forwards/CMakeLists.txt   |   2 +-
 paddle/fluid/eager/api/utils/CMakeLists.txt   |   4 +-
 .../eager/auto_code_generator/CMakeLists.txt  |   9 +
 .../generator/eager_gen.py                    |   2 +-
 .../eager/custom_operator/CMakeLists.txt      |   2 +-
 paddle/fluid/eager/pylayer/CMakeLists.txt     |   2 +-
 paddle/fluid/framework/CMakeLists.txt         |  89 ++---
 paddle/fluid/framework/details/CMakeLists.txt |  58 ++-
 .../fluid/framework/details/build_strategy.cc |   3 +-
 paddle/fluid/framework/ir/CMakeLists.txt      |   7 +-
 .../fluid/framework/ir/fuse_adamw_op_pass.cc  |   2 +-
 .../framework/ir/fusion_group/CMakeLists.txt  |   4 +-
 .../ir/memory_optimize_pass/CMakeLists.txt    |   3 +-
 .../framework/new_executor/CMakeLists.txt     |   2 +-
 .../new_executor/interpreter/CMakeLists.txt   |   3 +-
 .../new_executor/workqueue/CMakeLists.txt     |   2 +-
 .../framework/paddle2cinn/CMakeLists.txt      |   4 +-
 paddle/fluid/framework/raw_tensor.h           |   3 +-
 paddle/fluid/framework/type_info.cc           |  54 +++
 paddle/fluid/imperative/CMakeLists.txt        |  32 +-
 paddle/fluid/inference/CMakeLists.txt         |  30 +-
 paddle/fluid/inference/api/CMakeLists.txt     |   4 +-
 .../fluid/inference/api/analysis_predictor.cc |   8 +-
 .../inference/api/demo_ci/CMakeLists.txt      |   2 +-
 paddle/fluid/inference/api/demo_ci/run.sh     |   1 +
 .../inference/api/details/CMakeLists.txt      |   4 +-
 .../fluid/inference/capi_exp/CMakeLists.txt   |   2 +-
 paddle/fluid/inference/goapi/test.sh          |   2 +-
 .../inference/tensorrt/convert/CMakeLists.txt |   3 +-
 paddle/fluid/memory/allocation/CMakeLists.txt |   4 +-
 .../memory/allocation/allocator_facade.cc     |   3 +-
 paddle/fluid/operators/CMakeLists.txt         |  19 +-
 paddle/fluid/operators/cinn/CMakeLists.txt    |   2 +-
 .../fluid/operators/collective/CMakeLists.txt |   5 +-
 .../fluid/operators/detection/CMakeLists.txt  |   4 +-
 .../fluid/operators/generator/CMakeLists.txt  |   2 +-
 paddle/fluid/operators/gru_op.cc              |   3 +-
 paddle/fluid/operators/math/CMakeLists.txt    |  13 +-
 paddle/fluid/operators/pscore/CMakeLists.txt  |   4 +-
 .../operators/sequence_ops/CMakeLists.txt     |   2 +-
 paddle/fluid/operators/var_conv_2d_op.cc      |   2 +-
 paddle/fluid/platform/CMakeLists.txt          |  58 +--
 paddle/fluid/platform/cpu_helper.cc           |   4 +-
 .../platform/device/custom/CMakeLists.txt     |   4 +-
 .../fluid/platform/device/gpu/CMakeLists.txt  |  10 +-
 .../fluid/platform/device/xpu/CMakeLists.txt  |  16 +-
 paddle/fluid/platform/dynload/CMakeLists.txt  |  18 +-
 paddle/fluid/platform/dynload/mklml.cc        |  32 --
 paddle/fluid/platform/dynload/mklml.h         | 113 ------
 paddle/fluid/platform/profiler.cc             |  16 +
 paddle/fluid/platform/profiler/CMakeLists.txt |   4 +-
 paddle/fluid/pybind/CMakeLists.txt            |  23 +-
 paddle/fluid/pybind/eager_utils.cc            |   4 +-
 paddle/phi/CMakeLists.txt                     | 177 +++++++--
 paddle/phi/api/CMakeLists.txt                 |   5 -
 paddle/phi/api/all.cc                         |  19 -
 paddle/phi/api/ext/op_meta_info.h             |  50 +--
 paddle/phi/api/include/tensor.h               |   4 +-
 paddle/phi/api/lib/CMakeLists.txt             | 365 ++++--------------
 paddle/phi/api/lib/context_pool.cc            |  11 +-
 paddle/phi/api/lib/op_meta_info.cc            |  88 ++++-
 paddle/phi/api/lib/tensor.cc                  |   4 +
 paddle/phi/api/profiler/CMakeLists.txt        |  14 +-
 paddle/phi/backends/CMakeLists.txt            |  35 +-
 paddle/phi/backends/cpu/cpu_context.cc        |   4 +
 paddle/phi/backends/custom/custom_context.cc  |   5 +
 paddle/phi/backends/dynload/CMakeLists.txt    |  79 ++--
 paddle/phi/backends/gpu/cuda/CMakeLists.txt   |   2 +-
 paddle/phi/backends/gpu/gpu_context.cc        |   9 +
 paddle/phi/backends/gpu/gpu_context.h         |   4 +
 paddle/phi/backends/onednn/onednn_context.cc  |   7 +
 paddle/phi/backends/onednn/onednn_context.h   |   7 +-
 paddle/phi/backends/xpu/xpu_context.cc        |   3 +
 paddle/phi/capi/CMakeLists.txt                |  12 -
 paddle/phi/capi/all.cc                        |  19 -
 paddle/phi/capi/lib/CMakeLists.txt            |  56 +--
 paddle/phi/common/CMakeLists.txt              |  27 +-
 paddle/phi/core/CMakeLists.txt                | 179 ++-------
 paddle/phi/core/compat/CMakeLists.txt         |  25 +-
 paddle/phi/core/compat/op_utils.cc            |  12 +
 paddle/phi/core/compat/op_utils.h             |  10 +-
 paddle/phi/core/dense_tensor.cc               |  25 +-
 paddle/phi/core/distributed/CMakeLists.txt    |  26 +-
 .../distributed/auto_parallel/CMakeLists.txt  |  23 +-
 .../phi/core/distributed/check/CMakeLists.txt |  12 +-
 .../phi/core/distributed/store/CMakeLists.txt |  14 +-
 .../phi/core/distributed/store/tcp_store.cc   |   8 +-
 paddle/phi/core/enforce.cc                    |  18 +-
 paddle/phi/core/flags.h                       |   2 +-
 paddle/phi/core/lod_utils.cc                  |   5 +-
 paddle/phi/core/selected_rows.cc              |   5 +
 paddle/phi/core/sparse_coo_tensor.cc          |  17 +-
 paddle/phi/core/sparse_csr_tensor.cc          |  30 +-
 paddle/phi/core/storage_properties.cc         |  32 ++
 paddle/phi/core/string_tensor.cc              |  11 +-
 paddle/phi/core/tensor_array.cc               |   5 +
 paddle/phi/core/utils/type_info.h             |   4 -
 paddle/phi/infermeta/CMakeLists.txt           |  18 +-
 paddle/phi/infermeta/multiary.cc              |  26 +-
 paddle/phi/infermeta/sparse/CMakeLists.txt    |  10 +-
 paddle/phi/infermeta/strings/CMakeLists.txt   |   5 +-
 paddle/phi/infermeta/unary.cc                 |  28 +-
 paddle/phi/kernels/CMakeLists.txt             | 187 ++-------
 paddle/phi/kernels/autotune/CMakeLists.txt    |  16 +-
 paddle/phi/kernels/autotune/cache_base.h      |   4 +-
 paddle/phi/kernels/cpu/rmsprop_kernel.cc      |   4 -
 paddle/phi/kernels/funcs/CMakeLists.txt       |  70 +---
 paddle/phi/kernels/funcs/blas/CMakeLists.txt  |   5 +-
 paddle/phi/kernels/funcs/blas/blas_impl.cu.h  |   5 +-
 .../phi/kernels/funcs/detail/CMakeLists.txt   |   2 +-
 paddle/phi/kernels/funcs/eigen/CMakeLists.txt |  18 +-
 paddle/phi/kernels/funcs/jit/CMakeLists.txt   |  24 +-
 .../phi/kernels/funcs/jit/gen/CMakeLists.txt  |   8 +-
 paddle/phi/kernels/funcs/jit/gen_base.h       |   2 +-
 .../phi/kernels/funcs/jit/more/CMakeLists.txt |   4 -
 .../funcs/jit/more/intrinsic/CMakeLists.txt   |   8 +-
 .../kernels/funcs/jit/more/mix/CMakeLists.txt |   8 +-
 .../kernels/funcs/jit/more/mkl/CMakeLists.txt |   8 +-
 .../kernels/funcs/jit/refer/CMakeLists.txt    |   8 +-
 .../phi/kernels/funcs/lapack/CMakeLists.txt   |   2 +-
 paddle/phi/kernels/funcs/math_function.h      |   3 +-
 ...matrix_inverse.cu.cc => matrix_inverse.cu} |   0
 .../cutlass/memory_efficient_attention.cu     |  43 ++-
 .../memory_efficient_attention_backward.cu    |  44 ++-
 paddle/phi/kernels/gpu/eigvalsh_kernel.cu     |   4 +
 paddle/phi/kernels/gpu/gelu_funcs.h           |   2 +-
 paddle/phi/kernels/impl/isclose_kernel_impl.h |  26 +-
 .../phi/kernels/impl/slice_grad_kernel_impl.h |  26 +-
 paddle/phi/kernels/transfer_layout_kernel.cc  |   1 +
 paddle/scripts/paddle_build.sh                |   4 +
 paddle/testing/CMakeLists.txt                 |   2 +-
 paddle/utils/CMakeLists.txt                   |   6 +-
 paddle/utils/string/CMakeLists.txt            |   6 +-
 python/env_dict.py.in                         |   3 +
 .../fluid/tests/unittests/CMakeLists.txt      |   8 -
 .../test_parallel_executor_run_cinn.py        |   4 +-
 .../unittests/test_resnet50_with_cinn.py      | 149 -------
 python/setup.py.in                            |  12 +-
 setup.py                                      |  23 +-
 test/CMakeLists.txt                           |   7 +-
 test/cpp/eager/CMakeLists.txt                 |   4 +-
 test/cpp/fluid/CMakeLists.txt                 |  10 +-
 test/cpp/fluid/benchmark/CMakeLists.txt       |   2 +-
 test/cpp/fluid/cinn/CMakeLists.txt            |  91 ++---
 test/cpp/fluid/fused/CMakeLists.txt           |  14 +-
 test/cpp/fluid/math/CMakeLists.txt            |  10 +-
 test/cpp/fluid/mkldnn/CMakeLists.txt          |   8 +-
 test/cpp/fluid/pscore/CMakeLists.txt          |  12 +-
 test/cpp/imperative/CMakeLists.txt            |   9 +-
 test/cpp/imperative/test_hooks.cc             |   3 +-
 test/cpp/inference/infer_ut/CMakeLists.txt    |   4 +-
 test/cpp/jit/CMakeLists.txt                   |   1 -
 test/cpp/new_executor/CMakeLists.txt          |   3 +-
 test/cpp/phi/api/CMakeLists.txt               |  22 +-
 test/cpp/phi/api/scale_api.h                  |   3 +-
 test/cpp/phi/common/CMakeLists.txt            |  14 +-
 test/cpp/phi/core/CMakeLists.txt              |  38 +-
 test/cpp/phi/core/test_type_info.cc           |   5 +
 test/cpp/phi/kernels/CMakeLists.txt           |  16 +-
 test/cpp/phi/ops/CMakeLists.txt               |   2 +-
 test/cpp/prim/CMakeLists.txt                  |  12 +-
 tools/parallel_UT_rule.py                     |   1 -
 181 files changed, 1355 insertions(+), 2147 deletions(-)
 create mode 100644 paddle/fluid/framework/type_info.cc
 delete mode 100644 paddle/fluid/platform/dynload/mklml.cc
 delete mode 100644 paddle/fluid/platform/dynload/mklml.h
 delete mode 100644 paddle/phi/api/all.cc
 delete mode 100644 paddle/phi/capi/all.cc
 create mode 100644 paddle/phi/core/storage_properties.cc
 rename paddle/phi/kernels/funcs/{matrix_inverse.cu.cc => matrix_inverse.cu} (100%)
 delete mode 100644 python/paddle/fluid/tests/unittests/test_resnet50_with_cinn.py

diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake
index 8aee8888708..b68ca023704 100644
--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@@ -40,7 +40,6 @@ if(WITH_MKLML)
   add_definitions(-DLAPACK_FOUND)
 
   add_dependencies(cblas mklml)
-  target_link_libraries(cblas dynload_mklml)
 
   message(STATUS "Found cblas and lapack in MKLML "
                  "(include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index ad789a53e83..c5b9e896686 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -235,3 +235,16 @@ endif()
 if(WITH_CUDNN_FRONTEND)
   add_definitions(-DPADDLE_WITH_CUDNN_FRONTEND)
 endif()
+
+# WITH_PHI_SHARED is not user-configurable: it is forced OFF for Windows and
+# ROCm builds and forced ON everywhere else. When it is ON, -DPHI_SHARED is
+# added to the directory-wide compile definitions.
+if(WIN32 OR WITH_ROCM)
+  set(WITH_PHI_SHARED OFF CACHE BOOL "" FORCE)
+else()
+  set(WITH_PHI_SHARED ON CACHE BOOL "" FORCE)
+endif()
+
+if(WITH_PHI_SHARED)
+  add_definitions(-DPHI_SHARED)
+endif()
diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake
index 749de1b46ef..f2fc570c048 100644
--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@@ -122,6 +122,5 @@ get_filename_component(WARPCTC_LIBRARY_PATH ${WARPCTC_LIBRARIES} DIRECTORY)
 include_directories(${WARPCTC_INCLUDE_DIR}
 )# For warpctc code to include its headers.
 
-add_library(warpctc SHARED IMPORTED GLOBAL)
-set_property(TARGET warpctc PROPERTY IMPORTED_LOCATION ${WARPCTC_LIBRARIES})
+add_library(warpctc INTERFACE)
 add_dependencies(warpctc extern_warpctc)
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 09a51306749..947d44950d5 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -364,20 +364,7 @@ function(cc_library TARGET_NAME)
         list(REMOVE_ITEM cc_library_DEPS warpctc)
         add_dependencies(${TARGET_NAME} warpctc)
       endif()
-      # Only deps libmklml.so, not link
-      if("${cc_library_DEPS};" MATCHES "mklml;")
-        list(REMOVE_ITEM cc_library_DEPS mklml)
-        if(NOT "${TARGET_NAME}" MATCHES "dynload_mklml")
-          list(APPEND cc_library_DEPS dynload_mklml)
-        endif()
-        add_dependencies(${TARGET_NAME} mklml)
-        if(WIN32)
-          target_link_libraries(${TARGET_NAME} ${MKLML_IOMP_LIB})
-        else()
-          target_link_libraries(${TARGET_NAME}
-                                "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed")
-        endif()
-      endif()
+
       # remove link to python, see notes at:
       # https://github.com/pybind/pybind11/blob/master/docs/compiling.rst#building-manually
       if("${cc_library_DEPS};" MATCHES "python;")
@@ -457,25 +444,10 @@ function(cc_test_build TARGET_NAME)
       endif()
     endif()
     get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
-    target_link_libraries(
-      ${TARGET_NAME}
-      ${cc_test_DEPS}
-      ${os_dependency_modules}
-      paddle_gtest_main
-      lod_tensor
-      memory
-      gtest
-      gflags
-      glog)
-    add_dependencies(
-      ${TARGET_NAME}
-      ${cc_test_DEPS}
-      paddle_gtest_main
-      lod_tensor
-      memory
-      gtest
-      gflags
-      glog)
+    target_link_libraries(${TARGET_NAME} ${cc_test_DEPS}
+                          ${os_dependency_modules} paddle_gtest_main gtest glog)
+    add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main gtest
+                     glog)
     common_link(${TARGET_NAME})
     if(WITH_ROCM)
       target_link_libraries(${TARGET_NAME} ${ROCM_HIPRTC_LIB})
@@ -670,7 +642,7 @@ function(nv_test TARGET_NAME)
     add_executable(${TARGET_NAME} ${nv_test_SRCS})
     get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
     target_link_libraries(${TARGET_NAME} ${nv_test_DEPS}
-                          ${os_dependency_modules} paddle_gtest_main)
+                          ${os_dependency_modules} paddle_gtest_main phi)
     add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main)
     common_link(${TARGET_NAME})
     add_test(${TARGET_NAME} ${TARGET_NAME})
@@ -774,8 +746,8 @@ function(hip_test TARGET_NAME)
       lod_tensor
       memory
       gtest
-      gflags
       glog
+      phi
       ${os_dependency_modules})
     add_dependencies(
       ${TARGET_NAME}
@@ -784,7 +756,7 @@ function(hip_test TARGET_NAME)
       lod_tensor
       memory
       gtest
-      gflags
+      phi
       glog)
     common_link(${TARGET_NAME})
     add_test(${TARGET_NAME} ${TARGET_NAME})
@@ -881,7 +853,7 @@ function(xpu_test TARGET_NAME)
       lod_tensor
       memory
       gtest
-      gflags
+      phi
       glog
       ${os_dependency_modules})
     add_dependencies(
@@ -891,7 +863,7 @@ function(xpu_test TARGET_NAME)
       lod_tensor
       memory
       gtest
-      gflags
+      phi
       glog)
     common_link(${TARGET_NAME})
     add_test(${TARGET_NAME} ${TARGET_NAME})
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index 8d47dbd5e9b..3731d23b813 100755
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -269,6 +269,13 @@ else()
     SRCS ${src_dir}/inference/api/paddle_*.h ${paddle_inference_lib}
     DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include
          ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib)
+
+  set(paddle_phi_lib ${PADDLE_BINARY_DIR}/paddle/phi/libphi.*)
+  copy(
+    inference_lib_dist
+    SRCS ${paddle_phi_lib}
+    DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib)
+
 endif()
 
 copy(
diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index 4c426d66876..a0f5d2c82ee 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -61,8 +61,7 @@ function(register_cu_kernel TARGET)
                         "${multiValueArgs}" ${ARGN})
 
   set(cu_srcs)
-  set(op_common_deps operator op_registry math_function layer
-                     common_infer_shape_functions)
+  set(op_common_deps operator op_registry layer common_infer_shape_functions)
   foreach(cu_src ${register_cu_kernel_SRCS})
     if(${cu_src} MATCHES ".*\\.cu$")
       list(APPEND cu_srcs ${cu_src})
@@ -113,7 +112,7 @@ function(register_mkldnn_kernel TARGET)
                         "${multiValueArgs}" ${ARGN})
 
   set(mkldnn_cc_srcs)
-  set(op_common_deps operator op_registry math_function layer
+  set(op_common_deps operator op_registry phi layer
                      common_infer_shape_functions)
   foreach(mkldnn_src ${register_mkldnn_kernel_SRCS})
     if(${mkldnn_src} MATCHES ".*_mkldnn_op.cc$")
@@ -164,7 +163,7 @@ function(op_library TARGET)
   set(MIOPEN_FILE)
   set(mkldnn_cc_srcs)
   set(MKLDNN_FILE)
-  set(op_common_deps operator op_registry math_function layer
+  set(op_common_deps operator op_registry phi layer
                      common_infer_shape_functions)
 
   # Option `UNITY` is used to specify that operator `TARGET` will compiles with Unity Build.
diff --git a/cmake/phi.cmake b/cmake/phi.cmake
index f97d5d3f8f7..8a500f93860 100644
--- a/cmake/phi.cmake
+++ b/cmake/phi.cmake
@@ -94,6 +94,13 @@ function(kernel_declare TARGET_LIST)
           continue()
         endif()
       endif()
+      # fusion_group kernels are not supported on Windows and macOS
+      if(WIN32 OR APPLE)
+        string(FIND "${first_registry}" "fusion_group" pos)
+        if(pos GREATER 1)
+          continue()
+        endif()
+      endif()
       # some gpu kernel only can run on cuda, not support rocm, so we add this branch
       if(WITH_ROCM)
         string(FIND "${first_registry}" "cuda_only" pos)
@@ -216,3 +223,27 @@ function(prune_declaration_h)
     endif()
   endforeach()
 endfunction()
+
+function(collect_srcs SRC_GROUP)
+  # Append every SRCS entry, as a path under the current source directory,
+  # to the cache-internal list variable whose name is given by SRC_GROUP.
+  cmake_parse_arguments(prefix "" "" "SRCS" ${ARGN})
+  set(collected ${${SRC_GROUP}})
+  foreach(src ${prefix_SRCS})
+    list(APPEND collected "${CMAKE_CURRENT_SOURCE_DIR}/${src}")
+  endforeach()
+  # Publish once; INTERNAL cache entries are visible from every directory.
+  set(${SRC_GROUP} "${collected}" CACHE INTERNAL "")
+endfunction()
+
+function(collect_generated_srcs SRC_GROUP)
+  # Append every SRCS entry verbatim (no directory prefix is added) to the
+  # cache-internal list variable whose name is given by SRC_GROUP.
+  cmake_parse_arguments(prefix "" "" "SRCS" ${ARGN})
+  set(collected ${${SRC_GROUP}})
+  foreach(src ${prefix_SRCS})
+    list(APPEND collected "${src}")
+  endforeach()
+  # Publish once; INTERNAL cache entries are visible from every directory.
+  set(${SRC_GROUP} "${collected}" CACHE INTERNAL "")
+endfunction()
diff --git a/paddle/fluid/dialect/CMakeLists.txt b/paddle/fluid/dialect/CMakeLists.txt
index 8130b75f637..24c18e24c23 100644
--- a/paddle/fluid/dialect/CMakeLists.txt
+++ b/paddle/fluid/dialect/CMakeLists.txt
@@ -49,5 +49,5 @@ file(GLOB PD_DIALECT_SRCS "*.cc")
 cc_library(
   pd_dialect
   SRCS ${PD_DIALECT_SRCS} ${op_source_file}
-  DEPS new_ir framework_proto dense_tensor phi_utils)
+  DEPS new_ir framework_proto phi phi_utils)
 target_include_directories(pd_dialect PRIVATE ${PD_DIALECT_BINARY_DIR})
diff --git a/paddle/fluid/distributed/auto_parallel/CMakeLists.txt b/paddle/fluid/distributed/auto_parallel/CMakeLists.txt
index 9d9cb97d855..a0806fa1a64 100644
--- a/paddle/fluid/distributed/auto_parallel/CMakeLists.txt
+++ b/paddle/fluid/distributed/auto_parallel/CMakeLists.txt
@@ -1,7 +1,6 @@
 cc_library(
   op_dist_attr
   SRCS dist_attr.cc
-  DEPS dist_attr process_mesh dist_mapper auto_parallel_proto proto_desc
-       phi_enforce)
+  DEPS phi auto_parallel_proto proto_desc)
 
 add_subdirectory(test)
diff --git a/paddle/fluid/distributed/auto_parallel/test/CMakeLists.txt b/paddle/fluid/distributed/auto_parallel/test/CMakeLists.txt
index fcc000e596b..15c0ed63052 100644
--- a/paddle/fluid/distributed/auto_parallel/test/CMakeLists.txt
+++ b/paddle/fluid/distributed/auto_parallel/test/CMakeLists.txt
@@ -1,19 +1,19 @@
 cc_test(
   device_mesh_test
   SRCS device_mesh_test.cc
-  DEPS device_mesh)
+  DEPS phi)
 
 cc_test(
   process_mesh_test
   SRCS process_mesh_test.cc
-  DEPS process_mesh)
+  DEPS phi)
 
 cc_test(
   dist_attr_test
   SRCS dist_attr_test.cc
-  DEPS dist_attr proto_desc)
+  DEPS phi proto_desc)
 
 cc_test(
   dist_mapper_test
   SRCS dist_mapper_test.cc
-  DEPS dist_mapper)
+  DEPS phi)
diff --git a/paddle/fluid/distributed/collective/CMakeLists.txt b/paddle/fluid/distributed/collective/CMakeLists.txt
index ef626ea2985..215f55f2d18 100644
--- a/paddle/fluid/distributed/collective/CMakeLists.txt
+++ b/paddle/fluid/distributed/collective/CMakeLists.txt
@@ -1,18 +1,18 @@
 cc_library(
   process_group
   SRCS process_group.cc
-  DEPS dense_tensor xxhash)
+  DEPS phi xxhash)
 
 cc_library(
   eager_reducer
   SRCS reducer.cc
-  DEPS eager_api process_group phi_api string_helper)
+  DEPS eager_api process_group phi string_helper)
 
 if(WITH_DISTRIBUTE)
   cc_library(
     process_group_gloo
     SRCS process_group_gloo.cc gloo_send_recv.cc
-    DEPS phi_api eager_api gloo_wrapper tcp_store)
+    DEPS phi eager_api gloo_wrapper)
 endif()
 
 if(WITH_NCCL OR WITH_RCCL)
@@ -20,28 +20,19 @@ if(WITH_NCCL OR WITH_RCCL)
     process_group_nccl
     SRCS process_group_nccl.cc nccl_tools.cc common.cc
     DEPS process_group
-         tcp_store
+         phi
          place
          enforce
          collective_helper
          device_context
-         ${DEVICE_EVENT_LIBS}
-         dense_tensor
-         comm_static_check
-         nccl_dynamic_check)
+         ${DEVICE_EVENT_LIBS})
 endif()
 
 if(WITH_XPU_BKCL)
   cc_library(
     process_group_bkcl
     SRCS process_group_bkcl.cc bkcl_tools.cc common.cc
-    DEPS process_group
-         tcp_store
-         place
-         enforce
-         collective_helper
-         device_context
-         dense_tensor)
+    DEPS process_group phi place enforce collective_helper device_context)
 endif()
 
 if(WITH_MPI)
@@ -55,15 +46,7 @@ if(WITH_CUSTOM_DEVICE)
   cc_library(
     process_group_custom
     SRCS process_group_custom.cc custom_ccl_tools.cc common.cc
-    DEPS process_group
-         tcp_store
-         phi_backends
-         place
-         enforce
-         collective_helper
-         device_context
-         comm_static_check
-         dense_tensor)
+    DEPS process_group phi place enforce collective_helper device_context)
 endif()
 
 set(COMM_UTILS_DEPS process_group)
diff --git a/paddle/fluid/distributed/fleet_executor/CMakeLists.txt b/paddle/fluid/distributed/fleet_executor/CMakeLists.txt
index 27733979c32..70153873ced 100755
--- a/paddle/fluid/distributed/fleet_executor/CMakeLists.txt
+++ b/paddle/fluid/distributed/fleet_executor/CMakeLists.txt
@@ -5,7 +5,7 @@ endif()
 proto_library(interceptor_message_proto SRCS interceptor_message.proto)
 
 if(WITH_ARM_BRPC)
-  set(BRPC_DEPS arm_brpc snappy gflags glog)
+  set(BRPC_DEPS arm_brpc snappy phi glog)
 elseif(WITH_DISTRIBUTE AND NOT WITH_PSLIB)
   set(BRPC_DEPS
       brpc
@@ -15,7 +15,7 @@ elseif(WITH_DISTRIBUTE AND NOT WITH_PSLIB)
       zlib
       leveldb
       snappy
-      gflags
+      phi
       glog)
 else()
   set(BRPC_DEPS "")
@@ -51,7 +51,7 @@ cc_library(
        collective_helper
        op_registry
        executor_gc_helper
-       gflags
+       phi
        glog
        ${BRPC_DEPS})
 
diff --git a/paddle/fluid/distributed/ps/service/CMakeLists.txt b/paddle/fluid/distributed/ps/service/CMakeLists.txt
index 0c5e460fcbd..8510273e13f 100755
--- a/paddle/fluid/distributed/ps/service/CMakeLists.txt
+++ b/paddle/fluid/distributed/ps/service/CMakeLists.txt
@@ -8,12 +8,11 @@ if(WITH_HETERPS)
       ssl
       crypto
       protobuf
-      gflags
+      phi
       glog
       zlib
       leveldb
       snappy
-      gflags
       glog
       device_context
       rocksdb)
@@ -25,12 +24,11 @@ else()
       ssl
       crypto
       protobuf
-      gflags
+      phi
       glog
       zlib
       leveldb
       snappy
-      gflags
       glog
       device_context)
 
@@ -122,8 +120,7 @@ cc_library(
        simple_threadpool
        simple_rpc
        scope
-       math_function
-       selected_rows_functor
+       phi
        ps_gpu_wrapper
        ${RPC_DEPS})
 
@@ -150,7 +147,7 @@ cc_library(
 #cc_library(
 #  communicator
 #  SRCS communicator/communicator.cc
-#  DEPS scope client table math_function selected_rows_functor ${RPC_DEPS})
+#  DEPS scope client table phi ${RPC_DEPS})
 #cc_library(
 #  ps_service
 #  SRCS ps_service/service.cc
diff --git a/paddle/fluid/distributed/ps/table/CMakeLists.txt b/paddle/fluid/distributed/ps/table/CMakeLists.txt
index 2a5c4ad25d1..507ce1dcef7 100644
--- a/paddle/fluid/distributed/ps/table/CMakeLists.txt
+++ b/paddle/fluid/distributed/ps/table/CMakeLists.txt
@@ -48,7 +48,7 @@ cc_library(
        string_helper
        simple_threadpool
        xxhash
-       generator)
+       phi)
 
 set_source_files_properties(
   tensor_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
@@ -91,7 +91,7 @@ cc_library(
        ps_framework_proto
        string_helper
        device_context
-       gflags
+       phi
        glog
        fs
        afs_wrapper
diff --git a/paddle/fluid/distributed/rpc/CMakeLists.txt b/paddle/fluid/distributed/rpc/CMakeLists.txt
index ccac6022110..f4fb06c0d84 100644
--- a/paddle/fluid/distributed/rpc/CMakeLists.txt
+++ b/paddle/fluid/distributed/rpc/CMakeLists.txt
@@ -20,7 +20,7 @@ set(PADDLE_RPC_DEPS
     zlib
     leveldb
     snappy
-    gflags
+    phi
     glog
     pybind)
 proto_library(paddle_rpc_proto SRCS rpc.proto)
diff --git a/paddle/fluid/distributed/test/CMakeLists.txt b/paddle/fluid/distributed/test/CMakeLists.txt
index 30f14923e05..a7ce9615a45 100644
--- a/paddle/fluid/distributed/test/CMakeLists.txt
+++ b/paddle/fluid/distributed/test/CMakeLists.txt
@@ -73,7 +73,7 @@ cc_test_old(
   DEPS
   brpc_utils
   scope
-  math_function
+  phi
   ${COMMON_DEPS}
   ${RPC_DEPS})
 
diff --git a/paddle/fluid/eager/CMakeLists.txt b/paddle/fluid/eager/CMakeLists.txt
index a0ff3300ffa..aa9e7c7d2eb 100755
--- a/paddle/fluid/eager/CMakeLists.txt
+++ b/paddle/fluid/eager/CMakeLists.txt
@@ -1,12 +1,10 @@
 set(eager_deps
-    phi_api
-    phi_dygraph_api
+    phi
     hook_utils
     tensor_utils
     utils
     global_utils
     backward
-    phi_tensor
     tracer
     layer
     autograd_meta
@@ -48,27 +46,26 @@ if(NOT ((NOT WITH_PYTHON) AND ON_INFER))
   cc_library(
     backward
     SRCS backward.cc
-    DEPS grad_tensor_holder utils autograd_meta grad_node_info switch_autotune)
+    DEPS grad_tensor_holder utils autograd_meta grad_node_info phi)
 endif()
 
 cc_library(
   eager_nan_inf_utils
   SRCS nan_inf_utils.cc
-  DEPS phi_tensor nan_inf_utils enforce)
+  DEPS phi nan_inf_utils enforce)
 cc_library(
   grad_node_info
   SRCS grad_node_info.cc
-  DEPS phi_api phi_tensor)
+  DEPS phi)
 
 cc_library(
   autograd_meta
   SRCS autograd_meta.cc
-  DEPS phi_api phi_tensor)
+  DEPS phi)
 cc_library(
   utils
   SRCS utils.cc
-  DEPS phi_api
-       phi_tensor
+  DEPS phi
        global_utils
        layer
        proto_desc
diff --git a/paddle/fluid/eager/accumulation/CMakeLists.txt b/paddle/fluid/eager/accumulation/CMakeLists.txt
index a924a9b106d..af37915bfc1 100755
--- a/paddle/fluid/eager/accumulation/CMakeLists.txt
+++ b/paddle/fluid/eager/accumulation/CMakeLists.txt
@@ -2,5 +2,5 @@ if(NOT (NOT WITH_PYTHON AND ON_INFER))
   cc_library(
     accumulation_node
     SRCS accumulation_node.cc
-    DEPS gradient_accumulator phi_api grad_node_info)
+    DEPS gradient_accumulator phi grad_node_info)
 endif()
diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt b/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt
index 9f2b99d38d4..8537729da97 100644
--- a/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt
+++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt
@@ -1,7 +1,7 @@
 cc_library(
   scale_node
   SRCS scale_node.cc
-  DEPS global_utils phi phi_api grad_node_info)
+  DEPS global_utils phi grad_node_info)
 
 if(NOT (NOT WITH_PYTHON AND ON_INFER))
   cc_library(
diff --git a/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt b/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt
index 4c0625b4b46..5cda6ba553a 100644
--- a/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt
+++ b/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt
@@ -1,7 +1,7 @@
 cc_library(
   eager_scale
   SRCS scale.cc
-  DEPS phi_api phi autograd_meta scale_node)
+  DEPS phi autograd_meta scale_node)
 
 if(NOT (NOT WITH_PYTHON AND ON_INFER))
   cc_library(
diff --git a/paddle/fluid/eager/api/utils/CMakeLists.txt b/paddle/fluid/eager/api/utils/CMakeLists.txt
index dbb59e5aae7..94c77b73922 100755
--- a/paddle/fluid/eager/api/utils/CMakeLists.txt
+++ b/paddle/fluid/eager/api/utils/CMakeLists.txt
@@ -7,7 +7,7 @@ if(NOT (NOT WITH_PYTHON AND ON_INFER))
   cc_library(
     tensor_utils
     SRCS tensor_utils.cc
-    DEPS phi_api autograd_meta grad_node_info accumulation_node)
+    DEPS phi autograd_meta grad_node_info accumulation_node)
   cc_library(
     hook_utils
     SRCS hook_utils.cc
@@ -16,7 +16,7 @@ else()
   cc_library(
     tensor_utils
     SRCS tensor_utils.cc
-    DEPS phi_api autograd_meta grad_node_info)
+    DEPS phi autograd_meta grad_node_info)
   cc_library(
     hook_utils
     SRCS hook_utils.cc
diff --git a/paddle/fluid/eager/auto_code_generator/CMakeLists.txt b/paddle/fluid/eager/auto_code_generator/CMakeLists.txt
index 003bf273e3c..d187b1abb11 100644
--- a/paddle/fluid/eager/auto_code_generator/CMakeLists.txt
+++ b/paddle/fluid/eager/auto_code_generator/CMakeLists.txt
@@ -52,6 +52,15 @@ if(WIN32)
     set(eager_generator_path "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}")
   endif()
 
+  if(WITH_PHI_SHARED) # stage phi.dll at the exact path declared in OUTPUT
+    message("Copied phi.dll for Eager AutoCodeGen")
+    add_custom_command(
+      OUTPUT ${eager_generator_path}/phi.dll
+      COMMAND ${CMAKE_COMMAND} -E copy ${PHI_LIB} ${eager_generator_path}/phi.dll
+      DEPENDS phi VERBATIM)
+    list(APPEND EAGER_CODEGEN_DEPS ${eager_generator_path}/phi.dll)
+  endif()
+
   if(${CBLAS_PROVIDER} STREQUAL MKLML)
     message("Copied libiomp5md.dll for Eager AutoCodeGen")
     add_custom_command(
diff --git a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py
index 532eabdef43..709372dd98e 100644
--- a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py
+++ b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py
@@ -392,7 +392,7 @@ FORWARD_CC_FILE_TEMPLATE = """
 #include "paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h"
 #include "paddle/phi/core/flags.h"
 
-DECLARE_bool(check_nan_inf);
+PHI_DECLARE_bool(check_nan_inf);
 PHI_DECLARE_string(tensor_operants_mode);
 {}
 {}
diff --git a/paddle/fluid/eager/custom_operator/CMakeLists.txt b/paddle/fluid/eager/custom_operator/CMakeLists.txt
index 424194557dd..ea8c2a89f35 100644
--- a/paddle/fluid/eager/custom_operator/CMakeLists.txt
+++ b/paddle/fluid/eager/custom_operator/CMakeLists.txt
@@ -1,4 +1,4 @@
 cc_library(
   custom_operator_node
   SRCS custom_operator_node.cc
-  DEPS phi_tensor phi_api grad_node_info custom_operator op_meta_info)
+  DEPS phi grad_node_info custom_operator)
diff --git a/paddle/fluid/eager/pylayer/CMakeLists.txt b/paddle/fluid/eager/pylayer/CMakeLists.txt
index 4b0ad071117..fe7a57fe795 100644
--- a/paddle/fluid/eager/pylayer/CMakeLists.txt
+++ b/paddle/fluid/eager/pylayer/CMakeLists.txt
@@ -1,4 +1,4 @@
 cc_library(
   py_layer_node
   SRCS py_layer_node.cc
-  DEPS pybind phi_api grad_node_info)
+  DEPS pybind phi grad_node_info)
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index d4451c7c491..ff74b96534e 100755
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -115,7 +115,7 @@ proto_library(trainer_desc_proto SRCS trainer_desc.proto DEPS framework_proto
 cc_library(
   string_array
   SRCS string_array.cc
-  DEPS utf8proc phi_enforce)
+  DEPS utf8proc phi)
 
 cc_library(
   data_type
@@ -130,7 +130,7 @@ cc_test(
 cc_library(
   tensor
   SRCS tensor_util.cc
-  DEPS place memory data_type device_context dense_tensor)
+  DEPS place memory data_type device_context phi)
 
 cc_test(
   tensor_test
@@ -166,12 +166,12 @@ cc_test(
 cc_library(
   lod_tensor
   SRCS lod_tensor.cc
-  DEPS ddim mixed_vector place tensor framework_proto version)
+  DEPS phi place tensor framework_proto version)
 
 cc_test(
   lod_tensor_test
   SRCS lod_tensor_test.cc
-  DEPS lod_utils lod_tensor memory)
+  DEPS phi lod_tensor memory)
 
 if(WITH_GPU)
   nv_test(
@@ -188,12 +188,12 @@ endif()
 cc_library(
   garbage_collector
   SRCS garbage_collector.cc
-  DEPS device_context memory gflags glog)
+  DEPS device_context memory phi glog)
 
 cc_library(
   reader
   SRCS reader.cc
-  DEPS lod_tensor ddim)
+  DEPS lod_tensor phi)
 cc_test(
   reader_test
   SRCS reader_test.cc
@@ -202,13 +202,12 @@ cc_test(
 cc_test(
   threadpool_test
   SRCS threadpool_test.cc
-  DEPS threadpool)
+  DEPS phi)
 
 cc_library(
   var_type_traits
   SRCS var_type_traits.cc
-  DEPS framework_proto scope tensor_array sparse_coo_tensor sparse_csr_tensor
-       extended_tensor)
+  DEPS framework_proto scope phi)
 if(WITH_GPU)
   target_link_libraries(var_type_traits dynload_cuda)
 endif()
@@ -242,7 +241,7 @@ endif()
 cc_library(
   scope
   SRCS scope.cc
-  DEPS glog threadpool xxhash var_type_traits)
+  DEPS glog phi xxhash var_type_traits)
 cc_library(
   device_worker
   SRCS device_worker.cc
@@ -273,12 +272,12 @@ if(WITH_GPU)
   nv_test(
     data_device_transform_test
     SRCS data_device_transform_test.cu
-    DEPS operator op_registry device_context math_function scope)
+    DEPS operator op_registry device_context phi scope)
 elseif(WITH_ROCM)
   hip_test(
     data_device_transform_test
     SRCS data_device_transform_test.cu
-    DEPS operator op_registry device_context math_function scope)
+    DEPS operator op_registry device_context phi scope)
 endif()
 
 if(WITH_GPU)
@@ -333,7 +332,7 @@ endif()
 cc_library(
   data_layout_transform
   SRCS data_layout_transform.cc
-  DEPS tensor math_function phi_data_layout_transform)
+  DEPS tensor phi)
 cc_test(
   data_layout_transform_test
   SRCS data_layout_transform_test.cc
@@ -342,14 +341,13 @@ cc_test(
 cc_library(
   data_transform
   SRCS data_transform.cc
-  DEPS math_function
+  DEPS phi
        tensor
        framework_proto
        selected_rows_utils
        data_device_transform
        data_type_transform
-       data_layout_transform
-       phi_data_transform)
+       data_layout_transform)
 
 cc_library(
   attribute
@@ -400,7 +398,7 @@ cc_library(
 cc_library(
   shape_inference
   SRCS shape_inference.cc
-  DEPS ddim attribute selected_rows_utils)
+  DEPS phi attribute selected_rows_utils)
 
 # every source file that includes "dnnl.h" must depends on mkldnn
 # or, the first one should depends on mkldnn
@@ -433,30 +431,17 @@ if(WITH_XPU)
     phi_utils
     SRCS phi_utils.cc
     DEPS lod_tensor
-         dense_tensor
          selected_rows_utils
-         int_array
-         scalar
          place
          phi
          var_type_traits
          op_info
-         xpu_op_list
-         convert_utils)
+         xpu_op_list)
 else()
   cc_library(
     phi_utils
     SRCS phi_utils.cc
-    DEPS lod_tensor
-         dense_tensor
-         selected_rows_utils
-         int_array
-         scalar
-         place
-         phi
-         var_type_traits
-         op_info
-         convert_utils)
+    DEPS lod_tensor selected_rows_utils place phi var_type_traits op_info)
 endif()
 
 if(WITH_XPU)
@@ -482,11 +467,10 @@ if(WITH_XPU)
          unused_var_check
          nan_inf_utils
          phi_utils
-         kernel_factory
          infershape_utils
-         op_utils
+         phi
          op_compat_infos
-         get_kerneltype_forvar_utils)
+         type_info)
 else()
   cc_library(
     operator
@@ -509,11 +493,10 @@ else()
          unused_var_check
          nan_inf_utils
          phi_utils
-         kernel_factory
          infershape_utils
-         op_utils
+         phi
          op_compat_infos
-         get_kerneltype_forvar_utils)
+         type_info)
 endif()
 
 cc_test(
@@ -543,7 +526,7 @@ cc_library(
        version
        xxhash
        op_dist_attr
-       scalar
+       phi
        op_version_proto
        op_version_registry)
 
@@ -853,7 +836,7 @@ if(WITH_DISTRIBUTE)
            heter_server
            brpc
            fleet_executor
-           flags)
+           phi)
     set(DISTRIBUTE_COMPILE_FLAGS "")
     if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
       set(DISTRIBUTE_COMPILE_FLAGS "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new")
@@ -1071,7 +1054,7 @@ if(WITH_PSCORE)
            executor
            heter_server
            gloo_wrapper
-           eigen_function
+           phi
            ${RPC_DEPS}
            graph_gpu_wrapper)
   else()
@@ -1088,7 +1071,7 @@ if(WITH_PSCORE)
            executor
            heter_server
            gloo_wrapper
-           eigen_function
+           phi
            ${RPC_DEPS})
   endif()
 else()
@@ -1112,7 +1095,7 @@ cc_test(
 cc_library(
   selected_rows_utils
   SRCS selected_rows_utils.cc
-  DEPS selected_rows device_context)
+  DEPS phi device_context)
 cc_test(
   selected_rows_utils_test
   SRCS selected_rows_utils_test.cc
@@ -1162,12 +1145,11 @@ cc_library(
        phi
        phi_utils
        op_info
-       shape_inference
-       sparse_coo_tensor)
+       shape_inference)
 cc_test(
   infershape_utils_test
   SRCS infershape_utils_test.cc
-  DEPS infershape_utils infermeta_utils meta_tensor)
+  DEPS infershape_utils phi)
 
 # Get the current working branch
 execute_process(
@@ -1198,12 +1180,15 @@ cc_library(
        operator
        dynamic_loader
        string_helper
-       phi_tensor
-       op_meta_info
-       phi_api
-       tensor_api
-       phi_tensor_operants
-       operants_manager)
+       phi
+       imperative_flag
+       layer)
+
+cc_library(type_info SRCS type_info.cc) # no link DEPS; build order set below
+add_dependencies(type_info framework_proto auto_parallel_proto xxhash)
+if(WITH_MKLDNN) # same ordering-only edge for mkldnn
+  add_dependencies(type_info mkldnn)
+endif()
 
 set(FLUID_FRAMEWORK_MODULES
     proto_desc
diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index 99ebd6a370b..b660cbcef2b 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -10,15 +10,15 @@ cc_library(
 cc_library(
   scale_loss_grad_op_handle
   SRCS scale_loss_grad_op_handle.cc
-  DEPS op_handle_base scope lod_tensor ddim memory)
+  DEPS op_handle_base scope lod_tensor phi memory)
 cc_library(
   fetch_op_handle
   SRCS fetch_op_handle.cc
-  DEPS op_handle_base scope lod_tensor ddim memory)
+  DEPS op_handle_base scope lod_tensor phi memory)
 cc_library(
   fetch_async_op_handle
   SRCS fetch_async_op_handle.cc
-  DEPS op_handle_base scope lod_tensor ddim memory)
+  DEPS op_handle_base scope lod_tensor phi memory)
 
 cc_library(
   share_tensor_buffer_functor
@@ -78,7 +78,7 @@ if(WITH_GPU)
     DEPS op_handle_base
          scope
          lod_tensor
-         ddim
+         phi
          memory
          dynload_cuda
          variable_visitor)
@@ -88,7 +88,7 @@ if(WITH_GPU)
     DEPS op_handle_base
          scope
          lod_tensor
-         ddim
+         phi
          memory
          dynload_cuda
          variable_visitor
@@ -99,7 +99,7 @@ if(WITH_GPU)
     DEPS op_handle_base
          scope
          lod_tensor
-         ddim
+         phi
          memory
          dynload_cuda
          variable_visitor
@@ -114,7 +114,7 @@ if(WITH_GPU)
       DEPS op_handle_base
            scope
            lod_tensor
-           ddim
+           phi
            memory
            dynload_cuda
            variable_visitor
@@ -126,19 +126,17 @@ if(WITH_GPU)
     nv_library(
       reduce_op_handle
       SRCS reduce_op_handle.cc
-      DEPS op_handle_base variable_visitor scope ddim dynload_cuda
-           selected_rows_functor)
+      DEPS op_handle_base variable_visitor scope phi dynload_cuda)
   else()
     nv_library(
       reduce_op_handle
       SRCS reduce_op_handle.cc
-      DEPS op_handle_base variable_visitor scope ddim dynload_cuda
-           selected_rows_functor)
+      DEPS op_handle_base variable_visitor scope phi dynload_cuda)
   endif()
   nv_library(
     broadcast_op_handle
     SRCS broadcast_op_handle.cc
-    DEPS op_handle_base scope ddim memory variable_visitor dynload_cuda)
+    DEPS op_handle_base scope phi memory variable_visitor dynload_cuda)
   nv_library(
     fused_broadcast_op_handle
     SRCS fused_broadcast_op_handle.cc
@@ -154,7 +152,7 @@ elseif(WITH_ROCM)
     DEPS op_handle_base
          scope
          lod_tensor
-         ddim
+         phi
          memory
          dynload_cuda
          variable_visitor)
@@ -164,7 +162,7 @@ elseif(WITH_ROCM)
     DEPS op_handle_base
          scope
          lod_tensor
-         ddim
+         phi
          memory
          dynload_cuda
          variable_visitor
@@ -175,7 +173,7 @@ elseif(WITH_ROCM)
     DEPS op_handle_base
          scope
          lod_tensor
-         ddim
+         phi
          memory
          dynload_cuda
          variable_visitor
@@ -187,19 +185,17 @@ elseif(WITH_ROCM)
     hip_library(
       reduce_op_handle
       SRCS reduce_op_handle.cc
-      DEPS op_handle_base variable_visitor scope ddim dynload_cuda
-           selected_rows_functor)
+      DEPS op_handle_base variable_visitor scope phi dynload_cuda)
   else()
     hip_library(
       reduce_op_handle
       SRCS reduce_op_handle.cc
-      DEPS op_handle_base variable_visitor scope ddim dynload_cuda
-           selected_rows_functor)
+      DEPS op_handle_base variable_visitor scope phi dynload_cuda)
   endif()
   hip_library(
     broadcast_op_handle
     SRCS broadcast_op_handle.cc
-    DEPS op_handle_base scope ddim memory variable_visitor dynload_cuda)
+    DEPS op_handle_base scope phi memory variable_visitor dynload_cuda)
   hip_library(
     fused_broadcast_op_handle
     SRCS fused_broadcast_op_handle.cc
@@ -212,14 +208,14 @@ else()
   cc_library(
     all_reduce_op_handle
     SRCS all_reduce_op_handle.cc
-    DEPS op_handle_base scope lod_tensor ddim memory variable_visitor)
+    DEPS op_handle_base scope lod_tensor phi memory variable_visitor)
   cc_library(
     fused_all_reduce_op_handle
     SRCS fused_all_reduce_op_handle.cc
     DEPS op_handle_base
          scope
          lod_tensor
-         ddim
+         phi
          memory
          variable_visitor
          place)
@@ -229,7 +225,7 @@ else()
     DEPS op_handle_base
          scope
          lod_tensor
-         ddim
+         phi
          memory
          variable_visitor
          place
@@ -239,17 +235,17 @@ else()
     cc_library(
       reduce_op_handle
       SRCS reduce_op_handle.cc
-      DEPS op_handle_base variable_visitor scope ddim selected_rows_functor)
+      DEPS op_handle_base variable_visitor scope phi)
   else()
     cc_library(
       reduce_op_handle
       SRCS reduce_op_handle.cc
-      DEPS op_handle_base variable_visitor scope ddim selected_rows_functor)
+      DEPS op_handle_base variable_visitor scope phi)
   endif()
   cc_library(
     broadcast_op_handle
     SRCS broadcast_op_handle.cc
-    DEPS op_handle_base scope ddim memory variable_visitor)
+    DEPS op_handle_base scope phi memory variable_visitor)
   cc_library(
     fused_broadcast_op_handle
     SRCS fused_broadcast_op_handle.cc
@@ -259,7 +255,7 @@ endif()
 cc_library(
   gather_op_handle
   SRCS gather_op_handle.cc
-  DEPS op_handle_base scope ddim memory variable_visitor)
+  DEPS op_handle_base scope phi memory variable_visitor)
 
 cc_library(
   eager_deletion_op_handle
@@ -305,7 +301,7 @@ cc_test(
   DEPS var_handle
        op_handle_base
        scope
-       ddim
+       phi
        memory
        device_context
        broadcast_op_handle)
@@ -317,7 +313,7 @@ cc_test_old(
   var_handle
   op_handle_base
   scope
-  ddim
+  phi
   memory
   device_context
   gather_op_handle)
@@ -330,12 +326,12 @@ cc_library(
   scope_buffered_ssa_graph_executor
   SRCS scope_buffered_ssa_graph_executor.cc
   DEPS ssa_graph_executor scope_buffered_monitor)
-#cc_test(reduce_op_handle_test SRCS reduce_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
+#cc_test(reduce_op_handle_test SRCS reduce_op_handle_test.cc DEPS var_handle op_handle_base scope phi memory
 #        device_context reduce_op_handle )
 cc_library(
   bind_threaded_ssa_graph_executor
   SRCS bind_threaded_ssa_graph_executor.cc
-  DEPS fetch_op_handle gflags ssa_graph_executor scope simple_threadpool
+  DEPS fetch_op_handle phi ssa_graph_executor scope simple_threadpool
        device_context)
 cc_library(
   fast_threaded_ssa_graph_executor
diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc
index b0349966bb5..69f7a49ce55 100644
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -20,9 +20,10 @@ limitations under the License. */
 #include "paddle/fluid/framework/details/reduce_op_handle.h"
 #include "paddle/fluid/framework/ir/graph_printer.h"
 #include "paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h"
+#include "paddle/phi/core/flags.h"
 
 DECLARE_bool(convert_all_blocks);
-DECLARE_bool(use_mkldnn);
+PHI_DECLARE_bool(use_mkldnn);
 #ifdef PADDLE_WITH_CINN
 DECLARE_bool(use_cinn);
 #endif
diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt
index 7f22793fc0c..a18607595e1 100755
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -32,7 +32,7 @@ cc_library(
 cc_library(
   cost_model
   SRCS cost_model.cc
-  DEPS executor graph profiler proto_desc phi_device_tracer)
+  DEPS executor graph profiler proto_desc phi)
 
 set(GRAPH_PATTERN_DETECTOR_DEPS graph graph_helper graph_traits)
 if(WITH_TESTING)
@@ -458,9 +458,6 @@ if(WITH_MKLDNN)
       graph_to_program_pass
       conv_op
       conv_transpose_op
-      math_function
-      im2col
-      vol2col
       batch_norm_op
       generated_op
       activation_op
@@ -468,7 +465,7 @@ if(WITH_MKLDNN)
       concat_and_split
       naive_executor
       device_context
-      eigen_function)
+      phi)
   if(WITH_GPU OR WITH_ROCM)
     set(TEST_CONV_BN_PASS_DEPS ${TEST_CONV_BN_PASS_DEPS} depthwise_conv)
   endif()
diff --git a/paddle/fluid/framework/ir/fuse_adamw_op_pass.cc b/paddle/fluid/framework/ir/fuse_adamw_op_pass.cc
index c26032fadc2..edceedd546b 100644
--- a/paddle/fluid/framework/ir/fuse_adamw_op_pass.cc
+++ b/paddle/fluid/framework/ir/fuse_adamw_op_pass.cc
@@ -221,7 +221,7 @@ bool InitAndCheckAttrs(const size_t &found_adamw_count,
     }
   }
 
-  // Check whether with_decay and multi_precision are matched。
+  // Check whether with_decay and multi_precision are matched
   if (config->with_decay !=
           PADDLE_GET_CONST(bool, adamw_op_desc->GetAttr("with_decay")) ||
       config->multi_precision !=
diff --git a/paddle/fluid/framework/ir/fusion_group/CMakeLists.txt b/paddle/fluid/framework/ir/fusion_group/CMakeLists.txt
index 5e05108b666..2357247b37d 100644
--- a/paddle/fluid/framework/ir/fusion_group/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/fusion_group/CMakeLists.txt
@@ -6,13 +6,13 @@ if(WITH_GPU OR WITH_ROCM)
   cc_test(
     test_code_generator
     SRCS code_generator_tester.cc
-    DEPS code_generator phi_backends lod_tensor graph_viz_pass)
+    DEPS code_generator phi lod_tensor graph_viz_pass)
 endif()
 
 cc_library(
   fusion_group_pass
   SRCS fusion_group_pass.cc elementwise_group_detector.cc
-  DEPS subgraph_detector fuse_pass_base code_generator phi_backends)
+  DEPS subgraph_detector fuse_pass_base code_generator phi)
 cc_test(
   test_fusion_group_pass
   SRCS fusion_group_pass_tester.cc
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt b/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt
index 1723e881cd5..ffb1606b95c 100644
--- a/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt
@@ -76,5 +76,4 @@ cc_library(
 cc_test(
   test_reference_count_pass_last_lived_ops
   SRCS test_reference_count_pass_last_lived_ops.cc
-  DEPS parallel_executor elementwise_mul_op elementwise_add_op generated_op
-       eigen_function)
+  DEPS parallel_executor elementwise_mul_op elementwise_add_op generated_op phi)
diff --git a/paddle/fluid/framework/new_executor/CMakeLists.txt b/paddle/fluid/framework/new_executor/CMakeLists.txt
index 33311fef61a..894275697f7 100644
--- a/paddle/fluid/framework/new_executor/CMakeLists.txt
+++ b/paddle/fluid/framework/new_executor/CMakeLists.txt
@@ -16,4 +16,4 @@ cc_library(
 cc_library(
   staticgraph_executor_statistics
   SRCS executor_statistics.cc
-  DEPS enforce glog phi_os_info)
+  DEPS enforce glog phi)
diff --git a/paddle/fluid/framework/new_executor/interpreter/CMakeLists.txt b/paddle/fluid/framework/new_executor/interpreter/CMakeLists.txt
index 3885c29c6a9..55ab3c68c0f 100644
--- a/paddle/fluid/framework/new_executor/interpreter/CMakeLists.txt
+++ b/paddle/fluid/framework/new_executor/interpreter/CMakeLists.txt
@@ -6,7 +6,6 @@ set(INTERPRETER_DEPS
     device_context
     global_utils
     op_registry
-    phi_tensor_utils
     scope
     framework_proto
     data_feed_proto
@@ -31,7 +30,7 @@ set(INTERPRETER_DEPS
     enforce
     scope
     glog
-    comm_context_manager
+    phi
     ${DEVICE_EVENT_LIBS}
     glog)
 
diff --git a/paddle/fluid/framework/new_executor/workqueue/CMakeLists.txt b/paddle/fluid/framework/new_executor/workqueue/CMakeLists.txt
index e1826df133c..b0ab1826fb4 100644
--- a/paddle/fluid/framework/new_executor/workqueue/CMakeLists.txt
+++ b/paddle/fluid/framework/new_executor/workqueue/CMakeLists.txt
@@ -5,7 +5,7 @@ cc_library(
 cc_library(
   workqueue
   SRCS workqueue.cc
-  DEPS workqueue_utils enforce glog phi_os_info)
+  DEPS workqueue_utils enforce glog phi)
 cc_test(
   workqueue_test
   SRCS workqueue_test.cc
diff --git a/paddle/fluid/framework/paddle2cinn/CMakeLists.txt b/paddle/fluid/framework/paddle2cinn/CMakeLists.txt
index f6a18330407..a415c7d5832 100644
--- a/paddle/fluid/framework/paddle2cinn/CMakeLists.txt
+++ b/paddle/fluid/framework/paddle2cinn/CMakeLists.txt
@@ -5,7 +5,7 @@ pass_library(
   cinn_subgraph_detector
   subgraph_detector
   cinn_compiler
-  errors
+  phi
   enforce)
 
 pass_library(cinn_zero_tensor_trick_pass base)
@@ -17,7 +17,7 @@ cc_library(
 cc_library(
   transform_type
   SRCS transform_type.cc
-  DEPS errors enforce cinn)
+  DEPS phi enforce cinn)
 cc_library(
   cinn_cache_key
   SRCS cinn_cache_key.cc
diff --git a/paddle/fluid/framework/raw_tensor.h b/paddle/fluid/framework/raw_tensor.h
index 60ccd6a5bae..d5130e21de2 100644
--- a/paddle/fluid/framework/raw_tensor.h
+++ b/paddle/fluid/framework/raw_tensor.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <unordered_map>
 
+#include "paddle/phi/core/enforce.h"
 #include "paddle/phi/core/extended_tensor.h"
 #include "paddle/utils/any.h"
 
@@ -52,7 +53,7 @@ class RawTensor : public phi::ExtendedTensor,
   T& Get() const {
     PADDLE_ENFORCE_EQ(data_.empty(),
                       false,
-                      platform::errors::PreconditionNotMet(
+                      phi::errors::PreconditionNotMet(
                           "The data in RawTensor is empty. Please set data "
                           "before using it."));
 
diff --git a/paddle/fluid/framework/type_info.cc b/paddle/fluid/framework/type_info.cc
new file mode 100644
index 00000000000..b24e7fa53a3
--- /dev/null
+++ b/paddle/fluid/framework/type_info.cc
@@ -0,0 +1,54 @@
+/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/eager/eager_tensor.h"
+#include "paddle/fluid/framework/feed_fetch_type.h"
+#include "paddle/fluid/framework/raw_tensor.h"
+#include "paddle/fluid/framework/string_array.h"
+#include "paddle/fluid/prim/utils/static/desc_tensor.h"
+
+namespace phi {  // explicit kType definitions for types declared outside phi
+template <>
+const TypeInfo<phi::TensorBase>
+    TypeInfoTraits<phi::TensorBase, paddle::framework::RawTensor>::kType =
+        RegisterStaticType<phi::TensorBase>(
+            paddle::framework::RawTensor::name());
+
+template <>
+const TypeInfo<phi::TensorBase>
+    TypeInfoTraits<phi::TensorBase, paddle::framework::Vocab>::kType =
+        RegisterStaticType<phi::TensorBase>(paddle::framework::Vocab::name());
+
+template <>
+const TypeInfo<phi::TensorBase>
+    TypeInfoTraits<phi::TensorBase, paddle::framework::Strings>::kType =
+        RegisterStaticType<phi::TensorBase>(paddle::framework::Strings::name());
+
+template <>
+const TypeInfo<phi::TensorBase>
+    TypeInfoTraits<phi::TensorBase, paddle::framework::FeedList>::kType =
+        RegisterStaticType<phi::TensorBase>(
+            paddle::framework::FeedList::name());
+
+template <>
+const TypeInfo<phi::TensorBase>
+    TypeInfoTraits<phi::TensorBase, egr::VariableCompatTensor>::kType =
+        RegisterStaticType<phi::TensorBase>(egr::VariableCompatTensor::name());
+
+template <>
+const TypeInfo<phi::TensorBase>
+    TypeInfoTraits<phi::TensorBase, paddle::prim::DescTensor>::kType =
+        RegisterStaticType<phi::TensorBase>(paddle::prim::DescTensor::name());
+
+}  // namespace phi
diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt
index f6fe845b30c..2894b450756 100644
--- a/paddle/fluid/imperative/CMakeLists.txt
+++ b/paddle/fluid/imperative/CMakeLists.txt
@@ -1,11 +1,11 @@
 cc_library(
   imperative_flag
   SRCS flags.cc
-  DEPS gflags flags)
+  DEPS phi)
 cc_library(
   var_helper
   SRCS var_helper.cc
-  DEPS tensor selected_rows extended_tensor)
+  DEPS tensor phi)
 if(WITH_XPU)
   cc_library(
     prepared_operator
@@ -20,8 +20,7 @@ if(WITH_XPU)
          op_kernel_type
          data_transform
          nan_inf_utils
-         scalar
-         int_array
+         phi
          var_helper
          profiler
          place)
@@ -38,8 +37,7 @@ else()
          op_kernel_type
          data_transform
          nan_inf_utils
-         scalar
-         int_array
+         phi
          var_helper
          profiler
          place)
@@ -47,14 +45,14 @@ endif()
 cc_library(
   layer
   SRCS layer.cc
-  DEPS prepared_operator math_function imperative_flag variable_helper
-       op_registry var_helper)
+  DEPS prepared_operator phi imperative_flag variable_helper op_registry
+       var_helper)
 add_subdirectory(jit)
 if(WITH_GPU)
   cc_library(
     layout_autotune
     SRCS layout_autotune.cc
-    DEPS op_info phi_backends)
+    DEPS op_info phi)
 else()
   cc_library(
     layout_autotune
@@ -80,15 +78,15 @@ cc_library(
 cc_library(
   basic_engine
   SRCS basic_engine.cc
-  DEPS layer gradient_accumulator switch_autotune)
+  DEPS layer gradient_accumulator phi)
 cc_library(
   engine
   SRCS basic_engine.cc partial_grad_engine.cc
-  DEPS layer gradient_accumulator switch_autotune)
+  DEPS layer gradient_accumulator phi)
 cc_library(
   imperative_profiler
   SRCS profiler.cc
-  DEPS flags)
+  DEPS phi)
 if(NOT WIN32)
   if(WITH_NCCL OR WITH_RCCL)
     cc_library(
@@ -174,12 +172,4 @@ endif()
 cc_library(
   gradient_accumulator
   SRCS gradient_accumulator.cc
-  DEPS blas
-       operator
-       lod_tensor
-       selected_rows_utils
-       selected_rows_functor
-       var_type_traits
-       layer
-       math_function
-       phi_tensor)
+  DEPS operator lod_tensor selected_rows_utils var_type_traits layer phi)
diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index 237a7608160..037025405fc 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -32,14 +32,8 @@ endif()
 
 # fluid_modules exclude API-interface of inference/api and inference/capi_exp
 get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
-get_property(phi_modules GLOBAL PROPERTY PHI_MODULES)
-get_property(phi_kernels GLOBAL PROPERTY PHI_KERNELS)
 set(utils_modules pretty_log string_helper benchmark)
 
-if(WITH_CUSTOM_DEVICE)
-  set(fluid_modules ${fluid_modules} phi_capi)
-endif()
-
 add_subdirectory(api)
 
 # Create static inference library if needed
@@ -51,7 +45,6 @@ set(STATIC_INFERENCE_API
     reset_tensor_array
     analysis_config
     paddle_pass_builder
-    phi
     ${mkldnn_quantizer_cfg})
 
 set(OP_LIST
@@ -64,16 +57,14 @@ set(KERNEL_LIST
 
 #windows GPU static library over the limit, so not create_static_lib, and cc_library is dummy
 if(WIN32 AND WITH_GPU)
-  cc_library(paddle_inference DEPS ${fluid_modules} phi ${STATIC_INFERENCE_API}
+  cc_library(paddle_inference DEPS ${fluid_modules} ${STATIC_INFERENCE_API}
                                    ${utils_modules})
 else()
   # message("${fluid_modules}")
-  # message("PHI_MODULES ${phi_modules}")
-  # message("${phi_kernels}")
   # message("${STATIC_INFERENCE_API}")
   # message("${utils_modules}")
-  create_static_lib(paddle_inference ${fluid_modules} ${phi_modules}
-                    ${phi_kernels} ${STATIC_INFERENCE_API} ${utils_modules})
+  create_static_lib(paddle_inference ${fluid_modules} ${STATIC_INFERENCE_API}
+                    ${utils_modules})
 endif()
 
 if(NOT APPLE)
@@ -103,7 +94,7 @@ set(SHARED_INFERENCE_SRCS
 # shared inference library deps
 list(REMOVE_ITEM fluid_modules standalone_executor
      interpretercore_garbage_collector)
-set(SHARED_INFERENCE_DEPS ${fluid_modules} phi analysis_predictor
+set(SHARED_INFERENCE_DEPS phi ${fluid_modules} analysis_predictor
                           ${utils_modules})
 
 if(WITH_CRYPTO)
@@ -124,12 +115,6 @@ if(WITH_ONNXRUNTIME)
       ${CMAKE_CURRENT_SOURCE_DIR}/api/onnxruntime_predictor.cc)
 endif()
 
-#export all symbols for paddle/phi/api/include/api.h on paddle_inference_shared, only for UNIX
-if(UNIX)
-  set(SHARED_INFERENCE_DEPS ${SHARED_INFERENCE_DEPS}
-                            $<TARGET_OBJECTS:phi_function_api>)
-endif()
-
 # Create shared inference library
 cc_library(
   paddle_inference_shared SHARED
@@ -141,12 +126,15 @@ target_link_libraries(paddle_inference_shared ${os_dependency_modules})
 if(WIN32)
   set_property(TARGET paddle_inference_shared
                PROPERTY WINDOWS_EXPORT_ALL_SYMBOLS ON)
-  target_link_libraries(paddle_inference_shared gflags)
+  target_link_libraries(paddle_inference_shared phi)
 endif()
 
 set_target_properties(paddle_inference_shared PROPERTIES OUTPUT_NAME
                                                          paddle_inference)
-if(NOT APPLE AND NOT WIN32)
+if(NOT APPLE
+   AND NOT WIN32
+   AND NOT WITH_TESTING
+   AND NOT WITH_INFERENCE_API_TEST)
   # TODO(liuyiqun): Temporarily disable the link flag because it is not support on Mac.
   set(LINK_FLAGS
       "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/paddle_inference.map")
diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt
index b681e56d3b9..8ca1de1f63c 100755
--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -41,7 +41,7 @@ if(WITH_CRYPTO)
   list(APPEND paddle_inference_api_deps paddle_crypto)
 endif()
 if(WITH_CUSTOM_DEVICE)
-  set(paddle_inference_api_deps ${paddle_inference_api_deps} phi_capi)
+  set(paddle_inference_api_deps ${paddle_inference_api_deps} phi)
 endif()
 
 cc_library(
@@ -50,7 +50,7 @@ cc_library(
   DEPS ${paddle_inference_api_deps})
 
 if(WIN32)
-  target_link_libraries(paddle_inference_api gflags)
+  target_link_libraries(paddle_inference_api phi)
 endif()
 
 set(inference_deps ${analysis_deps} paddle_inference_api analysis
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 18c036a1ebe..831fa36535d 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -72,7 +72,7 @@
 #endif
 
 #ifdef PADDLE_WITH_MKLML
-#include "paddle/fluid/platform/dynload/mklml.h"
+#include "paddle/phi/backends/dynload/mklml.h"
 #endif
 
 #ifdef PADDLE_WITH_MKLDNN
@@ -1121,7 +1121,7 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
   // Frees unused memory allocated by the Intel® MKL Memory Allocator to
   // avoid memory leak. See:
   // https://software.intel.com/en-us/mkl-developer-reference-c-mkl-free-buffers
-  platform::dynload::MKL_Free_Buffers();
+  phi::dynload::MKL_Free_Buffers();
 #endif
   return true;
 }
@@ -1185,7 +1185,7 @@ bool AnalysisPredictor::Run(const std::vector<paddle::Tensor> &inputs,
   // Frees unused memory allocated by the Intel® MKL Memory Allocator to
   // avoid memory leak. See:
   // https://software.intel.com/en-us/mkl-developer-reference-c-mkl-free-buffers
-  platform::dynload::MKL_Free_Buffers();
+  phi::dynload::MKL_Free_Buffers();
 #endif
   return true;
 }
@@ -2100,7 +2100,7 @@ bool AnalysisPredictor::ZeroCopyRun() {
   // Frees unused memory allocated by the Intel® MKL Memory Allocator to
   // avoid memory leak. See:
   // https://software.intel.com/en-us/mkl-developer-reference-c-mkl-free-buffers
-  platform::dynload::MKL_Free_Buffers();
+  phi::dynload::MKL_Free_Buffers();
 #endif
   return true;
 }
diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
index 7a58b386ad6..e2c4b007c52 100644
--- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
+++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
@@ -199,7 +199,7 @@ if(NOT WIN32)
       ${MATH_LIB}
       ${MKLDNN_LIB}
       glog
-      gflags
+      phi
       protobuf
       xxhash
       cryptopp
diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh
index f11319d7665..50112b20f29 100755
--- a/paddle/fluid/inference/api/demo_ci/run.sh
+++ b/paddle/fluid/inference/api/demo_ci/run.sh
@@ -29,6 +29,7 @@ WIN_DETECT=$(echo `uname` | grep "Win") # detect current platform
 
 cd `dirname $0`
 current_dir=`pwd`
+
 if [ $2 == ON ]; then
   # You can export yourself if move the install path
   MKL_LIB=${inference_install_dir}/third_party/install/mklml/lib
diff --git a/paddle/fluid/inference/api/details/CMakeLists.txt b/paddle/fluid/inference/api/details/CMakeLists.txt
index 5d2357d362e..105ff16747d 100644
--- a/paddle/fluid/inference/api/details/CMakeLists.txt
+++ b/paddle/fluid/inference/api/details/CMakeLists.txt
@@ -25,7 +25,7 @@ if(WITH_ONNXRUNTIME)
   cc_library(
     zero_copy_tensor_dummy
     SRCS zero_copy_tensor_dummy.cc
-    DEPS onnxruntime phi_enforce)
+    DEPS onnxruntime phi)
 else()
   cc_library(
     zero_copy_tensor
@@ -34,7 +34,7 @@ else()
   cc_library(
     zero_copy_tensor_dummy
     SRCS zero_copy_tensor_dummy.cc
-    DEPS phi_enforce)
+    DEPS phi)
 endif()
 
 cc_test(
diff --git a/paddle/fluid/inference/capi_exp/CMakeLists.txt b/paddle/fluid/inference/capi_exp/CMakeLists.txt
index 26d76c280bd..30bafbf488a 100644
--- a/paddle/fluid/inference/capi_exp/CMakeLists.txt
+++ b/paddle/fluid/inference/capi_exp/CMakeLists.txt
@@ -39,7 +39,7 @@ if(APPLE)
     utf8proc
     cryptopp
     protobuf
-    gflags
+    phi
     cblas)
 endif()
 
diff --git a/paddle/fluid/inference/goapi/test.sh b/paddle/fluid/inference/goapi/test.sh
index cff9fd4aa7c..fbde661d177 100644
--- a/paddle/fluid/inference/goapi/test.sh
+++ b/paddle/fluid/inference/goapi/test.sh
@@ -23,7 +23,7 @@ fi
 # 2. set LD_LIBRARY_PATH
 export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$PWD/paddle_inference_c/third_party/install/mklml/lib/:$PWD/paddle_inference_c/third_party/install/mkldnn/lib/:$PWD/paddle_inference_c/paddle/lib/
 export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$PWD/paddle_inference_c/third_party/install/onnxruntime/lib/:$PWD/paddle_inference_c/third_party/install/paddle2onnx/lib/
-
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${PADDLE_ROOT}/build/paddle/phi/
 # 3. go test
 go clean -testcache
 go test -v ./...
diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
index a52d6b1c39d..1437ef5f31a 100755
--- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
@@ -141,8 +141,7 @@ nv_test(
 nv_test(
   test_custom_plugin_creater
   SRCS test_custom_plugin_creater.cc
-  DEPS paddle_framework tensorrt_converter op_meta_info custom_operator
-       init_phi)
+  DEPS paddle_framework tensorrt_converter phi custom_operator init_phi)
 
 if(WITH_ONNXRUNTIME AND WIN32)
   # Copy onnxruntime for some c++ test in Windows, since the test will
diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt
index 4dc408241f4..bb1d9e2e897 100644
--- a/paddle/fluid/memory/allocation/CMakeLists.txt
+++ b/paddle/fluid/memory/allocation/CMakeLists.txt
@@ -1,6 +1,6 @@
 include(ExternalProject)
 
-set(ALLOCATOR_DEPS place stats profiler phi_backends device_context)
+set(ALLOCATOR_DEPS place stats profiler phi device_context)
 set(ALLOCATOR_SRCS
     allocator.cc
     cpu_allocator.cc
@@ -32,7 +32,7 @@ if(WITH_GPU OR WITH_ROCM)
 endif()
 
 if(WITH_GPU)
-  list(APPEND ALLOCATOR_DEPS phi_backends)
+  list(APPEND ALLOCATOR_DEPS phi)
 endif()
 
 if(CUDA_VERSION VERSION_GREATER_EQUAL 10.2)
diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc
index 931372a0d9a..251ec771728 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -124,7 +124,7 @@ class CUDAGraphAllocator
       : underlying_allocator_(allocator) {}
 
  public:
-  ~CUDAGraphAllocator() { VLOG(10) << "CUDAGraphAllocator destructed"; }
+  ~CUDAGraphAllocator() {}
 
   static std::shared_ptr<Allocator> Create(
       const std::shared_ptr<Allocator>& allocator) {
@@ -1137,7 +1137,6 @@ void AllocatorFacade::RemoveMemoryPoolOfCUDAGraph(int64_t id) {
   if (ref_cnt == 0) {
     cuda_graph_map_.erase(id);
     cuda_graph_ref_cnt_.erase(ref_cnt_iter);
-    VLOG(10) << "Remove memory pool of CUDA Graph with memory ID " << id;
   } else {
     VLOG(10) << "Decrease memory pool ID " << id << " reference count to be "
              << ref_cnt;
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index b12ec19b9b9..aef36587ed5 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -90,7 +90,7 @@ if(WITH_UNITY_BUILD)
     include(unity_build_rule.cmake)
 endif()
 
-set(OP_HEADER_DEPS ${OP_HEADER_DEPS} phi phi_utils backward_infermeta sparse_backward_infermeta static_prim_api get_expected_kernel_func)
+set(OP_HEADER_DEPS ${OP_HEADER_DEPS} phi phi_utils static_prim_api get_expected_kernel_func)
 
 register_operators(EXCLUDES py_func_op dgc_op generated_op1 generated_op2 generated_op3 generated_op4 load_combine_op lstm_op run_program_op quantize_linear_op
         recurrent_op save_combine_op sparse_attention_op sync_batch_norm_op activation_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS})
@@ -125,7 +125,7 @@ if (WITH_GPU OR WITH_ROCM)
     endif()
 endif()
 
-op_library(lstm_op DEPS ${OP_HEADER_DEPS}  lstm_compute)
+op_library(lstm_op DEPS ${OP_HEADER_DEPS})
 op_library(recurrent_op DEPS ${OP_HEADER_DEPS})
 
 set(COMMON_OP_DEPS ${OP_HEADER_DEPS})
@@ -136,17 +136,16 @@ if (WITH_DGC)
 endif()
 
 cc_library(common_infer_shape_functions SRCS common_infer_shape_functions.cc DEPS operator)
-cc_library(ops_extra_info SRCS ops_extra_info.cc DEPS attribute cudnn_workspace_helper)
+cc_library(ops_extra_info SRCS ops_extra_info.cc DEPS attribute phi)
 
-set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows_utils lapack_function
-lod_tensor maxouting unpooling pooling lod_rank_table context_project
-sequence_pooling executor generator static_prim_api)
+set(COMMON_OP_DEPS ${COMMON_OP_DEPS} phi)
+set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_utils
+lod_tensor unpooling lod_rank_table context_project executor static_prim_api)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc static_prim_api static_utils static_global_utils prim_utils)
-set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler sample_prob tree2col)
-set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search fc_functor matrix_inverse matrix_solve)
+set(COMMON_OP_DEPS ${COMMON_OP_DEPS} cos_sim_functor memory concat_and_split sampler sample_prob tree2col)
+set(COMMON_OP_DEPS ${COMMON_OP_DEPS} beam_search)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} box_wrapper ps_gpu_wrapper)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} common_infer_shape_functions)
-set(COMMON_OP_DEPS ${COMMON_OP_DEPS} eigen_function)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} processgroup_comm_utils)
 if(WITH_NCCL OR WITH_RCCL)
   set(COMMON_OP_DEPS ${COMMON_OP_DEPS} process_group_nccl)
@@ -189,7 +188,7 @@ endif()
 copy_if_different(${pybind_file} ${pybind_file_final})
 
 if (WITH_CUSTOM_DEVICE)
-cc_library(custom_device_common_op_registry SRCS custom_device_common_op_registry.cc DEPS operator phi_api)
+cc_library(custom_device_common_op_registry SRCS custom_device_common_op_registry.cc DEPS operator phi type_info)
 endif()
 
 if(NOT "${OP_LIST}" STREQUAL "")
diff --git a/paddle/fluid/operators/cinn/CMakeLists.txt b/paddle/fluid/operators/cinn/CMakeLists.txt
index b700b2798fc..d1a77af60aa 100644
--- a/paddle/fluid/operators/cinn/CMakeLists.txt
+++ b/paddle/fluid/operators/cinn/CMakeLists.txt
@@ -7,7 +7,7 @@ cc_library(
 cc_library(
   cinn_launch_context
   SRCS cinn_launch_context.cc
-  DEPS ddim
+  DEPS phi
        lod_tensor
        scope
        proto_desc
diff --git a/paddle/fluid/operators/collective/CMakeLists.txt b/paddle/fluid/operators/collective/CMakeLists.txt
index 8d523f90ace..cef1390ed23 100644
--- a/paddle/fluid/operators/collective/CMakeLists.txt
+++ b/paddle/fluid/operators/collective/CMakeLists.txt
@@ -18,7 +18,7 @@ foreach(src ${OPS})
 endforeach()
 
 if(WITH_GLOO)
-  set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} gloo_wrapper comm_context_manager)
+  set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} gloo_wrapper phi)
 endif()
 
 register_operators(
@@ -31,8 +31,7 @@ register_operators(
   ${COLLECTIVE_DEPS})
 
 if(WITH_NCCL OR WITH_RCCL)
-  set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} nccl_common collective_helper
-                      comm_context_manager nccl_comm_context)
+  set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} nccl_common collective_helper phi)
   op_library(c_gen_nccl_id_op DEPS ${COLLECTIVE_DEPS})
   op_library(gen_nccl_id_op DEPS ${COLLECTIVE_DEPS})
 endif()
diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt
index 1bca2068f83..554c701b11e 100644
--- a/paddle/fluid/operators/detection/CMakeLists.txt
+++ b/paddle/fluid/operators/detection/CMakeLists.txt
@@ -51,8 +51,8 @@ detection_library(polygon_box_transform_op SRCS polygon_box_transform_op.cc
 detection_library(rpn_target_assign_op SRCS rpn_target_assign_op.cc)
 detection_library(generate_proposal_labels_op SRCS
                   generate_proposal_labels_op.cc)
-detection_library(multiclass_nms_op SRCS multiclass_nms_op.cc DEPS gpc)
-detection_library(locality_aware_nms_op SRCS locality_aware_nms_op.cc DEPS gpc)
+detection_library(multiclass_nms_op SRCS multiclass_nms_op.cc DEPS phi)
+detection_library(locality_aware_nms_op SRCS locality_aware_nms_op.cc DEPS phi)
 detection_library(box_clip_op SRCS box_clip_op.cc box_clip_op.cu)
 detection_library(box_decoder_and_assign_op SRCS box_decoder_and_assign_op.cc
                   box_decoder_and_assign_op.cu)
diff --git a/paddle/fluid/operators/generator/CMakeLists.txt b/paddle/fluid/operators/generator/CMakeLists.txt
index 43e8c158da0..124a4f21133 100644
--- a/paddle/fluid/operators/generator/CMakeLists.txt
+++ b/paddle/fluid/operators/generator/CMakeLists.txt
@@ -289,7 +289,7 @@ file(APPEND ${op_utils_header}
 # Automatically generate the registration code of all arg map functions
 # and compile the corresponding target to avoid frequent code conflicts
 # when writing to same file
-register_op_utils(op_compat_infos DEPS op_utils)
+register_op_utils(op_compat_infos DEPS phi)
 
 copy_if_different(${op_utils_header} ${op_utils_header_final})
 
diff --git a/paddle/fluid/operators/gru_op.cc b/paddle/fluid/operators/gru_op.cc
index 921076a4a14..6c3294ac5e2 100644
--- a/paddle/fluid/operators/gru_op.cc
+++ b/paddle/fluid/operators/gru_op.cc
@@ -17,11 +17,12 @@ limitations under the License. */
 #include <memory>
 #include <string>
 
+#include "paddle/phi/core/flags.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/detail/gru_cpu_kernel.h"
 #include "paddle/phi/kernels/funcs/detail/gru_kernel.h"
 
-DECLARE_int32(paddle_num_threads);
+PHI_DECLARE_int32(paddle_num_threads);
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt
index c439ace9714..af14333b9d1 100644
--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
@@ -6,21 +6,20 @@ if(WITH_XPU)
 endif()
 
 # please add new math_library in alphabetical order
-math_library(concat_and_split DEPS concat_and_split_functor)
-math_library(context_project DEPS im2col math_function)
+math_library(concat_and_split DEPS phi)
+math_library(context_project DEPS phi)
 math_library(cos_sim_functor)
 math_library(depthwise_conv)
 math_library(sample_prob)
-math_library(sampler DEPS generator)
+math_library(sampler DEPS phi)
 
-# math_library(math_function DEPS blas dense_tensor tensor)
 if(WITH_XPU)
-  math_library(beam_search DEPS math_function beam_search_xpu)
+  math_library(beam_search DEPS phi beam_search_xpu)
 else()
-  math_library(beam_search DEPS math_function)
+  math_library(beam_search DEPS phi)
 endif()
 
 math_library(unpooling)
 math_library(prelu)
 math_library(bert_encoder_functor)
-math_library(tree2col DEPS math_function)
+math_library(tree2col DEPS phi)
diff --git a/paddle/fluid/operators/pscore/CMakeLists.txt b/paddle/fluid/operators/pscore/CMakeLists.txt
index d2aa95c2fd3..5a397699951 100755
--- a/paddle/fluid/operators/pscore/CMakeLists.txt
+++ b/paddle/fluid/operators/pscore/CMakeLists.txt
@@ -20,7 +20,7 @@ if(WITH_ARM_BRPC)
     framework_proto
     sendrecv_rpc
     arm_brpc
-    gflags
+    phi
     glog
     snappy
     device_context)
@@ -42,7 +42,7 @@ else()
     ssl
     crypto
     protobuf
-    gflags
+    phi
     glog
     zlib
     snappy
diff --git a/paddle/fluid/operators/sequence_ops/CMakeLists.txt b/paddle/fluid/operators/sequence_ops/CMakeLists.txt
index 06281b6f376..1bd10f19e03 100644
--- a/paddle/fluid/operators/sequence_ops/CMakeLists.txt
+++ b/paddle/fluid/operators/sequence_ops/CMakeLists.txt
@@ -6,5 +6,5 @@ endif()
 register_operators()
 
 if(WITH_UNITY_BUILD)
-  target_link_libraries(paddle_operators_sequence_ops_unity sequence_pooling)
+  target_link_libraries(paddle_operators_sequence_ops_unity phi)
 endif()
diff --git a/paddle/fluid/operators/var_conv_2d_op.cc b/paddle/fluid/operators/var_conv_2d_op.cc
index 112aefbe7f2..31c1f61ed4b 100644
--- a/paddle/fluid/operators/var_conv_2d_op.cc
+++ b/paddle/fluid/operators/var_conv_2d_op.cc
@@ -17,7 +17,7 @@ limitations under the License. */
 #include <memory>
 #include <vector>
 
-#include "paddle/fluid/platform/dynload/mklml.h"
+#include "paddle/phi/backends/dynload/mklml.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index 0907b05622d..e2efc315ca5 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -6,9 +6,9 @@ cc_library(
 cc_test(
   errors_test
   SRCS errors_test.cc
-  DEPS errors enforce)
+  DEPS phi enforce)
 
-set(enforce_deps flags errors flags phi_enforce)
+set(enforce_deps phi)
 if(WITH_GPU)
   set(enforce_deps ${enforce_deps} external_error_proto)
 endif()
@@ -26,20 +26,20 @@ cc_test(
 cc_test(
   cpu_info_test
   SRCS cpu_info_test.cc
-  DEPS phi_backends)
+  DEPS phi)
 cc_test(
   os_info_test
   SRCS os_info_test.cc
-  DEPS phi_os_info)
+  DEPS phi)
 
 cc_library(
   place
   SRCS place.cc
-  DEPS enforce phi_place)
+  DEPS enforce phi)
 cc_test(
   place_test
   SRCS place_test.cc
-  DEPS place glog gflags)
+  DEPS place glog phi)
 
 if(WITH_MKLDNN)
   set(MKLDNN_CTX_DEPS mkldnn)
@@ -104,7 +104,7 @@ endif()
 cc_library(
   init
   SRCS init.cc
-  DEPS device_context custom_kernel context_pool memcpy)
+  DEPS device_context phi memcpy)
 
 # memcpy depends on device_context, here add deps individually for
 # avoiding cycle dependencies
@@ -117,7 +117,6 @@ cc_library(
        xxhash
        ${STREAM_CALLBACK_DEPS}
        place
-       phi_place
        eigen3
        cpu_helper
        framework_proto
@@ -126,12 +125,8 @@ cc_library(
        ${MKLDNN_CTX_DEPS}
        ${dgc_deps}
        dlpack
-       cudnn_workspace_helper
-       ${XPU_CTX_DEPS}
-       phi_backends
-       phi_device_context
-       generator
-       phi_enforce)
+       phi
+       ${XPU_CTX_DEPS})
 
 cc_library(
   collective_helper
@@ -189,12 +184,12 @@ if(WITH_GPU)
       cuda_graph_with_memory_pool
       SRCS cuda_graph_with_memory_pool.cc
       DEPS ${DEVICE_EVENT_LIBS} device_event_custom_device device_context
-           allocator phi_backends)
+           allocator phi)
   else()
     nv_library(
       cuda_graph_with_memory_pool
       SRCS cuda_graph_with_memory_pool.cc
-      DEPS ${DEVICE_EVENT_LIBS} device_context allocator phi_backends)
+      DEPS ${DEVICE_EVENT_LIBS} device_context allocator phi)
   endif()
   nv_test(
     device_context_test
@@ -245,7 +240,7 @@ cc_test(
 cc_library(
   lodtensor_printer
   SRCS lodtensor_printer.cc
-  DEPS ddim
+  DEPS phi
        place
        tensor
        scope
@@ -263,41 +258,30 @@ if(WITH_GPU)
   nv_library(
     profiler
     SRCS profiler.cc profiler.cu
-    DEPS phi_os_info
-         phi_device_tracer
+    DEPS phi
          gpu_info
          enforce
          dynload_cuda
          new_profiler
          stats
          op_proto_maker
-         shape_inference
-         phi_profiler)
+         shape_inference)
 elseif(WITH_ROCM)
   hip_library(
     profiler
     SRCS profiler.cc profiler.cu
-    DEPS phi_os_info
-         phi_device_tracer
+    DEPS phi
          gpu_info
          enforce
          new_profiler
          stats
          op_proto_maker
-         shape_inference
-         phi_profiler)
+         shape_inference)
 else()
   cc_library(
     profiler
     SRCS profiler.cc
-    DEPS phi_os_info
-         phi_device_tracer
-         enforce
-         new_profiler
-         stats
-         op_proto_maker
-         shape_inference
-         phi_profiler)
+    DEPS phi enforce new_profiler stats op_proto_maker shape_inference)
 endif()
 
 cc_test(
@@ -333,7 +317,7 @@ if(WITH_GPU)
   nv_test(
     test_limit_gpu_memory
     SRCS test_limit_gpu_memory.cu
-    DEPS gpu_info flags)
+    DEPS gpu_info phi)
   nv_library(
     cuda_device_guard
     SRCS cuda_device_guard.cc
@@ -348,7 +332,7 @@ if(WITH_ROCM)
   hip_test(
     test_limit_gpu_memory
     SRCS test_limit_gpu_memory.cu
-    DEPS gpu_info flags)
+    DEPS gpu_info phi)
   hip_library(
     cuda_device_guard
     SRCS cuda_device_guard.cc
@@ -360,7 +344,7 @@ if(NOT APPLE AND NOT WIN32)
     cc_test(
       device_code_test
       SRCS device_code_test.cc
-      DEPS phi_backends lod_tensor)
+      DEPS phi lod_tensor)
   endif()
 endif()
 
@@ -382,4 +366,4 @@ cc_library(
 cc_test(
   init_phi_test
   SRCS init_phi_test.cc
-  DEPS phi_tensor init_phi)
+  DEPS phi init_phi)
diff --git a/paddle/fluid/platform/cpu_helper.cc b/paddle/fluid/platform/cpu_helper.cc
index 8a9501c0dc7..af1640cfd9a 100644
--- a/paddle/fluid/platform/cpu_helper.cc
+++ b/paddle/fluid/platform/cpu_helper.cc
@@ -17,7 +17,7 @@ limitations under the License. */
 #ifdef PADDLE_WITH_MKLML
 #include <omp.h>
 
-#include "paddle/fluid/platform/dynload/mklml.h"
+#include "paddle/phi/backends/dynload/mklml.h"
 #endif
 
 #ifdef PADDLE_USE_OPENBLAS
@@ -40,7 +40,7 @@ void SetNumThreads(int num_threads) {
   openblas_set_num_threads(real_num_threads);
 #elif defined(PADDLE_WITH_MKLML)
   int real_num_threads = num_threads > 1 ? num_threads : 1;
-  platform::dynload::MKL_Set_Num_Threads(real_num_threads);
+  phi::dynload::MKL_Set_Num_Threads(real_num_threads);
   omp_set_num_threads(real_num_threads);
 #elif defined(PADDLE_USE_REFERENCE_CBLAS)
   // cblas not support multi-thread
diff --git a/paddle/fluid/platform/device/custom/CMakeLists.txt b/paddle/fluid/platform/device/custom/CMakeLists.txt
index 3846111489f..8e081781e29 100644
--- a/paddle/fluid/platform/device/custom/CMakeLists.txt
+++ b/paddle/fluid/platform/device/custom/CMakeLists.txt
@@ -2,9 +2,9 @@ if(WITH_CUSTOM_DEVICE)
   cc_library(
     custom_device_resource_pool
     SRCS custom_device_resource_pool.cc
-    DEPS gflags glog enforce monitor)
+    DEPS phi glog enforce monitor)
   cc_test(
     custom_device_test
     SRCS custom_device_test.cc
-    DEPS phi_tensor_utils phi_backends phi_device_context gradient_accumulator)
+    DEPS phi gradient_accumulator)
 endif()
diff --git a/paddle/fluid/platform/device/gpu/CMakeLists.txt b/paddle/fluid/platform/device/gpu/CMakeLists.txt
index a6f6bc232e6..897f8d3732b 100644
--- a/paddle/fluid/platform/device/gpu/CMakeLists.txt
+++ b/paddle/fluid/platform/device/gpu/CMakeLists.txt
@@ -3,13 +3,7 @@ if(WITH_GPU)
   nv_library(
     gpu_info
     SRCS gpu_info.cc
-    DEPS phi_backends
-         gflags
-         glog
-         enforce
-         monitor
-         dynload_cuda
-         malloc)
+    DEPS phi glog enforce monitor dynload_cuda malloc)
 
   nv_test(cuda_helper_test SRCS cuda_helper_test.cu)
   nv_test(
@@ -21,7 +15,7 @@ elseif(WITH_ROCM)
   hip_library(
     gpu_info
     SRCS gpu_info.cc
-    DEPS phi_backends gflags glog enforce monitor dynload_cuda)
+    DEPS phi glog enforce monitor dynload_cuda)
 
   hip_test(cuda_helper_test SRCS cuda_helper_test.cu)
   hip_test(
diff --git a/paddle/fluid/platform/device/xpu/CMakeLists.txt b/paddle/fluid/platform/device/xpu/CMakeLists.txt
index 70a1c3fc3b0..2f09e25de27 100644
--- a/paddle/fluid/platform/device/xpu/CMakeLists.txt
+++ b/paddle/fluid/platform/device/xpu/CMakeLists.txt
@@ -14,23 +14,11 @@ set(XPU_CTX_DEPS
 cc_library(
   xpu_info
   SRCS xpu_info.cc
-  DEPS gflags
-       glog
-       enforce
-       xpulib
-       device_context
-       place
-       phi_backends)
+  DEPS glog enforce xpulib device_context place phi)
 cc_library(
   xpu_op_list
   SRCS xpu_op_list.cc
-  DEPS gflags
-       glog
-       enforce
-       xpulib
-       device_context
-       op_kernel_type
-       phi_backends)
+  DEPS glog enforce xpulib device_context op_kernel_type phi)
 cc_library(
   xpu_resource_pool
   SRCS xpu_resource_pool.cc
diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt
index 3cbbc32b400..976223be354 100644
--- a/paddle/fluid/platform/dynload/CMakeLists.txt
+++ b/paddle/fluid/platform/dynload/CMakeLists.txt
@@ -1,7 +1,7 @@
 cc_library(
   dynamic_loader
   SRCS dynamic_loader.cc
-  DEPS glog gflags enforce phi_dynamic_loader)
+  DEPS glog enforce phi)
 
 list(
   APPEND
@@ -57,26 +57,20 @@ if(WITH_ROCM)
   hip_library(
     dynload_cuda
     SRCS ${HIP_SRCS}
-    DEPS dynamic_loader phi_dynload_cuda)
+    DEPS dynamic_loader phi)
   cc_library(
     dynload_warpctc
     SRCS warpctc.cc
-    DEPS dynamic_loader warpctc phi_dynload_warpctc)
+    DEPS dynamic_loader warpctc phi)
 else()
   nv_library(
     dynload_cuda
     SRCS ${CUDA_SRCS}
-    DEPS dynamic_loader phi_dynload_cuda)
+    DEPS dynamic_loader phi)
   cc_library(
     dynload_warpctc
     SRCS warpctc.cc
-    DEPS dynamic_loader warpctc phi_dynload_warpctc)
-endif()
-if(WITH_MKLML)
-  cc_library(
-    dynload_mklml
-    SRCS mklml.cc
-    DEPS dynamic_loader mklml phi_dynload_mklml)
+    DEPS dynamic_loader warpctc phi)
 endif()
 
 # TODO(TJ): add iomp, mkldnn?
@@ -86,6 +80,6 @@ if(MKL_FOUND AND WITH_ONEMKL)
   cc_library(
     dynload_mklrt
     SRCS mklrt.cc
-    DEPS dynamic_loader phi_dynload_mklrt)
+    DEPS dynamic_loader phi)
   target_include_directories(dynload_mklrt PRIVATE ${MKL_INCLUDE})
 endif()
diff --git a/paddle/fluid/platform/dynload/mklml.cc b/paddle/fluid/platform/dynload/mklml.cc
deleted file mode 100644
index ff475b2312c..00000000000
--- a/paddle/fluid/platform/dynload/mklml.cc
+++ /dev/null
@@ -1,32 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/platform/dynload/mklml.h"
-
-namespace paddle {
-namespace platform {
-namespace dynload {
-
-#define DEFINE_WRAP(__name) DynLoad__##__name __name
-
-MKLML_ROUTINE_EACH(DEFINE_WRAP);
-
-#if !defined(_WIN32)
-DEFINE_WRAP(mkl_scsrmm);
-DEFINE_WRAP(mkl_dcsrmm);
-#endif
-
-}  // namespace dynload
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/dynload/mklml.h b/paddle/fluid/platform/dynload/mklml.h
deleted file mode 100644
index 78cae9a0821..00000000000
--- a/paddle/fluid/platform/dynload/mklml.h
+++ /dev/null
@@ -1,113 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <mkl.h>
-
-#include <mutex>  // NOLINT
-
-#include "paddle/phi/backends/dynload/mklml.h"
-
-namespace paddle {
-namespace platform {
-namespace dynload {
-
-/**
- * The following macro definition can generate structs
- * (for each function) to dynamic load mklml routine
- * via operator overloading.
- */
-#define DYNAMIC_LOAD_MKLML_WRAP(__name)                      \
-  using DynLoad__##__name = phi::dynload::DynLoad__##__name; \
-  extern DynLoad__##__name __name
-
-#define PLATFORM_DECLARE_DYNAMIC_LOAD_MKLML_WRAP(__name) \
-  DYNAMIC_LOAD_MKLML_WRAP(__name)
-
-#define MKLML_ROUTINE_EACH(__macro) \
-  __macro(cblas_sgemm);             \
-  __macro(cblas_dgemm);             \
-  __macro(cblas_cgemm);             \
-  __macro(cblas_zgemm);             \
-  __macro(cblas_saxpy);             \
-  __macro(cblas_daxpy);             \
-  __macro(cblas_caxpy);             \
-  __macro(cblas_zaxpy);             \
-  __macro(cblas_scopy);             \
-  __macro(cblas_dcopy);             \
-  __macro(cblas_ccopy);             \
-  __macro(cblas_zcopy);             \
-  __macro(cblas_sgemv);             \
-  __macro(cblas_dgemv);             \
-  __macro(cblas_cgemv);             \
-  __macro(cblas_zgemv);             \
-  __macro(cblas_strsm);             \
-  __macro(cblas_dtrsm);             \
-  __macro(cblas_ctrsm);             \
-  __macro(cblas_ztrsm);             \
-  __macro(cblas_sgemm_alloc);       \
-  __macro(cblas_dgemm_alloc);       \
-  __macro(cblas_sgemm_pack);        \
-  __macro(cblas_dgemm_pack);        \
-  __macro(cblas_sgemm_compute);     \
-  __macro(cblas_dgemm_compute);     \
-  __macro(cblas_sgemm_free);        \
-  __macro(cblas_dgemm_free);        \
-  __macro(cblas_sgemm_batch);       \
-  __macro(cblas_dgemm_batch);       \
-  __macro(cblas_cgemm_batch);       \
-  __macro(cblas_zgemm_batch);       \
-  __macro(cblas_sdot);              \
-  __macro(cblas_ddot);              \
-  __macro(cblas_sasum);             \
-  __macro(cblas_dasum);             \
-  __macro(cblas_isamax);            \
-  __macro(cblas_idamax);            \
-  __macro(cblas_sscal);             \
-  __macro(cblas_dscal);             \
-  __macro(vsAdd);                   \
-  __macro(vdAdd);                   \
-  __macro(vsSub);                   \
-  __macro(vdSub);                   \
-  __macro(vsMul);                   \
-  __macro(vdMul);                   \
-  __macro(vsDiv);                   \
-  __macro(vdDiv);                   \
-  __macro(vsExp);                   \
-  __macro(vdExp);                   \
-  __macro(vsSqr);                   \
-  __macro(vdSqr);                   \
-  __macro(vsPowx);                  \
-  __macro(vdPowx);                  \
-  __macro(vsInv);                   \
-  __macro(vdInv);                   \
-  __macro(vmsErf);                  \
-  __macro(vmdErf);                  \
-  __macro(MKL_Free_Buffers);        \
-  __macro(MKL_Set_Num_Threads);     \
-  __macro(MKL_Get_Max_Threads);
-
-MKLML_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_MKLML_WRAP);
-
-#if !defined(_WIN32)
-DYNAMIC_LOAD_MKLML_WRAP(mkl_scsrmm);
-DYNAMIC_LOAD_MKLML_WRAP(mkl_dcsrmm);
-#endif
-
-#undef DYNAMIC_LOAD_MKLML_WRAP
-
-}  // namespace dynload
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc
index cd8e8ea350f..2c65023988d 100644
--- a/paddle/fluid/platform/profiler.cc
+++ b/paddle/fluid/platform/profiler.cc
@@ -40,6 +40,22 @@ PADDLE_DEFINE_EXPORTED_bool(enable_rpc_profiler,
 
 DEFINE_bool(enable_record_memory, false, "enable memory recorder");
 
+#if defined(_WIN32) && defined(PHI_SHARED)
+phi::ProfilerState phi::ProfilerHelper::g_state = phi::ProfilerState::kDisabled;
+bool phi::ProfilerHelper::g_enable_nvprof_hook = false;
+thread_local uint64_t phi::ProfilerHelper::g_thread_id;
+uint32_t phi::ProfilerHelper::g_next_thread_id = 0;
+std::mutex phi::ProfilerHelper::g_all_event_lists_mutex;
+std::list<std::shared_ptr<phi::EventList<phi::Event>>>
+    phi::ProfilerHelper::g_all_event_lists;
+thread_local std::shared_ptr<phi::EventList<phi::Event>>
+    phi::ProfilerHelper::g_event_list;
+std::list<std::shared_ptr<phi::EventList<phi::MemEvent>>>
+    phi::ProfilerHelper::g_all_mem_event_lists;
+thread_local std::shared_ptr<phi::EventList<phi::MemEvent>>
+    phi::ProfilerHelper::g_mem_event_list;
+std::mutex phi::ProfilerHelper::g_all_mem_event_lists_mutex;
+#endif
 namespace paddle {
 namespace platform {
 
diff --git a/paddle/fluid/platform/profiler/CMakeLists.txt b/paddle/fluid/platform/profiler/CMakeLists.txt
index 66f07791ad0..df5b9818d69 100644
--- a/paddle/fluid/platform/profiler/CMakeLists.txt
+++ b/paddle/fluid/platform/profiler/CMakeLists.txt
@@ -1,7 +1,7 @@
 cc_library(
   host_tracer
   SRCS host_tracer.cc
-  DEPS framework_proto enforce ddim var_type_traits)
+  DEPS framework_proto enforce phi var_type_traits)
 cc_library(
   cuda_tracer
   SRCS cuda_tracer.cc cupti_data_process.cc
@@ -28,7 +28,7 @@ cc_library(
 cc_library(
   cpu_utilization
   SRCS cpu_utilization.cc
-  DEPS phi_backends phi_os_info enforce glog)
+  DEPS phi enforce glog)
 cc_library(
   new_profiler
   SRCS profiler.cc
diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt
index bb0c614ba03..382b9d24aaa 100755
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -28,7 +28,6 @@ set(PYBIND_DEPS
     gloo_wrapper
     infer_io_utils
     heter_wrapper
-    generator
     op_version_registry
     ps_gpu_wrapper
     custom_operator
@@ -37,16 +36,13 @@ set(PYBIND_DEPS
     fleet_executor
     global_utils
     phi_utils
-    tcp_store
-    comm_context_manager
+    phi
     new_profiler
-    auto_parallel
     jit_layer
     jit_property
     prim_utils
-    operants_manager
-    phi_tensor_operants
-    static_tensor_operants)
+    static_tensor_operants
+    type_info)
 
 if(WITH_PSCORE)
   set(PYBIND_DEPS ${PYBIND_DEPS} ps_service)
@@ -65,7 +61,7 @@ if(WITH_RPC)
       zlib
       leveldb
       snappy
-      gflags
+      phi
       glog)
 endif()
 if(WITH_GPU OR WITH_ROCM)
@@ -148,7 +144,6 @@ set(PYBIND_SRCS
     auto_parallel_py.cc)
 
 if(WITH_CUSTOM_DEVICE)
-  set(PYBIND_DEPS ${PYBIND_DEPS} phi_capi)
   set(PYBIND_DEPS ${PYBIND_DEPS} custom_device_common_op_registry)
 endif()
 
@@ -334,6 +329,14 @@ if(WITH_PYTHON)
       ")\n"
       "exit /b 0")
 
+    if(WITH_PHI_SHARED) # stage phi.dll in ${op_impl_path}
+      add_custom_command(
+        OUTPUT ${op_impl_path}/phi.dll
+        COMMAND ${CMAKE_COMMAND} -E copy ${PHI_LIB} ${op_impl_path}
+        DEPENDS phi)
+      list(APPEND EAGER_OP_IMPL_DEPS ${op_impl_path}/phi.dll)
+    endif()
+
     if(${CBLAS_PROVIDER} STREQUAL MKLML)
       add_custom_command(
         OUTPUT ${op_impl_path}/libiomp5md.dll
@@ -481,10 +484,8 @@ if(WITH_PYTHON)
     list(APPEND PYBIND_DEPS python)
     list(APPEND PYBIND_DEPS custom_operator)
     list(APPEND PYBIND_DEPS custom_operator_node)
-    list(APPEND PYBIND_DEPS tensor_api)
     list(APPEND PYBIND_DEPS eager_tensor_operants)
     list(APPEND PYBIND_DEPS pybind_util)
-    list(APPEND PYBIND_DEPS flags)
   endif()
 
   # On Linux, cc_library(paddle SHARED ..) will generate the libpaddle.so,
diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc
index a78831efc3b..3f49622bd04 100644
--- a/paddle/fluid/pybind/eager_utils.cc
+++ b/paddle/fluid/pybind/eager_utils.cc
@@ -38,7 +38,9 @@ limitations under the License. */
 #include "paddle/phi/common/data_type.h"
 #include "paddle/phi/core/compat/convert_utils.h"
 #include "paddle/phi/core/dense_tensor.h"
-DECLARE_bool(check_nan_inf);
+#include "paddle/phi/core/flags.h"
+
+PHI_DECLARE_bool(check_nan_inf);
 
 namespace paddle {
 namespace pybind {
diff --git a/paddle/phi/CMakeLists.txt b/paddle/phi/CMakeLists.txt
index f14f4ee9880..2395e024090 100644
--- a/paddle/phi/CMakeLists.txt
+++ b/paddle/phi/CMakeLists.txt
@@ -3,6 +3,15 @@ configure_file(config.h.in ${CMAKE_CURRENT_SOURCE_DIR}/config.h)
 # phi auto cmake utils
 include(phi)
 
+set(common_srcs CACHE INTERNAL "" FORCE)
+set(api_srcs CACHE INTERNAL "" FORCE)
+set(capi_srcs CACHE INTERNAL "" FORCE)
+set(core_srcs CACHE INTERNAL "" FORCE)
+set(backends_srcs CACHE INTERNAL "" FORCE)
+set(kernels_srcs CACHE INTERNAL "" FORCE)
+set(infermeta_srcs CACHE INTERNAL "" FORCE)
+# The *_srcs cache lists above are emptied here and merged into PHI_SRCS below.
+
 # paddle experimental common components
 add_subdirectory(common)
 
@@ -24,29 +33,153 @@ if(WITH_CUSTOM_DEVICE)
   add_subdirectory(capi)
 endif()
 
-# make an unity target for compile deps
 set(PHI_DEPS
-    convert_utils
-    dense_tensor
-    phi_backends
-    kernel_factory
-    kernel_context
-    arg_map_context
-    infermeta
-    lod_utils
-    sparse_csr_tensor
-    sparse_coo_tensor
-    string_tensor
-    api_scalar
-    api_int_array
-    extended_tensor
-    dist_attr
-    dist_mapper)
-
-get_property(phi_kernels GLOBAL PROPERTY PHI_KERNELS)
-set(PHI_DEPS ${PHI_DEPS} ${phi_kernels})
-
-cc_library(phi DEPS ${PHI_DEPS})
+    phi_profiler_proto
+    auto_parallel_proto
+    gflags
+    glog
+    warpctc
+    warprnnt
+    eigen3
+    xxhash
+    cblas
+    utf8proc)
+
+if(WITH_GPU)
+  list(APPEND PHI_DEPS external_error_proto)
+endif()
+
+if(WITH_ASCEND_CL)
+  list(APPEND PHI_DEPS npu_hccl)
+endif()
+
+if(WITH_FLASHATTN)
+  list(APPEND PHI_DEPS flashattn)
+endif()
+
+if(WITH_XBYAK)
+  list(APPEND PHI_DEPS xbyak)
+endif()
+
+if(WITH_MKLDNN)
+  list(APPEND PHI_DEPS mkldnn)
+endif()
+
+if(WITH_GLOO)
+  list(APPEND PHI_DEPS gloo)
+endif()
+
+if(WITH_CUDNN_FRONTEND)
+  list(APPEND PHI_DEPS cudnn-frontend)
+endif()
+
+if(WITH_POCKETFFT)
+  list(APPEND PHI_DEPS pocketfft)
+endif()
+
+if(WITH_MKLML)
+  list(APPEND PHI_DEPS pocketfft dynload_mklml)
+endif()
+
+if(WITH_XPU)
+  list(APPEND PHI_DEPS xpulib)
+endif()
+
+set(PHI_SRCS
+    ${common_srcs}
+    ${api_srcs}
+    ${core_srcs}
+    ${backends_srcs}
+    ${kernels_srcs}
+    ${infermeta_srcs}
+    ${capi_srcs})
+
+if(WITH_PHI_SHARED)
+  set(PHI_BUILD_TYPE
+      SHARED
+      CACHE INTERNAL "" FORCE)
+else()
+  set(PHI_BUILD_TYPE
+      STATIC
+      CACHE INTERNAL "" FORCE)
+endif()
+
+if(WITH_GPU)
+  add_definitions(-DCUDA_REAL_ARCHS=${NVCC_FLAGS_EXTRA_real_archs}
+  )# for backends/gpu/gpu_resources.cc
+  nv_library(
+    phi ${PHI_BUILD_TYPE}
+    SRCS ${PHI_SRCS}
+    DEPS ${PHI_DEPS})
+elseif(WITH_ROCM)
+  hip_add_library(phi ${PHI_BUILD_TYPE} ${PHI_SRCS})
+  target_link_libraries(phi ${PHI_DEPS})
+elseif(WITH_XPU_KP)
+  xpu_library(
+    phi ${PHI_BUILD_TYPE}
+    SRCS ${PHI_SRCS}
+    DEPS ${PHI_DEPS})
+else()
+  cc_library(
+    phi ${PHI_BUILD_TYPE}
+    SRCS ${PHI_SRCS}
+    DEPS ${PHI_DEPS})
+endif()
+
+if(WIN32)
+  target_link_libraries(phi shlwapi.lib)
+endif()
+
+if(WIN32)
+  if(WITH_PHI_SHARED)
+    set_property(TARGET phi PROPERTY WINDOWS_EXPORT_ALL_SYMBOLS ON)
+    set(PHI_NAME
+        phi.dll
+        CACHE INTERNAL "" FORCE)
+  else()
+    set(PHI_NAME
+        phi.lib
+        CACHE INTERNAL "" FORCE)
+  endif()
+elseif(APPLE)
+  if(WITH_PHI_SHARED)
+    set(PHI_NAME
+        libphi.dylib
+        CACHE INTERNAL "" FORCE)
+  else()
+    set(PHI_NAME
+        libphi.a
+        CACHE INTERNAL "" FORCE)
+  endif()
+else()
+  if(WITH_PHI_SHARED)
+    set(PHI_NAME
+        libphi.so
+        CACHE INTERNAL "" FORCE)
+  else()
+    set(PHI_NAME
+        libphi.a
+        CACHE INTERNAL "" FORCE)
+  endif()
+endif()
+
+set(PHI_LIB
+    "${CMAKE_CURRENT_BINARY_DIR}/${PHI_NAME}"
+    CACHE FILEPATH "PHI Library" FORCE)
+
+if(MKL_FOUND AND WITH_ONEMKL)
+  target_include_directories(phi PRIVATE ${MKL_INCLUDE})
+endif()
+
+add_dependencies(phi extern_lapack)
+if(WITH_CUTLASS)
+  add_dependencies(phi cutlass_codegen)
+  add_definitions("-DPADDLE_WITH_MEMORY_EFFICIENT_ATTENTION"
+  )# for memory_efficient_attention.h
+endif()
+if(WITH_FLASHATTN)
+  add_dependencies(phi flashattn)
+endif()
 
 set(phi_extension_header_file
     ${CMAKE_CURRENT_SOURCE_DIR}/extension.h
diff --git a/paddle/phi/api/CMakeLists.txt b/paddle/phi/api/CMakeLists.txt
index 854c2d2fbfc..1827dfbeb7f 100644
--- a/paddle/phi/api/CMakeLists.txt
+++ b/paddle/phi/api/CMakeLists.txt
@@ -1,7 +1,2 @@
 add_subdirectory(profiler)
 add_subdirectory(lib)
-cc_library(
-  phi_api
-  SRCS all.cc
-  DEPS phi_function_api phi_bw_function_api sparse_api sparse_bw_api
-       strings_api)
diff --git a/paddle/phi/api/all.cc b/paddle/phi/api/all.cc
deleted file mode 100644
index 20f3a492f71..00000000000
--- a/paddle/phi/api/all.cc
+++ /dev/null
@@ -1,19 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/phi/api/all.h"
-
-namespace paddle {
-namespace experimental {}  // namespace experimental
-}  // namespace paddle
diff --git a/paddle/phi/api/ext/op_meta_info.h b/paddle/phi/api/ext/op_meta_info.h
index 21a433df4b8..73a784a6eb9 100644
--- a/paddle/phi/api/ext/op_meta_info.h
+++ b/paddle/phi/api/ext/op_meta_info.h
@@ -112,9 +112,7 @@ class PADDLE_API CustomOpKernelContext {
   void EmplaceBackOutput(Tensor&& output);
   void EmplaceBackOutputs(const std::vector<Tensor>& outputs);
   void EmplaceBackAttr(paddle::any attr);
-  void EmplaceBackAttrs(const std::vector<paddle::any>& attrs) {
-    attrs_ = std::move(attrs);
-  }
+  void EmplaceBackAttrs(const std::vector<paddle::any>& attrs);
   const std::pair<size_t, size_t>& InputRangeAt(size_t idx) const;
   const std::pair<size_t, size_t>& OutputRangeAt(size_t idx) const;
 
@@ -125,13 +123,9 @@ class PADDLE_API CustomOpKernelContext {
   paddle::optional<Tensor> OptionalInputAt(size_t idx);
   paddle::optional<std::vector<Tensor>> OptionalInputsBetween(size_t start,
                                                               size_t end);
-  const std::vector<paddle::any>& Attrs() const { return attrs_; }
-  const std::vector<std::pair<size_t, size_t>>& InputRange() {
-    return input_range_;
-  }
-  const std::vector<std::pair<size_t, size_t>>& OutputRange() {
-    return output_range_;
-  }
+  const std::vector<paddle::any>& Attrs() const;
+  const std::vector<std::pair<size_t, size_t>>& InputRange();
+  const std::vector<std::pair<size_t, size_t>>& OutputRange();
   Tensor* MutableOutputAt(size_t idx);
   std::vector<Tensor*> MutableOutputBetween(size_t start, size_t end);
   std::vector<Tensor> OutputsBetween(size_t start, size_t end);
@@ -811,38 +805,20 @@ class PADDLE_API OpMetaInfo {
 //////////////// Op Meta Info Helper /////////////////
 class OpMetaInfoHelper {
  public:
-  static const std::string& GetOpName(const paddle::OpMetaInfo& info) {
-    return info.name_;
-  }
+  static const std::string& GetOpName(const paddle::OpMetaInfo& info);
   static const std::vector<std::string>& GetInputs(
-      const paddle::OpMetaInfo& info) {
-    return info.inputs_;
-  }
+      const paddle::OpMetaInfo& info);
   static const std::vector<std::string>& GetOutputs(
-      const paddle::OpMetaInfo& info) {
-    return info.outputs_;
-  }
+      const paddle::OpMetaInfo& info);
   static const std::vector<std::string>& GetAttrs(
-      const paddle::OpMetaInfo& info) {
-    return info.attrs_;
-  }
+      const paddle::OpMetaInfo& info);
   static const std::unordered_map<std::string, std::string>& GetInplaceMap(
-      const paddle::OpMetaInfo& info) {
-    return info.inplace_map_;
-  }
+      const paddle::OpMetaInfo& info);
   static const std::unordered_map<std::string, std::string>&
-  GetInplaceReverseMap(const paddle::OpMetaInfo& info) {
-    return info.inplace_reverse_map_;
-  }
-  static const KernelFunc& GetKernelFn(const paddle::OpMetaInfo& info) {
-    return info.kernel_fn_;
-  }
-  static const InferShapeFunc& GetInferShapeFn(const paddle::OpMetaInfo& info) {
-    return info.infer_shape_fn_;
-  }
-  static const InferDtypeFunc& GetInferDtypeFn(const paddle::OpMetaInfo& info) {
-    return info.infer_dtype_fn_;
-  }
+  GetInplaceReverseMap(const paddle::OpMetaInfo& info);
+  static const KernelFunc& GetKernelFn(const paddle::OpMetaInfo& info);
+  static const InferShapeFunc& GetInferShapeFn(const paddle::OpMetaInfo& info);
+  static const InferDtypeFunc& GetInferDtypeFn(const paddle::OpMetaInfo& info);
 };
 
 //////////////// Op Meta Info Map /////////////////
diff --git a/paddle/phi/api/include/tensor.h b/paddle/phi/api/include/tensor.h
index dd1c1637acf..b68db1f45fa 100644
--- a/paddle/phi/api/include/tensor.h
+++ b/paddle/phi/api/include/tensor.h
@@ -410,7 +410,7 @@ class PADDLE_API Tensor final {
    *
    * @return const std::string&
    */
-  const std::string& name() const { return name_; }
+  const std::string& name() const;
 
   /**
    * @brief Set name of Tensor.
@@ -419,7 +419,7 @@ class PADDLE_API Tensor final {
    *
    * @param const std::string& name
    */
-  void set_name(const std::string& name) { name_ = name; }
+  void set_name(const std::string& name);
 
   /* Part 5: Data Transform methods */
   /* Alert!!!!: All copy method can only deep copy impl, autograd info only be
diff --git a/paddle/phi/api/lib/CMakeLists.txt b/paddle/phi/api/lib/CMakeLists.txt
index 03b75ee6760..1bf3883b083 100644
--- a/paddle/phi/api/lib/CMakeLists.txt
+++ b/paddle/phi/api/lib/CMakeLists.txt
@@ -1,38 +1,3 @@
-if(WITH_GPU)
-  nv_library(
-    phi_tensor_raw
-    SRCS tensor.cc
-    DEPS tensor_base
-         dense_tensor
-         phi_enforce
-         context_pool
-         tensor_api
-         int_array
-         scalar)
-elseif(WITH_ROCM)
-  hip_library(
-    phi_tensor_raw
-    SRCS tensor.cc
-    DEPS tensor_base
-         dense_tensor
-         phi_enforce
-         context_pool
-         tensor_api
-         int_array
-         scalar)
-else()
-  cc_library(
-    phi_tensor_raw
-    SRCS tensor.cc
-    DEPS tensor_base
-         dense_tensor
-         phi_enforce
-         context_pool
-         tensor_api
-         int_array
-         scalar)
-endif()
-
 set(api_gen_base ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/generator/api_base.py)
 
 # forward api file
@@ -157,157 +122,77 @@ if(NOT PYTHONINTERP_FOUND)
   find_package(PythonInterp REQUIRED)
 endif()
 
+execute_process(COMMAND ${PYTHON_EXECUTABLE} -m pip install pyyaml)
+
 # generate forward api
-add_custom_command(
-  OUTPUT ${api_header_file} ${api_source_file}
-  COMMAND ${PYTHON_EXECUTABLE} -m pip install pyyaml
+execute_process(
   COMMAND
     ${PYTHON_EXECUTABLE} ${api_gen_file} --api_yaml_path ${api_yaml_file}
     ${legacy_api_yaml_file} --api_header_path ${api_header_file_tmp}
-    --api_source_path ${api_source_file_tmp}
-  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${api_header_file_tmp}
-          ${api_header_file}
-  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${api_source_file_tmp}
-          ${api_source_file}
-  COMMENT "copy_if_different ${api_header_file} ${api_source_file}"
-  DEPENDS ${api_yaml_file} ${legacy_api_yaml_file} ${api_gen_file}
-          ${api_gen_base}
-  VERBATIM)
+    --api_source_path ${api_source_file_tmp})
 
 # generate backward api
-add_custom_command(
-  OUTPUT ${bw_api_header_file} ${bw_api_source_file} ${bw_api_header_file_tmp}
-         ${bw_api_source_file_tmp}
+execute_process(
   COMMAND
     ${PYTHON_EXECUTABLE} ${bw_api_gen_file} --backward_yaml_path
     ${bw_api_yaml_file} ${legacy_bw_api_yaml_file} --backward_header_path
-    ${bw_api_header_file_tmp} --backward_source_path ${bw_api_source_file_tmp}
-  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${bw_api_header_file_tmp}
-          ${bw_api_header_file}
-  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${bw_api_source_file_tmp}
-          ${bw_api_source_file}
-  COMMENT "copy_if_different ${bw_api_header_file} ${bw_api_source_file}"
-  DEPENDS ${bw_api_yaml_file} ${bw_api_gen_file} ${api_gen_base}
-          ${legacy_bw_api_yaml_file}
-  VERBATIM)
+    ${bw_api_header_file_tmp} --backward_source_path ${bw_api_source_file_tmp})
 
 # generate fused_op api
-add_custom_command(
-  OUTPUT ${fused_api_header_file} ${fused_api_source_file}
-  COMMAND ${PYTHON_EXECUTABLE} -m pip install pyyaml
+execute_process(
   COMMAND
     ${PYTHON_EXECUTABLE} ${api_gen_file} --api_yaml_path ${fused_api_yaml_file}
     --is_fused_ops_yaml --api_header_path ${fused_api_header_file_tmp}
-    --api_source_path ${fused_api_source_file_tmp}
-  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${fused_api_header_file_tmp}
-          ${fused_api_header_file}
-  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${fused_api_source_file_tmp}
-          ${fused_api_source_file}
-  COMMENT "copy_if_different ${fused_api_header_file} ${fused_api_source_file}"
-  DEPENDS ${fused_api_yaml_file} ${api_gen_file} ${api_gen_base}
-  VERBATIM)
+    --api_source_path ${fused_api_source_file_tmp})
 
 # generate fused_op backward api
-add_custom_command(
-  OUTPUT ${fused_bw_api_header_file} ${fused_bw_api_source_file}
-         ${fused_bw_api_header_file_tmp} ${fused_bw_api_source_file_tmp}
+execute_process(
   COMMAND
     ${PYTHON_EXECUTABLE} ${fused_bw_api_gen_file} --backward_yaml_path
     ${fused_bw_api_yaml_file} --is_fused_backward_yaml --backward_header_path
     ${fused_bw_api_header_file_tmp} --backward_source_path
-    ${fused_bw_api_source_file_tmp}
-  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${fused_bw_api_header_file_tmp}
-          ${fused_bw_api_header_file}
-  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${fused_bw_api_source_file_tmp}
-          ${fused_bw_api_source_file}
-  COMMENT
-    "copy_if_different ${fused_bw_api_header_file} ${fused_bw_api_source_file}"
-  DEPENDS ${fused_bw_api_yaml_file} ${bw_api_gen_file} ${api_gen_base}
-  VERBATIM)
+    ${fused_bw_api_source_file_tmp})
 
 # generate sparse api
-add_custom_command(
-  OUTPUT ${sparse_api_header_file} ${sparse_api_source_file}
+execute_process(
   COMMAND
     ${PYTHON_EXECUTABLE} ${sparse_api_gen_file} --api_yaml_path
     ${sparse_api_yaml_file} --api_header_path ${sparse_api_header_file_tmp}
-    --api_source_path ${sparse_api_source_file_tmp}
-  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${sparse_api_header_file_tmp}
-          ${sparse_api_header_file}
-  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${sparse_api_source_file_tmp}
-          ${sparse_api_source_file}
-  COMMENT
-    "copy_if_different ${sparse_api_header_file} ${sparse_sparse_api_source_file}"
-  DEPENDS ${sparse_api_yaml_file} ${sparse_api_gen_file} ${api_gen_base}
-          ${api_gen_file}
-  VERBATIM)
+    --api_source_path ${sparse_api_source_file_tmp})
 
 # generate backward sparse api
-add_custom_command(
-  OUTPUT ${sparse_bw_api_header_file} ${sparse_bw_api_source_file}
+execute_process(
   COMMAND
     ${PYTHON_EXECUTABLE} ${sparse_bw_api_gen_file} --api_yaml_path
     ${sparse_bw_api_yaml_file} --api_header_path
     ${sparse_bw_api_header_file_tmp} --api_source_path
-    ${sparse_bw_api_source_file_tmp}
-  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${sparse_bw_api_header_file_tmp}
-          ${sparse_bw_api_header_file}
-  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${sparse_bw_api_source_file_tmp}
-          ${sparse_bw_api_source_file}
-  COMMENT
-    "copy_if_different ${sparse_bw_api_header_file} ${sparse_bw_sparse_api_source_file}"
-  DEPENDS ${sparse_bw_api_yaml_file} ${sparse_bw_api_gen_file} ${api_gen_base}
-          ${api_gen_file} ${sparse_api_gen_file} ${bw_api_gen_file}
-  VERBATIM)
+    ${sparse_bw_api_source_file_tmp})
 
 # generate strings api
-add_custom_command(
-  OUTPUT ${strings_api_header_file} ${strings_api_source_file}
+execute_process(
   COMMAND
     ${PYTHON_EXECUTABLE} ${strings_api_gen_file} --api_yaml_path
     ${strings_api_yaml_file} --api_header_path ${strings_api_header_file_tmp}
-    --api_source_path ${strings_api_source_file_tmp}
-  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${strings_api_header_file_tmp}
-          ${strings_api_header_file}
-  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${strings_api_source_file_tmp}
-          ${strings_api_source_file}
-  COMMENT
-    "copy_if_different ${strings_api_header_file} ${strings_strings_api_source_file}"
-  DEPENDS ${strings_api_yaml_file} ${strings_api_gen_file} ${api_gen_base}
-          ${api_gen_file}
-  VERBATIM)
+    --api_source_path ${strings_api_source_file_tmp})
 
 # generate dygraph(intermediate) api
-add_custom_command(
-  OUTPUT ${dygraph_api_header_file} ${dygraph_api_source_file}
+execute_process(
   COMMAND
     ${PYTHON_EXECUTABLE} ${im_api_gen_file} --api_yaml_path ${api_yaml_file}
     ${legacy_api_yaml_file} --sparse_api_yaml_path ${sparse_api_yaml_file}
     --dygraph_api_header_path ${dygraph_api_header_file_tmp}
-    --dygraph_api_source_path ${dygraph_api_source_file_tmp}
-  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${dygraph_api_header_file_tmp}
-          ${dygraph_api_header_file}
-  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${dygraph_api_source_file_tmp}
-          ${dygraph_api_source_file}
-  DEPENDS ${api_yaml_file} ${legacy_api_yaml_file} ${sparse_api_yaml_file}
-          ${im_api_gen_file} ${api_gen_base} ${api_gen_file}
-  VERBATIM)
+    --dygraph_api_source_path ${dygraph_api_source_file_tmp})
 
 # generate wrapped infermeta
-add_custom_command(
-  OUTPUT ${wrapped_infermeta_header_file} ${wrapped_infermeta_source_file}
+execute_process(
   COMMAND
     ${PYTHON_EXECUTABLE} ${wrapped_infermeta_gen_file} --api_yaml_path
     ${api_yaml_file} ${legacy_api_yaml_file} --wrapped_infermeta_header_path
     ${wrapped_infermeta_header_file} --wrapped_infermeta_source_path
-    ${wrapped_infermeta_source_file}
-  DEPENDS ${api_yaml_file} ${legacy_api_yaml_file} ${wrapped_infermeta_gen_file}
-          ${api_gen_base}
-  VERBATIM)
+    ${wrapped_infermeta_source_file})
 
 # generate tensor and tensor operants file
 message("create or copy auto-geneated tensor files")
-execute_process(COMMAND ${PYTHON_EXECUTABLE} -m pip install pyyaml)
 execute_process(
   WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/generator
   COMMAND
@@ -324,154 +209,70 @@ if(${_result})
   message(FATAL_ERROR "tensor codegen failed, exiting.")
 endif()
 
-set(generated_tensor_files
-    "${operants_base_file}" "${tensor_api_source_file}"
-    "${phi_tensor_operants_header_file}" "${phi_tensor_operants_source_file}"
-    "${operants_manager_header_file}" "${operants_manager_source_file}")
+set(generated_files
+    "${operants_base_file}"
+    "${tensor_api_source_file}"
+    "${phi_tensor_operants_header_file}"
+    "${phi_tensor_operants_source_file}"
+    "${operants_manager_header_file}"
+    "${operants_manager_source_file}"
+    "${wrapped_infermeta_source_file}"
+    "${api_source_file}"
+    "${api_header_file}"
+    "${bw_api_source_file}"
+    "${bw_api_header_file}"
+    "${fused_api_source_file}"
+    "${fused_api_header_file}"
+    "${fused_bw_api_source_file}"
+    "${fused_bw_api_header_file}"
+    "${sparse_api_source_file}"
+    "${sparse_api_header_file}"
+    "${sparse_bw_api_source_file}"
+    "${sparse_bw_api_header_file}"
+    "${dygraph_api_source_file}"
+    "${dygraph_api_header_file}"
+    "${strings_api_source_file}"
+    "${strings_api_header_file}")
 
-foreach(generated_tensor_file ${generated_tensor_files})
-  if(EXISTS "${generated_tensor_file}.tmp" AND EXISTS
-                                               "${generated_tensor_file}")
-    execute_process(
-      COMMAND ${CMAKE_COMMAND} -E copy_if_different
-              "${generated_tensor_file}.tmp" "${generated_tensor_file}")
-    message(
-      "copy if different ${generated_tensor_file}.tmp ${generated_tensor_file}")
-  elseif(EXISTS "${generated_tensor_file}.tmp")
-    execute_process(
-      COMMAND ${CMAKE_COMMAND} -E copy "${generated_tensor_file}.tmp"
-              "${generated_tensor_file}")
-    message("copy ${generated_tensor_file}.tmp ${generated_tensor_file}")
+foreach(generated_file ${generated_files})
+  if(EXISTS "${generated_file}.tmp" AND EXISTS "${generated_file}")
+    execute_process(COMMAND ${CMAKE_COMMAND} -E copy_if_different
+                            "${generated_file}.tmp" "${generated_file}")
+    message("copy if different ${generated_file}.tmp ${generated_file}")
+  elseif(EXISTS "${generated_file}.tmp")
+    execute_process(COMMAND ${CMAKE_COMMAND} -E copy "${generated_file}.tmp"
+                            "${generated_file}")
+    message("copy ${generated_file}.tmp ${generated_file}")
   endif()
 endforeach()
 
-cc_library(
-  op_meta_info
-  SRCS op_meta_info.cc
-  DEPS phi_tensor_raw)
-cc_library(
-  wrapped_infermeta
-  SRCS ${wrapped_infermeta_source_file}
-  DEPS phi)
-cc_library(
-  context_pool
-  SRCS context_pool.cc
-  DEPS phi_backends phi_enforce place init phi_device_context)
-cc_library(
-  api_tensor_utils
-  SRCS tensor_utils.cc
-  DEPS phi_tensor_raw)
-
-cc_library(
-  kernel_dispatch
-  SRCS kernel_dispatch.cc
-  DEPS phi_tensor_raw phi_backends kernel_factory context_pool)
-cc_library(
-  api_gen_utils
-  SRCS api_gen_utils.cc
-  DEPS phi_tensor_raw selected_rows sparse_csr_tensor sparse_coo_tensor
-       infermeta_utils)
-cc_library(
-  phi_data_transform
-  SRCS data_transform.cc
-  DEPS phi_tensor_raw phi tensor)
-cc_library(
-  api_custom_impl
-  SRCS api_custom_impl.cc
-  DEPS phi_tensor_raw
-       phi
-       kernel_dispatch
-       api_gen_utils
-       backward_infermeta
-       phi_data_transform
-       phi_profiler)
-cc_library(
-  phi_function_api
-  SRCS ${api_source_file} ${fused_api_source_file}
-  DEPS phi_tensor_raw
-       phi
-       kernel_dispatch
-       api_gen_utils
-       phi_data_transform
-       api_custom_impl
-       api_tensor_utils
-       phi_profiler)
-cc_library(
-  phi_bw_function_api
-  SRCS ${bw_api_source_file} ${fused_bw_api_source_file}
-  DEPS phi_tensor_raw
-       phi
-       kernel_dispatch
-       api_gen_utils
-       backward_infermeta
-       sparse_backward_infermeta
-       phi_data_transform
-       phi_function_api
-       api_custom_impl
-       global_utils
-       phi_profiler)
-cc_library(
-  sparse_api
-  SRCS ${sparse_api_source_file}
-  DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_profiler)
-cc_library(
-  sparse_bw_api
-  SRCS ${sparse_bw_api_source_file}
-  DEPS phi_tensor_raw
-       phi
-       kernel_dispatch
-       api_gen_utils
-       sparse_api
-       sparse_backward_infermeta
-       phi_profiler)
-cc_library(
-  phi_dygraph_api
-  SRCS ${dygraph_api_source_file}
-  DEPS phi_tensor_raw
-       phi
-       kernel_dispatch
-       api_gen_utils
-       phi_data_transform
-       phi_function_api
-       sparse_api
-       phi_profiler)
-cc_library(
-  strings_api
-  SRCS ${strings_api_source_file}
-  DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_profiler)
-cc_library(
-  phi_tensor
-  SRCS tensor_method.cc
-  DEPS phi_tensor_raw
-       phi_function_api
-       api_gen_utils
-       kernel_dispatch
-       infermeta
-       sparse_infermeta
-       sparse_api
-       strings_api)
-cc_library(
-  tensor_copy
-  SRCS tensor_copy.cc
-  DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils)
-cc_library(
-  api_scalar
-  SRCS scalar.cc
-  DEPS tensor_copy)
-cc_library(
-  api_int_array
-  SRCS int_array.cc
-  DEPS tensor_copy)
-
-cc_library(
-  phi_tensor_operants
-  SRCS ${phi_tensor_operants_source_file}
-  DEPS phi_function_api)
-cc_library(
-  operants_manager
-  SRCS ${operants_manager_source_file}
-  DEPS phi_enforce)
-cc_library(
-  tensor_api
-  SRCS ${tensor_api_source_file}
-  DEPS operants_manager)
+collect_srcs(
+  api_srcs
+  SRCS
+  tensor.cc
+  op_meta_info.cc
+  context_pool.cc
+  tensor_utils.cc
+  kernel_dispatch.cc
+  api_gen_utils.cc
+  data_transform.cc
+  api_custom_impl.cc
+  tensor_method.cc
+  tensor_copy.cc
+  scalar.cc
+  int_array.cc)
+collect_generated_srcs(
+  api_srcs
+  SRCS
+  ${wrapped_infermeta_source_file}
+  ${api_source_file}
+  ${bw_api_source_file}
+  ${fused_api_source_file}
+  ${fused_bw_api_source_file}
+  ${sparse_api_source_file}
+  ${sparse_bw_api_source_file}
+  ${dygraph_api_source_file}
+  ${strings_api_source_file}
+  ${phi_tensor_operants_source_file}
+  ${operants_manager_source_file}
+  ${tensor_api_source_file})
diff --git a/paddle/phi/api/lib/context_pool.cc b/paddle/phi/api/lib/context_pool.cc
index 6b5b7790e74..292bd8a7e47 100644
--- a/paddle/phi/api/lib/context_pool.cc
+++ b/paddle/phi/api/lib/context_pool.cc
@@ -65,11 +65,12 @@ PADDLE_API phi::Allocator* GetAllocator(const phi::Place& place) {
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 PADDLE_API phi::CUDAStream* GetCurrentCUDAStream(const phi::Place& place) {
-  PADDLE_ENFORCE(place.GetType() == phi::AllocationType::GPU,
-                 phi::errors::InvalidArgument(
-                     "GetCurrentCUDAStream only supports GPUPlace input. "
-                     "However, your input is place=%s",
-                     place));
+  PADDLE_ENFORCE_EQ(place.GetType(),
+                    phi::AllocationType::GPU,
+                    phi::errors::InvalidArgument(
+                        "GetCurrentCUDAStream only supports GPUPlace input. "
+                        "However, your input is place=%s",
+                        place));
 
   auto& pool = paddle::experimental::DeviceContextPool::Instance();
   const phi::GPUContext* dev_ctx =
diff --git a/paddle/phi/api/lib/op_meta_info.cc b/paddle/phi/api/lib/op_meta_info.cc
index e1221969cf2..90335269536 100644
--- a/paddle/phi/api/lib/op_meta_info.cc
+++ b/paddle/phi/api/lib/op_meta_info.cc
@@ -119,6 +119,11 @@ void CustomOpKernelContext::EmplaceBackAttr(paddle::any attr) {
           << " has value of type: " << attrs_[attrs_.size() - 1].type().name();
 }
 
+void CustomOpKernelContext::EmplaceBackAttrs(
+    const std::vector<paddle::any>& attrs) {
+  attrs_ = attrs;  // const ref: std::move could not move, so copy plainly
+}
+
 const Tensor& CustomOpKernelContext::InputAt(size_t idx) const {
   return inputs_.at(idx);
 }
@@ -132,6 +137,10 @@ std::vector<Tensor> CustomOpKernelContext::InputsBetween(size_t start,
   return rlt;
 }
 
+const std::vector<paddle::any>& CustomOpKernelContext::Attrs() const {
+  return attrs_;
+}
+
 Tensor& CustomOpKernelContext::MutableInputAt(size_t idx) {
   return inputs_.at(idx);
 }
@@ -193,6 +202,16 @@ const std::pair<size_t, size_t>& CustomOpKernelContext::OutputRangeAt(
   return output_range_.at(idx);
 }
 
+const std::vector<std::pair<size_t, size_t>>&
+CustomOpKernelContext::InputRange() {
+  return input_range_;
+}
+
+const std::vector<std::pair<size_t, size_t>>&
+CustomOpKernelContext::OutputRange() {
+  return output_range_;
+}
+
 void CustomOpKernelContext::ConstructInplaceIndex(
     const std::vector<std::string>& inputs,
     const std::vector<std::string>& outputs,
@@ -208,8 +227,9 @@ void CustomOpKernelContext::ConstructInplaceIndex(
       continue;
     }
     auto out_iter = find(outputs.begin(), outputs.end(), inplace_map.at(input));
-    PADDLE_ENFORCE(
-        out_iter != outputs.end(),
+    PADDLE_ENFORCE_NE(
+        out_iter,
+        outputs.end(),
         phi::errors::NotFound("Can't find the mapped value of %s, please check "
                               "the input of `Inplace` again and make "
                               "sure you registered your op accurately. ",
@@ -253,8 +273,9 @@ void CustomOpKernelContext::AssignInplaceOutputs() {
     size_t out_start_idx = output_range_[pair.second].first;
     size_t out_end_idx = output_range_[pair.second].second;
     size_t assign_tensor_size = in_end_idx - in_start_idx;
-    PADDLE_ENFORCE(
-        assign_tensor_size == out_end_idx - out_start_idx,
+    PADDLE_ENFORCE_EQ(
+        assign_tensor_size,
+        out_end_idx - out_start_idx,
         phi::errors::OutOfRange("When assigning inplaced tensor, Input vector "
                                 "size %d mismatch output vector size %d",
                                 in_end_idx - in_start_idx,
@@ -316,6 +337,43 @@ OpMetaInfo& OpMetaInfo::SetInferDtypeFn(InferDtypeFunc&& func) {
   return *this;
 }
 
+//////////////// Op Meta Info Helper /////////////////
+const std::string& OpMetaInfoHelper::GetOpName(const paddle::OpMetaInfo& info) {
+  return info.name_;
+}
+const std::vector<std::string>& OpMetaInfoHelper::GetInputs(
+    const paddle::OpMetaInfo& info) {
+  return info.inputs_;
+}
+const std::vector<std::string>& OpMetaInfoHelper::GetOutputs(
+    const paddle::OpMetaInfo& info) {
+  return info.outputs_;
+}
+const std::vector<std::string>& OpMetaInfoHelper::GetAttrs(
+    const paddle::OpMetaInfo& info) {
+  return info.attrs_;
+}
+const std::unordered_map<std::string, std::string>&
+OpMetaInfoHelper::GetInplaceMap(const paddle::OpMetaInfo& info) {
+  return info.inplace_map_;
+}
+const std::unordered_map<std::string, std::string>&
+OpMetaInfoHelper::GetInplaceReverseMap(const paddle::OpMetaInfo& info) {
+  return info.inplace_reverse_map_;
+}
+const KernelFunc& OpMetaInfoHelper::GetKernelFn(
+    const paddle::OpMetaInfo& info) {
+  return info.kernel_fn_;
+}
+const InferShapeFunc& OpMetaInfoHelper::GetInferShapeFn(
+    const paddle::OpMetaInfo& info) {
+  return info.infer_shape_fn_;
+}
+const InferDtypeFunc& OpMetaInfoHelper::GetInferDtypeFn(
+    const paddle::OpMetaInfo& info) {
+  return info.infer_dtype_fn_;
+}
+
 //////////////// Op Meta Info Map /////////////////
 
 std::vector<OpMetaInfo>& OpMetaInfoMap::operator[](const std::string& name) {
@@ -414,21 +472,23 @@ OpMetaInfoBuilder& OpMetaInfoBuilder::SetInplaceMap(
   const std::vector<std::string>& outputs =
       OpMetaInfoHelper::GetOutputs(*info_ptr_);
   for (const auto& pair : inplace_map) {
-    PADDLE_ENFORCE(
-        std::find(inputs.begin(), inputs.end(), pair.first) != inputs.cend(),
+    PADDLE_ENFORCE_NE(
+        std::find(inputs.begin(), inputs.end(), pair.first),
+        inputs.cend(),
         phi::errors::PreconditionNotMet(
             "The register of operator %s's `SetInplaceMap` failed. "
             "Please make sure: 1. Call `Inputs` and `Outputs` before "
             "`SetInplaceMap`; 2. The keys of inplace_map are inside `Inputs`",
             name_));
-    PADDLE_ENFORCE(std::find(outputs.begin(), outputs.end(), pair.second) !=
-                       outputs.cend(),
-                   phi::errors::PreconditionNotMet(
-                       "The register of operator %s's `SetInplaceMap` failed. "
-                       "Please make sure: 1. Call `Inputs` and `Outputs` "
-                       "before `SetInplaceMap`; 2. The values of inplace_map "
-                       "are inside `Outputs`",
-                       name_));
+    PADDLE_ENFORCE_NE(
+        std::find(outputs.begin(), outputs.end(), pair.second),
+        outputs.cend(),
+        phi::errors::PreconditionNotMet(
+            "The register of operator %s's `SetInplaceMap` failed. "
+            "Please make sure: 1. Call `Inputs` and `Outputs` "
+            "before `SetInplaceMap`; 2. The values of inplace_map "
+            "are inside `Outputs`",
+            name_));
   }
   info_ptr_->SetInplaceMap(
       std::forward<std::unordered_map<std::string, std::string>>(inplace_map));
diff --git a/paddle/phi/api/lib/tensor.cc b/paddle/phi/api/lib/tensor.cc
index 634c37933cb..e9c68367b16 100644
--- a/paddle/phi/api/lib/tensor.cc
+++ b/paddle/phi/api/lib/tensor.cc
@@ -358,6 +358,10 @@ gpuStream_t Tensor::stream() const {
 }
 #endif
 
+const std::string &Tensor::name() const { return name_; }
+
+void Tensor::set_name(const std::string &name) { name_ = name; }
+
 /* Part 5: Status utils methods */
 
 bool Tensor::defined() const { return impl_ != nullptr; }
diff --git a/paddle/phi/api/profiler/CMakeLists.txt b/paddle/phi/api/profiler/CMakeLists.txt
index 14e3ace536a..ec569fe9fbc 100644
--- a/paddle/phi/api/profiler/CMakeLists.txt
+++ b/paddle/phi/api/profiler/CMakeLists.txt
@@ -26,16 +26,4 @@ if(WITH_PYTHON AND EXISTS ${PADDLE_BINARY_DIR})
   endif()
 endif()
 
-if(WITH_GPU OR WITH_ROCM)
-  set(GPU_CTX_DEPS dynload_cuda dynamic_loader)
-endif()
-
-cc_library(
-  phi_device_tracer
-  SRCS device_tracer.cc
-  DEPS phi_profiler_proto ${GPU_CTX_DEPS})
-
-cc_library(
-  phi_profiler
-  SRCS profiler.cc
-  DEPS phi_os_info phi_device_tracer phi_enforce)
+collect_srcs(api_srcs SRCS device_tracer.cc profiler.cc)
diff --git a/paddle/phi/backends/CMakeLists.txt b/paddle/phi/backends/CMakeLists.txt
index 828437c8f2a..3ec479398a2 100644
--- a/paddle/phi/backends/CMakeLists.txt
+++ b/paddle/phi/backends/CMakeLists.txt
@@ -2,17 +2,6 @@ add_subdirectory(dynload)
 add_subdirectory(gpu)
 
 set(BACKENDS_SRCS all_context.cc cpu/cpu_context.cc cpu/cpu_info.cc)
-set(BACKENDS_DEPS
-    enforce
-    place
-    flags
-    eigen3
-    phi_device_context
-    generator
-    phi_os_info)
-if(WITH_XBYAK)
-  list(APPEND BACKENDS_DEPS xbyak)
-endif()
 
 if(NOT APPLE AND NOT WIN32)
   list(APPEND BACKENDS_SRCS device_code.cc)
@@ -23,16 +12,10 @@ if(WITH_GPU OR WITH_ROCM)
        gpu/gpu_resources.cc)
   if(WITH_GPU)
     list(APPEND BACKENDS_SRCS gpu/cuda/cuda_info.cc gpu/cuda/cuda_graph.cc)
-    set_source_files_properties(
-      gpu/gpu_resources.cc
-      PROPERTIES COMPILE_FLAGS
-                 "-DCUDA_REAL_ARCHS=\"${NVCC_FLAGS_EXTRA_real_archs}\"")
-
   endif()
   if(WITH_ROCM)
     list(APPEND BACKENDS_SRCS gpu/rocm/rocm_info.cc)
   endif()
-  list(APPEND BACKENDS_DEPS phi_dynload_cuda)
 endif()
 
 if(WITH_XPU)
@@ -45,7 +28,6 @@ if(WITH_MKLDNN)
   list(APPEND BACKENDS_SRCS onednn/onednn_context.cc)
   list(APPEND BACKENDS_SRCS onednn/axpy_handler.cc)
   list(APPEND BACKENDS_SRCS onednn/matmul_utils.cc)
-  list(APPEND BACKENDS_DEPS mkldnn)
 endif()
 
 list(
@@ -55,26 +37,25 @@ list(
   device_guard.cc
   stream.cc
   event.cc
-  device_base.cc
   device_manager.cc
   context_pool.cc)
 
+if(WITH_GPU
+   OR WITH_ROCM
+   OR WITH_CUSTOM_DEVICE)
+  list(APPEND BACKENDS_SRCS device_base.cc)
+endif()
+
 if(WITH_CUSTOM_DEVICE)
   list(APPEND BACKENDS_SRCS custom/custom_context.cc custom/custom_device.cc
        custom/custom_device_op_list.cc)
 endif()
 
-add_library(phi_backends "${BACKENDS_SRCS}")
-target_link_libraries(phi_backends ${BACKENDS_DEPS})
-
-# for inference library
-get_property(phi_modules GLOBAL PROPERTY PHI_MODULES)
-set(phi_modules ${phi_modules} phi_backends)
-set_property(GLOBAL PROPERTY PHI_MODULES "${phi_modules}")
+collect_srcs(backends_srcs SRCS ${BACKENDS_SRCS})
 
 if(WITH_CUSTOM_DEVICE)
   cc_test(
     capi_test
     SRCS custom/capi_test.cc
-    DEPS phi_capi)
+    DEPS phi)
 endif()
diff --git a/paddle/phi/backends/cpu/cpu_context.cc b/paddle/phi/backends/cpu/cpu_context.cc
index d42189e00ee..4538a96dc99 100644
--- a/paddle/phi/backends/cpu/cpu_context.cc
+++ b/paddle/phi/backends/cpu/cpu_context.cc
@@ -24,6 +24,10 @@
 
 namespace phi {
 
+template <>
+const TypeInfo<DeviceContext> TypeInfoTraits<DeviceContext, CPUContext>::kType =
+    RegisterStaticType<DeviceContext>(CPUContext::name());
+
 struct CPUContext::Impl {
   Impl() : place_(CPUPlace()) {}
 
diff --git a/paddle/phi/backends/custom/custom_context.cc b/paddle/phi/backends/custom/custom_context.cc
index ddba0baea7e..d382c766cfd 100644
--- a/paddle/phi/backends/custom/custom_context.cc
+++ b/paddle/phi/backends/custom/custom_context.cc
@@ -19,6 +19,11 @@ limitations under the License. */
 
 namespace phi {
 
+template <>
+const TypeInfo<DeviceContext>
+    TypeInfoTraits<DeviceContext, CustomContext>::kType =
+        RegisterStaticType<DeviceContext>(CustomContext::name());
+
 struct CustomContext::Impl {
   explicit Impl(const CustomPlace& place) : place_(place) {}
 
diff --git a/paddle/phi/backends/dynload/CMakeLists.txt b/paddle/phi/backends/dynload/CMakeLists.txt
index 2d4e84beb69..568c54cb342 100644
--- a/paddle/phi/backends/dynload/CMakeLists.txt
+++ b/paddle/phi/backends/dynload/CMakeLists.txt
@@ -1,8 +1,8 @@
-cc_library(
-  phi_dynamic_loader
-  SRCS dynamic_loader.cc port.cc
-  DEPS enforce glog gflags)
-
+set(DYNLOAD_COMMON_SRCS dynamic_loader.cc port.cc warpctc.cc warprnnt.cc
+                        lapack.cc)
+if(WITH_ASCEND_CL)
+  list(REMOVE_ITEM DYNLOAD_COMMON_SRCS warprnnt.cc)
+endif()
 list(
   APPEND
   CUDA_SRCS
@@ -60,66 +60,39 @@ configure_file(cupti_lib_path.h.in ${CMAKE_CURRENT_BINARY_DIR}/cupti_lib_path.h)
 if(CUPTI_FOUND)
   list(APPEND CUDA_SRCS cupti.cc)
 endif()
-if(WITH_ROCM)
-  hip_library(
-    phi_dynload_cuda
-    SRCS ${HIP_SRCS}
-    DEPS phi_dynamic_loader)
-  cc_library(
-    phi_dynload_warpctc
-    SRCS warpctc.cc
-    DEPS phi_dynamic_loader warpctc)
-  cc_library(
-    phi_dynload_warprnnt
-    SRCS warprnnt.cc
-    DEPS phi_dynamic_loader warprnnt)
-else()
-  nv_library(
-    phi_dynload_cuda
-    SRCS ${CUDA_SRCS}
-    DEPS phi_dynamic_loader)
-  cc_library(
-    phi_dynload_warpctc
-    SRCS warpctc.cc
-    DEPS phi_dynamic_loader warpctc)
-  cc_library(
-    phi_dynload_warprnnt
-    SRCS warprnnt.cc
-    DEPS phi_dynamic_loader warprnnt)
-endif()
+
 if(WITH_MKLML)
-  cc_library(
-    phi_dynload_mklml
-    SRCS mklml.cc
-    DEPS phi_dynamic_loader mklml)
+  # Only add a build dependency on libmklml.so; do not link against it
+  add_library(dynload_mklml STATIC mklml.cc)
+  add_dependencies(dynload_mklml mklml)
+  if(WIN32)
+    target_link_libraries(dynload_mklml ${MKLML_IOMP_LIB})
+  else()
+    target_link_libraries(dynload_mklml
+                          "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed")
+  endif()
 endif()
 
 if(WITH_FLASHATTN)
-  cc_library(
-    phi_dynload_flashattn
-    SRCS flashattn.cc
-    DEPS phi_dynamic_loader flashattn)
+  list(APPEND DYNLOAD_COMMON_SRCS flashattn.cc)
 endif()
 
-cc_library(
-  phi_dynload_lapack
-  SRCS lapack.cc
-  DEPS phi_dynamic_loader)
-add_dependencies(phi_dynload_lapack extern_lapack)
-# TODO(TJ): add iomp, mkldnn?
-
 if(MKL_FOUND AND WITH_ONEMKL)
   message("ONEMKL INCLUDE directory is ${MKL_INCLUDE}")
-  cc_library(
-    phi_dynload_mklrt
-    SRCS mklrt.cc
-    DEPS phi_dynamic_loader)
-  target_include_directories(phi_dynload_mklrt PRIVATE ${MKL_INCLUDE})
+  list(APPEND DYNLOAD_COMMON_SRCS mklrt.cc)
+endif()
+
+if(WITH_ROCM)
+  collect_srcs(backends_srcs SRCS ${DYNLOAD_COMMON_SRCS} ${HIP_SRCS})
+elseif(WITH_GPU)
+  collect_srcs(backends_srcs SRCS ${DYNLOAD_COMMON_SRCS} ${CUDA_SRCS})
+else()
+  collect_srcs(backends_srcs SRCS ${DYNLOAD_COMMON_SRCS})
 endif()
 
 if(WITH_CUDNN_FRONTEND)
   nv_test(
     cudnn_frontend_test
     SRCS cudnn_frontend_test.cc
-    DEPS phi_dynload_cuda cudnn-frontend)
+    DEPS phi cudnn-frontend)
 endif()
diff --git a/paddle/phi/backends/gpu/cuda/CMakeLists.txt b/paddle/phi/backends/gpu/cuda/CMakeLists.txt
index 7768cdd1161..6f138d4a0dd 100644
--- a/paddle/phi/backends/gpu/cuda/CMakeLists.txt
+++ b/paddle/phi/backends/gpu/cuda/CMakeLists.txt
@@ -1 +1 @@
-cc_library(cudnn_workspace_helper SRCS cudnn_workspace_helper.cc)
+collect_srcs(backends_srcs SRCS cudnn_workspace_helper.cc)
diff --git a/paddle/phi/backends/gpu/gpu_context.cc b/paddle/phi/backends/gpu/gpu_context.cc
index 5c9c010d365..5ab7019e601 100644
--- a/paddle/phi/backends/gpu/gpu_context.cc
+++ b/paddle/phi/backends/gpu/gpu_context.cc
@@ -59,6 +59,15 @@ limitations under the License. */
 
 namespace phi {
 
+template <>
+const TypeInfo<DeviceContext> TypeInfoTraits<DeviceContext, GPUContext>::kType =
+    RegisterStaticType<DeviceContext>(GPUContext::name());
+
+template <>
+const TypeInfo<DeviceContext>
+    TypeInfoTraits<DeviceContext, GPUPinnedContext>::kType =
+        RegisterStaticType<DeviceContext>(GPUPinnedContext::name());
+
 namespace internal {
 
 class EigenGpuStreamDevice : public Eigen::StreamInterface {
diff --git a/paddle/phi/backends/gpu/gpu_context.h b/paddle/phi/backends/gpu/gpu_context.h
index 0b72f8b30c0..ef7df28d9a9 100644
--- a/paddle/phi/backends/gpu/gpu_context.h
+++ b/paddle/phi/backends/gpu/gpu_context.h
@@ -15,6 +15,8 @@ limitations under the License. */
 
 #pragma once
 
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+
 #include <array>
 #include <functional>
 #include <mutex>
@@ -305,3 +307,5 @@ class GPUPinnedContext
 };
 #endif
 }  // namespace phi
+
+#endif
diff --git a/paddle/phi/backends/onednn/onednn_context.cc b/paddle/phi/backends/onednn/onednn_context.cc
index 5095b5c234b..9ad73795da4 100644
--- a/paddle/phi/backends/onednn/onednn_context.cc
+++ b/paddle/phi/backends/onednn/onednn_context.cc
@@ -83,6 +83,11 @@ void OneDNNContextThreadLocals::Body::log_lib_version(void) {
   }
 }
 
+OneDNNContextThreadLocals::Body& OneDNNContextThreadLocals::fetch() {
+  thread_local Body b;
+  return b;
+}
+
 struct OneDNNContext::Impl {
   Impl() : p_blobmap_() {
     p_blobmap_.reset(new BlobMap());
@@ -462,5 +467,7 @@ const std::vector<std::string>& OneDNNContext::GetOutputsName(
   return impl_->GetOutputsName(output);
 }
 
+const char* OneDNNContext::name() { return "OneDNNContext"; }
+
 }  // namespace phi
 #endif
diff --git a/paddle/phi/backends/onednn/onednn_context.h b/paddle/phi/backends/onednn/onednn_context.h
index 79eaa05948c..8262a8bb290 100644
--- a/paddle/phi/backends/onednn/onednn_context.h
+++ b/paddle/phi/backends/onednn/onednn_context.h
@@ -76,10 +76,7 @@ class OneDNNContextThreadLocals {
   static constexpr size_t kMKLDNNSessionID_Default = 0;
   // mkldnn session id for cache clearing mode
   static constexpr size_t kMKLDNNSessionID_CacheClearing = -1;
-  static Body& fetch() {
-    thread_local Body b;
-    return b;
-  }
+  static Body& fetch();
 };
 
 class OneDNNContext : public CPUContext {
@@ -157,7 +154,7 @@ class OneDNNContext : public CPUContext {
   const std::vector<std::string>& GetOutputsName(
       const std::string& output) const;
 
-  static const char* name() { return "OneDNNContext"; }
+  static const char* name();
 
  private:
   struct Impl;
diff --git a/paddle/phi/backends/xpu/xpu_context.cc b/paddle/phi/backends/xpu/xpu_context.cc
index 44f247ff259..0c554270b51 100644
--- a/paddle/phi/backends/xpu/xpu_context.cc
+++ b/paddle/phi/backends/xpu/xpu_context.cc
@@ -30,6 +30,9 @@ namespace xpu = baidu::xpu::api;
 
 namespace phi {
 
+template <>
+const TypeInfo<DeviceContext> TypeInfoTraits<DeviceContext, XPUContext>::kType =
+    RegisterStaticType<DeviceContext>(XPUContext::name());
 struct XPUContext::Impl {
   void SetL3Cache(int l3_size = 14155776) {
     const int MAX_XPU_NUM = 16;
diff --git a/paddle/phi/capi/CMakeLists.txt b/paddle/phi/capi/CMakeLists.txt
index c00c38cfa3a..3ea7a4199b2 100644
--- a/paddle/phi/capi/CMakeLists.txt
+++ b/paddle/phi/capi/CMakeLists.txt
@@ -1,13 +1 @@
 add_subdirectory(lib)
-cc_library(
-  phi_capi
-  SRCS all.cc
-  DEPS phi_c_data_type
-       phi_c_device_context
-       phi_c_int_array
-       phi_c_kernel_context
-       phi_c_kernel_factory
-       phi_c_kernel_registry
-       phi_c_place
-       phi_c_scalar
-       phi_c_tensor)
diff --git a/paddle/phi/capi/all.cc b/paddle/phi/capi/all.cc
deleted file mode 100644
index 3d9c9315b31..00000000000
--- a/paddle/phi/capi/all.cc
+++ /dev/null
@@ -1,19 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/phi/capi/all.h"
-
-namespace paddle {
-namespace capi {}  // namespace capi
-}  // namespace paddle
diff --git a/paddle/phi/capi/lib/CMakeLists.txt b/paddle/phi/capi/lib/CMakeLists.txt
index 60afb74a6d4..8cf3c9caf8e 100644
--- a/paddle/phi/capi/lib/CMakeLists.txt
+++ b/paddle/phi/capi/lib/CMakeLists.txt
@@ -1,44 +1,12 @@
-cc_library(
-  phi_c_data_type
-  SRCS c_data_type.cc
-  DEPS dense_tensor)
-
-cc_library(
-  phi_c_device_context
-  SRCS c_device_context.cc
-  DEPS phi_backends)
-
-cc_library(
-  phi_c_int_array
-  SRCS c_int_array.cc
-  DEPS int_array)
-
-cc_library(
-  phi_c_kernel_context
-  SRCS c_kernel_context.cc
-  DEPS kernel_context)
-
-cc_library(
-  phi_c_kernel_factory
-  SRCS c_kernel_factory.cc
-  DEPS kernel_factory)
-
-cc_library(
-  phi_c_kernel_registry
-  SRCS c_kernel_registry.cc
-  DEPS dense_tensor)
-
-cc_library(
-  phi_c_place
-  SRCS c_place.cc
-  DEPS phi_place)
-
-cc_library(
-  phi_c_scalar
-  SRCS c_scalar.cc
-  DEPS scalar)
-
-cc_library(
-  phi_c_tensor
-  SRCS c_tensor.cc
-  DEPS dense_tensor)
+collect_srcs(
+  capi_srcs
+  SRCS
+  c_data_type.cc
+  c_device_context.cc
+  c_int_array.cc
+  c_kernel_context.cc
+  c_kernel_factory.cc
+  c_kernel_registry.cc
+  c_place.cc
+  c_scalar.cc
+  c_tensor.cc)
diff --git a/paddle/phi/common/CMakeLists.txt b/paddle/phi/common/CMakeLists.txt
index 67f6fa9729c..5fe96a2a682 100644
--- a/paddle/phi/common/CMakeLists.txt
+++ b/paddle/phi/common/CMakeLists.txt
@@ -1,26 +1 @@
-if(WITH_GPU)
-  nv_library(
-    phi_place
-    SRCS place.cc
-    DEPS phi_backends)
-elseif(WITH_ROCM)
-  hip_library(
-    phi_place
-    SRCS place.cc
-    DEPS phi_backends)
-else()
-  cc_library(phi_place SRCS place.cc)
-endif()
-
-cc_library(
-  scalar
-  SRCS scalar.cc
-  DEPS phi_enforce phi_tensor_utils)
-cc_library(
-  int_array
-  SRCS int_array.cc
-  DEPS phi_enforce phi_tensor_utils)
-cc_library(
-  memory_utils
-  SRCS memory_utils.cc
-  DEPS phi_enforce phi_place)
+collect_srcs(common_srcs SRCS place.cc scalar.cc int_array.cc memory_utils.cc)
diff --git a/paddle/phi/core/CMakeLists.txt b/paddle/phi/core/CMakeLists.txt
index 9cb2cec158a..3500d880907 100644
--- a/paddle/phi/core/CMakeLists.txt
+++ b/paddle/phi/core/CMakeLists.txt
@@ -6,150 +6,35 @@ if(WITH_GPU)
   proto_library(external_error_proto SRCS external_error.proto)
 endif()
 
-cc_library(
-  flags
-  SRCS flags.cc
-  DEPS gflags)
-
-cc_library(errors SRCS errors.cc)
-set(phi_enforce_deps errors flags)
-if(WITH_GPU)
-  set(phi_enforce_deps ${phi_enforce_deps} external_error_proto)
-endif()
-cc_library(
-  phi_enforce
-  SRCS enforce.cc
-  DEPS ${phi_enforce_deps})
-
-cc_library(
-  phi_os_info
-  SRCS os_info.cc
-  DEPS phi_enforce)
-
-if(WITH_XPU)
-  cc_library(
-    kernel_factory
-    SRCS kernel_factory.cc
-    DEPS phi_enforce convert_utils phi_backends)
-else()
-  cc_library(
-    kernel_factory
-    SRCS kernel_factory.cc
-    DEPS phi_enforce convert_utils)
-endif()
-cc_library(
-  kernel_context
-  SRCS kernel_context.cc
-  DEPS phi_enforce phi_backends)
-
-cc_library(
-  ddim
-  SRCS ddim.cc
-  DEPS phi_enforce)
-cc_library(
-  tensor_base
-  SRCS tensor_base.cc allocator.cc
-  DEPS phi_enforce)
-cc_library(
-  tensor_meta
-  SRCS tensor_meta.cc
-  DEPS phi_enforce)
-cc_library(
-  lod_utils
-  SRCS lod_utils.cc
-  DEPS phi_enforce)
-cc_library(
-  threadpool
-  SRCS threadpool.cc
-  DEPS phi_enforce)
-
-cc_library(
-  dense_tensor
-  SRCS dense_tensor.cc dense_tensor_impl.cc
-  DEPS convert_utils tensor_meta tensor_base ddim)
-
-target_link_libraries(dense_tensor memory_utils)
-
-cc_library(
-  sparse_coo_tensor
-  SRCS sparse_coo_tensor.cc
-  DEPS tensor_meta tensor_base)
-cc_library(
-  sparse_csr_tensor
-  SRCS sparse_csr_tensor.cc
-  DEPS dense_tensor tensor_base)
-cc_library(
-  string_tensor
-  SRCS string_tensor.cc
-  DEPS convert_utils tensor_meta tensor_base)
-
-cc_library(
-  tensor_array
-  SRCS tensor_array.cc
-  DEPS dense_tensor tensor_base)
-
-cc_library(
-  extended_tensor
-  SRCS extended_tensor.cc
-  DEPS tensor_base)
-
-cc_library(
-  meta_tensor
-  SRCS meta_tensor.cc
-  DEPS tensor_base tensor_meta dense_tensor)
-cc_library(
-  infermeta_utils
-  SRCS infermeta_utils.cc
-  DEPS meta_tensor)
-
-cc_library(
-  selected_rows
-  SRCS selected_rows_impl.cc selected_rows.cc
-  DEPS tensor_base dense_tensor phi_enforce ddim)
-cc_library(
-  phi_device_context
-  SRCS device_context.cc
-  DEPS dense_tensor selected_rows)
-
-cc_library(
-  custom_kernel
-  SRCS custom_kernel.cc
-  DEPS kernel_factory)
-
-cc_library(
-  mixed_vector
-  SRCS mixed_vector.cc
-  DEPS phi_backends place memory)
-
-cc_library(
-  generator
-  SRCS generator.cc
-  DEPS enforce place)
-
-# Will remove once we implemented MKLDNN_Tensor
-if(WITH_MKLDNN)
-  add_dependencies(dense_tensor mkldnn)
-  add_dependencies(tensor_base mkldnn)
-endif()
-
-if(WITH_GPU)
-  nv_library(
-    phi_tensor_utils
-    SRCS tensor_utils.cc
-    DEPS phi_backends dense_tensor selected_rows memcpy memory_utils)
-elseif(WITH_ROCM)
-  hip_library(
-    phi_tensor_utils
-    SRCS tensor_utils.cc
-    DEPS phi_backends dense_tensor selected_rows memcpy memory_utils)
-elseif(WITH_XPU_KP)
-  xpu_library(
-    phi_tensor_utils
-    SRCS tensor_utils.cc
-    DEPS phi_backends dense_tensor selected_rows memcpy memory_utils)
-else()
-  cc_library(
-    phi_tensor_utils
-    SRCS tensor_utils.cc
-    DEPS dense_tensor selected_rows memcpy phi_backends memory_utils)
-endif()
+collect_srcs(
+  core_srcs
+  SRCS
+  flags.cc
+  errors.cc
+  enforce.cc
+  os_info.cc
+  kernel_context.cc
+  ddim.cc
+  tensor_base.cc
+  allocator.cc
+  tensor_meta.cc
+  lod_utils.cc
+  threadpool.cc
+  dense_tensor.cc
+  dense_tensor_impl.cc
+  sparse_coo_tensor.cc
+  sparse_csr_tensor.cc
+  string_tensor.cc
+  tensor_array.cc
+  extended_tensor.cc
+  meta_tensor.cc
+  infermeta_utils.cc
+  selected_rows_impl.cc
+  selected_rows.cc
+  device_context.cc
+  custom_kernel.cc
+  mixed_vector.cc
+  generator.cc
+  kernel_factory.cc
+  tensor_utils.cc
+  storage_properties.cc)
diff --git a/paddle/phi/core/compat/CMakeLists.txt b/paddle/phi/core/compat/CMakeLists.txt
index 3234f1004f0..4df1ac8e932 100644
--- a/paddle/phi/core/compat/CMakeLists.txt
+++ b/paddle/phi/core/compat/CMakeLists.txt
@@ -1,23 +1,2 @@
-cc_library(
-  arg_map_context
-  SRCS arg_map_context.cc
-  DEPS phi_enforce)
-cc_library(
-  op_utils
-  SRCS op_utils.cc
-  DEPS arg_map_context enforce)
-cc_library(
-  get_kerneltype_forvar_utils
-  SRCS get_kerneltype_forvar_utils.cc
-  DEPS enforce)
-
-set(convert_utils_deps data_type place op_utils phi_backends)
-
-if(WITH_MKLDNN)
-  set(convert_utils_deps ${convert_utils_deps} mkldnn)
-endif()
-
-cc_library(
-  convert_utils
-  SRCS convert_utils.cc
-  DEPS ${convert_utils_deps})
+collect_srcs(core_srcs SRCS arg_map_context.cc op_utils.cc
+             get_kerneltype_forvar_utils.cc convert_utils.cc)
diff --git a/paddle/phi/core/compat/op_utils.cc b/paddle/phi/core/compat/op_utils.cc
index 086cf6da5f1..11c887785f1 100644
--- a/paddle/phi/core/compat/op_utils.cc
+++ b/paddle/phi/core/compat/op_utils.cc
@@ -26,4 +26,16 @@ OpUtilsMap& OpUtilsMap::Instance() {
   return g_op_utils_map;
 }
 
+BaseKernelNameRegistrar::BaseKernelNameRegistrar(const char* op_type,
+                                                 const char* base_kernel_name) {
+  OpUtilsMap::Instance().InsertBaseKernelName(op_type, base_kernel_name);
+  OpUtilsMap::Instance().InsertFluidOplName(op_type, base_kernel_name);
+}
+
+ArgumentMappingFnRegistrar::ArgumentMappingFnRegistrar(
+    const char* op_type, ArgumentMappingFn arg_mapping_fn) {
+  OpUtilsMap::Instance().InsertArgumentMappingFn(op_type,
+                                                 std::move(arg_mapping_fn));
+}
+
 }  // namespace phi
diff --git a/paddle/phi/core/compat/op_utils.h b/paddle/phi/core/compat/op_utils.h
index f3e594eae11..cfa64714966 100644
--- a/paddle/phi/core/compat/op_utils.h
+++ b/paddle/phi/core/compat/op_utils.h
@@ -210,18 +210,12 @@ class OpUtilsMap {
 };
 
 struct BaseKernelNameRegistrar {
-  BaseKernelNameRegistrar(const char* op_type, const char* base_kernel_name) {
-    OpUtilsMap::Instance().InsertBaseKernelName(op_type, base_kernel_name);
-    OpUtilsMap::Instance().InsertFluidOplName(op_type, base_kernel_name);
-  }
+  BaseKernelNameRegistrar(const char* op_type, const char* base_kernel_name);
 };
 
 struct ArgumentMappingFnRegistrar {
   ArgumentMappingFnRegistrar(const char* op_type,
-                             ArgumentMappingFn arg_mapping_fn) {
-    OpUtilsMap::Instance().InsertArgumentMappingFn(op_type,
-                                                   std::move(arg_mapping_fn));
-  }
+                             ArgumentMappingFn arg_mapping_fn);
 };
 
 #define PD_REGISTER_BASE_KERNEL_NAME(op_type, base_kernel_name)               \
diff --git a/paddle/phi/core/dense_tensor.cc b/paddle/phi/core/dense_tensor.cc
index 2c8f36f6c34..4346cc6f32b 100644
--- a/paddle/phi/core/dense_tensor.cc
+++ b/paddle/phi/core/dense_tensor.cc
@@ -42,6 +42,11 @@ limitations under the License. */
 
 namespace phi {
 
+template <>
+const TypeInfo<phi::TensorBase>
+    TypeInfoTraits<phi::TensorBase, DenseTensor>::kType =
+        RegisterStaticType<phi::TensorBase>(DenseTensor::name());
+
 DenseTensor::DenseTensor(Allocator* a, const DenseTensorMeta& meta)
     : meta_(meta), holder_(a->Allocate(SizeOf(dtype()) * numel())) {}
 
@@ -115,8 +120,9 @@ void* DenseTensor::AllocateFrom(Allocator* allocator,
   if (fake_alloc) {
     bytes = 0;
   } else {
-    PADDLE_ENFORCE(
+    PADDLE_ENFORCE_EQ(
         valid(),
+        true,
         phi::errors::PreconditionNotMet("The meta data must be valid when "
                                         "call the mutable data function."));
     if (requested_size) {
@@ -169,8 +175,9 @@ const T* DenseTensor::data() const {
 template <typename T>
 T* DenseTensor::data() {
   T* ret = static_cast<T*>(data());
-  PADDLE_ENFORCE(
-      (dtype() == phi::CppTypeToDataType<T>::Type()),
+  PADDLE_ENFORCE_EQ(
+      dtype(),
+      phi::CppTypeToDataType<T>::Type(),
       phi::errors::InvalidArgument(
           "The type of data we are trying to retrieve (%s) does not match the "
           "type of data (%s) currently contained in the container.",
@@ -200,16 +207,18 @@ const void* DenseTensor::data() const {
 }
 
 void DenseTensor::set_meta(DenseTensorMeta&& meta) {
-  PADDLE_ENFORCE(!meta_.valid(),
-                 phi::errors::InvalidArgument(
-                     "Only when the original attribute of Tensor is "
-                     "incomplete, can it be reset."));
+  PADDLE_ENFORCE_EQ(meta_.valid(),
+                    false,
+                    phi::errors::InvalidArgument(
+                        "Only when the original attribute of Tensor is "
+                        "incomplete, can it be reset."));
   meta_ = std::move(meta);
 }
 
 void DenseTensor::set_meta(const DenseTensorMeta& meta) {
-  PADDLE_ENFORCE(
+  PADDLE_ENFORCE_EQ(
       meta.valid(),
+      true,
       phi::errors::InvalidArgument(
           "Input meta is invalid, please check the meta attribute."));
   meta_.dims = meta.dims;
diff --git a/paddle/phi/core/distributed/CMakeLists.txt b/paddle/phi/core/distributed/CMakeLists.txt
index b68a6890485..e759b7d9c8d 100644
--- a/paddle/phi/core/distributed/CMakeLists.txt
+++ b/paddle/phi/core/distributed/CMakeLists.txt
@@ -2,32 +2,14 @@ add_subdirectory(check)
 add_subdirectory(store)
 add_subdirectory(auto_parallel)
 
-set(COMM_CONTEXT_MANAGER_DEPS tcp_store)
+set(DISTRIBUTED_COMMON_SRCS comm_context_manager.cc)
 
 if(WITH_NCCL OR WITH_RCCL)
-  cc_library(
-    nccl_comm_context
-    SRCS nccl_comm_context.cc
-    DEPS dense_tensor comm_static_check nccl_dynamic_check)
-
-  list(APPEND COMM_CONTEXT_MANAGER_DEPS nccl_comm_context)
+  list(APPEND DISTRIBUTED_COMMON_SRCS nccl_comm_context.cc)
 endif()
 
 if(WITH_GLOO)
-  cc_library(
-    gloo_utils
-    SRCS gloo_utils.cc
-    DEPS gloo dense_tensor enforce tcp_store)
-
-  cc_library(
-    gloo_comm_context
-    SRCS gloo_comm_context.cc
-    DEPS gloo_utils comm_static_check)
-
-  list(APPEND COMM_CONTEXT_MANAGER_DEPS gloo_comm_context gloo_store)
+  list(APPEND DISTRIBUTED_COMMON_SRCS gloo_utils.cc gloo_comm_context.cc)
 endif()
 
-cc_library(
-  comm_context_manager
-  SRCS comm_context_manager.cc
-  DEPS ${COMM_CONTEXT_MANAGER_DEPS})
+collect_srcs(core_srcs SRCS ${DISTRIBUTED_COMMON_SRCS})
diff --git a/paddle/phi/core/distributed/auto_parallel/CMakeLists.txt b/paddle/phi/core/distributed/auto_parallel/CMakeLists.txt
index 2c4728c5a4c..d6e52ca8044 100644
--- a/paddle/phi/core/distributed/auto_parallel/CMakeLists.txt
+++ b/paddle/phi/core/distributed/auto_parallel/CMakeLists.txt
@@ -1,23 +1,4 @@
 proto_library(auto_parallel_proto SRCS auto_parallel.proto)
 
-cc_library(
-  device_mesh
-  SRCS device_mesh.cc
-  DEPS auto_parallel_proto phi_enforce)
-
-cc_library(
-  process_mesh
-  SRCS process_mesh.cc
-  DEPS auto_parallel_proto phi_enforce)
-
-cc_library(
-  dist_attr
-  SRCS dist_attr.cc
-  DEPS process_mesh auto_parallel_proto proto_desc phi_enforce)
-
-cc_library(
-  dist_mapper
-  SRCS dist_mapper.cc
-  DEPS device_mesh auto_parallel_proto phi_enforce)
-
-cc_library(auto_parallel DEPS device_mesh process_mesh dist_attr dist_mapper)
+collect_srcs(core_srcs SRCS device_mesh.cc process_mesh.cc dist_attr.cc
+             dist_mapper.cc)
diff --git a/paddle/phi/core/distributed/check/CMakeLists.txt b/paddle/phi/core/distributed/check/CMakeLists.txt
index 76f4977263d..1721a4a4602 100644
--- a/paddle/phi/core/distributed/check/CMakeLists.txt
+++ b/paddle/phi/core/distributed/check/CMakeLists.txt
@@ -1,11 +1,7 @@
-cc_library(
-  comm_static_check
-  SRCS static_check.cc
-  DEPS place dense_tensor enforce)
+set(CHECK_COMMON_SRCS static_check.cc)
 
 if(WITH_NCCL OR WITH_RCCL)
-  cc_library(
-    nccl_dynamic_check
-    SRCS nccl_dynamic_check.cc
-    DEPS dense_tensor)
+  list(APPEND CHECK_COMMON_SRCS nccl_dynamic_check.cc)
 endif()
+
+collect_srcs(core_srcs SRCS ${CHECK_COMMON_SRCS})
diff --git a/paddle/phi/core/distributed/store/CMakeLists.txt b/paddle/phi/core/distributed/store/CMakeLists.txt
index d6b35eb342b..8eaa76eac1c 100644
--- a/paddle/phi/core/distributed/store/CMakeLists.txt
+++ b/paddle/phi/core/distributed/store/CMakeLists.txt
@@ -1,18 +1,14 @@
-cc_library(
-  tcp_store
-  SRCS tcp_store.cc tcp_utils.cc socket.cpp store.cc
-  DEPS enforce glog)
+set(STORE_COMMON_SRCS tcp_store.cc tcp_utils.cc socket.cpp store.cc)
 
 if(WITH_GLOO)
-  cc_library(
-    gloo_store
-    SRCS gloo_store.cc
-    DEPS gloo)
+  list(APPEND STORE_COMMON_SRCS gloo_store.cc)
 endif()
 
+collect_srcs(core_srcs SRCS ${STORE_COMMON_SRCS})
+
 if(NOT WIN32)
   cc_test(
     test_c_tcp_store
     SRCS test_tcp_store.cc
-    DEPS tcp_store)
+    DEPS phi)
 endif()
diff --git a/paddle/phi/core/distributed/store/tcp_store.cc b/paddle/phi/core/distributed/store/tcp_store.cc
index 98b1ad3f850..baae37148f7 100644
--- a/paddle/phi/core/distributed/store/tcp_store.cc
+++ b/paddle/phi/core/distributed/store/tcp_store.cc
@@ -139,8 +139,9 @@ void MasterDaemon::StopByControlFd() {
 #else
 void MasterDaemon::InitControlFd() {
   ghStopEvent_ = CreateEvent(NULL, TRUE, FALSE, NULL);
-  PADDLE_ENFORCE(ghStopEvent_,
-                 phi::errors::Fatal("failed to cread control pipe"));
+  PADDLE_ENFORCE_NE(ghStopEvent_,
+                    nullptr,
+                    phi::errors::Fatal("failed to cread control pipe"));
 }
 void MasterDaemon::CloseControlFd() { CloseHandle(ghStopEvent_); }
 void MasterDaemon::StopByControlFd() { SetEvent(ghStopEvent_); }
@@ -422,8 +423,9 @@ void TCPStore::wait(const std::string& key) {
   VLOG(3) << "TCPStore wait.";
   _client->send_command_for_key(Command::WAIT, _key_prefix + key);
   reply = _client->receive_value<ReplyType>();
-  PADDLE_ENFORCE(
+  PADDLE_ENFORCE_EQ(
       reply == ReplyType::STOP_WAIT,
+      true,
       phi::errors::InvalidArgument("Stop_waiting response is expected"));
 }
 
diff --git a/paddle/phi/core/enforce.cc b/paddle/phi/core/enforce.cc
index 9630c6c96d4..1291571141c 100644
--- a/paddle/phi/core/enforce.cc
+++ b/paddle/phi/core/enforce.cc
@@ -280,13 +280,19 @@ std::string GetExternalErrorMsg(T status) {
       if (std::string::npos != last_slash_idx) {
         strModule.erase(last_slash_idx, std::string::npos);
       }
-      if (compare_path.compare("avx.so") == 0) {
+      // TODO(lizhiyu02): It is unclear what the
+      // 'compare_path.compare("avx.so") == 0' check is for, while
+      // 'strModule.find("dist-packages") != std::string::npos' presumably
+      // indicates that Paddle was installed with 'pip install paddle'.
+      if (compare_path.compare("avx.so") == 0 ||
+          strModule.find("dist-packages") != std::string::npos) {
         filePath =
             strModule +
             "/../include/third_party/externalError/data/externalErrorMsg.pb";
       } else {
+        // Just for unittest
         filePath = strModule +
-                   "/../../third_party/externalError/data/externalErrorMsg.pb";
+                   "/../third_party/externalError/data/externalErrorMsg.pb";
       }
     }
 #else
@@ -303,14 +309,14 @@ std::string GetExternalErrorMsg(T status) {
     if (std::string::npos != last_slash_idx) {
       strModule.erase(last_slash_idx, std::string::npos);
     }
-    if (compare_path.compare("avx.pyd") == 0) {
+    if (strModule.find("dist-packages") != std::string::npos) {
       filePath = strModule +
                  "\\..\\include\\third_"
                  "party\\externalerror\\data\\externalErrorMsg.pb";
     } else {
-      filePath =
-          strModule +
-          "\\..\\..\\third_party\\externalerror\\data\\externalErrorMsg.pb";
+      filePath = strModule +
+                 "\\..\\..\\third_party"
+                 "\\externalerror\\data\\externalErrorMsg.pb";
     }
 #endif
     std::ifstream fin(filePath, std::ios::in | std::ios::binary);
diff --git a/paddle/phi/core/flags.h b/paddle/phi/core/flags.h
index 0112be93b7f..e8711c73f30 100644
--- a/paddle/phi/core/flags.h
+++ b/paddle/phi/core/flags.h
@@ -24,7 +24,7 @@
 
 #include "paddle/utils/variant.h"
 
-#if defined(_WIN32) && defined(BUILD_PHI_SHARED)
+#if defined(_WIN32)
 #define PHI_EXPORT_FLAG __declspec(dllexport)
 #define PHI_IMPORT_FLAG __declspec(dllimport)
 #else
diff --git a/paddle/phi/core/lod_utils.cc b/paddle/phi/core/lod_utils.cc
index d775ad1a18f..dac1059182c 100644
--- a/paddle/phi/core/lod_utils.cc
+++ b/paddle/phi/core/lod_utils.cc
@@ -32,8 +32,9 @@ LoD ToAbsOffset(const LoD &in) {
 }
 
 void AppendLoD(LoD *lod, const LoD &lod_length) {
-  PADDLE_ENFORCE(
-      lod->empty() || lod->size() == lod_length.size(),
+  PADDLE_ENFORCE_EQ(
+      (lod->empty() || lod->size() == lod_length.size()),
+      true,
       phi::errors::InvalidArgument(
           "The input LoD length should be equal to the appended LoD size, but "
           "received input LoD length is %d, actual LoD size is %d.",
diff --git a/paddle/phi/core/selected_rows.cc b/paddle/phi/core/selected_rows.cc
index ec2d0d61fae..3ececdfc0bb 100644
--- a/paddle/phi/core/selected_rows.cc
+++ b/paddle/phi/core/selected_rows.cc
@@ -16,6 +16,11 @@ limitations under the License. */
 
 namespace phi {
 
+template <>
+const TypeInfo<phi::TensorBase>
+    TypeInfoTraits<phi::TensorBase, SelectedRows>::kType =
+        RegisterStaticType<phi::TensorBase>(SelectedRows::name());
+
 SelectedRows::SelectedRows(const std::vector<int64_t>& rows,
                            const int64_t& height)
     : impl_(std::make_shared<phi::SelectedRowsImpl>(rows, height)) {}
diff --git a/paddle/phi/core/sparse_coo_tensor.cc b/paddle/phi/core/sparse_coo_tensor.cc
index b7b0d06de8a..d76064b5a3d 100644
--- a/paddle/phi/core/sparse_coo_tensor.cc
+++ b/paddle/phi/core/sparse_coo_tensor.cc
@@ -16,6 +16,11 @@ limitations under the License. */
 
 namespace phi {
 
+template <>
+const TypeInfo<phi::TensorBase>
+    TypeInfoTraits<phi::TensorBase, SparseCooTensor>::kType =
+        RegisterStaticType<phi::TensorBase>(SparseCooTensor::name());
+
 SparseCooTensor::SparseCooTensor() {
   DenseTensor non_zero_indices, non_zero_elements;
   this->SetMember(non_zero_indices, non_zero_elements, {1}, true);
@@ -155,16 +160,18 @@ int32_t SparseCooTensor::dense_dim() const {
 }
 
 void SparseCooTensor::set_meta(SparseTensorMeta&& meta) {
-  PADDLE_ENFORCE(!meta_.valid(),
-                 phi::errors::InvalidArgument(
-                     "Only when the original attribute of Tensor is "
-                     "incomplete, can it be reset."));
+  PADDLE_ENFORCE_EQ(meta_.valid(),
+                    false,
+                    phi::errors::InvalidArgument(
+                        "Only when the original attribute of Tensor is "
+                        "incomplete, can it be reset."));
   meta_ = std::move(meta);
 }
 
 void SparseCooTensor::set_meta(const SparseTensorMeta& meta) {
-  PADDLE_ENFORCE(
+  PADDLE_ENFORCE_EQ(
       meta.valid(),
+      true,
       phi::errors::InvalidArgument(
           "Input meta is invalid, please check the meta attribute."));
   meta_.dims = meta.dims;
diff --git a/paddle/phi/core/sparse_csr_tensor.cc b/paddle/phi/core/sparse_csr_tensor.cc
index 32680106a96..156a324f8b6 100644
--- a/paddle/phi/core/sparse_csr_tensor.cc
+++ b/paddle/phi/core/sparse_csr_tensor.cc
@@ -16,6 +16,11 @@ limitations under the License. */
 
 namespace phi {
 
+template <>
+const TypeInfo<phi::TensorBase>
+    TypeInfoTraits<phi::TensorBase, SparseCsrTensor>::kType =
+        RegisterStaticType<phi::TensorBase>(SparseCsrTensor::name());
+
 SparseCsrTensor::SparseCsrTensor() {
   DenseTensor crows, cols, values;
   this->non_zero_crows_ = crows;
@@ -26,8 +31,9 @@ SparseCsrTensor::SparseCsrTensor() {
 inline void check_shape(const DDim& dims) {
   bool valid = dims.size() == 2 || dims.size() == 3;
 
-  PADDLE_ENFORCE(
+  PADDLE_ENFORCE_EQ(
       valid,
+      true,
       phi::errors::InvalidArgument("the SparseCsrTensor only support 2-D or "
                                    "3-D Tensor, but get %d-D Tensor",
                                    dims.size()));
@@ -96,10 +102,12 @@ void SparseCsrTensor::set_layout(const DataLayout layout) {
 
 void SparseCsrTensor::Resize(const DDim& dense_dims,
                              const int64_t non_zero_num) {
-  PADDLE_ENFORCE(this->initialized(),
-                 phi::errors::InvalidArgument(
-                     "the SparseCsrTensor must be initialized when call Resize "
-                     "function."));
+  PADDLE_ENFORCE_EQ(
+      this->initialized(),
+      true,
+      phi::errors::InvalidArgument(
+          "the SparseCsrTensor must be initialized when call Resize "
+          "function."));
   check_shape(dense_dims);
 
   int64_t crows_size = dense_dims[0] + 1;
@@ -139,16 +147,18 @@ void SparseCsrTensor::SetMember(const DenseTensor& non_zero_crows,
 }
 
 void SparseCsrTensor::set_meta(SparseTensorMeta&& meta) {
-  PADDLE_ENFORCE(!meta_.valid(),
-                 phi::errors::InvalidArgument(
-                     "Only when the original attribute of Tensor is "
-                     "incomplete, can it be reset."));
+  PADDLE_ENFORCE_EQ(meta_.valid(),
+                    false,
+                    phi::errors::InvalidArgument(
+                        "Only when the original attribute of Tensor is "
+                        "incomplete, can it be reset."));
   meta_ = std::move(meta);
 }
 
 void SparseCsrTensor::set_meta(const SparseTensorMeta& meta) {
-  PADDLE_ENFORCE(
+  PADDLE_ENFORCE_EQ(
       meta.valid(),
+      true,
       phi::errors::InvalidArgument(
           "Input meta is invalid, please check the meta attribute."));
   meta_.dims = meta.dims;
diff --git a/paddle/phi/core/storage_properties.cc b/paddle/phi/core/storage_properties.cc
new file mode 100644
index 00000000000..f05a3572f5e
--- /dev/null
+++ b/paddle/phi/core/storage_properties.cc
@@ -0,0 +1,32 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/core/storage_properties.h"
+
+namespace phi {
+
+#ifdef PADDLE_WITH_MKLDNN
+template <>
+const TypeInfo<StorageProperties>
+    TypeInfoTraits<StorageProperties, OneDNNStorageProperties>::kType =
+        RegisterStaticType<StorageProperties>(OneDNNStorageProperties::name());
+
+#endif
+
+template <>
+const TypeInfo<StorageProperties>
+    TypeInfoTraits<StorageProperties, NPUStorageProperties>::kType =
+        RegisterStaticType<StorageProperties>(NPUStorageProperties::name());
+
+}  // namespace phi
diff --git a/paddle/phi/core/string_tensor.cc b/paddle/phi/core/string_tensor.cc
index e82915a38ab..428c890c1f2 100644
--- a/paddle/phi/core/string_tensor.cc
+++ b/paddle/phi/core/string_tensor.cc
@@ -21,6 +21,11 @@ limitations under the License. */
 
 namespace phi {
 
+template <>
+const TypeInfo<phi::TensorBase>
+    TypeInfoTraits<phi::TensorBase, StringTensor>::kType =
+        RegisterStaticType<phi::TensorBase>(StringTensor::name());
+
 StringTensor::StringTensor() { meta_.offset = 0; }
 
 StringTensor::StringTensor(Allocator* a, const StringTensorMeta& meta)
@@ -91,8 +96,9 @@ dtype::pstring* StringTensor::data() {
 }
 
 void StringTensor::set_meta(const StringTensorMeta& meta) {
-  PADDLE_ENFORCE(
+  PADDLE_ENFORCE_EQ(
       meta.valid(),
+      true,
       phi::errors::InvalidArgument(
           "Input meta is invalid, please check the meta attribute."));
   meta_.dims = meta.dims;
@@ -143,8 +149,9 @@ void* StringTensor::AllocateFrom(Allocator* allocator,
   if (fake_alloc) {
     bytes = 0;
   } else {
-    PADDLE_ENFORCE(
+    PADDLE_ENFORCE_EQ(
         valid(),
+        true,
         errors::PreconditionNotMet("The meta data must be valid when call the "
                                    "mutable data function."));
     if (requested_size) {
diff --git a/paddle/phi/core/tensor_array.cc b/paddle/phi/core/tensor_array.cc
index e774bd0da44..f30b17251cd 100644
--- a/paddle/phi/core/tensor_array.cc
+++ b/paddle/phi/core/tensor_array.cc
@@ -16,6 +16,11 @@ limitations under the License. */
 
 namespace phi {
 
+template <>
+const TypeInfo<phi::TensorBase>
+    TypeInfoTraits<phi::TensorBase, TensorArray>::kType =
+        RegisterStaticType<phi::TensorBase>(TensorArray::name());
+
 TensorArray::TensorArray(const std::vector<DenseTensor>& vec) {
   tensors_ = vec;
 }
diff --git a/paddle/phi/core/utils/type_info.h b/paddle/phi/core/utils/type_info.h
index 33a4e09933a..1b3d0f8683b 100644
--- a/paddle/phi/core/utils/type_info.h
+++ b/paddle/phi/core/utils/type_info.h
@@ -52,8 +52,4 @@ class TypeInfoTraits {
 template <typename BaseT>
 TypeInfo<BaseT> RegisterStaticType(const std::string& type);
 
-template <typename BaseT, typename DerivedT>
-const TypeInfo<BaseT> TypeInfoTraits<BaseT, DerivedT>::kType =
-    RegisterStaticType<BaseT>(DerivedT::name());
-
 }  // namespace phi
diff --git a/paddle/phi/infermeta/CMakeLists.txt b/paddle/phi/infermeta/CMakeLists.txt
index fe3c8abfbd3..f53f655b244 100644
--- a/paddle/phi/infermeta/CMakeLists.txt
+++ b/paddle/phi/infermeta/CMakeLists.txt
@@ -1,10 +1,12 @@
-cc_library(
-  infermeta
-  SRCS nullary.cc unary.cc binary.cc ternary.cc multiary.cc fusion.cc
-  DEPS convert_utils meta_tensor infermeta_utils xxhash)
-cc_library(
-  backward_infermeta
-  SRCS backward.cc
-  DEPS meta_tensor convert_utils)
 add_subdirectory(strings)
 add_subdirectory(sparse)
+collect_srcs(
+  infermeta_srcs
+  SRCS
+  nullary.cc
+  unary.cc
+  binary.cc
+  ternary.cc
+  multiary.cc
+  fusion.cc
+  backward.cc)
diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc
index 832680b7f59..efe2e1c65bd 100644
--- a/paddle/phi/infermeta/multiary.cc
+++ b/paddle/phi/infermeta/multiary.cc
@@ -1668,9 +1668,10 @@ static void Interpolate2DInferShapeCheck(
     MetaConfig config) {
   auto dim_x = x.dims();
 
-  PADDLE_ENFORCE(
-      "bilinear" == interp_method || "nearest" == interp_method ||
-          "bicubic" == interp_method,
+  PADDLE_ENFORCE_EQ(
+      ("bilinear" == interp_method || "nearest" == interp_method ||
+       "bicubic" == interp_method),
+      true,
       phi::errors::InvalidArgument(
           "Interpolation method can only be \"bilinear\" or \"nearest\" when "
           "Input(X) dimension is 4, but got method = %s.",
@@ -1818,12 +1819,14 @@ static void Interpolate3DInferShapeCheck(
     MetaConfig config) {
   auto dim_x = x.dims();
 
-  PADDLE_ENFORCE("nearest" == interp_method || "trilinear" == interp_method,
-                 phi::errors::InvalidArgument(
-                     "Interpolation method can only be \"trilinear\" or "
-                     "\"nearest\" when Input(X) "
-                     "dimension is 5, but got method = %s .",
-                     interp_method));
+  PADDLE_ENFORCE_EQ(
+      ("nearest" == interp_method || "trilinear" == interp_method),
+      true,
+      phi::errors::InvalidArgument(
+          "Interpolation method can only be \"trilinear\" or "
+          "\"nearest\" when Input(X) "
+          "dimension is 5, but got method = %s .",
+          interp_method));
   const DataLayout data_layout = phi::StringToDataLayout(data_layout_str);
 
   for (int i = 0; i < dim_x.size(); ++i) {
@@ -1972,8 +1975,9 @@ void InterpolateInferMeta(
     MetaTensor* output,
     MetaConfig config) {
   auto dim_x = x.dims();  // NCHW format
-  PADDLE_ENFORCE(
-      dim_x.size() == 3 || dim_x.size() == 4 || dim_x.size() == 5,
+  PADDLE_ENFORCE_EQ(
+      (dim_x.size() == 3 || dim_x.size() == 4 || dim_x.size() == 5),
+      true,
       phi::errors::Unimplemented(
           "Input(X) dimension must be 3, 4 or 5, but got dimension = %d .",
           dim_x.size()));
diff --git a/paddle/phi/infermeta/sparse/CMakeLists.txt b/paddle/phi/infermeta/sparse/CMakeLists.txt
index 8717ef2cf6f..f48ae8c33d7 100644
--- a/paddle/phi/infermeta/sparse/CMakeLists.txt
+++ b/paddle/phi/infermeta/sparse/CMakeLists.txt
@@ -1,9 +1 @@
-cc_library(
-  sparse_infermeta
-  SRCS unary.cc binary.cc multiary.cc
-  DEPS convert_utils infermeta_utils)
-
-cc_library(
-  sparse_backward_infermeta
-  SRCS backward.cc
-  DEPS meta_tensor convert_utils)
+collect_srcs(infermeta_srcs SRCS unary.cc binary.cc multiary.cc backward.cc)
diff --git a/paddle/phi/infermeta/strings/CMakeLists.txt b/paddle/phi/infermeta/strings/CMakeLists.txt
index c2f891fe712..c6ed4a715a2 100644
--- a/paddle/phi/infermeta/strings/CMakeLists.txt
+++ b/paddle/phi/infermeta/strings/CMakeLists.txt
@@ -1,4 +1 @@
-cc_library(
-  string_infermeta
-  SRCS nullary.cc unary.cc
-  DEPS convert_utils infermeta_utils)
+collect_srcs(infermeta_srcs SRCS nullary.cc unary.cc)
diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc
index 67ad639f648..92cf654aee8 100644
--- a/paddle/phi/infermeta/unary.cc
+++ b/paddle/phi/infermeta/unary.cc
@@ -2088,10 +2088,12 @@ void MaxPoolWithIndexInferMeta(const MetaTensor& x,
 
   auto x_dims = x.dims();
 
-  PADDLE_ENFORCE(x_dims.size() == 4 || x_dims.size() == 5,
-                 errors::InvalidArgument("Pooling intput should be 4-D or "
-                                         "5-D tensor but received %dD-Tensor",
-                                         x_dims.size()));
+  PADDLE_ENFORCE_EQ(
+      (x_dims.size() == 4 || x_dims.size() == 5),
+      true,
+      errors::InvalidArgument("Pooling intput should be 4-D or "
+                              "5-D tensor but received %dD-Tensor",
+                              x_dims.size()));
 
   if (global_pooling) {
     kernel_size_.resize(static_cast<size_t>(x_dims.size()) - 2);
@@ -4430,15 +4432,15 @@ void TransposeInferMeta(const MetaTensor& x,
 
   // Note: x_rank > axis_size when fuse squeeze2 + transpose2, else x_rank ==
   // axis_size
-  PADDLE_ENFORCE_GE(
-      x_rank,
-      axis_size,
-      errors::InvalidArgument("The input tensor's dimension "
-                              "should be equal to the axis's size. "
-                              "But received input tensor's dimension is %d, "
-                              "axis's size is %d",
-                              x_rank,
-                              axis_size));
+  PADDLE_ENFORCE_GE(x_rank,
+                    axis_size,
+                    errors::InvalidArgument(
+                        "The input tensor's dimension "
+                        "should be equal to or greater than the axis's size. "
+                        "But received input tensor's dimension is %d, "
+                        "axis's size is %d",
+                        x_rank,
+                        axis_size));
 
   std::vector<int> formated_axis = axis;
   std::vector<int> count(axis_size, 0);
diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt
index d7f9849ad94..347eadc4d4f 100644
--- a/paddle/phi/kernels/CMakeLists.txt
+++ b/paddle/phi/kernels/CMakeLists.txt
@@ -19,84 +19,6 @@ add_subdirectory(funcs)
 # kernel autotune
 add_subdirectory(autotune)
 
-# phi depends all phi kernel targets
-set_property(GLOBAL PROPERTY PHI_KERNELS "")
-
-# [ 1. Common kernel compilation dependencies ]
-set(COMMON_KERNEL_DEPS
-    dense_tensor
-    string_tensor
-    sparse_coo_tensor
-    sparse_csr_tensor
-    tensor_array
-    int_array
-    scalar
-    kernel_context
-    kernel_factory
-    arg_map_context
-    convert_utils
-    lod_utils
-    custom_kernel
-    string_infermeta
-    phi_tensor_utils)
-set(COMMON_KERNEL_DEPS
-    ${COMMON_KERNEL_DEPS}
-    eigen_function
-    blas
-    math_function
-    im2col
-    vol2col
-    concat_and_split_functor
-    selected_rows_functor)
-# remove this dep after removing fluid deps on tensor creation
-set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} lod_utils)
-set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} infermeta infermeta_utils
-                       sparse_infermeta)
-set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} switch_autotune)
-
-set(COMMON_KERNEL_DEPS
-    ${COMMON_KERNEL_DEPS}
-    threadpool
-    jit_kernel_helper
-    softmax
-    cross_entropy
-    matrix_bit_code
-    lapack_function
-    lstm_compute
-    gru_compute
-    deformable_conv_functor
-    matrix_reduce
-    segment_pooling
-    pooling
-    maxouting
-    matrix_inverse
-    matrix_solve
-    phi_dynload_warpctc
-    phi_dynload_warprnnt
-    sequence_padding
-    sequence_pooling
-    sequence_scale
-    fft
-    phi_data_layout_transform
-    gpc
-    utf8proc
-    gather_scatter_functor)
-
-set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} process_group)
-
-if(WITH_FLASHATTN)
-  set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} phi_dynload_flashattn)
-endif()
-
-if(WITH_NCCL OR WITH_RCCL)
-  set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} nccl_comm_context)
-endif()
-if(WITH_GLOO)
-  set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} gloo_comm_context)
-endif()
-if(WITH_CUDNN_FRONTEND)
-  set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} cudnn-frontend)
-endif()
 copy_if_different(${kernel_declare_file} ${kernel_declare_file_final})
 
 file(GLOB kernel_h "*.h" "selected_rows/*.h" "sparse/*.h" "strings/*.h")
@@ -105,8 +27,8 @@ file(GLOB kernel_primitive_h "primitive/*.h")
 
 # fusion ops would be included here
 file(
-  GLOB
-  kernel_cu
+  GLOB kernel_cu
+  RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
   "gpu/*.cu"
   "gpu/*.cu.cc"
   "gpudnn/*.cu"
@@ -118,6 +40,10 @@ file(
   "strings/gpu/*.cu"
   "fusion/gpu/*.cu")
 
+if(APPLE OR WIN32)
+  list(REMOVE_ITEM kernel_cu "fusion/gpu/fusion_group_kernel.cu")
+endif()
+
 if(DEFINED REDUCE_INFERENCE_LIB_SIZE)
   list(FILTER kernel_cu EXCLUDE REGEX ".*_grad_kernel\\.cc$")
   list(FILTER kernel_cu EXCLUDE REGEX ".*_grad_kernel\\.cu$")
@@ -146,22 +72,19 @@ if(WITH_CUTLASS)
     )
   endif()
 
-  file(GLOB cutlass_cu "fusion/cutlass/conv2d/generated/*.cu"
-       "fusion/cutlass/conv2d/*.cu" "fusion/cutlass/*.cu"
-       "fusion/cutlass/memory_efficient_attention/autogen/impl/*.cu")
-  add_definitions("-DPADDLE_WITH_MEMORY_EFFICIENT_ATTENTION")
+  file(
+    GLOB cutlass_cu
+    RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
+    "fusion/cutlass/conv2d/generated/*.cu" "fusion/cutlass/conv2d/*.cu"
+    "fusion/cutlass/*.cu"
+    "fusion/cutlass/memory_efficient_attention/autogen/impl/*.cu")
   list(APPEND kernel_cu ${cutlass_cu})
 endif()
 
-if(APPLE OR WIN32)
-  list(REMOVE_ITEM kernel_cu
-       "${CMAKE_CURRENT_SOURCE_DIR}/fusion/gpu/fusion_group_kernel.cu")
-endif()
-
 if(WITH_MKLDNN)
   file(
-    GLOB
-    kernel_cc
+    GLOB kernel_cc
+    RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
     "*.cc"
     "cpu/*.cc"
     "legacy/*.cc"
@@ -171,6 +94,8 @@ if(WITH_MKLDNN)
     "selected_rows/cpu/*.cc"
     "sparse/*.cc"
     "sparse/cpu/*.cc"
+    "legacy/*.cc"
+    "legacy/cpu/*.cc"
     "strings/*.cc"
     "strings/cpu/*.cc"
     "onednn/*.cc"
@@ -179,8 +104,8 @@ if(WITH_MKLDNN)
     "fusion/cpu/*.cc")
 else()
   file(
-    GLOB
-    kernel_cc
+    GLOB kernel_cc
+    RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
     "*.cc"
     "cpu/*.cc"
     "legacy/*.cc"
@@ -189,6 +114,8 @@ else()
     "selected_rows/cpu/*.cc"
     "sparse/*.cc"
     "sparse/cpu/*.cc"
+    "legacy/*.cc"
+    "legacy/cpu/*.cc"
     "strings/*.cc"
     "strings/cpu/*.cc"
     "fusion/*.cc"
@@ -200,32 +127,17 @@ if(DEFINED REDUCE_INFERENCE_LIB_SIZE)
 endif()
 
 file(
-  GLOB
-  kernel_xpu
-  "xpu/*.cc"
-  "legacy/xpu/*.cc"
-  "selected_rows/xpu/*.cc"
-  "fusion/xpu/*.cc"
+  GLOB kernel_xpu
+  RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
+  "xpu/*.cc" "legacy/xpu/*.cc" "selected_rows/xpu/*.cc" "fusion/xpu/*.cc"
   "sparse/xpu/*.cc")
 
-if(WITH_MKLDNN)
-  set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} get_kerneltype_forvar_utils)
-endif()
-
 if(WITH_GPU OR WITH_ROCM)
-  if(WITH_GPU)
-    add_library(phi_gpu ${kernel_cu} ${kernel_cc})
-    if(WITH_CUTLASS)
-      add_dependencies(phi_gpu cutlass_codegen)
-    endif()
-  elseif(WITH_ROCM)
-    hip_add_library(phi_gpu STATIC ${kernel_cu} ${kernel_cc})
-  endif()
+  collect_srcs(kernels_srcs SRCS ${kernel_cu})
   kernel_declare("${kernel_cu}")
-  kernel_declare("${kernel_cc}")
-  target_link_libraries(phi_gpu ${COMMON_KERNEL_DEPS})
-  set(ADD_PHI_KERNELS ${ADD_PHI_KERNELS} phi_gpu)
-elseif(WITH_XPU)
+endif()
+
+if(WITH_XPU)
   if(WITH_XPU_KP)
     file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/kps/
          DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/kps/)
@@ -237,52 +149,23 @@ elseif(WITH_XPU)
       file(RENAME ${kernel} "${CMAKE_CURRENT_BINARY_DIR}/kps/${name}.kps")
     endforeach()
     file(GLOB kernel_xpu_kps "${CMAKE_CURRENT_BINARY_DIR}/kps/*.kps")
-    file(
-      GLOB kernel_cc_relative
-      RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
-      "*.cc"
-      "cpu/*.cc"
-      "legacy/*.cc"
-      "legacy/cpu/*.cc"
-      "selected_rows/*.cc"
-      "selected_rows/cpu/*.cc"
-      "sparse/*.cc"
-      "sparse/cpu/*.cc"
-      "strings/*.cc"
-      "strings/cpu/*.cc"
-      "fusion/*.cc"
-      "fusion/cpu/*.cc")
-    foreach(kernel ${kernel_cc_relative})
+
+    foreach(kernel ${kernel_cc})
       file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/${kernel}
            DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/${kernel})
     endforeach()
     file(GLOB_RECURSE kernel_xpu_cc "${CMAKE_CURRENT_BINARY_DIR}/*.cc")
-    xpu_add_library(
-      phi_xpu
-      STATIC
-      ${kernel_xpu}
-      ${kernel_xpu_kps}
-      ${kernel_xpu_cc}
-      DEPENDS
-      ${COMMON_KERNEL_DEPS})
-    kernel_declare("${kernel_xpu_cc}")
-  else()
-    add_library(phi_xpu ${kernel_xpu} ${kernel_cc})
-    kernel_declare("${kernel_cc}")
+
+    set(kernel_cc ${kernel_xpu_cc})
+    collect_generated_srcs(kernels_srcs SRCS ${kernel_xpu_kps})
   endif()
+  collect_srcs(kernels_srcs SRCS ${kernel_xpu})
   kernel_declare("${kernel_xpu}")
   kernel_declare("${kernel_xpu_kps}")
-
-  target_link_libraries(phi_xpu ${COMMON_KERNEL_DEPS})
-  set(ADD_PHI_KERNELS ${ADD_PHI_KERNELS} phi_xpu)
-else()
-  add_library(phi_cpu ${kernel_cc})
-  target_link_libraries(phi_cpu ${COMMON_KERNEL_DEPS})
-  kernel_declare("${kernel_cc}")
-  set(ADD_PHI_KERNELS phi_cpu)
 endif()
 
-set_property(GLOBAL PROPERTY PHI_KERNELS ${ADD_PHI_KERNELS})
+collect_srcs(kernels_srcs SRCS ${kernel_cc})
+kernel_declare("${kernel_cc}")
 
 if(NOT "${KERNEL_LIST}" STREQUAL "")
   prune_declaration_h()
diff --git a/paddle/phi/kernels/autotune/CMakeLists.txt b/paddle/phi/kernels/autotune/CMakeLists.txt
index aa05fcd74cc..456e6770a70 100644
--- a/paddle/phi/kernels/autotune/CMakeLists.txt
+++ b/paddle/phi/kernels/autotune/CMakeLists.txt
@@ -1,15 +1 @@
-if(WITH_CUDNN_FRONTEND)
-  cc_library(
-    cache
-    SRCS cache.cc
-    DEPS cudnn-frontend phi_enforce)
-else()
-  cc_library(
-    cache
-    SRCS cache.cc
-    DEPS phi_enforce)
-endif()
-cc_library(
-  switch_autotune
-  SRCS switch_autotune.cc
-  DEPS cache flags)
+collect_srcs(kernels_srcs SRCS cache.cc switch_autotune.cc)
diff --git a/paddle/phi/kernels/autotune/cache_base.h b/paddle/phi/kernels/autotune/cache_base.h
index 798898f4dd7..68463e900c3 100644
--- a/paddle/phi/kernels/autotune/cache_base.h
+++ b/paddle/phi/kernels/autotune/cache_base.h
@@ -18,11 +18,11 @@
 #include <unordered_map>
 #include <vector>
 
-#include "gflags/gflags.h"
 #include "paddle/phi/core/enforce.h"
 #include "paddle/phi/core/errors.h"
+#include "paddle/phi/core/flags.h"
 
-DECLARE_int32(search_cache_max_number);
+PHI_DECLARE_int32(search_cache_max_number);
 
 inline void HashCombine(std::size_t* seed UNUSED) {}
 
diff --git a/paddle/phi/kernels/cpu/rmsprop_kernel.cc b/paddle/phi/kernels/cpu/rmsprop_kernel.cc
index f72f912e5be..fd2b4b43c5d 100644
--- a/paddle/phi/kernels/cpu/rmsprop_kernel.cc
+++ b/paddle/phi/kernels/cpu/rmsprop_kernel.cc
@@ -105,10 +105,6 @@ struct RmsFunctor<T, phi::CPUContext> {
   }
 };
 
-template struct RmsFunctor<phi::GPUContext, float>;
-template struct RmsFunctor<phi::GPUContext, double>;
-template struct RmsFunctor<phi::GPUContext, phi::dtype::float16>;
-
 }  // namespace phi
 PD_REGISTER_KERNEL(
     rmsprop, CPU, ALL_LAYOUT, phi::RmspropDenseKernel, float, double) {}
diff --git a/paddle/phi/kernels/funcs/CMakeLists.txt b/paddle/phi/kernels/funcs/CMakeLists.txt
index bd1774d756c..999625cf3df 100644
--- a/paddle/phi/kernels/funcs/CMakeLists.txt
+++ b/paddle/phi/kernels/funcs/CMakeLists.txt
@@ -4,67 +4,15 @@ add_subdirectory(lapack)
 add_subdirectory(detail)
 add_subdirectory(jit)
 
-math_library(deformable_conv_functor DEPS dense_tensor)
-math_library(concat_and_split_functor DEPS dense_tensor)
-math_library(fc_functor DEPS blas jit_kernel_helper)
-math_library(gpc DEPS phi_enforce)
-math_library(gru_compute DEPS activation_functions math_function)
-math_library(lstm_compute DEPS activation_functions)
-math_library(math_function DEPS blas dense_tensor)
-math_library(matrix_reduce DEPS dense_tensor)
-math_library(matrix_inverse DEPS dense_tensor eigen3 blas)
-math_library(pooling DEPS dense_tensor)
-math_library(segment_pooling)
-math_library(sequence2batch)
-math_library(matrix_solve DEPS dense_tensor eigen3 blas math_function)
-math_library(cross_entropy)
-math_library(im2col)
-math_library(vol2col)
-math_library(softmax DEPS math_function)
-math_library(maxouting)
-math_library(matrix_bit_code)
-math_library(sequence_scale)
-math_library(sequence_padding DEPS lod_utils)
-math_library(sequence_pooling DEPS math_function jit_kernel_helper)
-
-cc_library(
-  phi_data_layout_transform
-  SRCS data_layout_transform.cc
-  DEPS tensor blas)
-
+file(
+  GLOB func_cc_srcs
+  RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
+  "*.cc")
 if(WITH_GPU OR WITH_ROCM)
-  if(MKL_FOUND AND WITH_ONEMKL)
-    math_library(fft spectral_op.cu DEPS dynload_cuda dynload_mklrt
-                 dense_tensor)
-    target_include_directories(fft PRIVATE ${MKL_INCLUDE})
-  else()
-    math_library(fft spectral_op.cu DEPS dynload_cuda dense_tensor pocketfft)
-  endif()
-else()
-  if(MKL_FOUND AND WITH_ONEMKL)
-    mathp_library(fft DEPS dynload_mklrt dense_tensor)
-    target_include_directories(fft PRIVATE ${MKL_INCLUDE})
-  else()
-    math_library(fft DEPS dense_tensor pocketfft)
-  endif()
+  file(
+    GLOB func_cu_srcs
+    RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
+    "*.cu")
 endif()
 
-if(WITH_MKLDNN)
-  math_library(selected_rows_functor DEPS selected_rows_utils math_function
-               blas mixed_vector)
-else()
-  math_library(selected_rows_functor DEPS selected_rows_utils math_function
-               blas mixed_vector)
-endif()
-
-if(WITH_ROCM)
-  hip_library(
-    gather_scatter_functor
-    SRCS gather_scatter_functor.cc gather_scatter_functor.cu
-    DEPS tensor)
-else()
-  cc_library(
-    gather_scatter_functor
-    SRCS gather_scatter_functor.cc gather_scatter_functor.cu
-    DEPS tensor)
-endif()
+collect_srcs(kernels_srcs SRCS ${func_cc_srcs} ${func_cu_srcs})
diff --git a/paddle/phi/kernels/funcs/blas/CMakeLists.txt b/paddle/phi/kernels/funcs/blas/CMakeLists.txt
index 6f08472efab..4a0feb20bd6 100644
--- a/paddle/phi/kernels/funcs/blas/CMakeLists.txt
+++ b/paddle/phi/kernels/funcs/blas/CMakeLists.txt
@@ -1,4 +1 @@
-cc_library(
-  blas
-  SRCS blas.cc
-  DEPS cblas framework_proto phi_backends)
+collect_srcs(kernels_srcs SRCS blas.cc)
diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.cu.h b/paddle/phi/kernels/funcs/blas/blas_impl.cu.h
index c08903e7d37..2f0f3f7cd70 100644
--- a/paddle/phi/kernels/funcs/blas/blas_impl.cu.h
+++ b/paddle/phi/kernels/funcs/blas/blas_impl.cu.h
@@ -19,10 +19,11 @@
 
 #include "paddle/phi/backends/dynload/cublas.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/flags.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
-DECLARE_bool(enable_cublas_tensor_op_math);
-DECLARE_bool(gemm_use_half_precision_compute_type);
+PHI_DECLARE_bool(enable_cublas_tensor_op_math);
+PHI_DECLARE_bool(gemm_use_half_precision_compute_type);
 
 namespace phi {
 namespace funcs {
diff --git a/paddle/phi/kernels/funcs/detail/CMakeLists.txt b/paddle/phi/kernels/funcs/detail/CMakeLists.txt
index 0df1c060f90..15c5ba0ac78 100644
--- a/paddle/phi/kernels/funcs/detail/CMakeLists.txt
+++ b/paddle/phi/kernels/funcs/detail/CMakeLists.txt
@@ -1 +1 @@
-cc_library(activation_functions SRCS avx_functions.cc)
+collect_srcs(kernels_srcs SRCS avx_functions.cc)
diff --git a/paddle/phi/kernels/funcs/eigen/CMakeLists.txt b/paddle/phi/kernels/funcs/eigen/CMakeLists.txt
index de771f12fbf..30d6dc6013c 100644
--- a/paddle/phi/kernels/funcs/eigen/CMakeLists.txt
+++ b/paddle/phi/kernels/funcs/eigen/CMakeLists.txt
@@ -6,19 +6,5 @@ file(
   GLOB EIGEN_CU_SOURCES
   RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
   "*.cu")
-if(WITH_GPU)
-  nv_library(
-    eigen_function
-    SRCS ${EIGEN_CC_SOURCES} ${EIGEN_CU_SOURCES}
-    DEPS eigen3)
-elseif(WITH_ROCM)
-  hip_library(
-    eigen_function
-    SRCS ${EIGEN_CC_SOURCES} ${EIGEN_CU_SOURCES}
-    DEPS eigen3)
-else()
-  cc_library(
-    eigen_function
-    SRCS ${EIGEN_CC_SOURCES}
-    DEPS eigen3)
-endif()
+
+collect_srcs(kernels_srcs SRCS ${EIGEN_CC_SOURCES} ${EIGEN_CU_SOURCES})
diff --git a/paddle/phi/kernels/funcs/jit/CMakeLists.txt b/paddle/phi/kernels/funcs/jit/CMakeLists.txt
index cb9dc6a3757..fd44ca30810 100644
--- a/paddle/phi/kernels/funcs/jit/CMakeLists.txt
+++ b/paddle/phi/kernels/funcs/jit/CMakeLists.txt
@@ -9,17 +9,13 @@ file(APPEND ${jit_file} "\#include \"paddle/phi/kernels/funcs/jit/helper.h\"\n")
 file(APPEND ${jit_file}
      "\#include \"paddle/phi/kernels/funcs/jit/registry.h\"\n\n")
 
-set(JIT_KERNEL_DEPS device_context cblas gflags enforce place xxhash)
-
 file(
   GLOB jit_kernel_cc_srcs
   RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
   "*.cc")
 list(REMOVE_ITEM jit_kernel_cc_srcs test.cc benchmark.cc)
-cc_library(
-  jit_kernel_base
-  SRCS ${jit_kernel_cc_srcs}
-  DEPS ${JIT_KERNEL_DEPS})
+
+collect_srcs(kernels_srcs SRCS ${jit_kernel_cc_srcs})
 
 copy_if_different(${jit_file} ${jit_file_final})
 
@@ -30,14 +26,11 @@ if(WITH_XBYAK)
   add_subdirectory(gen)
 endif()
 
-cc_library(
-  jit_kernel_helper INTERFACE
-  SRCS ${jit_kernel_cc_srcs}
-  DEPS jit_kernel_base ${JIT_KERNEL_DEPS})
 cc_test(
   jit_kernel_test
   SRCS test.cc
-  DEPS jit_kernel_helper)
+  DEPS phi)
+
 if(NOT WIN32)
   set(cuda_less12_and_gcc_greater12 false)
   if(DEFINED CMAKE_CUDA_COMPILER_VERSION)
@@ -47,14 +40,7 @@ if(NOT WIN32)
     endif()
   endif()
   if(NOT cuda_less12_and_gcc_greater12)
-    cc_binary(
-      jit_kernel_benchmark
-      SRCS
-      benchmark.cc
-      DEPS
-      jit_kernel_helper
-      phi_device_tracer
-      tensor)
+    cc_binary(jit_kernel_benchmark SRCS benchmark.cc DEPS phi)
   endif()
 endif()
 if(WITH_TESTING AND TEST jit_kernel_test)
diff --git a/paddle/phi/kernels/funcs/jit/gen/CMakeLists.txt b/paddle/phi/kernels/funcs/jit/gen/CMakeLists.txt
index e2b9b51590f..fc16fc4740e 100644
--- a/paddle/phi/kernels/funcs/jit/gen/CMakeLists.txt
+++ b/paddle/phi/kernels/funcs/jit/gen/CMakeLists.txt
@@ -3,13 +3,7 @@ file(
   RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
   "*.cc")
 
-cc_library(
-  jit_kernel_jitcode
-  SRCS ${jitcode_cc_srcs}
-  DEPS jit_kernel_base xbyak)
-set(JIT_KERNEL_DEPS
-    ${JIT_KERNEL_DEPS} xbyak jit_kernel_jitcode
-    PARENT_SCOPE)
+collect_srcs(kernels_srcs SRCS ${jitcode_cc_srcs})
 
 function(USE_JITKERNEL_GEN TARGET)
   file(APPEND ${jit_file} "USE_JITKERNEL_GEN(${TARGET});\n")
diff --git a/paddle/phi/kernels/funcs/jit/gen_base.h b/paddle/phi/kernels/funcs/jit/gen_base.h
index c72c0c52792..dfad19eff34 100644
--- a/paddle/phi/kernels/funcs/jit/gen_base.h
+++ b/paddle/phi/kernels/funcs/jit/gen_base.h
@@ -33,7 +33,7 @@ namespace jit {
 
 class GenBase : public Kernel {
  public:
-  virtual ~GenBase() = default;
+  virtual ~GenBase() {}
   virtual std::string name() const = 0;
   virtual size_t getSize() const = 0;
   virtual const unsigned char* getCodeInternal() const = 0;
diff --git a/paddle/phi/kernels/funcs/jit/more/CMakeLists.txt b/paddle/phi/kernels/funcs/jit/more/CMakeLists.txt
index 0851ca065b5..ad536a05d12 100644
--- a/paddle/phi/kernels/funcs/jit/more/CMakeLists.txt
+++ b/paddle/phi/kernels/funcs/jit/more/CMakeLists.txt
@@ -12,7 +12,3 @@ endif()
 
 # mix should be last
 add_subdirectory(mix)
-
-set(JIT_KERNEL_DEPS
-    ${JIT_KERNEL_DEPS}
-    PARENT_SCOPE)
diff --git a/paddle/phi/kernels/funcs/jit/more/intrinsic/CMakeLists.txt b/paddle/phi/kernels/funcs/jit/more/intrinsic/CMakeLists.txt
index c6222c9b29b..dbf94d7483e 100644
--- a/paddle/phi/kernels/funcs/jit/more/intrinsic/CMakeLists.txt
+++ b/paddle/phi/kernels/funcs/jit/more/intrinsic/CMakeLists.txt
@@ -2,14 +2,8 @@ file(
   GLOB jit_kernel_cc_intrinsic
   RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
   "*.cc")
-cc_library(
-  jit_kernel_intrinsic
-  SRCS ${jit_kernel_cc_intrinsic}
-  DEPS jit_kernel_base)
 
-set(JIT_KERNEL_DEPS
-    ${JIT_KERNEL_DEPS} jit_kernel_intrinsic
-    PARENT_SCOPE)
+collect_srcs(kernels_srcs SRCS ${jit_kernel_cc_intrinsic})
 
 # use mkl kernels by name and type
 use_jitkernel_more(kCRFDecoding, intrinsic)
diff --git a/paddle/phi/kernels/funcs/jit/more/mix/CMakeLists.txt b/paddle/phi/kernels/funcs/jit/more/mix/CMakeLists.txt
index 2fa8557c1d8..21b74179f73 100644
--- a/paddle/phi/kernels/funcs/jit/more/mix/CMakeLists.txt
+++ b/paddle/phi/kernels/funcs/jit/more/mix/CMakeLists.txt
@@ -2,14 +2,8 @@ file(
   GLOB jit_kernel_mix_cc
   RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
   "*.cc")
-cc_library(
-  jit_kernel_mix
-  SRCS ${jit_kernel_mix_cc}
-  DEPS jit_kernel_base)
 
-set(JIT_KERNEL_DEPS
-    ${JIT_KERNEL_DEPS} jit_kernel_mix
-    PARENT_SCOPE)
+collect_srcs(kernels_srcs SRCS ${jit_kernel_mix_cc})
 
 use_jitkernel_more(kVSigmoid, mix)
 use_jitkernel_more(kVTanh, mix)
diff --git a/paddle/phi/kernels/funcs/jit/more/mkl/CMakeLists.txt b/paddle/phi/kernels/funcs/jit/more/mkl/CMakeLists.txt
index 7f6df06f87a..0c5d21002d1 100644
--- a/paddle/phi/kernels/funcs/jit/more/mkl/CMakeLists.txt
+++ b/paddle/phi/kernels/funcs/jit/more/mkl/CMakeLists.txt
@@ -1,10 +1,4 @@
-cc_library(
-  jit_kernel_mkl
-  SRCS mkl.cc
-  DEPS jit_kernel_base dynload_mklml)
-set(JIT_KERNEL_DEPS
-    ${JIT_KERNEL_DEPS} dynload_mklml jit_kernel_mkl
-    PARENT_SCOPE)
+collect_srcs(kernels_srcs SRCS mkl.cc)
 
 # use mkl kernels by name and type
 use_jitkernel_more(kMatMul, mkl)
diff --git a/paddle/phi/kernels/funcs/jit/refer/CMakeLists.txt b/paddle/phi/kernels/funcs/jit/refer/CMakeLists.txt
index 632dc98eb71..825ce47a601 100644
--- a/paddle/phi/kernels/funcs/jit/refer/CMakeLists.txt
+++ b/paddle/phi/kernels/funcs/jit/refer/CMakeLists.txt
@@ -1,10 +1,4 @@
-cc_library(
-  jit_kernel_refer
-  SRCS refer.cc
-  DEPS jit_kernel_base)
-set(JIT_KERNEL_DEPS
-    ${JIT_KERNEL_DEPS} jit_kernel_refer
-    PARENT_SCOPE)
+collect_srcs(kernels_srcs SRCS refer.cc)
 
 function(USE_JITKERNEL_REFER TARGET)
   file(APPEND ${jit_file} "USE_JITKERNEL_REFER(${TARGET});\n")
diff --git a/paddle/phi/kernels/funcs/lapack/CMakeLists.txt b/paddle/phi/kernels/funcs/lapack/CMakeLists.txt
index 1a53470b2e6..3321eddf41c 100644
--- a/paddle/phi/kernels/funcs/lapack/CMakeLists.txt
+++ b/paddle/phi/kernels/funcs/lapack/CMakeLists.txt
@@ -1 +1 @@
-math_library(lapack_function DEPS phi_dynload_lapack)
+collect_srcs(kernels_srcs SRCS lapack_function.cc)
diff --git a/paddle/phi/kernels/funcs/math_function.h b/paddle/phi/kernels/funcs/math_function.h
index e2824cf4e26..b42714e80db 100644
--- a/paddle/phi/kernels/funcs/math_function.h
+++ b/paddle/phi/kernels/funcs/math_function.h
@@ -25,6 +25,7 @@ limitations under the License. */
 namespace phi {
 namespace funcs {
 
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 template <typename T>
 void BatchTranspose(T* output,
                     const T* input,
@@ -32,7 +33,7 @@ void BatchTranspose(T* output,
                     int64_t m,
                     int64_t n,
                     const phi::GPUContext* dev_ctx);
-
+#endif
 template <typename DeviceContext, typename T>
 struct TransposeNormal {
   // for dims >= 7 situation
diff --git a/paddle/phi/kernels/funcs/matrix_inverse.cu.cc b/paddle/phi/kernels/funcs/matrix_inverse.cu
similarity index 100%
rename from paddle/phi/kernels/funcs/matrix_inverse.cu.cc
rename to paddle/phi/kernels/funcs/matrix_inverse.cu
diff --git a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention.cu b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention.cu
index 36578a361d7..8e564ff7dfc 100644
--- a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention.cu
+++ b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention.cu
@@ -12,17 +12,21 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/autogen/memory_efficient_attention.h"
-#include "paddle/fluid/memory/malloc.h"
-#include "paddle/fluid/platform/errors.h"
+#include "glog/logging.h"
+
 #include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/errors.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/autogen/memory_efficient_attention.h"
+#include "paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/gemm_kernel_utils.h"
 #include "paddle/phi/kernels/fusion/cutlass/memory_efficient_attention_utils.h"
 
 namespace phi {
 namespace fusion {
 namespace cutlass_internal {
 
+using gemm_kernel_utils::getMaximumSharedMemoryPerBlockKb;
+
 template <typename T, typename Context>
 void MemoryEfficientAttentionForwardKernel(
     const Context& ctx,
@@ -124,9 +128,9 @@ void MemoryEfficientAttentionForwardKernel(
     VLOG(3) << "kAlignLSE" << kAlignLSE;
 
     typename KernelType::Params p;
-    p.query_ptr = SafeGetTensorPtr<scalar_t>(query);
-    p.key_ptr = SafeGetTensorPtr<scalar_t>(key);
-    p.value_ptr = SafeGetTensorPtr<scalar_t>(value);
+    p.query_ptr = phi::SafeGetTensorPtr<scalar_t>(query);
+    p.key_ptr = phi::SafeGetTensorPtr<scalar_t>(key);
+    p.value_ptr = phi::SafeGetTensorPtr<scalar_t>(value);
     p.logsumexp_ptr = is_test ? nullptr : logsumexp->data<float>();
     VLOG(3) << "logsumexp_ptr" << p.logsumexp_ptr;
 
@@ -134,19 +138,19 @@ void MemoryEfficientAttentionForwardKernel(
     if (KernelType::kNeedsOutputAccumulatorBuffer) {
       out_accum.Resize(output->dims());
       p.output_accum_ptr =
-          SafeAllocTensor<typename KernelType::output_accum_t, Context>(
+          phi::SafeAllocTensor<typename KernelType::output_accum_t, Context>(
               ctx, &out_accum);
       VLOG(3) << "output_accum_ptr " << p.output_accum_ptr;
     } else {
       p.output_accum_ptr = nullptr;
     }
-    p.output_ptr =
-        SafeAllocTensor<typename KernelType::output_t, Context>(ctx, output);
+    p.output_ptr = phi::SafeAllocTensor<typename KernelType::output_t, Context>(
+        ctx, output);
     VLOG(3) << "output_ptr " << p.output_ptr;
 
     if (cu_seqlens_q) {
-      p.seqstart_q_ptr = SafeGetTensorPtr<int32_t>(cu_seqlens_q);
-      p.seqstart_k_ptr = SafeGetTensorPtr<int32_t>(cu_seqlens_k);
+      p.seqstart_q_ptr = phi::SafeGetTensorPtr<int32_t>(cu_seqlens_q);
+      p.seqstart_k_ptr = phi::SafeGetTensorPtr<int32_t>(cu_seqlens_k);
       VLOG(3) << "seqstart_q_ptr " << p.seqstart_q_ptr;
     } else {
       p.seqstart_q_ptr = nullptr;
@@ -164,7 +168,7 @@ void MemoryEfficientAttentionForwardKernel(
         cu_seqlens_q ? cu_seqlens_q.get().dims()[0] - 1 : q_dims[0]);
     p.causal = causal;
     if (causal_diagonal) {
-      p.causal_diagonal_ptr = SafeGetTensorPtr<int32_t>(causal_diagonal);
+      p.causal_diagonal_ptr = phi::SafeGetTensorPtr<int32_t>(causal_diagonal);
     } else {
       p.causal_diagonal_ptr = nullptr;
     }
@@ -172,7 +176,7 @@ void MemoryEfficientAttentionForwardKernel(
 
     p.seqlen_k_ptr = nullptr;
     if (seqlen_k) {
-      p.seqlen_k_ptr = SafeGetTensorPtr<int32_t>(seqlen_k);
+      p.seqlen_k_ptr = phi::SafeGetTensorPtr<int32_t>(seqlen_k);
     } else {
       p.seqlen_k_ptr = nullptr;
     }
@@ -197,7 +201,7 @@ void MemoryEfficientAttentionForwardKernel(
     PD_MEA_CHECK_OVERFLOW(p.o_strideM, DimStride(output->dims(), 1));
 
     if (bias) {
-      p.attn_bias_ptr = SafeGetTensorPtr<scalar_t>(bias);
+      p.attn_bias_ptr = phi::SafeGetTensorPtr<scalar_t>(bias);
       PD_MEA_CHECK_OVERFLOW(
           p.bias_strideB,
           GetMemoryEfficientBiasStrideB(bias.get().dims(), q_dims, k_dims));
@@ -215,7 +219,8 @@ void MemoryEfficientAttentionForwardKernel(
     seed_dims[0] = 2;
     seed_and_offset->Resize(seed_dims);
     ctx.template HostAlloc<int64_t>(seed_and_offset);
-    int64_t* seed_and_offset_ptr = SafeGetTensorPtr<int64_t>(seed_and_offset);
+    int64_t* seed_and_offset_ptr =
+        phi::SafeGetTensorPtr<int64_t>(seed_and_offset);
 
     auto gen = ctx.GetGenerator();
     uint64_t inc = query.dims()[0] * query.dims()[2] * 32;
@@ -254,10 +259,10 @@ void MemoryEfficientAttentionForwardKernel(
                 ctx.stream()>>>(p);
   };
   dispatch_cutlass_forward<T>(ctx, launchKernel);
-  PADDLE_ENFORCE_EQ(kernel_launched,
-                    true,
-                    paddle::platform::errors::InvalidArgument(
-                        "the kernel should not be launched"));
+  PADDLE_ENFORCE_EQ(
+      kernel_launched,
+      true,
+      phi::errors::InvalidArgument("the kernel should not be launched"));
 }
 
 }  // namespace cutlass_internal
diff --git a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention_backward.cu b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention_backward.cu
index 00d09cf00a8..2e16f9db347 100644
--- a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention_backward.cu
+++ b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention_backward.cu
@@ -15,16 +15,16 @@
 #include "paddle/fluid/memory/malloc.h"
 #include "paddle/fluid/platform/errors.h"
 #include "paddle/phi/api/include/tensor_operants.h"
+#include "paddle/phi/common/memory_utils.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/autogen/memory_efficient_attention.h"
-#include "paddle/phi/kernels/fusion/cutlass/memory_efficient_attention_utils.h"
-
 #include "paddle/phi/kernels/cast_kernel.h"
 #include "paddle/phi/kernels/cum_kernel.h"
 #include "paddle/phi/kernels/elementwise_add_kernel.h"
 #include "paddle/phi/kernels/elementwise_multiply_kernel.h"
 #include "paddle/phi/kernels/funcs/get_pad_lse.cu.h"
+#include "paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/autogen/memory_efficient_attention.h"
+#include "paddle/phi/kernels/fusion/cutlass/memory_efficient_attention_utils.h"
 #include "paddle/phi/kernels/matmul_kernel.h"
 #include "paddle/phi/kernels/reduce_sum_kernel.h"
 #include "paddle/phi/kernels/reshape_kernel.h"
@@ -34,6 +34,8 @@ namespace phi {
 namespace fusion {
 namespace cutlass_internal {
 
+using gemm_kernel_utils::getMaximumSharedMemoryPerBlockKb;
+
 template <typename T, typename Context>
 void MemoryEfficientAttentionBackwardKernel(
     const Context& ctx,
@@ -387,9 +389,9 @@ void MemoryEfficientAttentionBackwardKernel(
     VLOG(3) << "delta has been set" << delta.data();
 
     typename KernelType::Params p;
-    p.query_ptr = SafeGetTensorPtr<scalar_t>(query);
-    p.key_ptr = SafeGetTensorPtr<scalar_t>(key);
-    p.value_ptr = SafeGetTensorPtr<scalar_t>(value);
+    p.query_ptr = phi::SafeGetTensorPtr<scalar_t>(query);
+    p.key_ptr = phi::SafeGetTensorPtr<scalar_t>(key);
+    p.value_ptr = phi::SafeGetTensorPtr<scalar_t>(value);
 
     bool force_pad_inf = (compute_capacity == 75);
     const std::string data_format = "NCHW";
@@ -400,14 +402,14 @@ void MemoryEfficientAttentionBackwardKernel(
                                        32,
                                        data_format,
                                        force_pad_inf);
-    p.logsumexp_ptr = SafeGetTensorPtr<float>(padded_lse);
+    p.logsumexp_ptr = phi::SafeGetTensorPtr<float>(padded_lse);
     VLOG(3) << "logsumexp_ptr" << p.logsumexp_ptr;
-    p.output_ptr = SafeGetTensorPtr<scalar_t>(output);
-    p.grad_output_ptr = SafeGetTensorPtr<scalar_t>(output_grad);
-    p.grad_query_ptr = SafeAllocTensor<scalar_t, Context>(ctx, query_grad);
-    p.grad_key_ptr = SafeAllocTensor<scalar_t, Context>(ctx, key_grad);
-    p.grad_value_ptr = SafeAllocTensor<scalar_t, Context>(ctx, value_grad);
-    p.delta_ptr = SafeGetTensorPtr<float>(delta);
+    p.output_ptr = phi::SafeGetTensorPtr<scalar_t>(output);
+    p.grad_output_ptr = phi::SafeGetTensorPtr<scalar_t>(output_grad);
+    p.grad_query_ptr = phi::SafeAllocTensor<scalar_t, Context>(ctx, query_grad);
+    p.grad_key_ptr = phi::SafeAllocTensor<scalar_t, Context>(ctx, key_grad);
+    p.grad_value_ptr = phi::SafeAllocTensor<scalar_t, Context>(ctx, value_grad);
+    p.delta_ptr = phi::SafeGetTensorPtr<float>(delta);
     PD_MEA_CHECK_OVERFLOW(p.head_dim, q_dims[3]);
     PD_MEA_CHECK_OVERFLOW(p.head_dim_value, v_dims[3]);
 
@@ -427,8 +429,8 @@ void MemoryEfficientAttentionBackwardKernel(
     VLOG(3) << "p.scale" << p.scale;
 
     if (cu_seqlens_q) {
-      p.cu_seqlens_q_ptr = SafeGetTensorPtr<int32_t>(cu_seqlens_q);
-      p.cu_seqlens_k_ptr = SafeGetTensorPtr<int32_t>(cu_seqlens_k);
+      p.cu_seqlens_q_ptr = phi::SafeGetTensorPtr<int32_t>(cu_seqlens_q);
+      p.cu_seqlens_k_ptr = phi::SafeGetTensorPtr<int32_t>(cu_seqlens_k);
       VLOG(3) << "p.cu_seqlens_q_ptr" << p.cu_seqlens_q_ptr;
     }
 
@@ -483,7 +485,7 @@ void MemoryEfficientAttentionBackwardKernel(
     PD_MEA_CHECK_OVERFLOW(p.delta_strideB, DimStride(delta.dims(), 0));
 
     if (bias) {
-      p.bias_ptr = SafeGetTensorPtr<scalar_t>(bias);
+      p.bias_ptr = phi::SafeGetTensorPtr<scalar_t>(bias);
       PD_MEA_CHECK_OVERFLOW(
           p.bias_strideB,
           GetMemoryEfficientBiasStrideB(bias.get().dims(), q_dims, k_dims));
@@ -491,7 +493,8 @@ void MemoryEfficientAttentionBackwardKernel(
       PD_MEA_CHECK_OVERFLOW(p.bias_strideM, k_dims[1]);
       VLOG(3) << "p.bias_ptr" << p.bias_ptr;
       if (bias_grad) {
-        p.grad_bias_ptr = SafeAllocTensor<scalar_t, Context>(ctx, bias_grad);
+        p.grad_bias_ptr =
+            phi::SafeAllocTensor<scalar_t, Context>(ctx, bias_grad);
         PD_MEA_CHECK_OVERFLOW(p.gB_strideB, q_dims[2] * q_dims[1] * k_dims[1]);
         PD_MEA_CHECK_OVERFLOW(p.gB_strideH, q_dims[1] * k_dims[1]);
         PD_MEA_CHECK_OVERFLOW(p.gB_strideM, k_dims[1]);
@@ -504,7 +507,8 @@ void MemoryEfficientAttentionBackwardKernel(
       p.grad_bias_ptr = nullptr;
     }
     if (dropout_p != 0) {
-      int64_t* seed_and_offset_ptr = SafeGetTensorPtr<int64_t>(seed_and_offset);
+      int64_t* seed_and_offset_ptr =
+          phi::SafeGetTensorPtr<int64_t>(seed_and_offset);
       p.seed = (uint64_t)seed_and_offset_ptr[0];
       p.offset = (uint64_t)seed_and_offset_ptr[1];
       p.dropout_prob = dropout_p;
@@ -514,9 +518,9 @@ void MemoryEfficientAttentionBackwardKernel(
     }
 
     int64_t size_bytes = p.workspace_size();
-    paddle::memory::AllocationPtr temp_workspace{nullptr};
+    phi::Allocator::AllocationPtr temp_workspace{nullptr};
     VLOG(3) << "size_bytes " << size_bytes;
-    temp_workspace = paddle::memory::Alloc(
+    temp_workspace = phi::memory_utils::Alloc(
         ctx.GetPlace(),
         size_bytes,
         phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
diff --git a/paddle/phi/kernels/gpu/eigvalsh_kernel.cu b/paddle/phi/kernels/gpu/eigvalsh_kernel.cu
index a075dad6cdd..9671cc9f3e8 100644
--- a/paddle/phi/kernels/gpu/eigvalsh_kernel.cu
+++ b/paddle/phi/kernels/gpu/eigvalsh_kernel.cu
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#ifndef PADDLE_WITH_HIP
+
 #include "paddle/phi/kernels/eigvalsh_kernel.h"
 
 #include "paddle/phi/backends/gpu/gpu_context.h"
@@ -29,3 +31,5 @@ PD_REGISTER_KERNEL(eigvalsh,  // cuda_only
                    phi::dtype::complex<double>) {
   kernel->InputAt(1).SetDataType(phi::dtype::ToReal(kernel_key.dtype()));
 }
+
+#endif  // not PADDLE_WITH_HIP
diff --git a/paddle/phi/kernels/gpu/gelu_funcs.h b/paddle/phi/kernels/gpu/gelu_funcs.h
index a8f685c0ab0..06e789cc1b8 100644
--- a/paddle/phi/kernels/gpu/gelu_funcs.h
+++ b/paddle/phi/kernels/gpu/gelu_funcs.h
@@ -22,7 +22,7 @@
 #include "paddle/phi/core/flags.h"
 #include "paddle/phi/kernels/funcs/aligned_vector.h"
 
-DECLARE_bool(use_fast_math);
+PHI_DECLARE_bool(use_fast_math);
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/impl/isclose_kernel_impl.h b/paddle/phi/kernels/impl/isclose_kernel_impl.h
index 4d9d0cd7b86..de59cb0c32c 100644
--- a/paddle/phi/kernels/impl/isclose_kernel_impl.h
+++ b/paddle/phi/kernels/impl/isclose_kernel_impl.h
@@ -52,19 +52,6 @@ struct GetTensorValue<phi::CPUContext, T> {
   }
 };
 
-template <typename T>
-struct GetTensorValue<phi::GPUContext, T> {
-  T operator()(const phi::GPUContext& dev_ctx,
-               const DenseTensor& tensor) const {
-    const T* data = tensor.data<T>();
-    T value;
-    const auto gpu_place = dev_ctx.GetPlace();
-    memory_utils::Copy(
-        phi::CPUPlace(), &value, gpu_place, data, sizeof(T), dev_ctx.stream());
-    return value;
-  }
-};
-
 template <typename T>
 struct IscloseFunctor<phi::CPUContext, T> {
   void operator()(const phi::CPUContext& ctx,
@@ -127,6 +114,19 @@ __global__ void IscloseCUDAKernel(const T* in_data,
   }
 }
 
+template <typename T>
+struct GetTensorValue<phi::GPUContext, T> {
+  T operator()(const phi::GPUContext& dev_ctx,
+               const DenseTensor& tensor) const {
+    const T* data = tensor.data<T>();
+    T value;
+    const auto gpu_place = dev_ctx.GetPlace();
+    memory_utils::Copy(
+        phi::CPUPlace(), &value, gpu_place, data, sizeof(T), dev_ctx.stream());
+    return value;
+  }
+};
+
 template <typename T>
 struct IscloseFunctor<phi::GPUContext, T> {
   void operator()(const phi::GPUContext& dev_ctx,
diff --git a/paddle/phi/kernels/impl/slice_grad_kernel_impl.h b/paddle/phi/kernels/impl/slice_grad_kernel_impl.h
index 33ec8f8a841..ac2769e041e 100644
--- a/paddle/phi/kernels/impl/slice_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/slice_grad_kernel_impl.h
@@ -30,7 +30,7 @@ void LaunchEigenPadding(
     const DDim& in_dims,
     const DenseTensor* d_out,
     const DDim& out_dims,
-    const Eigen::array<std::pair<int64_t, int64_t>, D>& paddings) {
+    const std::array<std::pair<int64_t, int64_t>, D>& paddings) {
   auto& place = *context.eigen_device();
   auto d_in_t = EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(
       *d_input, in_dims);
@@ -40,7 +40,7 @@ void LaunchEigenPadding(
   if (d_input->numel() <= Eigen::NumTraits<int>::highest()) {
     // similar to tf.pad:
     // if element number less than INT_MAX, change the type of index to int
-    Eigen::array<std::pair<int, int>, D> paddings_32bit;
+    std::array<std::pair<int, int>, D> paddings_32bit;
     for (size_t i = 0; i < D; i++) {
       paddings_32bit[i] = std::make_pair(paddings[i].first, paddings[i].second);
     }
@@ -63,7 +63,7 @@ void EigenPaddingCompute(
     const DDim& in_dims,
     const DenseTensor* d_out,
     const DDim& out_dims,
-    const Eigen::array<std::pair<int64_t, int64_t>, D>& paddings) {
+    const std::array<std::pair<int64_t, int64_t>, D>& paddings) {
   if (D <= 3) {
     // if dimension less than 3, cannot reduce dimension
     LaunchEigenPadding<T, Context, D>(
@@ -97,7 +97,7 @@ void EigenPaddingCompute(
         // only last dimension need padding,
         // reshape the dimension of tensor in 2: [preceding, padding]
         std::vector<int64_t> in_tore_shape(2, 1), out_tore_shape(2, 1);
-        Eigen::array<std::pair<int64_t, int64_t>, 2> reshaped_padding;
+        std::array<std::pair<int64_t, int64_t>, 2> reshaped_padding;
 
         // first dimension is the accumulate of preceding dimension
         for (int i = 0; i < pad_dim; i++) {
@@ -119,18 +119,18 @@ void EigenPaddingCompute(
         reshaped_padding[1].first = paddings[pad_dim].first;
         reshaped_padding[1].second = paddings[pad_dim].second;
 
-        LaunchEigenPadding<T, Context>(context,
-                                       d_input,
-                                       reshaped_in_dims,
-                                       d_out,
-                                       reshaped_out_dims,
-                                       reshaped_padding);
+        LaunchEigenPadding<T, Context, 2>(context,
+                                          d_input,
+                                          reshaped_in_dims,
+                                          d_out,
+                                          reshaped_out_dims,
+                                          reshaped_padding);
       } else if (pad_dim == 0) {
         // only first dimension need padding,
         // reshape the dimension of tensor in 2: [padding, succeeding]
         // similar to (D - 1)
         std::vector<int64_t> in_tore_shape(2, 1), out_tore_shape(2, 1);
-        Eigen::array<std::pair<int64_t, int64_t>, 2> reshaped_padding;
+        std::array<std::pair<int64_t, int64_t>, 2> reshaped_padding;
 
         // first dimension is the padding dimension
         in_tore_shape[0] = in_dims[pad_dim];
@@ -163,7 +163,7 @@ void EigenPaddingCompute(
         // reshape the dimension of tensor in 3:
         // [preceding, padding, succeeding]
         std::vector<int64_t> in_tore_shape(3, 1), out_tore_shape(3, 1);
-        Eigen::array<std::pair<int64_t, int64_t>, 3> reshaped_padding;
+        std::array<std::pair<int64_t, int64_t>, 3> reshaped_padding;
 
         // first dimension is the accumulate of preceding dimension
         for (int i = 0; i < pad_dim; i++) {
@@ -261,7 +261,7 @@ void SliceGradCompute(const Context& ctx,
     offsets[axis] = start;
   }
 
-  Eigen::array<std::pair<int64_t, int64_t>, D> paddings;
+  std::array<std::pair<int64_t, int64_t>, D> paddings;
   for (size_t i = 0; i < paddings.size(); ++i) {
     paddings[i].first = offsets[i];
     paddings[i].second = (in_dims[i] - out_dims[i]) - offsets[i];
diff --git a/paddle/phi/kernels/transfer_layout_kernel.cc b/paddle/phi/kernels/transfer_layout_kernel.cc
index 3dec79b13b5..84b978436e1 100644
--- a/paddle/phi/kernels/transfer_layout_kernel.cc
+++ b/paddle/phi/kernels/transfer_layout_kernel.cc
@@ -112,6 +112,7 @@ void TransferLayoutGeneral(const Context& dev_ctx,
     }
   }
 #endif
+
   PD_VISIT_ALL_TYPES(x.dtype(), "CastDataLayout", ([&] {
                        CastDataLayout<data_t, Context>(dev_ctx, x, axis, out);
                      }));
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 697b74c39a4..2dd6ddd550a 100644
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -3568,6 +3568,7 @@ function run_setup_mac(){
             if [ -d "/Library/Frameworks/Python.framework/Versions/3.7" ]; then
                 export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.7/lib/
                 export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:/Library/Frameworks/Python.framework/Versions/3.7/lib/
+                export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${PADDLE_ROOT}/build/third_party/install/lapack/lib
                 export PATH=/Library/Frameworks/Python.framework/Versions/3.7/bin/:${PATH}
                 #after changing "PYTHON_LIBRARY:FILEPATH" to "PYTHON_LIBRARY" ,we can use export
                 export PYTHON_EXECUTABLE=/Library/Frameworks/Python.framework/Versions/3.7/bin/python3
@@ -3581,6 +3582,7 @@ function run_setup_mac(){
             if [ -d "/Library/Frameworks/Python.framework/Versions/3.8" ]; then
                 export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.8/lib/
                 export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:/Library/Frameworks/Python.framework/Versions/3.8/lib/
+                export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${PADDLE_ROOT}/build/third_party/install/lapack/lib
                 export PATH=/Library/Frameworks/Python.framework/Versions/3.8/bin/:${PATH}
                 #after changing "PYTHON_LIBRARY:FILEPATH" to "PYTHON_LIBRARY" ,we can use export
                 export PYTHON_EXECUTABLE=/Library/Frameworks/Python.framework/Versions/3.8/bin/python3
@@ -3594,6 +3596,7 @@ function run_setup_mac(){
             if [ -d "/Library/Frameworks/Python.framework/Versions/3.9" ]; then
                 export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.9/lib/
                 export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:/Library/Frameworks/Python.framework/Versions/3.9/lib/
+                export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${PADDLE_ROOT}/build/third_party/install/lapack/lib
                 export PATH=/Library/Frameworks/Python.framework/Versions/3.9/bin/:${PATH}
                 #after changing "PYTHON_LIBRARY:FILEPATH" to "PYTHON_LIBRARY" ,we can use export
                 export PYTHON_EXECUTABLE=/Library/Frameworks/Python.framework/Versions/3.9/bin/python3
@@ -3607,6 +3610,7 @@ function run_setup_mac(){
             if [ -d "/Library/Frameworks/Python.framework/Versions/3.10" ]; then
                 export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.10/lib/
                 export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:/Library/Frameworks/Python.framework/Versions/3.10/lib/
+                export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${PADDLE_ROOT}/build/third_party/install/lapack/lib
                 export PATH=/Library/Frameworks/Python.framework/Versions/3.10/bin/:${PATH}
                 #after changing "PYTHON_LIBRARY:FILEPATH" to "PYTHON_LIBRARY" ,we can use export
                 export PYTHON_EXECUTABLE=/Library/Frameworks/Python.framework/Versions/3.10/bin/python3
diff --git a/paddle/testing/CMakeLists.txt b/paddle/testing/CMakeLists.txt
index 2c345473193..4c5f3049f23 100644
--- a/paddle/testing/CMakeLists.txt
+++ b/paddle/testing/CMakeLists.txt
@@ -4,7 +4,7 @@ if(WITH_TESTING)
   set(paddle_gtest_main_deps
       device_context
       gtest
-      gflags
+      phi
       init
       memory
       phi_utils
diff --git a/paddle/utils/CMakeLists.txt b/paddle/utils/CMakeLists.txt
index e8de5c30bbb..e7d5282c2f2 100644
--- a/paddle/utils/CMakeLists.txt
+++ b/paddle/utils/CMakeLists.txt
@@ -3,11 +3,11 @@ add_subdirectory(string)
 cc_test(
   array_ref_test
   SRCS array_ref_test.cc
-  DEPS gtest gflags)
+  DEPS gtest phi)
 cc_test(
   small_vector_test
   SRCS small_vector_test.cc
-  DEPS gtest gflags)
+  DEPS gtest phi)
 cc_test(
   variant_test
   SRCS variant_test.cc
@@ -17,5 +17,5 @@ if(NOT ((NOT WITH_PYTHON) AND ON_INFER))
   cc_library(
     pybind_util
     SRCS pybind.cc
-    DEPS phi_tensor_raw flags)
+    DEPS phi)
 endif()
diff --git a/paddle/utils/string/CMakeLists.txt b/paddle/utils/string/CMakeLists.txt
index 89b95385eb1..ddfc8f96b2e 100644
--- a/paddle/utils/string/CMakeLists.txt
+++ b/paddle/utils/string/CMakeLists.txt
@@ -1,15 +1,15 @@
 cc_library(
   pretty_log
   SRCS pretty_log.cc
-  DEPS flags)
+  DEPS phi)
 cc_library(
   string_helper
   SRCS string_helper.cc
-  DEPS flags)
+  DEPS phi)
 cc_test(
   stringprintf_test
   SRCS printf_test.cc
-  DEPS gflags)
+  DEPS phi)
 cc_test(to_string_test SRCS to_string_test.cc)
 cc_test(split_test SRCS split_test.cc)
 cc_test(
diff --git a/python/env_dict.py.in b/python/env_dict.py.in
index d8ae5f9144b..f72ea07c602 100644
--- a/python/env_dict.py.in
+++ b/python/env_dict.py.in
@@ -10,6 +10,9 @@ env_dict={
     'CUDA_VERSION':'@CUDA_VERSION@',
     'WITH_PSLI':'@WITH_PSLI@',
     'FLUID_CORE_NAME':'@FLUID_CORE_NAME@',
+    'PHI_LIB':'@PHI_LIB@',
+    'PHI_NAME':'@PHI_NAME@',
+    'WITH_PHI_SHARED':'@WITH_PHI_SHARED@',
     'WARPCTC_LIBRARIES':'@WARPCTC_LIBRARIES@',
     'WARPRNNT_LIBRARIES':'@WARPRNNT_LIBRARIES@',
     'FLASHATTN_LIBRARIES':'@FLASHATTN_LIBRARIES@',
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index d4c50707cbe..b7bd5d5fa0e 100755
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -1134,14 +1134,6 @@ foreach(TEST_CINN_OPS ${TEST_CINN_OPS})
 endforeach()
 
 if(WITH_CINN AND WITH_TESTING)
-  set_tests_properties(
-    test_resnet50_with_cinn
-    PROPERTIES
-      LABELS
-      "RUN_TYPE=CINN"
-      ENVIRONMENT
-      FLAGS_allow_cinn_ops="conv2d;conv2d_grad;elementwise_add;elementwise_add_grad;relu;relu_grad;sum"
-  )
   set_tests_properties(
     test_parallel_executor_run_cinn
     PROPERTIES
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_run_cinn.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_run_cinn.py
index c598073f434..2ca34842f0b 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_run_cinn.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_run_cinn.py
@@ -123,9 +123,9 @@ class TestParallelExecutorRunCinn(unittest.TestCase):
         shutil.rmtree(self.tmpdir)
 
     def test_run_with_cinn(self):
-        cinn_losses = train(self.tmpdir, "paddle")
+        cinn_losses = np.array(train(self.tmpdir, "paddle")).flatten()
         set_cinn_flag(False)
-        pd_losses = train(self.tmpdir, "cinn")
+        pd_losses = np.array(train(self.tmpdir, "cinn")).flatten()
         np.testing.assert_allclose(
             cinn_losses, pd_losses, rtol=1e-05, atol=1e-05
         )
diff --git a/python/paddle/fluid/tests/unittests/test_resnet50_with_cinn.py b/python/paddle/fluid/tests/unittests/test_resnet50_with_cinn.py
deleted file mode 100644
index d262319eee8..00000000000
--- a/python/paddle/fluid/tests/unittests/test_resnet50_with_cinn.py
+++ /dev/null
@@ -1,149 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-import unittest
-
-import numpy as np
-
-import paddle
-from paddle.fluid import core
-
-paddle.enable_static()
-
-logging.basicConfig(
-    format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO
-)
-logger = logging.getLogger(__name__)
-
-
-def set_cinn_flag(val):
-    cinn_compiled = False
-    try:
-        paddle.set_flags({'FLAGS_use_cinn': val})
-        cinn_compiled = True
-    except ValueError:
-        logger.warning("The used paddle is not compiled with CINN.")
-    return cinn_compiled
-
-
-@unittest.skipIf(not set_cinn_flag(True), "Paddle is not compiled with CINN.")
-class TestResnet50Accuracy(unittest.TestCase):
-    def reader(self, limit):
-        for _ in range(limit):
-            yield {
-                'image': np.random.randint(
-                    0, 256, size=[32, 3, 224, 224]
-                ).astype('float32'),
-                'label': np.random.randint(0, 1000, size=[32]).astype('int64'),
-            }
-
-    def generate_random_data(self, loop_num=10):
-        feed = []
-        data = self.reader(loop_num)
-        for _ in range(loop_num):
-            feed.append(next(data))
-        return feed
-
-    def build_program(self, main_program, startup_program):
-        with paddle.static.program_guard(main_program, startup_program):
-            image = paddle.static.data(
-                name='image', shape=[32, 3, 224, 224], dtype='float32'
-            )
-            label = paddle.static.data(name='label', shape=[32], dtype='int64')
-
-            # TODO: stop_gradient slower training speed, need fix
-            image.stop_gradient = False
-
-            model = paddle.vision.models.resnet50()
-            prediction = model(image)
-
-            loss = paddle.nn.functional.cross_entropy(
-                input=prediction, label=label
-            )
-            loss = paddle.mean(loss)
-            adam = paddle.optimizer.Adam(learning_rate=0.001)
-            adam.minimize(loss)
-        return loss
-
-    def train(self, place, iters, feed, use_cinn=False, seed=1234):
-        np.random.seed(seed)
-        paddle.seed(seed)
-        if paddle.is_compiled_with_cuda():
-            paddle.set_flags({'FLAGS_cudnn_deterministic': 1})
-        set_cinn_flag(use_cinn)
-
-        startup_program = paddle.static.Program()
-        main_program = paddle.static.Program()
-
-        loss = self.build_program(main_program, startup_program)
-        exe = paddle.static.Executor(place)
-
-        compiled_prog = paddle.static.CompiledProgram(main_program)
-        loss_vals = []
-        scope = paddle.static.Scope()
-
-        with paddle.static.scope_guard(scope):
-            exe.run(startup_program)
-            for step in range(iters):
-                loss_v = exe.run(
-                    compiled_prog,
-                    feed=feed[step],
-                    fetch_list=[loss],
-                    return_numpy=True,
-                )
-                loss_vals.append(loss_v[0])
-        return loss_vals
-
-    def test_check_resnet50_accuracy(self):
-        place = (
-            paddle.CUDAPlace(0)
-            if paddle.is_compiled_with_cuda()
-            else paddle.CPUPlace()
-        )
-
-        loop_num = 10
-        feed = self.generate_random_data(loop_num)
-
-        loss_c = self.train(place, loop_num, feed, use_cinn=True)
-        loss_p = self.train(place, loop_num, feed, use_cinn=False)
-        print("Losses of CINN:")
-        print(loss_c)
-        print("Losses of Paddle")
-        print(loss_p)
-        np.testing.assert_allclose(loss_c, loss_p, rtol=1e-05, atol=1e-05)
-
-    def test_check_resnet50_accuracy_with_composite(self):
-        place = (
-            paddle.CUDAPlace(0)
-            if paddle.is_compiled_with_cuda()
-            else paddle.CPUPlace()
-        )
-
-        loop_num = 10
-        feed = self.generate_random_data(loop_num)
-        core._set_prim_backward_enabled(True)
-        core._add_skip_comp_ops("batch_norm")
-        loss_c = self.train(place, loop_num, feed, use_cinn=True)
-        core._set_prim_backward_enabled(False)
-        loss_p = self.train(place, loop_num, feed, use_cinn=True)
-        print("Losses of Composite + CINN:")
-        print(loss_c)
-        print("Losses of CINN: ")
-        print(loss_p)
-        np.testing.assert_allclose(loss_c, loss_p, rtol=1e-05, atol=1e-05)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/setup.py.in b/python/setup.py.in
index 89acf5fe09d..9a6517a7d55 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -561,7 +561,11 @@ package_dir={
 libs_path='${PADDLE_BINARY_DIR}/python/paddle/libs'
 
 package_data['paddle.libs']= []
-package_data['paddle.libs']=[
+if('${WITH_PHI_SHARED}' == 'ON'):
+    package_data['paddle.libs'] = [('libphi' if os.name != 'nt' else 'phi') + ext_name]
+    shutil.copy('${PHI_LIB}', libs_path)
+
+package_data['paddle.libs']+=[
     ('libwarpctc' if os.name != 'nt' else 'warpctc') + ext_name,
     ('libwarprnnt' if os.name != 'nt' else 'warprnnt') + ext_name,
 ]
@@ -722,8 +726,14 @@ if '${CMAKE_BUILD_TYPE}' == 'Release':
         if "@APPLE@" == "1":
             commands = ["install_name_tool -id '@loader_path/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/${FLUID_CORE_NAME}" + '.so']
             commands.append("install_name_tool -add_rpath '@loader_path/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/${FLUID_CORE_NAME}" + '.so')
+            if('${WITH_PHI_SHARED}' == 'ON'):
+                # change rpath of phi.ext for loading 3rd party lib
+                commands.append("install_name_tool -add_rpath '@loader_path' ${PADDLE_BINARY_DIR}/python/paddle/libs/${PHI_NAME}")
         else:
             commands = ["patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/${FLUID_CORE_NAME}" + '.so']
+            if('${WITH_PHI_SHARED}' == 'ON'):
+                # change rpath of phi.ext for loading 3rd party lib
+                commands.append("patchelf --set-rpath '$ORIGIN' ${PADDLE_BINARY_DIR}/python/paddle/libs/${PHI_NAME}")
         # The sw_64 not suppot patchelf, so we just disable that.
         if platform.machine() != 'sw_64' and platform.machine() != 'mips64':
             for command in commands:
diff --git a/setup.py b/setup.py
index e10308e1dff..f8858321ae6 100644
--- a/setup.py
+++ b/setup.py
@@ -966,7 +966,14 @@ def get_package_data_and_package_dir():
     # put all thirdparty libraries in paddle.libs
     libs_path = paddle_binary_dir + '/python/paddle/libs'
     package_data['paddle.libs'] = []
-    package_data['paddle.libs'] = [
+
+    if env_dict.get("WITH_PHI_SHARED") == "ON":
+        package_data['paddle.libs'] = [
+            ('libphi' if os.name != 'nt' else 'phi') + ext_suffix
+        ]
+        shutil.copy(env_dict.get("PHI_LIB"), libs_path)
+
+    package_data['paddle.libs'] += [
         ('libwarpctc' if os.name != 'nt' else 'warpctc') + ext_suffix,
         ('libwarprnnt' if os.name != 'nt' else 'warprnnt') + ext_suffix,
     ]
@@ -1204,6 +1211,13 @@ def get_package_data_and_package_dir():
                     + env_dict.get("FLUID_CORE_NAME")
                     + '.so'
                 )
+                if env_dict.get("WITH_PHI_SHARED") == "ON":
+                    commands.append(
+                        "install_name_tool -add_rpath '@loader_path' "
+                        + env_dict.get("PADDLE_BINARY_DIR")
+                        + '/python/paddle/libs/'
+                        + env_dict.get("PHI_NAME")
+                    )
             else:
                 commands = [
                     "patchelf --set-rpath '$ORIGIN/../libs/' "
@@ -1212,6 +1226,13 @@ def get_package_data_and_package_dir():
                     + env_dict.get("FLUID_CORE_NAME")
                     + '.so'
                 ]
+                if env_dict.get("WITH_PHI_SHARED") == "ON":
+                    commands.append(
+                        "patchelf --set-rpath '$ORIGIN' "
+                        + env_dict.get("PADDLE_BINARY_DIR")
+                        + '/python/paddle/libs/'
+                        + env_dict.get("PHI_NAME")
+                    )
             # The sw_64 not suppot patchelf, so we just disable that.
             if platform.machine() != 'sw_64' and platform.machine() != 'mips64':
                 for command in commands:
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 258ea9025dd..975446b6002 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -168,6 +168,7 @@ if(${len} GREATER_EQUAL 1)
       add_executable(${test_name} ${test_src})
       target_link_libraries(${test_name} paddle_gtest_main_new)
       target_link_libraries(${test_name} $<TARGET_LINKER_FILE:${paddle_lib}>)
+      target_link_libraries(${test_name} $<TARGET_LINKER_FILE:phi>)
       add_dependencies(${test_name} ${paddle_lib} paddle_gtest_main_new)
       if(WITH_GPU)
         target_link_libraries(${test_name} ${CUDA_CUDART_LIBRARY}
@@ -177,8 +178,10 @@ if(${len} GREATER_EQUAL 1)
         target_link_libraries(${test_name} ${ROCM_HIPRTC_LIB})
       endif()
       if(APPLE)
-        target_link_libraries(${test_name}
-                              "-Wl,-rpath,$<TARGET_FILE_DIR:${paddle_lib}>")
+        target_link_libraries(
+          ${test_name}
+          "-Wl,-rpath,$<TARGET_FILE_DIR:${paddle_lib}> -Wl,-rpath,$<TARGET_FILE_DIR:phi>"
+        )
       endif()
       if(NOT ((NOT WITH_PYTHON) AND ON_INFER))
         target_link_libraries(${test_name} ${PYTHON_LIBRARIES})
diff --git a/test/cpp/eager/CMakeLists.txt b/test/cpp/eager/CMakeLists.txt
index d8d3a6304cf..7747a42d250 100644
--- a/test/cpp/eager/CMakeLists.txt
+++ b/test/cpp/eager/CMakeLists.txt
@@ -1,12 +1,10 @@
 set(eager_deps
-    phi_api
-    phi_dygraph_api
+    phi
     hook_utils
     tensor_utils
     utils
     global_utils
     backward
-    phi_tensor
     tracer
     layer
     autograd_meta
diff --git a/test/cpp/fluid/CMakeLists.txt b/test/cpp/fluid/CMakeLists.txt
index 590816d1b5e..b7936fadaad 100644
--- a/test/cpp/fluid/CMakeLists.txt
+++ b/test/cpp/fluid/CMakeLists.txt
@@ -42,7 +42,7 @@ cc_test(
   test_common_infer_shape_functions
   SRCS test_common_infer_shape_functions.cc
   DEPS common_infer_shape_functions ${COMMON_OP_DEPS} activation_op
-       elementwise_add_op softmax generated_static_op)
+       elementwise_add_op phi generated_static_op)
 cc_test(
   gather_test
   SRCS gather_test.cc
@@ -54,7 +54,7 @@ cc_test(
 cc_test(
   scatter_test
   SRCS scatter_test.cc
-  DEPS tensor math_function)
+  DEPS tensor phi)
 cc_test(
   beam_search_decode_op_test
   SRCS beam_search_decode_op_test.cc
@@ -72,7 +72,7 @@ if(WITH_GPU)
   nv_test(
     dropout_op_test
     SRCS dropout_op_test.cc
-    DEPS dropout_op tensor generator)
+    DEPS dropout_op tensor phi)
   nv_test(
     test_leaky_relu_grad_grad_functor
     SRCS test_leaky_relu_grad_grad_functor.cc
@@ -81,12 +81,12 @@ if(WITH_GPU)
   nv_test(
     feed_forward_test
     SRCS feed_forward_test.cu
-    DEPS elementwise_add_op matmul_op tensor generator)
+    DEPS elementwise_add_op matmul_op tensor phi)
 elseif(WITH_ROCM)
   hip_test(
     dropout_op_test
     SRCS dropout_op_test.cc
-    DEPS dropout_op tensor generator)
+    DEPS dropout_op tensor phi)
   hip_test(
     test_leaky_relu_grad_grad_functor
     SRCS test_leaky_relu_grad_grad_functor.cc
diff --git a/test/cpp/fluid/benchmark/CMakeLists.txt b/test/cpp/fluid/benchmark/CMakeLists.txt
index a5a799d71da..9111dfe2ff3 100644
--- a/test/cpp/fluid/benchmark/CMakeLists.txt
+++ b/test/cpp/fluid/benchmark/CMakeLists.txt
@@ -11,7 +11,7 @@ cc_test(
        scope
        ${GLOB_OP_LIB}
        ${GLOB_OPERATOR_DEPS}
-       eigen_function)
+       phi)
 
 if(WITH_ONNXRUNTIME AND WIN32)
   # Copy onnxruntime for some c++ test in Windows, since the test will
diff --git a/test/cpp/fluid/cinn/CMakeLists.txt b/test/cpp/fluid/cinn/CMakeLists.txt
index f396d1c58cc..2553457a9e1 100644
--- a/test/cpp/fluid/cinn/CMakeLists.txt
+++ b/test/cpp/fluid/cinn/CMakeLists.txt
@@ -1,46 +1,49 @@
-cc_test_old(
-  cinn_launch_context_test
-  SRCS
-  cinn_launch_context_test.cc
-  DEPS
-  ddim
-  lod_tensor
-  scope
-  proto_desc
-  graph
-  cinn_launch_context
-  cinn_instruction_run_op
-  cinn)
-target_link_libraries(cinn_launch_context_test ${PYTHON_LIBRARIES})
-set_tests_properties(cinn_launch_context_test PROPERTIES LABELS "RUN_TYPE=CINN")
+if(WITH_TESTING)
+  cc_test_old(
+    cinn_launch_context_test
+    SRCS
+    cinn_launch_context_test.cc
+    DEPS
+    phi
+    lod_tensor
+    scope
+    proto_desc
+    graph
+    cinn_launch_context
+    cinn_instruction_run_op
+    cinn)
+  target_link_libraries(cinn_launch_context_test ${PYTHON_LIBRARIES})
+  set_tests_properties(cinn_launch_context_test PROPERTIES LABELS
+                                                           "RUN_TYPE=CINN")
 
-set(CINN_RUN_ENVIRONMENT
-    "OMP_NUM_THREADS=1;runtime_include_dir=${PADDLE_BINARY_DIR}/third_party/CINN/src/external_cinn/cinn/runtime/cuda"
-)
-# cc_test_old(
-#   cinn_launch_op_test
-#   SRCS
-#   cinn_launch_op_test.cc
-#   DEPS
-#   cinn_compiler
-#   cinn_launch_op
-#   cinn_instruction_run_op
-#   elementwise_add_op
-#   gflags)
-# set_tests_properties(
-#   cinn_launch_op_test PROPERTIES LABELS "RUN_TYPE=CINN" ENVIRONMENT
-#                                  "${CINN_RUN_ENVIRONMENT}")
+  set(CINN_RUN_ENVIRONMENT
+      "OMP_NUM_THREADS=1;runtime_include_dir=${PADDLE_BINARY_DIR}/third_party/CINN/src/external_cinn/cinn/runtime/cuda"
+  )
+  # cc_test_old(
+  #   cinn_launch_op_test
+  #   SRCS
+  #   cinn_launch_op_test.cc
+  #   DEPS
+  #   cinn_compiler
+  #   cinn_launch_op
+  #   cinn_instruction_run_op
+  #   elementwise_add_op
+  #   gflags)
+  # set_tests_properties(
+  #   cinn_launch_op_test PROPERTIES LABELS "RUN_TYPE=CINN" ENVIRONMENT
+  #                                  "${CINN_RUN_ENVIRONMENT}")
 
-cc_test_old(
-  cinn_instruction_run_op_test
-  SRCS
-  cinn_instruction_run_op_test.cc
-  DEPS
-  cinn_compiler
-  cinn_launch_op
-  cinn_instruction_run_op
-  elementwise_add_op)
-target_link_libraries(cinn_instruction_run_op_test ${PYTHON_LIBRARIES})
-set_tests_properties(
-  cinn_instruction_run_op_test PROPERTIES LABELS "RUN_TYPE=CINN" ENVIRONMENT
-                                          "${CINN_RUN_ENVIRONMENT}")
+  cc_test_old(
+    cinn_instruction_run_op_test
+    SRCS
+    cinn_instruction_run_op_test.cc
+    DEPS
+    cinn_compiler
+    cinn_launch_op
+    cinn_instruction_run_op
+    elementwise_add_op)
+  target_link_libraries(cinn_instruction_run_op_test ${PYTHON_LIBRARIES})
+  set_tests_properties(
+    cinn_instruction_run_op_test PROPERTIES LABELS "RUN_TYPE=CINN" ENVIRONMENT
+                                            "${CINN_RUN_ENVIRONMENT}")
+endif()
diff --git a/test/cpp/fluid/fused/CMakeLists.txt b/test/cpp/fluid/fused/CMakeLists.txt
index ff239f2f0c6..6529e13c90c 100644
--- a/test/cpp/fluid/fused/CMakeLists.txt
+++ b/test/cpp/fluid/fused/CMakeLists.txt
@@ -15,7 +15,7 @@ if(WITH_GPU OR WITH_ROCM)
            dropout_op
            generated_op
            device_context
-           generator
+           phi
            memory)
     nv_test(
       test_fused_dropout_act_bias
@@ -25,7 +25,7 @@ if(WITH_GPU OR WITH_ROCM)
            dropout_op
            generated_op
            device_context
-           generator
+           phi
            memory)
     nv_test(
       test_fused_layernorm_residual_dropout_bias
@@ -35,7 +35,7 @@ if(WITH_GPU OR WITH_ROCM)
            dropout_op
            generated_op
            device_context
-           generator
+           phi
            memory)
   endif()
   # resnet_unit needs cudnn 8.0 above
@@ -44,15 +44,11 @@ if(WITH_GPU OR WITH_ROCM)
       test_cudnn_norm_conv
       SRCS cudnn_norm_conv_test.cc
       DEPS conv_op
-           blas
-           im2col
-           vol2col
            depthwise_conv
-           eigen_function
            tensor
            op_registry
            device_context
-           generator
+           phi
            memory)
     cc_test(
       test_cudnn_bn_add_relu
@@ -62,7 +58,7 @@ if(WITH_GPU OR WITH_ROCM)
            tensor
            op_registry
            device_context
-           generator
+           phi
            memory)
   endif()
 endif()
diff --git a/test/cpp/fluid/math/CMakeLists.txt b/test/cpp/fluid/math/CMakeLists.txt
index cbe53b0828c..1edc2f25e68 100644
--- a/test/cpp/fluid/math/CMakeLists.txt
+++ b/test/cpp/fluid/math/CMakeLists.txt
@@ -1,15 +1,15 @@
 cc_test(
   selected_rows_functor_test
   SRCS selected_rows_functor_test.cc
-  DEPS allocator selected_rows_functor)
+  DEPS allocator phi)
 cc_test(
   im2col_test
   SRCS im2col_test.cc
-  DEPS im2col)
+  DEPS phi)
 cc_test(
   vol2col_test
   SRCS vol2col_test.cc
-  DEPS vol2col)
+  DEPS phi)
 cc_test(
   beam_search_test
   SRCS beam_search_test.cc
@@ -18,13 +18,13 @@ if(WITH_GPU)
   nv_test(
     selected_rows_functor_gpu_test
     SRCS selected_rows_functor_test.cu.cc
-    DEPS selected_rows_functor math_function)
+    DEPS phi)
 endif()
 if(WITH_ROCM)
   hip_test(
     selected_rows_functor_gpu_test
     SRCS selected_rows_functor_test.cu.cc
-    DEPS selected_rows_functor math_function)
+    DEPS phi)
 endif()
 cc_test(
   concat_test
diff --git a/test/cpp/fluid/mkldnn/CMakeLists.txt b/test/cpp/fluid/mkldnn/CMakeLists.txt
index dae56ea5eb6..d08e30b346a 100644
--- a/test/cpp/fluid/mkldnn/CMakeLists.txt
+++ b/test/cpp/fluid/mkldnn/CMakeLists.txt
@@ -4,7 +4,7 @@ cc_test(
   DEPS op_registry
        elementwise_add_op
        activation_op
-       softmax
+       phi
        scope
        device_context
        enforce
@@ -17,9 +17,7 @@ set(TEST_MKLDNN_CACHING_DEPS
     elementwise_add_op
     activation_op
     conv_op
-    im2col
-    vol2col
-    softmax
+    phi
     scope
     device_context
     enforce
@@ -44,7 +42,7 @@ cc_test_old(
   crop_op
   activation_op
   generated_op
-  pooling
+  phi
   transpose_op
   fused_transpose_op
   scope
diff --git a/test/cpp/fluid/pscore/CMakeLists.txt b/test/cpp/fluid/pscore/CMakeLists.txt
index c195c6c7975..c19df6b4696 100644
--- a/test/cpp/fluid/pscore/CMakeLists.txt
+++ b/test/cpp/fluid/pscore/CMakeLists.txt
@@ -68,7 +68,7 @@ cc_test_old(
   scope
   proto_desc
   generated_op
-  eigen_function)
+  phi)
 
 set_source_files_properties(
   send_and_recv_op_cpu_test.cc PROPERTIES COMPILE_FLAGS
@@ -85,7 +85,7 @@ cc_test_old(
   send_and_recv_op
   ${RPC_DEPS}
   ${DISTRIBUTE_DEPS}
-  eigen_function)
+  phi)
 
 set_source_files_properties(
   send_and_recv_op_gpu_test.cc PROPERTIES COMPILE_FLAGS
@@ -102,7 +102,7 @@ cc_test_old(
   send_and_recv_op
   ${RPC_DEPS}
   ${DISTRIBUTE_DEPS}
-  eigen_function)
+  phi)
 
 set_source_files_properties(
   heter_listen_and_server_test.cc PROPERTIES COMPILE_FLAGS
@@ -119,10 +119,10 @@ cc_test_old(
   heter_listen_and_serv_op
   ${RPC_DEPS}
   ${DISTRIBUTE_DEPS}
-  eigen_function)
+  phi)
 
 #set_source_files_properties(heter_cloud_comm_cpu_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-#cc_test(heter_cloud_comm_cpu_test SRCS heter_cloud_comm_cpu_test.cc DEPS executor scope proto_desc generated_static_op heter_listen_and_serv_op ${RPC_DEPS} ${DISTRIBUTE_DEPS} eigen_function)
+#cc_test(heter_cloud_comm_cpu_test SRCS heter_cloud_comm_cpu_test.cc DEPS executor scope proto_desc generated_static_op heter_listen_and_serv_op ${RPC_DEPS} ${DISTRIBUTE_DEPS} phi)
 
 set_source_files_properties(
   switch_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
@@ -138,4 +138,4 @@ cc_binary(
   heter_listen_and_serv_op
   ${RPC_DEPS}
   ${DISTRIBUTE_DEPS}
-  eigen_function)
+  phi)
diff --git a/test/cpp/imperative/CMakeLists.txt b/test/cpp/imperative/CMakeLists.txt
index acecb4fe010..c0a103603ce 100644
--- a/test/cpp/imperative/CMakeLists.txt
+++ b/test/cpp/imperative/CMakeLists.txt
@@ -33,14 +33,7 @@ endif()
 cc_test(
   test_gradient_accmulator
   SRCS test_gradient_accmulator.cc
-  DEPS memcpy
-       selected_rows_utils
-       selected_rows_functor
-       gradient_accumulator
-       math_function
-       phi_tensor
-       phi_api
-       phi_utils)
+  DEPS memcpy selected_rows_utils gradient_accumulator phi phi_utils)
 cc_test(
   test_layer
   SRCS test_layer.cc
diff --git a/test/cpp/imperative/test_hooks.cc b/test/cpp/imperative/test_hooks.cc
index 005ac3d3168..8f28ebaf294 100644
--- a/test/cpp/imperative/test_hooks.cc
+++ b/test/cpp/imperative/test_hooks.cc
@@ -24,6 +24,7 @@
 #include "paddle/fluid/imperative/hooks.h"
 #include "paddle/fluid/imperative/tracer.h"
 #include "paddle/fluid/memory/memcpy.h"
+#include "paddle/phi/core/flags.h"
 #include "paddle/phi/core/kernel_registry.h"
 
 PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
@@ -35,7 +36,7 @@ namespace platform = paddle::platform;
 namespace framework = paddle::framework;
 namespace memory = paddle::memory;
 
-DECLARE_bool(sort_sum_gradient);
+PHI_DECLARE_bool(sort_sum_gradient);
 
 namespace paddle {
 namespace imperative {
diff --git a/test/cpp/inference/infer_ut/CMakeLists.txt b/test/cpp/inference/infer_ut/CMakeLists.txt
index e27cf7ffe1d..a1e39ca142d 100644
--- a/test/cpp/inference/infer_ut/CMakeLists.txt
+++ b/test/cpp/inference/infer_ut/CMakeLists.txt
@@ -224,7 +224,7 @@ if(NOT WIN32)
       ${MATH_LIB}
       ${MKLDNN_LIB}
       glog
-      gflags
+      phi
       protobuf
       xxhash
       cryptopp
@@ -235,7 +235,7 @@ else()
       ${MATH_LIB}
       ${MKLDNN_LIB}
       glog
-      gflags_static
+      phi
       libprotobuf
       xxhash
       cryptopp-static
diff --git a/test/cpp/jit/CMakeLists.txt b/test/cpp/jit/CMakeLists.txt
index c5490c7aa4b..708f48bbf49 100644
--- a/test/cpp/jit/CMakeLists.txt
+++ b/test/cpp/jit/CMakeLists.txt
@@ -8,7 +8,6 @@ if(WITH_TESTING AND NOT WIN32)
     WORKING_DIRECTORY "${CC_TESTS_DIR}")
   set(JIT_DEPS
       phi
-      phi_api
       elementwise_add_op
       matmul_v2_op
       activation_op
diff --git a/test/cpp/new_executor/CMakeLists.txt b/test/cpp/new_executor/CMakeLists.txt
index 11e4e9a84e1..30af210725c 100644
--- a/test/cpp/new_executor/CMakeLists.txt
+++ b/test/cpp/new_executor/CMakeLists.txt
@@ -37,8 +37,7 @@ if(WITH_GPU
       fetch_v2_op)
 
   # All deps of the operators above, part of GLOB_OPERATOR_DEPS.
-  set(OP_DEPS generator softmax selected_rows_functor jit_kernel_helper
-              concat_and_split cross_entropy)
+  set(OP_DEPS phi concat_and_split cross_entropy)
   cc_test(standalone_executor_test SRCS standalone_executor_test.cc)
 
   # add_dependencies(standalone_executor_test download_program)
diff --git a/test/cpp/phi/api/CMakeLists.txt b/test/cpp/phi/api/CMakeLists.txt
index c2898a2fde2..fd06e6d460d 100644
--- a/test/cpp/phi/api/CMakeLists.txt
+++ b/test/cpp/phi/api/CMakeLists.txt
@@ -1,48 +1,48 @@
-set(COMMON_API_TEST_DEPS phi_tensor phi_api api_tensor_utils)
+set(COMMON_API_TEST_DEPS phi)
 
 if(WITH_GPU)
   nv_test(
     test_phi_tensor
     SRCS test_phi_tensor.cc
-    DEPS glog selected_rows ${COMMON_API_TEST_DEPS})
+    DEPS glog ${COMMON_API_TEST_DEPS})
   nv_test(
     test_allocator
     SRCS test_allocator.cu
-    DEPS place device_context context_pool)
+    DEPS place device_context phi)
   nv_test(
     test_cuda_stream
     SRCS test_cuda_stream.cu
-    DEPS context_pool)
+    DEPS phi)
   nv_test(
     test_from_blob
     SRCS test_from_blob.cc
-    DEPS phi_backends ${COMMON_API_TEST_DEPS})
+    DEPS ${COMMON_API_TEST_DEPS})
 elseif(WITH_ROCM)
   hip_test(
     test_phi_tensor
     SRCS test_phi_tensor.cc
-    DEPS glog selected_rows ${COMMON_API_TEST_DEPS})
+    DEPS glog ${COMMON_API_TEST_DEPS})
   hip_test(
     test_allocator
     SRCS test_allocator.cu
-    DEPS place device_context context_pool)
+    DEPS place device_context phi)
   hip_test(
     test_cuda_stream
     SRCS test_cuda_stream.cu
-    DEPS context_pool)
+    DEPS phi)
   hip_test(
     test_from_blob
     SRCS test_from_blob.cc
-    DEPS phi_backends ${COMMON_API_TEST_DEPS})
+    DEPS ${COMMON_API_TEST_DEPS})
 else()
   cc_test(
     test_phi_tensor
     SRCS test_phi_tensor.cc
-    DEPS glog selected_rows ${COMMON_API_TEST_DEPS})
+    DEPS glog ${COMMON_API_TEST_DEPS})
   cc_test(
     test_from_blob
     SRCS test_from_blob.cc
-    DEPS phi_backends ${COMMON_API_TEST_DEPS})
+    DEPS ${COMMON_API_TEST_DEPS})
 endif()
 
 cc_test(
diff --git a/test/cpp/phi/api/scale_api.h b/test/cpp/phi/api/scale_api.h
index c8ab3c7e985..571ab0defbc 100644
--- a/test/cpp/phi/api/scale_api.h
+++ b/test/cpp/phi/api/scale_api.h
@@ -21,12 +21,13 @@
 #include "paddle/phi/api/lib/utils/allocator.h"
 #include "paddle/phi/common/int_array.h"
 #include "paddle/phi/common/scalar.h"
+#include "paddle/phi/core/flags.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/meta_tensor.h"
 #include "paddle/phi/infermeta/unary.h"
 #include "paddle/phi/kernels/scale_kernel.h"
 
-DECLARE_int32(low_precision_op_list);
+PHI_DECLARE_int32(low_precision_op_list);
 namespace paddle {
 namespace experimental {
 
diff --git a/test/cpp/phi/common/CMakeLists.txt b/test/cpp/phi/common/CMakeLists.txt
index ed9eaf7fef0..b40e7e9f5a4 100644
--- a/test/cpp/phi/common/CMakeLists.txt
+++ b/test/cpp/phi/common/CMakeLists.txt
@@ -13,32 +13,32 @@ cc_test(
 cc_test(
   phi_test_place
   SRCS test_place.cc
-  DEPS phi_place)
+  DEPS phi)
 cc_test(
   phi_test_int_array
   SRCS test_int_array.cc
-  DEPS int_array api_int_array phi phi_api)
+  DEPS phi)
 cc_test(
   phi_test_scalar_cpu
   SRCS test_scalar.cc
-  DEPS scalar api_scalar)
+  DEPS phi)
 if(WITH_GPU)
   nv_test(
     phi_test_scalar
     SRCS test_scalar.cu
-    DEPS scalar api_scalar)
+    DEPS phi)
   nv_test(
     transform_test
     SRCS transform_test.cu
-    DEPS memory place phi_backends)
+    DEPS memory place phi)
 endif()
 if(WITH_ROCM)
   hip_test(
     phi_test_scalar
     SRCS test_scalar.cu
-    DEPS scalar api_scalar)
+    DEPS phi)
   hip_test(
     transform_test
     SRCS transform_test.cu
-    DEPS memory place phi_backends)
+    DEPS memory place phi)
 endif()
diff --git a/test/cpp/phi/core/CMakeLists.txt b/test/cpp/phi/core/CMakeLists.txt
index 0fa3cca9e2a..7b8bb1ff8b2 100644
--- a/test/cpp/phi/core/CMakeLists.txt
+++ b/test/cpp/phi/core/CMakeLists.txt
@@ -1,59 +1,51 @@
 cc_test(
   test_custom_kernel
   SRCS test_custom_kernel.cc
-  DEPS custom_kernel scalar)
+  DEPS phi)
 cc_test(
   test_dense_tensor
   SRCS test_dense_tensor.cc
-  DEPS dense_tensor)
+  DEPS phi)
 cc_test(test_intrusive_ptr SRCS test_intrusive_ptr.cc)
 cc_test(test_type_info SRCS test_type_info.cc)
 cc_test(
   test_kernel_factory
   SRCS test_kernel_factory.cc
-  DEPS kernel_factory phi)
+  DEPS phi)
 cc_test(
   test_sparse_coo_tensor
   SRCS test_sparse_coo_tensor.cc
-  DEPS dense_tensor sparse_coo_tensor)
+  DEPS phi)
 cc_test(
   test_sparse_csr_tensor
   SRCS test_sparse_csr_tensor.cc
-  DEPS dense_tensor sparse_csr_tensor)
+  DEPS phi)
 cc_test(
   test_op_utils
   SRCS test_op_utils.cc
   DEPS op_compat_infos)
-cc_test_old(
-  test_meta_fn_utils
-  SRCS
-  test_meta_fn_utils.cc
-  DEPS
-  dense_tensor
-  wrapped_infermeta
-  infermeta
-  infermeta_utils)
+cc_test_old(test_meta_fn_utils SRCS test_meta_fn_utils.cc DEPS phi)
 
 cc_test(
   test_ddim
   SRCS test_ddim.cc
-  DEPS ddim)
+  DEPS phi)
 if(WITH_GPU)
   nv_test(
     test_dim
     SRCS test_dim.cu
-    DEPS ddim)
+    DEPS phi)
 elseif(WITH_ROCM)
   hip_test(
     test_dim
     SRCS test_dim.cu
-    DEPS ddim)
+    DEPS phi)
 endif()
 
 cc_test(
   selected_rows_test
   SRCS test_selected_rows.cc
-  DEPS selected_rows)
+  DEPS phi)
 if(WITH_TESTING AND TEST selected_rows_test)
   set_tests_properties(selected_rows_test PROPERTIES TIMEOUT 120)
 endif()
@@ -63,27 +55,27 @@ endif()
 cc_test(
   test_string_tensor
   SRCS test_string_tensor.cc
-  DEPS string_tensor)
+  DEPS phi)
 cc_test(unroll_array_ops_test SRCS unroll_array_ops_test.cc)
 
 cc_test(
   test_tensor_array
   SRCS test_tensor_array.cc
-  DEPS tensor_array)
+  DEPS phi)
 
 if(WITH_GPU)
   nv_test(
     test_mixed_vector
     SRCS test_mixed_vector.cc test_mixed_vector.cu
-    DEPS mixed_vector place memory phi_backends tensor)
+    DEPS place memory phi tensor)
 elseif(WITH_ROCM)
   hip_test(
     test_mixed_vector
     SRCS test_mixed_vector.cc test_mixed_vector.cu
-    DEPS mixed_vector place memory phi_backends tensor)
+    DEPS place memory phi tensor)
 else()
   cc_test(
     test_mixed_vector
     SRCS test_mixed_vector.cc
-    DEPS mixed_vector place memory phi_backends tensor)
+    DEPS place memory phi tensor)
 endif()
diff --git a/test/cpp/phi/core/test_type_info.cc b/test/cpp/phi/core/test_type_info.cc
index 56980ebbe21..40e89f6203d 100644
--- a/test/cpp/phi/core/test_type_info.cc
+++ b/test/cpp/phi/core/test_type_info.cc
@@ -17,6 +17,11 @@ limitations under the License. */
 #include "paddle/phi/core/utils/type_registry.h"
 
 namespace phi {
+
+template <typename BaseT, typename DerivedT>
+const TypeInfo<BaseT> TypeInfoTraits<BaseT, DerivedT>::kType =
+    RegisterStaticType<BaseT>(DerivedT::name());
+
 namespace tests {
 
 template <typename T>
diff --git a/test/cpp/phi/kernels/CMakeLists.txt b/test/cpp/phi/kernels/CMakeLists.txt
index 3e7f394f186..a4906b3d1a8 100644
--- a/test/cpp/phi/kernels/CMakeLists.txt
+++ b/test/cpp/phi/kernels/CMakeLists.txt
@@ -1,12 +1,12 @@
 cc_test(
   test_math_function
   SRCS test_math_function.cc
-  DEPS math_function)
+  DEPS phi)
 if(WITH_GPU)
   nv_test(
     test_math_function_gpu
     SRCS test_math_function.cu
-    DEPS math_function)
+    DEPS phi)
   nv_test(
     test_broadcast_gpu
     SRCS test_ternary_broadcast.cu
@@ -16,13 +16,13 @@ if(WITH_ROCM)
   hip_test(
     test_math_function_gpu
     SRCS test_math_function.cu
-    DEPS math_function)
+    DEPS phi)
 endif()
 
 cc_test(
   test_cpu_vec
   SRCS test_cpu_vec.cc
-  DEPS blas phi_backends)
+  DEPS phi)
 
 # For String Kernels
 cc_test(
@@ -94,19 +94,19 @@ endif()
 cc_test(
   test_cache
   SRCS test_cache.cc
-  DEPS gtest cache)
+  DEPS gtest phi)
 
 cc_test(
   strided_memcpy_test
   SRCS strided_memcpy_test.cc
-  DEPS phi_backends memory)
+  DEPS phi memory)
 
 cc_test(
   sequence_padding_test
   SRCS sequence_padding_test.cc
-  DEPS sequence_padding)
+  DEPS phi)
 
 cc_test(
   sequence_pooling_test
   SRCS sequence_pooling_test.cc
-  DEPS sequence_pooling)
+  DEPS phi)
diff --git a/test/cpp/phi/ops/CMakeLists.txt b/test/cpp/phi/ops/CMakeLists.txt
index 634af80f05a..4e6cf31f75c 100644
--- a/test/cpp/phi/ops/CMakeLists.txt
+++ b/test/cpp/phi/ops/CMakeLists.txt
@@ -1,4 +1,4 @@
 cc_test(
   test_op_signature
   SRCS test_op_signature.cc
-  DEPS op_utils)
+  DEPS phi)
diff --git a/test/cpp/prim/CMakeLists.txt b/test/cpp/prim/CMakeLists.txt
index 92845d5bd81..947e446ca93 100644
--- a/test/cpp/prim/CMakeLists.txt
+++ b/test/cpp/prim/CMakeLists.txt
@@ -1,12 +1,10 @@
 set(prim_eager_deps
-    phi_api
-    phi_dygraph_api
+    phi
     hook_utils
     tensor_utils
     utils
     global_utils
     backward
-    phi_tensor
     tracer
     layer
     autograd_meta
@@ -33,20 +31,16 @@ cc_test_old(
   elementwise_pow_op
   fill_constant_op
   activation_op
-  phi_api
-  phi_dygraph_api
+  phi
   static_global_utils
   static_tensor_operants
-  tensor_api
-  operants_manager
   generated_static_op)
 
 if(NOT (NOT WITH_PYTHON AND ON_INFER))
   cc_library(
     init_env_utils
     SRCS init_env_utils.cc
-    DEPS operants_manager tensor_api eager_tensor_operants
-         static_tensor_operants)
+    DEPS phi eager_tensor_operants static_tensor_operants)
 
   cc_test_old(
     test_comp_eager
diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py
index ec9130ff6d9..71f63175c45 100755
--- a/tools/parallel_UT_rule.py
+++ b/tools/parallel_UT_rule.py
@@ -2221,7 +2221,6 @@ CPU_PARALLEL_JOB = [
     'test_egr_ds_grad_tensor_holder',
     'test_egr_ds_auotgrad_meta',
     'test_egr_ds_accumulation_node',
-    'test_resnet50_with_cinn',
     'test_parallel_dygraph_sync_batch_norm',
     'test_monitor',
     'test_mkldnn_quantizer',
-- 
GitLab