Unverified commit da50a009, authored by YuanRisheng, committed by GitHub

[PHI Decoupling] Create PHI shared lib (#53735)

* create phi so

* fix ci bugs

* fix py3 bugs

* add file

* fix py3 bugs

* fix windows bugs

* perfect so

* fix py3 bugs

* delete all static target in phi

* fix windows bugs

* fix py3 bugs

* fix ci bugs

* fix windows bugs

* fix bug: gflags can't be linked by both the dynamic and the static lib

* fix bug: third-party libraries could not be loaded

* fix ci bugs

* fix compile bugs

* fix py3 bugs

* fix conflict

* fix xpu bugs

* fix mac compile bugs

* fix psgpu bugs

* fix inference failure

* deal with conflict

* fix LIBRARY_PATH bug

* fix windows bugs

* fix onednn error

* fix windows compile bugs

* fix windows compile bugs

* fix test_cuda_graph_static_mode_error aborted

* fix windows bugs

* fix mac-python3 error

* fix hip compile bugs

* change mode to static

* change to static mode

* fix ci bugs

* fix py3 bugs

* fix windows bugs

* fix bugs

* add static flag

* add PADDLE_API

* change position of PADDLE_API

* fix windows bugs

* change mode to dynamic lib

* fix windows static bugs

* deal with conflict

* fix windows unit bug

* fix coverage

* deal with conflict

* fix windows-inference

* fix py3 bugs

* fix bugs when compiling type_info

* fix compile bugs

* fix py3 bugs

* fix windows bugs

* fix windows openblas

* fix xpu bugs

* fix enforce_test in windows

* update code according to review comments

* fix windows cmake bug

* fix windows bugs

* fix windows bugs

* delete cinn unittest

* fix cinn bugs

---------
Co-authored-by: lzydev <1528794076@qq.com>
Parent commit: 7aabdfd9
@@ -40,7 +40,6 @@ if(WITH_MKLML)
   add_definitions(-DLAPACK_FOUND)
   add_dependencies(cblas mklml)
-  target_link_libraries(cblas dynload_mklml)
   message(STATUS "Found cblas and lapack in MKLML "
                  "(include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
......
@@ -235,3 +235,16 @@ endif()
 if(WITH_CUDNN_FRONTEND)
   add_definitions(-DPADDLE_WITH_CUDNN_FRONTEND)
 endif()
+
+set(WITH_PHI_SHARED
+    ON
+    CACHE BOOL "" FORCE)
+
+if(WIN32 OR WITH_ROCM)
+  set(WITH_PHI_SHARED
+      OFF
+      CACHE BOOL "" FORCE)
+endif()
+
+if(WITH_PHI_SHARED)
+  add_definitions(-DPHI_SHARED)
+endif()
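A minimal sketch, assuming a phi target is already defined at this point in the configure step (which may not match Paddle's actual CMake ordering), of a more scoped alternative to the global add_definitions call above; target_compile_definitions with PUBLIC would attach the define to phi and propagate it only to targets that link phi:

# Sketch only; not the code this PR adds.
if(WITH_PHI_SHARED)
  target_compile_definitions(phi PUBLIC PHI_SHARED)
endif()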
@@ -122,6 +122,5 @@ get_filename_component(WARPCTC_LIBRARY_PATH ${WARPCTC_LIBRARIES} DIRECTORY)
 include_directories(${WARPCTC_INCLUDE_DIR}
 )# For warpctc code to include its headers.
-add_library(warpctc SHARED IMPORTED GLOBAL)
-set_property(TARGET warpctc PROPERTY IMPORTED_LOCATION ${WARPCTC_LIBRARIES})
+add_library(warpctc INTERFACE)
 add_dependencies(warpctc extern_warpctc)
@@ -364,20 +364,7 @@ function(cc_library TARGET_NAME)
       list(REMOVE_ITEM cc_library_DEPS warpctc)
       add_dependencies(${TARGET_NAME} warpctc)
     endif()
-    # Only deps libmklml.so, not link
-    if("${cc_library_DEPS};" MATCHES "mklml;")
-      list(REMOVE_ITEM cc_library_DEPS mklml)
-      if(NOT "${TARGET_NAME}" MATCHES "dynload_mklml")
-        list(APPEND cc_library_DEPS dynload_mklml)
-      endif()
-      add_dependencies(${TARGET_NAME} mklml)
-      if(WIN32)
-        target_link_libraries(${TARGET_NAME} ${MKLML_IOMP_LIB})
-      else()
-        target_link_libraries(${TARGET_NAME}
-                              "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed")
-      endif()
-    endif()
     # remove link to python, see notes at:
     # https://github.com/pybind/pybind11/blob/master/docs/compiling.rst#building-manually
     if("${cc_library_DEPS};" MATCHES "python;")
@@ -457,24 +444,9 @@ function(cc_test_build TARGET_NAME)
       endif()
     endif()
     get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
-    target_link_libraries(
-      ${TARGET_NAME}
-      ${cc_test_DEPS}
-      ${os_dependency_modules}
-      paddle_gtest_main
-      lod_tensor
-      memory
-      gtest
-      gflags
-      glog)
-    add_dependencies(
-      ${TARGET_NAME}
-      ${cc_test_DEPS}
-      paddle_gtest_main
-      lod_tensor
-      memory
-      gtest
-      gflags
-      glog)
+    target_link_libraries(${TARGET_NAME} ${cc_test_DEPS}
+                          ${os_dependency_modules} paddle_gtest_main gtest glog)
+    add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main gtest
+                     glog)
     common_link(${TARGET_NAME})
     if(WITH_ROCM)
@@ -670,7 +642,7 @@ function(nv_test TARGET_NAME)
     add_executable(${TARGET_NAME} ${nv_test_SRCS})
     get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
     target_link_libraries(${TARGET_NAME} ${nv_test_DEPS}
-                          ${os_dependency_modules} paddle_gtest_main)
+                          ${os_dependency_modules} paddle_gtest_main phi)
     add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main)
     common_link(${TARGET_NAME})
     add_test(${TARGET_NAME} ${TARGET_NAME})
@@ -774,8 +746,8 @@ function(hip_test TARGET_NAME)
       lod_tensor
       memory
       gtest
-      gflags
       glog
+      phi
       ${os_dependency_modules})
     add_dependencies(
       ${TARGET_NAME}
@@ -784,7 +756,7 @@ function(hip_test TARGET_NAME)
       lod_tensor
       memory
       gtest
-      gflags
+      phi
       glog)
     common_link(${TARGET_NAME})
     add_test(${TARGET_NAME} ${TARGET_NAME})
@@ -881,7 +853,7 @@ function(xpu_test TARGET_NAME)
       lod_tensor
       memory
       gtest
-      gflags
+      phi
       glog
       ${os_dependency_modules})
     add_dependencies(
@@ -891,7 +863,7 @@ function(xpu_test TARGET_NAME)
       lod_tensor
       memory
       gtest
-      gflags
+      phi
       glog)
     common_link(${TARGET_NAME})
     add_test(${TARGET_NAME} ${TARGET_NAME})
......
@@ -269,6 +269,13 @@ else()
     SRCS ${src_dir}/inference/api/paddle_*.h ${paddle_inference_lib}
     DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include
          ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib)
+
+  set(paddle_phi_lib ${PADDLE_BINARY_DIR}/paddle/phi/libphi.*)
+  copy(
+    inference_lib_dist
+    SRCS ${paddle_phi_lib}
+    DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib)
 endif()
 copy(
......
@@ -61,8 +61,7 @@ function(register_cu_kernel TARGET)
                         "${multiValueArgs}" ${ARGN})
   set(cu_srcs)
-  set(op_common_deps operator op_registry math_function layer
-      common_infer_shape_functions)
+  set(op_common_deps operator op_registry layer common_infer_shape_functions)
   foreach(cu_src ${register_cu_kernel_SRCS})
     if(${cu_src} MATCHES ".*\\.cu$")
       list(APPEND cu_srcs ${cu_src})
@@ -113,7 +112,7 @@ function(register_mkldnn_kernel TARGET)
                         "${multiValueArgs}" ${ARGN})
   set(mkldnn_cc_srcs)
-  set(op_common_deps operator op_registry math_function layer
+  set(op_common_deps operator op_registry phi layer
       common_infer_shape_functions)
   foreach(mkldnn_src ${register_mkldnn_kernel_SRCS})
     if(${mkldnn_src} MATCHES ".*_mkldnn_op.cc$")
@@ -164,7 +163,7 @@ function(op_library TARGET)
   set(MIOPEN_FILE)
   set(mkldnn_cc_srcs)
   set(MKLDNN_FILE)
-  set(op_common_deps operator op_registry math_function layer
+  set(op_common_deps operator op_registry phi layer
       common_infer_shape_functions)
   # Option `UNITY` is used to specify that operator `TARGET` will compiles with Unity Build.
......
@@ -94,6 +94,13 @@ function(kernel_declare TARGET_LIST)
         continue()
       endif()
     endif()
+    # fusion group kernel is not supported in windows and mac
+    if(WIN32 OR APPLE)
+      string(FIND "${first_registry}" "fusion_group" pos)
+      if(pos GREATER 1)
+        continue()
+      endif()
+    endif()
     # some gpu kernel only can run on cuda, not support rocm, so we add this branch
     if(WITH_ROCM)
       string(FIND "${first_registry}" "cuda_only" pos)
@@ -216,3 +223,27 @@ function(prune_declaration_h)
     endif()
   endforeach()
 endfunction()
+
+function(collect_srcs SRC_GROUP)
+  set(options)
+  set(oneValueArgs)
+  set(multiValueArgs "SRCS")
+  cmake_parse_arguments(prefix "" "" "${multiValueArgs}" ${ARGN})
+  foreach(src ${prefix_SRCS})
+    set(${SRC_GROUP}
+        "${${SRC_GROUP}};${CMAKE_CURRENT_SOURCE_DIR}/${src}"
+        CACHE INTERNAL "")
+  endforeach()
+endfunction()
+
+function(collect_generated_srcs SRC_GROUP)
+  set(options)
+  set(oneValueArgs)
+  set(multiValueArgs "SRCS")
+  cmake_parse_arguments(prefix "" "" "${multiValueArgs}" ${ARGN})
+  foreach(src ${prefix_SRCS})
+    set(${SRC_GROUP}
+        "${${SRC_GROUP}};${src}"
+        CACHE INTERNAL "")
+  endforeach()
+endfunction()
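A usage sketch for the two helpers added above, with a hypothetical group name and file names (the real call sites live in phi's subdirectory CMakeLists files): each call appends sources to a cached global list, so a single top-level add_library(phi ...) can compile everything collected across subdirectories. collect_srcs prefixes the current source dir, while collect_generated_srcs takes paths as given (useful for files generated into the binary dir).

# Hypothetical call sites, sketch only.
collect_srcs(phi_group_srcs SRCS dense_tensor.cc tensor_meta.cc)
collect_generated_srcs(phi_group_srcs SRCS
                       ${CMAKE_CURRENT_BINARY_DIR}/generated_kernels.cc)
# ${phi_group_srcs} now accumulates absolute paths across subdirectories.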
@@ -49,5 +49,5 @@ file(GLOB PD_DIALECT_SRCS "*.cc")
 cc_library(
   pd_dialect
   SRCS ${PD_DIALECT_SRCS} ${op_source_file}
-  DEPS new_ir framework_proto dense_tensor phi_utils)
+  DEPS new_ir framework_proto phi phi_utils)
 target_include_directories(pd_dialect PRIVATE ${PD_DIALECT_BINARY_DIR})

 cc_library(
   op_dist_attr
   SRCS dist_attr.cc
-  DEPS dist_attr process_mesh dist_mapper auto_parallel_proto proto_desc
-       phi_enforce)
+  DEPS phi auto_parallel_proto proto_desc)
 add_subdirectory(test)

 cc_test(
   device_mesh_test
   SRCS device_mesh_test.cc
-  DEPS device_mesh)
+  DEPS phi)
 cc_test(
   process_mesh_test
   SRCS process_mesh_test.cc
-  DEPS process_mesh)
+  DEPS phi)
 cc_test(
   dist_attr_test
   SRCS dist_attr_test.cc
-  DEPS dist_attr proto_desc)
+  DEPS phi proto_desc)
 cc_test(
   dist_mapper_test
   SRCS dist_mapper_test.cc
-  DEPS dist_mapper)
+  DEPS phi)

 cc_library(
   process_group
   SRCS process_group.cc
-  DEPS dense_tensor xxhash)
+  DEPS phi xxhash)
 cc_library(
   eager_reducer
   SRCS reducer.cc
-  DEPS eager_api process_group phi_api string_helper)
+  DEPS eager_api process_group phi string_helper)
 if(WITH_DISTRIBUTE)
   cc_library(
     process_group_gloo
     SRCS process_group_gloo.cc gloo_send_recv.cc
-    DEPS phi_api eager_api gloo_wrapper tcp_store)
+    DEPS phi eager_api gloo_wrapper)
 endif()
 if(WITH_NCCL OR WITH_RCCL)
@@ -20,28 +20,19 @@ if(WITH_NCCL OR WITH_RCCL)
     process_group_nccl
     SRCS process_group_nccl.cc nccl_tools.cc common.cc
     DEPS process_group
-         tcp_store
+         phi
          place
          enforce
          collective_helper
          device_context
-         ${DEVICE_EVENT_LIBS}
-         dense_tensor
-         comm_static_check
-         nccl_dynamic_check)
+         ${DEVICE_EVENT_LIBS})
 endif()
 if(WITH_XPU_BKCL)
   cc_library(
     process_group_bkcl
     SRCS process_group_bkcl.cc bkcl_tools.cc common.cc
-    DEPS process_group
-         tcp_store
-         place
-         enforce
-         collective_helper
-         device_context
-         dense_tensor)
+    DEPS process_group phi place enforce collective_helper device_context)
 endif()
 if(WITH_MPI)
@@ -55,15 +46,7 @@ if(WITH_CUSTOM_DEVICE)
   cc_library(
     process_group_custom
     SRCS process_group_custom.cc custom_ccl_tools.cc common.cc
-    DEPS process_group
-         tcp_store
-         phi_backends
-         place
-         enforce
-         collective_helper
-         device_context
-         comm_static_check
-         dense_tensor)
+    DEPS process_group phi place enforce collective_helper device_context)
 endif()
 set(COMM_UTILS_DEPS process_group)
......
@@ -5,7 +5,7 @@ endif()
 proto_library(interceptor_message_proto SRCS interceptor_message.proto)
 if(WITH_ARM_BRPC)
-  set(BRPC_DEPS arm_brpc snappy gflags glog)
+  set(BRPC_DEPS arm_brpc snappy phi glog)
 elseif(WITH_DISTRIBUTE AND NOT WITH_PSLIB)
   set(BRPC_DEPS
       brpc
@@ -15,7 +15,7 @@ elseif(WITH_DISTRIBUTE AND NOT WITH_PSLIB)
       zlib
       leveldb
       snappy
-      gflags
+      phi
       glog)
 else()
   set(BRPC_DEPS "")
@@ -51,7 +51,7 @@ cc_library(
   collective_helper
   op_registry
   executor_gc_helper
-  gflags
+  phi
   glog
   ${BRPC_DEPS})
......
@@ -8,12 +8,11 @@ if(WITH_HETERPS)
       ssl
       crypto
       protobuf
-      gflags
+      phi
       glog
       zlib
       leveldb
       snappy
-      gflags
       glog
       device_context
       rocksdb)
@@ -25,12 +24,11 @@ else()
       ssl
       crypto
       protobuf
-      gflags
+      phi
       glog
       zlib
       leveldb
       snappy
-      gflags
       glog
       device_context)
@@ -122,8 +120,7 @@ cc_library(
     simple_threadpool
     simple_rpc
     scope
-    math_function
-    selected_rows_functor
+    phi
     ps_gpu_wrapper
     ${RPC_DEPS})
@@ -150,7 +147,7 @@ cc_library(
 #cc_library(
 #  communicator
 #  SRCS communicator/communicator.cc
-#  DEPS scope client table math_function selected_rows_functor ${RPC_DEPS})
+#  DEPS scope client table phi ${RPC_DEPS})
 #cc_library(
 #  ps_service
 #  SRCS ps_service/service.cc
......
@@ -48,7 +48,7 @@ cc_library(
     string_helper
     simple_threadpool
     xxhash
-    generator)
+    phi)
 set_source_files_properties(
   tensor_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
@@ -91,7 +91,7 @@ cc_library(
     ps_framework_proto
     string_helper
     device_context
-    gflags
+    phi
     glog
     fs
     afs_wrapper
......
@@ -20,7 +20,7 @@ set(PADDLE_RPC_DEPS
     zlib
     leveldb
     snappy
-    gflags
+    phi
     glog
     pybind)
 proto_library(paddle_rpc_proto SRCS rpc.proto)
......
@@ -73,7 +73,7 @@ cc_test_old(
   DEPS
   brpc_utils
   scope
-  math_function
+  phi
   ${COMMON_DEPS}
   ${RPC_DEPS})
......
 set(eager_deps
-    phi_api
-    phi_dygraph_api
+    phi
     hook_utils
     tensor_utils
     utils
     global_utils
     backward
-    phi_tensor
     tracer
     layer
     autograd_meta
@@ -48,27 +46,26 @@ if(NOT ((NOT WITH_PYTHON) AND ON_INFER))
   cc_library(
     backward
     SRCS backward.cc
-    DEPS grad_tensor_holder utils autograd_meta grad_node_info switch_autotune)
+    DEPS grad_tensor_holder utils autograd_meta grad_node_info phi)
 endif()
 cc_library(
   eager_nan_inf_utils
   SRCS nan_inf_utils.cc
-  DEPS phi_tensor nan_inf_utils enforce)
+  DEPS phi nan_inf_utils enforce)
 cc_library(
   grad_node_info
   SRCS grad_node_info.cc
-  DEPS phi_api phi_tensor)
+  DEPS phi)
 cc_library(
   autograd_meta
   SRCS autograd_meta.cc
-  DEPS phi_api phi_tensor)
+  DEPS phi)
 cc_library(
   utils
   SRCS utils.cc
-  DEPS phi_api
-       phi_tensor
+  DEPS phi
        global_utils
        layer
        proto_desc
......
@@ -2,5 +2,5 @@ if(NOT (NOT WITH_PYTHON AND ON_INFER))
   cc_library(
     accumulation_node
     SRCS accumulation_node.cc
-    DEPS gradient_accumulator phi_api grad_node_info)
+    DEPS gradient_accumulator phi grad_node_info)
 endif()

 cc_library(
   scale_node
   SRCS scale_node.cc
-  DEPS global_utils phi phi_api grad_node_info)
+  DEPS global_utils phi grad_node_info)
 if(NOT (NOT WITH_PYTHON AND ON_INFER))
   cc_library(
......

 cc_library(
   eager_scale
   SRCS scale.cc
-  DEPS phi_api phi autograd_meta scale_node)
+  DEPS phi autograd_meta scale_node)
 if(NOT (NOT WITH_PYTHON AND ON_INFER))
   cc_library(
......
@@ -7,7 +7,7 @@ if(NOT (NOT WITH_PYTHON AND ON_INFER))
   cc_library(
     tensor_utils
     SRCS tensor_utils.cc
-    DEPS phi_api autograd_meta grad_node_info accumulation_node)
+    DEPS phi autograd_meta grad_node_info accumulation_node)
   cc_library(
     hook_utils
     SRCS hook_utils.cc
@@ -16,7 +16,7 @@ else()
   cc_library(
     tensor_utils
     SRCS tensor_utils.cc
-    DEPS phi_api autograd_meta grad_node_info)
+    DEPS phi autograd_meta grad_node_info)
   cc_library(
     hook_utils
     SRCS hook_utils.cc
......
@@ -52,6 +52,15 @@ if(WIN32)
     set(eager_generator_path "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}")
   endif()
+
+  if(WITH_PHI_SHARED)
+    message("Copied phi.dll for Eager AutoCodeGen")
+    add_custom_command(
+      OUTPUT ${eager_generator_path}/phi.dll
+      COMMAND ${CMAKE_COMMAND} -E copy ${PHI_LIB} ${eager_generator_path}
+      DEPENDS phi)
+    list(APPEND EAGER_CODEGEN_DEPS ${eager_generator_path}/phi.dll)
+  endif()
+
   if(${CBLAS_PROVIDER} STREQUAL MKLML)
     message("Copied libiomp5md.dll for Eager AutoCodeGen")
     add_custom_command(
......
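A side note on the copy rule above, as a sketch only: it resolves the DLL through the ${PHI_LIB} variable. If phi is an ordinary CMake target visible at this point, the same copy can be expressed with a generator expression, which avoids hard-coding the library path; whether that fits Paddle's variable plumbing here is not verified.

# Sketch of an alternative form, not the code this PR adds.
add_custom_command(
  OUTPUT ${eager_generator_path}/phi.dll
  COMMAND ${CMAKE_COMMAND} -E copy $<TARGET_FILE:phi> ${eager_generator_path}
  DEPENDS phi)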
@@ -392,7 +392,7 @@ FORWARD_CC_FILE_TEMPLATE = """
 #include "paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h"
 #include "paddle/phi/core/flags.h"
-DECLARE_bool(check_nan_inf);
+PHI_DECLARE_bool(check_nan_inf);
 PHI_DECLARE_string(tensor_operants_mode);
 {}
 {}
......

 cc_library(
   custom_operator_node
   SRCS custom_operator_node.cc
-  DEPS phi_tensor phi_api grad_node_info custom_operator op_meta_info)
+  DEPS phi grad_node_info custom_operator)
 cc_library(
   py_layer_node
   SRCS py_layer_node.cc
-  DEPS pybind phi_api grad_node_info)
+  DEPS pybind phi grad_node_info)
@@ -115,7 +115,7 @@ proto_library(trainer_desc_proto SRCS trainer_desc.proto DEPS framework_proto
 cc_library(
   string_array
   SRCS string_array.cc
-  DEPS utf8proc phi_enforce)
+  DEPS utf8proc phi)
 cc_library(
   data_type
@@ -130,7 +130,7 @@ cc_test(
 cc_library(
   tensor
   SRCS tensor_util.cc
-  DEPS place memory data_type device_context dense_tensor)
+  DEPS place memory data_type device_context phi)
 cc_test(
   tensor_test
@@ -166,12 +166,12 @@ cc_test(
 cc_library(
   lod_tensor
   SRCS lod_tensor.cc
-  DEPS ddim mixed_vector place tensor framework_proto version)
+  DEPS phi place tensor framework_proto version)
 cc_test(
   lod_tensor_test
   SRCS lod_tensor_test.cc
-  DEPS lod_utils lod_tensor memory)
+  DEPS phi lod_tensor memory)
 if(WITH_GPU)
   nv_test(
@@ -188,12 +188,12 @@ endif()
 cc_library(
   garbage_collector
   SRCS garbage_collector.cc
-  DEPS device_context memory gflags glog)
+  DEPS device_context memory phi glog)
 cc_library(
   reader
   SRCS reader.cc
-  DEPS lod_tensor ddim)
+  DEPS lod_tensor phi)
 cc_test(
   reader_test
   SRCS reader_test.cc
@@ -202,13 +202,12 @@ cc_test(
 cc_test(
   threadpool_test
   SRCS threadpool_test.cc
-  DEPS threadpool)
+  DEPS phi)
 cc_library(
   var_type_traits
   SRCS var_type_traits.cc
-  DEPS framework_proto scope tensor_array sparse_coo_tensor sparse_csr_tensor
-       extended_tensor)
+  DEPS framework_proto scope phi)
 if(WITH_GPU)
   target_link_libraries(var_type_traits dynload_cuda)
 endif()
@@ -242,7 +241,7 @@ endif()
 cc_library(
   scope
   SRCS scope.cc
-  DEPS glog threadpool xxhash var_type_traits)
+  DEPS glog phi xxhash var_type_traits)
 cc_library(
   device_worker
   SRCS device_worker.cc
@@ -273,12 +272,12 @@ if(WITH_GPU)
   nv_test(
     data_device_transform_test
     SRCS data_device_transform_test.cu
-    DEPS operator op_registry device_context math_function scope)
+    DEPS operator op_registry device_context phi scope)
 elseif(WITH_ROCM)
   hip_test(
     data_device_transform_test
     SRCS data_device_transform_test.cu
-    DEPS operator op_registry device_context math_function scope)
+    DEPS operator op_registry device_context phi scope)
 endif()
 if(WITH_GPU)
@@ -333,7 +332,7 @@ endif()
 cc_library(
   data_layout_transform
   SRCS data_layout_transform.cc
-  DEPS tensor math_function phi_data_layout_transform)
+  DEPS tensor phi)
 cc_test(
   data_layout_transform_test
   SRCS data_layout_transform_test.cc
@@ -342,14 +341,13 @@ cc_test(
 cc_library(
   data_transform
   SRCS data_transform.cc
-  DEPS math_function
+  DEPS phi
        tensor
        framework_proto
        selected_rows_utils
        data_device_transform
        data_type_transform
-       data_layout_transform
-       phi_data_transform)
+       data_layout_transform)
 cc_library(
   attribute
@@ -400,7 +398,7 @@ cc_library(
 cc_library(
   shape_inference
   SRCS shape_inference.cc
-  DEPS ddim attribute selected_rows_utils)
+  DEPS phi attribute selected_rows_utils)
 # every source file that includes "dnnl.h" must depends on mkldnn
 # or, the first one should depends on mkldnn
@@ -433,30 +431,17 @@ if(WITH_XPU)
     phi_utils
     SRCS phi_utils.cc
     DEPS lod_tensor
-         dense_tensor
          selected_rows_utils
-         int_array
-         scalar
          place
         phi
         var_type_traits
         op_info
-         xpu_op_list
-         convert_utils)
+         xpu_op_list)
 else()
   cc_library(
     phi_utils
     SRCS phi_utils.cc
-    DEPS lod_tensor
-         dense_tensor
-         selected_rows_utils
-         int_array
-         scalar
-         place
-         phi
-         var_type_traits
-         op_info
-         convert_utils)
+    DEPS lod_tensor selected_rows_utils place phi var_type_traits op_info)
 endif()
 if(WITH_XPU)
@@ -482,11 +467,10 @@ if(WITH_XPU)
     unused_var_check
     nan_inf_utils
     phi_utils
-    kernel_factory
     infershape_utils
-    op_utils
+    phi
     op_compat_infos
-    get_kerneltype_forvar_utils)
+    type_info)
 else()
   cc_library(
     operator
@@ -509,11 +493,10 @@ else()
     unused_var_check
     nan_inf_utils
     phi_utils
-    kernel_factory
     infershape_utils
-    op_utils
+    phi
     op_compat_infos
-    get_kerneltype_forvar_utils)
+    type_info)
 endif()
 cc_test(
@@ -543,7 +526,7 @@ cc_library(
   version
   xxhash
   op_dist_attr
-  scalar
+  phi
   op_version_proto
   op_version_registry)
@@ -853,7 +836,7 @@ if(WITH_DISTRIBUTE)
     heter_server
     brpc
     fleet_executor
-    flags)
+    phi)
   set(DISTRIBUTE_COMPILE_FLAGS "")
   if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
     set(DISTRIBUTE_COMPILE_FLAGS "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new")
@@ -1071,7 +1054,7 @@ if(WITH_PSCORE)
       executor
       heter_server
       gloo_wrapper
-      eigen_function
+      phi
       ${RPC_DEPS}
       graph_gpu_wrapper)
   else()
@@ -1088,7 +1071,7 @@ if(WITH_PSCORE)
       executor
       heter_server
       gloo_wrapper
-      eigen_function
+      phi
       ${RPC_DEPS})
   endif()
 else()
@@ -1112,7 +1095,7 @@ cc_test(
 cc_library(
   selected_rows_utils
   SRCS selected_rows_utils.cc
-  DEPS selected_rows device_context)
+  DEPS phi device_context)
 cc_test(
   selected_rows_utils_test
   SRCS selected_rows_utils_test.cc
@@ -1162,12 +1145,11 @@ cc_library(
        phi
        phi_utils
        op_info
-       shape_inference
-       sparse_coo_tensor)
+       shape_inference)
 cc_test(
   infershape_utils_test
   SRCS infershape_utils_test.cc
-  DEPS infershape_utils infermeta_utils meta_tensor)
+  DEPS infershape_utils phi)
 # Get the current working branch
 execute_process(
@@ -1198,12 +1180,15 @@ cc_library(
     operator
     dynamic_loader
     string_helper
-    phi_tensor
-    op_meta_info
-    phi_api
-    tensor_api
-    phi_tensor_operants
-    operants_manager)
+    phi
+    imperative_flag
+    layer)
+
+cc_library(type_info SRCS type_info.cc)
+add_dependencies(type_info framework_proto auto_parallel_proto xxhash)
+if(WITH_MKLDNN)
+  add_dependencies(type_info mkldnn)
+endif()
 set(FLUID_FRAMEWORK_MODULES
     proto_desc
......
@@ -10,15 +10,15 @@ cc_library(
 cc_library(
   scale_loss_grad_op_handle
   SRCS scale_loss_grad_op_handle.cc
-  DEPS op_handle_base scope lod_tensor ddim memory)
+  DEPS op_handle_base scope lod_tensor phi memory)
 cc_library(
   fetch_op_handle
   SRCS fetch_op_handle.cc
-  DEPS op_handle_base scope lod_tensor ddim memory)
+  DEPS op_handle_base scope lod_tensor phi memory)
 cc_library(
   fetch_async_op_handle
   SRCS fetch_async_op_handle.cc
-  DEPS op_handle_base scope lod_tensor ddim memory)
+  DEPS op_handle_base scope lod_tensor phi memory)
 cc_library(
   share_tensor_buffer_functor
@@ -78,7 +78,7 @@ if(WITH_GPU)
     DEPS op_handle_base
          scope
          lod_tensor
-         ddim
+         phi
          memory
          dynload_cuda
          variable_visitor)
@@ -88,7 +88,7 @@ if(WITH_GPU)
     DEPS op_handle_base
          scope
          lod_tensor
-         ddim
+         phi
          memory
          dynload_cuda
          variable_visitor
@@ -99,7 +99,7 @@ if(WITH_GPU)
     DEPS op_handle_base
          scope
          lod_tensor
-         ddim
+         phi
          memory
          dynload_cuda
          variable_visitor
@@ -114,7 +114,7 @@ if(WITH_GPU)
     DEPS op_handle_base
          scope
          lod_tensor
-         ddim
+         phi
          memory
          dynload_cuda
          variable_visitor
@@ -126,19 +126,17 @@ if(WITH_GPU)
     nv_library(
       reduce_op_handle
       SRCS reduce_op_handle.cc
-      DEPS op_handle_base variable_visitor scope ddim dynload_cuda
-           selected_rows_functor)
+      DEPS op_handle_base variable_visitor scope phi dynload_cuda)
   else()
     nv_library(
       reduce_op_handle
       SRCS reduce_op_handle.cc
-      DEPS op_handle_base variable_visitor scope ddim dynload_cuda
-           selected_rows_functor)
+      DEPS op_handle_base variable_visitor scope phi dynload_cuda)
   endif()
   nv_library(
     broadcast_op_handle
     SRCS broadcast_op_handle.cc
-    DEPS op_handle_base scope ddim memory variable_visitor dynload_cuda)
+    DEPS op_handle_base scope phi memory variable_visitor dynload_cuda)
   nv_library(
     fused_broadcast_op_handle
     SRCS fused_broadcast_op_handle.cc
@@ -154,7 +152,7 @@ elseif(WITH_ROCM)
     DEPS op_handle_base
          scope
          lod_tensor
-         ddim
+         phi
          memory
          dynload_cuda
          variable_visitor)
@@ -164,7 +162,7 @@ elseif(WITH_ROCM)
     DEPS op_handle_base
          scope
          lod_tensor
-         ddim
+         phi
          memory
          dynload_cuda
          variable_visitor
@@ -175,7 +173,7 @@ elseif(WITH_ROCM)
     DEPS op_handle_base
          scope
          lod_tensor
-         ddim
+         phi
          memory
          dynload_cuda
          variable_visitor
@@ -187,19 +185,17 @@ elseif(WITH_ROCM)
   hip_library(
     reduce_op_handle
     SRCS reduce_op_handle.cc
-    DEPS op_handle_base variable_visitor scope ddim dynload_cuda
-         selected_rows_functor)
+    DEPS op_handle_base variable_visitor scope phi dynload_cuda)
   else()
     hip_library(
       reduce_op_handle
       SRCS reduce_op_handle.cc
-      DEPS op_handle_base variable_visitor scope ddim dynload_cuda
-           selected_rows_functor)
+      DEPS op_handle_base variable_visitor scope phi dynload_cuda)
   endif()
   hip_library(
     broadcast_op_handle
     SRCS broadcast_op_handle.cc
-    DEPS op_handle_base scope ddim memory variable_visitor dynload_cuda)
+    DEPS op_handle_base scope phi memory variable_visitor dynload_cuda)
   hip_library(
     fused_broadcast_op_handle
     SRCS fused_broadcast_op_handle.cc
@@ -212,14 +208,14 @@ else()
   cc_library(
     all_reduce_op_handle
     SRCS all_reduce_op_handle.cc
-    DEPS op_handle_base scope lod_tensor ddim memory variable_visitor)
+    DEPS op_handle_base scope lod_tensor phi memory variable_visitor)
   cc_library(
     fused_all_reduce_op_handle
     SRCS fused_all_reduce_op_handle.cc
     DEPS op_handle_base
          scope
          lod_tensor
-         ddim
+         phi
          memory
          variable_visitor
          place)
@@ -229,7 +225,7 @@ else()
     DEPS op_handle_base
          scope
          lod_tensor
-         ddim
+         phi
          memory
          variable_visitor
         place
@@ -239,17 +235,17 @@ else()
   cc_library(
     reduce_op_handle
     SRCS reduce_op_handle.cc
-    DEPS op_handle_base variable_visitor scope ddim selected_rows_functor)
+    DEPS op_handle_base variable_visitor scope phi)
   else()
     cc_library(
       reduce_op_handle
       SRCS reduce_op_handle.cc
-      DEPS op_handle_base variable_visitor scope ddim selected_rows_functor)
+      DEPS op_handle_base variable_visitor scope phi)
   endif()
   cc_library(
     broadcast_op_handle
     SRCS broadcast_op_handle.cc
-    DEPS op_handle_base scope ddim memory variable_visitor)
+    DEPS op_handle_base scope phi memory variable_visitor)
   cc_library(
     fused_broadcast_op_handle
     SRCS fused_broadcast_op_handle.cc
@@ -259,7 +255,7 @@ endif()
 cc_library(
   gather_op_handle
   SRCS gather_op_handle.cc
-  DEPS op_handle_base scope ddim memory variable_visitor)
+  DEPS op_handle_base scope phi memory variable_visitor)
 cc_library(
   eager_deletion_op_handle
@@ -305,7 +301,7 @@ cc_test(
   DEPS var_handle
        op_handle_base
        scope
-       ddim
+       phi
        memory
        device_context
        broadcast_op_handle)
@@ -317,7 +313,7 @@ cc_test_old(
   var_handle
   op_handle_base
   scope
-  ddim
+  phi
   memory
   device_context
   gather_op_handle)
@@ -330,12 +326,12 @@ cc_library(
   scope_buffered_ssa_graph_executor
   SRCS scope_buffered_ssa_graph_executor.cc
   DEPS ssa_graph_executor scope_buffered_monitor)
-#cc_test(reduce_op_handle_test SRCS reduce_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
+#cc_test(reduce_op_handle_test SRCS reduce_op_handle_test.cc DEPS var_handle op_handle_base scope phi memory
 #               device_context reduce_op_handle )
 cc_library(
   bind_threaded_ssa_graph_executor
   SRCS bind_threaded_ssa_graph_executor.cc
-  DEPS fetch_op_handle gflags ssa_graph_executor scope simple_threadpool
+  DEPS fetch_op_handle phi ssa_graph_executor scope simple_threadpool
        device_context)
 cc_library(
   fast_threaded_ssa_graph_executor
......
@@ -20,9 +20,10 @@ limitations under the License. */
 #include "paddle/fluid/framework/details/reduce_op_handle.h"
 #include "paddle/fluid/framework/ir/graph_printer.h"
 #include "paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h"
+#include "paddle/phi/core/flags.h"
 DECLARE_bool(convert_all_blocks);
-DECLARE_bool(use_mkldnn);
+PHI_DECLARE_bool(use_mkldnn);
 #ifdef PADDLE_WITH_CINN
 DECLARE_bool(use_cinn);
 #endif
......
@@ -32,7 +32,7 @@ cc_library(
 cc_library(
   cost_model
   SRCS cost_model.cc
-  DEPS executor graph profiler proto_desc phi_device_tracer)
+  DEPS executor graph profiler proto_desc phi)
 set(GRAPH_PATTERN_DETECTOR_DEPS graph graph_helper graph_traits)
 if(WITH_TESTING)
@@ -458,9 +458,6 @@ if(WITH_MKLDNN)
       graph_to_program_pass
       conv_op
       conv_transpose_op
-      math_function
-      im2col
-      vol2col
       batch_norm_op
       generated_op
       activation_op
@@ -468,7 +465,7 @@ if(WITH_MKLDNN)
       concat_and_split
       naive_executor
       device_context
-      eigen_function)
+      phi)
   if(WITH_GPU OR WITH_ROCM)
     set(TEST_CONV_BN_PASS_DEPS ${TEST_CONV_BN_PASS_DEPS} depthwise_conv)
   endif()
......
@@ -221,7 +221,7 @@ bool InitAndCheckAttrs(const size_t &found_adamw_count,
     }
   }
   // Check whether with_decay and multi_precision are matched
   if (config->with_decay !=
           PADDLE_GET_CONST(bool, adamw_op_desc->GetAttr("with_decay")) ||
       config->multi_precision !=
......
@@ -6,13 +6,13 @@ if(WITH_GPU OR WITH_ROCM)
   cc_test(
     test_code_generator
     SRCS code_generator_tester.cc
-    DEPS code_generator phi_backends lod_tensor graph_viz_pass)
+    DEPS code_generator phi lod_tensor graph_viz_pass)
 endif()
 cc_library(
   fusion_group_pass
   SRCS fusion_group_pass.cc elementwise_group_detector.cc
-  DEPS subgraph_detector fuse_pass_base code_generator phi_backends)
+  DEPS subgraph_detector fuse_pass_base code_generator phi)
 cc_test(
   test_fusion_group_pass
   SRCS fusion_group_pass_tester.cc
......
@@ -76,5 +76,4 @@ cc_library(
 cc_test(
   test_reference_count_pass_last_lived_ops
   SRCS test_reference_count_pass_last_lived_ops.cc
-  DEPS parallel_executor elementwise_mul_op elementwise_add_op generated_op
-       eigen_function)
+  DEPS parallel_executor elementwise_mul_op elementwise_add_op generated_op phi)
@@ -16,4 +16,4 @@ cc_library(
 cc_library(
   staticgraph_executor_statistics
   SRCS executor_statistics.cc
-  DEPS enforce glog phi_os_info)
+  DEPS enforce glog phi)
@@ -6,7 +6,6 @@ set(INTERPRETER_DEPS
     device_context
     global_utils
     op_registry
-    phi_tensor_utils
     scope
     framework_proto
     data_feed_proto
@@ -31,7 +30,7 @@ set(INTERPRETER_DEPS
     enforce
     scope
     glog
-    comm_context_manager
+    phi
     ${DEVICE_EVENT_LIBS}
     glog)
......
@@ -5,7 +5,7 @@ cc_library(
 cc_library(
   workqueue
   SRCS workqueue.cc
-  DEPS workqueue_utils enforce glog phi_os_info)
+  DEPS workqueue_utils enforce glog phi)
 cc_test(
   workqueue_test
   SRCS workqueue_test.cc
......
@@ -5,7 +5,7 @@ pass_library(
   cinn_subgraph_detector
   subgraph_detector
   cinn_compiler
-  errors
+  phi
   enforce)
 pass_library(cinn_zero_tensor_trick_pass base)
@@ -17,7 +17,7 @@ cc_library(
 cc_library(
   transform_type
   SRCS transform_type.cc
-  DEPS errors enforce cinn)
+  DEPS phi enforce cinn)
 cc_library(
   cinn_cache_key
   SRCS cinn_cache_key.cc
......
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <unordered_map>
+#include "paddle/phi/core/enforce.h"
 #include "paddle/phi/core/extended_tensor.h"
 #include "paddle/utils/any.h"
@@ -52,7 +53,7 @@ class RawTensor : public phi::ExtendedTensor,
   T& Get() const {
     PADDLE_ENFORCE_EQ(data_.empty(),
                       false,
-                      platform::errors::PreconditionNotMet(
+                      phi::errors::PreconditionNotMet(
                           "The data in RawTensor is empty. Please set data "
                           "before using it."));
......
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/eager/eager_tensor.h"
#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/raw_tensor.h"
#include "paddle/fluid/framework/string_array.h"
#include "paddle/fluid/prim/utils/static/desc_tensor.h"

namespace phi {

template <>
const TypeInfo<phi::TensorBase>
    TypeInfoTraits<phi::TensorBase, paddle::framework::RawTensor>::kType =
        RegisterStaticType<phi::TensorBase>(
            paddle::framework::RawTensor::name());

template <>
const TypeInfo<phi::TensorBase>
    TypeInfoTraits<phi::TensorBase, paddle::framework::Vocab>::kType =
        RegisterStaticType<phi::TensorBase>(paddle::framework::Vocab::name());

template <>
const TypeInfo<phi::TensorBase>
    TypeInfoTraits<phi::TensorBase, paddle::framework::Strings>::kType =
        RegisterStaticType<phi::TensorBase>(paddle::framework::Strings::name());

template <>
const TypeInfo<phi::TensorBase>
    TypeInfoTraits<phi::TensorBase, paddle::framework::FeedList>::kType =
        RegisterStaticType<phi::TensorBase>(
            paddle::framework::FeedList::name());

template <>
const TypeInfo<phi::TensorBase>
    TypeInfoTraits<phi::TensorBase, egr::VariableCompatTensor>::kType =
        RegisterStaticType<phi::TensorBase>(egr::VariableCompatTensor::name());

template <>
const TypeInfo<phi::TensorBase>
    TypeInfoTraits<phi::TensorBase, paddle::prim::DescTensor>::kType =
        RegisterStaticType<phi::TensorBase>(paddle::prim::DescTensor::name());

}  // namespace phi
 cc_library(
   imperative_flag
   SRCS flags.cc
-  DEPS gflags flags)
+  DEPS phi)
 cc_library(
   var_helper
   SRCS var_helper.cc
-  DEPS tensor selected_rows extended_tensor)
+  DEPS tensor phi)
 if(WITH_XPU)
   cc_library(
     prepared_operator
@@ -20,8 +20,7 @@ if(WITH_XPU)
       op_kernel_type
       data_transform
       nan_inf_utils
-      scalar
-      int_array
+      phi
       var_helper
       profiler
       place)
@@ -38,8 +37,7 @@ else()
       op_kernel_type
       data_transform
       nan_inf_utils
-      scalar
-      int_array
+      phi
      var_helper
      profiler
      place)
@@ -47,14 +45,14 @@ endif()
 cc_library(
   layer
   SRCS layer.cc
-  DEPS prepared_operator math_function imperative_flag variable_helper
-       op_registry var_helper)
+  DEPS prepared_operator phi imperative_flag variable_helper op_registry
+       var_helper)
 add_subdirectory(jit)
 if(WITH_GPU)
   cc_library(
     layout_autotune
     SRCS layout_autotune.cc
-    DEPS op_info phi_backends)
+    DEPS op_info phi)
 else()
   cc_library(
     layout_autotune
@@ -80,15 +78,15 @@ cc_library(
 cc_library(
   basic_engine
   SRCS basic_engine.cc
-  DEPS layer gradient_accumulator switch_autotune)
+  DEPS layer gradient_accumulator phi)
 cc_library(
   engine
   SRCS basic_engine.cc partial_grad_engine.cc
-  DEPS layer gradient_accumulator switch_autotune)
+  DEPS layer gradient_accumulator phi)
 cc_library(
   imperative_profiler
   SRCS profiler.cc
-  DEPS flags)
+  DEPS phi)
 if(NOT WIN32)
   if(WITH_NCCL OR WITH_RCCL)
     cc_library(
@@ -174,12 +172,4 @@ endif()
 cc_library(
   gradient_accumulator
   SRCS gradient_accumulator.cc
-  DEPS blas
-       operator
-       lod_tensor
-       selected_rows_utils
-       selected_rows_functor
-       var_type_traits
-       layer
-       math_function
-       phi_tensor)
+  DEPS operator lod_tensor selected_rows_utils var_type_traits layer phi)
@@ -32,14 +32,8 @@ endif()
 # fluid_modules exclude API-interface of inference/api and inference/capi_exp
 get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
-get_property(phi_modules GLOBAL PROPERTY PHI_MODULES)
-get_property(phi_kernels GLOBAL PROPERTY PHI_KERNELS)
 set(utils_modules pretty_log string_helper benchmark)
-if(WITH_CUSTOM_DEVICE)
-  set(fluid_modules ${fluid_modules} phi_capi)
-endif()
 add_subdirectory(api)
 # Create static inference library if needed
@@ -51,7 +45,6 @@ set(STATIC_INFERENCE_API
     reset_tensor_array
     analysis_config
     paddle_pass_builder
-    phi
     ${mkldnn_quantizer_cfg})
 set(OP_LIST
@@ -64,16 +57,14 @@ set(KERNEL_LIST
 #windows GPU static library over the limit, so not create_static_lib, and cc_library is dummy
 if(WIN32 AND WITH_GPU)
-  cc_library(paddle_inference DEPS ${fluid_modules} phi ${STATIC_INFERENCE_API}
+  cc_library(paddle_inference DEPS ${fluid_modules} ${STATIC_INFERENCE_API}
              ${utils_modules})
 else()
   # message("${fluid_modules}")
-  # message("PHI_MODULES ${phi_modules}")
-  # message("${phi_kernels}")
   # message("${STATIC_INFERENCE_API}")
   # message("${utils_modules}")
-  create_static_lib(paddle_inference ${fluid_modules} ${phi_modules}
-                    ${phi_kernels} ${STATIC_INFERENCE_API} ${utils_modules})
+  create_static_lib(paddle_inference ${fluid_modules} ${STATIC_INFERENCE_API}
+                    ${utils_modules})
 endif()
 if(NOT APPLE)
@@ -103,7 +94,7 @@ set(SHARED_INFERENCE_SRCS
 # shared inference library deps
 list(REMOVE_ITEM fluid_modules standalone_executor
      interpretercore_garbage_collector)
-set(SHARED_INFERENCE_DEPS ${fluid_modules} phi analysis_predictor
+set(SHARED_INFERENCE_DEPS phi ${fluid_modules} analysis_predictor
     ${utils_modules})
 if(WITH_CRYPTO)
@@ -124,12 +115,6 @@ if(WITH_ONNXRUNTIME)
       ${CMAKE_CURRENT_SOURCE_DIR}/api/onnxruntime_predictor.cc)
 endif()
-#export all symbols for paddle/phi/api/include/api.h on paddle_inference_shared, only for UNIX
-if(UNIX)
-  set(SHARED_INFERENCE_DEPS ${SHARED_INFERENCE_DEPS}
-      $<TARGET_OBJECTS:phi_function_api>)
-endif()
 # Create shared inference library
 cc_library(
   paddle_inference_shared SHARED
@@ -141,12 +126,15 @@ target_link_libraries(paddle_inference_shared ${os_dependency_modules})
 if(WIN32)
   set_property(TARGET paddle_inference_shared
                PROPERTY WINDOWS_EXPORT_ALL_SYMBOLS ON)
-  target_link_libraries(paddle_inference_shared gflags)
+  target_link_libraries(paddle_inference_shared phi)
 endif()
 set_target_properties(paddle_inference_shared PROPERTIES OUTPUT_NAME
                                                          paddle_inference)
-if(NOT APPLE AND NOT WIN32)
+if(NOT APPLE
+   AND NOT WIN32
+   AND NOT WITH_TESTING
+   AND NOT WITH_INFERENCE_API_TEST)
   # TODO(liuyiqun): Temporarily disable the link flag because it is not support on Mac.
   set(LINK_FLAGS
       "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/paddle_inference.map")
......
@@ -41,7 +41,7 @@ if(WITH_CRYPTO)
   list(APPEND paddle_inference_api_deps paddle_crypto)
 endif()
 if(WITH_CUSTOM_DEVICE)
-  set(paddle_inference_api_deps ${paddle_inference_api_deps} phi_capi)
+  set(paddle_inference_api_deps ${paddle_inference_api_deps} phi)
 endif()
 cc_library(
@@ -50,7 +50,7 @@ cc_library(
   DEPS ${paddle_inference_api_deps})
 if(WIN32)
-  target_link_libraries(paddle_inference_api gflags)
+  target_link_libraries(paddle_inference_api phi)
 endif()
 set(inference_deps ${analysis_deps} paddle_inference_api analysis
......
@@ -72,7 +72,7 @@
 #endif
 #ifdef PADDLE_WITH_MKLML
-#include "paddle/fluid/platform/dynload/mklml.h"
+#include "paddle/phi/backends/dynload/mklml.h"
 #endif
 #ifdef PADDLE_WITH_MKLDNN
@@ -1121,7 +1121,7 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
   // Frees unused memory allocated by the Intel® MKL Memory Allocator to
   // avoid memory leak. See:
   // https://software.intel.com/en-us/mkl-developer-reference-c-mkl-free-buffers
-  platform::dynload::MKL_Free_Buffers();
+  phi::dynload::MKL_Free_Buffers();
 #endif
   return true;
 }
@@ -1185,7 +1185,7 @@ bool AnalysisPredictor::Run(const std::vector<paddle::Tensor> &inputs,
   // Frees unused memory allocated by the Intel® MKL Memory Allocator to
   // avoid memory leak. See:
   // https://software.intel.com/en-us/mkl-developer-reference-c-mkl-free-buffers
-  platform::dynload::MKL_Free_Buffers();
+  phi::dynload::MKL_Free_Buffers();
 #endif
   return true;
 }
@@ -2100,7 +2100,7 @@ bool AnalysisPredictor::ZeroCopyRun() {
   // Frees unused memory allocated by the Intel® MKL Memory Allocator to
   // avoid memory leak. See:
   // https://software.intel.com/en-us/mkl-developer-reference-c-mkl-free-buffers
-  platform::dynload::MKL_Free_Buffers();
+  phi::dynload::MKL_Free_Buffers();
 #endif
   return true;
 }
......
@@ -199,7 +199,7 @@ if(NOT WIN32)
       ${MATH_LIB}
       ${MKLDNN_LIB}
       glog
-      gflags
+      phi
       protobuf
       xxhash
       cryptopp
......
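Because libphi.* is now copied into paddle/lib of the inference package (see the inference_lib_dist hunk earlier) and the demo above links phi, an external project consuming the installed package needs phi on its link line as well. A minimal, hypothetical consumer sketch follows; the lib-dir layout comes from the diff, while the project name, source file, and the omitted third-party dependencies are illustrative:

# Hypothetical consumer of the paddle_inference install tree (sketch only).
cmake_minimum_required(VERSION 3.16)
project(paddle_infer_demo CXX)

set(PADDLE_INFER_DIR "" CACHE PATH "paddle_inference install directory")

add_executable(demo demo.cc)
target_include_directories(demo PRIVATE ${PADDLE_INFER_DIR}/paddle/include)
target_link_libraries(demo
  ${PADDLE_INFER_DIR}/paddle/lib/libpaddle_inference.so
  # new with this PR: phi ships as its own shared library
  ${PADDLE_INFER_DIR}/paddle/lib/libphi.so)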
@@ -29,6 +29,7 @@ WIN_DETECT=$(echo `uname` | grep "Win") # detect current platform
 cd `dirname $0`
 current_dir=`pwd`
 if [ $2 == ON ]; then
+  # You can export yourself if move the install path
   MKL_LIB=${inference_install_dir}/third_party/install/mklml/lib
......
@@ -25,7 +25,7 @@ if(WITH_ONNXRUNTIME)
   cc_library(
     zero_copy_tensor_dummy
     SRCS zero_copy_tensor_dummy.cc
-    DEPS onnxruntime phi_enforce)
+    DEPS onnxruntime phi)
 else()
   cc_library(
     zero_copy_tensor
@@ -34,7 +34,7 @@ else()
   cc_library(
     zero_copy_tensor_dummy
     SRCS zero_copy_tensor_dummy.cc
-    DEPS phi_enforce)
+    DEPS phi)
 endif()
 cc_test(
......
@@ -39,7 +39,7 @@ if(APPLE)
     utf8proc
     cryptopp
     protobuf
-    gflags
+    phi
     cblas)
 endif()
......
...@@ -23,7 +23,7 @@ fi ...@@ -23,7 +23,7 @@ fi
# 2. set LD_LIBRARY_PATH # 2. set LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$PWD/paddle_inference_c/third_party/install/mklml/lib/:$PWD/paddle_inference_c/third_party/install/mkldnn/lib/:$PWD/paddle_inference_c/paddle/lib/ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$PWD/paddle_inference_c/third_party/install/mklml/lib/:$PWD/paddle_inference_c/third_party/install/mkldnn/lib/:$PWD/paddle_inference_c/paddle/lib/
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$PWD/paddle_inference_c/third_party/install/onnxruntime/lib/:$PWD/paddle_inference_c/third_party/install/paddle2onnx/lib/ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$PWD/paddle_inference_c/third_party/install/onnxruntime/lib/:$PWD/paddle_inference_c/third_party/install/paddle2onnx/lib/
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${PADDLE_ROOT}/build/paddle/phi/
# 3. go test # 3. go test
go clean -testcache go clean -testcache
go test -v ./... go test -v ./...
...@@ -141,8 +141,7 @@ nv_test( ...@@ -141,8 +141,7 @@ nv_test(
nv_test( nv_test(
test_custom_plugin_creater test_custom_plugin_creater
SRCS test_custom_plugin_creater.cc SRCS test_custom_plugin_creater.cc
DEPS paddle_framework tensorrt_converter op_meta_info custom_operator DEPS paddle_framework tensorrt_converter phi custom_operator init_phi)
init_phi)
if(WITH_ONNXRUNTIME AND WIN32) if(WITH_ONNXRUNTIME AND WIN32)
# Copy onnxruntime for some c++ test in Windows, since the test will # Copy onnxruntime for some c++ test in Windows, since the test will
......
include(ExternalProject) include(ExternalProject)
set(ALLOCATOR_DEPS place stats profiler phi_backends device_context) set(ALLOCATOR_DEPS place stats profiler phi device_context)
set(ALLOCATOR_SRCS set(ALLOCATOR_SRCS
allocator.cc allocator.cc
cpu_allocator.cc cpu_allocator.cc
...@@ -32,7 +32,7 @@ if(WITH_GPU OR WITH_ROCM) ...@@ -32,7 +32,7 @@ if(WITH_GPU OR WITH_ROCM)
endif() endif()
if(WITH_GPU) if(WITH_GPU)
list(APPEND ALLOCATOR_DEPS phi_backends) list(APPEND ALLOCATOR_DEPS phi)
endif() endif()
if(CUDA_VERSION VERSION_GREATER_EQUAL 10.2) if(CUDA_VERSION VERSION_GREATER_EQUAL 10.2)
......
...@@ -124,7 +124,7 @@ class CUDAGraphAllocator ...@@ -124,7 +124,7 @@ class CUDAGraphAllocator
: underlying_allocator_(allocator) {} : underlying_allocator_(allocator) {}
public: public:
~CUDAGraphAllocator() { VLOG(10) << "CUDAGraphAllocator destructed"; } ~CUDAGraphAllocator() {}
static std::shared_ptr<Allocator> Create( static std::shared_ptr<Allocator> Create(
const std::shared_ptr<Allocator>& allocator) { const std::shared_ptr<Allocator>& allocator) {
...@@ -1137,7 +1137,6 @@ void AllocatorFacade::RemoveMemoryPoolOfCUDAGraph(int64_t id) { ...@@ -1137,7 +1137,6 @@ void AllocatorFacade::RemoveMemoryPoolOfCUDAGraph(int64_t id) {
if (ref_cnt == 0) { if (ref_cnt == 0) {
cuda_graph_map_.erase(id); cuda_graph_map_.erase(id);
cuda_graph_ref_cnt_.erase(ref_cnt_iter); cuda_graph_ref_cnt_.erase(ref_cnt_iter);
VLOG(10) << "Remove memory pool of CUDA Graph with memory ID " << id;
} else { } else {
VLOG(10) << "Decrease memory pool ID " << id << " reference count to be " VLOG(10) << "Decrease memory pool ID " << id << " reference count to be "
<< ref_cnt; << ref_cnt;
......
...@@ -90,7 +90,7 @@ if(WITH_UNITY_BUILD) ...@@ -90,7 +90,7 @@ if(WITH_UNITY_BUILD)
include(unity_build_rule.cmake) include(unity_build_rule.cmake)
endif() endif()
set(OP_HEADER_DEPS ${OP_HEADER_DEPS} phi phi_utils backward_infermeta sparse_backward_infermeta static_prim_api get_expected_kernel_func) set(OP_HEADER_DEPS ${OP_HEADER_DEPS} phi phi_utils static_prim_api get_expected_kernel_func)
register_operators(EXCLUDES py_func_op dgc_op generated_op1 generated_op2 generated_op3 generated_op4 load_combine_op lstm_op run_program_op quantize_linear_op register_operators(EXCLUDES py_func_op dgc_op generated_op1 generated_op2 generated_op3 generated_op4 load_combine_op lstm_op run_program_op quantize_linear_op
recurrent_op save_combine_op sparse_attention_op sync_batch_norm_op activation_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS}) recurrent_op save_combine_op sparse_attention_op sync_batch_norm_op activation_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS})
...@@ -125,7 +125,7 @@ if (WITH_GPU OR WITH_ROCM) ...@@ -125,7 +125,7 @@ if (WITH_GPU OR WITH_ROCM)
endif() endif()
endif() endif()
op_library(lstm_op DEPS ${OP_HEADER_DEPS} lstm_compute) op_library(lstm_op DEPS ${OP_HEADER_DEPS})
op_library(recurrent_op DEPS ${OP_HEADER_DEPS}) op_library(recurrent_op DEPS ${OP_HEADER_DEPS})
set(COMMON_OP_DEPS ${OP_HEADER_DEPS}) set(COMMON_OP_DEPS ${OP_HEADER_DEPS})
...@@ -136,17 +136,16 @@ if (WITH_DGC) ...@@ -136,17 +136,16 @@ if (WITH_DGC)
endif() endif()
cc_library(common_infer_shape_functions SRCS common_infer_shape_functions.cc DEPS operator) cc_library(common_infer_shape_functions SRCS common_infer_shape_functions.cc DEPS operator)
cc_library(ops_extra_info SRCS ops_extra_info.cc DEPS attribute cudnn_workspace_helper) cc_library(ops_extra_info SRCS ops_extra_info.cc DEPS attribute phi)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows_utils lapack_function set(COMMON_OP_DEPS ${COMMON_OP_DEPS} phi)
lod_tensor maxouting unpooling pooling lod_rank_table context_project set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_utils
sequence_pooling executor generator static_prim_api) lod_tensor unpooling lod_rank_table context_project executor static_prim_api)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc static_prim_api static_utils static_global_utils prim_utils) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc static_prim_api static_utils static_global_utils prim_utils)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler sample_prob tree2col) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} cos_sim_functor memory concat_and_split sampler sample_prob tree2col)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search fc_functor matrix_inverse matrix_solve) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} beam_search)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} box_wrapper ps_gpu_wrapper) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} box_wrapper ps_gpu_wrapper)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} common_infer_shape_functions) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} common_infer_shape_functions)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} eigen_function)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} processgroup_comm_utils) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} processgroup_comm_utils)
if(WITH_NCCL OR WITH_RCCL) if(WITH_NCCL OR WITH_RCCL)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} process_group_nccl) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} process_group_nccl)
...@@ -189,7 +188,7 @@ endif() ...@@ -189,7 +188,7 @@ endif()
copy_if_different(${pybind_file} ${pybind_file_final}) copy_if_different(${pybind_file} ${pybind_file_final})
if (WITH_CUSTOM_DEVICE) if (WITH_CUSTOM_DEVICE)
cc_library(custom_device_common_op_registry SRCS custom_device_common_op_registry.cc DEPS operator phi_api) cc_library(custom_device_common_op_registry SRCS custom_device_common_op_registry.cc DEPS operator phi type_info)
endif() endif()
if(NOT "${OP_LIST}" STREQUAL "") if(NOT "${OP_LIST}" STREQUAL "")
......
...@@ -7,7 +7,7 @@ cc_library( ...@@ -7,7 +7,7 @@ cc_library(
cc_library( cc_library(
cinn_launch_context cinn_launch_context
SRCS cinn_launch_context.cc SRCS cinn_launch_context.cc
DEPS ddim DEPS phi
lod_tensor lod_tensor
scope scope
proto_desc proto_desc
......
...@@ -18,7 +18,7 @@ foreach(src ${OPS}) ...@@ -18,7 +18,7 @@ foreach(src ${OPS})
endforeach() endforeach()
if(WITH_GLOO) if(WITH_GLOO)
set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} gloo_wrapper comm_context_manager) set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} gloo_wrapper phi)
endif() endif()
register_operators( register_operators(
...@@ -31,8 +31,7 @@ register_operators( ...@@ -31,8 +31,7 @@ register_operators(
${COLLECTIVE_DEPS}) ${COLLECTIVE_DEPS})
if(WITH_NCCL OR WITH_RCCL) if(WITH_NCCL OR WITH_RCCL)
set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} nccl_common collective_helper set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} nccl_common collective_helper phi)
comm_context_manager nccl_comm_context)
op_library(c_gen_nccl_id_op DEPS ${COLLECTIVE_DEPS}) op_library(c_gen_nccl_id_op DEPS ${COLLECTIVE_DEPS})
op_library(gen_nccl_id_op DEPS ${COLLECTIVE_DEPS}) op_library(gen_nccl_id_op DEPS ${COLLECTIVE_DEPS})
endif() endif()
......
...@@ -51,8 +51,8 @@ detection_library(polygon_box_transform_op SRCS polygon_box_transform_op.cc ...@@ -51,8 +51,8 @@ detection_library(polygon_box_transform_op SRCS polygon_box_transform_op.cc
detection_library(rpn_target_assign_op SRCS rpn_target_assign_op.cc) detection_library(rpn_target_assign_op SRCS rpn_target_assign_op.cc)
detection_library(generate_proposal_labels_op SRCS detection_library(generate_proposal_labels_op SRCS
generate_proposal_labels_op.cc) generate_proposal_labels_op.cc)
detection_library(multiclass_nms_op SRCS multiclass_nms_op.cc DEPS gpc) detection_library(multiclass_nms_op SRCS multiclass_nms_op.cc DEPS phi)
detection_library(locality_aware_nms_op SRCS locality_aware_nms_op.cc DEPS gpc) detection_library(locality_aware_nms_op SRCS locality_aware_nms_op.cc DEPS phi)
detection_library(box_clip_op SRCS box_clip_op.cc box_clip_op.cu) detection_library(box_clip_op SRCS box_clip_op.cc box_clip_op.cu)
detection_library(box_decoder_and_assign_op SRCS box_decoder_and_assign_op.cc detection_library(box_decoder_and_assign_op SRCS box_decoder_and_assign_op.cc
box_decoder_and_assign_op.cu) box_decoder_and_assign_op.cu)
......
...@@ -289,7 +289,7 @@ file(APPEND ${op_utils_header} ...@@ -289,7 +289,7 @@ file(APPEND ${op_utils_header}
# Automatically generate the registration code of all arg map functions # Automatically generate the registration code of all arg map functions
# and compile the corresponding target to avoid frequent code conflicts # and compile the corresponding target to avoid frequent code conflicts
# when writing to the same file # when writing to the same file
register_op_utils(op_compat_infos DEPS op_utils) register_op_utils(op_compat_infos DEPS phi)
copy_if_different(${op_utils_header} ${op_utils_header_final}) copy_if_different(${op_utils_header} ${op_utils_header_final})
......
...@@ -17,11 +17,12 @@ limitations under the License. */ ...@@ -17,11 +17,12 @@ limitations under the License. */
#include <memory> #include <memory>
#include <string> #include <string>
#include "paddle/phi/core/flags.h"
#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/detail/gru_cpu_kernel.h" #include "paddle/phi/kernels/funcs/detail/gru_cpu_kernel.h"
#include "paddle/phi/kernels/funcs/detail/gru_kernel.h" #include "paddle/phi/kernels/funcs/detail/gru_kernel.h"
DECLARE_int32(paddle_num_threads); PHI_DECLARE_int32(paddle_num_threads);
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
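A minimal sketch of the flag migration above, assuming PHI_DECLARE_int32 keeps the usual gflags-style FLAGS_<name> accessor (the unchanged uses of the flag elsewhere suggest it does):

#include "paddle/phi/core/flags.h"

PHI_DECLARE_int32(paddle_num_threads);  // declaration only; the flag itself is defined inside phi

static int ConfiguredThreadCount() {  // hypothetical reader, for illustration
  return FLAGS_paddle_num_threads;
}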
...@@ -6,21 +6,20 @@ if(WITH_XPU) ...@@ -6,21 +6,20 @@ if(WITH_XPU)
endif() endif()
# please add new math_library in alphabetical order # please add new math_library in alphabetical order
math_library(concat_and_split DEPS concat_and_split_functor) math_library(concat_and_split DEPS phi)
math_library(context_project DEPS im2col math_function) math_library(context_project DEPS phi)
math_library(cos_sim_functor) math_library(cos_sim_functor)
math_library(depthwise_conv) math_library(depthwise_conv)
math_library(sample_prob) math_library(sample_prob)
math_library(sampler DEPS generator) math_library(sampler DEPS phi)
# math_library(math_function DEPS blas dense_tensor tensor)
if(WITH_XPU) if(WITH_XPU)
math_library(beam_search DEPS math_function beam_search_xpu) math_library(beam_search DEPS phi beam_search_xpu)
else() else()
math_library(beam_search DEPS math_function) math_library(beam_search DEPS phi)
endif() endif()
math_library(unpooling) math_library(unpooling)
math_library(prelu) math_library(prelu)
math_library(bert_encoder_functor) math_library(bert_encoder_functor)
math_library(tree2col DEPS math_function) math_library(tree2col DEPS phi)
...@@ -20,7 +20,7 @@ if(WITH_ARM_BRPC) ...@@ -20,7 +20,7 @@ if(WITH_ARM_BRPC)
framework_proto framework_proto
sendrecv_rpc sendrecv_rpc
arm_brpc arm_brpc
gflags phi
glog glog
snappy snappy
device_context) device_context)
...@@ -42,7 +42,7 @@ else() ...@@ -42,7 +42,7 @@ else()
ssl ssl
crypto crypto
protobuf protobuf
gflags phi
glog glog
zlib zlib
snappy snappy
......
...@@ -6,5 +6,5 @@ endif() ...@@ -6,5 +6,5 @@ endif()
register_operators() register_operators()
if(WITH_UNITY_BUILD) if(WITH_UNITY_BUILD)
target_link_libraries(paddle_operators_sequence_ops_unity sequence_pooling) target_link_libraries(paddle_operators_sequence_ops_unity phi)
endif() endif()
...@@ -17,7 +17,7 @@ limitations under the License. */ ...@@ -17,7 +17,7 @@ limitations under the License. */
#include <memory> #include <memory>
#include <vector> #include <vector>
#include "paddle/fluid/platform/dynload/mklml.h" #include "paddle/phi/backends/dynload/mklml.h"
#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/math_function.h"
......
...@@ -6,9 +6,9 @@ cc_library( ...@@ -6,9 +6,9 @@ cc_library(
cc_test( cc_test(
errors_test errors_test
SRCS errors_test.cc SRCS errors_test.cc
DEPS errors enforce) DEPS phi enforce)
set(enforce_deps flags errors flags phi_enforce) set(enforce_deps phi)
if(WITH_GPU) if(WITH_GPU)
set(enforce_deps ${enforce_deps} external_error_proto) set(enforce_deps ${enforce_deps} external_error_proto)
endif() endif()
...@@ -26,20 +26,20 @@ cc_test( ...@@ -26,20 +26,20 @@ cc_test(
cc_test( cc_test(
cpu_info_test cpu_info_test
SRCS cpu_info_test.cc SRCS cpu_info_test.cc
DEPS phi_backends) DEPS phi)
cc_test( cc_test(
os_info_test os_info_test
SRCS os_info_test.cc SRCS os_info_test.cc
DEPS phi_os_info) DEPS phi)
cc_library( cc_library(
place place
SRCS place.cc SRCS place.cc
DEPS enforce phi_place) DEPS enforce phi)
cc_test( cc_test(
place_test place_test
SRCS place_test.cc SRCS place_test.cc
DEPS place glog gflags) DEPS place glog phi)
if(WITH_MKLDNN) if(WITH_MKLDNN)
set(MKLDNN_CTX_DEPS mkldnn) set(MKLDNN_CTX_DEPS mkldnn)
...@@ -104,7 +104,7 @@ endif() ...@@ -104,7 +104,7 @@ endif()
cc_library( cc_library(
init init
SRCS init.cc SRCS init.cc
DEPS device_context custom_kernel context_pool memcpy) DEPS device_context phi memcpy)
# memcpy depends on device_context, here add deps individually for # memcpy depends on device_context, here add deps individually for
# avoiding cycle dependencies # avoiding cycle dependencies
...@@ -117,7 +117,6 @@ cc_library( ...@@ -117,7 +117,6 @@ cc_library(
xxhash xxhash
${STREAM_CALLBACK_DEPS} ${STREAM_CALLBACK_DEPS}
place place
phi_place
eigen3 eigen3
cpu_helper cpu_helper
framework_proto framework_proto
...@@ -126,12 +125,8 @@ cc_library( ...@@ -126,12 +125,8 @@ cc_library(
${MKLDNN_CTX_DEPS} ${MKLDNN_CTX_DEPS}
${dgc_deps} ${dgc_deps}
dlpack dlpack
cudnn_workspace_helper phi
${XPU_CTX_DEPS} ${XPU_CTX_DEPS})
phi_backends
phi_device_context
generator
phi_enforce)
cc_library( cc_library(
collective_helper collective_helper
...@@ -189,12 +184,12 @@ if(WITH_GPU) ...@@ -189,12 +184,12 @@ if(WITH_GPU)
cuda_graph_with_memory_pool cuda_graph_with_memory_pool
SRCS cuda_graph_with_memory_pool.cc SRCS cuda_graph_with_memory_pool.cc
DEPS ${DEVICE_EVENT_LIBS} device_event_custom_device device_context DEPS ${DEVICE_EVENT_LIBS} device_event_custom_device device_context
allocator phi_backends) allocator phi)
else() else()
nv_library( nv_library(
cuda_graph_with_memory_pool cuda_graph_with_memory_pool
SRCS cuda_graph_with_memory_pool.cc SRCS cuda_graph_with_memory_pool.cc
DEPS ${DEVICE_EVENT_LIBS} device_context allocator phi_backends) DEPS ${DEVICE_EVENT_LIBS} device_context allocator phi)
endif() endif()
nv_test( nv_test(
device_context_test device_context_test
...@@ -245,7 +240,7 @@ cc_test( ...@@ -245,7 +240,7 @@ cc_test(
cc_library( cc_library(
lodtensor_printer lodtensor_printer
SRCS lodtensor_printer.cc SRCS lodtensor_printer.cc
DEPS ddim DEPS phi
place place
tensor tensor
scope scope
...@@ -263,41 +258,30 @@ if(WITH_GPU) ...@@ -263,41 +258,30 @@ if(WITH_GPU)
nv_library( nv_library(
profiler profiler
SRCS profiler.cc profiler.cu SRCS profiler.cc profiler.cu
DEPS phi_os_info DEPS phi
phi_device_tracer
gpu_info gpu_info
enforce enforce
dynload_cuda dynload_cuda
new_profiler new_profiler
stats stats
op_proto_maker op_proto_maker
shape_inference shape_inference)
phi_profiler)
elseif(WITH_ROCM) elseif(WITH_ROCM)
hip_library( hip_library(
profiler profiler
SRCS profiler.cc profiler.cu SRCS profiler.cc profiler.cu
DEPS phi_os_info DEPS phi
phi_device_tracer
gpu_info gpu_info
enforce enforce
new_profiler new_profiler
stats stats
op_proto_maker op_proto_maker
shape_inference shape_inference)
phi_profiler)
else() else()
cc_library( cc_library(
profiler profiler
SRCS profiler.cc SRCS profiler.cc
DEPS phi_os_info DEPS phi enforce new_profiler stats op_proto_maker shape_inference)
phi_device_tracer
enforce
new_profiler
stats
op_proto_maker
shape_inference
phi_profiler)
endif() endif()
cc_test( cc_test(
...@@ -333,7 +317,7 @@ if(WITH_GPU) ...@@ -333,7 +317,7 @@ if(WITH_GPU)
nv_test( nv_test(
test_limit_gpu_memory test_limit_gpu_memory
SRCS test_limit_gpu_memory.cu SRCS test_limit_gpu_memory.cu
DEPS gpu_info flags) DEPS gpu_info phi)
nv_library( nv_library(
cuda_device_guard cuda_device_guard
SRCS cuda_device_guard.cc SRCS cuda_device_guard.cc
...@@ -348,7 +332,7 @@ if(WITH_ROCM) ...@@ -348,7 +332,7 @@ if(WITH_ROCM)
hip_test( hip_test(
test_limit_gpu_memory test_limit_gpu_memory
SRCS test_limit_gpu_memory.cu SRCS test_limit_gpu_memory.cu
DEPS gpu_info flags) DEPS gpu_info phi)
hip_library( hip_library(
cuda_device_guard cuda_device_guard
SRCS cuda_device_guard.cc SRCS cuda_device_guard.cc
...@@ -360,7 +344,7 @@ if(NOT APPLE AND NOT WIN32) ...@@ -360,7 +344,7 @@ if(NOT APPLE AND NOT WIN32)
cc_test( cc_test(
device_code_test device_code_test
SRCS device_code_test.cc SRCS device_code_test.cc
DEPS phi_backends lod_tensor) DEPS phi lod_tensor)
endif() endif()
endif() endif()
...@@ -382,4 +366,4 @@ cc_library( ...@@ -382,4 +366,4 @@ cc_library(
cc_test( cc_test(
init_phi_test init_phi_test
SRCS init_phi_test.cc SRCS init_phi_test.cc
DEPS phi_tensor init_phi) DEPS phi init_phi)
...@@ -17,7 +17,7 @@ limitations under the License. */ ...@@ -17,7 +17,7 @@ limitations under the License. */
#ifdef PADDLE_WITH_MKLML #ifdef PADDLE_WITH_MKLML
#include <omp.h> #include <omp.h>
#include "paddle/fluid/platform/dynload/mklml.h" #include "paddle/phi/backends/dynload/mklml.h"
#endif #endif
#ifdef PADDLE_USE_OPENBLAS #ifdef PADDLE_USE_OPENBLAS
...@@ -40,7 +40,7 @@ void SetNumThreads(int num_threads) { ...@@ -40,7 +40,7 @@ void SetNumThreads(int num_threads) {
openblas_set_num_threads(real_num_threads); openblas_set_num_threads(real_num_threads);
#elif defined(PADDLE_WITH_MKLML) #elif defined(PADDLE_WITH_MKLML)
int real_num_threads = num_threads > 1 ? num_threads : 1; int real_num_threads = num_threads > 1 ? num_threads : 1;
platform::dynload::MKL_Set_Num_Threads(real_num_threads); phi::dynload::MKL_Set_Num_Threads(real_num_threads);
omp_set_num_threads(real_num_threads); omp_set_num_threads(real_num_threads);
#elif defined(PADDLE_USE_REFERENCE_CBLAS) #elif defined(PADDLE_USE_REFERENCE_CBLAS)
// cblas not support multi-thread // cblas not support multi-thread
......
...@@ -2,9 +2,9 @@ if(WITH_CUSTOM_DEVICE) ...@@ -2,9 +2,9 @@ if(WITH_CUSTOM_DEVICE)
cc_library( cc_library(
custom_device_resource_pool custom_device_resource_pool
SRCS custom_device_resource_pool.cc SRCS custom_device_resource_pool.cc
DEPS gflags glog enforce monitor) DEPS phi glog enforce monitor)
cc_test( cc_test(
custom_device_test custom_device_test
SRCS custom_device_test.cc SRCS custom_device_test.cc
DEPS phi_tensor_utils phi_backends phi_device_context gradient_accumulator) DEPS phi gradient_accumulator)
endif() endif()
...@@ -3,13 +3,7 @@ if(WITH_GPU) ...@@ -3,13 +3,7 @@ if(WITH_GPU)
nv_library( nv_library(
gpu_info gpu_info
SRCS gpu_info.cc SRCS gpu_info.cc
DEPS phi_backends DEPS phi glog enforce monitor dynload_cuda malloc)
gflags
glog
enforce
monitor
dynload_cuda
malloc)
nv_test(cuda_helper_test SRCS cuda_helper_test.cu) nv_test(cuda_helper_test SRCS cuda_helper_test.cu)
nv_test( nv_test(
...@@ -21,7 +15,7 @@ elseif(WITH_ROCM) ...@@ -21,7 +15,7 @@ elseif(WITH_ROCM)
hip_library( hip_library(
gpu_info gpu_info
SRCS gpu_info.cc SRCS gpu_info.cc
DEPS phi_backends gflags glog enforce monitor dynload_cuda) DEPS phi glog enforce monitor dynload_cuda)
hip_test(cuda_helper_test SRCS cuda_helper_test.cu) hip_test(cuda_helper_test SRCS cuda_helper_test.cu)
hip_test( hip_test(
......
...@@ -14,23 +14,11 @@ set(XPU_CTX_DEPS ...@@ -14,23 +14,11 @@ set(XPU_CTX_DEPS
cc_library( cc_library(
xpu_info xpu_info
SRCS xpu_info.cc SRCS xpu_info.cc
DEPS gflags DEPS glog enforce xpulib device_context place phi)
glog
enforce
xpulib
device_context
place
phi_backends)
cc_library( cc_library(
xpu_op_list xpu_op_list
SRCS xpu_op_list.cc SRCS xpu_op_list.cc
DEPS gflags DEPS glog enforce xpulib device_context op_kernel_type phi)
glog
enforce
xpulib
device_context
op_kernel_type
phi_backends)
cc_library( cc_library(
xpu_resource_pool xpu_resource_pool
SRCS xpu_resource_pool.cc SRCS xpu_resource_pool.cc
......
cc_library( cc_library(
dynamic_loader dynamic_loader
SRCS dynamic_loader.cc SRCS dynamic_loader.cc
DEPS glog gflags enforce phi_dynamic_loader) DEPS glog enforce phi)
list( list(
APPEND APPEND
...@@ -57,26 +57,20 @@ if(WITH_ROCM) ...@@ -57,26 +57,20 @@ if(WITH_ROCM)
hip_library( hip_library(
dynload_cuda dynload_cuda
SRCS ${HIP_SRCS} SRCS ${HIP_SRCS}
DEPS dynamic_loader phi_dynload_cuda) DEPS dynamic_loader phi)
cc_library( cc_library(
dynload_warpctc dynload_warpctc
SRCS warpctc.cc SRCS warpctc.cc
DEPS dynamic_loader warpctc phi_dynload_warpctc) DEPS dynamic_loader warpctc phi)
else() else()
nv_library( nv_library(
dynload_cuda dynload_cuda
SRCS ${CUDA_SRCS} SRCS ${CUDA_SRCS}
DEPS dynamic_loader phi_dynload_cuda) DEPS dynamic_loader phi)
cc_library( cc_library(
dynload_warpctc dynload_warpctc
SRCS warpctc.cc SRCS warpctc.cc
DEPS dynamic_loader warpctc phi_dynload_warpctc) DEPS dynamic_loader warpctc phi)
endif()
if(WITH_MKLML)
cc_library(
dynload_mklml
SRCS mklml.cc
DEPS dynamic_loader mklml phi_dynload_mklml)
endif() endif()
# TODO(TJ): add iomp, mkldnn? # TODO(TJ): add iomp, mkldnn?
...@@ -86,6 +80,6 @@ if(MKL_FOUND AND WITH_ONEMKL) ...@@ -86,6 +80,6 @@ if(MKL_FOUND AND WITH_ONEMKL)
cc_library( cc_library(
dynload_mklrt dynload_mklrt
SRCS mklrt.cc SRCS mklrt.cc
DEPS dynamic_loader phi_dynload_mklrt) DEPS dynamic_loader phi)
target_include_directories(dynload_mklrt PRIVATE ${MKL_INCLUDE}) target_include_directories(dynload_mklrt PRIVATE ${MKL_INCLUDE})
endif() endif()
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <mkl.h>
#include <mutex> // NOLINT
#include "paddle/phi/backends/dynload/mklml.h"
namespace paddle {
namespace platform {
namespace dynload {
/**
 * The following macro definition generates a struct
 * (for each function) to dynamically load the mklml routine
 * via operator overloading.
*/
#define DYNAMIC_LOAD_MKLML_WRAP(__name) \
using DynLoad__##__name = phi::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
#define PLATFORM_DECLARE_DYNAMIC_LOAD_MKLML_WRAP(__name) \
DYNAMIC_LOAD_MKLML_WRAP(__name)
#define MKLML_ROUTINE_EACH(__macro) \
__macro(cblas_sgemm); \
__macro(cblas_dgemm); \
__macro(cblas_cgemm); \
__macro(cblas_zgemm); \
__macro(cblas_saxpy); \
__macro(cblas_daxpy); \
__macro(cblas_caxpy); \
__macro(cblas_zaxpy); \
__macro(cblas_scopy); \
__macro(cblas_dcopy); \
__macro(cblas_ccopy); \
__macro(cblas_zcopy); \
__macro(cblas_sgemv); \
__macro(cblas_dgemv); \
__macro(cblas_cgemv); \
__macro(cblas_zgemv); \
__macro(cblas_strsm); \
__macro(cblas_dtrsm); \
__macro(cblas_ctrsm); \
__macro(cblas_ztrsm); \
__macro(cblas_sgemm_alloc); \
__macro(cblas_dgemm_alloc); \
__macro(cblas_sgemm_pack); \
__macro(cblas_dgemm_pack); \
__macro(cblas_sgemm_compute); \
__macro(cblas_dgemm_compute); \
__macro(cblas_sgemm_free); \
__macro(cblas_dgemm_free); \
__macro(cblas_sgemm_batch); \
__macro(cblas_dgemm_batch); \
__macro(cblas_cgemm_batch); \
__macro(cblas_zgemm_batch); \
__macro(cblas_sdot); \
__macro(cblas_ddot); \
__macro(cblas_sasum); \
__macro(cblas_dasum); \
__macro(cblas_isamax); \
__macro(cblas_idamax); \
__macro(cblas_sscal); \
__macro(cblas_dscal); \
__macro(vsAdd); \
__macro(vdAdd); \
__macro(vsSub); \
__macro(vdSub); \
__macro(vsMul); \
__macro(vdMul); \
__macro(vsDiv); \
__macro(vdDiv); \
__macro(vsExp); \
__macro(vdExp); \
__macro(vsSqr); \
__macro(vdSqr); \
__macro(vsPowx); \
__macro(vdPowx); \
__macro(vsInv); \
__macro(vdInv); \
__macro(vmsErf); \
__macro(vmdErf); \
__macro(MKL_Free_Buffers); \
__macro(MKL_Set_Num_Threads); \
__macro(MKL_Get_Max_Threads);
MKLML_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_MKLML_WRAP);
#if !defined(_WIN32)
DYNAMIC_LOAD_MKLML_WRAP(mkl_scsrmm);
DYNAMIC_LOAD_MKLML_WRAP(mkl_dcsrmm);
#endif
#undef DYNAMIC_LOAD_MKLML_WRAP
} // namespace dynload
} // namespace platform
} // namespace paddle
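A hedged usage sketch of the compatibility header above: legacy call sites keep the paddle::platform::dynload spelling, while the aliased phi::dynload structs do the actual loading.

#include "paddle/fluid/platform/dynload/mklml.h"  // the wrapper defined above

void UseSingleMklThread() {  // hypothetical caller, for illustration
  // Resolves to the same dynamically loaded symbol as phi::dynload::MKL_Set_Num_Threads.
  paddle::platform::dynload::MKL_Set_Num_Threads(1);
}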
...@@ -40,6 +40,22 @@ PADDLE_DEFINE_EXPORTED_bool(enable_rpc_profiler, ...@@ -40,6 +40,22 @@ PADDLE_DEFINE_EXPORTED_bool(enable_rpc_profiler,
DEFINE_bool(enable_record_memory, false, "enable memory recorder"); DEFINE_bool(enable_record_memory, false, "enable memory recorder");
#if defined(_WIN32) && defined(PHI_SHARED)
phi::ProfilerState phi::ProfilerHelper::g_state = phi::ProfilerState::kDisabled;
bool phi::ProfilerHelper::g_enable_nvprof_hook = false;
thread_local uint64_t phi::ProfilerHelper::g_thread_id;
uint32_t phi::ProfilerHelper::g_next_thread_id = 0;
std::mutex phi::ProfilerHelper::g_all_event_lists_mutex;
std::list<std::shared_ptr<phi::EventList<phi::Event>>>
phi::ProfilerHelper::g_all_event_lists;
thread_local std::shared_ptr<phi::EventList<phi::Event>>
phi::ProfilerHelper::g_event_list;
std::list<std::shared_ptr<phi::EventList<phi::MemEvent>>>
phi::ProfilerHelper::g_all_mem_event_lists;
thread_local std::shared_ptr<phi::EventList<phi::MemEvent>>
phi::ProfilerHelper::g_mem_event_list;
std::mutex phi::ProfilerHelper::g_all_mem_event_lists_mutex;
#endif
namespace paddle { namespace paddle {
namespace platform { namespace platform {
......
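The guarded block above defines the phi::ProfilerHelper statics in this module when phi is built as a Windows DLL. A generic sketch of that pattern, with hypothetical names (not the real phi classes):

#ifndef PADDLE_API          // stand-in so the sketch is self-contained
#define PADDLE_API
#endif

class PADDLE_API ExampleHelper {
 public:
  static int g_counter;     // static data member of an exported class
};

#if defined(_WIN32) && defined(PHI_SHARED)
int ExampleHelper::g_counter = 0;  // definition supplied by the importing module
#endif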
cc_library( cc_library(
host_tracer host_tracer
SRCS host_tracer.cc SRCS host_tracer.cc
DEPS framework_proto enforce ddim var_type_traits) DEPS framework_proto enforce phi var_type_traits)
cc_library( cc_library(
cuda_tracer cuda_tracer
SRCS cuda_tracer.cc cupti_data_process.cc SRCS cuda_tracer.cc cupti_data_process.cc
...@@ -28,7 +28,7 @@ cc_library( ...@@ -28,7 +28,7 @@ cc_library(
cc_library( cc_library(
cpu_utilization cpu_utilization
SRCS cpu_utilization.cc SRCS cpu_utilization.cc
DEPS phi_backends phi_os_info enforce glog) DEPS phi enforce glog)
cc_library( cc_library(
new_profiler new_profiler
SRCS profiler.cc SRCS profiler.cc
......
...@@ -28,7 +28,6 @@ set(PYBIND_DEPS ...@@ -28,7 +28,6 @@ set(PYBIND_DEPS
gloo_wrapper gloo_wrapper
infer_io_utils infer_io_utils
heter_wrapper heter_wrapper
generator
op_version_registry op_version_registry
ps_gpu_wrapper ps_gpu_wrapper
custom_operator custom_operator
...@@ -37,16 +36,13 @@ set(PYBIND_DEPS ...@@ -37,16 +36,13 @@ set(PYBIND_DEPS
fleet_executor fleet_executor
global_utils global_utils
phi_utils phi_utils
tcp_store phi
comm_context_manager
new_profiler new_profiler
auto_parallel
jit_layer jit_layer
jit_property jit_property
prim_utils prim_utils
operants_manager static_tensor_operants
phi_tensor_operants type_info)
static_tensor_operants)
if(WITH_PSCORE) if(WITH_PSCORE)
set(PYBIND_DEPS ${PYBIND_DEPS} ps_service) set(PYBIND_DEPS ${PYBIND_DEPS} ps_service)
...@@ -65,7 +61,7 @@ if(WITH_RPC) ...@@ -65,7 +61,7 @@ if(WITH_RPC)
zlib zlib
leveldb leveldb
snappy snappy
gflags phi
glog) glog)
endif() endif()
if(WITH_GPU OR WITH_ROCM) if(WITH_GPU OR WITH_ROCM)
...@@ -148,7 +144,6 @@ set(PYBIND_SRCS ...@@ -148,7 +144,6 @@ set(PYBIND_SRCS
auto_parallel_py.cc) auto_parallel_py.cc)
if(WITH_CUSTOM_DEVICE) if(WITH_CUSTOM_DEVICE)
set(PYBIND_DEPS ${PYBIND_DEPS} phi_capi)
set(PYBIND_DEPS ${PYBIND_DEPS} custom_device_common_op_registry) set(PYBIND_DEPS ${PYBIND_DEPS} custom_device_common_op_registry)
endif() endif()
...@@ -334,6 +329,14 @@ if(WITH_PYTHON) ...@@ -334,6 +329,14 @@ if(WITH_PYTHON)
")\n" ")\n"
"exit /b 0") "exit /b 0")
if(WITH_PHI_SHARED)
add_custom_command(
OUTPUT ${op_impl_path}/phi.dll
COMMAND ${CMAKE_COMMAND} -E copy ${PHI_LIB} ${op_impl_path}
DEPENDS phi)
list(APPEND EAGER_OP_IMPL_DEPS ${op_impl_path}/libiomp5md.dll)
endif()
if(${CBLAS_PROVIDER} STREQUAL MKLML) if(${CBLAS_PROVIDER} STREQUAL MKLML)
add_custom_command( add_custom_command(
OUTPUT ${op_impl_path}/libiomp5md.dll OUTPUT ${op_impl_path}/libiomp5md.dll
...@@ -481,10 +484,8 @@ if(WITH_PYTHON) ...@@ -481,10 +484,8 @@ if(WITH_PYTHON)
list(APPEND PYBIND_DEPS python) list(APPEND PYBIND_DEPS python)
list(APPEND PYBIND_DEPS custom_operator) list(APPEND PYBIND_DEPS custom_operator)
list(APPEND PYBIND_DEPS custom_operator_node) list(APPEND PYBIND_DEPS custom_operator_node)
list(APPEND PYBIND_DEPS tensor_api)
list(APPEND PYBIND_DEPS eager_tensor_operants) list(APPEND PYBIND_DEPS eager_tensor_operants)
list(APPEND PYBIND_DEPS pybind_util) list(APPEND PYBIND_DEPS pybind_util)
list(APPEND PYBIND_DEPS flags)
endif() endif()
# On Linux, cc_library(paddle SHARED ..) will generate the libpaddle.so, # On Linux, cc_library(paddle SHARED ..) will generate the libpaddle.so,
......
...@@ -38,7 +38,9 @@ limitations under the License. */ ...@@ -38,7 +38,9 @@ limitations under the License. */
#include "paddle/phi/common/data_type.h" #include "paddle/phi/common/data_type.h"
#include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/dense_tensor.h"
DECLARE_bool(check_nan_inf); #include "paddle/phi/core/flags.h"
PHI_DECLARE_bool(check_nan_inf);
namespace paddle { namespace paddle {
namespace pybind { namespace pybind {
......
...@@ -3,6 +3,15 @@ configure_file(config.h.in ${CMAKE_CURRENT_SOURCE_DIR}/config.h) ...@@ -3,6 +3,15 @@ configure_file(config.h.in ${CMAKE_CURRENT_SOURCE_DIR}/config.h)
# phi auto cmake utils # phi auto cmake utils
include(phi) include(phi)
set(common_srcs CACHE INTERNAL "" FORCE)
set(api_srcs CACHE INTERNAL "" FORCE)
set(capi_srcs CACHE INTERNAL "" FORCE)
set(core_srcs CACHE INTERNAL "" FORCE)
set(backends_srcs CACHE INTERNAL "" FORCE)
set(kernels_srcs CACHE INTERNAL "" FORCE)
set(infermeta_srcs CACHE INTERNAL "" FORCE)
#set(excluded_srcs CACHE INTERNAL "" FORCE)
# paddle experimental common components # paddle experimental common components
add_subdirectory(common) add_subdirectory(common)
...@@ -24,29 +33,153 @@ if(WITH_CUSTOM_DEVICE) ...@@ -24,29 +33,153 @@ if(WITH_CUSTOM_DEVICE)
add_subdirectory(capi) add_subdirectory(capi)
endif() endif()
# make a unity target for compile deps
set(PHI_DEPS set(PHI_DEPS
convert_utils phi_profiler_proto
dense_tensor auto_parallel_proto
phi_backends gflags
kernel_factory glog
kernel_context warpctc
arg_map_context warprnnt
infermeta eigen3
lod_utils xxhash
sparse_csr_tensor cblas
sparse_coo_tensor utf8proc)
string_tensor
api_scalar if(WITH_GPU)
api_int_array list(APPEND PHI_DEPS external_error_proto)
extended_tensor endif()
dist_attr
dist_mapper) if(WITH_ASCEND_CL)
list(APPEND PHI_DEPS npu_hccl)
get_property(phi_kernels GLOBAL PROPERTY PHI_KERNELS) endif()
set(PHI_DEPS ${PHI_DEPS} ${phi_kernels})
if(WITH_FLASHATTN)
cc_library(phi DEPS ${PHI_DEPS}) list(APPEND PHI_DEPS flashattn)
endif()
if(WITH_XBYAK)
list(APPEND PHI_DEPS xbyak)
endif()
if(WITH_MKLDNN)
list(APPEND PHI_DEPS mkldnn)
endif()
if(WITH_GLOO)
list(APPEND PHI_DEPS gloo)
endif()
if(WITH_CUDNN_FRONTEND)
list(APPEND PHI_DEPS cudnn-frontend)
endif()
if(WITH_POCKETFFT)
list(APPEND PHI_DEPS pocketfft)
endif()
if(WITH_MKLML)
list(APPEND PHI_DEPS pocketfft dynload_mklml)
endif()
if(WITH_XPU)
list(APPEND PHI_DEPS xpulib)
endif()
set(PHI_SRCS
${common_srcs}
${api_srcs}
${core_srcs}
${backends_srcs}
${kernels_srcs}
${infermeta_srcs}
${capi_srcs})
if(WITH_PHI_SHARED)
set(PHI_BUILD_TYPE
SHARED
CACHE INTERNAL "" FORCE)
else()
set(PHI_BUILD_TYPE
STATIC
CACHE INTERNAL "" FORCE)
endif()
if(WITH_GPU)
add_definitions(-DCUDA_REAL_ARCHS=${NVCC_FLAGS_EXTRA_real_archs}
)# for backends/gpu/gpu_resources.cc
nv_library(
phi ${PHI_BUILD_TYPE}
SRCS ${PHI_SRCS}
DEPS ${PHI_DEPS})
elseif(WITH_ROCM)
hip_add_library(phi ${PHI_BUILD_TYPE} ${PHI_SRCS})
target_link_libraries(phi ${PHI_DEPS})
elseif(WITH_XPU_KP)
xpu_library(
phi ${PHI_BUILD_TYPE}
SRCS ${PHI_SRCS}
DEPS ${PHI_DEPS})
else()
cc_library(
phi ${PHI_BUILD_TYPE}
SRCS ${PHI_SRCS}
DEPS ${PHI_DEPS})
endif()
if(WIN32)
target_link_libraries(phi shlwapi.lib)
endif()
if(WIN32)
if(WITH_PHI_SHARED)
set_property(TARGET phi PROPERTY WINDOWS_EXPORT_ALL_SYMBOLS ON)
set(PHI_NAME
phi.dll
CACHE INTERNAL "" FORCE)
else()
set(PHI_NAME
phi.lib
CACHE INTERNAL "" FORCE)
endif()
elseif(APPLE)
if(WITH_PHI_SHARED)
set(PHI_NAME
libphi.dylib
CACHE INTERNAL "" FORCE)
else()
set(PHI_NAME
libphi.a
CACHE INTERNAL "" FORCE)
endif()
else()
if(WITH_PHI_SHARED)
set(PHI_NAME
libphi.so
CACHE INTERNAL "" FORCE)
else()
set(PHI_NAME
libphi.a
CACHE INTERNAL "" FORCE)
endif()
endif()
set(PHI_LIB
"${CMAKE_CURRENT_BINARY_DIR}/${PHI_NAME}"
CACHE FILEPATH "PHI Library" FORCE)
if(MKL_FOUND AND WITH_ONEMKL)
target_include_directories(phi PRIVATE ${MKL_INCLUDE})
endif()
add_dependencies(phi extern_lapack)
if(WITH_CUTLASS)
add_dependencies(phi cutlass_codegen)
add_definitions("-DPADDLE_WITH_MEMORY_EFFICIENT_ATTENTION"
)# for memory_efficient_attention.h
endif()
if(WITH_FLASHATTN)
add_dependencies(phi flashattn)
endif()
set(phi_extension_header_file set(phi_extension_header_file
${CMAKE_CURRENT_SOURCE_DIR}/extension.h ${CMAKE_CURRENT_SOURCE_DIR}/extension.h
......
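A rough C++ sketch of what the WITH_PHI_SHARED / PHI_SHARED switch above means for the code: symbols that cross the phi library boundary need an export/import annotation (the real code uses the PADDLE_API macro seen on the class declarations later in this diff). The macro and class names below are hypothetical:

#if defined(_WIN32) && defined(PHI_SHARED)
#define EXAMPLE_API __declspec(dllexport)   // consumers would see dllimport instead
#elif defined(PHI_SHARED)
#define EXAMPLE_API __attribute__((visibility("default")))
#else
#define EXAMPLE_API
#endif

class EXAMPLE_API ExampleRegistry {  // anything used across the boundary is annotated
 public:
  void Register();
};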
add_subdirectory(profiler) add_subdirectory(profiler)
add_subdirectory(lib) add_subdirectory(lib)
cc_library(
phi_api
SRCS all.cc
DEPS phi_function_api phi_bw_function_api sparse_api sparse_bw_api
strings_api)
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/api/all.h"
namespace paddle {
namespace experimental {} // namespace experimental
} // namespace paddle
...@@ -112,9 +112,7 @@ class PADDLE_API CustomOpKernelContext { ...@@ -112,9 +112,7 @@ class PADDLE_API CustomOpKernelContext {
void EmplaceBackOutput(Tensor&& output); void EmplaceBackOutput(Tensor&& output);
void EmplaceBackOutputs(const std::vector<Tensor>& outputs); void EmplaceBackOutputs(const std::vector<Tensor>& outputs);
void EmplaceBackAttr(paddle::any attr); void EmplaceBackAttr(paddle::any attr);
void EmplaceBackAttrs(const std::vector<paddle::any>& attrs) { void EmplaceBackAttrs(const std::vector<paddle::any>& attrs);
attrs_ = std::move(attrs);
}
const std::pair<size_t, size_t>& InputRangeAt(size_t idx) const; const std::pair<size_t, size_t>& InputRangeAt(size_t idx) const;
const std::pair<size_t, size_t>& OutputRangeAt(size_t idx) const; const std::pair<size_t, size_t>& OutputRangeAt(size_t idx) const;
...@@ -125,13 +123,9 @@ class PADDLE_API CustomOpKernelContext { ...@@ -125,13 +123,9 @@ class PADDLE_API CustomOpKernelContext {
paddle::optional<Tensor> OptionalInputAt(size_t idx); paddle::optional<Tensor> OptionalInputAt(size_t idx);
paddle::optional<std::vector<Tensor>> OptionalInputsBetween(size_t start, paddle::optional<std::vector<Tensor>> OptionalInputsBetween(size_t start,
size_t end); size_t end);
const std::vector<paddle::any>& Attrs() const { return attrs_; } const std::vector<paddle::any>& Attrs() const;
const std::vector<std::pair<size_t, size_t>>& InputRange() { const std::vector<std::pair<size_t, size_t>>& InputRange();
return input_range_; const std::vector<std::pair<size_t, size_t>>& OutputRange();
}
const std::vector<std::pair<size_t, size_t>>& OutputRange() {
return output_range_;
}
Tensor* MutableOutputAt(size_t idx); Tensor* MutableOutputAt(size_t idx);
std::vector<Tensor*> MutableOutputBetween(size_t start, size_t end); std::vector<Tensor*> MutableOutputBetween(size_t start, size_t end);
std::vector<Tensor> OutputsBetween(size_t start, size_t end); std::vector<Tensor> OutputsBetween(size_t start, size_t end);
...@@ -811,38 +805,20 @@ class PADDLE_API OpMetaInfo { ...@@ -811,38 +805,20 @@ class PADDLE_API OpMetaInfo {
//////////////// Op Meta Info Helper ///////////////// //////////////// Op Meta Info Helper /////////////////
class OpMetaInfoHelper { class OpMetaInfoHelper {
public: public:
static const std::string& GetOpName(const paddle::OpMetaInfo& info) { static const std::string& GetOpName(const paddle::OpMetaInfo& info);
return info.name_;
}
static const std::vector<std::string>& GetInputs( static const std::vector<std::string>& GetInputs(
const paddle::OpMetaInfo& info) { const paddle::OpMetaInfo& info);
return info.inputs_;
}
static const std::vector<std::string>& GetOutputs( static const std::vector<std::string>& GetOutputs(
const paddle::OpMetaInfo& info) { const paddle::OpMetaInfo& info);
return info.outputs_;
}
static const std::vector<std::string>& GetAttrs( static const std::vector<std::string>& GetAttrs(
const paddle::OpMetaInfo& info) { const paddle::OpMetaInfo& info);
return info.attrs_;
}
static const std::unordered_map<std::string, std::string>& GetInplaceMap( static const std::unordered_map<std::string, std::string>& GetInplaceMap(
const paddle::OpMetaInfo& info) { const paddle::OpMetaInfo& info);
return info.inplace_map_;
}
static const std::unordered_map<std::string, std::string>& static const std::unordered_map<std::string, std::string>&
GetInplaceReverseMap(const paddle::OpMetaInfo& info) { GetInplaceReverseMap(const paddle::OpMetaInfo& info);
return info.inplace_reverse_map_; static const KernelFunc& GetKernelFn(const paddle::OpMetaInfo& info);
} static const InferShapeFunc& GetInferShapeFn(const paddle::OpMetaInfo& info);
static const KernelFunc& GetKernelFn(const paddle::OpMetaInfo& info) { static const InferDtypeFunc& GetInferDtypeFn(const paddle::OpMetaInfo& info);
return info.kernel_fn_;
}
static const InferShapeFunc& GetInferShapeFn(const paddle::OpMetaInfo& info) {
return info.infer_shape_fn_;
}
static const InferDtypeFunc& GetInferDtypeFn(const paddle::OpMetaInfo& info) {
return info.infer_dtype_fn_;
}
}; };
//////////////// Op Meta Info Map ///////////////// //////////////// Op Meta Info Map /////////////////
......
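The accessor bodies above move out of the header, so with PHI_SHARED their definitions live inside the phi library instead of being inlined into every consumer; call sites are unchanged. A minimal hypothetical caller:

#include "paddle/phi/api/ext/op_meta_info.h"  // assumed header for the classes above

void PrintOpSummary(const paddle::OpMetaInfo& info) {  // illustration only
  const std::string& name = paddle::OpMetaInfoHelper::GetOpName(info);
  const std::vector<std::string>& inputs = paddle::OpMetaInfoHelper::GetInputs(info);
  (void)name;    // the call sites read exactly as they did before this change
  (void)inputs;
}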
...@@ -410,7 +410,7 @@ class PADDLE_API Tensor final { ...@@ -410,7 +410,7 @@ class PADDLE_API Tensor final {
* *
* @return const std::string& * @return const std::string&
*/ */
const std::string& name() const { return name_; } const std::string& name() const;
/** /**
* @brief Set name of Tensor. * @brief Set name of Tensor.
...@@ -419,7 +419,7 @@ class PADDLE_API Tensor final { ...@@ -419,7 +419,7 @@ class PADDLE_API Tensor final {
* *
* @param const std::string& name * @param const std::string& name
*/ */
void set_name(const std::string& name) { name_ = name; } void set_name(const std::string& name);
/* Part 5: Data Transform methods */ /* Part 5: Data Transform methods */
/* Alert!!!!: All copy method can only deep copy impl, autograd info only be /* Alert!!!!: All copy method can only deep copy impl, autograd info only be
......
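Tensor::name/set_name follow the same pattern: only the definition location changes, so usage is identical before and after. A hedged sketch:

#include "paddle/phi/api/include/tensor.h"  // assumed header for paddle::Tensor

void TagTensor(paddle::Tensor* t) {  // illustration only
  t->set_name("weight");             // now defined in tensor.cc rather than inline
  const std::string& n = t->name();
  (void)n;
}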
if(WITH_GPU)
nv_library(
phi_tensor_raw
SRCS tensor.cc
DEPS tensor_base
dense_tensor
phi_enforce
context_pool
tensor_api
int_array
scalar)
elseif(WITH_ROCM)
hip_library(
phi_tensor_raw
SRCS tensor.cc
DEPS tensor_base
dense_tensor
phi_enforce
context_pool
tensor_api
int_array
scalar)
else()
cc_library(
phi_tensor_raw
SRCS tensor.cc
DEPS tensor_base
dense_tensor
phi_enforce
context_pool
tensor_api
int_array
scalar)
endif()
set(api_gen_base ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/generator/api_base.py) set(api_gen_base ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/generator/api_base.py)
# forward api file # forward api file
...@@ -157,157 +122,77 @@ if(NOT PYTHONINTERP_FOUND) ...@@ -157,157 +122,77 @@ if(NOT PYTHONINTERP_FOUND)
find_package(PythonInterp REQUIRED) find_package(PythonInterp REQUIRED)
endif() endif()
execute_process(COMMAND ${PYTHON_EXECUTABLE} -m pip install pyyaml)
# generate forward api # generate forward api
add_custom_command( execute_process(
OUTPUT ${api_header_file} ${api_source_file}
COMMAND ${PYTHON_EXECUTABLE} -m pip install pyyaml
COMMAND COMMAND
${PYTHON_EXECUTABLE} ${api_gen_file} --api_yaml_path ${api_yaml_file} ${PYTHON_EXECUTABLE} ${api_gen_file} --api_yaml_path ${api_yaml_file}
${legacy_api_yaml_file} --api_header_path ${api_header_file_tmp} ${legacy_api_yaml_file} --api_header_path ${api_header_file_tmp}
--api_source_path ${api_source_file_tmp} --api_source_path ${api_source_file_tmp})
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${api_header_file_tmp}
${api_header_file}
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${api_source_file_tmp}
${api_source_file}
COMMENT "copy_if_different ${api_header_file} ${api_source_file}"
DEPENDS ${api_yaml_file} ${legacy_api_yaml_file} ${api_gen_file}
${api_gen_base}
VERBATIM)
# generate backward api # generate backward api
add_custom_command( execute_process(
OUTPUT ${bw_api_header_file} ${bw_api_source_file} ${bw_api_header_file_tmp}
${bw_api_source_file_tmp}
COMMAND COMMAND
${PYTHON_EXECUTABLE} ${bw_api_gen_file} --backward_yaml_path ${PYTHON_EXECUTABLE} ${bw_api_gen_file} --backward_yaml_path
${bw_api_yaml_file} ${legacy_bw_api_yaml_file} --backward_header_path ${bw_api_yaml_file} ${legacy_bw_api_yaml_file} --backward_header_path
${bw_api_header_file_tmp} --backward_source_path ${bw_api_source_file_tmp} ${bw_api_header_file_tmp} --backward_source_path ${bw_api_source_file_tmp})
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${bw_api_header_file_tmp}
${bw_api_header_file}
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${bw_api_source_file_tmp}
${bw_api_source_file}
COMMENT "copy_if_different ${bw_api_header_file} ${bw_api_source_file}"
DEPENDS ${bw_api_yaml_file} ${bw_api_gen_file} ${api_gen_base}
${legacy_bw_api_yaml_file}
VERBATIM)
# generate fused_op api # generate fused_op api
add_custom_command( execute_process(
OUTPUT ${fused_api_header_file} ${fused_api_source_file}
COMMAND ${PYTHON_EXECUTABLE} -m pip install pyyaml
COMMAND COMMAND
${PYTHON_EXECUTABLE} ${api_gen_file} --api_yaml_path ${fused_api_yaml_file} ${PYTHON_EXECUTABLE} ${api_gen_file} --api_yaml_path ${fused_api_yaml_file}
--is_fused_ops_yaml --api_header_path ${fused_api_header_file_tmp} --is_fused_ops_yaml --api_header_path ${fused_api_header_file_tmp}
--api_source_path ${fused_api_source_file_tmp} --api_source_path ${fused_api_source_file_tmp})
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${fused_api_header_file_tmp}
${fused_api_header_file}
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${fused_api_source_file_tmp}
${fused_api_source_file}
COMMENT "copy_if_different ${fused_api_header_file} ${fused_api_source_file}"
DEPENDS ${fused_api_yaml_file} ${api_gen_file} ${api_gen_base}
VERBATIM)
# generate fused_op backward api # generate fused_op backward api
add_custom_command( execute_process(
OUTPUT ${fused_bw_api_header_file} ${fused_bw_api_source_file}
${fused_bw_api_header_file_tmp} ${fused_bw_api_source_file_tmp}
COMMAND COMMAND
${PYTHON_EXECUTABLE} ${fused_bw_api_gen_file} --backward_yaml_path ${PYTHON_EXECUTABLE} ${fused_bw_api_gen_file} --backward_yaml_path
${fused_bw_api_yaml_file} --is_fused_backward_yaml --backward_header_path ${fused_bw_api_yaml_file} --is_fused_backward_yaml --backward_header_path
${fused_bw_api_header_file_tmp} --backward_source_path ${fused_bw_api_header_file_tmp} --backward_source_path
${fused_bw_api_source_file_tmp} ${fused_bw_api_source_file_tmp})
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${fused_bw_api_header_file_tmp}
${fused_bw_api_header_file}
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${fused_bw_api_source_file_tmp}
${fused_bw_api_source_file}
COMMENT
"copy_if_different ${fused_bw_api_header_file} ${fused_bw_api_source_file}"
DEPENDS ${fused_bw_api_yaml_file} ${bw_api_gen_file} ${api_gen_base}
VERBATIM)
# generate sparse api # generate sparse api
add_custom_command( execute_process(
OUTPUT ${sparse_api_header_file} ${sparse_api_source_file}
COMMAND COMMAND
${PYTHON_EXECUTABLE} ${sparse_api_gen_file} --api_yaml_path ${PYTHON_EXECUTABLE} ${sparse_api_gen_file} --api_yaml_path
${sparse_api_yaml_file} --api_header_path ${sparse_api_header_file_tmp} ${sparse_api_yaml_file} --api_header_path ${sparse_api_header_file_tmp}
--api_source_path ${sparse_api_source_file_tmp} --api_source_path ${sparse_api_source_file_tmp})
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${sparse_api_header_file_tmp}
${sparse_api_header_file}
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${sparse_api_source_file_tmp}
${sparse_api_source_file}
COMMENT
"copy_if_different ${sparse_api_header_file} ${sparse_sparse_api_source_file}"
DEPENDS ${sparse_api_yaml_file} ${sparse_api_gen_file} ${api_gen_base}
${api_gen_file}
VERBATIM)
# generate backward sparse api # generate backward sparse api
add_custom_command( execute_process(
OUTPUT ${sparse_bw_api_header_file} ${sparse_bw_api_source_file}
COMMAND COMMAND
${PYTHON_EXECUTABLE} ${sparse_bw_api_gen_file} --api_yaml_path ${PYTHON_EXECUTABLE} ${sparse_bw_api_gen_file} --api_yaml_path
${sparse_bw_api_yaml_file} --api_header_path ${sparse_bw_api_yaml_file} --api_header_path
${sparse_bw_api_header_file_tmp} --api_source_path ${sparse_bw_api_header_file_tmp} --api_source_path
${sparse_bw_api_source_file_tmp} ${sparse_bw_api_source_file_tmp})
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${sparse_bw_api_header_file_tmp}
${sparse_bw_api_header_file}
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${sparse_bw_api_source_file_tmp}
${sparse_bw_api_source_file}
COMMENT
"copy_if_different ${sparse_bw_api_header_file} ${sparse_bw_sparse_api_source_file}"
DEPENDS ${sparse_bw_api_yaml_file} ${sparse_bw_api_gen_file} ${api_gen_base}
${api_gen_file} ${sparse_api_gen_file} ${bw_api_gen_file}
VERBATIM)
# generate strings api # generate strings api
add_custom_command( execute_process(
OUTPUT ${strings_api_header_file} ${strings_api_source_file}
COMMAND COMMAND
${PYTHON_EXECUTABLE} ${strings_api_gen_file} --api_yaml_path ${PYTHON_EXECUTABLE} ${strings_api_gen_file} --api_yaml_path
${strings_api_yaml_file} --api_header_path ${strings_api_header_file_tmp} ${strings_api_yaml_file} --api_header_path ${strings_api_header_file_tmp}
--api_source_path ${strings_api_source_file_tmp} --api_source_path ${strings_api_source_file_tmp})
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${strings_api_header_file_tmp}
${strings_api_header_file}
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${strings_api_source_file_tmp}
${strings_api_source_file}
COMMENT
"copy_if_different ${strings_api_header_file} ${strings_strings_api_source_file}"
DEPENDS ${strings_api_yaml_file} ${strings_api_gen_file} ${api_gen_base}
${api_gen_file}
VERBATIM)
# generate dygraph(intermediate) api # generate dygraph(intermediate) api
add_custom_command( execute_process(
OUTPUT ${dygraph_api_header_file} ${dygraph_api_source_file}
COMMAND COMMAND
${PYTHON_EXECUTABLE} ${im_api_gen_file} --api_yaml_path ${api_yaml_file} ${PYTHON_EXECUTABLE} ${im_api_gen_file} --api_yaml_path ${api_yaml_file}
${legacy_api_yaml_file} --sparse_api_yaml_path ${sparse_api_yaml_file} ${legacy_api_yaml_file} --sparse_api_yaml_path ${sparse_api_yaml_file}
--dygraph_api_header_path ${dygraph_api_header_file_tmp} --dygraph_api_header_path ${dygraph_api_header_file_tmp}
--dygraph_api_source_path ${dygraph_api_source_file_tmp} --dygraph_api_source_path ${dygraph_api_source_file_tmp})
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${dygraph_api_header_file_tmp}
${dygraph_api_header_file}
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${dygraph_api_source_file_tmp}
${dygraph_api_source_file}
DEPENDS ${api_yaml_file} ${legacy_api_yaml_file} ${sparse_api_yaml_file}
${im_api_gen_file} ${api_gen_base} ${api_gen_file}
VERBATIM)
# generate wrapped infermeta # generate wrapped infermeta
add_custom_command( execute_process(
OUTPUT ${wrapped_infermeta_header_file} ${wrapped_infermeta_source_file}
COMMAND COMMAND
${PYTHON_EXECUTABLE} ${wrapped_infermeta_gen_file} --api_yaml_path ${PYTHON_EXECUTABLE} ${wrapped_infermeta_gen_file} --api_yaml_path
${api_yaml_file} ${legacy_api_yaml_file} --wrapped_infermeta_header_path ${api_yaml_file} ${legacy_api_yaml_file} --wrapped_infermeta_header_path
${wrapped_infermeta_header_file} --wrapped_infermeta_source_path ${wrapped_infermeta_header_file} --wrapped_infermeta_source_path
${wrapped_infermeta_source_file} ${wrapped_infermeta_source_file})
DEPENDS ${api_yaml_file} ${legacy_api_yaml_file} ${wrapped_infermeta_gen_file}
${api_gen_base}
VERBATIM)
# generate tensor and tensor operants file # generate tensor and tensor operants file
message("create or copy auto-geneated tensor files") message("create or copy auto-geneated tensor files")
execute_process(COMMAND ${PYTHON_EXECUTABLE} -m pip install pyyaml)
execute_process( execute_process(
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/generator WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/generator
COMMAND COMMAND
...@@ -324,154 +209,70 @@ if(${_result}) ...@@ -324,154 +209,70 @@ if(${_result})
message(FATAL_ERROR "tensor codegen failed, exiting.") message(FATAL_ERROR "tensor codegen failed, exiting.")
endif() endif()
set(generated_tensor_files set(generated_files
"${operants_base_file}" "${tensor_api_source_file}" "${operants_base_file}"
"${phi_tensor_operants_header_file}" "${phi_tensor_operants_source_file}" "${tensor_api_source_file}"
"${operants_manager_header_file}" "${operants_manager_source_file}") "${phi_tensor_operants_header_file}"
"${phi_tensor_operants_source_file}"
"${operants_manager_header_file}"
"${operants_manager_source_file}"
"${wrapped_infermeta_source_file}"
"${api_source_file}"
"${api_header_file}"
"${bw_api_source_file}"
"${bw_api_header_file}"
"${fused_api_source_file}"
"${fused_api_header_file}"
"${fused_bw_api_source_file}"
"${fused_bw_api_header_file}"
"${sparse_api_source_file}"
"${sparse_api_header_file}"
"${sparse_bw_api_source_file}"
"${sparse_bw_api_header_file}"
"${dygraph_api_source_file}"
"${dygraph_api_header_file}"
"${strings_api_source_file}"
"${strings_api_header_file}")
foreach(generated_tensor_file ${generated_tensor_files}) foreach(generated_file ${generated_files})
if(EXISTS "${generated_tensor_file}.tmp" AND EXISTS if(EXISTS "${generated_file}.tmp" AND EXISTS "${generated_file}")
"${generated_tensor_file}") execute_process(COMMAND ${CMAKE_COMMAND} -E copy_if_different
execute_process( "${generated_file}.tmp" "${generated_file}")
COMMAND ${CMAKE_COMMAND} -E copy_if_different message("copy if different ${generated_file}.tmp ${generated_file}")
"${generated_tensor_file}.tmp" "${generated_tensor_file}") elseif(EXISTS "${generated_file}.tmp")
message( execute_process(COMMAND ${CMAKE_COMMAND} -E copy "${generated_file}.tmp"
"copy if different ${generated_tensor_file}.tmp ${generated_tensor_file}") "${generated_file}")
elseif(EXISTS "${generated_tensor_file}.tmp") message("copy ${generated_file}.tmp ${generated_file}")
execute_process(
COMMAND ${CMAKE_COMMAND} -E copy "${generated_tensor_file}.tmp"
"${generated_tensor_file}")
message("copy ${generated_tensor_file}.tmp ${generated_tensor_file}")
endif() endif()
endforeach() endforeach()
cc_library( collect_srcs(
op_meta_info api_srcs
SRCS op_meta_info.cc SRCS
DEPS phi_tensor_raw) tensor.cc
cc_library( op_meta_info.cc
wrapped_infermeta context_pool.cc
SRCS ${wrapped_infermeta_source_file} tensor_utils.cc
DEPS phi) kernel_dispatch.cc
cc_library( api_gen_utils.cc
context_pool data_transform.cc
SRCS context_pool.cc api_custom_impl.cc
DEPS phi_backends phi_enforce place init phi_device_context) tensor_method.cc
cc_library( tensor_copy.cc
api_tensor_utils scalar.cc
SRCS tensor_utils.cc int_array.cc)
DEPS phi_tensor_raw) collect_generated_srcs(
api_srcs
cc_library( SRCS
kernel_dispatch ${wrapped_infermeta_source_file}
SRCS kernel_dispatch.cc ${api_source_file}
DEPS phi_tensor_raw phi_backends kernel_factory context_pool) ${bw_api_source_file}
cc_library( ${fused_api_source_file}
api_gen_utils ${fused_bw_api_source_file}
SRCS api_gen_utils.cc ${sparse_api_source_file}
DEPS phi_tensor_raw selected_rows sparse_csr_tensor sparse_coo_tensor ${sparse_bw_api_source_file}
infermeta_utils) ${dygraph_api_source_file}
cc_library( ${strings_api_source_file}
phi_data_transform ${phi_tensor_operants_source_file}
SRCS data_transform.cc ${operants_manager_source_file}
DEPS phi_tensor_raw phi tensor) ${tensor_api_source_file})
cc_library(
api_custom_impl
SRCS api_custom_impl.cc
DEPS phi_tensor_raw
phi
kernel_dispatch
api_gen_utils
backward_infermeta
phi_data_transform
phi_profiler)
cc_library(
phi_function_api
SRCS ${api_source_file} ${fused_api_source_file}
DEPS phi_tensor_raw
phi
kernel_dispatch
api_gen_utils
phi_data_transform
api_custom_impl
api_tensor_utils
phi_profiler)
cc_library(
phi_bw_function_api
SRCS ${bw_api_source_file} ${fused_bw_api_source_file}
DEPS phi_tensor_raw
phi
kernel_dispatch
api_gen_utils
backward_infermeta
sparse_backward_infermeta
phi_data_transform
phi_function_api
api_custom_impl
global_utils
phi_profiler)
cc_library(
sparse_api
SRCS ${sparse_api_source_file}
DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_profiler)
cc_library(
sparse_bw_api
SRCS ${sparse_bw_api_source_file}
DEPS phi_tensor_raw
phi
kernel_dispatch
api_gen_utils
sparse_api
sparse_backward_infermeta
phi_profiler)
cc_library(
phi_dygraph_api
SRCS ${dygraph_api_source_file}
DEPS phi_tensor_raw
phi
kernel_dispatch
api_gen_utils
phi_data_transform
phi_function_api
sparse_api
phi_profiler)
cc_library(
strings_api
SRCS ${strings_api_source_file}
DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_profiler)
cc_library(
phi_tensor
SRCS tensor_method.cc
DEPS phi_tensor_raw
phi_function_api
api_gen_utils
kernel_dispatch
infermeta
sparse_infermeta
sparse_api
strings_api)
cc_library(
tensor_copy
SRCS tensor_copy.cc
DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils)
cc_library(
api_scalar
SRCS scalar.cc
DEPS tensor_copy)
cc_library(
api_int_array
SRCS int_array.cc
DEPS tensor_copy)
cc_library(
phi_tensor_operants
SRCS ${phi_tensor_operants_source_file}
DEPS phi_function_api)
cc_library(
operants_manager
SRCS ${operants_manager_source_file}
DEPS phi_enforce)
cc_library(
tensor_api
SRCS ${tensor_api_source_file}
DEPS operants_manager)
...@@ -65,7 +65,8 @@ PADDLE_API phi::Allocator* GetAllocator(const phi::Place& place) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PADDLE_API phi::CUDAStream* GetCurrentCUDAStream(const phi::Place& place) {
-  PADDLE_ENFORCE(place.GetType() == phi::AllocationType::GPU,
+  PADDLE_ENFORCE_EQ(place.GetType(),
+                    phi::AllocationType::GPU,
                     phi::errors::InvalidArgument(
                         "GetCurrentCUDAStream only supports GPUPlace input. "
                         "However, your input is place=%s",
......
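A pattern that repeats throughout this commit: boolean-style `PADDLE_ENFORCE(cond, ...)` checks are rewritten with the comparison macros (`PADDLE_ENFORCE_EQ`, `PADDLE_ENFORCE_NE`, ...), which take the two operands separately so the generated message can report both values. A minimal sketch of the style, assuming the PHI enforce/error headers; the function and message below are illustrative, not part of this diff:

// Illustrative only: shows the PADDLE_ENFORCE_EQ style used after this change.
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/errors.h"

void CheckGpuPlace(const phi::Place& place) {
  // Old style: PADDLE_ENFORCE(place.GetType() == phi::AllocationType::GPU, ...);
  // New style: pass both operands to the comparison macro.
  PADDLE_ENFORCE_EQ(place.GetType(),
                    phi::AllocationType::GPU,
                    phi::errors::InvalidArgument(
                        "This helper only supports GPUPlace input."));
}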
...@@ -119,6 +119,11 @@ void CustomOpKernelContext::EmplaceBackAttr(paddle::any attr) { ...@@ -119,6 +119,11 @@ void CustomOpKernelContext::EmplaceBackAttr(paddle::any attr) {
<< " has value of type: " << attrs_[attrs_.size() - 1].type().name(); << " has value of type: " << attrs_[attrs_.size() - 1].type().name();
} }
void CustomOpKernelContext::EmplaceBackAttrs(
const std::vector<paddle::any>& attrs) {
attrs_ = std::move(attrs);
}
const Tensor& CustomOpKernelContext::InputAt(size_t idx) const { const Tensor& CustomOpKernelContext::InputAt(size_t idx) const {
return inputs_.at(idx); return inputs_.at(idx);
} }
...@@ -132,6 +137,10 @@ std::vector<Tensor> CustomOpKernelContext::InputsBetween(size_t start, ...@@ -132,6 +137,10 @@ std::vector<Tensor> CustomOpKernelContext::InputsBetween(size_t start,
return rlt; return rlt;
} }
const std::vector<paddle::any>& CustomOpKernelContext::Attrs() const {
return attrs_;
}
Tensor& CustomOpKernelContext::MutableInputAt(size_t idx) { Tensor& CustomOpKernelContext::MutableInputAt(size_t idx) {
return inputs_.at(idx); return inputs_.at(idx);
} }
...@@ -193,6 +202,16 @@ const std::pair<size_t, size_t>& CustomOpKernelContext::OutputRangeAt( ...@@ -193,6 +202,16 @@ const std::pair<size_t, size_t>& CustomOpKernelContext::OutputRangeAt(
return output_range_.at(idx); return output_range_.at(idx);
} }
const std::vector<std::pair<size_t, size_t>>&
CustomOpKernelContext::InputRange() {
return input_range_;
}
const std::vector<std::pair<size_t, size_t>>&
CustomOpKernelContext::OutputRange() {
return output_range_;
}
void CustomOpKernelContext::ConstructInplaceIndex( void CustomOpKernelContext::ConstructInplaceIndex(
const std::vector<std::string>& inputs, const std::vector<std::string>& inputs,
const std::vector<std::string>& outputs, const std::vector<std::string>& outputs,
...@@ -208,8 +227,9 @@ void CustomOpKernelContext::ConstructInplaceIndex(
      continue;
    }
    auto out_iter = find(outputs.begin(), outputs.end(), inplace_map.at(input));
-    PADDLE_ENFORCE(
-        out_iter != outputs.end(),
+    PADDLE_ENFORCE_NE(
+        out_iter,
+        outputs.end(),
        phi::errors::NotFound("Can't find the mapped value of %s, please check "
                              "the input of `Inplace` again and make "
                              "sure you registered your op accurately. ",
...@@ -253,8 +273,9 @@ void CustomOpKernelContext::AssignInplaceOutputs() {
    size_t out_start_idx = output_range_[pair.second].first;
    size_t out_end_idx = output_range_[pair.second].second;
    size_t assign_tensor_size = in_end_idx - in_start_idx;
-    PADDLE_ENFORCE(
-        assign_tensor_size == out_end_idx - out_start_idx,
+    PADDLE_ENFORCE_EQ(
+        assign_tensor_size,
+        out_end_idx - out_start_idx,
        phi::errors::OutOfRange("When assigning inplaced tensor, Input vector "
                                "size %d mismatch output vector size %d",
                                in_end_idx - in_start_idx,
...@@ -316,6 +337,43 @@ OpMetaInfo& OpMetaInfo::SetInferDtypeFn(InferDtypeFunc&& func) { ...@@ -316,6 +337,43 @@ OpMetaInfo& OpMetaInfo::SetInferDtypeFn(InferDtypeFunc&& func) {
return *this; return *this;
} }
//////////////// Op Meta Info Helper /////////////////
const std::string& OpMetaInfoHelper::GetOpName(const paddle::OpMetaInfo& info) {
return info.name_;
}
const std::vector<std::string>& OpMetaInfoHelper::GetInputs(
const paddle::OpMetaInfo& info) {
return info.inputs_;
}
const std::vector<std::string>& OpMetaInfoHelper::GetOutputs(
const paddle::OpMetaInfo& info) {
return info.outputs_;
}
const std::vector<std::string>& OpMetaInfoHelper::GetAttrs(
const paddle::OpMetaInfo& info) {
return info.attrs_;
}
const std::unordered_map<std::string, std::string>&
OpMetaInfoHelper::GetInplaceMap(const paddle::OpMetaInfo& info) {
return info.inplace_map_;
}
const std::unordered_map<std::string, std::string>&
OpMetaInfoHelper::GetInplaceReverseMap(const paddle::OpMetaInfo& info) {
return info.inplace_reverse_map_;
}
const KernelFunc& OpMetaInfoHelper::GetKernelFn(
const paddle::OpMetaInfo& info) {
return info.kernel_fn_;
}
const InferShapeFunc& OpMetaInfoHelper::GetInferShapeFn(
const paddle::OpMetaInfo& info) {
return info.infer_shape_fn_;
}
const InferDtypeFunc& OpMetaInfoHelper::GetInferDtypeFn(
const paddle::OpMetaInfo& info) {
return info.infer_dtype_fn_;
}
//////////////// Op Meta Info Map ///////////////// //////////////// Op Meta Info Map /////////////////
std::vector<OpMetaInfo>& OpMetaInfoMap::operator[](const std::string& name) { std::vector<OpMetaInfo>& OpMetaInfoMap::operator[](const std::string& name) {
...@@ -414,14 +472,16 @@ OpMetaInfoBuilder& OpMetaInfoBuilder::SetInplaceMap(
  const std::vector<std::string>& outputs =
      OpMetaInfoHelper::GetOutputs(*info_ptr_);
  for (const auto& pair : inplace_map) {
-    PADDLE_ENFORCE(
-        std::find(inputs.begin(), inputs.end(), pair.first) != inputs.cend(),
+    PADDLE_ENFORCE_NE(
+        std::find(inputs.begin(), inputs.end(), pair.first),
+        inputs.cend(),
        phi::errors::PreconditionNotMet(
            "The register of operator %s's `SetInplaceMap` failed. "
            "Please make sure: 1. Call `Inputs` and `Outputs` before "
            "`SetInplaceMap`; 2. The keys of inplace_map are inside `Inputs`",
            name_));
-    PADDLE_ENFORCE(std::find(outputs.begin(), outputs.end(), pair.second) !=
-                       outputs.cend(),
+    PADDLE_ENFORCE_NE(
+        std::find(outputs.begin(), outputs.end(), pair.second),
+        outputs.cend(),
        phi::errors::PreconditionNotMet(
            "The register of operator %s's `SetInplaceMap` failed. "
......
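The OpMetaInfoHelper accessors made out-of-line in this file are what the custom-operator machinery uses to inspect a registered op. A hedged usage sketch, assuming the usual registry entry points (`OpMetaInfoMap::Instance().GetMap()` and the helper getters shown above); exact accessor names may differ across Paddle versions:

// Illustrative only: dump the ops currently registered in the custom-op map.
#include <iostream>
#include "paddle/phi/api/ext/op_meta_info.h"

void DumpRegisteredCustomOps() {
  for (auto& pair : paddle::OpMetaInfoMap::Instance().GetMap()) {
    // pair.second holds the forward/backward OpMetaInfo entries for one op.
    const paddle::OpMetaInfo& info = pair.second.front();
    std::cout << paddle::OpMetaInfoHelper::GetOpName(info) << ": "
              << paddle::OpMetaInfoHelper::GetInputs(info).size() << " inputs, "
              << paddle::OpMetaInfoHelper::GetOutputs(info).size()
              << " outputs\n";
  }
}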
...@@ -358,6 +358,10 @@ gpuStream_t Tensor::stream() const { ...@@ -358,6 +358,10 @@ gpuStream_t Tensor::stream() const {
} }
#endif #endif
const std::string &Tensor::name() const { return name_; }
void Tensor::set_name(const std::string &name) { name_ = name; }
/* Part 5: Status utils methods */ /* Part 5: Status utils methods */
bool Tensor::defined() const { return impl_ != nullptr; } bool Tensor::defined() const { return impl_ != nullptr; }
......
...@@ -26,16 +26,4 @@ if(WITH_PYTHON AND EXISTS ${PADDLE_BINARY_DIR})
  endif()
endif()
-if(WITH_GPU OR WITH_ROCM)
-  set(GPU_CTX_DEPS dynload_cuda dynamic_loader)
-endif()
-
-cc_library(
-  phi_device_tracer
-  SRCS device_tracer.cc
-  DEPS phi_profiler_proto ${GPU_CTX_DEPS})
-cc_library(
-  phi_profiler
-  SRCS profiler.cc
-  DEPS phi_os_info phi_device_tracer phi_enforce)
+collect_srcs(api_srcs SRCS device_tracer.cc profiler.cc)
...@@ -2,17 +2,6 @@ add_subdirectory(dynload) ...@@ -2,17 +2,6 @@ add_subdirectory(dynload)
add_subdirectory(gpu) add_subdirectory(gpu)
set(BACKENDS_SRCS all_context.cc cpu/cpu_context.cc cpu/cpu_info.cc) set(BACKENDS_SRCS all_context.cc cpu/cpu_context.cc cpu/cpu_info.cc)
set(BACKENDS_DEPS
enforce
place
flags
eigen3
phi_device_context
generator
phi_os_info)
if(WITH_XBYAK)
list(APPEND BACKENDS_DEPS xbyak)
endif()
if(NOT APPLE AND NOT WIN32) if(NOT APPLE AND NOT WIN32)
list(APPEND BACKENDS_SRCS device_code.cc) list(APPEND BACKENDS_SRCS device_code.cc)
...@@ -23,16 +12,10 @@ if(WITH_GPU OR WITH_ROCM) ...@@ -23,16 +12,10 @@ if(WITH_GPU OR WITH_ROCM)
gpu/gpu_resources.cc) gpu/gpu_resources.cc)
if(WITH_GPU) if(WITH_GPU)
list(APPEND BACKENDS_SRCS gpu/cuda/cuda_info.cc gpu/cuda/cuda_graph.cc) list(APPEND BACKENDS_SRCS gpu/cuda/cuda_info.cc gpu/cuda/cuda_graph.cc)
set_source_files_properties(
gpu/gpu_resources.cc
PROPERTIES COMPILE_FLAGS
"-DCUDA_REAL_ARCHS=\"${NVCC_FLAGS_EXTRA_real_archs}\"")
endif() endif()
if(WITH_ROCM) if(WITH_ROCM)
list(APPEND BACKENDS_SRCS gpu/rocm/rocm_info.cc) list(APPEND BACKENDS_SRCS gpu/rocm/rocm_info.cc)
endif() endif()
list(APPEND BACKENDS_DEPS phi_dynload_cuda)
endif() endif()
if(WITH_XPU) if(WITH_XPU)
...@@ -45,7 +28,6 @@ if(WITH_MKLDNN) ...@@ -45,7 +28,6 @@ if(WITH_MKLDNN)
list(APPEND BACKENDS_SRCS onednn/onednn_context.cc) list(APPEND BACKENDS_SRCS onednn/onednn_context.cc)
list(APPEND BACKENDS_SRCS onednn/axpy_handler.cc) list(APPEND BACKENDS_SRCS onednn/axpy_handler.cc)
list(APPEND BACKENDS_SRCS onednn/matmul_utils.cc) list(APPEND BACKENDS_SRCS onednn/matmul_utils.cc)
list(APPEND BACKENDS_DEPS mkldnn)
endif() endif()
list( list(
...@@ -55,26 +37,25 @@ list( ...@@ -55,26 +37,25 @@ list(
device_guard.cc device_guard.cc
stream.cc stream.cc
event.cc event.cc
device_base.cc
device_manager.cc device_manager.cc
context_pool.cc) context_pool.cc)
if(WITH_GPU
OR WITH_ROCM
OR WITH_CUSTOM_DEVICE)
list(APPEND BACKENDS_SRCS device_base.cc)
endif()
if(WITH_CUSTOM_DEVICE) if(WITH_CUSTOM_DEVICE)
list(APPEND BACKENDS_SRCS custom/custom_context.cc custom/custom_device.cc list(APPEND BACKENDS_SRCS custom/custom_context.cc custom/custom_device.cc
custom/custom_device_op_list.cc) custom/custom_device_op_list.cc)
endif() endif()
add_library(phi_backends "${BACKENDS_SRCS}") collect_srcs(backends_srcs SRCS ${BACKENDS_SRCS})
target_link_libraries(phi_backends ${BACKENDS_DEPS})
# for inference library
get_property(phi_modules GLOBAL PROPERTY PHI_MODULES)
set(phi_modules ${phi_modules} phi_backends)
set_property(GLOBAL PROPERTY PHI_MODULES "${phi_modules}")
if(WITH_CUSTOM_DEVICE) if(WITH_CUSTOM_DEVICE)
cc_test( cc_test(
capi_test capi_test
SRCS custom/capi_test.cc SRCS custom/capi_test.cc
DEPS phi_capi) DEPS phi)
endif() endif()
...@@ -24,6 +24,10 @@ ...@@ -24,6 +24,10 @@
namespace phi { namespace phi {
template <>
const TypeInfo<DeviceContext> TypeInfoTraits<DeviceContext, CPUContext>::kType =
RegisterStaticType<DeviceContext>(CPUContext::name());
struct CPUContext::Impl { struct CPUContext::Impl {
Impl() : place_(CPUPlace()) {} Impl() : place_(CPUPlace()) {}
......
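The same pattern recurs below for CustomContext, GPUContext, XPUContext, DenseTensor, SelectedRows, the sparse tensors, StringTensor and TensorArray: the static `kType` member of `TypeInfoTraits<BaseT, DerivedT>` is no longer defined generically in the header (that definition is removed in the type_info.h hunk further down) but explicitly specialized in each type's .cc file, so the static registration is emitted in exactly one translation unit inside the phi library. A stripped-down, self-contained sketch of the mechanism with simplified names (not the actual phi implementation):

// Sketch of "register static type info via an explicitly specialized static
// member"; all names here are illustrative.
#include <iostream>
#include <string>

template <typename BaseT>
struct TypeInfo {
  std::string name;
};

template <typename BaseT>
TypeInfo<BaseT> RegisterStaticType(const std::string& name) {
  // The real implementation records `name` in a registry and returns an id.
  return TypeInfo<BaseT>{name};
}

template <typename BaseT, typename DerivedT>
struct TypeInfoTraits {
  static const TypeInfo<BaseT> kType;  // declared here, defined per DerivedT
};

struct TensorBase {};
struct DenseTensorLike : TensorBase {
  static const char* name() { return "DenseTensorLike"; }
};

// One explicit specialization per concrete type, placed in that type's .cc,
// so the definition lives in a single translation unit of the shared library.
template <>
const TypeInfo<TensorBase> TypeInfoTraits<TensorBase, DenseTensorLike>::kType =
    RegisterStaticType<TensorBase>(DenseTensorLike::name());

int main() {
  std::cout << TypeInfoTraits<TensorBase, DenseTensorLike>::kType.name << "\n";
}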
...@@ -19,6 +19,11 @@ limitations under the License. */ ...@@ -19,6 +19,11 @@ limitations under the License. */
namespace phi { namespace phi {
template <>
const TypeInfo<DeviceContext>
TypeInfoTraits<DeviceContext, CustomContext>::kType =
RegisterStaticType<DeviceContext>(CustomContext::name());
struct CustomContext::Impl { struct CustomContext::Impl {
explicit Impl(const CustomPlace& place) : place_(place) {} explicit Impl(const CustomPlace& place) : place_(place) {}
......
-cc_library(
-  phi_dynamic_loader
-  SRCS dynamic_loader.cc port.cc
-  DEPS enforce glog gflags)
+set(DYNLOAD_COMMON_SRCS dynamic_loader.cc port.cc warpctc.cc warprnnt.cc
+    lapack.cc)
+if(WITH_ASCEND_CL)
+  list(REMOVE_ITEM DYNLOAD_COMMON_SRCS warprnnt.cc)
+endif()
list(
  APPEND
  CUDA_SRCS
...@@ -60,66 +60,39 @@ configure_file(cupti_lib_path.h.in ${CMAKE_CURRENT_BINARY_DIR}/cupti_lib_path.h)
if(CUPTI_FOUND)
  list(APPEND CUDA_SRCS cupti.cc)
endif()
-if(WITH_ROCM)
-  hip_library(
-    phi_dynload_cuda
-    SRCS ${HIP_SRCS}
-    DEPS phi_dynamic_loader)
-  cc_library(
-    phi_dynload_warpctc
-    SRCS warpctc.cc
-    DEPS phi_dynamic_loader warpctc)
-  cc_library(
-    phi_dynload_warprnnt
-    SRCS warprnnt.cc
-    DEPS phi_dynamic_loader warprnnt)
-else()
-  nv_library(
-    phi_dynload_cuda
-    SRCS ${CUDA_SRCS}
-    DEPS phi_dynamic_loader)
-  cc_library(
-    phi_dynload_warpctc
-    SRCS warpctc.cc
-    DEPS phi_dynamic_loader warpctc)
-  cc_library(
-    phi_dynload_warprnnt
-    SRCS warprnnt.cc
-    DEPS phi_dynamic_loader warprnnt)
-endif()
if(WITH_MKLML)
-  cc_library(
-    phi_dynload_mklml
-    SRCS mklml.cc
-    DEPS phi_dynamic_loader mklml)
+  # Only deps libmklml.so, not link
+  add_library(dynload_mklml STATIC mklml.cc)
+  add_dependencies(dynload_mklml mklml)
+  if(WIN32)
+    target_link_libraries(dynload_mklml ${MKLML_IOMP_LIB})
+  else()
+    target_link_libraries(dynload_mklml
+                          "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed")
+  endif()
endif()
if(WITH_FLASHATTN)
-  cc_library(
-    phi_dynload_flashattn
-    SRCS flashattn.cc
-    DEPS phi_dynamic_loader flashattn)
+  list(APPEND DYNLOAD_COMMON_SRCS flashattn.cc)
endif()
-cc_library(
-  phi_dynload_lapack
-  SRCS lapack.cc
-  DEPS phi_dynamic_loader)
-add_dependencies(phi_dynload_lapack extern_lapack)
-# TODO(TJ): add iomp, mkldnn?
if(MKL_FOUND AND WITH_ONEMKL)
  message("ONEMKL INCLUDE directory is ${MKL_INCLUDE}")
-  cc_library(
-    phi_dynload_mklrt
-    SRCS mklrt.cc
-    DEPS phi_dynamic_loader)
-  target_include_directories(phi_dynload_mklrt PRIVATE ${MKL_INCLUDE})
+  list(APPEND DYNLOAD_COMMON_SRCS mklrt.cc)
endif()
+if(WITH_ROCM)
+  collect_srcs(backends_srcs SRCS ${DYNLOAD_COMMON_SRCS} ${HIP_SRCS})
+elseif(WITH_GPU)
+  collect_srcs(backends_srcs SRCS ${DYNLOAD_COMMON_SRCS} ${CUDA_SRCS})
+else()
+  collect_srcs(backends_srcs SRCS ${DYNLOAD_COMMON_SRCS})
+endif()
if(WITH_CUDNN_FRONTEND)
  nv_test(
    cudnn_frontend_test
    SRCS cudnn_frontend_test.cc
-    DEPS phi_dynload_cuda cudnn-frontend)
+    DEPS phi cudnn-frontend)
endif()
-cc_library(cudnn_workspace_helper SRCS cudnn_workspace_helper.cc)
+collect_srcs(backends_srcs SRCS cudnn_workspace_helper.cc)
...@@ -59,6 +59,15 @@ limitations under the License. */ ...@@ -59,6 +59,15 @@ limitations under the License. */
namespace phi { namespace phi {
template <>
const TypeInfo<DeviceContext> TypeInfoTraits<DeviceContext, GPUContext>::kType =
RegisterStaticType<DeviceContext>(GPUContext::name());
template <>
const TypeInfo<DeviceContext>
TypeInfoTraits<DeviceContext, GPUPinnedContext>::kType =
RegisterStaticType<DeviceContext>(GPUPinnedContext::name());
namespace internal { namespace internal {
class EigenGpuStreamDevice : public Eigen::StreamInterface { class EigenGpuStreamDevice : public Eigen::StreamInterface {
......
...@@ -15,6 +15,8 @@ limitations under the License. */ ...@@ -15,6 +15,8 @@ limitations under the License. */
#pragma once #pragma once
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include <array> #include <array>
#include <functional> #include <functional>
#include <mutex> #include <mutex>
...@@ -305,3 +307,5 @@ class GPUPinnedContext ...@@ -305,3 +307,5 @@ class GPUPinnedContext
}; };
#endif #endif
} // namespace phi } // namespace phi
#endif
...@@ -83,6 +83,11 @@ void OneDNNContextThreadLocals::Body::log_lib_version(void) { ...@@ -83,6 +83,11 @@ void OneDNNContextThreadLocals::Body::log_lib_version(void) {
} }
} }
OneDNNContextThreadLocals::Body& OneDNNContextThreadLocals::fetch() {
thread_local Body b;
return b;
}
struct OneDNNContext::Impl { struct OneDNNContext::Impl {
Impl() : p_blobmap_() { Impl() : p_blobmap_() {
p_blobmap_.reset(new BlobMap()); p_blobmap_.reset(new BlobMap());
...@@ -462,5 +467,7 @@ const std::vector<std::string>& OneDNNContext::GetOutputsName( ...@@ -462,5 +467,7 @@ const std::vector<std::string>& OneDNNContext::GetOutputsName(
return impl_->GetOutputsName(output); return impl_->GetOutputsName(output);
} }
const char* OneDNNContext::name() { return "OneDNNContext"; }
} // namespace phi } // namespace phi
#endif #endif
...@@ -76,10 +76,7 @@ class OneDNNContextThreadLocals { ...@@ -76,10 +76,7 @@ class OneDNNContextThreadLocals {
static constexpr size_t kMKLDNNSessionID_Default = 0; static constexpr size_t kMKLDNNSessionID_Default = 0;
// mkldnn session id for cache clearing mode // mkldnn session id for cache clearing mode
static constexpr size_t kMKLDNNSessionID_CacheClearing = -1; static constexpr size_t kMKLDNNSessionID_CacheClearing = -1;
static Body& fetch() { static Body& fetch();
thread_local Body b;
return b;
}
}; };
class OneDNNContext : public CPUContext { class OneDNNContext : public CPUContext {
...@@ -157,7 +154,7 @@ class OneDNNContext : public CPUContext { ...@@ -157,7 +154,7 @@ class OneDNNContext : public CPUContext {
const std::vector<std::string>& GetOutputsName( const std::vector<std::string>& GetOutputsName(
const std::string& output) const; const std::string& output) const;
static const char* name() { return "OneDNNContext"; } static const char* name();
private: private:
struct Impl; struct Impl;
......
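Moving `fetch()` and `name()` out of the header is part of preparing phi to live behind a shared-library boundary: a `thread_local` (or any static local) defined in an inline header function can end up with one copy per module that includes the header, while an out-of-line definition pins a single instance inside libphi. A generic sketch of the idiom, with illustrative names only:

// registry.h -- illustrative sketch, not phi code.
class ThreadLocalRegistry {
 public:
  struct Body {
    int session_id = 0;
  };
  // Declared only; defined once in the .cc so every user of the shared
  // library sees the same per-thread instance.
  static Body& fetch();
};

// registry.cc
ThreadLocalRegistry::Body& ThreadLocalRegistry::fetch() {
  thread_local Body b;  // one instance per thread, owned by this library
  return b;
}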
...@@ -30,6 +30,9 @@ namespace xpu = baidu::xpu::api; ...@@ -30,6 +30,9 @@ namespace xpu = baidu::xpu::api;
namespace phi { namespace phi {
template <>
const TypeInfo<DeviceContext> TypeInfoTraits<DeviceContext, XPUContext>::kType =
RegisterStaticType<DeviceContext>(XPUContext::name());
struct XPUContext::Impl { struct XPUContext::Impl {
void SetL3Cache(int l3_size = 14155776) { void SetL3Cache(int l3_size = 14155776) {
const int MAX_XPU_NUM = 16; const int MAX_XPU_NUM = 16;
......
add_subdirectory(lib) add_subdirectory(lib)
cc_library(
phi_capi
SRCS all.cc
DEPS phi_c_data_type
phi_c_device_context
phi_c_int_array
phi_c_kernel_context
phi_c_kernel_factory
phi_c_kernel_registry
phi_c_place
phi_c_scalar
phi_c_tensor)
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/capi/all.h"
namespace paddle {
namespace capi {} // namespace capi
} // namespace paddle
cc_library(
  phi_c_data_type
  SRCS c_data_type.cc
  DEPS dense_tensor)

cc_library(
  phi_c_device_context
  SRCS c_device_context.cc
  DEPS phi_backends)

cc_library(
  phi_c_int_array
SRCS c_int_array.cc
DEPS int_array)
cc_library(
phi_c_kernel_context
SRCS c_kernel_context.cc
DEPS kernel_context)
cc_library(
phi_c_kernel_factory
SRCS c_kernel_factory.cc
DEPS kernel_factory)
cc_library(
phi_c_kernel_registry
SRCS c_kernel_registry.cc
DEPS dense_tensor)
cc_library(
phi_c_place
SRCS c_place.cc
DEPS phi_place)
cc_library(
phi_c_scalar
SRCS c_scalar.cc
DEPS scalar)
cc_library(
phi_c_tensor
SRCS c_tensor.cc
  DEPS dense_tensor)

+collect_srcs(
+  capi_srcs
+  SRCS
+  c_data_type.cc
+  c_device_context.cc
+  c_int_array.cc
+  c_kernel_context.cc
+  c_kernel_factory.cc
+  c_kernel_registry.cc
+  c_place.cc
+  c_scalar.cc
+  c_tensor.cc)
if(WITH_GPU)
nv_library(
phi_place
SRCS place.cc
DEPS phi_backends)
elseif(WITH_ROCM)
hip_library(
phi_place
SRCS place.cc
DEPS phi_backends)
else()
cc_library(phi_place SRCS place.cc)
endif()
cc_library(
scalar
SRCS scalar.cc
DEPS phi_enforce phi_tensor_utils)
cc_library(
int_array
SRCS int_array.cc
DEPS phi_enforce phi_tensor_utils)
cc_library(
memory_utils
SRCS memory_utils.cc
  DEPS phi_enforce phi_place)

+collect_srcs(common_srcs SRCS place.cc scalar.cc int_array.cc memory_utils.cc)
...@@ -6,150 +6,35 @@ if(WITH_GPU)
  proto_library(external_error_proto SRCS external_error.proto)
endif()

cc_library(
  flags
  SRCS flags.cc
  DEPS gflags)

cc_library(errors SRCS errors.cc)
set(phi_enforce_deps errors flags)
if(WITH_GPU)
  set(phi_enforce_deps ${phi_enforce_deps} external_error_proto)
endif()
cc_library(
  phi_enforce
  SRCS enforce.cc
  DEPS ${phi_enforce_deps})

cc_library(
  phi_os_info
  SRCS os_info.cc
  DEPS phi_enforce)

if(WITH_XPU)
  cc_library(
    kernel_factory
    SRCS kernel_factory.cc
    DEPS phi_enforce convert_utils phi_backends)
else()
  cc_library(
    kernel_factory
    SRCS kernel_factory.cc
    DEPS phi_enforce convert_utils)
endif()
cc_library(
kernel_context
SRCS kernel_context.cc
DEPS phi_enforce phi_backends)
cc_library(
ddim
SRCS ddim.cc
DEPS phi_enforce)
cc_library(
tensor_base
SRCS tensor_base.cc allocator.cc
DEPS phi_enforce)
cc_library(
tensor_meta
SRCS tensor_meta.cc
DEPS phi_enforce)
cc_library(
lod_utils
SRCS lod_utils.cc
DEPS phi_enforce)
cc_library(
threadpool
SRCS threadpool.cc
DEPS phi_enforce)
cc_library(
dense_tensor
SRCS dense_tensor.cc dense_tensor_impl.cc
DEPS convert_utils tensor_meta tensor_base ddim)
target_link_libraries(dense_tensor memory_utils)
cc_library(
sparse_coo_tensor
SRCS sparse_coo_tensor.cc
DEPS tensor_meta tensor_base)
cc_library(
sparse_csr_tensor
SRCS sparse_csr_tensor.cc
DEPS dense_tensor tensor_base)
cc_library(
string_tensor
SRCS string_tensor.cc
DEPS convert_utils tensor_meta tensor_base)
cc_library(
tensor_array
SRCS tensor_array.cc
DEPS dense_tensor tensor_base)
cc_library(
extended_tensor
SRCS extended_tensor.cc
DEPS tensor_base)
cc_library(
meta_tensor
SRCS meta_tensor.cc
DEPS tensor_base tensor_meta dense_tensor)
cc_library(
infermeta_utils
SRCS infermeta_utils.cc
DEPS meta_tensor)
cc_library(
selected_rows
SRCS selected_rows_impl.cc selected_rows.cc
DEPS tensor_base dense_tensor phi_enforce ddim)
cc_library(
phi_device_context
SRCS device_context.cc
DEPS dense_tensor selected_rows)
cc_library(
custom_kernel
SRCS custom_kernel.cc
DEPS kernel_factory)
cc_library(
mixed_vector
SRCS mixed_vector.cc
DEPS phi_backends place memory)
cc_library(
generator
SRCS generator.cc
DEPS enforce place)
# Will remove once we implemented MKLDNN_Tensor
if(WITH_MKLDNN)
add_dependencies(dense_tensor mkldnn)
add_dependencies(tensor_base mkldnn)
endif()
if(WITH_GPU)
nv_library(
phi_tensor_utils
SRCS tensor_utils.cc
DEPS phi_backends dense_tensor selected_rows memcpy memory_utils)
elseif(WITH_ROCM)
hip_library(
phi_tensor_utils
SRCS tensor_utils.cc
DEPS phi_backends dense_tensor selected_rows memcpy memory_utils)
elseif(WITH_XPU_KP)
xpu_library(
phi_tensor_utils
SRCS tensor_utils.cc
DEPS phi_backends dense_tensor selected_rows memcpy memory_utils)
else()
cc_library(
phi_tensor_utils
SRCS tensor_utils.cc
DEPS dense_tensor selected_rows memcpy phi_backends memory_utils)
endif()
+collect_srcs(
+  core_srcs
+  SRCS
+  flags.cc
+  errors.cc
+  enforce.cc
+  os_info.cc
+  kernel_context.cc
+  ddim.cc
+  tensor_base.cc
+  allocator.cc
+  tensor_meta.cc
+  lod_utils.cc
+  threadpool.cc
+  dense_tensor.cc
+  dense_tensor_impl.cc
+  sparse_coo_tensor.cc
+  sparse_csr_tensor.cc
+  string_tensor.cc
+  tensor_array.cc
+  extended_tensor.cc
+  meta_tensor.cc
+  infermeta_utils.cc
+  selected_rows_impl.cc
+  selected_rows.cc
+  device_context.cc
+  custom_kernel.cc
+  mixed_vector.cc
+  generator.cc
+  kernel_factory.cc
+  tensor_utils.cc
+  storage_properties.cc)

cc_library(
  arg_map_context
  SRCS arg_map_context.cc
  DEPS phi_enforce)
cc_library(
  op_utils
  SRCS op_utils.cc
  DEPS arg_map_context enforce)
cc_library(
  get_kerneltype_forvar_utils
  SRCS get_kerneltype_forvar_utils.cc
  DEPS enforce)
set(convert_utils_deps data_type place op_utils phi_backends)
if(WITH_MKLDNN)
  set(convert_utils_deps ${convert_utils_deps} mkldnn)
endif()
cc_library(
  convert_utils
  SRCS convert_utils.cc
  DEPS ${convert_utils_deps})

+collect_srcs(core_srcs SRCS arg_map_context.cc op_utils.cc
+             get_kerneltype_forvar_utils.cc convert_utils.cc)
...@@ -26,4 +26,16 @@ OpUtilsMap& OpUtilsMap::Instance() { ...@@ -26,4 +26,16 @@ OpUtilsMap& OpUtilsMap::Instance() {
return g_op_utils_map; return g_op_utils_map;
} }
BaseKernelNameRegistrar::BaseKernelNameRegistrar(const char* op_type,
const char* base_kernel_name) {
OpUtilsMap::Instance().InsertBaseKernelName(op_type, base_kernel_name);
OpUtilsMap::Instance().InsertFluidOplName(op_type, base_kernel_name);
}
ArgumentMappingFnRegistrar::ArgumentMappingFnRegistrar(
const char* op_type, ArgumentMappingFn arg_mapping_fn) {
OpUtilsMap::Instance().InsertArgumentMappingFn(op_type,
std::move(arg_mapping_fn));
}
} // namespace phi } // namespace phi
...@@ -210,18 +210,12 @@ class OpUtilsMap { ...@@ -210,18 +210,12 @@ class OpUtilsMap {
}; };
struct BaseKernelNameRegistrar { struct BaseKernelNameRegistrar {
BaseKernelNameRegistrar(const char* op_type, const char* base_kernel_name) { BaseKernelNameRegistrar(const char* op_type, const char* base_kernel_name);
OpUtilsMap::Instance().InsertBaseKernelName(op_type, base_kernel_name);
OpUtilsMap::Instance().InsertFluidOplName(op_type, base_kernel_name);
}
}; };
struct ArgumentMappingFnRegistrar { struct ArgumentMappingFnRegistrar {
ArgumentMappingFnRegistrar(const char* op_type, ArgumentMappingFnRegistrar(const char* op_type,
ArgumentMappingFn arg_mapping_fn) { ArgumentMappingFn arg_mapping_fn);
OpUtilsMap::Instance().InsertArgumentMappingFn(op_type,
std::move(arg_mapping_fn));
}
}; };
#define PD_REGISTER_BASE_KERNEL_NAME(op_type, base_kernel_name) \ #define PD_REGISTER_BASE_KERNEL_NAME(op_type, base_kernel_name) \
......
...@@ -42,6 +42,11 @@ limitations under the License. */ ...@@ -42,6 +42,11 @@ limitations under the License. */
namespace phi { namespace phi {
template <>
const TypeInfo<phi::TensorBase>
TypeInfoTraits<phi::TensorBase, DenseTensor>::kType =
RegisterStaticType<phi::TensorBase>(DenseTensor::name());
DenseTensor::DenseTensor(Allocator* a, const DenseTensorMeta& meta) DenseTensor::DenseTensor(Allocator* a, const DenseTensorMeta& meta)
: meta_(meta), holder_(a->Allocate(SizeOf(dtype()) * numel())) {} : meta_(meta), holder_(a->Allocate(SizeOf(dtype()) * numel())) {}
...@@ -115,8 +120,9 @@ void* DenseTensor::AllocateFrom(Allocator* allocator, ...@@ -115,8 +120,9 @@ void* DenseTensor::AllocateFrom(Allocator* allocator,
if (fake_alloc) { if (fake_alloc) {
bytes = 0; bytes = 0;
} else { } else {
PADDLE_ENFORCE( PADDLE_ENFORCE_EQ(
valid(), valid(),
true,
phi::errors::PreconditionNotMet("The meta data must be valid when " phi::errors::PreconditionNotMet("The meta data must be valid when "
"call the mutable data function.")); "call the mutable data function."));
if (requested_size) { if (requested_size) {
...@@ -169,8 +175,9 @@ const T* DenseTensor::data() const { ...@@ -169,8 +175,9 @@ const T* DenseTensor::data() const {
template <typename T> template <typename T>
T* DenseTensor::data() { T* DenseTensor::data() {
T* ret = static_cast<T*>(data()); T* ret = static_cast<T*>(data());
PADDLE_ENFORCE( PADDLE_ENFORCE_EQ(
(dtype() == phi::CppTypeToDataType<T>::Type()), dtype(),
phi::CppTypeToDataType<T>::Type(),
phi::errors::InvalidArgument( phi::errors::InvalidArgument(
"The type of data we are trying to retrieve (%s) does not match the " "The type of data we are trying to retrieve (%s) does not match the "
"type of data (%s) currently contained in the container.", "type of data (%s) currently contained in the container.",
...@@ -200,7 +207,8 @@ const void* DenseTensor::data() const { ...@@ -200,7 +207,8 @@ const void* DenseTensor::data() const {
} }
void DenseTensor::set_meta(DenseTensorMeta&& meta) { void DenseTensor::set_meta(DenseTensorMeta&& meta) {
PADDLE_ENFORCE(!meta_.valid(), PADDLE_ENFORCE_EQ(meta_.valid(),
false,
phi::errors::InvalidArgument( phi::errors::InvalidArgument(
"Only when the original attribute of Tensor is " "Only when the original attribute of Tensor is "
"incomplete, can it be reset.")); "incomplete, can it be reset."));
...@@ -208,8 +216,9 @@ void DenseTensor::set_meta(DenseTensorMeta&& meta) { ...@@ -208,8 +216,9 @@ void DenseTensor::set_meta(DenseTensorMeta&& meta) {
} }
void DenseTensor::set_meta(const DenseTensorMeta& meta) { void DenseTensor::set_meta(const DenseTensorMeta& meta) {
PADDLE_ENFORCE( PADDLE_ENFORCE_EQ(
meta.valid(), meta.valid(),
true,
phi::errors::InvalidArgument( phi::errors::InvalidArgument(
"Input meta is invalid, please check the meta attribute.")); "Input meta is invalid, please check the meta attribute."));
meta_.dims = meta.dims; meta_.dims = meta.dims;
......
...@@ -2,32 +2,14 @@ add_subdirectory(check)
add_subdirectory(store)
add_subdirectory(auto_parallel)
-set(COMM_CONTEXT_MANAGER_DEPS tcp_store)
+set(DISTRIBUTED_COMMON_SRCS comm_context_manager.cc)
if(WITH_NCCL OR WITH_RCCL)
-  cc_library(
-    nccl_comm_context
-    SRCS nccl_comm_context.cc
-    DEPS dense_tensor comm_static_check nccl_dynamic_check)
-  list(APPEND COMM_CONTEXT_MANAGER_DEPS nccl_comm_context)
+  list(APPEND DISTRIBUTED_COMMON_SRCS nccl_comm_context.cc)
endif()
if(WITH_GLOO)
-  cc_library(
-    gloo_utils
-    SRCS gloo_utils.cc
-    DEPS gloo dense_tensor enforce tcp_store)
-  cc_library(
-    gloo_comm_context
-    SRCS gloo_comm_context.cc
-    DEPS gloo_utils comm_static_check)
-  list(APPEND COMM_CONTEXT_MANAGER_DEPS gloo_comm_context gloo_store)
+  list(APPEND DISTRIBUTED_COMMON_SRCS gloo_utils.cc gloo_comm_context.cc)
endif()
-cc_library(
-  comm_context_manager
-  SRCS comm_context_manager.cc
-  DEPS ${COMM_CONTEXT_MANAGER_DEPS})
+collect_srcs(core_srcs SRCS ${DISTRIBUTED_COMMON_SRCS})
proto_library(auto_parallel_proto SRCS auto_parallel.proto)
-cc_library(
-  device_mesh
-  SRCS device_mesh.cc
-  DEPS auto_parallel_proto phi_enforce)
-cc_library(
-  process_mesh
-  SRCS process_mesh.cc
-  DEPS auto_parallel_proto phi_enforce)
-cc_library(
-  dist_attr
-  SRCS dist_attr.cc
-  DEPS process_mesh auto_parallel_proto proto_desc phi_enforce)
-cc_library(
-  dist_mapper
-  SRCS dist_mapper.cc
-  DEPS device_mesh auto_parallel_proto phi_enforce)
-cc_library(auto_parallel DEPS device_mesh process_mesh dist_attr dist_mapper)
+collect_srcs(core_srcs SRCS device_mesh.cc process_mesh.cc dist_attr.cc
+             dist_mapper.cc)
-cc_library(
-  comm_static_check
-  SRCS static_check.cc
-  DEPS place dense_tensor enforce)
+set(CHECK_COMMON_SRCS static_check.cc)
if(WITH_NCCL OR WITH_RCCL)
-  cc_library(
-    nccl_dynamic_check
-    SRCS nccl_dynamic_check.cc
-    DEPS dense_tensor)
+  list(APPEND CHECK_COMMON_SRCS nccl_dynamic_check.cc)
endif()
+collect_srcs(core_srcs SRCS ${CHECK_COMMON_SRCS})
-cc_library(
-  tcp_store
-  SRCS tcp_store.cc tcp_utils.cc socket.cpp store.cc
-  DEPS enforce glog)
+set(STORE_COMMON_SRCS tcp_store.cc tcp_utils.cc socket.cpp store.cc)
if(WITH_GLOO)
-  cc_library(
-    gloo_store
-    SRCS gloo_store.cc
-    DEPS gloo)
+  list(APPEND STORE_COMMON_SRCS gloo_store.cc)
endif()
+collect_srcs(core_srcs SRCS ${STORE_COMMON_SRCS})
if(NOT WIN32)
  cc_test(
    test_c_tcp_store
    SRCS test_tcp_store.cc
-    DEPS tcp_store)
+    DEPS phi)
endif()
...@@ -139,7 +139,8 @@ void MasterDaemon::StopByControlFd() { ...@@ -139,7 +139,8 @@ void MasterDaemon::StopByControlFd() {
#else #else
void MasterDaemon::InitControlFd() { void MasterDaemon::InitControlFd() {
ghStopEvent_ = CreateEvent(NULL, TRUE, FALSE, NULL); ghStopEvent_ = CreateEvent(NULL, TRUE, FALSE, NULL);
PADDLE_ENFORCE(ghStopEvent_, PADDLE_ENFORCE_NE(ghStopEvent_,
nullptr,
phi::errors::Fatal("failed to cread control pipe")); phi::errors::Fatal("failed to cread control pipe"));
} }
void MasterDaemon::CloseControlFd() { CloseHandle(ghStopEvent_); } void MasterDaemon::CloseControlFd() { CloseHandle(ghStopEvent_); }
...@@ -422,8 +423,9 @@ void TCPStore::wait(const std::string& key) { ...@@ -422,8 +423,9 @@ void TCPStore::wait(const std::string& key) {
VLOG(3) << "TCPStore wait."; VLOG(3) << "TCPStore wait.";
_client->send_command_for_key(Command::WAIT, _key_prefix + key); _client->send_command_for_key(Command::WAIT, _key_prefix + key);
reply = _client->receive_value<ReplyType>(); reply = _client->receive_value<ReplyType>();
PADDLE_ENFORCE( PADDLE_ENFORCE_EQ(
reply == ReplyType::STOP_WAIT, reply == ReplyType::STOP_WAIT,
true,
phi::errors::InvalidArgument("Stop_waiting response is expected")); phi::errors::InvalidArgument("Stop_waiting response is expected"));
} }
......
...@@ -280,13 +280,19 @@ std::string GetExternalErrorMsg(T status) {
    if (std::string::npos != last_slash_idx) {
      strModule.erase(last_slash_idx, std::string::npos);
    }
-    if (compare_path.compare("avx.so") == 0) {
+    // TODO(lizhiyu02): I don't know what the 'compare_path.compare("avx.so")
+    // == 0' means, while
+    // 'compare_path.find("dist-packages") != std::string::npos' means that
+    // after using 'pip install paddle'.
+    if (compare_path.compare("avx.so") == 0 ||
+        strModule.find("dist-packages") != std::string::npos) {
      filePath =
          strModule +
          "/../include/third_party/externalError/data/externalErrorMsg.pb";
    } else {
+      // Just for unittest
      filePath = strModule +
-                 "/../../third_party/externalError/data/externalErrorMsg.pb";
+                 "/../third_party/externalError/data/externalErrorMsg.pb";
    }
  }
#else
...@@ -303,14 +309,14 @@ std::string GetExternalErrorMsg(T status) {
    if (std::string::npos != last_slash_idx) {
      strModule.erase(last_slash_idx, std::string::npos);
    }
-    if (compare_path.compare("avx.pyd") == 0) {
+    if (strModule.find("dist-packages") != std::string::npos) {
      filePath = strModule +
                 "\\..\\include\\third_"
                 "party\\externalerror\\data\\externalErrorMsg.pb";
    } else {
-      filePath =
-          strModule +
-          "\\..\\..\\third_party\\externalerror\\data\\externalErrorMsg.pb";
+      filePath = strModule +
+                 "\\..\\..\\third_party"
+                 "\\externalerror\\data\\externalErrorMsg.pb";
    }
#endif
  std::ifstream fin(filePath, std::ios::in | std::ios::binary);
......
...@@ -24,7 +24,7 @@
#include "paddle/utils/variant.h"
-#if defined(_WIN32) && defined(BUILD_PHI_SHARED)
+#if defined(_WIN32)
#define PHI_EXPORT_FLAG __declspec(dllexport)
#define PHI_IMPORT_FLAG __declspec(dllimport)
#else
......
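With phi now always prepared for a shared build, the export/import flag no longer depends on a separate BUILD_PHI_SHARED switch. For context, the usual shape of such a PADDLE_API-style macro is sketched below; this is a generic illustration with made-up names, not the exact phi header:

// Illustrative Windows export/import macro, as commonly paired with the
// PHI_EXPORT_FLAG / PHI_IMPORT_FLAG definitions above.
#if defined(_WIN32)
#define MYLIB_EXPORT_FLAG __declspec(dllexport)
#define MYLIB_IMPORT_FLAG __declspec(dllimport)
#else
#define MYLIB_EXPORT_FLAG __attribute__((visibility("default")))
#define MYLIB_IMPORT_FLAG
#endif

// MYLIB_COMPILE is assumed to be defined only while building the library
// itself (e.g. via target_compile_definitions), so consumers see dllimport.
#if defined(MYLIB_COMPILE)
#define MYLIB_API MYLIB_EXPORT_FLAG
#else
#define MYLIB_API MYLIB_IMPORT_FLAG
#endif

class MYLIB_API Tensor {  // each public class/function is annotated once
 public:
  int numel() const;
};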
...@@ -32,8 +32,9 @@ LoD ToAbsOffset(const LoD &in) { ...@@ -32,8 +32,9 @@ LoD ToAbsOffset(const LoD &in) {
} }
void AppendLoD(LoD *lod, const LoD &lod_length) { void AppendLoD(LoD *lod, const LoD &lod_length) {
PADDLE_ENFORCE( PADDLE_ENFORCE_EQ(
lod->empty() || lod->size() == lod_length.size(), (lod->empty() || lod->size() == lod_length.size()),
true,
phi::errors::InvalidArgument( phi::errors::InvalidArgument(
"The input LoD length should be equal to the appended LoD size, but " "The input LoD length should be equal to the appended LoD size, but "
"received input LoD length is %d, actual LoD size is %d.", "received input LoD length is %d, actual LoD size is %d.",
......
...@@ -16,6 +16,11 @@ limitations under the License. */ ...@@ -16,6 +16,11 @@ limitations under the License. */
namespace phi { namespace phi {
template <>
const TypeInfo<phi::TensorBase>
TypeInfoTraits<phi::TensorBase, SelectedRows>::kType =
RegisterStaticType<phi::TensorBase>(SelectedRows::name());
SelectedRows::SelectedRows(const std::vector<int64_t>& rows, SelectedRows::SelectedRows(const std::vector<int64_t>& rows,
const int64_t& height) const int64_t& height)
: impl_(std::make_shared<phi::SelectedRowsImpl>(rows, height)) {} : impl_(std::make_shared<phi::SelectedRowsImpl>(rows, height)) {}
......
...@@ -16,6 +16,11 @@ limitations under the License. */ ...@@ -16,6 +16,11 @@ limitations under the License. */
namespace phi { namespace phi {
template <>
const TypeInfo<phi::TensorBase>
TypeInfoTraits<phi::TensorBase, SparseCooTensor>::kType =
RegisterStaticType<phi::TensorBase>(SparseCooTensor::name());
SparseCooTensor::SparseCooTensor() { SparseCooTensor::SparseCooTensor() {
DenseTensor non_zero_indices, non_zero_elements; DenseTensor non_zero_indices, non_zero_elements;
this->SetMember(non_zero_indices, non_zero_elements, {1}, true); this->SetMember(non_zero_indices, non_zero_elements, {1}, true);
...@@ -155,7 +160,8 @@ int32_t SparseCooTensor::dense_dim() const { ...@@ -155,7 +160,8 @@ int32_t SparseCooTensor::dense_dim() const {
} }
void SparseCooTensor::set_meta(SparseTensorMeta&& meta) { void SparseCooTensor::set_meta(SparseTensorMeta&& meta) {
PADDLE_ENFORCE(!meta_.valid(), PADDLE_ENFORCE_EQ(meta_.valid(),
false,
phi::errors::InvalidArgument( phi::errors::InvalidArgument(
"Only when the original attribute of Tensor is " "Only when the original attribute of Tensor is "
"incomplete, can it be reset.")); "incomplete, can it be reset."));
...@@ -163,8 +169,9 @@ void SparseCooTensor::set_meta(SparseTensorMeta&& meta) { ...@@ -163,8 +169,9 @@ void SparseCooTensor::set_meta(SparseTensorMeta&& meta) {
} }
void SparseCooTensor::set_meta(const SparseTensorMeta& meta) { void SparseCooTensor::set_meta(const SparseTensorMeta& meta) {
PADDLE_ENFORCE( PADDLE_ENFORCE_EQ(
meta.valid(), meta.valid(),
true,
phi::errors::InvalidArgument( phi::errors::InvalidArgument(
"Input meta is invalid, please check the meta attribute.")); "Input meta is invalid, please check the meta attribute."));
meta_.dims = meta.dims; meta_.dims = meta.dims;
......
...@@ -16,6 +16,11 @@ limitations under the License. */ ...@@ -16,6 +16,11 @@ limitations under the License. */
namespace phi { namespace phi {
template <>
const TypeInfo<phi::TensorBase>
TypeInfoTraits<phi::TensorBase, SparseCsrTensor>::kType =
RegisterStaticType<phi::TensorBase>(SparseCsrTensor::name());
SparseCsrTensor::SparseCsrTensor() { SparseCsrTensor::SparseCsrTensor() {
DenseTensor crows, cols, values; DenseTensor crows, cols, values;
this->non_zero_crows_ = crows; this->non_zero_crows_ = crows;
...@@ -26,8 +31,9 @@ SparseCsrTensor::SparseCsrTensor() { ...@@ -26,8 +31,9 @@ SparseCsrTensor::SparseCsrTensor() {
inline void check_shape(const DDim& dims) { inline void check_shape(const DDim& dims) {
bool valid = dims.size() == 2 || dims.size() == 3; bool valid = dims.size() == 2 || dims.size() == 3;
PADDLE_ENFORCE( PADDLE_ENFORCE_EQ(
valid, valid,
true,
phi::errors::InvalidArgument("the SparseCsrTensor only support 2-D or " phi::errors::InvalidArgument("the SparseCsrTensor only support 2-D or "
"3-D Tensor, but get %d-D Tensor", "3-D Tensor, but get %d-D Tensor",
dims.size())); dims.size()));
...@@ -96,7 +102,9 @@ void SparseCsrTensor::set_layout(const DataLayout layout) { ...@@ -96,7 +102,9 @@ void SparseCsrTensor::set_layout(const DataLayout layout) {
void SparseCsrTensor::Resize(const DDim& dense_dims, void SparseCsrTensor::Resize(const DDim& dense_dims,
const int64_t non_zero_num) { const int64_t non_zero_num) {
PADDLE_ENFORCE(this->initialized(), PADDLE_ENFORCE_EQ(
this->initialized(),
true,
phi::errors::InvalidArgument( phi::errors::InvalidArgument(
"the SparseCsrTensor must be initialized when call Resize " "the SparseCsrTensor must be initialized when call Resize "
"function.")); "function."));
...@@ -139,7 +147,8 @@ void SparseCsrTensor::SetMember(const DenseTensor& non_zero_crows, ...@@ -139,7 +147,8 @@ void SparseCsrTensor::SetMember(const DenseTensor& non_zero_crows,
} }
void SparseCsrTensor::set_meta(SparseTensorMeta&& meta) { void SparseCsrTensor::set_meta(SparseTensorMeta&& meta) {
PADDLE_ENFORCE(!meta_.valid(), PADDLE_ENFORCE_EQ(meta_.valid(),
false,
phi::errors::InvalidArgument( phi::errors::InvalidArgument(
"Only when the original attribute of Tensor is " "Only when the original attribute of Tensor is "
"incomplete, can it be reset.")); "incomplete, can it be reset."));
...@@ -147,8 +156,9 @@ void SparseCsrTensor::set_meta(SparseTensorMeta&& meta) { ...@@ -147,8 +156,9 @@ void SparseCsrTensor::set_meta(SparseTensorMeta&& meta) {
} }
void SparseCsrTensor::set_meta(const SparseTensorMeta& meta) { void SparseCsrTensor::set_meta(const SparseTensorMeta& meta) {
PADDLE_ENFORCE( PADDLE_ENFORCE_EQ(
meta.valid(), meta.valid(),
true,
phi::errors::InvalidArgument( phi::errors::InvalidArgument(
"Input meta is invalid, please check the meta attribute.")); "Input meta is invalid, please check the meta attribute."));
meta_.dims = meta.dims; meta_.dims = meta.dims;
......
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
...@@ -12,21 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-#include "paddle/fluid/platform/dynload/mklml.h"
+#include "paddle/phi/core/storage_properties.h"
-namespace paddle {
-namespace platform {
-namespace dynload {
+namespace phi {
-#define DEFINE_WRAP(__name) DynLoad__##__name __name
-MKLML_ROUTINE_EACH(DEFINE_WRAP);
-#if !defined(_WIN32)
-DEFINE_WRAP(mkl_scsrmm);
-DEFINE_WRAP(mkl_dcsrmm);
+#ifdef PADDLE_WITH_MKLDNN
+template <>
+const TypeInfo<StorageProperties>
+    TypeInfoTraits<StorageProperties, OneDNNStorageProperties>::kType =
+        RegisterStaticType<StorageProperties>(OneDNNStorageProperties::name());
#endif
-}  // namespace dynload
-}  // namespace platform
-}  // namespace paddle
+template <>
+const TypeInfo<StorageProperties>
+    TypeInfoTraits<StorageProperties, NPUStorageProperties>::kType =
+        RegisterStaticType<StorageProperties>(NPUStorageProperties::name());
+}  // namespace phi
...@@ -21,6 +21,11 @@ limitations under the License. */ ...@@ -21,6 +21,11 @@ limitations under the License. */
namespace phi { namespace phi {
template <>
const TypeInfo<phi::TensorBase>
TypeInfoTraits<phi::TensorBase, StringTensor>::kType =
RegisterStaticType<phi::TensorBase>(StringTensor::name());
StringTensor::StringTensor() { meta_.offset = 0; } StringTensor::StringTensor() { meta_.offset = 0; }
StringTensor::StringTensor(Allocator* a, const StringTensorMeta& meta) StringTensor::StringTensor(Allocator* a, const StringTensorMeta& meta)
...@@ -91,8 +96,9 @@ dtype::pstring* StringTensor::data() { ...@@ -91,8 +96,9 @@ dtype::pstring* StringTensor::data() {
} }
void StringTensor::set_meta(const StringTensorMeta& meta) { void StringTensor::set_meta(const StringTensorMeta& meta) {
PADDLE_ENFORCE( PADDLE_ENFORCE_EQ(
meta.valid(), meta.valid(),
true,
phi::errors::InvalidArgument( phi::errors::InvalidArgument(
"Input meta is invalid, please check the meta attribute.")); "Input meta is invalid, please check the meta attribute."));
meta_.dims = meta.dims; meta_.dims = meta.dims;
...@@ -143,8 +149,9 @@ void* StringTensor::AllocateFrom(Allocator* allocator, ...@@ -143,8 +149,9 @@ void* StringTensor::AllocateFrom(Allocator* allocator,
if (fake_alloc) { if (fake_alloc) {
bytes = 0; bytes = 0;
} else { } else {
PADDLE_ENFORCE( PADDLE_ENFORCE_EQ(
valid(), valid(),
true,
errors::PreconditionNotMet("The meta data must be valid when call the " errors::PreconditionNotMet("The meta data must be valid when call the "
"mutable data function.")); "mutable data function."));
if (requested_size) { if (requested_size) {
......
...@@ -16,6 +16,11 @@ limitations under the License. */ ...@@ -16,6 +16,11 @@ limitations under the License. */
namespace phi { namespace phi {
template <>
const TypeInfo<phi::TensorBase>
TypeInfoTraits<phi::TensorBase, TensorArray>::kType =
RegisterStaticType<phi::TensorBase>(TensorArray::name());
TensorArray::TensorArray(const std::vector<DenseTensor>& vec) { TensorArray::TensorArray(const std::vector<DenseTensor>& vec) {
tensors_ = vec; tensors_ = vec;
} }
......
...@@ -52,8 +52,4 @@ class TypeInfoTraits { ...@@ -52,8 +52,4 @@ class TypeInfoTraits {
template <typename BaseT> template <typename BaseT>
TypeInfo<BaseT> RegisterStaticType(const std::string& type); TypeInfo<BaseT> RegisterStaticType(const std::string& type);
template <typename BaseT, typename DerivedT>
const TypeInfo<BaseT> TypeInfoTraits<BaseT, DerivedT>::kType =
RegisterStaticType<BaseT>(DerivedT::name());
} // namespace phi } // namespace phi
cc_library(
infermeta
SRCS nullary.cc unary.cc binary.cc ternary.cc multiary.cc fusion.cc
DEPS convert_utils meta_tensor infermeta_utils xxhash)
cc_library(
backward_infermeta
SRCS backward.cc
DEPS meta_tensor convert_utils)
add_subdirectory(strings)
add_subdirectory(sparse)
collect_srcs(
infermeta_srcs
SRCS
nullary.cc
unary.cc
binary.cc
ternary.cc
multiary.cc
fusion.cc
backward.cc)
...@@ -1668,9 +1668,10 @@ static void Interpolate2DInferShapeCheck( ...@@ -1668,9 +1668,10 @@ static void Interpolate2DInferShapeCheck(
MetaConfig config) { MetaConfig config) {
auto dim_x = x.dims(); auto dim_x = x.dims();
PADDLE_ENFORCE( PADDLE_ENFORCE_EQ(
"bilinear" == interp_method || "nearest" == interp_method || ("bilinear" == interp_method || "nearest" == interp_method ||
"bicubic" == interp_method, "bicubic" == interp_method),
true,
phi::errors::InvalidArgument( phi::errors::InvalidArgument(
"Interpolation method can only be \"bilinear\" or \"nearest\" when " "Interpolation method can only be \"bilinear\" or \"nearest\" when "
"Input(X) dimension is 4, but got method = %s.", "Input(X) dimension is 4, but got method = %s.",
...@@ -1818,7 +1819,9 @@ static void Interpolate3DInferShapeCheck( ...@@ -1818,7 +1819,9 @@ static void Interpolate3DInferShapeCheck(
MetaConfig config) { MetaConfig config) {
auto dim_x = x.dims(); auto dim_x = x.dims();
PADDLE_ENFORCE("nearest" == interp_method || "trilinear" == interp_method, PADDLE_ENFORCE_EQ(
("nearest" == interp_method || "trilinear" == interp_method),
true,
phi::errors::InvalidArgument( phi::errors::InvalidArgument(
"Interpolation method can only be \"trilinear\" or " "Interpolation method can only be \"trilinear\" or "
"\"nearest\" when Input(X) " "\"nearest\" when Input(X) "
...@@ -1972,8 +1975,9 @@ void InterpolateInferMeta( ...@@ -1972,8 +1975,9 @@ void InterpolateInferMeta(
MetaTensor* output, MetaTensor* output,
MetaConfig config) { MetaConfig config) {
auto dim_x = x.dims(); // NCHW format auto dim_x = x.dims(); // NCHW format
PADDLE_ENFORCE( PADDLE_ENFORCE_EQ(
dim_x.size() == 3 || dim_x.size() == 4 || dim_x.size() == 5, (dim_x.size() == 3 || dim_x.size() == 4 || dim_x.size() == 5),
true,
phi::errors::Unimplemented( phi::errors::Unimplemented(
"Input(X) dimension must be 3, 4 or 5, but got dimension = %d .", "Input(X) dimension must be 3, 4 or 5, but got dimension = %d .",
dim_x.size())); dim_x.size()));
......
-cc_library(
-  sparse_infermeta
-  SRCS unary.cc binary.cc multiary.cc
-  DEPS convert_utils infermeta_utils)
-cc_library(
-  sparse_backward_infermeta
-  SRCS backward.cc
-  DEPS meta_tensor convert_utils)
+collect_srcs(infermeta_srcs SRCS unary.cc binary.cc multiary.cc backward.cc)
-cc_library(
-  string_infermeta
-  SRCS nullary.cc unary.cc
-  DEPS convert_utils infermeta_utils)
+collect_srcs(infermeta_srcs SRCS nullary.cc unary.cc)
...@@ -2088,7 +2088,9 @@ void MaxPoolWithIndexInferMeta(const MetaTensor& x, ...@@ -2088,7 +2088,9 @@ void MaxPoolWithIndexInferMeta(const MetaTensor& x,
auto x_dims = x.dims(); auto x_dims = x.dims();
PADDLE_ENFORCE(x_dims.size() == 4 || x_dims.size() == 5, PADDLE_ENFORCE_EQ(
(x_dims.size() == 4 || x_dims.size() == 5),
true,
errors::InvalidArgument("Pooling intput should be 4-D or " errors::InvalidArgument("Pooling intput should be 4-D or "
"5-D tensor but received %dD-Tensor", "5-D tensor but received %dD-Tensor",
x_dims.size())); x_dims.size()));
...@@ -4430,11 +4432,11 @@ void TransposeInferMeta(const MetaTensor& x, ...@@ -4430,11 +4432,11 @@ void TransposeInferMeta(const MetaTensor& x,
// Note: x_rank > axis_size when fuse squeeze2 + transpose2, else x_rank == // Note: x_rank > axis_size when fuse squeeze2 + transpose2, else x_rank ==
// axis_size // axis_size
PADDLE_ENFORCE_GE( PADDLE_ENFORCE_GE(x_rank,
x_rank,
axis_size, axis_size,
errors::InvalidArgument("The input tensor's dimension " errors::InvalidArgument(
"should be equal to the axis's size. " "The input tensor's dimension "
"should be equal to or greater than the axis's size. "
"But received input tensor's dimension is %d, " "But received input tensor's dimension is %d, "
"axis's size is %d", "axis's size is %d",
x_rank, x_rank,
......
...@@ -19,84 +19,6 @@ add_subdirectory(funcs) ...@@ -19,84 +19,6 @@ add_subdirectory(funcs)
# kernel autotune # kernel autotune
add_subdirectory(autotune) add_subdirectory(autotune)
# phi depends all phi kernel targets
set_property(GLOBAL PROPERTY PHI_KERNELS "")
# [ 1. Common kernel compilation dependencies ]
set(COMMON_KERNEL_DEPS
dense_tensor
string_tensor
sparse_coo_tensor
sparse_csr_tensor
tensor_array
int_array
scalar
kernel_context
kernel_factory
arg_map_context
convert_utils
lod_utils
custom_kernel
string_infermeta
phi_tensor_utils)
set(COMMON_KERNEL_DEPS
${COMMON_KERNEL_DEPS}
eigen_function
blas
math_function
im2col
vol2col
concat_and_split_functor
selected_rows_functor)
# remove this dep after removing fluid deps on tensor creation
set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} lod_utils)
set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} infermeta infermeta_utils
sparse_infermeta)
set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} switch_autotune)
set(COMMON_KERNEL_DEPS
${COMMON_KERNEL_DEPS}
threadpool
jit_kernel_helper
softmax
cross_entropy
matrix_bit_code
lapack_function
lstm_compute
gru_compute
deformable_conv_functor
matrix_reduce
segment_pooling
pooling
maxouting
matrix_inverse
matrix_solve
phi_dynload_warpctc
phi_dynload_warprnnt
sequence_padding
sequence_pooling
sequence_scale
fft
phi_data_layout_transform
gpc
utf8proc
gather_scatter_functor)
set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} process_group)
if(WITH_FLASHATTN)
set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} phi_dynload_flashattn)
endif()
if(WITH_NCCL OR WITH_RCCL)
set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} nccl_comm_context)
endif()
if(WITH_GLOO)
set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} gloo_comm_context)
endif()
if(WITH_CUDNN_FRONTEND)
set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} cudnn-frontend)
endif()
copy_if_different(${kernel_declare_file} ${kernel_declare_file_final}) copy_if_different(${kernel_declare_file} ${kernel_declare_file_final})
file(GLOB kernel_h "*.h" "selected_rows/*.h" "sparse/*.h" "strings/*.h") file(GLOB kernel_h "*.h" "selected_rows/*.h" "sparse/*.h" "strings/*.h")
...@@ -105,8 +27,8 @@ file(GLOB kernel_primitive_h "primitive/*.h") ...@@ -105,8 +27,8 @@ file(GLOB kernel_primitive_h "primitive/*.h")
# fusion ops would be included here # fusion ops would be included here
file( file(
GLOB GLOB kernel_cu
kernel_cu RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
"gpu/*.cu" "gpu/*.cu"
"gpu/*.cu.cc" "gpu/*.cu.cc"
"gpudnn/*.cu" "gpudnn/*.cu"
...@@ -118,6 +40,10 @@ file( ...@@ -118,6 +40,10 @@ file(
"strings/gpu/*.cu" "strings/gpu/*.cu"
"fusion/gpu/*.cu") "fusion/gpu/*.cu")
if(APPLE OR WIN32)
list(REMOVE_ITEM kernel_cu "fusion/gpu/fusion_group_kernel.cu")
endif()
if(DEFINED REDUCE_INFERENCE_LIB_SIZE) if(DEFINED REDUCE_INFERENCE_LIB_SIZE)
list(FILTER kernel_cu EXCLUDE REGEX ".*_grad_kernel\\.cc$") list(FILTER kernel_cu EXCLUDE REGEX ".*_grad_kernel\\.cc$")
list(FILTER kernel_cu EXCLUDE REGEX ".*_grad_kernel\\.cu$") list(FILTER kernel_cu EXCLUDE REGEX ".*_grad_kernel\\.cu$")
...@@ -146,22 +72,19 @@ if(WITH_CUTLASS) ...@@ -146,22 +72,19 @@ if(WITH_CUTLASS)
) )
endif() endif()
file(GLOB cutlass_cu "fusion/cutlass/conv2d/generated/*.cu" file(
"fusion/cutlass/conv2d/*.cu" "fusion/cutlass/*.cu" GLOB cutlass_cu
RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
"fusion/cutlass/conv2d/generated/*.cu" "fusion/cutlass/conv2d/*.cu"
"fusion/cutlass/*.cu"
"fusion/cutlass/memory_efficient_attention/autogen/impl/*.cu") "fusion/cutlass/memory_efficient_attention/autogen/impl/*.cu")
add_definitions("-DPADDLE_WITH_MEMORY_EFFICIENT_ATTENTION")
list(APPEND kernel_cu ${cutlass_cu}) list(APPEND kernel_cu ${cutlass_cu})
endif() endif()
if(APPLE OR WIN32)
list(REMOVE_ITEM kernel_cu
"${CMAKE_CURRENT_SOURCE_DIR}/fusion/gpu/fusion_group_kernel.cu")
endif()
if(WITH_MKLDNN) if(WITH_MKLDNN)
file( file(
GLOB GLOB kernel_cc
kernel_cc RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
"*.cc" "*.cc"
"cpu/*.cc" "cpu/*.cc"
"legacy/*.cc" "legacy/*.cc"
...@@ -171,6 +94,8 @@ if(WITH_MKLDNN) ...@@ -171,6 +94,8 @@ if(WITH_MKLDNN)
"selected_rows/cpu/*.cc" "selected_rows/cpu/*.cc"
"sparse/*.cc" "sparse/*.cc"
"sparse/cpu/*.cc" "sparse/cpu/*.cc"
"legacy/*.cc"
"legacy/cpu/*.cc"
"strings/*.cc" "strings/*.cc"
"strings/cpu/*.cc" "strings/cpu/*.cc"
"onednn/*.cc" "onednn/*.cc"
...@@ -179,8 +104,8 @@ if(WITH_MKLDNN) ...@@ -179,8 +104,8 @@ if(WITH_MKLDNN)
"fusion/cpu/*.cc") "fusion/cpu/*.cc")
else() else()
file( file(
GLOB GLOB kernel_cc
kernel_cc RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
"*.cc" "*.cc"
"cpu/*.cc" "cpu/*.cc"
"legacy/*.cc" "legacy/*.cc"
...@@ -189,6 +114,8 @@ else() ...@@ -189,6 +114,8 @@ else()
"selected_rows/cpu/*.cc" "selected_rows/cpu/*.cc"
"sparse/*.cc" "sparse/*.cc"
"sparse/cpu/*.cc" "sparse/cpu/*.cc"
"legacy/*.cc"
"legacy/cpu/*.cc"
"strings/*.cc" "strings/*.cc"
"strings/cpu/*.cc" "strings/cpu/*.cc"
"fusion/*.cc" "fusion/*.cc"
...@@ -200,32 +127,17 @@ if(DEFINED REDUCE_INFERENCE_LIB_SIZE) ...@@ -200,32 +127,17 @@ if(DEFINED REDUCE_INFERENCE_LIB_SIZE)
endif() endif()
file( file(
GLOB GLOB kernel_xpu
kernel_xpu RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
"xpu/*.cc" "xpu/*.cc" "legacy/xpu/*.cc" "selected_rows/xpu/*.cc" "fusion/xpu/*.cc"
"legacy/xpu/*.cc"
"selected_rows/xpu/*.cc"
"fusion/xpu/*.cc"
"sparse/xpu/*.cc") "sparse/xpu/*.cc")
if(WITH_MKLDNN)
set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} get_kerneltype_forvar_utils)
endif()
if(WITH_GPU OR WITH_ROCM) if(WITH_GPU OR WITH_ROCM)
if(WITH_GPU) collect_srcs(kernels_srcs SRCS ${kernel_cu})
add_library(phi_gpu ${kernel_cu} ${kernel_cc})
if(WITH_CUTLASS)
add_dependencies(phi_gpu cutlass_codegen)
endif()
elseif(WITH_ROCM)
hip_add_library(phi_gpu STATIC ${kernel_cu} ${kernel_cc})
endif()
kernel_declare("${kernel_cu}") kernel_declare("${kernel_cu}")
kernel_declare("${kernel_cc}") endif()
target_link_libraries(phi_gpu ${COMMON_KERNEL_DEPS})
set(ADD_PHI_KERNELS ${ADD_PHI_KERNELS} phi_gpu) if(WITH_XPU)
elseif(WITH_XPU)
if(WITH_XPU_KP) if(WITH_XPU_KP)
file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/kps/ file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/kps/
DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/kps/) DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/kps/)
...@@ -237,52 +149,23 @@ elseif(WITH_XPU) ...@@ -237,52 +149,23 @@ elseif(WITH_XPU)
file(RENAME ${kernel} "${CMAKE_CURRENT_BINARY_DIR}/kps/${name}.kps") file(RENAME ${kernel} "${CMAKE_CURRENT_BINARY_DIR}/kps/${name}.kps")
endforeach() endforeach()
file(GLOB kernel_xpu_kps "${CMAKE_CURRENT_BINARY_DIR}/kps/*.kps") file(GLOB kernel_xpu_kps "${CMAKE_CURRENT_BINARY_DIR}/kps/*.kps")
file(
GLOB kernel_cc_relative foreach(kernel ${kernel_cc})
RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
"*.cc"
"cpu/*.cc"
"legacy/*.cc"
"legacy/cpu/*.cc"
"selected_rows/*.cc"
"selected_rows/cpu/*.cc"
"sparse/*.cc"
"sparse/cpu/*.cc"
"strings/*.cc"
"strings/cpu/*.cc"
"fusion/*.cc"
"fusion/cpu/*.cc")
foreach(kernel ${kernel_cc_relative})
file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/${kernel} file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/${kernel}
DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/${kernel}) DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/${kernel})
endforeach() endforeach()
file(GLOB_RECURSE kernel_xpu_cc "${CMAKE_CURRENT_BINARY_DIR}/*.cc") file(GLOB_RECURSE kernel_xpu_cc "${CMAKE_CURRENT_BINARY_DIR}/*.cc")
xpu_add_library(
phi_xpu set(kernel_cc ${kernel_xpu_cc})
STATIC collect_generated_srcs(kernels_srcs SRCS ${kernel_xpu_kps})
${kernel_xpu}
${kernel_xpu_kps}
${kernel_xpu_cc}
DEPENDS
${COMMON_KERNEL_DEPS})
kernel_declare("${kernel_xpu_cc}")
else()
add_library(phi_xpu ${kernel_xpu} ${kernel_cc})
kernel_declare("${kernel_cc}")
endif() endif()
collect_srcs(kernels_srcs SRCS ${kernel_xpu})
kernel_declare("${kernel_xpu}") kernel_declare("${kernel_xpu}")
kernel_declare("${kernel_xpu_kps}") kernel_declare("${kernel_xpu_kps}")
target_link_libraries(phi_xpu ${COMMON_KERNEL_DEPS})
set(ADD_PHI_KERNELS ${ADD_PHI_KERNELS} phi_xpu)
else()
add_library(phi_cpu ${kernel_cc})
target_link_libraries(phi_cpu ${COMMON_KERNEL_DEPS})
kernel_declare("${kernel_cc}")
set(ADD_PHI_KERNELS phi_cpu)
endif() endif()
set_property(GLOBAL PROPERTY PHI_KERNELS ${ADD_PHI_KERNELS}) collect_srcs(kernels_srcs SRCS ${kernel_cc})
kernel_declare("${kernel_cc}")
if(NOT "${KERNEL_LIST}" STREQUAL "") if(NOT "${KERNEL_LIST}" STREQUAL "")
prune_declaration_h() prune_declaration_h()
......
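Note on the recurring collect_srcs(kernels_srcs SRCS ...) calls that replace the old per-directory cc_library/add_library targets in the hunks above and below: the helper itself is defined in CMake code outside this excerpt, so the following is only a minimal sketch, under assumed names, of how such a source-collecting function could gather files into a global property for the single phi target to consume later; it is not the PR's actual implementation.

# Hypothetical sketch only -- not the actual helper shipped by this PR.
# Accumulate source files from many subdirectories into one global
# property so that a single shared "phi" target can be built from them.
function(collect_srcs SRC_GROUP)
  set(multiValueArgs SRCS)
  cmake_parse_arguments(arg "" "" "${multiValueArgs}" ${ARGN})
  foreach(src ${arg_SRCS})
    # Store absolute paths so the entries remain valid outside this directory.
    get_filename_component(abs_src ${src} ABSOLUTE)
    set_property(GLOBAL APPEND PROPERTY ${SRC_GROUP} ${abs_src})
  endforeach()
endfunction()
# A top-level CMakeLists could then read the accumulated list, e.g.:
#   get_property(phi_kernel_srcs GLOBAL PROPERTY kernels_srcs)
#   add_library(phi SHARED ${phi_kernel_srcs})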
if(WITH_CUDNN_FRONTEND) collect_srcs(kernels_srcs SRCS cache.cc switch_autotune.cc)
cc_library(
cache
SRCS cache.cc
DEPS cudnn-frontend phi_enforce)
else()
cc_library(
cache
SRCS cache.cc
DEPS phi_enforce)
endif()
cc_library(
switch_autotune
SRCS switch_autotune.cc
DEPS cache flags)
...@@ -18,11 +18,11 @@ ...@@ -18,11 +18,11 @@
#include <unordered_map> #include <unordered_map>
#include <vector> #include <vector>
#include "gflags/gflags.h"
#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/errors.h" #include "paddle/phi/core/errors.h"
#include "paddle/phi/core/flags.h"
DECLARE_int32(search_cache_max_number); PHI_DECLARE_int32(search_cache_max_number);
inline void HashCombine(std::size_t* seed UNUSED) {} inline void HashCombine(std::size_t* seed UNUSED) {}
......
...@@ -105,10 +105,6 @@ struct RmsFunctor<T, phi::CPUContext> { ...@@ -105,10 +105,6 @@ struct RmsFunctor<T, phi::CPUContext> {
} }
}; };
template struct RmsFunctor<phi::GPUContext, float>;
template struct RmsFunctor<phi::GPUContext, double>;
template struct RmsFunctor<phi::GPUContext, phi::dtype::float16>;
} // namespace phi } // namespace phi
PD_REGISTER_KERNEL( PD_REGISTER_KERNEL(
rmsprop, CPU, ALL_LAYOUT, phi::RmspropDenseKernel, float, double) {} rmsprop, CPU, ALL_LAYOUT, phi::RmspropDenseKernel, float, double) {}
......
...@@ -4,67 +4,15 @@ add_subdirectory(lapack) ...@@ -4,67 +4,15 @@ add_subdirectory(lapack)
add_subdirectory(detail) add_subdirectory(detail)
add_subdirectory(jit) add_subdirectory(jit)
math_library(deformable_conv_functor DEPS dense_tensor) file(
math_library(concat_and_split_functor DEPS dense_tensor) GLOB func_cc_srcs
math_library(fc_functor DEPS blas jit_kernel_helper) RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
math_library(gpc DEPS phi_enforce) "*.cc")
math_library(gru_compute DEPS activation_functions math_function)
math_library(lstm_compute DEPS activation_functions)
math_library(math_function DEPS blas dense_tensor)
math_library(matrix_reduce DEPS dense_tensor)
math_library(matrix_inverse DEPS dense_tensor eigen3 blas)
math_library(pooling DEPS dense_tensor)
math_library(segment_pooling)
math_library(sequence2batch)
math_library(matrix_solve DEPS dense_tensor eigen3 blas math_function)
math_library(cross_entropy)
math_library(im2col)
math_library(vol2col)
math_library(softmax DEPS math_function)
math_library(maxouting)
math_library(matrix_bit_code)
math_library(sequence_scale)
math_library(sequence_padding DEPS lod_utils)
math_library(sequence_pooling DEPS math_function jit_kernel_helper)
cc_library(
phi_data_layout_transform
SRCS data_layout_transform.cc
DEPS tensor blas)
if(WITH_GPU OR WITH_ROCM) if(WITH_GPU OR WITH_ROCM)
if(MKL_FOUND AND WITH_ONEMKL) file(
math_library(fft spectral_op.cu DEPS dynload_cuda dynload_mklrt GLOB func_cu_srcs
dense_tensor) RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
target_include_directories(fft PRIVATE ${MKL_INCLUDE}) "*.cu")
else()
math_library(fft spectral_op.cu DEPS dynload_cuda dense_tensor pocketfft)
endif()
else()
if(MKL_FOUND AND WITH_ONEMKL)
mathp_library(fft DEPS dynload_mklrt dense_tensor)
target_include_directories(fft PRIVATE ${MKL_INCLUDE})
else()
math_library(fft DEPS dense_tensor pocketfft)
endif()
endif() endif()
if(WITH_MKLDNN) collect_srcs(kernels_srcs SRCS ${func_cc_srcs} ${func_cu_srcs})
math_library(selected_rows_functor DEPS selected_rows_utils math_function
blas mixed_vector)
else()
math_library(selected_rows_functor DEPS selected_rows_utils math_function
blas mixed_vector)
endif()
if(WITH_ROCM)
hip_library(
gather_scatter_functor
SRCS gather_scatter_functor.cc gather_scatter_functor.cu
DEPS tensor)
else()
cc_library(
gather_scatter_functor
SRCS gather_scatter_functor.cc gather_scatter_functor.cu
DEPS tensor)
endif()
cc_library( collect_srcs(kernels_srcs SRCS blas.cc)
blas
SRCS blas.cc
DEPS cblas framework_proto phi_backends)
...@@ -19,10 +19,11 @@ ...@@ -19,10 +19,11 @@
#include "paddle/phi/backends/dynload/cublas.h" #include "paddle/phi/backends/dynload/cublas.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/flags.h"
#include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/math_function.h"
DECLARE_bool(enable_cublas_tensor_op_math); PHI_DECLARE_bool(enable_cublas_tensor_op_math);
DECLARE_bool(gemm_use_half_precision_compute_type); PHI_DECLARE_bool(gemm_use_half_precision_compute_type);
namespace phi { namespace phi {
namespace funcs { namespace funcs {
......
cc_library(activation_functions SRCS avx_functions.cc) collect_srcs(kernels_srcs SRCS avx_functions.cc)
...@@ -6,19 +6,5 @@ file( ...@@ -6,19 +6,5 @@ file(
GLOB EIGEN_CU_SOURCES GLOB EIGEN_CU_SOURCES
RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
"*.cu") "*.cu")
if(WITH_GPU)
nv_library( collect_srcs(kernels_srcs SRCS ${EIGEN_CC_SOURCES} ${EIGEN_CU_SOURCES})
eigen_function
SRCS ${EIGEN_CC_SOURCES} ${EIGEN_CU_SOURCES}
DEPS eigen3)
elseif(WITH_ROCM)
hip_library(
eigen_function
SRCS ${EIGEN_CC_SOURCES} ${EIGEN_CU_SOURCES}
DEPS eigen3)
else()
cc_library(
eigen_function
SRCS ${EIGEN_CC_SOURCES}
DEPS eigen3)
endif()
...@@ -9,17 +9,13 @@ file(APPEND ${jit_file} "\#include \"paddle/phi/kernels/funcs/jit/helper.h\"\n") ...@@ -9,17 +9,13 @@ file(APPEND ${jit_file} "\#include \"paddle/phi/kernels/funcs/jit/helper.h\"\n")
file(APPEND ${jit_file} file(APPEND ${jit_file}
"\#include \"paddle/phi/kernels/funcs/jit/registry.h\"\n\n") "\#include \"paddle/phi/kernels/funcs/jit/registry.h\"\n\n")
set(JIT_KERNEL_DEPS device_context cblas gflags enforce place xxhash)
file( file(
GLOB jit_kernel_cc_srcs GLOB jit_kernel_cc_srcs
RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
"*.cc") "*.cc")
list(REMOVE_ITEM jit_kernel_cc_srcs test.cc benchmark.cc) list(REMOVE_ITEM jit_kernel_cc_srcs test.cc benchmark.cc)
cc_library(
jit_kernel_base collect_srcs(kernels_srcs SRCS ${jit_kernel_cc_srcs})
SRCS ${jit_kernel_cc_srcs}
DEPS ${JIT_KERNEL_DEPS})
copy_if_different(${jit_file} ${jit_file_final}) copy_if_different(${jit_file} ${jit_file_final})
...@@ -30,14 +26,11 @@ if(WITH_XBYAK) ...@@ -30,14 +26,11 @@ if(WITH_XBYAK)
add_subdirectory(gen) add_subdirectory(gen)
endif() endif()
cc_library(
jit_kernel_helper INTERFACE
SRCS ${jit_kernel_cc_srcs}
DEPS jit_kernel_base ${JIT_KERNEL_DEPS})
cc_test( cc_test(
jit_kernel_test jit_kernel_test
SRCS test.cc SRCS test.cc
DEPS jit_kernel_helper) DEPS phi)
if(NOT WIN32) if(NOT WIN32)
set(cuda_less12_and_gcc_greater12 false) set(cuda_less12_and_gcc_greater12 false)
if(DEFINED CMAKE_CUDA_COMPILER_VERSION) if(DEFINED CMAKE_CUDA_COMPILER_VERSION)
...@@ -47,14 +40,7 @@ if(NOT WIN32) ...@@ -47,14 +40,7 @@ if(NOT WIN32)
endif() endif()
endif() endif()
if(NOT cuda_less12_and_gcc_greater12) if(NOT cuda_less12_and_gcc_greater12)
cc_binary( cc_binary(jit_kernel_benchmark SRCS benchmark.cc DEPS phi)
jit_kernel_benchmark
SRCS
benchmark.cc
DEPS
jit_kernel_helper
phi_device_tracer
tensor)
endif() endif()
endif() endif()
if(WITH_TESTING AND TEST jit_kernel_test) if(WITH_TESTING AND TEST jit_kernel_test)
......
...@@ -3,13 +3,7 @@ file( ...@@ -3,13 +3,7 @@ file(
RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
"*.cc") "*.cc")
cc_library( collect_srcs(kernels_srcs SRCS ${jitcode_cc_srcs})
jit_kernel_jitcode
SRCS ${jitcode_cc_srcs}
DEPS jit_kernel_base xbyak)
set(JIT_KERNEL_DEPS
${JIT_KERNEL_DEPS} xbyak jit_kernel_jitcode
PARENT_SCOPE)
function(USE_JITKERNEL_GEN TARGET) function(USE_JITKERNEL_GEN TARGET)
file(APPEND ${jit_file} "USE_JITKERNEL_GEN(${TARGET});\n") file(APPEND ${jit_file} "USE_JITKERNEL_GEN(${TARGET});\n")
......
...@@ -33,7 +33,7 @@ namespace jit { ...@@ -33,7 +33,7 @@ namespace jit {
class GenBase : public Kernel { class GenBase : public Kernel {
public: public:
virtual ~GenBase() = default; virtual ~GenBase() {}
virtual std::string name() const = 0; virtual std::string name() const = 0;
virtual size_t getSize() const = 0; virtual size_t getSize() const = 0;
virtual const unsigned char* getCodeInternal() const = 0; virtual const unsigned char* getCodeInternal() const = 0;
......
...@@ -12,7 +12,3 @@ endif() ...@@ -12,7 +12,3 @@ endif()
# mix should be last # mix should be last
add_subdirectory(mix) add_subdirectory(mix)
set(JIT_KERNEL_DEPS
${JIT_KERNEL_DEPS}
PARENT_SCOPE)
...@@ -2,14 +2,8 @@ file( ...@@ -2,14 +2,8 @@ file(
GLOB jit_kernel_cc_intrinsic GLOB jit_kernel_cc_intrinsic
RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
"*.cc") "*.cc")
cc_library(
jit_kernel_intrinsic
SRCS ${jit_kernel_cc_intrinsic}
DEPS jit_kernel_base)
set(JIT_KERNEL_DEPS collect_srcs(kernels_srcs SRCS ${jit_kernel_cc_intrinsic})
${JIT_KERNEL_DEPS} jit_kernel_intrinsic
PARENT_SCOPE)
# use mkl kernels by name and type # use mkl kernels by name and type
use_jitkernel_more(kCRFDecoding, intrinsic) use_jitkernel_more(kCRFDecoding, intrinsic)
......
...@@ -2,14 +2,8 @@ file( ...@@ -2,14 +2,8 @@ file(
GLOB jit_kernel_mix_cc GLOB jit_kernel_mix_cc
RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
"*.cc") "*.cc")
cc_library(
jit_kernel_mix
SRCS ${jit_kernel_mix_cc}
DEPS jit_kernel_base)
set(JIT_KERNEL_DEPS collect_srcs(kernels_srcs SRCS ${jit_kernel_mix_cc})
${JIT_KERNEL_DEPS} jit_kernel_mix
PARENT_SCOPE)
use_jitkernel_more(kVSigmoid, mix) use_jitkernel_more(kVSigmoid, mix)
use_jitkernel_more(kVTanh, mix) use_jitkernel_more(kVTanh, mix)
......
cc_library( collect_srcs(kernels_srcs SRCS mkl.cc)
jit_kernel_mkl
SRCS mkl.cc
DEPS jit_kernel_base dynload_mklml)
set(JIT_KERNEL_DEPS
${JIT_KERNEL_DEPS} dynload_mklml jit_kernel_mkl
PARENT_SCOPE)
# use mkl kernels by name and type # use mkl kernels by name and type
use_jitkernel_more(kMatMul, mkl) use_jitkernel_more(kMatMul, mkl)
......
cc_library( collect_srcs(kernels_srcs SRCS refer.cc)
jit_kernel_refer
SRCS refer.cc
DEPS jit_kernel_base)
set(JIT_KERNEL_DEPS
${JIT_KERNEL_DEPS} jit_kernel_refer
PARENT_SCOPE)
function(USE_JITKERNEL_REFER TARGET) function(USE_JITKERNEL_REFER TARGET)
file(APPEND ${jit_file} "USE_JITKERNEL_REFER(${TARGET});\n") file(APPEND ${jit_file} "USE_JITKERNEL_REFER(${TARGET});\n")
......
math_library(lapack_function DEPS phi_dynload_lapack) collect_srcs(kernels_srcs SRCS lapack_function.cc)
...@@ -25,6 +25,7 @@ limitations under the License. */ ...@@ -25,6 +25,7 @@ limitations under the License. */
namespace phi { namespace phi {
namespace funcs { namespace funcs {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <typename T> template <typename T>
void BatchTranspose(T* output, void BatchTranspose(T* output,
const T* input, const T* input,
...@@ -32,7 +33,7 @@ void BatchTranspose(T* output, ...@@ -32,7 +33,7 @@ void BatchTranspose(T* output,
int64_t m, int64_t m,
int64_t n, int64_t n,
const phi::GPUContext* dev_ctx); const phi::GPUContext* dev_ctx);
#endif
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
struct TransposeNormal { struct TransposeNormal {
// for dims >= 7 situation // for dims >= 7 situation
......
...@@ -12,17 +12,21 @@ ...@@ -12,17 +12,21 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/autogen/memory_efficient_attention.h" #include "glog/logging.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/errors.h"
#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/errors.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/autogen/memory_efficient_attention.h"
#include "paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/gemm_kernel_utils.h"
#include "paddle/phi/kernels/fusion/cutlass/memory_efficient_attention_utils.h" #include "paddle/phi/kernels/fusion/cutlass/memory_efficient_attention_utils.h"
namespace phi { namespace phi {
namespace fusion { namespace fusion {
namespace cutlass_internal { namespace cutlass_internal {
using gemm_kernel_utils::getMaximumSharedMemoryPerBlockKb;
template <typename T, typename Context> template <typename T, typename Context>
void MemoryEfficientAttentionForwardKernel( void MemoryEfficientAttentionForwardKernel(
const Context& ctx, const Context& ctx,
...@@ -124,9 +128,9 @@ void MemoryEfficientAttentionForwardKernel( ...@@ -124,9 +128,9 @@ void MemoryEfficientAttentionForwardKernel(
VLOG(3) << "kAlignLSE" << kAlignLSE; VLOG(3) << "kAlignLSE" << kAlignLSE;
typename KernelType::Params p; typename KernelType::Params p;
p.query_ptr = SafeGetTensorPtr<scalar_t>(query); p.query_ptr = phi::SafeGetTensorPtr<scalar_t>(query);
p.key_ptr = SafeGetTensorPtr<scalar_t>(key); p.key_ptr = phi::SafeGetTensorPtr<scalar_t>(key);
p.value_ptr = SafeGetTensorPtr<scalar_t>(value); p.value_ptr = phi::SafeGetTensorPtr<scalar_t>(value);
p.logsumexp_ptr = is_test ? nullptr : logsumexp->data<float>(); p.logsumexp_ptr = is_test ? nullptr : logsumexp->data<float>();
VLOG(3) << "logsumexp_ptr" << p.logsumexp_ptr; VLOG(3) << "logsumexp_ptr" << p.logsumexp_ptr;
...@@ -134,19 +138,19 @@ void MemoryEfficientAttentionForwardKernel( ...@@ -134,19 +138,19 @@ void MemoryEfficientAttentionForwardKernel(
if (KernelType::kNeedsOutputAccumulatorBuffer) { if (KernelType::kNeedsOutputAccumulatorBuffer) {
out_accum.Resize(output->dims()); out_accum.Resize(output->dims());
p.output_accum_ptr = p.output_accum_ptr =
SafeAllocTensor<typename KernelType::output_accum_t, Context>( phi::SafeAllocTensor<typename KernelType::output_accum_t, Context>(
ctx, &out_accum); ctx, &out_accum);
VLOG(3) << "output_accum_ptr " << p.output_accum_ptr; VLOG(3) << "output_accum_ptr " << p.output_accum_ptr;
} else { } else {
p.output_accum_ptr = nullptr; p.output_accum_ptr = nullptr;
} }
p.output_ptr = p.output_ptr = phi::SafeAllocTensor<typename KernelType::output_t, Context>(
SafeAllocTensor<typename KernelType::output_t, Context>(ctx, output); ctx, output);
VLOG(3) << "output_ptr " << p.output_ptr; VLOG(3) << "output_ptr " << p.output_ptr;
if (cu_seqlens_q) { if (cu_seqlens_q) {
p.seqstart_q_ptr = SafeGetTensorPtr<int32_t>(cu_seqlens_q); p.seqstart_q_ptr = phi::SafeGetTensorPtr<int32_t>(cu_seqlens_q);
p.seqstart_k_ptr = SafeGetTensorPtr<int32_t>(cu_seqlens_k); p.seqstart_k_ptr = phi::SafeGetTensorPtr<int32_t>(cu_seqlens_k);
VLOG(3) << "seqstart_q_ptr " << p.seqstart_q_ptr; VLOG(3) << "seqstart_q_ptr " << p.seqstart_q_ptr;
} else { } else {
p.seqstart_q_ptr = nullptr; p.seqstart_q_ptr = nullptr;
...@@ -164,7 +168,7 @@ void MemoryEfficientAttentionForwardKernel( ...@@ -164,7 +168,7 @@ void MemoryEfficientAttentionForwardKernel(
cu_seqlens_q ? cu_seqlens_q.get().dims()[0] - 1 : q_dims[0]); cu_seqlens_q ? cu_seqlens_q.get().dims()[0] - 1 : q_dims[0]);
p.causal = causal; p.causal = causal;
if (causal_diagonal) { if (causal_diagonal) {
p.causal_diagonal_ptr = SafeGetTensorPtr<int32_t>(causal_diagonal); p.causal_diagonal_ptr = phi::SafeGetTensorPtr<int32_t>(causal_diagonal);
} else { } else {
p.causal_diagonal_ptr = nullptr; p.causal_diagonal_ptr = nullptr;
} }
...@@ -172,7 +176,7 @@ void MemoryEfficientAttentionForwardKernel( ...@@ -172,7 +176,7 @@ void MemoryEfficientAttentionForwardKernel(
p.seqlen_k_ptr = nullptr; p.seqlen_k_ptr = nullptr;
if (seqlen_k) { if (seqlen_k) {
p.seqlen_k_ptr = SafeGetTensorPtr<int32_t>(seqlen_k); p.seqlen_k_ptr = phi::SafeGetTensorPtr<int32_t>(seqlen_k);
} else { } else {
p.seqlen_k_ptr = nullptr; p.seqlen_k_ptr = nullptr;
} }
...@@ -197,7 +201,7 @@ void MemoryEfficientAttentionForwardKernel( ...@@ -197,7 +201,7 @@ void MemoryEfficientAttentionForwardKernel(
PD_MEA_CHECK_OVERFLOW(p.o_strideM, DimStride(output->dims(), 1)); PD_MEA_CHECK_OVERFLOW(p.o_strideM, DimStride(output->dims(), 1));
if (bias) { if (bias) {
p.attn_bias_ptr = SafeGetTensorPtr<scalar_t>(bias); p.attn_bias_ptr = phi::SafeGetTensorPtr<scalar_t>(bias);
PD_MEA_CHECK_OVERFLOW( PD_MEA_CHECK_OVERFLOW(
p.bias_strideB, p.bias_strideB,
GetMemoryEfficientBiasStrideB(bias.get().dims(), q_dims, k_dims)); GetMemoryEfficientBiasStrideB(bias.get().dims(), q_dims, k_dims));
...@@ -215,7 +219,8 @@ void MemoryEfficientAttentionForwardKernel( ...@@ -215,7 +219,8 @@ void MemoryEfficientAttentionForwardKernel(
seed_dims[0] = 2; seed_dims[0] = 2;
seed_and_offset->Resize(seed_dims); seed_and_offset->Resize(seed_dims);
ctx.template HostAlloc<int64_t>(seed_and_offset); ctx.template HostAlloc<int64_t>(seed_and_offset);
int64_t* seed_and_offset_ptr = SafeGetTensorPtr<int64_t>(seed_and_offset); int64_t* seed_and_offset_ptr =
phi::SafeGetTensorPtr<int64_t>(seed_and_offset);
auto gen = ctx.GetGenerator(); auto gen = ctx.GetGenerator();
uint64_t inc = query.dims()[0] * query.dims()[2] * 32; uint64_t inc = query.dims()[0] * query.dims()[2] * 32;
...@@ -254,10 +259,10 @@ void MemoryEfficientAttentionForwardKernel( ...@@ -254,10 +259,10 @@ void MemoryEfficientAttentionForwardKernel(
ctx.stream()>>>(p); ctx.stream()>>>(p);
}; };
dispatch_cutlass_forward<T>(ctx, launchKernel); dispatch_cutlass_forward<T>(ctx, launchKernel);
PADDLE_ENFORCE_EQ(kernel_launched, PADDLE_ENFORCE_EQ(
kernel_launched,
true, true,
paddle::platform::errors::InvalidArgument( phi::errors::InvalidArgument("the kernel should not be launched"));
"the kernel should not be launched"));
} }
} // namespace cutlass_internal } // namespace cutlass_internal
......
...@@ -15,16 +15,16 @@ ...@@ -15,16 +15,16 @@
#include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/errors.h" #include "paddle/fluid/platform/errors.h"
#include "paddle/phi/api/include/tensor_operants.h" #include "paddle/phi/api/include/tensor_operants.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/autogen/memory_efficient_attention.h"
#include "paddle/phi/kernels/fusion/cutlass/memory_efficient_attention_utils.h"
#include "paddle/phi/kernels/cast_kernel.h" #include "paddle/phi/kernels/cast_kernel.h"
#include "paddle/phi/kernels/cum_kernel.h" #include "paddle/phi/kernels/cum_kernel.h"
#include "paddle/phi/kernels/elementwise_add_kernel.h" #include "paddle/phi/kernels/elementwise_add_kernel.h"
#include "paddle/phi/kernels/elementwise_multiply_kernel.h" #include "paddle/phi/kernels/elementwise_multiply_kernel.h"
#include "paddle/phi/kernels/funcs/get_pad_lse.cu.h" #include "paddle/phi/kernels/funcs/get_pad_lse.cu.h"
#include "paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/autogen/memory_efficient_attention.h"
#include "paddle/phi/kernels/fusion/cutlass/memory_efficient_attention_utils.h"
#include "paddle/phi/kernels/matmul_kernel.h" #include "paddle/phi/kernels/matmul_kernel.h"
#include "paddle/phi/kernels/reduce_sum_kernel.h" #include "paddle/phi/kernels/reduce_sum_kernel.h"
#include "paddle/phi/kernels/reshape_kernel.h" #include "paddle/phi/kernels/reshape_kernel.h"
...@@ -34,6 +34,8 @@ namespace phi { ...@@ -34,6 +34,8 @@ namespace phi {
namespace fusion { namespace fusion {
namespace cutlass_internal { namespace cutlass_internal {
using gemm_kernel_utils::getMaximumSharedMemoryPerBlockKb;
template <typename T, typename Context> template <typename T, typename Context>
void MemoryEfficientAttentionBackwardKernel( void MemoryEfficientAttentionBackwardKernel(
const Context& ctx, const Context& ctx,
...@@ -387,9 +389,9 @@ void MemoryEfficientAttentionBackwardKernel( ...@@ -387,9 +389,9 @@ void MemoryEfficientAttentionBackwardKernel(
VLOG(3) << "delta has been set" << delta.data(); VLOG(3) << "delta has been set" << delta.data();
typename KernelType::Params p; typename KernelType::Params p;
p.query_ptr = SafeGetTensorPtr<scalar_t>(query); p.query_ptr = phi::SafeGetTensorPtr<scalar_t>(query);
p.key_ptr = SafeGetTensorPtr<scalar_t>(key); p.key_ptr = phi::SafeGetTensorPtr<scalar_t>(key);
p.value_ptr = SafeGetTensorPtr<scalar_t>(value); p.value_ptr = phi::SafeGetTensorPtr<scalar_t>(value);
bool force_pad_inf = (compute_capacity == 75); bool force_pad_inf = (compute_capacity == 75);
const std::string data_format = "NCHW"; const std::string data_format = "NCHW";
...@@ -400,14 +402,14 @@ void MemoryEfficientAttentionBackwardKernel( ...@@ -400,14 +402,14 @@ void MemoryEfficientAttentionBackwardKernel(
32, 32,
data_format, data_format,
force_pad_inf); force_pad_inf);
p.logsumexp_ptr = SafeGetTensorPtr<float>(padded_lse); p.logsumexp_ptr = phi::SafeGetTensorPtr<float>(padded_lse);
VLOG(3) << "logsumexp_ptr" << p.logsumexp_ptr; VLOG(3) << "logsumexp_ptr" << p.logsumexp_ptr;
p.output_ptr = SafeGetTensorPtr<scalar_t>(output); p.output_ptr = phi::SafeGetTensorPtr<scalar_t>(output);
p.grad_output_ptr = SafeGetTensorPtr<scalar_t>(output_grad); p.grad_output_ptr = phi::SafeGetTensorPtr<scalar_t>(output_grad);
p.grad_query_ptr = SafeAllocTensor<scalar_t, Context>(ctx, query_grad); p.grad_query_ptr = phi::SafeAllocTensor<scalar_t, Context>(ctx, query_grad);
p.grad_key_ptr = SafeAllocTensor<scalar_t, Context>(ctx, key_grad); p.grad_key_ptr = phi::SafeAllocTensor<scalar_t, Context>(ctx, key_grad);
p.grad_value_ptr = SafeAllocTensor<scalar_t, Context>(ctx, value_grad); p.grad_value_ptr = phi::SafeAllocTensor<scalar_t, Context>(ctx, value_grad);
p.delta_ptr = SafeGetTensorPtr<float>(delta); p.delta_ptr = phi::SafeGetTensorPtr<float>(delta);
PD_MEA_CHECK_OVERFLOW(p.head_dim, q_dims[3]); PD_MEA_CHECK_OVERFLOW(p.head_dim, q_dims[3]);
PD_MEA_CHECK_OVERFLOW(p.head_dim_value, v_dims[3]); PD_MEA_CHECK_OVERFLOW(p.head_dim_value, v_dims[3]);
...@@ -427,8 +429,8 @@ void MemoryEfficientAttentionBackwardKernel( ...@@ -427,8 +429,8 @@ void MemoryEfficientAttentionBackwardKernel(
VLOG(3) << "p.scale" << p.scale; VLOG(3) << "p.scale" << p.scale;
if (cu_seqlens_q) { if (cu_seqlens_q) {
p.cu_seqlens_q_ptr = SafeGetTensorPtr<int32_t>(cu_seqlens_q); p.cu_seqlens_q_ptr = phi::SafeGetTensorPtr<int32_t>(cu_seqlens_q);
p.cu_seqlens_k_ptr = SafeGetTensorPtr<int32_t>(cu_seqlens_k); p.cu_seqlens_k_ptr = phi::SafeGetTensorPtr<int32_t>(cu_seqlens_k);
VLOG(3) << "p.cu_seqlens_q_ptr" << p.cu_seqlens_q_ptr; VLOG(3) << "p.cu_seqlens_q_ptr" << p.cu_seqlens_q_ptr;
} }
...@@ -483,7 +485,7 @@ void MemoryEfficientAttentionBackwardKernel( ...@@ -483,7 +485,7 @@ void MemoryEfficientAttentionBackwardKernel(
PD_MEA_CHECK_OVERFLOW(p.delta_strideB, DimStride(delta.dims(), 0)); PD_MEA_CHECK_OVERFLOW(p.delta_strideB, DimStride(delta.dims(), 0));
if (bias) { if (bias) {
p.bias_ptr = SafeGetTensorPtr<scalar_t>(bias); p.bias_ptr = phi::SafeGetTensorPtr<scalar_t>(bias);
PD_MEA_CHECK_OVERFLOW( PD_MEA_CHECK_OVERFLOW(
p.bias_strideB, p.bias_strideB,
GetMemoryEfficientBiasStrideB(bias.get().dims(), q_dims, k_dims)); GetMemoryEfficientBiasStrideB(bias.get().dims(), q_dims, k_dims));
...@@ -491,7 +493,8 @@ void MemoryEfficientAttentionBackwardKernel( ...@@ -491,7 +493,8 @@ void MemoryEfficientAttentionBackwardKernel(
PD_MEA_CHECK_OVERFLOW(p.bias_strideM, k_dims[1]); PD_MEA_CHECK_OVERFLOW(p.bias_strideM, k_dims[1]);
VLOG(3) << "p.bias_ptr" << p.bias_ptr; VLOG(3) << "p.bias_ptr" << p.bias_ptr;
if (bias_grad) { if (bias_grad) {
p.grad_bias_ptr = SafeAllocTensor<scalar_t, Context>(ctx, bias_grad); p.grad_bias_ptr =
phi::SafeAllocTensor<scalar_t, Context>(ctx, bias_grad);
PD_MEA_CHECK_OVERFLOW(p.gB_strideB, q_dims[2] * q_dims[1] * k_dims[1]); PD_MEA_CHECK_OVERFLOW(p.gB_strideB, q_dims[2] * q_dims[1] * k_dims[1]);
PD_MEA_CHECK_OVERFLOW(p.gB_strideH, q_dims[1] * k_dims[1]); PD_MEA_CHECK_OVERFLOW(p.gB_strideH, q_dims[1] * k_dims[1]);
PD_MEA_CHECK_OVERFLOW(p.gB_strideM, k_dims[1]); PD_MEA_CHECK_OVERFLOW(p.gB_strideM, k_dims[1]);
...@@ -504,7 +507,8 @@ void MemoryEfficientAttentionBackwardKernel( ...@@ -504,7 +507,8 @@ void MemoryEfficientAttentionBackwardKernel(
p.grad_bias_ptr = nullptr; p.grad_bias_ptr = nullptr;
} }
if (dropout_p != 0) { if (dropout_p != 0) {
int64_t* seed_and_offset_ptr = SafeGetTensorPtr<int64_t>(seed_and_offset); int64_t* seed_and_offset_ptr =
phi::SafeGetTensorPtr<int64_t>(seed_and_offset);
p.seed = (uint64_t)seed_and_offset_ptr[0]; p.seed = (uint64_t)seed_and_offset_ptr[0];
p.offset = (uint64_t)seed_and_offset_ptr[1]; p.offset = (uint64_t)seed_and_offset_ptr[1];
p.dropout_prob = dropout_p; p.dropout_prob = dropout_p;
...@@ -514,9 +518,9 @@ void MemoryEfficientAttentionBackwardKernel( ...@@ -514,9 +518,9 @@ void MemoryEfficientAttentionBackwardKernel(
} }
int64_t size_bytes = p.workspace_size(); int64_t size_bytes = p.workspace_size();
paddle::memory::AllocationPtr temp_workspace{nullptr}; phi::Allocator::AllocationPtr temp_workspace{nullptr};
VLOG(3) << "size_bytes " << size_bytes; VLOG(3) << "size_bytes " << size_bytes;
temp_workspace = paddle::memory::Alloc( temp_workspace = phi::memory_utils::Alloc(
ctx.GetPlace(), ctx.GetPlace(),
size_bytes, size_bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream()))); phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
......
...@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifndef PADDLE_WITH_HIP
#include "paddle/phi/kernels/eigvalsh_kernel.h" #include "paddle/phi/kernels/eigvalsh_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
...@@ -29,3 +31,5 @@ PD_REGISTER_KERNEL(eigvalsh, // cuda_only ...@@ -29,3 +31,5 @@ PD_REGISTER_KERNEL(eigvalsh, // cuda_only
phi::dtype::complex<double>) { phi::dtype::complex<double>) {
kernel->InputAt(1).SetDataType(phi::dtype::ToReal(kernel_key.dtype())); kernel->InputAt(1).SetDataType(phi::dtype::ToReal(kernel_key.dtype()));
} }
#endif // not PADDLE_WITH_HIP
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#include "paddle/phi/core/flags.h" #include "paddle/phi/core/flags.h"
#include "paddle/phi/kernels/funcs/aligned_vector.h" #include "paddle/phi/kernels/funcs/aligned_vector.h"
DECLARE_bool(use_fast_math); PHI_DECLARE_bool(use_fast_math);
namespace phi { namespace phi {
......
...@@ -52,19 +52,6 @@ struct GetTensorValue<phi::CPUContext, T> { ...@@ -52,19 +52,6 @@ struct GetTensorValue<phi::CPUContext, T> {
} }
}; };
template <typename T>
struct GetTensorValue<phi::GPUContext, T> {
T operator()(const phi::GPUContext& dev_ctx,
const DenseTensor& tensor) const {
const T* data = tensor.data<T>();
T value;
const auto gpu_place = dev_ctx.GetPlace();
memory_utils::Copy(
phi::CPUPlace(), &value, gpu_place, data, sizeof(T), dev_ctx.stream());
return value;
}
};
template <typename T> template <typename T>
struct IscloseFunctor<phi::CPUContext, T> { struct IscloseFunctor<phi::CPUContext, T> {
void operator()(const phi::CPUContext& ctx, void operator()(const phi::CPUContext& ctx,
...@@ -127,6 +114,19 @@ __global__ void IscloseCUDAKernel(const T* in_data, ...@@ -127,6 +114,19 @@ __global__ void IscloseCUDAKernel(const T* in_data,
} }
} }
template <typename T>
struct GetTensorValue<phi::GPUContext, T> {
T operator()(const phi::GPUContext& dev_ctx,
const DenseTensor& tensor) const {
const T* data = tensor.data<T>();
T value;
const auto gpu_place = dev_ctx.GetPlace();
memory_utils::Copy(
phi::CPUPlace(), &value, gpu_place, data, sizeof(T), dev_ctx.stream());
return value;
}
};
template <typename T> template <typename T>
struct IscloseFunctor<phi::GPUContext, T> { struct IscloseFunctor<phi::GPUContext, T> {
void operator()(const phi::GPUContext& dev_ctx, void operator()(const phi::GPUContext& dev_ctx,
......
...@@ -30,7 +30,7 @@ void LaunchEigenPadding( ...@@ -30,7 +30,7 @@ void LaunchEigenPadding(
const DDim& in_dims, const DDim& in_dims,
const DenseTensor* d_out, const DenseTensor* d_out,
const DDim& out_dims, const DDim& out_dims,
const Eigen::array<std::pair<int64_t, int64_t>, D>& paddings) { const std::array<std::pair<int64_t, int64_t>, D>& paddings) {
auto& place = *context.eigen_device(); auto& place = *context.eigen_device();
auto d_in_t = EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From( auto d_in_t = EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(
*d_input, in_dims); *d_input, in_dims);
...@@ -40,7 +40,7 @@ void LaunchEigenPadding( ...@@ -40,7 +40,7 @@ void LaunchEigenPadding(
if (d_input->numel() <= Eigen::NumTraits<int>::highest()) { if (d_input->numel() <= Eigen::NumTraits<int>::highest()) {
// similar to tf.pad: // similar to tf.pad:
// if element number less than INT_MAX, change the type of index to int // if element number less than INT_MAX, change the type of index to int
Eigen::array<std::pair<int, int>, D> paddings_32bit; std::array<std::pair<int, int>, D> paddings_32bit;
for (size_t i = 0; i < D; i++) { for (size_t i = 0; i < D; i++) {
paddings_32bit[i] = std::make_pair(paddings[i].first, paddings[i].second); paddings_32bit[i] = std::make_pair(paddings[i].first, paddings[i].second);
} }
...@@ -63,7 +63,7 @@ void EigenPaddingCompute( ...@@ -63,7 +63,7 @@ void EigenPaddingCompute(
const DDim& in_dims, const DDim& in_dims,
const DenseTensor* d_out, const DenseTensor* d_out,
const DDim& out_dims, const DDim& out_dims,
const Eigen::array<std::pair<int64_t, int64_t>, D>& paddings) { const std::array<std::pair<int64_t, int64_t>, D>& paddings) {
if (D <= 3) { if (D <= 3) {
// if dimension less than 3, cannot reduce dimension // if dimension less than 3, cannot reduce dimension
LaunchEigenPadding<T, Context, D>( LaunchEigenPadding<T, Context, D>(
...@@ -97,7 +97,7 @@ void EigenPaddingCompute( ...@@ -97,7 +97,7 @@ void EigenPaddingCompute(
// only last dimension need padding, // only last dimension need padding,
// reshape the dimension of tensor in 2: [preceding, padding] // reshape the dimension of tensor in 2: [preceding, padding]
std::vector<int64_t> in_tore_shape(2, 1), out_tore_shape(2, 1); std::vector<int64_t> in_tore_shape(2, 1), out_tore_shape(2, 1);
Eigen::array<std::pair<int64_t, int64_t>, 2> reshaped_padding; std::array<std::pair<int64_t, int64_t>, 2> reshaped_padding;
// first dimension is the accumulate of preceding dimension // first dimension is the accumulate of preceding dimension
for (int i = 0; i < pad_dim; i++) { for (int i = 0; i < pad_dim; i++) {
...@@ -119,7 +119,7 @@ void EigenPaddingCompute( ...@@ -119,7 +119,7 @@ void EigenPaddingCompute(
reshaped_padding[1].first = paddings[pad_dim].first; reshaped_padding[1].first = paddings[pad_dim].first;
reshaped_padding[1].second = paddings[pad_dim].second; reshaped_padding[1].second = paddings[pad_dim].second;
LaunchEigenPadding<T, Context>(context, LaunchEigenPadding<T, Context, 2>(context,
d_input, d_input,
reshaped_in_dims, reshaped_in_dims,
d_out, d_out,
...@@ -130,7 +130,7 @@ void EigenPaddingCompute( ...@@ -130,7 +130,7 @@ void EigenPaddingCompute(
// reshape the dimension of tensor in 2: [padding, succeeding] // reshape the dimension of tensor in 2: [padding, succeeding]
// similar to (D - 1) // similar to (D - 1)
std::vector<int64_t> in_tore_shape(2, 1), out_tore_shape(2, 1); std::vector<int64_t> in_tore_shape(2, 1), out_tore_shape(2, 1);
Eigen::array<std::pair<int64_t, int64_t>, 2> reshaped_padding; std::array<std::pair<int64_t, int64_t>, 2> reshaped_padding;
// first dimension is the padding dimension // first dimension is the padding dimension
in_tore_shape[0] = in_dims[pad_dim]; in_tore_shape[0] = in_dims[pad_dim];
...@@ -163,7 +163,7 @@ void EigenPaddingCompute( ...@@ -163,7 +163,7 @@ void EigenPaddingCompute(
// reshape the dimension of tensor in 3: // reshape the dimension of tensor in 3:
// [preceding, padding, succeeding] // [preceding, padding, succeeding]
std::vector<int64_t> in_tore_shape(3, 1), out_tore_shape(3, 1); std::vector<int64_t> in_tore_shape(3, 1), out_tore_shape(3, 1);
Eigen::array<std::pair<int64_t, int64_t>, 3> reshaped_padding; std::array<std::pair<int64_t, int64_t>, 3> reshaped_padding;
// first dimension is the accumulate of preceding dimension // first dimension is the accumulate of preceding dimension
for (int i = 0; i < pad_dim; i++) { for (int i = 0; i < pad_dim; i++) {
...@@ -261,7 +261,7 @@ void SliceGradCompute(const Context& ctx, ...@@ -261,7 +261,7 @@ void SliceGradCompute(const Context& ctx,
offsets[axis] = start; offsets[axis] = start;
} }
Eigen::array<std::pair<int64_t, int64_t>, D> paddings; std::array<std::pair<int64_t, int64_t>, D> paddings;
for (size_t i = 0; i < paddings.size(); ++i) { for (size_t i = 0; i < paddings.size(); ++i) {
paddings[i].first = offsets[i]; paddings[i].first = offsets[i];
paddings[i].second = (in_dims[i] - out_dims[i]) - offsets[i]; paddings[i].second = (in_dims[i] - out_dims[i]) - offsets[i];
......
...@@ -112,6 +112,7 @@ void TransferLayoutGeneral(const Context& dev_ctx, ...@@ -112,6 +112,7 @@ void TransferLayoutGeneral(const Context& dev_ctx,
} }
} }
#endif #endif
PD_VISIT_ALL_TYPES(x.dtype(), "CastDataLayout", ([&] { PD_VISIT_ALL_TYPES(x.dtype(), "CastDataLayout", ([&] {
CastDataLayout<data_t, Context>(dev_ctx, x, axis, out); CastDataLayout<data_t, Context>(dev_ctx, x, axis, out);
})); }));
......
...@@ -3568,6 +3568,7 @@ function run_setup_mac(){ ...@@ -3568,6 +3568,7 @@ function run_setup_mac(){
if [ -d "/Library/Frameworks/Python.framework/Versions/3.7" ]; then if [ -d "/Library/Frameworks/Python.framework/Versions/3.7" ]; then
export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.7/lib/ export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.7/lib/
export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:/Library/Frameworks/Python.framework/Versions/3.7/lib/ export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:/Library/Frameworks/Python.framework/Versions/3.7/lib/
export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${PADDLE_ROOT}/build/third_party/install/lapack/lib
export PATH=/Library/Frameworks/Python.framework/Versions/3.7/bin/:${PATH} export PATH=/Library/Frameworks/Python.framework/Versions/3.7/bin/:${PATH}
#after changing "PYTHON_LIBRARY:FILEPATH" to "PYTHON_LIBRARY" ,we can use export #after changing "PYTHON_LIBRARY:FILEPATH" to "PYTHON_LIBRARY" ,we can use export
export PYTHON_EXECUTABLE=/Library/Frameworks/Python.framework/Versions/3.7/bin/python3 export PYTHON_EXECUTABLE=/Library/Frameworks/Python.framework/Versions/3.7/bin/python3
...@@ -3581,6 +3582,7 @@ function run_setup_mac(){ ...@@ -3581,6 +3582,7 @@ function run_setup_mac(){
if [ -d "/Library/Frameworks/Python.framework/Versions/3.8" ]; then if [ -d "/Library/Frameworks/Python.framework/Versions/3.8" ]; then
export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.8/lib/ export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.8/lib/
export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:/Library/Frameworks/Python.framework/Versions/3.8/lib/ export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:/Library/Frameworks/Python.framework/Versions/3.8/lib/
export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${PADDLE_ROOT}/build/third_party/install/lapack/lib
export PATH=/Library/Frameworks/Python.framework/Versions/3.8/bin/:${PATH} export PATH=/Library/Frameworks/Python.framework/Versions/3.8/bin/:${PATH}
#after changing "PYTHON_LIBRARY:FILEPATH" to "PYTHON_LIBRARY" ,we can use export #after changing "PYTHON_LIBRARY:FILEPATH" to "PYTHON_LIBRARY" ,we can use export
export PYTHON_EXECUTABLE=/Library/Frameworks/Python.framework/Versions/3.8/bin/python3 export PYTHON_EXECUTABLE=/Library/Frameworks/Python.framework/Versions/3.8/bin/python3
...@@ -3594,6 +3596,7 @@ function run_setup_mac(){ ...@@ -3594,6 +3596,7 @@ function run_setup_mac(){
if [ -d "/Library/Frameworks/Python.framework/Versions/3.9" ]; then if [ -d "/Library/Frameworks/Python.framework/Versions/3.9" ]; then
export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.9/lib/ export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.9/lib/
export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:/Library/Frameworks/Python.framework/Versions/3.9/lib/ export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:/Library/Frameworks/Python.framework/Versions/3.9/lib/
export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${PADDLE_ROOT}/build/third_party/install/lapack/lib
export PATH=/Library/Frameworks/Python.framework/Versions/3.9/bin/:${PATH} export PATH=/Library/Frameworks/Python.framework/Versions/3.9/bin/:${PATH}
#after changing "PYTHON_LIBRARY:FILEPATH" to "PYTHON_LIBRARY" ,we can use export #after changing "PYTHON_LIBRARY:FILEPATH" to "PYTHON_LIBRARY" ,we can use export
export PYTHON_EXECUTABLE=/Library/Frameworks/Python.framework/Versions/3.9/bin/python3 export PYTHON_EXECUTABLE=/Library/Frameworks/Python.framework/Versions/3.9/bin/python3
...@@ -3607,6 +3610,7 @@ function run_setup_mac(){ ...@@ -3607,6 +3610,7 @@ function run_setup_mac(){
if [ -d "/Library/Frameworks/Python.framework/Versions/3.10" ]; then if [ -d "/Library/Frameworks/Python.framework/Versions/3.10" ]; then
export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.10/lib/ export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.10/lib/
export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:/Library/Frameworks/Python.framework/Versions/3.10/lib/ export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:/Library/Frameworks/Python.framework/Versions/3.10/lib/
export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${PADDLE_ROOT}/build/third_party/install/lapack/lib
export PATH=/Library/Frameworks/Python.framework/Versions/3.10/bin/:${PATH} export PATH=/Library/Frameworks/Python.framework/Versions/3.10/bin/:${PATH}
#after changing "PYTHON_LIBRARY:FILEPATH" to "PYTHON_LIBRARY" ,we can use export #after changing "PYTHON_LIBRARY:FILEPATH" to "PYTHON_LIBRARY" ,we can use export
export PYTHON_EXECUTABLE=/Library/Frameworks/Python.framework/Versions/3.10/bin/python3 export PYTHON_EXECUTABLE=/Library/Frameworks/Python.framework/Versions/3.10/bin/python3
......
...@@ -4,7 +4,7 @@ if(WITH_TESTING) ...@@ -4,7 +4,7 @@ if(WITH_TESTING)
set(paddle_gtest_main_deps set(paddle_gtest_main_deps
device_context device_context
gtest gtest
gflags phi
init init
memory memory
phi_utils phi_utils
......
...@@ -3,11 +3,11 @@ add_subdirectory(string) ...@@ -3,11 +3,11 @@ add_subdirectory(string)
cc_test( cc_test(
array_ref_test array_ref_test
SRCS array_ref_test.cc SRCS array_ref_test.cc
DEPS gtest gflags) DEPS gtest phi)
cc_test( cc_test(
small_vector_test small_vector_test
SRCS small_vector_test.cc SRCS small_vector_test.cc
DEPS gtest gflags) DEPS gtest phi)
cc_test( cc_test(
variant_test variant_test
SRCS variant_test.cc SRCS variant_test.cc
...@@ -17,5 +17,5 @@ if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) ...@@ -17,5 +17,5 @@ if(NOT ((NOT WITH_PYTHON) AND ON_INFER))
cc_library( cc_library(
pybind_util pybind_util
SRCS pybind.cc SRCS pybind.cc
DEPS phi_tensor_raw flags) DEPS phi)
endif() endif()
cc_library( cc_library(
pretty_log pretty_log
SRCS pretty_log.cc SRCS pretty_log.cc
DEPS flags) DEPS phi)
cc_library( cc_library(
string_helper string_helper
SRCS string_helper.cc SRCS string_helper.cc
DEPS flags) DEPS phi)
cc_test( cc_test(
stringprintf_test stringprintf_test
SRCS printf_test.cc SRCS printf_test.cc
DEPS gflags) DEPS phi)
cc_test(to_string_test SRCS to_string_test.cc) cc_test(to_string_test SRCS to_string_test.cc)
cc_test(split_test SRCS split_test.cc) cc_test(split_test SRCS split_test.cc)
cc_test( cc_test(
......
...@@ -10,6 +10,9 @@ env_dict={ ...@@ -10,6 +10,9 @@ env_dict={
'CUDA_VERSION':'@CUDA_VERSION@', 'CUDA_VERSION':'@CUDA_VERSION@',
'WITH_PSLI':'@WITH_PSLI@', 'WITH_PSLI':'@WITH_PSLI@',
'FLUID_CORE_NAME':'@FLUID_CORE_NAME@', 'FLUID_CORE_NAME':'@FLUID_CORE_NAME@',
'PHI_LIB':'@PHI_LIB@',
'PHI_NAME':'@PHI_NAME@',
'WITH_PHI_SHARED':'@WITH_PHI_SHARED@',
'WARPCTC_LIBRARIES':'@WARPCTC_LIBRARIES@', 'WARPCTC_LIBRARIES':'@WARPCTC_LIBRARIES@',
'WARPRNNT_LIBRARIES':'@WARPRNNT_LIBRARIES@', 'WARPRNNT_LIBRARIES':'@WARPRNNT_LIBRARIES@',
'FLASHATTN_LIBRARIES':'@FLASHATTN_LIBRARIES@', 'FLASHATTN_LIBRARIES':'@FLASHATTN_LIBRARIES@',
......
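The '@PHI_LIB@', '@PHI_NAME@' and '@WITH_PHI_SHARED@' placeholders added to env_dict above are substituted at configure time; the CMake code that sets them is not part of this excerpt. The lines below are only an assumed illustration, with made-up paths and file names, of how such variables could be populated and expanded through configure_file.

# Hypothetical sketch only -- the real values come from CMake code outside this diff.
if(WITH_PHI_SHARED)
  set(PHI_NAME "libphi.so") # assumed library name on Linux
  set(PHI_LIB "${PADDLE_BINARY_DIR}/paddle/phi/${PHI_NAME}") # assumed build location
endif()
# @VAR@ placeholders in the env_dict template are replaced by configure_file():
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/env_dict.py.in
               ${PADDLE_BINARY_DIR}/python/env_dict.py @ONLY)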
...@@ -1134,14 +1134,6 @@ foreach(TEST_CINN_OPS ${TEST_CINN_OPS}) ...@@ -1134,14 +1134,6 @@ foreach(TEST_CINN_OPS ${TEST_CINN_OPS})
endforeach() endforeach()
if(WITH_CINN AND WITH_TESTING) if(WITH_CINN AND WITH_TESTING)
set_tests_properties(
test_resnet50_with_cinn
PROPERTIES
LABELS
"RUN_TYPE=CINN"
ENVIRONMENT
FLAGS_allow_cinn_ops="conv2d;conv2d_grad;elementwise_add;elementwise_add_grad;relu;relu_grad;sum"
)
set_tests_properties( set_tests_properties(
test_parallel_executor_run_cinn test_parallel_executor_run_cinn
PROPERTIES PROPERTIES
......
...@@ -123,9 +123,9 @@ class TestParallelExecutorRunCinn(unittest.TestCase): ...@@ -123,9 +123,9 @@ class TestParallelExecutorRunCinn(unittest.TestCase):
shutil.rmtree(self.tmpdir) shutil.rmtree(self.tmpdir)
def test_run_with_cinn(self): def test_run_with_cinn(self):
cinn_losses = train(self.tmpdir, "paddle") cinn_losses = np.array(train(self.tmpdir, "paddle")).flatten()
set_cinn_flag(False) set_cinn_flag(False)
pd_losses = train(self.tmpdir, "cinn") pd_losses = np.array(train(self.tmpdir, "cinn")).flatten()
np.testing.assert_allclose( np.testing.assert_allclose(
cinn_losses, pd_losses, rtol=1e-05, atol=1e-05 cinn_losses, pd_losses, rtol=1e-05, atol=1e-05
) )
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import unittest
import numpy as np
import paddle
from paddle.fluid import core
paddle.enable_static()
logging.basicConfig(
format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO
)
logger = logging.getLogger(__name__)
def set_cinn_flag(val):
cinn_compiled = False
try:
paddle.set_flags({'FLAGS_use_cinn': val})
cinn_compiled = True
except ValueError:
logger.warning("The used paddle is not compiled with CINN.")
return cinn_compiled
@unittest.skipIf(not set_cinn_flag(True), "Paddle is not compiled with CINN.")
class TestResnet50Accuracy(unittest.TestCase):
def reader(self, limit):
for _ in range(limit):
yield {
'image': np.random.randint(
0, 256, size=[32, 3, 224, 224]
).astype('float32'),
'label': np.random.randint(0, 1000, size=[32]).astype('int64'),
}
def generate_random_data(self, loop_num=10):
feed = []
data = self.reader(loop_num)
for _ in range(loop_num):
feed.append(next(data))
return feed
def build_program(self, main_program, startup_program):
with paddle.static.program_guard(main_program, startup_program):
image = paddle.static.data(
name='image', shape=[32, 3, 224, 224], dtype='float32'
)
label = paddle.static.data(name='label', shape=[32], dtype='int64')
# TODO: stop_gradient slower training speed, need fix
image.stop_gradient = False
model = paddle.vision.models.resnet50()
prediction = model(image)
loss = paddle.nn.functional.cross_entropy(
input=prediction, label=label
)
loss = paddle.mean(loss)
adam = paddle.optimizer.Adam(learning_rate=0.001)
adam.minimize(loss)
return loss
def train(self, place, iters, feed, use_cinn=False, seed=1234):
np.random.seed(seed)
paddle.seed(seed)
if paddle.is_compiled_with_cuda():
paddle.set_flags({'FLAGS_cudnn_deterministic': 1})
set_cinn_flag(use_cinn)
startup_program = paddle.static.Program()
main_program = paddle.static.Program()
loss = self.build_program(main_program, startup_program)
exe = paddle.static.Executor(place)
compiled_prog = paddle.static.CompiledProgram(main_program)
loss_vals = []
scope = paddle.static.Scope()
with paddle.static.scope_guard(scope):
exe.run(startup_program)
for step in range(iters):
loss_v = exe.run(
compiled_prog,
feed=feed[step],
fetch_list=[loss],
return_numpy=True,
)
loss_vals.append(loss_v[0])
return loss_vals
def test_check_resnet50_accuracy(self):
place = (
paddle.CUDAPlace(0)
if paddle.is_compiled_with_cuda()
else paddle.CPUPlace()
)
loop_num = 10
feed = self.generate_random_data(loop_num)
loss_c = self.train(place, loop_num, feed, use_cinn=True)
loss_p = self.train(place, loop_num, feed, use_cinn=False)
print("Losses of CINN:")
print(loss_c)
print("Losses of Paddle")
print(loss_p)
np.testing.assert_allclose(loss_c, loss_p, rtol=1e-05, atol=1e-05)
def test_check_resnet50_accuracy_with_composite(self):
place = (
paddle.CUDAPlace(0)
if paddle.is_compiled_with_cuda()
else paddle.CPUPlace()
)
loop_num = 10
feed = self.generate_random_data(loop_num)
core._set_prim_backward_enabled(True)
core._add_skip_comp_ops("batch_norm")
loss_c = self.train(place, loop_num, feed, use_cinn=True)
core._set_prim_backward_enabled(False)
loss_p = self.train(place, loop_num, feed, use_cinn=True)
print("Losses of Composite + CINN:")
print(loss_c)
print("Losses of CINN: ")
print(loss_p)
np.testing.assert_allclose(loss_c, loss_p, rtol=1e-05, atol=1e-05)
if __name__ == '__main__':
unittest.main()
...@@ -561,7 +561,11 @@ package_dir={ ...@@ -561,7 +561,11 @@ package_dir={
libs_path='${PADDLE_BINARY_DIR}/python/paddle/libs' libs_path='${PADDLE_BINARY_DIR}/python/paddle/libs'
package_data['paddle.libs']= [] package_data['paddle.libs']= []
package_data['paddle.libs']=[ if('${WITH_PHI_SHARED}' == 'ON'):
package_data['paddle.libs'] = [('libphi' if os.name != 'nt' else 'phi') + ext_name]
shutil.copy('${PHI_LIB}', libs_path)
package_data['paddle.libs']+=[
('libwarpctc' if os.name != 'nt' else 'warpctc') + ext_name, ('libwarpctc' if os.name != 'nt' else 'warpctc') + ext_name,
('libwarprnnt' if os.name != 'nt' else 'warprnnt') + ext_name, ('libwarprnnt' if os.name != 'nt' else 'warprnnt') + ext_name,
] ]
...@@ -722,8 +726,14 @@ if '${CMAKE_BUILD_TYPE}' == 'Release': ...@@ -722,8 +726,14 @@ if '${CMAKE_BUILD_TYPE}' == 'Release':
if "@APPLE@" == "1": if "@APPLE@" == "1":
commands = ["install_name_tool -id '@loader_path/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/${FLUID_CORE_NAME}" + '.so'] commands = ["install_name_tool -id '@loader_path/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/${FLUID_CORE_NAME}" + '.so']
commands.append("install_name_tool -add_rpath '@loader_path/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/${FLUID_CORE_NAME}" + '.so') commands.append("install_name_tool -add_rpath '@loader_path/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/${FLUID_CORE_NAME}" + '.so')
if('${WITH_PHI_SHARED}' == 'ON'):
# change rpath of phi.ext for loading 3rd party lib
commands.append("install_name_tool -add_rpath '@loader_path' ${PADDLE_BINARY_DIR}/python/paddle/libs/${PHI_NAME}")
else: else:
commands = ["patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/${FLUID_CORE_NAME}" + '.so'] commands = ["patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/${FLUID_CORE_NAME}" + '.so']
if('${WITH_PHI_SHARED}' == 'ON'):
# change rpath of phi.ext for loading 3rd party lib
commands.append("patchelf --set-rpath '$ORIGIN' ${PADDLE_BINARY_DIR}/python/paddle/libs/${PHI_NAME}")
# The sw_64 does not support patchelf, so we just disable that. # The sw_64 does not support patchelf, so we just disable that.
if platform.machine() != 'sw_64' and platform.machine() != 'mips64': if platform.machine() != 'sw_64' and platform.machine() != 'mips64':
for command in commands: for command in commands:
......
...@@ -966,7 +966,14 @@ def get_package_data_and_package_dir(): ...@@ -966,7 +966,14 @@ def get_package_data_and_package_dir():
# put all thirdparty libraries in paddle.libs # put all thirdparty libraries in paddle.libs
libs_path = paddle_binary_dir + '/python/paddle/libs' libs_path = paddle_binary_dir + '/python/paddle/libs'
package_data['paddle.libs'] = [] package_data['paddle.libs'] = []
if env_dict.get("WITH_PHI_SHARED") == "ON":
package_data['paddle.libs'] = [ package_data['paddle.libs'] = [
('libphi' if os.name != 'nt' else 'phi') + ext_suffix
]
shutil.copy(env_dict.get("PHI_LIB"), libs_path)
package_data['paddle.libs'] += [
('libwarpctc' if os.name != 'nt' else 'warpctc') + ext_suffix, ('libwarpctc' if os.name != 'nt' else 'warpctc') + ext_suffix,
('libwarprnnt' if os.name != 'nt' else 'warprnnt') + ext_suffix, ('libwarprnnt' if os.name != 'nt' else 'warprnnt') + ext_suffix,
] ]
...@@ -1204,6 +1211,13 @@ def get_package_data_and_package_dir(): ...@@ -1204,6 +1211,13 @@ def get_package_data_and_package_dir():
+ env_dict.get("FLUID_CORE_NAME") + env_dict.get("FLUID_CORE_NAME")
+ '.so' + '.so'
) )
if env_dict.get("WITH_PHI_SHARED") == "ON":
commands.append(
"install_name_tool -add_rpath '@loader_path' "
+ env_dict.get("PADDLE_BINARY_DIR")
+ '/python/paddle/libs/'
+ env_dict.get("PHI_NAME")
)
else: else:
commands = [ commands = [
"patchelf --set-rpath '$ORIGIN/../libs/' " "patchelf --set-rpath '$ORIGIN/../libs/' "
...@@ -1212,6 +1226,13 @@ def get_package_data_and_package_dir(): ...@@ -1212,6 +1226,13 @@ def get_package_data_and_package_dir():
+ env_dict.get("FLUID_CORE_NAME") + env_dict.get("FLUID_CORE_NAME")
+ '.so' + '.so'
] ]
if env_dict.get("WITH_PHI_SHARED") == "ON":
commands.append(
"patchelf --set-rpath '$ORIGIN' "
+ env_dict.get("PADDLE_BINARY_DIR")
+ '/python/paddle/libs/'
+ env_dict.get("PHI_NAME")
)
# The sw_64 does not support patchelf, so we just disable that. # The sw_64 does not support patchelf, so we just disable that.
if platform.machine() != 'sw_64' and platform.machine() != 'mips64': if platform.machine() != 'sw_64' and platform.machine() != 'mips64':
for command in commands: for command in commands:
......
...@@ -168,6 +168,7 @@ if(${len} GREATER_EQUAL 1) ...@@ -168,6 +168,7 @@ if(${len} GREATER_EQUAL 1)
add_executable(${test_name} ${test_src}) add_executable(${test_name} ${test_src})
target_link_libraries(${test_name} paddle_gtest_main_new) target_link_libraries(${test_name} paddle_gtest_main_new)
target_link_libraries(${test_name} $<TARGET_LINKER_FILE:${paddle_lib}>) target_link_libraries(${test_name} $<TARGET_LINKER_FILE:${paddle_lib}>)
target_link_libraries(${test_name} $<TARGET_LINKER_FILE:phi>)
add_dependencies(${test_name} ${paddle_lib} paddle_gtest_main_new) add_dependencies(${test_name} ${paddle_lib} paddle_gtest_main_new)
if(WITH_GPU) if(WITH_GPU)
target_link_libraries(${test_name} ${CUDA_CUDART_LIBRARY} target_link_libraries(${test_name} ${CUDA_CUDART_LIBRARY}
...@@ -177,8 +178,10 @@ if(${len} GREATER_EQUAL 1) ...@@ -177,8 +178,10 @@ if(${len} GREATER_EQUAL 1)
target_link_libraries(${test_name} ${ROCM_HIPRTC_LIB}) target_link_libraries(${test_name} ${ROCM_HIPRTC_LIB})
endif() endif()
if(APPLE) if(APPLE)
target_link_libraries(${test_name} target_link_libraries(
"-Wl,-rpath,$<TARGET_FILE_DIR:${paddle_lib}>") ${test_name}
"-Wl,-rpath,$<TARGET_FILE_DIR:${paddle_lib}> -Wl,-rpath,$<TARGET_FILE_DIR:phi>"
)
endif() endif()
if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) if(NOT ((NOT WITH_PYTHON) AND ON_INFER))
target_link_libraries(${test_name} ${PYTHON_LIBRARIES}) target_link_libraries(${test_name} ${PYTHON_LIBRARIES})
......
set(eager_deps set(eager_deps
phi_api phi
phi_dygraph_api
hook_utils hook_utils
tensor_utils tensor_utils
utils utils
global_utils global_utils
backward backward
phi_tensor
tracer tracer
layer layer
autograd_meta autograd_meta
......
...@@ -42,7 +42,7 @@ cc_test( ...@@ -42,7 +42,7 @@ cc_test(
test_common_infer_shape_functions test_common_infer_shape_functions
SRCS test_common_infer_shape_functions.cc SRCS test_common_infer_shape_functions.cc
DEPS common_infer_shape_functions ${COMMON_OP_DEPS} activation_op DEPS common_infer_shape_functions ${COMMON_OP_DEPS} activation_op
elementwise_add_op softmax generated_static_op) elementwise_add_op phi generated_static_op)
cc_test( cc_test(
gather_test gather_test
SRCS gather_test.cc SRCS gather_test.cc
...@@ -54,7 +54,7 @@ cc_test( ...@@ -54,7 +54,7 @@ cc_test(
cc_test( cc_test(
scatter_test scatter_test
SRCS scatter_test.cc SRCS scatter_test.cc
DEPS tensor math_function) DEPS tensor phi)
cc_test( cc_test(
beam_search_decode_op_test beam_search_decode_op_test
SRCS beam_search_decode_op_test.cc SRCS beam_search_decode_op_test.cc
...@@ -72,7 +72,7 @@ if(WITH_GPU) ...@@ -72,7 +72,7 @@ if(WITH_GPU)
nv_test( nv_test(
dropout_op_test dropout_op_test
SRCS dropout_op_test.cc SRCS dropout_op_test.cc
DEPS dropout_op tensor generator) DEPS dropout_op tensor phi)
nv_test( nv_test(
test_leaky_relu_grad_grad_functor test_leaky_relu_grad_grad_functor
SRCS test_leaky_relu_grad_grad_functor.cc SRCS test_leaky_relu_grad_grad_functor.cc
...@@ -81,12 +81,12 @@ if(WITH_GPU) ...@@ -81,12 +81,12 @@ if(WITH_GPU)
nv_test( nv_test(
feed_forward_test feed_forward_test
SRCS feed_forward_test.cu SRCS feed_forward_test.cu
DEPS elementwise_add_op matmul_op tensor generator) DEPS elementwise_add_op matmul_op tensor phi)
elseif(WITH_ROCM) elseif(WITH_ROCM)
hip_test( hip_test(
dropout_op_test dropout_op_test
SRCS dropout_op_test.cc SRCS dropout_op_test.cc
DEPS dropout_op tensor generator) DEPS dropout_op tensor phi)
hip_test( hip_test(
test_leaky_relu_grad_grad_functor test_leaky_relu_grad_grad_functor
SRCS test_leaky_relu_grad_grad_functor.cc SRCS test_leaky_relu_grad_grad_functor.cc
......
...@@ -11,7 +11,7 @@ cc_test( ...@@ -11,7 +11,7 @@ cc_test(
scope scope
${GLOB_OP_LIB} ${GLOB_OP_LIB}
${GLOB_OPERATOR_DEPS} ${GLOB_OPERATOR_DEPS}
eigen_function) phi)
if(WITH_ONNXRUNTIME AND WIN32) if(WITH_ONNXRUNTIME AND WIN32)
# Copy onnxruntime for some c++ test in Windows, since the test will # Copy onnxruntime for some c++ test in Windows, since the test will
......
cc_test_old( if(WITH_TESTING)
cc_test_old(
cinn_launch_context_test cinn_launch_context_test
SRCS SRCS
cinn_launch_context_test.cc cinn_launch_context_test.cc
DEPS DEPS
ddim phi
lod_tensor lod_tensor
scope scope
proto_desc proto_desc
...@@ -11,27 +12,28 @@ cc_test_old( ...@@ -11,27 +12,28 @@ cc_test_old(
cinn_launch_context cinn_launch_context
cinn_instruction_run_op cinn_instruction_run_op
cinn) cinn)
target_link_libraries(cinn_launch_context_test ${PYTHON_LIBRARIES}) target_link_libraries(cinn_launch_context_test ${PYTHON_LIBRARIES})
set_tests_properties(cinn_launch_context_test PROPERTIES LABELS "RUN_TYPE=CINN") set_tests_properties(cinn_launch_context_test PROPERTIES LABELS
"RUN_TYPE=CINN")
set(CINN_RUN_ENVIRONMENT set(CINN_RUN_ENVIRONMENT
"OMP_NUM_THREADS=1;runtime_include_dir=${PADDLE_BINARY_DIR}/third_party/CINN/src/external_cinn/cinn/runtime/cuda" "OMP_NUM_THREADS=1;runtime_include_dir=${PADDLE_BINARY_DIR}/third_party/CINN/src/external_cinn/cinn/runtime/cuda"
) )
# cc_test_old( # cc_test_old(
# cinn_launch_op_test # cinn_launch_op_test
# SRCS # SRCS
# cinn_launch_op_test.cc # cinn_launch_op_test.cc
# DEPS # DEPS
# cinn_compiler # cinn_compiler
# cinn_launch_op # cinn_launch_op
# cinn_instruction_run_op # cinn_instruction_run_op
# elementwise_add_op # elementwise_add_op
# gflags) # gflags)
# set_tests_properties( # set_tests_properties(
# cinn_launch_op_test PROPERTIES LABELS "RUN_TYPE=CINN" ENVIRONMENT # cinn_launch_op_test PROPERTIES LABELS "RUN_TYPE=CINN" ENVIRONMENT
# "${CINN_RUN_ENVIRONMENT}") # "${CINN_RUN_ENVIRONMENT}")
cc_test_old( cc_test_old(
cinn_instruction_run_op_test cinn_instruction_run_op_test
SRCS SRCS
cinn_instruction_run_op_test.cc cinn_instruction_run_op_test.cc
...@@ -40,7 +42,8 @@ cc_test_old( ...@@ -40,7 +42,8 @@ cc_test_old(
cinn_launch_op cinn_launch_op
cinn_instruction_run_op cinn_instruction_run_op
elementwise_add_op) elementwise_add_op)
target_link_libraries(cinn_instruction_run_op_test ${PYTHON_LIBRARIES}) target_link_libraries(cinn_instruction_run_op_test ${PYTHON_LIBRARIES})
set_tests_properties( set_tests_properties(
cinn_instruction_run_op_test PROPERTIES LABELS "RUN_TYPE=CINN" ENVIRONMENT cinn_instruction_run_op_test PROPERTIES LABELS "RUN_TYPE=CINN" ENVIRONMENT
"${CINN_RUN_ENVIRONMENT}") "${CINN_RUN_ENVIRONMENT}")
endif()
@@ -15,7 +15,7 @@ if(WITH_GPU OR WITH_ROCM)
dropout_op
generated_op
device_context
-generator
+phi
memory)
nv_test(
test_fused_dropout_act_bias
@@ -25,7 +25,7 @@ if(WITH_GPU OR WITH_ROCM)
dropout_op
generated_op
device_context
-generator
+phi
memory)
nv_test(
test_fused_layernorm_residual_dropout_bias
@@ -35,7 +35,7 @@ if(WITH_GPU OR WITH_ROCM)
dropout_op
generated_op
device_context
-generator
+phi
memory)
endif()
# resnet_unit needs cudnn 8.0 above
@@ -44,15 +44,11 @@ if(WITH_GPU OR WITH_ROCM)
test_cudnn_norm_conv
SRCS cudnn_norm_conv_test.cc
DEPS conv_op
-blas
-im2col
-vol2col
depthwise_conv
-eigen_function
tensor
op_registry
device_context
-generator
+phi
memory)
cc_test(
test_cudnn_bn_add_relu
@@ -62,7 +58,7 @@ if(WITH_GPU OR WITH_ROCM)
tensor
op_registry
device_context
-generator
+phi
memory)
endif()
endif()
cc_test(
selected_rows_functor_test
SRCS selected_rows_functor_test.cc
-DEPS allocator selected_rows_functor)
+DEPS allocator phi)
cc_test(
im2col_test
SRCS im2col_test.cc
-DEPS im2col)
+DEPS phi)
cc_test(
vol2col_test
SRCS vol2col_test.cc
-DEPS vol2col)
+DEPS phi)
cc_test(
beam_search_test
SRCS beam_search_test.cc
@@ -18,13 +18,13 @@ if(WITH_GPU)
nv_test(
selected_rows_functor_gpu_test
SRCS selected_rows_functor_test.cu.cc
-DEPS selected_rows_functor math_function)
+DEPS phi)
endif()
if(WITH_ROCM)
hip_test(
selected_rows_functor_gpu_test
SRCS selected_rows_functor_test.cu.cc
-DEPS selected_rows_functor math_function)
+DEPS phi)
endif()
cc_test(
concat_test
...
@@ -4,7 +4,7 @@ cc_test(
DEPS op_registry
elementwise_add_op
activation_op
-softmax
+phi
scope
device_context
enforce
@@ -17,9 +17,7 @@ set(TEST_MKLDNN_CACHING_DEPS
elementwise_add_op
activation_op
conv_op
-im2col
-vol2col
-softmax
+phi
scope
device_context
enforce
@@ -44,7 +42,7 @@ cc_test_old(
crop_op
activation_op
generated_op
-pooling
+phi
transpose_op
fused_transpose_op
scope
...
@@ -68,7 +68,7 @@ cc_test_old(
scope
proto_desc
generated_op
-eigen_function)
+phi)
set_source_files_properties(
send_and_recv_op_cpu_test.cc PROPERTIES COMPILE_FLAGS
@@ -85,7 +85,7 @@ cc_test_old(
send_and_recv_op
${RPC_DEPS}
${DISTRIBUTE_DEPS}
-eigen_function)
+phi)
set_source_files_properties(
send_and_recv_op_gpu_test.cc PROPERTIES COMPILE_FLAGS
@@ -102,7 +102,7 @@ cc_test_old(
send_and_recv_op
${RPC_DEPS}
${DISTRIBUTE_DEPS}
-eigen_function)
+phi)
set_source_files_properties(
heter_listen_and_server_test.cc PROPERTIES COMPILE_FLAGS
@@ -119,10 +119,10 @@ cc_test_old(
heter_listen_and_serv_op
${RPC_DEPS}
${DISTRIBUTE_DEPS}
-eigen_function)
+phi)
#set_source_files_properties(heter_cloud_comm_cpu_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-#cc_test(heter_cloud_comm_cpu_test SRCS heter_cloud_comm_cpu_test.cc DEPS executor scope proto_desc generated_static_op heter_listen_and_serv_op ${RPC_DEPS} ${DISTRIBUTE_DEPS} eigen_function)
+#cc_test(heter_cloud_comm_cpu_test SRCS heter_cloud_comm_cpu_test.cc DEPS executor scope proto_desc generated_static_op heter_listen_and_serv_op ${RPC_DEPS} ${DISTRIBUTE_DEPS} phi)
set_source_files_properties(
switch_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
@@ -138,4 +138,4 @@ cc_binary(
heter_listen_and_serv_op
${RPC_DEPS}
${DISTRIBUTE_DEPS}
-eigen_function)
+phi)
@@ -33,14 +33,7 @@ endif()
cc_test(
test_gradient_accmulator
SRCS test_gradient_accmulator.cc
-DEPS memcpy
-selected_rows_utils
-selected_rows_functor
-gradient_accumulator
-math_function
-phi_tensor
-phi_api
-phi_utils)
+DEPS memcpy selected_rows_utils gradient_accumulator phi phi_utils)
cc_test(
test_layer
SRCS test_layer.cc
...
@@ -24,6 +24,7 @@
#include "paddle/fluid/imperative/hooks.h"
#include "paddle/fluid/imperative/tracer.h"
#include "paddle/fluid/memory/memcpy.h"
+#include "paddle/phi/core/flags.h"
#include "paddle/phi/core/kernel_registry.h"
PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
@@ -35,7 +36,7 @@ namespace platform = paddle::platform;
namespace framework = paddle::framework;
namespace memory = paddle::memory;
-DECLARE_bool(sort_sum_gradient);
+PHI_DECLARE_bool(sort_sum_gradient);
namespace paddle {
namespace imperative {
...
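This hunk, and the similar one further down, replaces the gflags-style DECLARE_* macro in test code with the PHI_DECLARE_* macro from "paddle/phi/core/flags.h", so the test refers to the flag definition that now lives in the phi library. A minimal sketch of the resulting pattern in a test translation unit; the gtest scaffolding, test names, and the FLAGS_sort_sum_gradient accessor are assumptions for illustration, not taken from this diff:

```cpp
// Sketch of the flag-declaration migration in a test file.
// Previously: DECLARE_bool(sort_sum_gradient);  (gflags-based macro)
#include "gtest/gtest.h"
#include "paddle/phi/core/flags.h"  // provides the PHI_DECLARE_* macros

PHI_DECLARE_bool(sort_sum_gradient);  // flag defined inside the phi library

TEST(GradientAccumulatorTest, ToggleSortSumGradient) {
  // Assumption: the macro exposes the usual FLAGS_<name> global variable.
  FLAGS_sort_sum_gradient = true;
  EXPECT_TRUE(FLAGS_sort_sum_gradient);
}
```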
@@ -224,7 +224,7 @@ if(NOT WIN32)
${MATH_LIB}
${MKLDNN_LIB}
glog
-gflags
+phi
protobuf
xxhash
cryptopp
@@ -235,7 +235,7 @@ else()
${MATH_LIB}
${MKLDNN_LIB}
glog
-gflags_static
+phi
libprotobuf
xxhash
cryptopp-static
...
@@ -8,7 +8,6 @@ if(WITH_TESTING AND NOT WIN32)
WORKING_DIRECTORY "${CC_TESTS_DIR}")
set(JIT_DEPS
phi
-phi_api
elementwise_add_op
matmul_v2_op
activation_op
...
@@ -37,8 +37,7 @@ if(WITH_GPU
fetch_v2_op)
# All deps of the operators above, part of GLOB_OPERATOR_DEPS.
-set(OP_DEPS generator softmax selected_rows_functor jit_kernel_helper
-concat_and_split cross_entropy)
+set(OP_DEPS phi concat_and_split cross_entropy)
cc_test(standalone_executor_test SRCS standalone_executor_test.cc)
# add_dependencies(standalone_executor_test download_program)
...
-set(COMMON_API_TEST_DEPS phi_tensor phi_api api_tensor_utils)
+set(COMMON_API_TEST_DEPS phi)
if(WITH_GPU)
nv_test(
test_phi_tensor
SRCS test_phi_tensor.cc
-DEPS glog selected_rows ${COMMON_API_TEST_DEPS})
+DEPS glog ${COMMON_API_TEST_DEPS})
nv_test(
test_allocator
SRCS test_allocator.cu
-DEPS place device_context context_pool)
+DEPS place device_context phi)
nv_test(
test_cuda_stream
SRCS test_cuda_stream.cu
-DEPS context_pool)
+DEPS phi)
nv_test(
test_from_blob
SRCS test_from_blob.cc
-DEPS phi_backends ${COMMON_API_TEST_DEPS})
+DEPS ${COMMON_API_TEST_DEPS})
elseif(WITH_ROCM)
hip_test(
test_phi_tensor
SRCS test_phi_tensor.cc
-DEPS glog selected_rows ${COMMON_API_TEST_DEPS})
+DEPS glog ${COMMON_API_TEST_DEPS})
hip_test(
test_allocator
SRCS test_allocator.cu
-DEPS place device_context context_pool)
+DEPS place device_context phi)
hip_test(
test_cuda_stream
SRCS test_cuda_stream.cu
-DEPS context_pool)
+DEPS phi)
hip_test(
test_from_blob
SRCS test_from_blob.cc
-DEPS phi_backends ${COMMON_API_TEST_DEPS})
+DEPS ${COMMON_API_TEST_DEPS})
else()
cc_test(
test_phi_tensor
SRCS test_phi_tensor.cc
-DEPS glog selected_rows ${COMMON_API_TEST_DEPS})
+DEPS glog ${COMMON_API_TEST_DEPS})
cc_test(
test_from_blob
SRCS test_from_blob.cc
-DEPS phi_backends ${COMMON_API_TEST_DEPS})
+DEPS ${COMMON_API_TEST_DEPS})
endif()
cc_test(
...
@@ -21,12 +21,13 @@
#include "paddle/phi/api/lib/utils/allocator.h"
#include "paddle/phi/common/int_array.h"
#include "paddle/phi/common/scalar.h"
+#include "paddle/phi/core/flags.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/meta_tensor.h"
#include "paddle/phi/infermeta/unary.h"
#include "paddle/phi/kernels/scale_kernel.h"
-DECLARE_int32(low_precision_op_list);
+PHI_DECLARE_int32(low_precision_op_list);
namespace paddle {
namespace experimental {
...
@@ -13,32 +13,32 @@ cc_test(
cc_test(
phi_test_place
SRCS test_place.cc
-DEPS phi_place)
+DEPS phi)
cc_test(
phi_test_int_array
SRCS test_int_array.cc
-DEPS int_array api_int_array phi phi_api)
+DEPS phi)
cc_test(
phi_test_scalar_cpu
SRCS test_scalar.cc
-DEPS scalar api_scalar)
+DEPS phi)
if(WITH_GPU)
nv_test(
phi_test_scalar
SRCS test_scalar.cu
-DEPS scalar api_scalar)
+DEPS phi)
nv_test(
transform_test
SRCS transform_test.cu
-DEPS memory place phi_backends)
+DEPS memory place phi)
endif()
if(WITH_ROCM)
hip_test(
phi_test_scalar
SRCS test_scalar.cu
-DEPS scalar api_scalar)
+DEPS phi)
hip_test(
transform_test
SRCS transform_test.cu
-DEPS memory place phi_backends)
+DEPS memory place phi)
endif()
cc_test(
test_custom_kernel
SRCS test_custom_kernel.cc
-DEPS custom_kernel scalar)
+DEPS phi)
cc_test(
test_dense_tensor
SRCS test_dense_tensor.cc
-DEPS dense_tensor)
+DEPS phi)
cc_test(test_intrusive_ptr SRCS test_intrusive_ptr.cc)
cc_test(test_type_info SRCS test_type_info.cc)
cc_test(
test_kernel_factory
SRCS test_kernel_factory.cc
-DEPS kernel_factory phi)
+DEPS phi)
cc_test(
test_sparse_coo_tensor
SRCS test_sparse_coo_tensor.cc
-DEPS dense_tensor sparse_coo_tensor)
+DEPS phi)
cc_test(
test_sparse_csr_tensor
SRCS test_sparse_csr_tensor.cc
-DEPS dense_tensor sparse_csr_tensor)
+DEPS phi)
cc_test(
test_op_utils
SRCS test_op_utils.cc
DEPS op_compat_infos)
-cc_test_old(
-test_meta_fn_utils
-SRCS
-test_meta_fn_utils.cc
-DEPS
-dense_tensor
-wrapped_infermeta
-infermeta
-infermeta_utils)
+cc_test_old(test_meta_fn_utils SRCS test_meta_fn_utils.cc DEPS phi)
cc_test(
test_ddim
SRCS test_ddim.cc
-DEPS ddim)
+DEPS phi)
if(WITH_GPU)
nv_test(
test_dim
SRCS test_dim.cu
-DEPS ddim)
+DEPS phi)
elseif(WITH_ROCM)
hip_test(
test_dim
SRCS test_dim.cu
-DEPS ddim)
+DEPS phi)
endif()
cc_test(
selected_rows_test
SRCS test_selected_rows.cc
-DEPS selected_rows)
+DEPS phi)
if(WITH_TESTING AND TEST selected_rows_test)
set_tests_properties(selected_rows_test PROPERTIES TIMEOUT 120)
endif()
@@ -63,27 +55,27 @@ endif()
cc_test(
test_string_tensor
SRCS test_string_tensor.cc
-DEPS string_tensor)
+DEPS phi)
cc_test(unroll_array_ops_test SRCS unroll_array_ops_test.cc)
cc_test(
test_tensor_array
SRCS test_tensor_array.cc
-DEPS tensor_array)
+DEPS phi)
if(WITH_GPU)
nv_test(
test_mixed_vector
SRCS test_mixed_vector.cc test_mixed_vector.cu
-DEPS mixed_vector place memory phi_backends tensor)
+DEPS place memory phi tensor)
elseif(WITH_ROCM)
hip_test(
test_mixed_vector
SRCS test_mixed_vector.cc test_mixed_vector.cu
-DEPS mixed_vector place memory phi_backends tensor)
+DEPS place memory phi tensor)
else()
cc_test(
test_mixed_vector
SRCS test_mixed_vector.cc
-DEPS mixed_vector place memory phi_backends tensor)
+DEPS place memory phi tensor)
endif()
@@ -17,6 +17,11 @@ limitations under the License. */
#include "paddle/phi/core/utils/type_registry.h"
namespace phi {
+template <typename BaseT, typename DerivedT>
+const TypeInfo<BaseT> TypeInfoTraits<BaseT, DerivedT>::kType =
+    RegisterStaticType<BaseT>(DerivedT::name());
namespace tests {
template <typename T>
...
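The hunk above adds an in-test definition of TypeInfoTraits<BaseT, DerivedT>::kType, so the test no longer relies on the definition that previously came from phi's statically linked objects. The C++ rule behind it is general: a class template's static data member needs a definition that the using translation unit can instantiate, otherwise linking fails. A self-contained illustration of that rule with toy names (this is not the phi API):

```cpp
#include <iostream>
#include <string>

// Toy stand-in for a type-registration traits template: each derived type
// contributes its name to a static data member of the traits class.
template <typename DerivedT>
struct TypeInfoTraitsDemo {
  static const std::string kName;  // declaration only
};

// Out-of-class definition template; without it, any odr-use of kName
// (as in main below) fails at link time with an undefined reference.
template <typename DerivedT>
const std::string TypeInfoTraitsDemo<DerivedT>::kName = DerivedT::name();

struct DenseTensorLike {
  static std::string name() { return "DenseTensorLike"; }
};

int main() {
  // Implicitly instantiates TypeInfoTraitsDemo<DenseTensorLike>::kName.
  std::cout << TypeInfoTraitsDemo<DenseTensorLike>::kName << "\n";
  return 0;
}
```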
cc_test(
test_math_function
SRCS test_math_function.cc
-DEPS math_function)
+DEPS phi)
if(WITH_GPU)
nv_test(
test_math_function_gpu
SRCS test_math_function.cu
-DEPS math_function)
+DEPS phi)
nv_test(
test_broadcast_gpu
SRCS test_ternary_broadcast.cu
@@ -16,13 +16,13 @@ if(WITH_ROCM)
hip_test(
test_math_function_gpu
SRCS test_math_function.cu
-DEPS math_function)
+DEPS phi)
endif()
cc_test(
test_cpu_vec
SRCS test_cpu_vec.cc
-DEPS blas phi_backends)
+DEPS phi)
# For String Kernels
cc_test(
@@ -94,19 +94,19 @@ endif()
cc_test(
test_cache
SRCS test_cache.cc
-DEPS gtest cache)
+DEPS gtest phi)
cc_test(
strided_memcpy_test
SRCS strided_memcpy_test.cc
-DEPS phi_backends memory)
+DEPS phi memory)
cc_test(
sequence_padding_test
SRCS sequence_padding_test.cc
-DEPS sequence_padding)
+DEPS phi)
cc_test(
sequence_pooling_test
SRCS sequence_pooling_test.cc
-DEPS sequence_pooling)
+DEPS phi)
cc_test(
test_op_signature
SRCS test_op_signature.cc
-DEPS op_utils)
+DEPS phi)
set(prim_eager_deps
-phi_api
-phi_dygraph_api
+phi
hook_utils
tensor_utils
utils
global_utils
backward
-phi_tensor
tracer
layer
autograd_meta
@@ -33,20 +31,16 @@ cc_test_old(
elementwise_pow_op
fill_constant_op
activation_op
-phi_api
-phi_dygraph_api
+phi
static_global_utils
static_tensor_operants
-tensor_api
-operants_manager
generated_static_op)
if(NOT (NOT WITH_PYTHON AND ON_INFER))
cc_library(
init_env_utils
SRCS init_env_utils.cc
-DEPS operants_manager tensor_api eager_tensor_operants
-static_tensor_operants)
+DEPS phi eager_tensor_operants static_tensor_operants)
cc_test_old(
test_comp_eager
...
@@ -2221,7 +2221,6 @@ CPU_PARALLEL_JOB = [
'test_egr_ds_grad_tensor_holder',
'test_egr_ds_auotgrad_meta',
'test_egr_ds_accumulation_node',
-'test_resnet50_with_cinn',
'test_parallel_dygraph_sync_batch_norm',
'test_monitor',
'test_mkldnn_quantizer',
...