未验证 提交 da50a009 编写于 作者: Y YuanRisheng 提交者: GitHub

[PHI Decoupling]Create PHI shared lib (#53735)

* create phi so

* fix ci bugs

* fix py3 bugs

* add file

* fix py3 bugs

* fix windows bugs

* perfect so

* fix py3 bugs

* delete all static target in phi

* fix windows bugs

* fix py3 bugs

* fix ci bugs

* fix windows bugs

* fix bugs: gflags can't be linked by dynamic and static lib

* fix bugs that can not load 3rd party

* fix ci bugs

* fix compile bugs

* fix py3 bugs

* fix conflict

* fix xpu bugs

* fix mac compile bugs

* fix psgpu bugs

* fix inference failed

* deal with conflict

* fix LIBRARY_PATH bug

* fix windows bugs

* fix onednn error

* fix windows compile bugs

* fix windows compile bugs

* fix test_cuda_graph_static_mode_error aborted

* fix windows bugs

* fix mac-python3 error

* fix hip compile bugs

* change mode to static

* change to static mode

* fix ci bugs

* fix py3 bugs

* fix windows bugs

* fix bugs

* add static flag

* add PADDLE_API

* change position of PADDLE_API

* fix windows bugs

* change mode to dynamic lib

* fix windows static bugs

* deal with conflict

* fix windows unit bug

* fix coverage

* deal with conflict

* fix windows-inference

* fix py3 bugs

* fix bugs when compile type_info

* fix compile bugs

* fix py3 bugs

* fix windows bugs

* fix windows openblas

* fix xpu bugs

* fix enforce_test in windows

* update code according comment

* fix windows cmake bug

* fix windows bugs

* fix windows bugs

* delete cinn unittest

* fix cinn bugs

---------
Co-authored-by: HappyHeavyRain's avatarlzydev <1528794076@qq.com>
上级 7aabdfd9
...@@ -40,7 +40,6 @@ if(WITH_MKLML) ...@@ -40,7 +40,6 @@ if(WITH_MKLML)
add_definitions(-DLAPACK_FOUND) add_definitions(-DLAPACK_FOUND)
add_dependencies(cblas mklml) add_dependencies(cblas mklml)
target_link_libraries(cblas dynload_mklml)
message(STATUS "Found cblas and lapack in MKLML " message(STATUS "Found cblas and lapack in MKLML "
"(include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") "(include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
......
...@@ -235,3 +235,16 @@ endif() ...@@ -235,3 +235,16 @@ endif()
if(WITH_CUDNN_FRONTEND) if(WITH_CUDNN_FRONTEND)
add_definitions(-DPADDLE_WITH_CUDNN_FRONTEND) add_definitions(-DPADDLE_WITH_CUDNN_FRONTEND)
endif() endif()
set(WITH_PHI_SHARED
ON
CACHE BOOL "" FORCE)
if(WIN32 OR WITH_ROCM)
set(WITH_PHI_SHARED
OFF
CACHE BOOL "" FORCE)
endif()
if(WITH_PHI_SHARED)
add_definitions(-DPHI_SHARED)
endif()
...@@ -122,6 +122,5 @@ get_filename_component(WARPCTC_LIBRARY_PATH ${WARPCTC_LIBRARIES} DIRECTORY) ...@@ -122,6 +122,5 @@ get_filename_component(WARPCTC_LIBRARY_PATH ${WARPCTC_LIBRARIES} DIRECTORY)
include_directories(${WARPCTC_INCLUDE_DIR} include_directories(${WARPCTC_INCLUDE_DIR}
)# For warpctc code to include its headers. )# For warpctc code to include its headers.
add_library(warpctc SHARED IMPORTED GLOBAL) add_library(warpctc INTERFACE)
set_property(TARGET warpctc PROPERTY IMPORTED_LOCATION ${WARPCTC_LIBRARIES})
add_dependencies(warpctc extern_warpctc) add_dependencies(warpctc extern_warpctc)
...@@ -364,20 +364,7 @@ function(cc_library TARGET_NAME) ...@@ -364,20 +364,7 @@ function(cc_library TARGET_NAME)
list(REMOVE_ITEM cc_library_DEPS warpctc) list(REMOVE_ITEM cc_library_DEPS warpctc)
add_dependencies(${TARGET_NAME} warpctc) add_dependencies(${TARGET_NAME} warpctc)
endif() endif()
# Only deps libmklml.so, not link
if("${cc_library_DEPS};" MATCHES "mklml;")
list(REMOVE_ITEM cc_library_DEPS mklml)
if(NOT "${TARGET_NAME}" MATCHES "dynload_mklml")
list(APPEND cc_library_DEPS dynload_mklml)
endif()
add_dependencies(${TARGET_NAME} mklml)
if(WIN32)
target_link_libraries(${TARGET_NAME} ${MKLML_IOMP_LIB})
else()
target_link_libraries(${TARGET_NAME}
"-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed")
endif()
endif()
# remove link to python, see notes at: # remove link to python, see notes at:
# https://github.com/pybind/pybind11/blob/master/docs/compiling.rst#building-manually # https://github.com/pybind/pybind11/blob/master/docs/compiling.rst#building-manually
if("${cc_library_DEPS};" MATCHES "python;") if("${cc_library_DEPS};" MATCHES "python;")
...@@ -457,25 +444,10 @@ function(cc_test_build TARGET_NAME) ...@@ -457,25 +444,10 @@ function(cc_test_build TARGET_NAME)
endif() endif()
endif() endif()
get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
target_link_libraries( target_link_libraries(${TARGET_NAME} ${cc_test_DEPS}
${TARGET_NAME} ${os_dependency_modules} paddle_gtest_main gtest glog)
${cc_test_DEPS} add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main gtest
${os_dependency_modules} glog)
paddle_gtest_main
lod_tensor
memory
gtest
gflags
glog)
add_dependencies(
${TARGET_NAME}
${cc_test_DEPS}
paddle_gtest_main
lod_tensor
memory
gtest
gflags
glog)
common_link(${TARGET_NAME}) common_link(${TARGET_NAME})
if(WITH_ROCM) if(WITH_ROCM)
target_link_libraries(${TARGET_NAME} ${ROCM_HIPRTC_LIB}) target_link_libraries(${TARGET_NAME} ${ROCM_HIPRTC_LIB})
...@@ -670,7 +642,7 @@ function(nv_test TARGET_NAME) ...@@ -670,7 +642,7 @@ function(nv_test TARGET_NAME)
add_executable(${TARGET_NAME} ${nv_test_SRCS}) add_executable(${TARGET_NAME} ${nv_test_SRCS})
get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} target_link_libraries(${TARGET_NAME} ${nv_test_DEPS}
${os_dependency_modules} paddle_gtest_main) ${os_dependency_modules} paddle_gtest_main phi)
add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main) add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main)
common_link(${TARGET_NAME}) common_link(${TARGET_NAME})
add_test(${TARGET_NAME} ${TARGET_NAME}) add_test(${TARGET_NAME} ${TARGET_NAME})
...@@ -774,8 +746,8 @@ function(hip_test TARGET_NAME) ...@@ -774,8 +746,8 @@ function(hip_test TARGET_NAME)
lod_tensor lod_tensor
memory memory
gtest gtest
gflags
glog glog
phi
${os_dependency_modules}) ${os_dependency_modules})
add_dependencies( add_dependencies(
${TARGET_NAME} ${TARGET_NAME}
...@@ -784,7 +756,7 @@ function(hip_test TARGET_NAME) ...@@ -784,7 +756,7 @@ function(hip_test TARGET_NAME)
lod_tensor lod_tensor
memory memory
gtest gtest
gflags phi
glog) glog)
common_link(${TARGET_NAME}) common_link(${TARGET_NAME})
add_test(${TARGET_NAME} ${TARGET_NAME}) add_test(${TARGET_NAME} ${TARGET_NAME})
...@@ -881,7 +853,7 @@ function(xpu_test TARGET_NAME) ...@@ -881,7 +853,7 @@ function(xpu_test TARGET_NAME)
lod_tensor lod_tensor
memory memory
gtest gtest
gflags phi
glog glog
${os_dependency_modules}) ${os_dependency_modules})
add_dependencies( add_dependencies(
...@@ -891,7 +863,7 @@ function(xpu_test TARGET_NAME) ...@@ -891,7 +863,7 @@ function(xpu_test TARGET_NAME)
lod_tensor lod_tensor
memory memory
gtest gtest
gflags phi
glog) glog)
common_link(${TARGET_NAME}) common_link(${TARGET_NAME})
add_test(${TARGET_NAME} ${TARGET_NAME}) add_test(${TARGET_NAME} ${TARGET_NAME})
......
...@@ -269,6 +269,13 @@ else() ...@@ -269,6 +269,13 @@ else()
SRCS ${src_dir}/inference/api/paddle_*.h ${paddle_inference_lib} SRCS ${src_dir}/inference/api/paddle_*.h ${paddle_inference_lib}
DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include
${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib) ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib)
set(paddle_phi_lib ${PADDLE_BINARY_DIR}/paddle/phi/libphi.*)
copy(
inference_lib_dist
SRCS ${paddle_phi_lib}
DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib)
endif() endif()
copy( copy(
......
...@@ -61,8 +61,7 @@ function(register_cu_kernel TARGET) ...@@ -61,8 +61,7 @@ function(register_cu_kernel TARGET)
"${multiValueArgs}" ${ARGN}) "${multiValueArgs}" ${ARGN})
set(cu_srcs) set(cu_srcs)
set(op_common_deps operator op_registry math_function layer set(op_common_deps operator op_registry layer common_infer_shape_functions)
common_infer_shape_functions)
foreach(cu_src ${register_cu_kernel_SRCS}) foreach(cu_src ${register_cu_kernel_SRCS})
if(${cu_src} MATCHES ".*\\.cu$") if(${cu_src} MATCHES ".*\\.cu$")
list(APPEND cu_srcs ${cu_src}) list(APPEND cu_srcs ${cu_src})
...@@ -113,7 +112,7 @@ function(register_mkldnn_kernel TARGET) ...@@ -113,7 +112,7 @@ function(register_mkldnn_kernel TARGET)
"${multiValueArgs}" ${ARGN}) "${multiValueArgs}" ${ARGN})
set(mkldnn_cc_srcs) set(mkldnn_cc_srcs)
set(op_common_deps operator op_registry math_function layer set(op_common_deps operator op_registry phi layer
common_infer_shape_functions) common_infer_shape_functions)
foreach(mkldnn_src ${register_mkldnn_kernel_SRCS}) foreach(mkldnn_src ${register_mkldnn_kernel_SRCS})
if(${mkldnn_src} MATCHES ".*_mkldnn_op.cc$") if(${mkldnn_src} MATCHES ".*_mkldnn_op.cc$")
...@@ -164,7 +163,7 @@ function(op_library TARGET) ...@@ -164,7 +163,7 @@ function(op_library TARGET)
set(MIOPEN_FILE) set(MIOPEN_FILE)
set(mkldnn_cc_srcs) set(mkldnn_cc_srcs)
set(MKLDNN_FILE) set(MKLDNN_FILE)
set(op_common_deps operator op_registry math_function layer set(op_common_deps operator op_registry phi layer
common_infer_shape_functions) common_infer_shape_functions)
# Option `UNITY` is used to specify that operator `TARGET` will compiles with Unity Build. # Option `UNITY` is used to specify that operator `TARGET` will compiles with Unity Build.
......
...@@ -94,6 +94,13 @@ function(kernel_declare TARGET_LIST) ...@@ -94,6 +94,13 @@ function(kernel_declare TARGET_LIST)
continue() continue()
endif() endif()
endif() endif()
# fusion group kernel is not supported in windows and mac
if(WIN32 OR APPLE)
string(FIND "${first_registry}" "fusion_group" pos)
if(pos GREATER 1)
continue()
endif()
endif()
# some gpu kernel only can run on cuda, not support rocm, so we add this branch # some gpu kernel only can run on cuda, not support rocm, so we add this branch
if(WITH_ROCM) if(WITH_ROCM)
string(FIND "${first_registry}" "cuda_only" pos) string(FIND "${first_registry}" "cuda_only" pos)
...@@ -216,3 +223,27 @@ function(prune_declaration_h) ...@@ -216,3 +223,27 @@ function(prune_declaration_h)
endif() endif()
endforeach() endforeach()
endfunction() endfunction()
function(collect_srcs SRC_GROUP)
set(options)
set(oneValueArgs)
set(multiValueArgs "SRCS")
cmake_parse_arguments(prefix "" "" "${multiValueArgs}" ${ARGN})
foreach(src ${prefix_SRCS})
set(${SRC_GROUP}
"${${SRC_GROUP}};${CMAKE_CURRENT_SOURCE_DIR}/${src}"
CACHE INTERNAL "")
endforeach()
endfunction()
function(collect_generated_srcs SRC_GROUP)
set(options)
set(oneValueArgs)
set(multiValueArgs "SRCS")
cmake_parse_arguments(prefix "" "" "${multiValueArgs}" ${ARGN})
foreach(src ${prefix_SRCS})
set(${SRC_GROUP}
"${${SRC_GROUP}};${src}"
CACHE INTERNAL "")
endforeach()
endfunction()
...@@ -49,5 +49,5 @@ file(GLOB PD_DIALECT_SRCS "*.cc") ...@@ -49,5 +49,5 @@ file(GLOB PD_DIALECT_SRCS "*.cc")
cc_library( cc_library(
pd_dialect pd_dialect
SRCS ${PD_DIALECT_SRCS} ${op_source_file} SRCS ${PD_DIALECT_SRCS} ${op_source_file}
DEPS new_ir framework_proto dense_tensor phi_utils) DEPS new_ir framework_proto phi phi_utils)
target_include_directories(pd_dialect PRIVATE ${PD_DIALECT_BINARY_DIR}) target_include_directories(pd_dialect PRIVATE ${PD_DIALECT_BINARY_DIR})
cc_library( cc_library(
op_dist_attr op_dist_attr
SRCS dist_attr.cc SRCS dist_attr.cc
DEPS dist_attr process_mesh dist_mapper auto_parallel_proto proto_desc DEPS phi auto_parallel_proto proto_desc)
phi_enforce)
add_subdirectory(test) add_subdirectory(test)
cc_test( cc_test(
device_mesh_test device_mesh_test
SRCS device_mesh_test.cc SRCS device_mesh_test.cc
DEPS device_mesh) DEPS phi)
cc_test( cc_test(
process_mesh_test process_mesh_test
SRCS process_mesh_test.cc SRCS process_mesh_test.cc
DEPS process_mesh) DEPS phi)
cc_test( cc_test(
dist_attr_test dist_attr_test
SRCS dist_attr_test.cc SRCS dist_attr_test.cc
DEPS dist_attr proto_desc) DEPS phi proto_desc)
cc_test( cc_test(
dist_mapper_test dist_mapper_test
SRCS dist_mapper_test.cc SRCS dist_mapper_test.cc
DEPS dist_mapper) DEPS phi)
cc_library( cc_library(
process_group process_group
SRCS process_group.cc SRCS process_group.cc
DEPS dense_tensor xxhash) DEPS phi xxhash)
cc_library( cc_library(
eager_reducer eager_reducer
SRCS reducer.cc SRCS reducer.cc
DEPS eager_api process_group phi_api string_helper) DEPS eager_api process_group phi string_helper)
if(WITH_DISTRIBUTE) if(WITH_DISTRIBUTE)
cc_library( cc_library(
process_group_gloo process_group_gloo
SRCS process_group_gloo.cc gloo_send_recv.cc SRCS process_group_gloo.cc gloo_send_recv.cc
DEPS phi_api eager_api gloo_wrapper tcp_store) DEPS phi eager_api gloo_wrapper)
endif() endif()
if(WITH_NCCL OR WITH_RCCL) if(WITH_NCCL OR WITH_RCCL)
...@@ -20,28 +20,19 @@ if(WITH_NCCL OR WITH_RCCL) ...@@ -20,28 +20,19 @@ if(WITH_NCCL OR WITH_RCCL)
process_group_nccl process_group_nccl
SRCS process_group_nccl.cc nccl_tools.cc common.cc SRCS process_group_nccl.cc nccl_tools.cc common.cc
DEPS process_group DEPS process_group
tcp_store phi
place place
enforce enforce
collective_helper collective_helper
device_context device_context
${DEVICE_EVENT_LIBS} ${DEVICE_EVENT_LIBS})
dense_tensor
comm_static_check
nccl_dynamic_check)
endif() endif()
if(WITH_XPU_BKCL) if(WITH_XPU_BKCL)
cc_library( cc_library(
process_group_bkcl process_group_bkcl
SRCS process_group_bkcl.cc bkcl_tools.cc common.cc SRCS process_group_bkcl.cc bkcl_tools.cc common.cc
DEPS process_group DEPS process_group phi place enforce collective_helper device_context)
tcp_store
place
enforce
collective_helper
device_context
dense_tensor)
endif() endif()
if(WITH_MPI) if(WITH_MPI)
...@@ -55,15 +46,7 @@ if(WITH_CUSTOM_DEVICE) ...@@ -55,15 +46,7 @@ if(WITH_CUSTOM_DEVICE)
cc_library( cc_library(
process_group_custom process_group_custom
SRCS process_group_custom.cc custom_ccl_tools.cc common.cc SRCS process_group_custom.cc custom_ccl_tools.cc common.cc
DEPS process_group DEPS process_group phi place enforce collective_helper device_context)
tcp_store
phi_backends
place
enforce
collective_helper
device_context
comm_static_check
dense_tensor)
endif() endif()
set(COMM_UTILS_DEPS process_group) set(COMM_UTILS_DEPS process_group)
......
...@@ -5,7 +5,7 @@ endif() ...@@ -5,7 +5,7 @@ endif()
proto_library(interceptor_message_proto SRCS interceptor_message.proto) proto_library(interceptor_message_proto SRCS interceptor_message.proto)
if(WITH_ARM_BRPC) if(WITH_ARM_BRPC)
set(BRPC_DEPS arm_brpc snappy gflags glog) set(BRPC_DEPS arm_brpc snappy phi glog)
elseif(WITH_DISTRIBUTE AND NOT WITH_PSLIB) elseif(WITH_DISTRIBUTE AND NOT WITH_PSLIB)
set(BRPC_DEPS set(BRPC_DEPS
brpc brpc
...@@ -15,7 +15,7 @@ elseif(WITH_DISTRIBUTE AND NOT WITH_PSLIB) ...@@ -15,7 +15,7 @@ elseif(WITH_DISTRIBUTE AND NOT WITH_PSLIB)
zlib zlib
leveldb leveldb
snappy snappy
gflags phi
glog) glog)
else() else()
set(BRPC_DEPS "") set(BRPC_DEPS "")
...@@ -51,7 +51,7 @@ cc_library( ...@@ -51,7 +51,7 @@ cc_library(
collective_helper collective_helper
op_registry op_registry
executor_gc_helper executor_gc_helper
gflags phi
glog glog
${BRPC_DEPS}) ${BRPC_DEPS})
......
...@@ -8,12 +8,11 @@ if(WITH_HETERPS) ...@@ -8,12 +8,11 @@ if(WITH_HETERPS)
ssl ssl
crypto crypto
protobuf protobuf
gflags phi
glog glog
zlib zlib
leveldb leveldb
snappy snappy
gflags
glog glog
device_context device_context
rocksdb) rocksdb)
...@@ -25,12 +24,11 @@ else() ...@@ -25,12 +24,11 @@ else()
ssl ssl
crypto crypto
protobuf protobuf
gflags phi
glog glog
zlib zlib
leveldb leveldb
snappy snappy
gflags
glog glog
device_context) device_context)
...@@ -122,8 +120,7 @@ cc_library( ...@@ -122,8 +120,7 @@ cc_library(
simple_threadpool simple_threadpool
simple_rpc simple_rpc
scope scope
math_function phi
selected_rows_functor
ps_gpu_wrapper ps_gpu_wrapper
${RPC_DEPS}) ${RPC_DEPS})
...@@ -150,7 +147,7 @@ cc_library( ...@@ -150,7 +147,7 @@ cc_library(
#cc_library( #cc_library(
# communicator # communicator
# SRCS communicator/communicator.cc # SRCS communicator/communicator.cc
# DEPS scope client table math_function selected_rows_functor ${RPC_DEPS}) # DEPS scope client table phi ${RPC_DEPS})
#cc_library( #cc_library(
# ps_service # ps_service
# SRCS ps_service/service.cc # SRCS ps_service/service.cc
......
...@@ -48,7 +48,7 @@ cc_library( ...@@ -48,7 +48,7 @@ cc_library(
string_helper string_helper
simple_threadpool simple_threadpool
xxhash xxhash
generator) phi)
set_source_files_properties( set_source_files_properties(
tensor_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) tensor_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
...@@ -91,7 +91,7 @@ cc_library( ...@@ -91,7 +91,7 @@ cc_library(
ps_framework_proto ps_framework_proto
string_helper string_helper
device_context device_context
gflags phi
glog glog
fs fs
afs_wrapper afs_wrapper
......
...@@ -20,7 +20,7 @@ set(PADDLE_RPC_DEPS ...@@ -20,7 +20,7 @@ set(PADDLE_RPC_DEPS
zlib zlib
leveldb leveldb
snappy snappy
gflags phi
glog glog
pybind) pybind)
proto_library(paddle_rpc_proto SRCS rpc.proto) proto_library(paddle_rpc_proto SRCS rpc.proto)
......
...@@ -73,7 +73,7 @@ cc_test_old( ...@@ -73,7 +73,7 @@ cc_test_old(
DEPS DEPS
brpc_utils brpc_utils
scope scope
math_function phi
${COMMON_DEPS} ${COMMON_DEPS}
${RPC_DEPS}) ${RPC_DEPS})
......
set(eager_deps set(eager_deps
phi_api phi
phi_dygraph_api
hook_utils hook_utils
tensor_utils tensor_utils
utils utils
global_utils global_utils
backward backward
phi_tensor
tracer tracer
layer layer
autograd_meta autograd_meta
...@@ -48,27 +46,26 @@ if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) ...@@ -48,27 +46,26 @@ if(NOT ((NOT WITH_PYTHON) AND ON_INFER))
cc_library( cc_library(
backward backward
SRCS backward.cc SRCS backward.cc
DEPS grad_tensor_holder utils autograd_meta grad_node_info switch_autotune) DEPS grad_tensor_holder utils autograd_meta grad_node_info phi)
endif() endif()
cc_library( cc_library(
eager_nan_inf_utils eager_nan_inf_utils
SRCS nan_inf_utils.cc SRCS nan_inf_utils.cc
DEPS phi_tensor nan_inf_utils enforce) DEPS phi nan_inf_utils enforce)
cc_library( cc_library(
grad_node_info grad_node_info
SRCS grad_node_info.cc SRCS grad_node_info.cc
DEPS phi_api phi_tensor) DEPS phi)
cc_library( cc_library(
autograd_meta autograd_meta
SRCS autograd_meta.cc SRCS autograd_meta.cc
DEPS phi_api phi_tensor) DEPS phi)
cc_library( cc_library(
utils utils
SRCS utils.cc SRCS utils.cc
DEPS phi_api DEPS phi
phi_tensor
global_utils global_utils
layer layer
proto_desc proto_desc
......
...@@ -2,5 +2,5 @@ if(NOT (NOT WITH_PYTHON AND ON_INFER)) ...@@ -2,5 +2,5 @@ if(NOT (NOT WITH_PYTHON AND ON_INFER))
cc_library( cc_library(
accumulation_node accumulation_node
SRCS accumulation_node.cc SRCS accumulation_node.cc
DEPS gradient_accumulator phi_api grad_node_info) DEPS gradient_accumulator phi grad_node_info)
endif() endif()
cc_library( cc_library(
scale_node scale_node
SRCS scale_node.cc SRCS scale_node.cc
DEPS global_utils phi phi_api grad_node_info) DEPS global_utils phi grad_node_info)
if(NOT (NOT WITH_PYTHON AND ON_INFER)) if(NOT (NOT WITH_PYTHON AND ON_INFER))
cc_library( cc_library(
......
cc_library( cc_library(
eager_scale eager_scale
SRCS scale.cc SRCS scale.cc
DEPS phi_api phi autograd_meta scale_node) DEPS phi autograd_meta scale_node)
if(NOT (NOT WITH_PYTHON AND ON_INFER)) if(NOT (NOT WITH_PYTHON AND ON_INFER))
cc_library( cc_library(
......
...@@ -7,7 +7,7 @@ if(NOT (NOT WITH_PYTHON AND ON_INFER)) ...@@ -7,7 +7,7 @@ if(NOT (NOT WITH_PYTHON AND ON_INFER))
cc_library( cc_library(
tensor_utils tensor_utils
SRCS tensor_utils.cc SRCS tensor_utils.cc
DEPS phi_api autograd_meta grad_node_info accumulation_node) DEPS phi autograd_meta grad_node_info accumulation_node)
cc_library( cc_library(
hook_utils hook_utils
SRCS hook_utils.cc SRCS hook_utils.cc
...@@ -16,7 +16,7 @@ else() ...@@ -16,7 +16,7 @@ else()
cc_library( cc_library(
tensor_utils tensor_utils
SRCS tensor_utils.cc SRCS tensor_utils.cc
DEPS phi_api autograd_meta grad_node_info) DEPS phi autograd_meta grad_node_info)
cc_library( cc_library(
hook_utils hook_utils
SRCS hook_utils.cc SRCS hook_utils.cc
......
...@@ -52,6 +52,15 @@ if(WIN32) ...@@ -52,6 +52,15 @@ if(WIN32)
set(eager_generator_path "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}") set(eager_generator_path "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}")
endif() endif()
if(WITH_PHI_SHARED)
message("Copied phi.dll for Eager AutoCodeGen")
add_custom_command(
OUTPUT ${eager_generator_path}/phi.dll
COMMAND ${CMAKE_COMMAND} -E copy ${PHI_LIB} ${eager_generator_path}
DEPENDS phi)
list(APPEND EAGER_CODEGEN_DEPS ${eager_generator_path}/phi.dll)
endif()
if(${CBLAS_PROVIDER} STREQUAL MKLML) if(${CBLAS_PROVIDER} STREQUAL MKLML)
message("Copied libiomp5md.dll for Eager AutoCodeGen") message("Copied libiomp5md.dll for Eager AutoCodeGen")
add_custom_command( add_custom_command(
......
...@@ -392,7 +392,7 @@ FORWARD_CC_FILE_TEMPLATE = """ ...@@ -392,7 +392,7 @@ FORWARD_CC_FILE_TEMPLATE = """
#include "paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h" #include "paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h"
#include "paddle/phi/core/flags.h" #include "paddle/phi/core/flags.h"
DECLARE_bool(check_nan_inf); PHI_DECLARE_bool(check_nan_inf);
PHI_DECLARE_string(tensor_operants_mode); PHI_DECLARE_string(tensor_operants_mode);
{} {}
{} {}
......
cc_library( cc_library(
custom_operator_node custom_operator_node
SRCS custom_operator_node.cc SRCS custom_operator_node.cc
DEPS phi_tensor phi_api grad_node_info custom_operator op_meta_info) DEPS phi grad_node_info custom_operator)
cc_library( cc_library(
py_layer_node py_layer_node
SRCS py_layer_node.cc SRCS py_layer_node.cc
DEPS pybind phi_api grad_node_info) DEPS pybind phi grad_node_info)
...@@ -115,7 +115,7 @@ proto_library(trainer_desc_proto SRCS trainer_desc.proto DEPS framework_proto ...@@ -115,7 +115,7 @@ proto_library(trainer_desc_proto SRCS trainer_desc.proto DEPS framework_proto
cc_library( cc_library(
string_array string_array
SRCS string_array.cc SRCS string_array.cc
DEPS utf8proc phi_enforce) DEPS utf8proc phi)
cc_library( cc_library(
data_type data_type
...@@ -130,7 +130,7 @@ cc_test( ...@@ -130,7 +130,7 @@ cc_test(
cc_library( cc_library(
tensor tensor
SRCS tensor_util.cc SRCS tensor_util.cc
DEPS place memory data_type device_context dense_tensor) DEPS place memory data_type device_context phi)
cc_test( cc_test(
tensor_test tensor_test
...@@ -166,12 +166,12 @@ cc_test( ...@@ -166,12 +166,12 @@ cc_test(
cc_library( cc_library(
lod_tensor lod_tensor
SRCS lod_tensor.cc SRCS lod_tensor.cc
DEPS ddim mixed_vector place tensor framework_proto version) DEPS phi place tensor framework_proto version)
cc_test( cc_test(
lod_tensor_test lod_tensor_test
SRCS lod_tensor_test.cc SRCS lod_tensor_test.cc
DEPS lod_utils lod_tensor memory) DEPS phi lod_tensor memory)
if(WITH_GPU) if(WITH_GPU)
nv_test( nv_test(
...@@ -188,12 +188,12 @@ endif() ...@@ -188,12 +188,12 @@ endif()
cc_library( cc_library(
garbage_collector garbage_collector
SRCS garbage_collector.cc SRCS garbage_collector.cc
DEPS device_context memory gflags glog) DEPS device_context memory phi glog)
cc_library( cc_library(
reader reader
SRCS reader.cc SRCS reader.cc
DEPS lod_tensor ddim) DEPS lod_tensor phi)
cc_test( cc_test(
reader_test reader_test
SRCS reader_test.cc SRCS reader_test.cc
...@@ -202,13 +202,12 @@ cc_test( ...@@ -202,13 +202,12 @@ cc_test(
cc_test( cc_test(
threadpool_test threadpool_test
SRCS threadpool_test.cc SRCS threadpool_test.cc
DEPS threadpool) DEPS phi)
cc_library( cc_library(
var_type_traits var_type_traits
SRCS var_type_traits.cc SRCS var_type_traits.cc
DEPS framework_proto scope tensor_array sparse_coo_tensor sparse_csr_tensor DEPS framework_proto scope phi)
extended_tensor)
if(WITH_GPU) if(WITH_GPU)
target_link_libraries(var_type_traits dynload_cuda) target_link_libraries(var_type_traits dynload_cuda)
endif() endif()
...@@ -242,7 +241,7 @@ endif() ...@@ -242,7 +241,7 @@ endif()
cc_library( cc_library(
scope scope
SRCS scope.cc SRCS scope.cc
DEPS glog threadpool xxhash var_type_traits) DEPS glog phi xxhash var_type_traits)
cc_library( cc_library(
device_worker device_worker
SRCS device_worker.cc SRCS device_worker.cc
...@@ -273,12 +272,12 @@ if(WITH_GPU) ...@@ -273,12 +272,12 @@ if(WITH_GPU)
nv_test( nv_test(
data_device_transform_test data_device_transform_test
SRCS data_device_transform_test.cu SRCS data_device_transform_test.cu
DEPS operator op_registry device_context math_function scope) DEPS operator op_registry device_context phi scope)
elseif(WITH_ROCM) elseif(WITH_ROCM)
hip_test( hip_test(
data_device_transform_test data_device_transform_test
SRCS data_device_transform_test.cu SRCS data_device_transform_test.cu
DEPS operator op_registry device_context math_function scope) DEPS operator op_registry device_context phi scope)
endif() endif()
if(WITH_GPU) if(WITH_GPU)
...@@ -333,7 +332,7 @@ endif() ...@@ -333,7 +332,7 @@ endif()
cc_library( cc_library(
data_layout_transform data_layout_transform
SRCS data_layout_transform.cc SRCS data_layout_transform.cc
DEPS tensor math_function phi_data_layout_transform) DEPS tensor phi)
cc_test( cc_test(
data_layout_transform_test data_layout_transform_test
SRCS data_layout_transform_test.cc SRCS data_layout_transform_test.cc
...@@ -342,14 +341,13 @@ cc_test( ...@@ -342,14 +341,13 @@ cc_test(
cc_library( cc_library(
data_transform data_transform
SRCS data_transform.cc SRCS data_transform.cc
DEPS math_function DEPS phi
tensor tensor
framework_proto framework_proto
selected_rows_utils selected_rows_utils
data_device_transform data_device_transform
data_type_transform data_type_transform
data_layout_transform data_layout_transform)
phi_data_transform)
cc_library( cc_library(
attribute attribute
...@@ -400,7 +398,7 @@ cc_library( ...@@ -400,7 +398,7 @@ cc_library(
cc_library( cc_library(
shape_inference shape_inference
SRCS shape_inference.cc SRCS shape_inference.cc
DEPS ddim attribute selected_rows_utils) DEPS phi attribute selected_rows_utils)
# every source file that includes "dnnl.h" must depends on mkldnn # every source file that includes "dnnl.h" must depends on mkldnn
# or, the first one should depends on mkldnn # or, the first one should depends on mkldnn
...@@ -433,30 +431,17 @@ if(WITH_XPU) ...@@ -433,30 +431,17 @@ if(WITH_XPU)
phi_utils phi_utils
SRCS phi_utils.cc SRCS phi_utils.cc
DEPS lod_tensor DEPS lod_tensor
dense_tensor
selected_rows_utils selected_rows_utils
int_array
scalar
place place
phi phi
var_type_traits var_type_traits
op_info op_info
xpu_op_list xpu_op_list)
convert_utils)
else() else()
cc_library( cc_library(
phi_utils phi_utils
SRCS phi_utils.cc SRCS phi_utils.cc
DEPS lod_tensor DEPS lod_tensor selected_rows_utils place phi var_type_traits op_info)
dense_tensor
selected_rows_utils
int_array
scalar
place
phi
var_type_traits
op_info
convert_utils)
endif() endif()
if(WITH_XPU) if(WITH_XPU)
...@@ -482,11 +467,10 @@ if(WITH_XPU) ...@@ -482,11 +467,10 @@ if(WITH_XPU)
unused_var_check unused_var_check
nan_inf_utils nan_inf_utils
phi_utils phi_utils
kernel_factory
infershape_utils infershape_utils
op_utils phi
op_compat_infos op_compat_infos
get_kerneltype_forvar_utils) type_info)
else() else()
cc_library( cc_library(
operator operator
...@@ -509,11 +493,10 @@ else() ...@@ -509,11 +493,10 @@ else()
unused_var_check unused_var_check
nan_inf_utils nan_inf_utils
phi_utils phi_utils
kernel_factory
infershape_utils infershape_utils
op_utils phi
op_compat_infos op_compat_infos
get_kerneltype_forvar_utils) type_info)
endif() endif()
cc_test( cc_test(
...@@ -543,7 +526,7 @@ cc_library( ...@@ -543,7 +526,7 @@ cc_library(
version version
xxhash xxhash
op_dist_attr op_dist_attr
scalar phi
op_version_proto op_version_proto
op_version_registry) op_version_registry)
...@@ -853,7 +836,7 @@ if(WITH_DISTRIBUTE) ...@@ -853,7 +836,7 @@ if(WITH_DISTRIBUTE)
heter_server heter_server
brpc brpc
fleet_executor fleet_executor
flags) phi)
set(DISTRIBUTE_COMPILE_FLAGS "") set(DISTRIBUTE_COMPILE_FLAGS "")
if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
set(DISTRIBUTE_COMPILE_FLAGS "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new") set(DISTRIBUTE_COMPILE_FLAGS "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new")
...@@ -1071,7 +1054,7 @@ if(WITH_PSCORE) ...@@ -1071,7 +1054,7 @@ if(WITH_PSCORE)
executor executor
heter_server heter_server
gloo_wrapper gloo_wrapper
eigen_function phi
${RPC_DEPS} ${RPC_DEPS}
graph_gpu_wrapper) graph_gpu_wrapper)
else() else()
...@@ -1088,7 +1071,7 @@ if(WITH_PSCORE) ...@@ -1088,7 +1071,7 @@ if(WITH_PSCORE)
executor executor
heter_server heter_server
gloo_wrapper gloo_wrapper
eigen_function phi
${RPC_DEPS}) ${RPC_DEPS})
endif() endif()
else() else()
...@@ -1112,7 +1095,7 @@ cc_test( ...@@ -1112,7 +1095,7 @@ cc_test(
cc_library( cc_library(
selected_rows_utils selected_rows_utils
SRCS selected_rows_utils.cc SRCS selected_rows_utils.cc
DEPS selected_rows device_context) DEPS phi device_context)
cc_test( cc_test(
selected_rows_utils_test selected_rows_utils_test
SRCS selected_rows_utils_test.cc SRCS selected_rows_utils_test.cc
...@@ -1162,12 +1145,11 @@ cc_library( ...@@ -1162,12 +1145,11 @@ cc_library(
phi phi
phi_utils phi_utils
op_info op_info
shape_inference shape_inference)
sparse_coo_tensor)
cc_test( cc_test(
infershape_utils_test infershape_utils_test
SRCS infershape_utils_test.cc SRCS infershape_utils_test.cc
DEPS infershape_utils infermeta_utils meta_tensor) DEPS infershape_utils phi)
# Get the current working branch # Get the current working branch
execute_process( execute_process(
...@@ -1198,12 +1180,15 @@ cc_library( ...@@ -1198,12 +1180,15 @@ cc_library(
operator operator
dynamic_loader dynamic_loader
string_helper string_helper
phi_tensor phi
op_meta_info imperative_flag
phi_api layer)
tensor_api
phi_tensor_operants cc_library(type_info SRCS type_info.cc)
operants_manager) add_dependencies(type_info framework_proto auto_parallel_proto xxhash)
if(WITH_MKLDNN)
add_dependencies(type_info mkldnn)
endif()
set(FLUID_FRAMEWORK_MODULES set(FLUID_FRAMEWORK_MODULES
proto_desc proto_desc
......
...@@ -10,15 +10,15 @@ cc_library( ...@@ -10,15 +10,15 @@ cc_library(
cc_library( cc_library(
scale_loss_grad_op_handle scale_loss_grad_op_handle
SRCS scale_loss_grad_op_handle.cc SRCS scale_loss_grad_op_handle.cc
DEPS op_handle_base scope lod_tensor ddim memory) DEPS op_handle_base scope lod_tensor phi memory)
cc_library( cc_library(
fetch_op_handle fetch_op_handle
SRCS fetch_op_handle.cc SRCS fetch_op_handle.cc
DEPS op_handle_base scope lod_tensor ddim memory) DEPS op_handle_base scope lod_tensor phi memory)
cc_library( cc_library(
fetch_async_op_handle fetch_async_op_handle
SRCS fetch_async_op_handle.cc SRCS fetch_async_op_handle.cc
DEPS op_handle_base scope lod_tensor ddim memory) DEPS op_handle_base scope lod_tensor phi memory)
cc_library( cc_library(
share_tensor_buffer_functor share_tensor_buffer_functor
...@@ -78,7 +78,7 @@ if(WITH_GPU) ...@@ -78,7 +78,7 @@ if(WITH_GPU)
DEPS op_handle_base DEPS op_handle_base
scope scope
lod_tensor lod_tensor
ddim phi
memory memory
dynload_cuda dynload_cuda
variable_visitor) variable_visitor)
...@@ -88,7 +88,7 @@ if(WITH_GPU) ...@@ -88,7 +88,7 @@ if(WITH_GPU)
DEPS op_handle_base DEPS op_handle_base
scope scope
lod_tensor lod_tensor
ddim phi
memory memory
dynload_cuda dynload_cuda
variable_visitor variable_visitor
...@@ -99,7 +99,7 @@ if(WITH_GPU) ...@@ -99,7 +99,7 @@ if(WITH_GPU)
DEPS op_handle_base DEPS op_handle_base
scope scope
lod_tensor lod_tensor
ddim phi
memory memory
dynload_cuda dynload_cuda
variable_visitor variable_visitor
...@@ -114,7 +114,7 @@ if(WITH_GPU) ...@@ -114,7 +114,7 @@ if(WITH_GPU)
DEPS op_handle_base DEPS op_handle_base
scope scope
lod_tensor lod_tensor
ddim phi
memory memory
dynload_cuda dynload_cuda
variable_visitor variable_visitor
...@@ -126,19 +126,17 @@ if(WITH_GPU) ...@@ -126,19 +126,17 @@ if(WITH_GPU)
nv_library( nv_library(
reduce_op_handle reduce_op_handle
SRCS reduce_op_handle.cc SRCS reduce_op_handle.cc
DEPS op_handle_base variable_visitor scope ddim dynload_cuda DEPS op_handle_base variable_visitor scope phi dynload_cuda)
selected_rows_functor)
else() else()
nv_library( nv_library(
reduce_op_handle reduce_op_handle
SRCS reduce_op_handle.cc SRCS reduce_op_handle.cc
DEPS op_handle_base variable_visitor scope ddim dynload_cuda DEPS op_handle_base variable_visitor scope phi dynload_cuda)
selected_rows_functor)
endif() endif()
nv_library( nv_library(
broadcast_op_handle broadcast_op_handle
SRCS broadcast_op_handle.cc SRCS broadcast_op_handle.cc
DEPS op_handle_base scope ddim memory variable_visitor dynload_cuda) DEPS op_handle_base scope phi memory variable_visitor dynload_cuda)
nv_library( nv_library(
fused_broadcast_op_handle fused_broadcast_op_handle
SRCS fused_broadcast_op_handle.cc SRCS fused_broadcast_op_handle.cc
...@@ -154,7 +152,7 @@ elseif(WITH_ROCM) ...@@ -154,7 +152,7 @@ elseif(WITH_ROCM)
DEPS op_handle_base DEPS op_handle_base
scope scope
lod_tensor lod_tensor
ddim phi
memory memory
dynload_cuda dynload_cuda
variable_visitor) variable_visitor)
...@@ -164,7 +162,7 @@ elseif(WITH_ROCM) ...@@ -164,7 +162,7 @@ elseif(WITH_ROCM)
DEPS op_handle_base DEPS op_handle_base
scope scope
lod_tensor lod_tensor
ddim phi
memory memory
dynload_cuda dynload_cuda
variable_visitor variable_visitor
...@@ -175,7 +173,7 @@ elseif(WITH_ROCM) ...@@ -175,7 +173,7 @@ elseif(WITH_ROCM)
DEPS op_handle_base DEPS op_handle_base
scope scope
lod_tensor lod_tensor
ddim phi
memory memory
dynload_cuda dynload_cuda
variable_visitor variable_visitor
...@@ -187,19 +185,17 @@ elseif(WITH_ROCM) ...@@ -187,19 +185,17 @@ elseif(WITH_ROCM)
hip_library( hip_library(
reduce_op_handle reduce_op_handle
SRCS reduce_op_handle.cc SRCS reduce_op_handle.cc
DEPS op_handle_base variable_visitor scope ddim dynload_cuda DEPS op_handle_base variable_visitor scope phi dynload_cuda)
selected_rows_functor)
else() else()
hip_library( hip_library(
reduce_op_handle reduce_op_handle
SRCS reduce_op_handle.cc SRCS reduce_op_handle.cc
DEPS op_handle_base variable_visitor scope ddim dynload_cuda DEPS op_handle_base variable_visitor scope phi dynload_cuda)
selected_rows_functor)
endif() endif()
hip_library( hip_library(
broadcast_op_handle broadcast_op_handle
SRCS broadcast_op_handle.cc SRCS broadcast_op_handle.cc
DEPS op_handle_base scope ddim memory variable_visitor dynload_cuda) DEPS op_handle_base scope phi memory variable_visitor dynload_cuda)
hip_library( hip_library(
fused_broadcast_op_handle fused_broadcast_op_handle
SRCS fused_broadcast_op_handle.cc SRCS fused_broadcast_op_handle.cc
...@@ -212,14 +208,14 @@ else() ...@@ -212,14 +208,14 @@ else()
cc_library( cc_library(
all_reduce_op_handle all_reduce_op_handle
SRCS all_reduce_op_handle.cc SRCS all_reduce_op_handle.cc
DEPS op_handle_base scope lod_tensor ddim memory variable_visitor) DEPS op_handle_base scope lod_tensor phi memory variable_visitor)
cc_library( cc_library(
fused_all_reduce_op_handle fused_all_reduce_op_handle
SRCS fused_all_reduce_op_handle.cc SRCS fused_all_reduce_op_handle.cc
DEPS op_handle_base DEPS op_handle_base
scope scope
lod_tensor lod_tensor
ddim phi
memory memory
variable_visitor variable_visitor
place) place)
...@@ -229,7 +225,7 @@ else() ...@@ -229,7 +225,7 @@ else()
DEPS op_handle_base DEPS op_handle_base
scope scope
lod_tensor lod_tensor
ddim phi
memory memory
variable_visitor variable_visitor
place place
...@@ -239,17 +235,17 @@ else() ...@@ -239,17 +235,17 @@ else()
cc_library( cc_library(
reduce_op_handle reduce_op_handle
SRCS reduce_op_handle.cc SRCS reduce_op_handle.cc
DEPS op_handle_base variable_visitor scope ddim selected_rows_functor) DEPS op_handle_base variable_visitor scope phi)
else() else()
cc_library( cc_library(
reduce_op_handle reduce_op_handle
SRCS reduce_op_handle.cc SRCS reduce_op_handle.cc
DEPS op_handle_base variable_visitor scope ddim selected_rows_functor) DEPS op_handle_base variable_visitor scope phi)
endif() endif()
cc_library( cc_library(
broadcast_op_handle broadcast_op_handle
SRCS broadcast_op_handle.cc SRCS broadcast_op_handle.cc
DEPS op_handle_base scope ddim memory variable_visitor) DEPS op_handle_base scope phi memory variable_visitor)
cc_library( cc_library(
fused_broadcast_op_handle fused_broadcast_op_handle
SRCS fused_broadcast_op_handle.cc SRCS fused_broadcast_op_handle.cc
...@@ -259,7 +255,7 @@ endif() ...@@ -259,7 +255,7 @@ endif()
cc_library( cc_library(
gather_op_handle gather_op_handle
SRCS gather_op_handle.cc SRCS gather_op_handle.cc
DEPS op_handle_base scope ddim memory variable_visitor) DEPS op_handle_base scope phi memory variable_visitor)
cc_library( cc_library(
eager_deletion_op_handle eager_deletion_op_handle
...@@ -305,7 +301,7 @@ cc_test( ...@@ -305,7 +301,7 @@ cc_test(
DEPS var_handle DEPS var_handle
op_handle_base op_handle_base
scope scope
ddim phi
memory memory
device_context device_context
broadcast_op_handle) broadcast_op_handle)
...@@ -317,7 +313,7 @@ cc_test_old( ...@@ -317,7 +313,7 @@ cc_test_old(
var_handle var_handle
op_handle_base op_handle_base
scope scope
ddim phi
memory memory
device_context device_context
gather_op_handle) gather_op_handle)
...@@ -330,12 +326,12 @@ cc_library( ...@@ -330,12 +326,12 @@ cc_library(
scope_buffered_ssa_graph_executor scope_buffered_ssa_graph_executor
SRCS scope_buffered_ssa_graph_executor.cc SRCS scope_buffered_ssa_graph_executor.cc
DEPS ssa_graph_executor scope_buffered_monitor) DEPS ssa_graph_executor scope_buffered_monitor)
#cc_test(reduce_op_handle_test SRCS reduce_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory #cc_test(reduce_op_handle_test SRCS reduce_op_handle_test.cc DEPS var_handle op_handle_base scope phi memory
# device_context reduce_op_handle ) # device_context reduce_op_handle )
cc_library( cc_library(
bind_threaded_ssa_graph_executor bind_threaded_ssa_graph_executor
SRCS bind_threaded_ssa_graph_executor.cc SRCS bind_threaded_ssa_graph_executor.cc
DEPS fetch_op_handle gflags ssa_graph_executor scope simple_threadpool DEPS fetch_op_handle phi ssa_graph_executor scope simple_threadpool
device_context) device_context)
cc_library( cc_library(
fast_threaded_ssa_graph_executor fast_threaded_ssa_graph_executor
......
...@@ -20,9 +20,10 @@ limitations under the License. */ ...@@ -20,9 +20,10 @@ limitations under the License. */
#include "paddle/fluid/framework/details/reduce_op_handle.h" #include "paddle/fluid/framework/details/reduce_op_handle.h"
#include "paddle/fluid/framework/ir/graph_printer.h" #include "paddle/fluid/framework/ir/graph_printer.h"
#include "paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h" #include "paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h"
#include "paddle/phi/core/flags.h"
DECLARE_bool(convert_all_blocks); DECLARE_bool(convert_all_blocks);
DECLARE_bool(use_mkldnn); PHI_DECLARE_bool(use_mkldnn);
#ifdef PADDLE_WITH_CINN #ifdef PADDLE_WITH_CINN
DECLARE_bool(use_cinn); DECLARE_bool(use_cinn);
#endif #endif
......
...@@ -32,7 +32,7 @@ cc_library( ...@@ -32,7 +32,7 @@ cc_library(
cc_library( cc_library(
cost_model cost_model
SRCS cost_model.cc SRCS cost_model.cc
DEPS executor graph profiler proto_desc phi_device_tracer) DEPS executor graph profiler proto_desc phi)
set(GRAPH_PATTERN_DETECTOR_DEPS graph graph_helper graph_traits) set(GRAPH_PATTERN_DETECTOR_DEPS graph graph_helper graph_traits)
if(WITH_TESTING) if(WITH_TESTING)
...@@ -458,9 +458,6 @@ if(WITH_MKLDNN) ...@@ -458,9 +458,6 @@ if(WITH_MKLDNN)
graph_to_program_pass graph_to_program_pass
conv_op conv_op
conv_transpose_op conv_transpose_op
math_function
im2col
vol2col
batch_norm_op batch_norm_op
generated_op generated_op
activation_op activation_op
...@@ -468,7 +465,7 @@ if(WITH_MKLDNN) ...@@ -468,7 +465,7 @@ if(WITH_MKLDNN)
concat_and_split concat_and_split
naive_executor naive_executor
device_context device_context
eigen_function) phi)
if(WITH_GPU OR WITH_ROCM) if(WITH_GPU OR WITH_ROCM)
set(TEST_CONV_BN_PASS_DEPS ${TEST_CONV_BN_PASS_DEPS} depthwise_conv) set(TEST_CONV_BN_PASS_DEPS ${TEST_CONV_BN_PASS_DEPS} depthwise_conv)
endif() endif()
......
...@@ -221,7 +221,7 @@ bool InitAndCheckAttrs(const size_t &found_adamw_count, ...@@ -221,7 +221,7 @@ bool InitAndCheckAttrs(const size_t &found_adamw_count,
} }
} }
// Check whether with_decay and multi_precision are matched // Check whether with_decay and multi_precision are matched
if (config->with_decay != if (config->with_decay !=
PADDLE_GET_CONST(bool, adamw_op_desc->GetAttr("with_decay")) || PADDLE_GET_CONST(bool, adamw_op_desc->GetAttr("with_decay")) ||
config->multi_precision != config->multi_precision !=
......
...@@ -6,13 +6,13 @@ if(WITH_GPU OR WITH_ROCM) ...@@ -6,13 +6,13 @@ if(WITH_GPU OR WITH_ROCM)
cc_test( cc_test(
test_code_generator test_code_generator
SRCS code_generator_tester.cc SRCS code_generator_tester.cc
DEPS code_generator phi_backends lod_tensor graph_viz_pass) DEPS code_generator phi lod_tensor graph_viz_pass)
endif() endif()
cc_library( cc_library(
fusion_group_pass fusion_group_pass
SRCS fusion_group_pass.cc elementwise_group_detector.cc SRCS fusion_group_pass.cc elementwise_group_detector.cc
DEPS subgraph_detector fuse_pass_base code_generator phi_backends) DEPS subgraph_detector fuse_pass_base code_generator phi)
cc_test( cc_test(
test_fusion_group_pass test_fusion_group_pass
SRCS fusion_group_pass_tester.cc SRCS fusion_group_pass_tester.cc
......
...@@ -76,5 +76,4 @@ cc_library( ...@@ -76,5 +76,4 @@ cc_library(
cc_test( cc_test(
test_reference_count_pass_last_lived_ops test_reference_count_pass_last_lived_ops
SRCS test_reference_count_pass_last_lived_ops.cc SRCS test_reference_count_pass_last_lived_ops.cc
DEPS parallel_executor elementwise_mul_op elementwise_add_op generated_op DEPS parallel_executor elementwise_mul_op elementwise_add_op generated_op phi)
eigen_function)
...@@ -16,4 +16,4 @@ cc_library( ...@@ -16,4 +16,4 @@ cc_library(
cc_library( cc_library(
staticgraph_executor_statistics staticgraph_executor_statistics
SRCS executor_statistics.cc SRCS executor_statistics.cc
DEPS enforce glog phi_os_info) DEPS enforce glog phi)
...@@ -6,7 +6,6 @@ set(INTERPRETER_DEPS ...@@ -6,7 +6,6 @@ set(INTERPRETER_DEPS
device_context device_context
global_utils global_utils
op_registry op_registry
phi_tensor_utils
scope scope
framework_proto framework_proto
data_feed_proto data_feed_proto
...@@ -31,7 +30,7 @@ set(INTERPRETER_DEPS ...@@ -31,7 +30,7 @@ set(INTERPRETER_DEPS
enforce enforce
scope scope
glog glog
comm_context_manager phi
${DEVICE_EVENT_LIBS} ${DEVICE_EVENT_LIBS}
glog) glog)
......
...@@ -5,7 +5,7 @@ cc_library( ...@@ -5,7 +5,7 @@ cc_library(
cc_library( cc_library(
workqueue workqueue
SRCS workqueue.cc SRCS workqueue.cc
DEPS workqueue_utils enforce glog phi_os_info) DEPS workqueue_utils enforce glog phi)
cc_test( cc_test(
workqueue_test workqueue_test
SRCS workqueue_test.cc SRCS workqueue_test.cc
......
...@@ -5,7 +5,7 @@ pass_library( ...@@ -5,7 +5,7 @@ pass_library(
cinn_subgraph_detector cinn_subgraph_detector
subgraph_detector subgraph_detector
cinn_compiler cinn_compiler
errors phi
enforce) enforce)
pass_library(cinn_zero_tensor_trick_pass base) pass_library(cinn_zero_tensor_trick_pass base)
...@@ -17,7 +17,7 @@ cc_library( ...@@ -17,7 +17,7 @@ cc_library(
cc_library( cc_library(
transform_type transform_type
SRCS transform_type.cc SRCS transform_type.cc
DEPS errors enforce cinn) DEPS phi enforce cinn)
cc_library( cc_library(
cinn_cache_key cinn_cache_key
SRCS cinn_cache_key.cc SRCS cinn_cache_key.cc
......
...@@ -16,6 +16,7 @@ limitations under the License. */ ...@@ -16,6 +16,7 @@ limitations under the License. */
#include <unordered_map> #include <unordered_map>
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/extended_tensor.h" #include "paddle/phi/core/extended_tensor.h"
#include "paddle/utils/any.h" #include "paddle/utils/any.h"
...@@ -52,7 +53,7 @@ class RawTensor : public phi::ExtendedTensor, ...@@ -52,7 +53,7 @@ class RawTensor : public phi::ExtendedTensor,
T& Get() const { T& Get() const {
PADDLE_ENFORCE_EQ(data_.empty(), PADDLE_ENFORCE_EQ(data_.empty(),
false, false,
platform::errors::PreconditionNotMet( phi::errors::PreconditionNotMet(
"The data in RawTensor is empty. Please set data " "The data in RawTensor is empty. Please set data "
"before using it.")); "before using it."));
......
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/eager/eager_tensor.h"
#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/raw_tensor.h"
#include "paddle/fluid/framework/string_array.h"
#include "paddle/fluid/prim/utils/static/desc_tensor.h"
namespace phi {

// Registers runtime type information for paddle-side tensor types that
// derive from phi::TensorBase. Each explicit specialization initializes the
// static kType member of TypeInfoTraits with a TypeInfo handle obtained from
// RegisterStaticType, keyed by the type's name() string. This lets phi code
// (e.g. TensorBase::IsType<T>() dispatch) recognize these fluid/eager types
// without phi depending on their definitions at compile time.
//
// NOTE(review): these definitions live in a fluid-side .cc (not in the phi
// library) — presumably so the phi shared lib does not link against fluid;
// confirm against the build setup if moving this file.

// paddle::framework::RawTensor
template <>
const TypeInfo<phi::TensorBase>
TypeInfoTraits<phi::TensorBase, paddle::framework::RawTensor>::kType =
RegisterStaticType<phi::TensorBase>(
paddle::framework::RawTensor::name());

// paddle::framework::Vocab
template <>
const TypeInfo<phi::TensorBase>
TypeInfoTraits<phi::TensorBase, paddle::framework::Vocab>::kType =
RegisterStaticType<phi::TensorBase>(paddle::framework::Vocab::name());

// paddle::framework::Strings
template <>
const TypeInfo<phi::TensorBase>
TypeInfoTraits<phi::TensorBase, paddle::framework::Strings>::kType =
RegisterStaticType<phi::TensorBase>(paddle::framework::Strings::name());

// paddle::framework::FeedList
template <>
const TypeInfo<phi::TensorBase>
TypeInfoTraits<phi::TensorBase, paddle::framework::FeedList>::kType =
RegisterStaticType<phi::TensorBase>(
paddle::framework::FeedList::name());

// egr::VariableCompatTensor (eager-mode compatibility tensor)
template <>
const TypeInfo<phi::TensorBase>
TypeInfoTraits<phi::TensorBase, egr::VariableCompatTensor>::kType =
RegisterStaticType<phi::TensorBase>(egr::VariableCompatTensor::name());

// paddle::prim::DescTensor (static-graph prim tensor wrapper)
template <>
const TypeInfo<phi::TensorBase>
TypeInfoTraits<phi::TensorBase, paddle::prim::DescTensor>::kType =
RegisterStaticType<phi::TensorBase>(paddle::prim::DescTensor::name());
}  // namespace phi
cc_library( cc_library(
imperative_flag imperative_flag
SRCS flags.cc SRCS flags.cc
DEPS gflags flags) DEPS phi)
cc_library( cc_library(
var_helper var_helper
SRCS var_helper.cc SRCS var_helper.cc
DEPS tensor selected_rows extended_tensor) DEPS tensor phi)
if(WITH_XPU) if(WITH_XPU)
cc_library( cc_library(
prepared_operator prepared_operator
...@@ -20,8 +20,7 @@ if(WITH_XPU) ...@@ -20,8 +20,7 @@ if(WITH_XPU)
op_kernel_type op_kernel_type
data_transform data_transform
nan_inf_utils nan_inf_utils
scalar phi
int_array
var_helper var_helper
profiler profiler
place) place)
...@@ -38,8 +37,7 @@ else() ...@@ -38,8 +37,7 @@ else()
op_kernel_type op_kernel_type
data_transform data_transform
nan_inf_utils nan_inf_utils
scalar phi
int_array
var_helper var_helper
profiler profiler
place) place)
...@@ -47,14 +45,14 @@ endif() ...@@ -47,14 +45,14 @@ endif()
cc_library( cc_library(
layer layer
SRCS layer.cc SRCS layer.cc
DEPS prepared_operator math_function imperative_flag variable_helper DEPS prepared_operator phi imperative_flag variable_helper op_registry
op_registry var_helper) var_helper)
add_subdirectory(jit) add_subdirectory(jit)
if(WITH_GPU) if(WITH_GPU)
cc_library( cc_library(
layout_autotune layout_autotune
SRCS layout_autotune.cc SRCS layout_autotune.cc
DEPS op_info phi_backends) DEPS op_info phi)
else() else()
cc_library( cc_library(
layout_autotune layout_autotune
...@@ -80,15 +78,15 @@ cc_library( ...@@ -80,15 +78,15 @@ cc_library(
cc_library( cc_library(
basic_engine basic_engine
SRCS basic_engine.cc SRCS basic_engine.cc
DEPS layer gradient_accumulator switch_autotune) DEPS layer gradient_accumulator phi)
cc_library( cc_library(
engine engine
SRCS basic_engine.cc partial_grad_engine.cc SRCS basic_engine.cc partial_grad_engine.cc
DEPS layer gradient_accumulator switch_autotune) DEPS layer gradient_accumulator phi)
cc_library( cc_library(
imperative_profiler imperative_profiler
SRCS profiler.cc SRCS profiler.cc
DEPS flags) DEPS phi)
if(NOT WIN32) if(NOT WIN32)
if(WITH_NCCL OR WITH_RCCL) if(WITH_NCCL OR WITH_RCCL)
cc_library( cc_library(
...@@ -174,12 +172,4 @@ endif() ...@@ -174,12 +172,4 @@ endif()
cc_library( cc_library(
gradient_accumulator gradient_accumulator
SRCS gradient_accumulator.cc SRCS gradient_accumulator.cc
DEPS blas DEPS operator lod_tensor selected_rows_utils var_type_traits layer phi)
operator
lod_tensor
selected_rows_utils
selected_rows_functor
var_type_traits
layer
math_function
phi_tensor)
...@@ -32,14 +32,8 @@ endif() ...@@ -32,14 +32,8 @@ endif()
# fluid_modules exclude API-interface of inference/api and inference/capi_exp # fluid_modules exclude API-interface of inference/api and inference/capi_exp
get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
get_property(phi_modules GLOBAL PROPERTY PHI_MODULES)
get_property(phi_kernels GLOBAL PROPERTY PHI_KERNELS)
set(utils_modules pretty_log string_helper benchmark) set(utils_modules pretty_log string_helper benchmark)
if(WITH_CUSTOM_DEVICE)
set(fluid_modules ${fluid_modules} phi_capi)
endif()
add_subdirectory(api) add_subdirectory(api)
# Create static inference library if needed # Create static inference library if needed
...@@ -51,7 +45,6 @@ set(STATIC_INFERENCE_API ...@@ -51,7 +45,6 @@ set(STATIC_INFERENCE_API
reset_tensor_array reset_tensor_array
analysis_config analysis_config
paddle_pass_builder paddle_pass_builder
phi
${mkldnn_quantizer_cfg}) ${mkldnn_quantizer_cfg})
set(OP_LIST set(OP_LIST
...@@ -64,16 +57,14 @@ set(KERNEL_LIST ...@@ -64,16 +57,14 @@ set(KERNEL_LIST
#windows GPU static library over the limit, so not create_static_lib, and cc_library is dummy #windows GPU static library over the limit, so not create_static_lib, and cc_library is dummy
if(WIN32 AND WITH_GPU) if(WIN32 AND WITH_GPU)
cc_library(paddle_inference DEPS ${fluid_modules} phi ${STATIC_INFERENCE_API} cc_library(paddle_inference DEPS ${fluid_modules} ${STATIC_INFERENCE_API}
${utils_modules}) ${utils_modules})
else() else()
# message("${fluid_modules}") # message("${fluid_modules}")
# message("PHI_MODULES ${phi_modules}")
# message("${phi_kernels}")
# message("${STATIC_INFERENCE_API}") # message("${STATIC_INFERENCE_API}")
# message("${utils_modules}") # message("${utils_modules}")
create_static_lib(paddle_inference ${fluid_modules} ${phi_modules} create_static_lib(paddle_inference ${fluid_modules} ${STATIC_INFERENCE_API}
${phi_kernels} ${STATIC_INFERENCE_API} ${utils_modules}) ${utils_modules})
endif() endif()
if(NOT APPLE) if(NOT APPLE)
...@@ -103,7 +94,7 @@ set(SHARED_INFERENCE_SRCS ...@@ -103,7 +94,7 @@ set(SHARED_INFERENCE_SRCS
# shared inference library deps # shared inference library deps
list(REMOVE_ITEM fluid_modules standalone_executor list(REMOVE_ITEM fluid_modules standalone_executor
interpretercore_garbage_collector) interpretercore_garbage_collector)
set(SHARED_INFERENCE_DEPS ${fluid_modules} phi analysis_predictor set(SHARED_INFERENCE_DEPS phi ${fluid_modules} analysis_predictor
${utils_modules}) ${utils_modules})
if(WITH_CRYPTO) if(WITH_CRYPTO)
...@@ -124,12 +115,6 @@ if(WITH_ONNXRUNTIME) ...@@ -124,12 +115,6 @@ if(WITH_ONNXRUNTIME)
${CMAKE_CURRENT_SOURCE_DIR}/api/onnxruntime_predictor.cc) ${CMAKE_CURRENT_SOURCE_DIR}/api/onnxruntime_predictor.cc)
endif() endif()
#export all symbols for paddle/phi/api/include/api.h on paddle_inference_shared, only for UNIX
if(UNIX)
set(SHARED_INFERENCE_DEPS ${SHARED_INFERENCE_DEPS}
$<TARGET_OBJECTS:phi_function_api>)
endif()
# Create shared inference library # Create shared inference library
cc_library( cc_library(
paddle_inference_shared SHARED paddle_inference_shared SHARED
...@@ -141,12 +126,15 @@ target_link_libraries(paddle_inference_shared ${os_dependency_modules}) ...@@ -141,12 +126,15 @@ target_link_libraries(paddle_inference_shared ${os_dependency_modules})
if(WIN32) if(WIN32)
set_property(TARGET paddle_inference_shared set_property(TARGET paddle_inference_shared
PROPERTY WINDOWS_EXPORT_ALL_SYMBOLS ON) PROPERTY WINDOWS_EXPORT_ALL_SYMBOLS ON)
target_link_libraries(paddle_inference_shared gflags) target_link_libraries(paddle_inference_shared phi)
endif() endif()
set_target_properties(paddle_inference_shared PROPERTIES OUTPUT_NAME set_target_properties(paddle_inference_shared PROPERTIES OUTPUT_NAME
paddle_inference) paddle_inference)
if(NOT APPLE AND NOT WIN32) if(NOT APPLE
AND NOT WIN32
AND NOT WITH_TESTING
AND NOT WITH_INFERENCE_API_TEST)
# TODO(liuyiqun): Temporarily disable the link flag because it is not support on Mac. # TODO(liuyiqun): Temporarily disable the link flag because it is not support on Mac.
set(LINK_FLAGS set(LINK_FLAGS
"-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/paddle_inference.map") "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/paddle_inference.map")
......
...@@ -41,7 +41,7 @@ if(WITH_CRYPTO) ...@@ -41,7 +41,7 @@ if(WITH_CRYPTO)
list(APPEND paddle_inference_api_deps paddle_crypto) list(APPEND paddle_inference_api_deps paddle_crypto)
endif() endif()
if(WITH_CUSTOM_DEVICE) if(WITH_CUSTOM_DEVICE)
set(paddle_inference_api_deps ${paddle_inference_api_deps} phi_capi) set(paddle_inference_api_deps ${paddle_inference_api_deps} phi)
endif() endif()
cc_library( cc_library(
...@@ -50,7 +50,7 @@ cc_library( ...@@ -50,7 +50,7 @@ cc_library(
DEPS ${paddle_inference_api_deps}) DEPS ${paddle_inference_api_deps})
if(WIN32) if(WIN32)
target_link_libraries(paddle_inference_api gflags) target_link_libraries(paddle_inference_api phi)
endif() endif()
set(inference_deps ${analysis_deps} paddle_inference_api analysis set(inference_deps ${analysis_deps} paddle_inference_api analysis
......
...@@ -72,7 +72,7 @@ ...@@ -72,7 +72,7 @@
#endif #endif
#ifdef PADDLE_WITH_MKLML #ifdef PADDLE_WITH_MKLML
#include "paddle/fluid/platform/dynload/mklml.h" #include "paddle/phi/backends/dynload/mklml.h"
#endif #endif
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
...@@ -1121,7 +1121,7 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs, ...@@ -1121,7 +1121,7 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
// Frees unused memory allocated by the Intel® MKL Memory Allocator to // Frees unused memory allocated by the Intel® MKL Memory Allocator to
// avoid memory leak. See: // avoid memory leak. See:
// https://software.intel.com/en-us/mkl-developer-reference-c-mkl-free-buffers // https://software.intel.com/en-us/mkl-developer-reference-c-mkl-free-buffers
platform::dynload::MKL_Free_Buffers(); phi::dynload::MKL_Free_Buffers();
#endif #endif
return true; return true;
} }
...@@ -1185,7 +1185,7 @@ bool AnalysisPredictor::Run(const std::vector<paddle::Tensor> &inputs, ...@@ -1185,7 +1185,7 @@ bool AnalysisPredictor::Run(const std::vector<paddle::Tensor> &inputs,
// Frees unused memory allocated by the Intel® MKL Memory Allocator to // Frees unused memory allocated by the Intel® MKL Memory Allocator to
// avoid memory leak. See: // avoid memory leak. See:
// https://software.intel.com/en-us/mkl-developer-reference-c-mkl-free-buffers // https://software.intel.com/en-us/mkl-developer-reference-c-mkl-free-buffers
platform::dynload::MKL_Free_Buffers(); phi::dynload::MKL_Free_Buffers();
#endif #endif
return true; return true;
} }
...@@ -2100,7 +2100,7 @@ bool AnalysisPredictor::ZeroCopyRun() { ...@@ -2100,7 +2100,7 @@ bool AnalysisPredictor::ZeroCopyRun() {
// Frees unused memory allocated by the Intel® MKL Memory Allocator to // Frees unused memory allocated by the Intel® MKL Memory Allocator to
// avoid memory leak. See: // avoid memory leak. See:
// https://software.intel.com/en-us/mkl-developer-reference-c-mkl-free-buffers // https://software.intel.com/en-us/mkl-developer-reference-c-mkl-free-buffers
platform::dynload::MKL_Free_Buffers(); phi::dynload::MKL_Free_Buffers();
#endif #endif
return true; return true;
} }
......
...@@ -199,7 +199,7 @@ if(NOT WIN32) ...@@ -199,7 +199,7 @@ if(NOT WIN32)
${MATH_LIB} ${MATH_LIB}
${MKLDNN_LIB} ${MKLDNN_LIB}
glog glog
gflags phi
protobuf protobuf
xxhash xxhash
cryptopp cryptopp
......
...@@ -29,6 +29,7 @@ WIN_DETECT=$(echo `uname` | grep "Win") # detect current platform ...@@ -29,6 +29,7 @@ WIN_DETECT=$(echo `uname` | grep "Win") # detect current platform
cd `dirname $0` cd `dirname $0`
current_dir=`pwd` current_dir=`pwd`
if [ $2 == ON ]; then if [ $2 == ON ]; then
# You can export yourself if move the install path # You can export yourself if move the install path
MKL_LIB=${inference_install_dir}/third_party/install/mklml/lib MKL_LIB=${inference_install_dir}/third_party/install/mklml/lib
......
...@@ -25,7 +25,7 @@ if(WITH_ONNXRUNTIME) ...@@ -25,7 +25,7 @@ if(WITH_ONNXRUNTIME)
cc_library( cc_library(
zero_copy_tensor_dummy zero_copy_tensor_dummy
SRCS zero_copy_tensor_dummy.cc SRCS zero_copy_tensor_dummy.cc
DEPS onnxruntime phi_enforce) DEPS onnxruntime phi)
else() else()
cc_library( cc_library(
zero_copy_tensor zero_copy_tensor
...@@ -34,7 +34,7 @@ else() ...@@ -34,7 +34,7 @@ else()
cc_library( cc_library(
zero_copy_tensor_dummy zero_copy_tensor_dummy
SRCS zero_copy_tensor_dummy.cc SRCS zero_copy_tensor_dummy.cc
DEPS phi_enforce) DEPS phi)
endif() endif()
cc_test( cc_test(
......
...@@ -39,7 +39,7 @@ if(APPLE) ...@@ -39,7 +39,7 @@ if(APPLE)
utf8proc utf8proc
cryptopp cryptopp
protobuf protobuf
gflags phi
cblas) cblas)
endif() endif()
......
...@@ -23,7 +23,7 @@ fi ...@@ -23,7 +23,7 @@ fi
# 2. set LD_LIBRARY_PATH # 2. set LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$PWD/paddle_inference_c/third_party/install/mklml/lib/:$PWD/paddle_inference_c/third_party/install/mkldnn/lib/:$PWD/paddle_inference_c/paddle/lib/ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$PWD/paddle_inference_c/third_party/install/mklml/lib/:$PWD/paddle_inference_c/third_party/install/mkldnn/lib/:$PWD/paddle_inference_c/paddle/lib/
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$PWD/paddle_inference_c/third_party/install/onnxruntime/lib/:$PWD/paddle_inference_c/third_party/install/paddle2onnx/lib/ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$PWD/paddle_inference_c/third_party/install/onnxruntime/lib/:$PWD/paddle_inference_c/third_party/install/paddle2onnx/lib/
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${PADDLE_ROOT}/build/paddle/phi/
# 3. go test # 3. go test
go clean -testcache go clean -testcache
go test -v ./... go test -v ./...
...@@ -141,8 +141,7 @@ nv_test( ...@@ -141,8 +141,7 @@ nv_test(
nv_test( nv_test(
test_custom_plugin_creater test_custom_plugin_creater
SRCS test_custom_plugin_creater.cc SRCS test_custom_plugin_creater.cc
DEPS paddle_framework tensorrt_converter op_meta_info custom_operator DEPS paddle_framework tensorrt_converter phi custom_operator init_phi)
init_phi)
if(WITH_ONNXRUNTIME AND WIN32) if(WITH_ONNXRUNTIME AND WIN32)
# Copy onnxruntime for some c++ test in Windows, since the test will # Copy onnxruntime for some c++ test in Windows, since the test will
......
include(ExternalProject) include(ExternalProject)
set(ALLOCATOR_DEPS place stats profiler phi_backends device_context) set(ALLOCATOR_DEPS place stats profiler phi device_context)
set(ALLOCATOR_SRCS set(ALLOCATOR_SRCS
allocator.cc allocator.cc
cpu_allocator.cc cpu_allocator.cc
...@@ -32,7 +32,7 @@ if(WITH_GPU OR WITH_ROCM) ...@@ -32,7 +32,7 @@ if(WITH_GPU OR WITH_ROCM)
endif() endif()
if(WITH_GPU) if(WITH_GPU)
list(APPEND ALLOCATOR_DEPS phi_backends) list(APPEND ALLOCATOR_DEPS phi)
endif() endif()
if(CUDA_VERSION VERSION_GREATER_EQUAL 10.2) if(CUDA_VERSION VERSION_GREATER_EQUAL 10.2)
......
...@@ -124,7 +124,7 @@ class CUDAGraphAllocator ...@@ -124,7 +124,7 @@ class CUDAGraphAllocator
: underlying_allocator_(allocator) {} : underlying_allocator_(allocator) {}
public: public:
~CUDAGraphAllocator() { VLOG(10) << "CUDAGraphAllocator destructed"; } ~CUDAGraphAllocator() {}
static std::shared_ptr<Allocator> Create( static std::shared_ptr<Allocator> Create(
const std::shared_ptr<Allocator>& allocator) { const std::shared_ptr<Allocator>& allocator) {
...@@ -1137,7 +1137,6 @@ void AllocatorFacade::RemoveMemoryPoolOfCUDAGraph(int64_t id) { ...@@ -1137,7 +1137,6 @@ void AllocatorFacade::RemoveMemoryPoolOfCUDAGraph(int64_t id) {
if (ref_cnt == 0) { if (ref_cnt == 0) {
cuda_graph_map_.erase(id); cuda_graph_map_.erase(id);
cuda_graph_ref_cnt_.erase(ref_cnt_iter); cuda_graph_ref_cnt_.erase(ref_cnt_iter);
VLOG(10) << "Remove memory pool of CUDA Graph with memory ID " << id;
} else { } else {
VLOG(10) << "Decrease memory pool ID " << id << " reference count to be " VLOG(10) << "Decrease memory pool ID " << id << " reference count to be "
<< ref_cnt; << ref_cnt;
......
...@@ -90,7 +90,7 @@ if(WITH_UNITY_BUILD) ...@@ -90,7 +90,7 @@ if(WITH_UNITY_BUILD)
include(unity_build_rule.cmake) include(unity_build_rule.cmake)
endif() endif()
set(OP_HEADER_DEPS ${OP_HEADER_DEPS} phi phi_utils backward_infermeta sparse_backward_infermeta static_prim_api get_expected_kernel_func) set(OP_HEADER_DEPS ${OP_HEADER_DEPS} phi phi_utils static_prim_api get_expected_kernel_func)
register_operators(EXCLUDES py_func_op dgc_op generated_op1 generated_op2 generated_op3 generated_op4 load_combine_op lstm_op run_program_op quantize_linear_op register_operators(EXCLUDES py_func_op dgc_op generated_op1 generated_op2 generated_op3 generated_op4 load_combine_op lstm_op run_program_op quantize_linear_op
recurrent_op save_combine_op sparse_attention_op sync_batch_norm_op activation_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS}) recurrent_op save_combine_op sparse_attention_op sync_batch_norm_op activation_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS})
...@@ -125,7 +125,7 @@ if (WITH_GPU OR WITH_ROCM) ...@@ -125,7 +125,7 @@ if (WITH_GPU OR WITH_ROCM)
endif() endif()
endif() endif()
op_library(lstm_op DEPS ${OP_HEADER_DEPS} lstm_compute) op_library(lstm_op DEPS ${OP_HEADER_DEPS})
op_library(recurrent_op DEPS ${OP_HEADER_DEPS}) op_library(recurrent_op DEPS ${OP_HEADER_DEPS})
set(COMMON_OP_DEPS ${OP_HEADER_DEPS}) set(COMMON_OP_DEPS ${OP_HEADER_DEPS})
...@@ -136,17 +136,16 @@ if (WITH_DGC) ...@@ -136,17 +136,16 @@ if (WITH_DGC)
endif() endif()
cc_library(common_infer_shape_functions SRCS common_infer_shape_functions.cc DEPS operator) cc_library(common_infer_shape_functions SRCS common_infer_shape_functions.cc DEPS operator)
cc_library(ops_extra_info SRCS ops_extra_info.cc DEPS attribute cudnn_workspace_helper) cc_library(ops_extra_info SRCS ops_extra_info.cc DEPS attribute phi)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows_utils lapack_function set(COMMON_OP_DEPS ${COMMON_OP_DEPS} phi)
lod_tensor maxouting unpooling pooling lod_rank_table context_project set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_utils
sequence_pooling executor generator static_prim_api) lod_tensor unpooling lod_rank_table context_project executor static_prim_api)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc static_prim_api static_utils static_global_utils prim_utils) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc static_prim_api static_utils static_global_utils prim_utils)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler sample_prob tree2col) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} cos_sim_functor memory concat_and_split sampler sample_prob tree2col)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search fc_functor matrix_inverse matrix_solve) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} beam_search)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} box_wrapper ps_gpu_wrapper) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} box_wrapper ps_gpu_wrapper)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} common_infer_shape_functions) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} common_infer_shape_functions)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} eigen_function)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} processgroup_comm_utils) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} processgroup_comm_utils)
if(WITH_NCCL OR WITH_RCCL) if(WITH_NCCL OR WITH_RCCL)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} process_group_nccl) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} process_group_nccl)
...@@ -189,7 +188,7 @@ endif() ...@@ -189,7 +188,7 @@ endif()
copy_if_different(${pybind_file} ${pybind_file_final}) copy_if_different(${pybind_file} ${pybind_file_final})
if (WITH_CUSTOM_DEVICE) if (WITH_CUSTOM_DEVICE)
cc_library(custom_device_common_op_registry SRCS custom_device_common_op_registry.cc DEPS operator phi_api) cc_library(custom_device_common_op_registry SRCS custom_device_common_op_registry.cc DEPS operator phi type_info)
endif() endif()
if(NOT "${OP_LIST}" STREQUAL "") if(NOT "${OP_LIST}" STREQUAL "")
......
...@@ -7,7 +7,7 @@ cc_library( ...@@ -7,7 +7,7 @@ cc_library(
cc_library( cc_library(
cinn_launch_context cinn_launch_context
SRCS cinn_launch_context.cc SRCS cinn_launch_context.cc
DEPS ddim DEPS phi
lod_tensor lod_tensor
scope scope
proto_desc proto_desc
......
...@@ -18,7 +18,7 @@ foreach(src ${OPS}) ...@@ -18,7 +18,7 @@ foreach(src ${OPS})
endforeach() endforeach()
if(WITH_GLOO) if(WITH_GLOO)
set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} gloo_wrapper comm_context_manager) set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} gloo_wrapper phi)
endif() endif()
register_operators( register_operators(
...@@ -31,8 +31,7 @@ register_operators( ...@@ -31,8 +31,7 @@ register_operators(
${COLLECTIVE_DEPS}) ${COLLECTIVE_DEPS})
if(WITH_NCCL OR WITH_RCCL) if(WITH_NCCL OR WITH_RCCL)
set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} nccl_common collective_helper set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} nccl_common collective_helper phi)
comm_context_manager nccl_comm_context)
op_library(c_gen_nccl_id_op DEPS ${COLLECTIVE_DEPS}) op_library(c_gen_nccl_id_op DEPS ${COLLECTIVE_DEPS})
op_library(gen_nccl_id_op DEPS ${COLLECTIVE_DEPS}) op_library(gen_nccl_id_op DEPS ${COLLECTIVE_DEPS})
endif() endif()
......
...@@ -51,8 +51,8 @@ detection_library(polygon_box_transform_op SRCS polygon_box_transform_op.cc ...@@ -51,8 +51,8 @@ detection_library(polygon_box_transform_op SRCS polygon_box_transform_op.cc
detection_library(rpn_target_assign_op SRCS rpn_target_assign_op.cc) detection_library(rpn_target_assign_op SRCS rpn_target_assign_op.cc)
detection_library(generate_proposal_labels_op SRCS detection_library(generate_proposal_labels_op SRCS
generate_proposal_labels_op.cc) generate_proposal_labels_op.cc)
detection_library(multiclass_nms_op SRCS multiclass_nms_op.cc DEPS gpc) detection_library(multiclass_nms_op SRCS multiclass_nms_op.cc DEPS phi)
detection_library(locality_aware_nms_op SRCS locality_aware_nms_op.cc DEPS gpc) detection_library(locality_aware_nms_op SRCS locality_aware_nms_op.cc DEPS phi)
detection_library(box_clip_op SRCS box_clip_op.cc box_clip_op.cu) detection_library(box_clip_op SRCS box_clip_op.cc box_clip_op.cu)
detection_library(box_decoder_and_assign_op SRCS box_decoder_and_assign_op.cc detection_library(box_decoder_and_assign_op SRCS box_decoder_and_assign_op.cc
box_decoder_and_assign_op.cu) box_decoder_and_assign_op.cu)
......
...@@ -289,7 +289,7 @@ file(APPEND ${op_utils_header} ...@@ -289,7 +289,7 @@ file(APPEND ${op_utils_header}
# Automatically generate the registration code of all arg map functions # Automatically generate the registration code of all arg map functions
# and compile the corresponding target to avoid frequent code conflicts # and compile the corresponding target to avoid frequent code conflicts
# when writing to same file # when writing to same file
register_op_utils(op_compat_infos DEPS op_utils) register_op_utils(op_compat_infos DEPS phi)
copy_if_different(${op_utils_header} ${op_utils_header_final}) copy_if_different(${op_utils_header} ${op_utils_header_final})
......
...@@ -17,11 +17,12 @@ limitations under the License. */ ...@@ -17,11 +17,12 @@ limitations under the License. */
#include <memory> #include <memory>
#include <string> #include <string>
#include "paddle/phi/core/flags.h"
#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/detail/gru_cpu_kernel.h" #include "paddle/phi/kernels/funcs/detail/gru_cpu_kernel.h"
#include "paddle/phi/kernels/funcs/detail/gru_kernel.h" #include "paddle/phi/kernels/funcs/detail/gru_kernel.h"
DECLARE_int32(paddle_num_threads); PHI_DECLARE_int32(paddle_num_threads);
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -6,21 +6,20 @@ if(WITH_XPU) ...@@ -6,21 +6,20 @@ if(WITH_XPU)
endif() endif()
# please add new math_library in alphabetical order # please add new math_library in alphabetical order
math_library(concat_and_split DEPS concat_and_split_functor) math_library(concat_and_split DEPS phi)
math_library(context_project DEPS im2col math_function) math_library(context_project DEPS phi)
math_library(cos_sim_functor) math_library(cos_sim_functor)
math_library(depthwise_conv) math_library(depthwise_conv)
math_library(sample_prob) math_library(sample_prob)
math_library(sampler DEPS generator) math_library(sampler DEPS phi)
# math_library(math_function DEPS blas dense_tensor tensor)
if(WITH_XPU) if(WITH_XPU)
math_library(beam_search DEPS math_function beam_search_xpu) math_library(beam_search DEPS phi beam_search_xpu)
else() else()
math_library(beam_search DEPS math_function) math_library(beam_search DEPS phi)
endif() endif()
math_library(unpooling) math_library(unpooling)
math_library(prelu) math_library(prelu)
math_library(bert_encoder_functor) math_library(bert_encoder_functor)
math_library(tree2col DEPS math_function) math_library(tree2col DEPS phi)
...@@ -20,7 +20,7 @@ if(WITH_ARM_BRPC) ...@@ -20,7 +20,7 @@ if(WITH_ARM_BRPC)
framework_proto framework_proto
sendrecv_rpc sendrecv_rpc
arm_brpc arm_brpc
gflags phi
glog glog
snappy snappy
device_context) device_context)
...@@ -42,7 +42,7 @@ else() ...@@ -42,7 +42,7 @@ else()
ssl ssl
crypto crypto
protobuf protobuf
gflags phi
glog glog
zlib zlib
snappy snappy
......
...@@ -6,5 +6,5 @@ endif() ...@@ -6,5 +6,5 @@ endif()
register_operators() register_operators()
if(WITH_UNITY_BUILD) if(WITH_UNITY_BUILD)
target_link_libraries(paddle_operators_sequence_ops_unity sequence_pooling) target_link_libraries(paddle_operators_sequence_ops_unity phi)
endif() endif()
...@@ -17,7 +17,7 @@ limitations under the License. */ ...@@ -17,7 +17,7 @@ limitations under the License. */
#include <memory> #include <memory>
#include <vector> #include <vector>
#include "paddle/fluid/platform/dynload/mklml.h" #include "paddle/phi/backends/dynload/mklml.h"
#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/math_function.h"
......
...@@ -6,9 +6,9 @@ cc_library( ...@@ -6,9 +6,9 @@ cc_library(
cc_test( cc_test(
errors_test errors_test
SRCS errors_test.cc SRCS errors_test.cc
DEPS errors enforce) DEPS phi enforce)
set(enforce_deps flags errors flags phi_enforce) set(enforce_deps phi)
if(WITH_GPU) if(WITH_GPU)
set(enforce_deps ${enforce_deps} external_error_proto) set(enforce_deps ${enforce_deps} external_error_proto)
endif() endif()
...@@ -26,20 +26,20 @@ cc_test( ...@@ -26,20 +26,20 @@ cc_test(
cc_test( cc_test(
cpu_info_test cpu_info_test
SRCS cpu_info_test.cc SRCS cpu_info_test.cc
DEPS phi_backends) DEPS phi)
cc_test( cc_test(
os_info_test os_info_test
SRCS os_info_test.cc SRCS os_info_test.cc
DEPS phi_os_info) DEPS phi)
cc_library( cc_library(
place place
SRCS place.cc SRCS place.cc
DEPS enforce phi_place) DEPS enforce phi)
cc_test( cc_test(
place_test place_test
SRCS place_test.cc SRCS place_test.cc
DEPS place glog gflags) DEPS place glog phi)
if(WITH_MKLDNN) if(WITH_MKLDNN)
set(MKLDNN_CTX_DEPS mkldnn) set(MKLDNN_CTX_DEPS mkldnn)
...@@ -104,7 +104,7 @@ endif() ...@@ -104,7 +104,7 @@ endif()
cc_library( cc_library(
init init
SRCS init.cc SRCS init.cc
DEPS device_context custom_kernel context_pool memcpy) DEPS device_context phi memcpy)
# memcpy depends on device_context, here add deps individually for # memcpy depends on device_context, here add deps individually for
# avoiding cycle dependencies # avoiding cycle dependencies
...@@ -117,7 +117,6 @@ cc_library( ...@@ -117,7 +117,6 @@ cc_library(
xxhash xxhash
${STREAM_CALLBACK_DEPS} ${STREAM_CALLBACK_DEPS}
place place
phi_place
eigen3 eigen3
cpu_helper cpu_helper
framework_proto framework_proto
...@@ -126,12 +125,8 @@ cc_library( ...@@ -126,12 +125,8 @@ cc_library(
${MKLDNN_CTX_DEPS} ${MKLDNN_CTX_DEPS}
${dgc_deps} ${dgc_deps}
dlpack dlpack
cudnn_workspace_helper phi
${XPU_CTX_DEPS} ${XPU_CTX_DEPS})
phi_backends
phi_device_context
generator
phi_enforce)
cc_library( cc_library(
collective_helper collective_helper
...@@ -189,12 +184,12 @@ if(WITH_GPU) ...@@ -189,12 +184,12 @@ if(WITH_GPU)
cuda_graph_with_memory_pool cuda_graph_with_memory_pool
SRCS cuda_graph_with_memory_pool.cc SRCS cuda_graph_with_memory_pool.cc
DEPS ${DEVICE_EVENT_LIBS} device_event_custom_device device_context DEPS ${DEVICE_EVENT_LIBS} device_event_custom_device device_context
allocator phi_backends) allocator phi)
else() else()
nv_library( nv_library(
cuda_graph_with_memory_pool cuda_graph_with_memory_pool
SRCS cuda_graph_with_memory_pool.cc SRCS cuda_graph_with_memory_pool.cc
DEPS ${DEVICE_EVENT_LIBS} device_context allocator phi_backends) DEPS ${DEVICE_EVENT_LIBS} device_context allocator phi)
endif() endif()
nv_test( nv_test(
device_context_test device_context_test
...@@ -245,7 +240,7 @@ cc_test( ...@@ -245,7 +240,7 @@ cc_test(
cc_library( cc_library(
lodtensor_printer lodtensor_printer
SRCS lodtensor_printer.cc SRCS lodtensor_printer.cc
DEPS ddim DEPS phi
place place
tensor tensor
scope scope
...@@ -263,41 +258,30 @@ if(WITH_GPU) ...@@ -263,41 +258,30 @@ if(WITH_GPU)
nv_library( nv_library(
profiler profiler
SRCS profiler.cc profiler.cu SRCS profiler.cc profiler.cu
DEPS phi_os_info DEPS phi
phi_device_tracer
gpu_info gpu_info
enforce enforce
dynload_cuda dynload_cuda
new_profiler new_profiler
stats stats
op_proto_maker op_proto_maker
shape_inference shape_inference)
phi_profiler)
elseif(WITH_ROCM) elseif(WITH_ROCM)
hip_library( hip_library(
profiler profiler
SRCS profiler.cc profiler.cu SRCS profiler.cc profiler.cu
DEPS phi_os_info DEPS phi
phi_device_tracer
gpu_info gpu_info
enforce enforce
new_profiler new_profiler
stats stats
op_proto_maker op_proto_maker
shape_inference shape_inference)
phi_profiler)
else() else()
cc_library( cc_library(
profiler profiler
SRCS profiler.cc SRCS profiler.cc
DEPS phi_os_info DEPS phi enforce new_profiler stats op_proto_maker shape_inference)
phi_device_tracer
enforce
new_profiler
stats
op_proto_maker
shape_inference
phi_profiler)
endif() endif()
cc_test( cc_test(
...@@ -333,7 +317,7 @@ if(WITH_GPU) ...@@ -333,7 +317,7 @@ if(WITH_GPU)
nv_test( nv_test(
test_limit_gpu_memory test_limit_gpu_memory
SRCS test_limit_gpu_memory.cu SRCS test_limit_gpu_memory.cu
DEPS gpu_info flags) DEPS gpu_info phi)
nv_library( nv_library(
cuda_device_guard cuda_device_guard
SRCS cuda_device_guard.cc SRCS cuda_device_guard.cc
...@@ -348,7 +332,7 @@ if(WITH_ROCM) ...@@ -348,7 +332,7 @@ if(WITH_ROCM)
hip_test( hip_test(
test_limit_gpu_memory test_limit_gpu_memory
SRCS test_limit_gpu_memory.cu SRCS test_limit_gpu_memory.cu
DEPS gpu_info flags) DEPS gpu_info phi)
hip_library( hip_library(
cuda_device_guard cuda_device_guard
SRCS cuda_device_guard.cc SRCS cuda_device_guard.cc
...@@ -360,7 +344,7 @@ if(NOT APPLE AND NOT WIN32) ...@@ -360,7 +344,7 @@ if(NOT APPLE AND NOT WIN32)
cc_test( cc_test(
device_code_test device_code_test
SRCS device_code_test.cc SRCS device_code_test.cc
DEPS phi_backends lod_tensor) DEPS phi lod_tensor)
endif() endif()
endif() endif()
...@@ -382,4 +366,4 @@ cc_library( ...@@ -382,4 +366,4 @@ cc_library(
cc_test( cc_test(
init_phi_test init_phi_test
SRCS init_phi_test.cc SRCS init_phi_test.cc
DEPS phi_tensor init_phi) DEPS phi init_phi)
...@@ -17,7 +17,7 @@ limitations under the License. */ ...@@ -17,7 +17,7 @@ limitations under the License. */
#ifdef PADDLE_WITH_MKLML #ifdef PADDLE_WITH_MKLML
#include <omp.h> #include <omp.h>
#include "paddle/fluid/platform/dynload/mklml.h" #include "paddle/phi/backends/dynload/mklml.h"
#endif #endif
#ifdef PADDLE_USE_OPENBLAS #ifdef PADDLE_USE_OPENBLAS
...@@ -40,7 +40,7 @@ void SetNumThreads(int num_threads) { ...@@ -40,7 +40,7 @@ void SetNumThreads(int num_threads) {
openblas_set_num_threads(real_num_threads); openblas_set_num_threads(real_num_threads);
#elif defined(PADDLE_WITH_MKLML) #elif defined(PADDLE_WITH_MKLML)
int real_num_threads = num_threads > 1 ? num_threads : 1; int real_num_threads = num_threads > 1 ? num_threads : 1;
platform::dynload::MKL_Set_Num_Threads(real_num_threads); phi::dynload::MKL_Set_Num_Threads(real_num_threads);
omp_set_num_threads(real_num_threads); omp_set_num_threads(real_num_threads);
#elif defined(PADDLE_USE_REFERENCE_CBLAS) #elif defined(PADDLE_USE_REFERENCE_CBLAS)
// cblas not support multi-thread // cblas not support multi-thread
......
...@@ -2,9 +2,9 @@ if(WITH_CUSTOM_DEVICE) ...@@ -2,9 +2,9 @@ if(WITH_CUSTOM_DEVICE)
cc_library( cc_library(
custom_device_resource_pool custom_device_resource_pool
SRCS custom_device_resource_pool.cc SRCS custom_device_resource_pool.cc
DEPS gflags glog enforce monitor) DEPS phi glog enforce monitor)
cc_test( cc_test(
custom_device_test custom_device_test
SRCS custom_device_test.cc SRCS custom_device_test.cc
DEPS phi_tensor_utils phi_backends phi_device_context gradient_accumulator) DEPS phi gradient_accumulator)
endif() endif()
...@@ -3,13 +3,7 @@ if(WITH_GPU) ...@@ -3,13 +3,7 @@ if(WITH_GPU)
nv_library( nv_library(
gpu_info gpu_info
SRCS gpu_info.cc SRCS gpu_info.cc
DEPS phi_backends DEPS phi glog enforce monitor dynload_cuda malloc)
gflags
glog
enforce
monitor
dynload_cuda
malloc)
nv_test(cuda_helper_test SRCS cuda_helper_test.cu) nv_test(cuda_helper_test SRCS cuda_helper_test.cu)
nv_test( nv_test(
...@@ -21,7 +15,7 @@ elseif(WITH_ROCM) ...@@ -21,7 +15,7 @@ elseif(WITH_ROCM)
hip_library( hip_library(
gpu_info gpu_info
SRCS gpu_info.cc SRCS gpu_info.cc
DEPS phi_backends gflags glog enforce monitor dynload_cuda) DEPS phi glog enforce monitor dynload_cuda)
hip_test(cuda_helper_test SRCS cuda_helper_test.cu) hip_test(cuda_helper_test SRCS cuda_helper_test.cu)
hip_test( hip_test(
......
...@@ -14,23 +14,11 @@ set(XPU_CTX_DEPS ...@@ -14,23 +14,11 @@ set(XPU_CTX_DEPS
cc_library( cc_library(
xpu_info xpu_info
SRCS xpu_info.cc SRCS xpu_info.cc
DEPS gflags DEPS glog enforce xpulib device_context place phi)
glog
enforce
xpulib
device_context
place
phi_backends)
cc_library( cc_library(
xpu_op_list xpu_op_list
SRCS xpu_op_list.cc SRCS xpu_op_list.cc
DEPS gflags DEPS glog enforce xpulib device_context op_kernel_type phi)
glog
enforce
xpulib
device_context
op_kernel_type
phi_backends)
cc_library( cc_library(
xpu_resource_pool xpu_resource_pool
SRCS xpu_resource_pool.cc SRCS xpu_resource_pool.cc
......
cc_library( cc_library(
dynamic_loader dynamic_loader
SRCS dynamic_loader.cc SRCS dynamic_loader.cc
DEPS glog gflags enforce phi_dynamic_loader) DEPS glog enforce phi)
list( list(
APPEND APPEND
...@@ -57,26 +57,20 @@ if(WITH_ROCM) ...@@ -57,26 +57,20 @@ if(WITH_ROCM)
hip_library( hip_library(
dynload_cuda dynload_cuda
SRCS ${HIP_SRCS} SRCS ${HIP_SRCS}
DEPS dynamic_loader phi_dynload_cuda) DEPS dynamic_loader phi)
cc_library( cc_library(
dynload_warpctc dynload_warpctc
SRCS warpctc.cc SRCS warpctc.cc
DEPS dynamic_loader warpctc phi_dynload_warpctc) DEPS dynamic_loader warpctc phi)
else() else()
nv_library( nv_library(
dynload_cuda dynload_cuda
SRCS ${CUDA_SRCS} SRCS ${CUDA_SRCS}
DEPS dynamic_loader phi_dynload_cuda) DEPS dynamic_loader phi)
cc_library( cc_library(
dynload_warpctc dynload_warpctc
SRCS warpctc.cc SRCS warpctc.cc
DEPS dynamic_loader warpctc phi_dynload_warpctc) DEPS dynamic_loader warpctc phi)
endif()
if(WITH_MKLML)
cc_library(
dynload_mklml
SRCS mklml.cc
DEPS dynamic_loader mklml phi_dynload_mklml)
endif() endif()
# TODO(TJ): add iomp, mkldnn? # TODO(TJ): add iomp, mkldnn?
...@@ -86,6 +80,6 @@ if(MKL_FOUND AND WITH_ONEMKL) ...@@ -86,6 +80,6 @@ if(MKL_FOUND AND WITH_ONEMKL)
cc_library( cc_library(
dynload_mklrt dynload_mklrt
SRCS mklrt.cc SRCS mklrt.cc
DEPS dynamic_loader phi_dynload_mklrt) DEPS dynamic_loader phi)
target_include_directories(dynload_mklrt PRIVATE ${MKL_INCLUDE}) target_include_directories(dynload_mklrt PRIVATE ${MKL_INCLUDE})
endif() endif()
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <mkl.h>
#include <mutex> // NOLINT
#include "paddle/phi/backends/dynload/mklml.h"
namespace paddle {
namespace platform {
namespace dynload {
/**
 * The following macro definition can generate structs
 * (for each function) to dynamic load mklml routine
 * via operator overloading.
 *
 * This header is a compatibility shim: the actual dynamic-load
 * machinery lives in phi (paddle/phi/backends/dynload/mklml.h).
 * Each wrapper below simply aliases phi::dynload::DynLoad__<name>
 * into paddle::platform::dynload and declares the corresponding
 * extern instance, so legacy fluid code keeps compiling unchanged.
 */
// Alias phi's dynload struct for `__name` and declare its extern instance.
#define DYNAMIC_LOAD_MKLML_WRAP(__name) \
using DynLoad__##__name = phi::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
// Indirection kept so the macro name matches the platform-layer convention.
#define PLATFORM_DECLARE_DYNAMIC_LOAD_MKLML_WRAP(__name) \
DYNAMIC_LOAD_MKLML_WRAP(__name)
// List of MKLML entry points re-exported here: BLAS level 1-3 routines
// (gemm/axpy/copy/gemv/trsm and packed/batch gemm variants), vector math
// (vsAdd ... vmdErf), and MKL threading/buffer controls.
#define MKLML_ROUTINE_EACH(__macro) \
__macro(cblas_sgemm); \
__macro(cblas_dgemm); \
__macro(cblas_cgemm); \
__macro(cblas_zgemm); \
__macro(cblas_saxpy); \
__macro(cblas_daxpy); \
__macro(cblas_caxpy); \
__macro(cblas_zaxpy); \
__macro(cblas_scopy); \
__macro(cblas_dcopy); \
__macro(cblas_ccopy); \
__macro(cblas_zcopy); \
__macro(cblas_sgemv); \
__macro(cblas_dgemv); \
__macro(cblas_cgemv); \
__macro(cblas_zgemv); \
__macro(cblas_strsm); \
__macro(cblas_dtrsm); \
__macro(cblas_ctrsm); \
__macro(cblas_ztrsm); \
__macro(cblas_sgemm_alloc); \
__macro(cblas_dgemm_alloc); \
__macro(cblas_sgemm_pack); \
__macro(cblas_dgemm_pack); \
__macro(cblas_sgemm_compute); \
__macro(cblas_dgemm_compute); \
__macro(cblas_sgemm_free); \
__macro(cblas_dgemm_free); \
__macro(cblas_sgemm_batch); \
__macro(cblas_dgemm_batch); \
__macro(cblas_cgemm_batch); \
__macro(cblas_zgemm_batch); \
__macro(cblas_sdot); \
__macro(cblas_ddot); \
__macro(cblas_sasum); \
__macro(cblas_dasum); \
__macro(cblas_isamax); \
__macro(cblas_idamax); \
__macro(cblas_sscal); \
__macro(cblas_dscal); \
__macro(vsAdd); \
__macro(vdAdd); \
__macro(vsSub); \
__macro(vdSub); \
__macro(vsMul); \
__macro(vdMul); \
__macro(vsDiv); \
__macro(vdDiv); \
__macro(vsExp); \
__macro(vdExp); \
__macro(vsSqr); \
__macro(vdSqr); \
__macro(vsPowx); \
__macro(vdPowx); \
__macro(vsInv); \
__macro(vdInv); \
__macro(vmsErf); \
__macro(vmdErf); \
__macro(MKL_Free_Buffers); \
__macro(MKL_Set_Num_Threads); \
__macro(MKL_Get_Max_Threads);
MKLML_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_MKLML_WRAP);
// Sparse CSR matrix-matrix products are not available in the Windows MKLML
// distribution, so they are only wrapped on non-Windows platforms.
#if !defined(_WIN32)
DYNAMIC_LOAD_MKLML_WRAP(mkl_scsrmm);
DYNAMIC_LOAD_MKLML_WRAP(mkl_dcsrmm);
#endif
#undef DYNAMIC_LOAD_MKLML_WRAP
} // namespace dynload
} // namespace platform
} // namespace paddle
...@@ -40,6 +40,22 @@ PADDLE_DEFINE_EXPORTED_bool(enable_rpc_profiler, ...@@ -40,6 +40,22 @@ PADDLE_DEFINE_EXPORTED_bool(enable_rpc_profiler,
DEFINE_bool(enable_record_memory, false, "enable memory recorder"); DEFINE_bool(enable_record_memory, false, "enable memory recorder");
#if defined(_WIN32) && defined(PHI_SHARED)
// Out-of-class definitions for phi::ProfilerHelper's static data members.
// NOTE(review): these are provided here only for Windows builds where phi is
// a shared library (PHI_SHARED) — presumably because the static /
// thread_local members cannot be reliably exported across the phi DLL
// boundary and must be instantiated in this module instead; confirm against
// phi's profiler headers.
phi::ProfilerState phi::ProfilerHelper::g_state = phi::ProfilerState::kDisabled;
bool phi::ProfilerHelper::g_enable_nvprof_hook = false;
// g_thread_id is thread_local; g_next_thread_id is the shared counter.
thread_local uint64_t phi::ProfilerHelper::g_thread_id;
uint32_t phi::ProfilerHelper::g_next_thread_id = 0;
// Mutex plus the global list of per-thread Event lists.
std::mutex phi::ProfilerHelper::g_all_event_lists_mutex;
std::list<std::shared_ptr<phi::EventList<phi::Event>>>
phi::ProfilerHelper::g_all_event_lists;
thread_local std::shared_ptr<phi::EventList<phi::Event>>
phi::ProfilerHelper::g_event_list;
// Mutex plus the global list of per-thread MemEvent lists.
std::list<std::shared_ptr<phi::EventList<phi::MemEvent>>>
phi::ProfilerHelper::g_all_mem_event_lists;
thread_local std::shared_ptr<phi::EventList<phi::MemEvent>>
phi::ProfilerHelper::g_mem_event_list;
std::mutex phi::ProfilerHelper::g_all_mem_event_lists_mutex;
#endif
namespace paddle { namespace paddle {
namespace platform { namespace platform {
......
cc_library( cc_library(
host_tracer host_tracer
SRCS host_tracer.cc SRCS host_tracer.cc
DEPS framework_proto enforce ddim var_type_traits) DEPS framework_proto enforce phi var_type_traits)
cc_library( cc_library(
cuda_tracer cuda_tracer
SRCS cuda_tracer.cc cupti_data_process.cc SRCS cuda_tracer.cc cupti_data_process.cc
...@@ -28,7 +28,7 @@ cc_library( ...@@ -28,7 +28,7 @@ cc_library(
cc_library( cc_library(
cpu_utilization cpu_utilization
SRCS cpu_utilization.cc SRCS cpu_utilization.cc
DEPS phi_backends phi_os_info enforce glog) DEPS phi enforce glog)
cc_library( cc_library(
new_profiler new_profiler
SRCS profiler.cc SRCS profiler.cc
......
...@@ -28,7 +28,6 @@ set(PYBIND_DEPS ...@@ -28,7 +28,6 @@ set(PYBIND_DEPS
gloo_wrapper gloo_wrapper
infer_io_utils infer_io_utils
heter_wrapper heter_wrapper
generator
op_version_registry op_version_registry
ps_gpu_wrapper ps_gpu_wrapper
custom_operator custom_operator
...@@ -37,16 +36,13 @@ set(PYBIND_DEPS ...@@ -37,16 +36,13 @@ set(PYBIND_DEPS
fleet_executor fleet_executor
global_utils global_utils
phi_utils phi_utils
tcp_store phi
comm_context_manager
new_profiler new_profiler
auto_parallel
jit_layer jit_layer
jit_property jit_property
prim_utils prim_utils
operants_manager static_tensor_operants
phi_tensor_operants type_info)
static_tensor_operants)
if(WITH_PSCORE) if(WITH_PSCORE)
set(PYBIND_DEPS ${PYBIND_DEPS} ps_service) set(PYBIND_DEPS ${PYBIND_DEPS} ps_service)
...@@ -65,7 +61,7 @@ if(WITH_RPC) ...@@ -65,7 +61,7 @@ if(WITH_RPC)
zlib zlib
leveldb leveldb
snappy snappy
gflags phi
glog) glog)
endif() endif()
if(WITH_GPU OR WITH_ROCM) if(WITH_GPU OR WITH_ROCM)
...@@ -148,7 +144,6 @@ set(PYBIND_SRCS ...@@ -148,7 +144,6 @@ set(PYBIND_SRCS
auto_parallel_py.cc) auto_parallel_py.cc)
if(WITH_CUSTOM_DEVICE) if(WITH_CUSTOM_DEVICE)
set(PYBIND_DEPS ${PYBIND_DEPS} phi_capi)
set(PYBIND_DEPS ${PYBIND_DEPS} custom_device_common_op_registry) set(PYBIND_DEPS ${PYBIND_DEPS} custom_device_common_op_registry)
endif() endif()
...@@ -334,6 +329,14 @@ if(WITH_PYTHON) ...@@ -334,6 +329,14 @@ if(WITH_PYTHON)
")\n" ")\n"
"exit /b 0") "exit /b 0")
if(WITH_PHI_SHARED)
  # Copy the phi DLL next to the compiled custom-op artifacts so tests that
  # load them can resolve phi at runtime.
  add_custom_command(
    OUTPUT ${op_impl_path}/phi.dll
    COMMAND ${CMAKE_COMMAND} -E copy ${PHI_LIB} ${op_impl_path}
    DEPENDS phi)
  # Depend on the copied phi.dll so the custom command above is actually
  # triggered.  (Previously this appended libiomp5md.dll — a copy-paste from
  # the MKLML block below — which left the phi.dll copy rule dangling and
  # referenced an output that only exists when CBLAS_PROVIDER is MKLML.)
  list(APPEND EAGER_OP_IMPL_DEPS ${op_impl_path}/phi.dll)
endif()
if(${CBLAS_PROVIDER} STREQUAL MKLML) if(${CBLAS_PROVIDER} STREQUAL MKLML)
add_custom_command( add_custom_command(
OUTPUT ${op_impl_path}/libiomp5md.dll OUTPUT ${op_impl_path}/libiomp5md.dll
...@@ -481,10 +484,8 @@ if(WITH_PYTHON) ...@@ -481,10 +484,8 @@ if(WITH_PYTHON)
list(APPEND PYBIND_DEPS python) list(APPEND PYBIND_DEPS python)
list(APPEND PYBIND_DEPS custom_operator) list(APPEND PYBIND_DEPS custom_operator)
list(APPEND PYBIND_DEPS custom_operator_node) list(APPEND PYBIND_DEPS custom_operator_node)
list(APPEND PYBIND_DEPS tensor_api)
list(APPEND PYBIND_DEPS eager_tensor_operants) list(APPEND PYBIND_DEPS eager_tensor_operants)
list(APPEND PYBIND_DEPS pybind_util) list(APPEND PYBIND_DEPS pybind_util)
list(APPEND PYBIND_DEPS flags)
endif() endif()
# On Linux, cc_library(paddle SHARED ..) will generate the libpaddle.so, # On Linux, cc_library(paddle SHARED ..) will generate the libpaddle.so,
......
...@@ -38,7 +38,9 @@ limitations under the License. */ ...@@ -38,7 +38,9 @@ limitations under the License. */
#include "paddle/phi/common/data_type.h" #include "paddle/phi/common/data_type.h"
#include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/dense_tensor.h"
DECLARE_bool(check_nan_inf); #include "paddle/phi/core/flags.h"
PHI_DECLARE_bool(check_nan_inf);
namespace paddle { namespace paddle {
namespace pybind { namespace pybind {
......
...@@ -3,6 +3,15 @@ configure_file(config.h.in ${CMAKE_CURRENT_SOURCE_DIR}/config.h) ...@@ -3,6 +3,15 @@ configure_file(config.h.in ${CMAKE_CURRENT_SOURCE_DIR}/config.h)
# phi auto cmake utils # phi auto cmake utils
include(phi) include(phi)
# Accumulated source-file lists for the unified "phi" target.  They are
# INTERNAL cache variables and are force-reset to empty here so that every
# CMake configure starts from a clean slate (the per-directory phi CMake
# files presumably append into them — see ${common_srcs} etc. gathered into
# PHI_SRCS below; confirm in the subdirectories).
set(common_srcs CACHE INTERNAL "" FORCE)
set(api_srcs CACHE INTERNAL "" FORCE)
set(capi_srcs CACHE INTERNAL "" FORCE)
set(core_srcs CACHE INTERNAL "" FORCE)
set(backends_srcs CACHE INTERNAL "" FORCE)
set(kernels_srcs CACHE INTERNAL "" FORCE)
set(infermeta_srcs CACHE INTERNAL "" FORCE)
#set(excluded_srcs CACHE INTERNAL "" FORCE)
# paddle experimental common components # paddle experimental common components
add_subdirectory(common) add_subdirectory(common)
...@@ -24,29 +33,153 @@ if(WITH_CUSTOM_DEVICE) ...@@ -24,29 +33,153 @@ if(WITH_CUSTOM_DEVICE)
add_subdirectory(capi) add_subdirectory(capi)
endif() endif()
# make an unity target for compile deps
set(PHI_DEPS set(PHI_DEPS
convert_utils phi_profiler_proto
dense_tensor auto_parallel_proto
phi_backends gflags
kernel_factory glog
kernel_context warpctc
arg_map_context warprnnt
infermeta eigen3
lod_utils xxhash
sparse_csr_tensor cblas
sparse_coo_tensor utf8proc)
string_tensor
api_scalar if(WITH_GPU)
api_int_array list(APPEND PHI_DEPS external_error_proto)
extended_tensor endif()
dist_attr
dist_mapper) if(WITH_ASCEND_CL)
list(APPEND PHI_DEPS npu_hccl)
get_property(phi_kernels GLOBAL PROPERTY PHI_KERNELS) endif()
set(PHI_DEPS ${PHI_DEPS} ${phi_kernels})
if(WITH_FLASHATTN)
cc_library(phi DEPS ${PHI_DEPS}) list(APPEND PHI_DEPS flashattn)
endif()
# Optional third-party dependencies of the unified phi target, gated on the
# corresponding build switches.
if(WITH_XBYAK)
list(APPEND PHI_DEPS xbyak)
endif()
if(WITH_MKLDNN)
list(APPEND PHI_DEPS mkldnn)
endif()
if(WITH_GLOO)
list(APPEND PHI_DEPS gloo)
endif()
if(WITH_CUDNN_FRONTEND)
list(APPEND PHI_DEPS cudnn-frontend)
endif()
if(WITH_POCKETFFT)
list(APPEND PHI_DEPS pocketfft)
endif()
if(WITH_MKLML)
# NOTE(review): "pocketfft" here looks like a copy-paste from the
# WITH_POCKETFFT branch above; likely only dynload_mklml is intended —
# confirm before changing.
list(APPEND PHI_DEPS pocketfft dynload_mklml)
endif()
if(WITH_XPU)
list(APPEND PHI_DEPS xpulib)
endif()
# All phi sources, collected from the INTERNAL cache lists populated by the
# phi subdirectories.
set(PHI_SRCS
${common_srcs}
${api_srcs}
${core_srcs}
${backends_srcs}
${kernels_srcs}
${infermeta_srcs}
${capi_srcs})
# Build phi either as one shared library (WITH_PHI_SHARED) or as a static
# archive; the choice is cached so other CMake files can read it.
if(WITH_PHI_SHARED)
set(PHI_BUILD_TYPE
SHARED
CACHE INTERNAL "" FORCE)
else()
set(PHI_BUILD_TYPE
STATIC
CACHE INTERNAL "" FORCE)
endif()
# Create the phi target with the backend-appropriate library helper.
if(WITH_GPU)
add_definitions(-DCUDA_REAL_ARCHS=${NVCC_FLAGS_EXTRA_real_archs}
)# for backends/gpu/gpu_resources.cc
nv_library(
phi ${PHI_BUILD_TYPE}
SRCS ${PHI_SRCS}
DEPS ${PHI_DEPS})
elseif(WITH_ROCM)
# hip_add_library does not take a DEPS clause, so link deps separately.
hip_add_library(phi ${PHI_BUILD_TYPE} ${PHI_SRCS})
target_link_libraries(phi ${PHI_DEPS})
elseif(WITH_XPU_KP)
xpu_library(
phi ${PHI_BUILD_TYPE}
SRCS ${PHI_SRCS}
DEPS ${PHI_DEPS})
else()
cc_library(
phi ${PHI_BUILD_TYPE}
SRCS ${PHI_SRCS}
DEPS ${PHI_DEPS})
endif()
if(WIN32)
target_link_libraries(phi shlwapi.lib)
endif()
# Record the platform-specific file name of the built phi library in the
# cache (consumed e.g. by the pybind CMake to copy phi.dll on Windows).
if(WIN32)
if(WITH_PHI_SHARED)
# Exported-symbol generation for the DLL relies on WINDOWS_EXPORT_ALL_SYMBOLS.
set_property(TARGET phi PROPERTY WINDOWS_EXPORT_ALL_SYMBOLS ON)
set(PHI_NAME
phi.dll
CACHE INTERNAL "" FORCE)
else()
set(PHI_NAME
phi.lib
CACHE INTERNAL "" FORCE)
endif()
elseif(APPLE)
if(WITH_PHI_SHARED)
set(PHI_NAME
libphi.dylib
CACHE INTERNAL "" FORCE)
else()
set(PHI_NAME
libphi.a
CACHE INTERNAL "" FORCE)
endif()
else()
if(WITH_PHI_SHARED)
set(PHI_NAME
libphi.so
CACHE INTERNAL "" FORCE)
else()
set(PHI_NAME
libphi.a
CACHE INTERNAL "" FORCE)
endif()
endif()
# Absolute path to the built phi library, cached for other CMake files.
set(PHI_LIB
"${CMAKE_CURRENT_BINARY_DIR}/${PHI_NAME}"
CACHE FILEPATH "PHI Library" FORCE)
if(MKL_FOUND AND WITH_ONEMKL)
target_include_directories(phi PRIVATE ${MKL_INCLUDE})
endif()
# Extra build-order dependencies that are not link dependencies.
add_dependencies(phi extern_lapack)
if(WITH_CUTLASS)
add_dependencies(phi cutlass_codegen)
add_definitions("-DPADDLE_WITH_MEMORY_EFFICIENT_ATTENTION"
)# for memory_efficient_attention.h
endif()
if(WITH_FLASHATTN)
add_dependencies(phi flashattn)
endif()
set(phi_extension_header_file set(phi_extension_header_file
${CMAKE_CURRENT_SOURCE_DIR}/extension.h ${CMAKE_CURRENT_SOURCE_DIR}/extension.h
......
add_subdirectory(profiler) add_subdirectory(profiler)
add_subdirectory(lib) add_subdirectory(lib)
cc_library(
phi_api
SRCS all.cc
DEPS phi_function_api phi_bw_function_api sparse_api sparse_bw_api
strings_api)
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
// Anchor translation unit for the aggregated PHI C++ API header; it only
// pulls in all.h to make sure the header compiles, and intentionally
// defines nothing itself.
#include "paddle/phi/api/all.h"
namespace paddle {
namespace experimental {} // namespace experimental
} // namespace paddle
...@@ -112,9 +112,7 @@ class PADDLE_API CustomOpKernelContext { ...@@ -112,9 +112,7 @@ class PADDLE_API CustomOpKernelContext {
void EmplaceBackOutput(Tensor&& output); void EmplaceBackOutput(Tensor&& output);
void EmplaceBackOutputs(const std::vector<Tensor>& outputs); void EmplaceBackOutputs(const std::vector<Tensor>& outputs);
void EmplaceBackAttr(paddle::any attr); void EmplaceBackAttr(paddle::any attr);
void EmplaceBackAttrs(const std::vector<paddle::any>& attrs) { void EmplaceBackAttrs(const std::vector<paddle::any>& attrs);
attrs_ = std::move(attrs);
}
const std::pair<size_t, size_t>& InputRangeAt(size_t idx) const; const std::pair<size_t, size_t>& InputRangeAt(size_t idx) const;
const std::pair<size_t, size_t>& OutputRangeAt(size_t idx) const; const std::pair<size_t, size_t>& OutputRangeAt(size_t idx) const;
...@@ -125,13 +123,9 @@ class PADDLE_API CustomOpKernelContext { ...@@ -125,13 +123,9 @@ class PADDLE_API CustomOpKernelContext {
paddle::optional<Tensor> OptionalInputAt(size_t idx); paddle::optional<Tensor> OptionalInputAt(size_t idx);
paddle::optional<std::vector<Tensor>> OptionalInputsBetween(size_t start, paddle::optional<std::vector<Tensor>> OptionalInputsBetween(size_t start,
size_t end); size_t end);
const std::vector<paddle::any>& Attrs() const { return attrs_; } const std::vector<paddle::any>& Attrs() const;
const std::vector<std::pair<size_t, size_t>>& InputRange() { const std::vector<std::pair<size_t, size_t>>& InputRange();
return input_range_; const std::vector<std::pair<size_t, size_t>>& OutputRange();
}
const std::vector<std::pair<size_t, size_t>>& OutputRange() {
return output_range_;
}
Tensor* MutableOutputAt(size_t idx); Tensor* MutableOutputAt(size_t idx);
std::vector<Tensor*> MutableOutputBetween(size_t start, size_t end); std::vector<Tensor*> MutableOutputBetween(size_t start, size_t end);
std::vector<Tensor> OutputsBetween(size_t start, size_t end); std::vector<Tensor> OutputsBetween(size_t start, size_t end);
...@@ -811,38 +805,20 @@ class PADDLE_API OpMetaInfo { ...@@ -811,38 +805,20 @@ class PADDLE_API OpMetaInfo {
//////////////// Op Meta Info Helper ///////////////// //////////////// Op Meta Info Helper /////////////////
class OpMetaInfoHelper { class OpMetaInfoHelper {
public: public:
static const std::string& GetOpName(const paddle::OpMetaInfo& info) { static const std::string& GetOpName(const paddle::OpMetaInfo& info);
return info.name_;
}
static const std::vector<std::string>& GetInputs( static const std::vector<std::string>& GetInputs(
const paddle::OpMetaInfo& info) { const paddle::OpMetaInfo& info);
return info.inputs_;
}
static const std::vector<std::string>& GetOutputs( static const std::vector<std::string>& GetOutputs(
const paddle::OpMetaInfo& info) { const paddle::OpMetaInfo& info);
return info.outputs_;
}
static const std::vector<std::string>& GetAttrs( static const std::vector<std::string>& GetAttrs(
const paddle::OpMetaInfo& info) { const paddle::OpMetaInfo& info);
return info.attrs_;
}
static const std::unordered_map<std::string, std::string>& GetInplaceMap( static const std::unordered_map<std::string, std::string>& GetInplaceMap(
const paddle::OpMetaInfo& info) { const paddle::OpMetaInfo& info);
return info.inplace_map_;
}
static const std::unordered_map<std::string, std::string>& static const std::unordered_map<std::string, std::string>&
GetInplaceReverseMap(const paddle::OpMetaInfo& info) { GetInplaceReverseMap(const paddle::OpMetaInfo& info);
return info.inplace_reverse_map_; static const KernelFunc& GetKernelFn(const paddle::OpMetaInfo& info);
} static const InferShapeFunc& GetInferShapeFn(const paddle::OpMetaInfo& info);
static const KernelFunc& GetKernelFn(const paddle::OpMetaInfo& info) { static const InferDtypeFunc& GetInferDtypeFn(const paddle::OpMetaInfo& info);
return info.kernel_fn_;
}
static const InferShapeFunc& GetInferShapeFn(const paddle::OpMetaInfo& info) {
return info.infer_shape_fn_;
}
static const InferDtypeFunc& GetInferDtypeFn(const paddle::OpMetaInfo& info) {
return info.infer_dtype_fn_;
}
}; };
//////////////// Op Meta Info Map ///////////////// //////////////// Op Meta Info Map /////////////////
......
...@@ -410,7 +410,7 @@ class PADDLE_API Tensor final { ...@@ -410,7 +410,7 @@ class PADDLE_API Tensor final {
* *
* @return const std::string& * @return const std::string&
*/ */
const std::string& name() const { return name_; } const std::string& name() const;
/** /**
* @brief Set name of Tensor. * @brief Set name of Tensor.
...@@ -419,7 +419,7 @@ class PADDLE_API Tensor final { ...@@ -419,7 +419,7 @@ class PADDLE_API Tensor final {
* *
* @param const std::string& name * @param const std::string& name
*/ */
void set_name(const std::string& name) { name_ = name; } void set_name(const std::string& name);
/* Part 5: Data Transform methods */ /* Part 5: Data Transform methods */
/* Alert!!!!: All copy method can only deep copy impl, autograd info only be /* Alert!!!!: All copy method can only deep copy impl, autograd info only be
......
# Dependencies of the phi_tensor_raw target, kept in a single variable so the
# three toolchain branches below (CUDA / ROCm / plain C++) cannot drift apart.
set(PHI_TENSOR_RAW_DEPS
    tensor_base
    dense_tensor
    phi_enforce
    context_pool
    tensor_api
    int_array
    scalar)

# Build tensor.cc with the compiler wrapper that matches the enabled device
# backend; the resulting target name and dependency set are identical in all
# three cases.
if(WITH_GPU)
  nv_library(
    phi_tensor_raw
    SRCS tensor.cc
    DEPS ${PHI_TENSOR_RAW_DEPS})
elseif(WITH_ROCM)
  hip_library(
    phi_tensor_raw
    SRCS tensor.cc
    DEPS ${PHI_TENSOR_RAW_DEPS})
else()
  cc_library(
    phi_tensor_raw
    SRCS tensor.cc
    DEPS ${PHI_TENSOR_RAW_DEPS})
endif()
set(api_gen_base ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/generator/api_base.py) set(api_gen_base ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/generator/api_base.py)
# forward api file # forward api file
...@@ -157,157 +122,77 @@ if(NOT PYTHONINTERP_FOUND) ...@@ -157,157 +122,77 @@ if(NOT PYTHONINTERP_FOUND)
find_package(PythonInterp REQUIRED) find_package(PythonInterp REQUIRED)
endif() endif()
execute_process(COMMAND ${PYTHON_EXECUTABLE} -m pip install pyyaml)
# generate forward api # generate forward api
add_custom_command( execute_process(
OUTPUT ${api_header_file} ${api_source_file}
COMMAND ${PYTHON_EXECUTABLE} -m pip install pyyaml
COMMAND COMMAND
${PYTHON_EXECUTABLE} ${api_gen_file} --api_yaml_path ${api_yaml_file} ${PYTHON_EXECUTABLE} ${api_gen_file} --api_yaml_path ${api_yaml_file}
${legacy_api_yaml_file} --api_header_path ${api_header_file_tmp} ${legacy_api_yaml_file} --api_header_path ${api_header_file_tmp}
--api_source_path ${api_source_file_tmp} --api_source_path ${api_source_file_tmp})
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${api_header_file_tmp}
${api_header_file}
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${api_source_file_tmp}
${api_source_file}
COMMENT "copy_if_different ${api_header_file} ${api_source_file}"
DEPENDS ${api_yaml_file} ${legacy_api_yaml_file} ${api_gen_file}
${api_gen_base}
VERBATIM)
# generate backward api # generate backward api
add_custom_command( execute_process(
OUTPUT ${bw_api_header_file} ${bw_api_source_file} ${bw_api_header_file_tmp}
${bw_api_source_file_tmp}
COMMAND COMMAND
${PYTHON_EXECUTABLE} ${bw_api_gen_file} --backward_yaml_path ${PYTHON_EXECUTABLE} ${bw_api_gen_file} --backward_yaml_path
${bw_api_yaml_file} ${legacy_bw_api_yaml_file} --backward_header_path ${bw_api_yaml_file} ${legacy_bw_api_yaml_file} --backward_header_path
${bw_api_header_file_tmp} --backward_source_path ${bw_api_source_file_tmp} ${bw_api_header_file_tmp} --backward_source_path ${bw_api_source_file_tmp})
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${bw_api_header_file_tmp}
${bw_api_header_file}
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${bw_api_source_file_tmp}
${bw_api_source_file}
COMMENT "copy_if_different ${bw_api_header_file} ${bw_api_source_file}"
DEPENDS ${bw_api_yaml_file} ${bw_api_gen_file} ${api_gen_base}
${legacy_bw_api_yaml_file}
VERBATIM)
# generate fused_op api # generate fused_op api
add_custom_command( execute_process(
OUTPUT ${fused_api_header_file} ${fused_api_source_file}
COMMAND ${PYTHON_EXECUTABLE} -m pip install pyyaml
COMMAND COMMAND
${PYTHON_EXECUTABLE} ${api_gen_file} --api_yaml_path ${fused_api_yaml_file} ${PYTHON_EXECUTABLE} ${api_gen_file} --api_yaml_path ${fused_api_yaml_file}
--is_fused_ops_yaml --api_header_path ${fused_api_header_file_tmp} --is_fused_ops_yaml --api_header_path ${fused_api_header_file_tmp}
--api_source_path ${fused_api_source_file_tmp} --api_source_path ${fused_api_source_file_tmp})
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${fused_api_header_file_tmp}
${fused_api_header_file}
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${fused_api_source_file_tmp}
${fused_api_source_file}
COMMENT "copy_if_different ${fused_api_header_file} ${fused_api_source_file}"
DEPENDS ${fused_api_yaml_file} ${api_gen_file} ${api_gen_base}
VERBATIM)
# generate fused_op backward api # generate fused_op backward api
add_custom_command( execute_process(
OUTPUT ${fused_bw_api_header_file} ${fused_bw_api_source_file}
${fused_bw_api_header_file_tmp} ${fused_bw_api_source_file_tmp}
COMMAND COMMAND
${PYTHON_EXECUTABLE} ${fused_bw_api_gen_file} --backward_yaml_path ${PYTHON_EXECUTABLE} ${fused_bw_api_gen_file} --backward_yaml_path
${fused_bw_api_yaml_file} --is_fused_backward_yaml --backward_header_path ${fused_bw_api_yaml_file} --is_fused_backward_yaml --backward_header_path
${fused_bw_api_header_file_tmp} --backward_source_path ${fused_bw_api_header_file_tmp} --backward_source_path
${fused_bw_api_source_file_tmp} ${fused_bw_api_source_file_tmp})
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${fused_bw_api_header_file_tmp}
${fused_bw_api_header_file}
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${fused_bw_api_source_file_tmp}
${fused_bw_api_source_file}
COMMENT
"copy_if_different ${fused_bw_api_header_file} ${fused_bw_api_source_file}"
DEPENDS ${fused_bw_api_yaml_file} ${bw_api_gen_file} ${api_gen_base}
VERBATIM)
# generate sparse api # generate sparse api
add_custom_command( execute_process(
OUTPUT ${sparse_api_header_file} ${sparse_api_source_file}
COMMAND COMMAND
${PYTHON_EXECUTABLE} ${sparse_api_gen_file} --api_yaml_path ${PYTHON_EXECUTABLE} ${sparse_api_gen_file} --api_yaml_path
${sparse_api_yaml_file} --api_header_path ${sparse_api_header_file_tmp} ${sparse_api_yaml_file} --api_header_path ${sparse_api_header_file_tmp}
--api_source_path ${sparse_api_source_file_tmp} --api_source_path ${sparse_api_source_file_tmp})
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${sparse_api_header_file_tmp}
${sparse_api_header_file}
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${sparse_api_source_file_tmp}
${sparse_api_source_file}
COMMENT
"copy_if_different ${sparse_api_header_file} ${sparse_sparse_api_source_file}"
DEPENDS ${sparse_api_yaml_file} ${sparse_api_gen_file} ${api_gen_base}
${api_gen_file}
VERBATIM)
# generate backward sparse api # generate backward sparse api
add_custom_command( execute_process(
OUTPUT ${sparse_bw_api_header_file} ${sparse_bw_api_source_file}
COMMAND COMMAND
${PYTHON_EXECUTABLE} ${sparse_bw_api_gen_file} --api_yaml_path ${PYTHON_EXECUTABLE} ${sparse_bw_api_gen_file} --api_yaml_path
${sparse_bw_api_yaml_file} --api_header_path ${sparse_bw_api_yaml_file} --api_header_path
${sparse_bw_api_header_file_tmp} --api_source_path ${sparse_bw_api_header_file_tmp} --api_source_path
${sparse_bw_api_source_file_tmp} ${sparse_bw_api_source_file_tmp})
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${sparse_bw_api_header_file_tmp}
${sparse_bw_api_header_file}
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${sparse_bw_api_source_file_tmp}
${sparse_bw_api_source_file}
COMMENT
"copy_if_different ${sparse_bw_api_header_file} ${sparse_bw_sparse_api_source_file}"
DEPENDS ${sparse_bw_api_yaml_file} ${sparse_bw_api_gen_file} ${api_gen_base}
${api_gen_file} ${sparse_api_gen_file} ${bw_api_gen_file}
VERBATIM)
# generate strings api # generate strings api
add_custom_command( execute_process(
OUTPUT ${strings_api_header_file} ${strings_api_source_file}
COMMAND COMMAND
${PYTHON_EXECUTABLE} ${strings_api_gen_file} --api_yaml_path ${PYTHON_EXECUTABLE} ${strings_api_gen_file} --api_yaml_path
${strings_api_yaml_file} --api_header_path ${strings_api_header_file_tmp} ${strings_api_yaml_file} --api_header_path ${strings_api_header_file_tmp}
--api_source_path ${strings_api_source_file_tmp} --api_source_path ${strings_api_source_file_tmp})
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${strings_api_header_file_tmp}
${strings_api_header_file}
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${strings_api_source_file_tmp}
${strings_api_source_file}
COMMENT
"copy_if_different ${strings_api_header_file} ${strings_strings_api_source_file}"
DEPENDS ${strings_api_yaml_file} ${strings_api_gen_file} ${api_gen_base}
${api_gen_file}
VERBATIM)
# generate dygraph(intermediate) api # generate dygraph(intermediate) api
add_custom_command( execute_process(
OUTPUT ${dygraph_api_header_file} ${dygraph_api_source_file}
COMMAND COMMAND
${PYTHON_EXECUTABLE} ${im_api_gen_file} --api_yaml_path ${api_yaml_file} ${PYTHON_EXECUTABLE} ${im_api_gen_file} --api_yaml_path ${api_yaml_file}
${legacy_api_yaml_file} --sparse_api_yaml_path ${sparse_api_yaml_file} ${legacy_api_yaml_file} --sparse_api_yaml_path ${sparse_api_yaml_file}
--dygraph_api_header_path ${dygraph_api_header_file_tmp} --dygraph_api_header_path ${dygraph_api_header_file_tmp}
--dygraph_api_source_path ${dygraph_api_source_file_tmp} --dygraph_api_source_path ${dygraph_api_source_file_tmp})
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${dygraph_api_header_file_tmp}
${dygraph_api_header_file}
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${dygraph_api_source_file_tmp}
${dygraph_api_source_file}
DEPENDS ${api_yaml_file} ${legacy_api_yaml_file} ${sparse_api_yaml_file}
${im_api_gen_file} ${api_gen_base} ${api_gen_file}
VERBATIM)
# generate wrapped infermeta # generate wrapped infermeta
add_custom_command( execute_process(
OUTPUT ${wrapped_infermeta_header_file} ${wrapped_infermeta_source_file}
COMMAND COMMAND
${PYTHON_EXECUTABLE} ${wrapped_infermeta_gen_file} --api_yaml_path ${PYTHON_EXECUTABLE} ${wrapped_infermeta_gen_file} --api_yaml_path
${api_yaml_file} ${legacy_api_yaml_file} --wrapped_infermeta_header_path ${api_yaml_file} ${legacy_api_yaml_file} --wrapped_infermeta_header_path
${wrapped_infermeta_header_file} --wrapped_infermeta_source_path ${wrapped_infermeta_header_file} --wrapped_infermeta_source_path
${wrapped_infermeta_source_file} ${wrapped_infermeta_source_file})
DEPENDS ${api_yaml_file} ${legacy_api_yaml_file} ${wrapped_infermeta_gen_file}
${api_gen_base}
VERBATIM)
# generate tensor and tensor operants file # generate tensor and tensor operants file
message("create or copy auto-geneated tensor files") message("create or copy auto-geneated tensor files")
execute_process(COMMAND ${PYTHON_EXECUTABLE} -m pip install pyyaml)
execute_process( execute_process(
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/generator WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/generator
COMMAND COMMAND
...@@ -324,154 +209,70 @@ if(${_result}) ...@@ -324,154 +209,70 @@ if(${_result})
message(FATAL_ERROR "tensor codegen failed, exiting.") message(FATAL_ERROR "tensor codegen failed, exiting.")
endif() endif()
set(generated_tensor_files set(generated_files
"${operants_base_file}" "${tensor_api_source_file}" "${operants_base_file}"
"${phi_tensor_operants_header_file}" "${phi_tensor_operants_source_file}" "${tensor_api_source_file}"
"${operants_manager_header_file}" "${operants_manager_source_file}") "${phi_tensor_operants_header_file}"
"${phi_tensor_operants_source_file}"
"${operants_manager_header_file}"
"${operants_manager_source_file}"
"${wrapped_infermeta_source_file}"
"${api_source_file}"
"${api_header_file}"
"${bw_api_source_file}"
"${bw_api_header_file}"
"${fused_api_source_file}"
"${fused_api_header_file}"
"${fused_bw_api_source_file}"
"${fused_bw_api_header_file}"
"${sparse_api_source_file}"
"${sparse_api_header_file}"
"${sparse_bw_api_source_file}"
"${sparse_bw_api_header_file}"
"${dygraph_api_source_file}"
"${dygraph_api_header_file}"
"${strings_api_source_file}"
"${strings_api_header_file}")
foreach(generated_tensor_file ${generated_tensor_files}) foreach(generated_file ${generated_files})
if(EXISTS "${generated_tensor_file}.tmp" AND EXISTS if(EXISTS "${generated_file}.tmp" AND EXISTS "${generated_file}")
"${generated_tensor_file}") execute_process(COMMAND ${CMAKE_COMMAND} -E copy_if_different
execute_process( "${generated_file}.tmp" "${generated_file}")
COMMAND ${CMAKE_COMMAND} -E copy_if_different message("copy if different ${generated_file}.tmp ${generated_file}")
"${generated_tensor_file}.tmp" "${generated_tensor_file}") elseif(EXISTS "${generated_file}.tmp")
message( execute_process(COMMAND ${CMAKE_COMMAND} -E copy "${generated_file}.tmp"
"copy if different ${generated_tensor_file}.tmp ${generated_tensor_file}") "${generated_file}")
elseif(EXISTS "${generated_tensor_file}.tmp") message("copy ${generated_file}.tmp ${generated_file}")
execute_process(
COMMAND ${CMAKE_COMMAND} -E copy "${generated_tensor_file}.tmp"
"${generated_tensor_file}")
message("copy ${generated_tensor_file}.tmp ${generated_tensor_file}")
endif() endif()
endforeach() endforeach()
cc_library( collect_srcs(
op_meta_info api_srcs
SRCS op_meta_info.cc SRCS
DEPS phi_tensor_raw) tensor.cc
cc_library( op_meta_info.cc
wrapped_infermeta context_pool.cc
SRCS ${wrapped_infermeta_source_file} tensor_utils.cc
DEPS phi) kernel_dispatch.cc
cc_library( api_gen_utils.cc
context_pool data_transform.cc
SRCS context_pool.cc api_custom_impl.cc
DEPS phi_backends phi_enforce place init phi_device_context) tensor_method.cc
cc_library( tensor_copy.cc
api_tensor_utils scalar.cc
SRCS tensor_utils.cc int_array.cc)
DEPS phi_tensor_raw) collect_generated_srcs(
api_srcs
cc_library( SRCS
kernel_dispatch ${wrapped_infermeta_source_file}
SRCS kernel_dispatch.cc ${api_source_file}
DEPS phi_tensor_raw phi_backends kernel_factory context_pool) ${bw_api_source_file}
cc_library( ${fused_api_source_file}
api_gen_utils ${fused_bw_api_source_file}
SRCS api_gen_utils.cc ${sparse_api_source_file}
DEPS phi_tensor_raw selected_rows sparse_csr_tensor sparse_coo_tensor ${sparse_bw_api_source_file}
infermeta_utils) ${dygraph_api_source_file}
cc_library( ${strings_api_source_file}
phi_data_transform ${phi_tensor_operants_source_file}
SRCS data_transform.cc ${operants_manager_source_file}
DEPS phi_tensor_raw phi tensor) ${tensor_api_source_file})
cc_library(
api_custom_impl
SRCS api_custom_impl.cc
DEPS phi_tensor_raw
phi
kernel_dispatch
api_gen_utils
backward_infermeta
phi_data_transform
phi_profiler)
cc_library(
phi_function_api
SRCS ${api_source_file} ${fused_api_source_file}
DEPS phi_tensor_raw
phi
kernel_dispatch
api_gen_utils
phi_data_transform
api_custom_impl
api_tensor_utils
phi_profiler)
cc_library(
phi_bw_function_api
SRCS ${bw_api_source_file} ${fused_bw_api_source_file}
DEPS phi_tensor_raw
phi
kernel_dispatch
api_gen_utils
backward_infermeta
sparse_backward_infermeta
phi_data_transform
phi_function_api
api_custom_impl
global_utils
phi_profiler)
cc_library(
sparse_api
SRCS ${sparse_api_source_file}
DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_profiler)
cc_library(
sparse_bw_api
SRCS ${sparse_bw_api_source_file}
DEPS phi_tensor_raw
phi
kernel_dispatch
api_gen_utils
sparse_api
sparse_backward_infermeta
phi_profiler)
cc_library(
phi_dygraph_api
SRCS ${dygraph_api_source_file}
DEPS phi_tensor_raw
phi
kernel_dispatch
api_gen_utils
phi_data_transform
phi_function_api
sparse_api
phi_profiler)
cc_library(
strings_api
SRCS ${strings_api_source_file}
DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_profiler)
cc_library(
phi_tensor
SRCS tensor_method.cc
DEPS phi_tensor_raw
phi_function_api
api_gen_utils
kernel_dispatch
infermeta
sparse_infermeta
sparse_api
strings_api)
cc_library(
tensor_copy
SRCS tensor_copy.cc
DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils)
cc_library(
api_scalar
SRCS scalar.cc
DEPS tensor_copy)
cc_library(
api_int_array
SRCS int_array.cc
DEPS tensor_copy)
cc_library(
phi_tensor_operants
SRCS ${phi_tensor_operants_source_file}
DEPS phi_function_api)
cc_library(
operants_manager
SRCS ${operants_manager_source_file}
DEPS phi_enforce)
cc_library(
tensor_api
SRCS ${tensor_api_source_file}
DEPS operants_manager)
...@@ -65,11 +65,12 @@ PADDLE_API phi::Allocator* GetAllocator(const phi::Place& place) { ...@@ -65,11 +65,12 @@ PADDLE_API phi::Allocator* GetAllocator(const phi::Place& place) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PADDLE_API phi::CUDAStream* GetCurrentCUDAStream(const phi::Place& place) { PADDLE_API phi::CUDAStream* GetCurrentCUDAStream(const phi::Place& place) {
PADDLE_ENFORCE(place.GetType() == phi::AllocationType::GPU, PADDLE_ENFORCE_EQ(place.GetType(),
phi::errors::InvalidArgument( phi::AllocationType::GPU,
"GetCurrentCUDAStream only supports GPUPlace input. " phi::errors::InvalidArgument(
"However, your input is place=%s", "GetCurrentCUDAStream only supports GPUPlace input. "
place)); "However, your input is place=%s",
place));
auto& pool = paddle::experimental::DeviceContextPool::Instance(); auto& pool = paddle::experimental::DeviceContextPool::Instance();
const phi::GPUContext* dev_ctx = const phi::GPUContext* dev_ctx =
......
...@@ -119,6 +119,11 @@ void CustomOpKernelContext::EmplaceBackAttr(paddle::any attr) { ...@@ -119,6 +119,11 @@ void CustomOpKernelContext::EmplaceBackAttr(paddle::any attr) {
<< " has value of type: " << attrs_[attrs_.size() - 1].type().name(); << " has value of type: " << attrs_[attrs_.size() - 1].type().name();
} }
// Replaces the context's attribute list with a copy of `attrs`.
//
// Note: the previous implementation wrote `attrs_ = std::move(attrs);`, but
// `attrs` is a const lvalue reference, so std::move was a no-op and the copy
// assignment operator was selected anyway (clang-tidy:
// performance-move-const-arg). Assign directly to make the copy explicit.
void CustomOpKernelContext::EmplaceBackAttrs(
    const std::vector<paddle::any>& attrs) {
  attrs_ = attrs;
}
const Tensor& CustomOpKernelContext::InputAt(size_t idx) const { const Tensor& CustomOpKernelContext::InputAt(size_t idx) const {
return inputs_.at(idx); return inputs_.at(idx);
} }
...@@ -132,6 +137,10 @@ std::vector<Tensor> CustomOpKernelContext::InputsBetween(size_t start, ...@@ -132,6 +137,10 @@ std::vector<Tensor> CustomOpKernelContext::InputsBetween(size_t start,
return rlt; return rlt;
} }
// Read-only view of all attributes emplaced into this kernel context, in
// insertion order.
const std::vector<paddle::any>& CustomOpKernelContext::Attrs() const {
  return attrs_;
}
Tensor& CustomOpKernelContext::MutableInputAt(size_t idx) { Tensor& CustomOpKernelContext::MutableInputAt(size_t idx) {
return inputs_.at(idx); return inputs_.at(idx);
} }
...@@ -193,6 +202,16 @@ const std::pair<size_t, size_t>& CustomOpKernelContext::OutputRangeAt( ...@@ -193,6 +202,16 @@ const std::pair<size_t, size_t>& CustomOpKernelContext::OutputRangeAt(
return output_range_.at(idx); return output_range_.at(idx);
} }
// Returns the [start, end) index ranges of each logical input in the flat
// inputs_ vector (one pair per declared input).
const std::vector<std::pair<size_t, size_t>>&
CustomOpKernelContext::InputRange() {
  return input_range_;
}

// Returns the [start, end) index ranges of each logical output in the flat
// outputs vector (one pair per declared output).
const std::vector<std::pair<size_t, size_t>>&
CustomOpKernelContext::OutputRange() {
  return output_range_;
}
void CustomOpKernelContext::ConstructInplaceIndex( void CustomOpKernelContext::ConstructInplaceIndex(
const std::vector<std::string>& inputs, const std::vector<std::string>& inputs,
const std::vector<std::string>& outputs, const std::vector<std::string>& outputs,
...@@ -208,8 +227,9 @@ void CustomOpKernelContext::ConstructInplaceIndex( ...@@ -208,8 +227,9 @@ void CustomOpKernelContext::ConstructInplaceIndex(
continue; continue;
} }
auto out_iter = find(outputs.begin(), outputs.end(), inplace_map.at(input)); auto out_iter = find(outputs.begin(), outputs.end(), inplace_map.at(input));
PADDLE_ENFORCE( PADDLE_ENFORCE_NE(
out_iter != outputs.end(), out_iter,
outputs.end(),
phi::errors::NotFound("Can't find the mapped value of %s, please check " phi::errors::NotFound("Can't find the mapped value of %s, please check "
"the input of `Inplace` again and make " "the input of `Inplace` again and make "
"sure you registered your op accurately. ", "sure you registered your op accurately. ",
...@@ -253,8 +273,9 @@ void CustomOpKernelContext::AssignInplaceOutputs() { ...@@ -253,8 +273,9 @@ void CustomOpKernelContext::AssignInplaceOutputs() {
size_t out_start_idx = output_range_[pair.second].first; size_t out_start_idx = output_range_[pair.second].first;
size_t out_end_idx = output_range_[pair.second].second; size_t out_end_idx = output_range_[pair.second].second;
size_t assign_tensor_size = in_end_idx - in_start_idx; size_t assign_tensor_size = in_end_idx - in_start_idx;
PADDLE_ENFORCE( PADDLE_ENFORCE_EQ(
assign_tensor_size == out_end_idx - out_start_idx, assign_tensor_size,
out_end_idx - out_start_idx,
phi::errors::OutOfRange("When assigning inplaced tensor, Input vector " phi::errors::OutOfRange("When assigning inplaced tensor, Input vector "
"size %d mismatch output vector size %d", "size %d mismatch output vector size %d",
in_end_idx - in_start_idx, in_end_idx - in_start_idx,
...@@ -316,6 +337,43 @@ OpMetaInfo& OpMetaInfo::SetInferDtypeFn(InferDtypeFunc&& func) { ...@@ -316,6 +337,43 @@ OpMetaInfo& OpMetaInfo::SetInferDtypeFn(InferDtypeFunc&& func) {
return *this; return *this;
} }
//////////////// Op Meta Info Helper /////////////////
// Out-of-line definitions of the OpMetaInfoHelper accessors. Each one simply
// exposes a private field of paddle::OpMetaInfo; the bodies were moved out of
// the header as part of the PHI shared-library decoupling — presumably so the
// symbols are emitted by this translation unit (TODO confirm against the
// header/ABI requirements).
const std::string& OpMetaInfoHelper::GetOpName(const paddle::OpMetaInfo& info) {
  return info.name_;
}
const std::vector<std::string>& OpMetaInfoHelper::GetInputs(
    const paddle::OpMetaInfo& info) {
  return info.inputs_;
}
const std::vector<std::string>& OpMetaInfoHelper::GetOutputs(
    const paddle::OpMetaInfo& info) {
  return info.outputs_;
}
const std::vector<std::string>& OpMetaInfoHelper::GetAttrs(
    const paddle::OpMetaInfo& info) {
  return info.attrs_;
}
// Mapping from input name to the output name it is inplaced with.
const std::unordered_map<std::string, std::string>&
OpMetaInfoHelper::GetInplaceMap(const paddle::OpMetaInfo& info) {
  return info.inplace_map_;
}
// Reverse of GetInplaceMap: output name -> input name.
const std::unordered_map<std::string, std::string>&
OpMetaInfoHelper::GetInplaceReverseMap(const paddle::OpMetaInfo& info) {
  return info.inplace_reverse_map_;
}
const KernelFunc& OpMetaInfoHelper::GetKernelFn(
    const paddle::OpMetaInfo& info) {
  return info.kernel_fn_;
}
const InferShapeFunc& OpMetaInfoHelper::GetInferShapeFn(
    const paddle::OpMetaInfo& info) {
  return info.infer_shape_fn_;
}
const InferDtypeFunc& OpMetaInfoHelper::GetInferDtypeFn(
    const paddle::OpMetaInfo& info) {
  return info.infer_dtype_fn_;
}
//////////////// Op Meta Info Map ///////////////// //////////////// Op Meta Info Map /////////////////
std::vector<OpMetaInfo>& OpMetaInfoMap::operator[](const std::string& name) { std::vector<OpMetaInfo>& OpMetaInfoMap::operator[](const std::string& name) {
...@@ -414,21 +472,23 @@ OpMetaInfoBuilder& OpMetaInfoBuilder::SetInplaceMap( ...@@ -414,21 +472,23 @@ OpMetaInfoBuilder& OpMetaInfoBuilder::SetInplaceMap(
const std::vector<std::string>& outputs = const std::vector<std::string>& outputs =
OpMetaInfoHelper::GetOutputs(*info_ptr_); OpMetaInfoHelper::GetOutputs(*info_ptr_);
for (const auto& pair : inplace_map) { for (const auto& pair : inplace_map) {
PADDLE_ENFORCE( PADDLE_ENFORCE_NE(
std::find(inputs.begin(), inputs.end(), pair.first) != inputs.cend(), std::find(inputs.begin(), inputs.end(), pair.first),
inputs.cend(),
phi::errors::PreconditionNotMet( phi::errors::PreconditionNotMet(
"The register of operator %s's `SetInplaceMap` failed. " "The register of operator %s's `SetInplaceMap` failed. "
"Please make sure: 1. Call `Inputs` and `Outputs` before " "Please make sure: 1. Call `Inputs` and `Outputs` before "
"`SetInplaceMap`; 2. The keys of inplace_map are inside `Inputs`", "`SetInplaceMap`; 2. The keys of inplace_map are inside `Inputs`",
name_)); name_));
PADDLE_ENFORCE(std::find(outputs.begin(), outputs.end(), pair.second) != PADDLE_ENFORCE_NE(
outputs.cend(), std::find(outputs.begin(), outputs.end(), pair.second),
phi::errors::PreconditionNotMet( outputs.cend(),
"The register of operator %s's `SetInplaceMap` failed. " phi::errors::PreconditionNotMet(
"Please make sure: 1. Call `Inputs` and `Outputs` " "The register of operator %s's `SetInplaceMap` failed. "
"before `SetInplaceMap`; 2. The values of inplace_map " "Please make sure: 1. Call `Inputs` and `Outputs` "
"are inside `Outputs`", "before `SetInplaceMap`; 2. The values of inplace_map "
name_)); "are inside `Outputs`",
name_));
} }
info_ptr_->SetInplaceMap( info_ptr_->SetInplaceMap(
std::forward<std::unordered_map<std::string, std::string>>(inplace_map)); std::forward<std::unordered_map<std::string, std::string>>(inplace_map));
......
...@@ -358,6 +358,10 @@ gpuStream_t Tensor::stream() const { ...@@ -358,6 +358,10 @@ gpuStream_t Tensor::stream() const {
} }
#endif #endif
// Accessor for the tensor's name_ field.
const std::string &Tensor::name() const { return name_; }
// Sets the tensor's name_ field.
void Tensor::set_name(const std::string &name) { name_ = name; }
/* Part 5: Status utils methods */ /* Part 5: Status utils methods */
bool Tensor::defined() const { return impl_ != nullptr; } bool Tensor::defined() const { return impl_ != nullptr; }
......
...@@ -26,16 +26,4 @@ if(WITH_PYTHON AND EXISTS ${PADDLE_BINARY_DIR}) ...@@ -26,16 +26,4 @@ if(WITH_PYTHON AND EXISTS ${PADDLE_BINARY_DIR})
endif() endif()
endif() endif()
if(WITH_GPU OR WITH_ROCM) collect_srcs(api_srcs SRCS device_tracer.cc profiler.cc)
set(GPU_CTX_DEPS dynload_cuda dynamic_loader)
endif()
cc_library(
phi_device_tracer
SRCS device_tracer.cc
DEPS phi_profiler_proto ${GPU_CTX_DEPS})
cc_library(
phi_profiler
SRCS profiler.cc
DEPS phi_os_info phi_device_tracer phi_enforce)
...@@ -2,17 +2,6 @@ add_subdirectory(dynload) ...@@ -2,17 +2,6 @@ add_subdirectory(dynload)
add_subdirectory(gpu) add_subdirectory(gpu)
set(BACKENDS_SRCS all_context.cc cpu/cpu_context.cc cpu/cpu_info.cc) set(BACKENDS_SRCS all_context.cc cpu/cpu_context.cc cpu/cpu_info.cc)
set(BACKENDS_DEPS
enforce
place
flags
eigen3
phi_device_context
generator
phi_os_info)
if(WITH_XBYAK)
list(APPEND BACKENDS_DEPS xbyak)
endif()
if(NOT APPLE AND NOT WIN32) if(NOT APPLE AND NOT WIN32)
list(APPEND BACKENDS_SRCS device_code.cc) list(APPEND BACKENDS_SRCS device_code.cc)
...@@ -23,16 +12,10 @@ if(WITH_GPU OR WITH_ROCM) ...@@ -23,16 +12,10 @@ if(WITH_GPU OR WITH_ROCM)
gpu/gpu_resources.cc) gpu/gpu_resources.cc)
if(WITH_GPU) if(WITH_GPU)
list(APPEND BACKENDS_SRCS gpu/cuda/cuda_info.cc gpu/cuda/cuda_graph.cc) list(APPEND BACKENDS_SRCS gpu/cuda/cuda_info.cc gpu/cuda/cuda_graph.cc)
set_source_files_properties(
gpu/gpu_resources.cc
PROPERTIES COMPILE_FLAGS
"-DCUDA_REAL_ARCHS=\"${NVCC_FLAGS_EXTRA_real_archs}\"")
endif() endif()
if(WITH_ROCM) if(WITH_ROCM)
list(APPEND BACKENDS_SRCS gpu/rocm/rocm_info.cc) list(APPEND BACKENDS_SRCS gpu/rocm/rocm_info.cc)
endif() endif()
list(APPEND BACKENDS_DEPS phi_dynload_cuda)
endif() endif()
if(WITH_XPU) if(WITH_XPU)
...@@ -45,7 +28,6 @@ if(WITH_MKLDNN) ...@@ -45,7 +28,6 @@ if(WITH_MKLDNN)
list(APPEND BACKENDS_SRCS onednn/onednn_context.cc) list(APPEND BACKENDS_SRCS onednn/onednn_context.cc)
list(APPEND BACKENDS_SRCS onednn/axpy_handler.cc) list(APPEND BACKENDS_SRCS onednn/axpy_handler.cc)
list(APPEND BACKENDS_SRCS onednn/matmul_utils.cc) list(APPEND BACKENDS_SRCS onednn/matmul_utils.cc)
list(APPEND BACKENDS_DEPS mkldnn)
endif() endif()
list( list(
...@@ -55,26 +37,25 @@ list( ...@@ -55,26 +37,25 @@ list(
device_guard.cc device_guard.cc
stream.cc stream.cc
event.cc event.cc
device_base.cc
device_manager.cc device_manager.cc
context_pool.cc) context_pool.cc)
if(WITH_GPU
OR WITH_ROCM
OR WITH_CUSTOM_DEVICE)
list(APPEND BACKENDS_SRCS device_base.cc)
endif()
if(WITH_CUSTOM_DEVICE) if(WITH_CUSTOM_DEVICE)
list(APPEND BACKENDS_SRCS custom/custom_context.cc custom/custom_device.cc list(APPEND BACKENDS_SRCS custom/custom_context.cc custom/custom_device.cc
custom/custom_device_op_list.cc) custom/custom_device_op_list.cc)
endif() endif()
add_library(phi_backends "${BACKENDS_SRCS}") collect_srcs(backends_srcs SRCS ${BACKENDS_SRCS})
target_link_libraries(phi_backends ${BACKENDS_DEPS})
# for inference library
get_property(phi_modules GLOBAL PROPERTY PHI_MODULES)
set(phi_modules ${phi_modules} phi_backends)
set_property(GLOBAL PROPERTY PHI_MODULES "${phi_modules}")
if(WITH_CUSTOM_DEVICE) if(WITH_CUSTOM_DEVICE)
cc_test( cc_test(
capi_test capi_test
SRCS custom/capi_test.cc SRCS custom/capi_test.cc
DEPS phi_capi) DEPS phi)
endif() endif()
...@@ -24,6 +24,10 @@ ...@@ -24,6 +24,10 @@
namespace phi { namespace phi {
template <>
const TypeInfo<DeviceContext> TypeInfoTraits<DeviceContext, CPUContext>::kType =
RegisterStaticType<DeviceContext>(CPUContext::name());
struct CPUContext::Impl { struct CPUContext::Impl {
Impl() : place_(CPUPlace()) {} Impl() : place_(CPUPlace()) {}
......
...@@ -19,6 +19,11 @@ limitations under the License. */ ...@@ -19,6 +19,11 @@ limitations under the License. */
namespace phi { namespace phi {
template <>
const TypeInfo<DeviceContext>
TypeInfoTraits<DeviceContext, CustomContext>::kType =
RegisterStaticType<DeviceContext>(CustomContext::name());
struct CustomContext::Impl { struct CustomContext::Impl {
explicit Impl(const CustomPlace& place) : place_(place) {} explicit Impl(const CustomPlace& place) : place_(place) {}
......
cc_library( set(DYNLOAD_COMMON_SRCS dynamic_loader.cc port.cc warpctc.cc warprnnt.cc
phi_dynamic_loader lapack.cc)
SRCS dynamic_loader.cc port.cc if(WITH_ASCEND_CL)
DEPS enforce glog gflags) list(REMOVE_ITEM DYNLOAD_COMMON_SRCS warprnnt.cc)
endif()
list( list(
APPEND APPEND
CUDA_SRCS CUDA_SRCS
...@@ -60,66 +60,39 @@ configure_file(cupti_lib_path.h.in ${CMAKE_CURRENT_BINARY_DIR}/cupti_lib_path.h) ...@@ -60,66 +60,39 @@ configure_file(cupti_lib_path.h.in ${CMAKE_CURRENT_BINARY_DIR}/cupti_lib_path.h)
if(CUPTI_FOUND) if(CUPTI_FOUND)
list(APPEND CUDA_SRCS cupti.cc) list(APPEND CUDA_SRCS cupti.cc)
endif() endif()
if(WITH_ROCM)
hip_library(
phi_dynload_cuda
SRCS ${HIP_SRCS}
DEPS phi_dynamic_loader)
cc_library(
phi_dynload_warpctc
SRCS warpctc.cc
DEPS phi_dynamic_loader warpctc)
cc_library(
phi_dynload_warprnnt
SRCS warprnnt.cc
DEPS phi_dynamic_loader warprnnt)
else()
nv_library(
phi_dynload_cuda
SRCS ${CUDA_SRCS}
DEPS phi_dynamic_loader)
cc_library(
phi_dynload_warpctc
SRCS warpctc.cc
DEPS phi_dynamic_loader warpctc)
cc_library(
phi_dynload_warprnnt
SRCS warprnnt.cc
DEPS phi_dynamic_loader warprnnt)
endif()
if(WITH_MKLML) if(WITH_MKLML)
cc_library( # Only deps libmklml.so, not link
phi_dynload_mklml add_library(dynload_mklml STATIC mklml.cc)
SRCS mklml.cc add_dependencies(dynload_mklml mklml)
DEPS phi_dynamic_loader mklml) if(WIN32)
target_link_libraries(dynload_mklml ${MKLML_IOMP_LIB})
else()
target_link_libraries(dynload_mklml
"-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed")
endif()
endif() endif()
if(WITH_FLASHATTN) if(WITH_FLASHATTN)
cc_library( list(APPEND DYNLOAD_COMMON_SRCS flashattn.cc)
phi_dynload_flashattn
SRCS flashattn.cc
DEPS phi_dynamic_loader flashattn)
endif() endif()
cc_library(
phi_dynload_lapack
SRCS lapack.cc
DEPS phi_dynamic_loader)
add_dependencies(phi_dynload_lapack extern_lapack)
# TODO(TJ): add iomp, mkldnn?
if(MKL_FOUND AND WITH_ONEMKL) if(MKL_FOUND AND WITH_ONEMKL)
message("ONEMKL INCLUDE directory is ${MKL_INCLUDE}") message("ONEMKL INCLUDE directory is ${MKL_INCLUDE}")
cc_library( list(APPEND DYNLOAD_COMMON_SRCS mklrt.cc)
phi_dynload_mklrt endif()
SRCS mklrt.cc
DEPS phi_dynamic_loader) if(WITH_ROCM)
target_include_directories(phi_dynload_mklrt PRIVATE ${MKL_INCLUDE}) collect_srcs(backends_srcs SRCS ${DYNLOAD_COMMON_SRCS} ${HIP_SRCS})
elseif(WITH_GPU)
collect_srcs(backends_srcs SRCS ${DYNLOAD_COMMON_SRCS} ${CUDA_SRCS})
else()
collect_srcs(backends_srcs SRCS ${DYNLOAD_COMMON_SRCS})
endif() endif()
if(WITH_CUDNN_FRONTEND) if(WITH_CUDNN_FRONTEND)
nv_test( nv_test(
cudnn_frontend_test cudnn_frontend_test
SRCS cudnn_frontend_test.cc SRCS cudnn_frontend_test.cc
DEPS phi_dynload_cuda cudnn-frontend) DEPS phi cudnn-frontend)
endif() endif()
cc_library(cudnn_workspace_helper SRCS cudnn_workspace_helper.cc) collect_srcs(backends_srcs SRCS cudnn_workspace_helper.cc)
...@@ -59,6 +59,15 @@ limitations under the License. */ ...@@ -59,6 +59,15 @@ limitations under the License. */
namespace phi { namespace phi {
template <>
const TypeInfo<DeviceContext> TypeInfoTraits<DeviceContext, GPUContext>::kType =
RegisterStaticType<DeviceContext>(GPUContext::name());
template <>
const TypeInfo<DeviceContext>
TypeInfoTraits<DeviceContext, GPUPinnedContext>::kType =
RegisterStaticType<DeviceContext>(GPUPinnedContext::name());
namespace internal { namespace internal {
class EigenGpuStreamDevice : public Eigen::StreamInterface { class EigenGpuStreamDevice : public Eigen::StreamInterface {
......
...@@ -15,6 +15,8 @@ limitations under the License. */ ...@@ -15,6 +15,8 @@ limitations under the License. */
#pragma once #pragma once
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include <array> #include <array>
#include <functional> #include <functional>
#include <mutex> #include <mutex>
...@@ -305,3 +307,5 @@ class GPUPinnedContext ...@@ -305,3 +307,5 @@ class GPUPinnedContext
}; };
#endif #endif
} // namespace phi } // namespace phi
#endif
...@@ -83,6 +83,11 @@ void OneDNNContextThreadLocals::Body::log_lib_version(void) { ...@@ -83,6 +83,11 @@ void OneDNNContextThreadLocals::Body::log_lib_version(void) {
} }
} }
OneDNNContextThreadLocals::Body& OneDNNContextThreadLocals::fetch() {
thread_local Body b;
return b;
}
struct OneDNNContext::Impl { struct OneDNNContext::Impl {
Impl() : p_blobmap_() { Impl() : p_blobmap_() {
p_blobmap_.reset(new BlobMap()); p_blobmap_.reset(new BlobMap());
...@@ -462,5 +467,7 @@ const std::vector<std::string>& OneDNNContext::GetOutputsName( ...@@ -462,5 +467,7 @@ const std::vector<std::string>& OneDNNContext::GetOutputsName(
return impl_->GetOutputsName(output); return impl_->GetOutputsName(output);
} }
const char* OneDNNContext::name() { return "OneDNNContext"; }
} // namespace phi } // namespace phi
#endif #endif
...@@ -76,10 +76,7 @@ class OneDNNContextThreadLocals { ...@@ -76,10 +76,7 @@ class OneDNNContextThreadLocals {
static constexpr size_t kMKLDNNSessionID_Default = 0; static constexpr size_t kMKLDNNSessionID_Default = 0;
// mkldnn session id for cache clearing mode // mkldnn session id for cache clearing mode
static constexpr size_t kMKLDNNSessionID_CacheClearing = -1; static constexpr size_t kMKLDNNSessionID_CacheClearing = -1;
static Body& fetch() { static Body& fetch();
thread_local Body b;
return b;
}
}; };
class OneDNNContext : public CPUContext { class OneDNNContext : public CPUContext {
...@@ -157,7 +154,7 @@ class OneDNNContext : public CPUContext { ...@@ -157,7 +154,7 @@ class OneDNNContext : public CPUContext {
const std::vector<std::string>& GetOutputsName( const std::vector<std::string>& GetOutputsName(
const std::string& output) const; const std::string& output) const;
static const char* name() { return "OneDNNContext"; } static const char* name();
private: private:
struct Impl; struct Impl;
......
...@@ -30,6 +30,9 @@ namespace xpu = baidu::xpu::api; ...@@ -30,6 +30,9 @@ namespace xpu = baidu::xpu::api;
namespace phi { namespace phi {
template <>
const TypeInfo<DeviceContext> TypeInfoTraits<DeviceContext, XPUContext>::kType =
RegisterStaticType<DeviceContext>(XPUContext::name());
struct XPUContext::Impl { struct XPUContext::Impl {
void SetL3Cache(int l3_size = 14155776) { void SetL3Cache(int l3_size = 14155776) {
const int MAX_XPU_NUM = 16; const int MAX_XPU_NUM = 16;
......
add_subdirectory(lib) add_subdirectory(lib)
cc_library(
phi_capi
SRCS all.cc
DEPS phi_c_data_type
phi_c_device_context
phi_c_int_array
phi_c_kernel_context
phi_c_kernel_factory
phi_c_kernel_registry
phi_c_place
phi_c_scalar
phi_c_tensor)
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/capi/all.h"
namespace paddle {
namespace capi {} // namespace capi
} // namespace paddle
cc_library( collect_srcs(
phi_c_data_type capi_srcs
SRCS c_data_type.cc SRCS
DEPS dense_tensor) c_data_type.cc
c_device_context.cc
cc_library( c_int_array.cc
phi_c_device_context c_kernel_context.cc
SRCS c_device_context.cc c_kernel_factory.cc
DEPS phi_backends) c_kernel_registry.cc
c_place.cc
cc_library( c_scalar.cc
phi_c_int_array c_tensor.cc)
SRCS c_int_array.cc
DEPS int_array)
cc_library(
phi_c_kernel_context
SRCS c_kernel_context.cc
DEPS kernel_context)
cc_library(
phi_c_kernel_factory
SRCS c_kernel_factory.cc
DEPS kernel_factory)
cc_library(
phi_c_kernel_registry
SRCS c_kernel_registry.cc
DEPS dense_tensor)
cc_library(
phi_c_place
SRCS c_place.cc
DEPS phi_place)
cc_library(
phi_c_scalar
SRCS c_scalar.cc
DEPS scalar)
cc_library(
phi_c_tensor
SRCS c_tensor.cc
DEPS dense_tensor)
if(WITH_GPU) collect_srcs(common_srcs SRCS place.cc scalar.cc int_array.cc memory_utils.cc)
nv_library(
phi_place
SRCS place.cc
DEPS phi_backends)
elseif(WITH_ROCM)
hip_library(
phi_place
SRCS place.cc
DEPS phi_backends)
else()
cc_library(phi_place SRCS place.cc)
endif()
cc_library(
scalar
SRCS scalar.cc
DEPS phi_enforce phi_tensor_utils)
cc_library(
int_array
SRCS int_array.cc
DEPS phi_enforce phi_tensor_utils)
cc_library(
memory_utils
SRCS memory_utils.cc
DEPS phi_enforce phi_place)
...@@ -6,150 +6,35 @@ if(WITH_GPU) ...@@ -6,150 +6,35 @@ if(WITH_GPU)
proto_library(external_error_proto SRCS external_error.proto) proto_library(external_error_proto SRCS external_error.proto)
endif() endif()
cc_library( collect_srcs(
flags core_srcs
SRCS flags.cc SRCS
DEPS gflags) flags.cc
errors.cc
cc_library(errors SRCS errors.cc) enforce.cc
set(phi_enforce_deps errors flags) os_info.cc
if(WITH_GPU) kernel_context.cc
set(phi_enforce_deps ${phi_enforce_deps} external_error_proto) ddim.cc
endif() tensor_base.cc
cc_library( allocator.cc
phi_enforce tensor_meta.cc
SRCS enforce.cc lod_utils.cc
DEPS ${phi_enforce_deps}) threadpool.cc
dense_tensor.cc
cc_library( dense_tensor_impl.cc
phi_os_info sparse_coo_tensor.cc
SRCS os_info.cc sparse_csr_tensor.cc
DEPS phi_enforce) string_tensor.cc
tensor_array.cc
if(WITH_XPU) extended_tensor.cc
cc_library( meta_tensor.cc
kernel_factory infermeta_utils.cc
SRCS kernel_factory.cc selected_rows_impl.cc
DEPS phi_enforce convert_utils phi_backends) selected_rows.cc
else() device_context.cc
cc_library( custom_kernel.cc
kernel_factory mixed_vector.cc
SRCS kernel_factory.cc generator.cc
DEPS phi_enforce convert_utils) kernel_factory.cc
endif() tensor_utils.cc
cc_library( storage_properties.cc)
kernel_context
SRCS kernel_context.cc
DEPS phi_enforce phi_backends)
cc_library(
ddim
SRCS ddim.cc
DEPS phi_enforce)
cc_library(
tensor_base
SRCS tensor_base.cc allocator.cc
DEPS phi_enforce)
cc_library(
tensor_meta
SRCS tensor_meta.cc
DEPS phi_enforce)
cc_library(
lod_utils
SRCS lod_utils.cc
DEPS phi_enforce)
cc_library(
threadpool
SRCS threadpool.cc
DEPS phi_enforce)
cc_library(
dense_tensor
SRCS dense_tensor.cc dense_tensor_impl.cc
DEPS convert_utils tensor_meta tensor_base ddim)
target_link_libraries(dense_tensor memory_utils)
cc_library(
sparse_coo_tensor
SRCS sparse_coo_tensor.cc
DEPS tensor_meta tensor_base)
cc_library(
sparse_csr_tensor
SRCS sparse_csr_tensor.cc
DEPS dense_tensor tensor_base)
cc_library(
string_tensor
SRCS string_tensor.cc
DEPS convert_utils tensor_meta tensor_base)
cc_library(
tensor_array
SRCS tensor_array.cc
DEPS dense_tensor tensor_base)
cc_library(
extended_tensor
SRCS extended_tensor.cc
DEPS tensor_base)
cc_library(
meta_tensor
SRCS meta_tensor.cc
DEPS tensor_base tensor_meta dense_tensor)
cc_library(
infermeta_utils
SRCS infermeta_utils.cc
DEPS meta_tensor)
cc_library(
selected_rows
SRCS selected_rows_impl.cc selected_rows.cc
DEPS tensor_base dense_tensor phi_enforce ddim)
cc_library(
phi_device_context
SRCS device_context.cc
DEPS dense_tensor selected_rows)
cc_library(
custom_kernel
SRCS custom_kernel.cc
DEPS kernel_factory)
cc_library(
mixed_vector
SRCS mixed_vector.cc
DEPS phi_backends place memory)
cc_library(
generator
SRCS generator.cc
DEPS enforce place)
# Will remove once we implemented MKLDNN_Tensor
if(WITH_MKLDNN)
add_dependencies(dense_tensor mkldnn)
add_dependencies(tensor_base mkldnn)
endif()
if(WITH_GPU)
nv_library(
phi_tensor_utils
SRCS tensor_utils.cc
DEPS phi_backends dense_tensor selected_rows memcpy memory_utils)
elseif(WITH_ROCM)
hip_library(
phi_tensor_utils
SRCS tensor_utils.cc
DEPS phi_backends dense_tensor selected_rows memcpy memory_utils)
elseif(WITH_XPU_KP)
xpu_library(
phi_tensor_utils
SRCS tensor_utils.cc
DEPS phi_backends dense_tensor selected_rows memcpy memory_utils)
else()
cc_library(
phi_tensor_utils
SRCS tensor_utils.cc
DEPS dense_tensor selected_rows memcpy phi_backends memory_utils)
endif()
cc_library( collect_srcs(core_srcs SRCS arg_map_context.cc op_utils.cc
arg_map_context get_kerneltype_forvar_utils.cc convert_utils.cc)
SRCS arg_map_context.cc
DEPS phi_enforce)
cc_library(
op_utils
SRCS op_utils.cc
DEPS arg_map_context enforce)
cc_library(
get_kerneltype_forvar_utils
SRCS get_kerneltype_forvar_utils.cc
DEPS enforce)
set(convert_utils_deps data_type place op_utils phi_backends)
if(WITH_MKLDNN)
set(convert_utils_deps ${convert_utils_deps} mkldnn)
endif()
cc_library(
convert_utils
SRCS convert_utils.cc
DEPS ${convert_utils_deps})
...@@ -26,4 +26,16 @@ OpUtilsMap& OpUtilsMap::Instance() { ...@@ -26,4 +26,16 @@ OpUtilsMap& OpUtilsMap::Instance() {
return g_op_utils_map; return g_op_utils_map;
} }
BaseKernelNameRegistrar::BaseKernelNameRegistrar(const char* op_type,
const char* base_kernel_name) {
OpUtilsMap::Instance().InsertBaseKernelName(op_type, base_kernel_name);
OpUtilsMap::Instance().InsertFluidOplName(op_type, base_kernel_name);
}
ArgumentMappingFnRegistrar::ArgumentMappingFnRegistrar(
const char* op_type, ArgumentMappingFn arg_mapping_fn) {
OpUtilsMap::Instance().InsertArgumentMappingFn(op_type,
std::move(arg_mapping_fn));
}
} // namespace phi } // namespace phi
...@@ -210,18 +210,12 @@ class OpUtilsMap { ...@@ -210,18 +210,12 @@ class OpUtilsMap {
}; };
struct BaseKernelNameRegistrar { struct BaseKernelNameRegistrar {
BaseKernelNameRegistrar(const char* op_type, const char* base_kernel_name) { BaseKernelNameRegistrar(const char* op_type, const char* base_kernel_name);
OpUtilsMap::Instance().InsertBaseKernelName(op_type, base_kernel_name);
OpUtilsMap::Instance().InsertFluidOplName(op_type, base_kernel_name);
}
}; };
struct ArgumentMappingFnRegistrar { struct ArgumentMappingFnRegistrar {
ArgumentMappingFnRegistrar(const char* op_type, ArgumentMappingFnRegistrar(const char* op_type,
ArgumentMappingFn arg_mapping_fn) { ArgumentMappingFn arg_mapping_fn);
OpUtilsMap::Instance().InsertArgumentMappingFn(op_type,
std::move(arg_mapping_fn));
}
}; };
#define PD_REGISTER_BASE_KERNEL_NAME(op_type, base_kernel_name) \ #define PD_REGISTER_BASE_KERNEL_NAME(op_type, base_kernel_name) \
......
...@@ -42,6 +42,11 @@ limitations under the License. */ ...@@ -42,6 +42,11 @@ limitations under the License. */
namespace phi { namespace phi {
template <>
const TypeInfo<phi::TensorBase>
TypeInfoTraits<phi::TensorBase, DenseTensor>::kType =
RegisterStaticType<phi::TensorBase>(DenseTensor::name());
DenseTensor::DenseTensor(Allocator* a, const DenseTensorMeta& meta) DenseTensor::DenseTensor(Allocator* a, const DenseTensorMeta& meta)
: meta_(meta), holder_(a->Allocate(SizeOf(dtype()) * numel())) {} : meta_(meta), holder_(a->Allocate(SizeOf(dtype()) * numel())) {}
...@@ -115,8 +120,9 @@ void* DenseTensor::AllocateFrom(Allocator* allocator, ...@@ -115,8 +120,9 @@ void* DenseTensor::AllocateFrom(Allocator* allocator,
if (fake_alloc) { if (fake_alloc) {
bytes = 0; bytes = 0;
} else { } else {
PADDLE_ENFORCE( PADDLE_ENFORCE_EQ(
valid(), valid(),
true,
phi::errors::PreconditionNotMet("The meta data must be valid when " phi::errors::PreconditionNotMet("The meta data must be valid when "
"call the mutable data function.")); "call the mutable data function."));
if (requested_size) { if (requested_size) {
...@@ -169,8 +175,9 @@ const T* DenseTensor::data() const { ...@@ -169,8 +175,9 @@ const T* DenseTensor::data() const {
template <typename T> template <typename T>
T* DenseTensor::data() { T* DenseTensor::data() {
T* ret = static_cast<T*>(data()); T* ret = static_cast<T*>(data());
PADDLE_ENFORCE( PADDLE_ENFORCE_EQ(
(dtype() == phi::CppTypeToDataType<T>::Type()), dtype(),
phi::CppTypeToDataType<T>::Type(),
phi::errors::InvalidArgument( phi::errors::InvalidArgument(
"The type of data we are trying to retrieve (%s) does not match the " "The type of data we are trying to retrieve (%s) does not match the "
"type of data (%s) currently contained in the container.", "type of data (%s) currently contained in the container.",
...@@ -200,16 +207,18 @@ const void* DenseTensor::data() const { ...@@ -200,16 +207,18 @@ const void* DenseTensor::data() const {
} }
void DenseTensor::set_meta(DenseTensorMeta&& meta) { void DenseTensor::set_meta(DenseTensorMeta&& meta) {
PADDLE_ENFORCE(!meta_.valid(), PADDLE_ENFORCE_EQ(meta_.valid(),
phi::errors::InvalidArgument( false,
"Only when the original attribute of Tensor is " phi::errors::InvalidArgument(
"incomplete, can it be reset.")); "Only when the original attribute of Tensor is "
"incomplete, can it be reset."));
meta_ = std::move(meta); meta_ = std::move(meta);
} }
void DenseTensor::set_meta(const DenseTensorMeta& meta) { void DenseTensor::set_meta(const DenseTensorMeta& meta) {
PADDLE_ENFORCE( PADDLE_ENFORCE_EQ(
meta.valid(), meta.valid(),
true,
phi::errors::InvalidArgument( phi::errors::InvalidArgument(
"Input meta is invalid, please check the meta attribute.")); "Input meta is invalid, please check the meta attribute."));
meta_.dims = meta.dims; meta_.dims = meta.dims;
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册