Unverified commit 0b60f28c, authored by engineer1109, committed by GitHub

remove WITH_ASCEND_CL PADDLE_WITH_ASCEND_CL WITH_ASCEND_CXX11 (#52448)

Parent: 04f8c24e
@@ -58,10 +58,6 @@ option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode" OFF)
option(WITH_ASCEND "Compile PaddlePaddle with ASCEND" OFF)
option(WITH_ROCM "Compile PaddlePaddle with ROCM platform" OFF)
option(WITH_IPU "Compile PaddlePaddle with Graphcore IPU" OFF)
# NOTE(zhiqiu): WITH_ASCEND_CL can be compile on x86_64, so we can set WITH_ASCEND=OFF and WITH_ASCEND_CL=ON
# to develop some acl related functionality on x86
option(WITH_ASCEND_CL "Compile PaddlePaddle with ASCEND CL" ${WITH_ASCEND})
option(WITH_ASCEND_CXX11 "Compile PaddlePaddle with ASCEND and CXX11 ABI" OFF)
option(WITH_ONNXRUNTIME "Compile PaddlePaddle with ONNXRUNTIME" OFF)
option(WITH_CUSPARSELT "Compile PaddlePaddle with CUSPARSELT" OFF)
option(WITH_SETUP_INSTALL "Compile PaddlePaddle with setup.py" OFF)
@@ -113,14 +109,6 @@ if(APPLE AND WITH_ARM)
  set(CMAKE_CXX_FLAGS "${CMAKE_C_FLAGS} -target arm64-apple-darwin")
endif()
if(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11)
if(WITH_ARM_BRPC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=1")
else()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
endif()
endif()
if(WIN32)
  option(MSVC_STATIC_CRT "use static C Runtime library by default" ON)
@@ -525,15 +513,6 @@ if(WITH_DISTRIBUTE)
      ON
      CACHE STRING "Enable GLOO when compiling WITH_DISTRIBUTE=ON." FORCE)
endif()
if(WITH_ASCEND_CL AND NOT WITH_ARM_BRPC)
# disable WITH_PSCORE for NPU before include third_party
message(
WARNING
"Disable WITH_PSCORE when compiling with NPU. Force WITH_PSCORE=OFF.")
set(WITH_PSCORE
OFF
CACHE BOOL "Disable WITH_PSCORE when compiling with NPU" FORCE)
endif()
if(WITH_ROCM AND HIP_VERSION LESS_EQUAL 40020496)
  # TODO(qili93): third-party rocksdb throw Illegal instruction with HIP version 40020496
  message(
@@ -567,13 +546,6 @@ if(WITH_RPC)
      OFF
      CACHE BOOL "Disable WITH_RPC when not compiled with distribute" FORCE)
endif()
if(WITH_ASCEND_CL AND WITH_RPC)
message(
WARNING "Disable WITH_RPC when compiling with NPU. Force WITH_RPC=OFF.")
set(WITH_RPC
OFF
CACHE BOOL "Disable WITH_RPC when compiling with NPU" FORCE)
endif()
if(WITH_ROCM AND WITH_RPC)
  message(
    WARNING "Disable WITH_RPC when compiling with ROCM. Force WITH_RPC=OFF.")
......
@@ -97,10 +97,6 @@ if(WITH_ASCEND)
  add_definitions(-DPADDLE_WITH_ASCEND)
endif()
if(WITH_ASCEND_CL)
add_definitions(-DPADDLE_WITH_ASCEND_CL)
endif()
if(WITH_ASCEND_INT64)
  add_definitions(-DPADDLE_WITH_ASCEND_INT64)
endif()
......
@@ -25,111 +25,3 @@ if(EXISTS
  # It means CANN 20.2 +
  add_definitions(-DPADDLE_WITH_ASCEND_STRING)
endif()
if(WITH_ASCEND OR WITH_ASCEND_CL)
set(ASCEND_DRIVER_DIR ${ASCEND_DIR}/driver/lib64)
set(ASCEND_DRIVER_COMMON_DIR ${ASCEND_DIR}/driver/lib64/common)
set(ASCEND_DRIVER_SHARE_DIR ${ASCEND_DIR}/driver/lib64/share)
set(ASCEND_RUNTIME_DIR ${ASCEND_DIR}/fwkacllib/lib64)
set(ASCEND_ATC_DIR ${ASCEND_DIR}/atc/lib64)
set(ASCEND_ACL_DIR ${ASCEND_DIR}/acllib/lib64)
set(STATIC_ACL_LIB ${ASCEND_ACL_DIR})
set(ASCEND_MS_RUNTIME_PATH ${ASCEND_RUNTIME_DIR} ${ASCEND_ACL_DIR}
${ASCEND_ATC_DIR})
set(ASCEND_MS_DRIVER_PATH ${ASCEND_DRIVER_DIR} ${ASCEND_DRIVER_COMMON_DIR})
set(ATLAS_RUNTIME_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64)
set(ATLAS_RUNTIME_INC_DIR
${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include)
set(ATLAS_ACL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/lib64)
set(ATLAS_ATC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/atc/lib64)
set(ATLAS_MS_RUNTIME_PATH ${ATLAS_RUNTIME_DIR} ${ATLAS_ACL_DIR}
${ATLAS_ATC_DIR})
set(atlas_graph_lib ${ATLAS_RUNTIME_DIR}/libgraph.so)
set(atlas_ge_runner_lib ${ATLAS_RUNTIME_DIR}/libge_runner.so)
set(atlas_acl_lib ${ATLAS_RUNTIME_DIR}/libascendcl.so)
include_directories(${ATLAS_RUNTIME_INC_DIR})
add_library(ascend_ge SHARED IMPORTED GLOBAL)
set_property(TARGET ascend_ge PROPERTY IMPORTED_LOCATION
${atlas_ge_runner_lib})
add_library(ascend_graph SHARED IMPORTED GLOBAL)
set_property(TARGET ascend_graph PROPERTY IMPORTED_LOCATION
${atlas_graph_lib})
add_library(atlas_acl SHARED IMPORTED GLOBAL)
set_property(TARGET atlas_acl PROPERTY IMPORTED_LOCATION ${atlas_acl_lib})
add_custom_target(extern_ascend DEPENDS ascend_ge ascend_graph atlas_acl)
endif()
if(WITH_ASCEND_CL)
set(ASCEND_CL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64)
set(ascend_hccl_lib ${ASCEND_CL_DIR}/libhccl.so)
set(ascendcl_lib ${ASCEND_CL_DIR}/libascendcl.so)
set(acl_op_compiler_lib ${ASCEND_CL_DIR}/libacl_op_compiler.so)
set(FWKACLLIB_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include)
set(ACLLIB_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/include)
message(STATUS "FWKACLLIB_INC_DIR ${FWKACLLIB_INC_DIR}")
message(STATUS "ASCEND_CL_DIR ${ASCEND_CL_DIR}")
include_directories(${FWKACLLIB_INC_DIR})
include_directories(${ACLLIB_INC_DIR})
add_library(ascendcl SHARED IMPORTED GLOBAL)
set_property(TARGET ascendcl PROPERTY IMPORTED_LOCATION ${ascendcl_lib})
add_library(ascend_hccl SHARED IMPORTED GLOBAL)
set_property(TARGET ascend_hccl PROPERTY IMPORTED_LOCATION ${ascend_hccl_lib})
add_library(acl_op_compiler SHARED IMPORTED GLOBAL)
set_property(TARGET acl_op_compiler PROPERTY IMPORTED_LOCATION
${acl_op_compiler_lib})
add_custom_target(extern_ascend_cl DEPENDS ascendcl acl_op_compiler)
endif()
if(WITH_ASCEND_CL)
macro(find_ascend_toolkit_version ascend_toolkit_version_info)
file(READ ${ascend_toolkit_version_info} ASCEND_TOOLKIT_VERSION_CONTENTS)
string(REGEX MATCH "version=([0-9]+\.[0-9]+\.(RC)?[0-9][.a-z0-9]*)"
ASCEND_TOOLKIT_VERSION "${ASCEND_TOOLKIT_VERSION_CONTENTS}")
string(REGEX REPLACE "version=([0-9]+\.[0-9]+\.(RC)?[0-9][.a-z0-9]*)" "\\1"
ASCEND_TOOLKIT_VERSION "${ASCEND_TOOLKIT_VERSION}")
string(REGEX REPLACE "[A-Z]|[a-z|\.]" "" CANN_VERSION
${ASCEND_TOOLKIT_VERSION})
string(SUBSTRING "${CANN_VERSION}000" 0 6 CANN_VERSION)
add_definitions("-DCANN_VERSION_CODE=${CANN_VERSION}")
if(NOT ASCEND_TOOLKIT_VERSION)
set(ASCEND_TOOLKIT_VERSION "???")
else()
message(
STATUS "Current Ascend Toolkit version is ${ASCEND_TOOLKIT_VERSION}")
endif()
endmacro()
macro(find_ascend_driver_version ascend_driver_version_info)
file(READ ${ascend_driver_version_info} ASCEND_DRIVER_VERSION_CONTENTS)
string(REGEX MATCH "Version=([0-9]+\.[0-9]+\.[0-9]+)" ASCEND_DRIVER_VERSION
"${ASCEND_DRIVER_VERSION_CONTENTS}")
string(REGEX REPLACE "Version=([0-9]+\.[0-9]+\.[0-9]+)" "\\1"
ASCEND_DRIVER_VERSION "${ASCEND_DRIVER_VERSION}")
if(NOT ASCEND_DRIVER_VERSION)
set(ASCEND_DRIVER_VERSION "???")
else()
message(
STATUS "Current Ascend Driver version is ${ASCEND_DRIVER_VERSION}")
endif()
endmacro()
if(WITH_ARM)
set(ASCEND_TOOLKIT_DIR ${ASCEND_DIR}/ascend-toolkit/latest/arm64-linux)
else()
set(ASCEND_TOOLKIT_DIR ${ASCEND_DIR}/ascend-toolkit/latest/x86_64-linux)
endif()
find_ascend_toolkit_version(${ASCEND_TOOLKIT_DIR}/ascend_toolkit_install.info)
find_ascend_driver_version(${ASCEND_DIR}/driver/version.info)
endif()
@@ -61,44 +61,24 @@ if(CMAKE_COMPILER_IS_GNUCC)
endif()
include_directories(${GLOO_INCLUDE_DIR})
-if(WITH_ASCEND OR WITH_ASCEND_CL)
-  ExternalProject_Add(
-    ${GLOO_PROJECT}
-    ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE}
-    GIT_REPOSITORY ${GLOO_REPOSITORY}
-    GIT_TAG ${GLOO_TAG}
-    PREFIX "${GLOO_PREFIX_DIR}"
-    UPDATE_COMMAND ""
-    CONFIGURE_COMMAND ""
-    BUILD_COMMAND
-      mkdir -p ${GLOO_SOURCE_DIR}/build && cd ${GLOO_SOURCE_DIR}/build && cmake
-      .. -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} && ${CMAKE_COMMAND} --build . &&
-      mkdir -p ${GLOO_LIBRARY_DIR} ${GLOO_INCLUDE_DIR}/gloo
-    INSTALL_COMMAND ${CMAKE_COMMAND} -E copy
-      ${GLOO_SOURCE_DIR}/build/gloo/libgloo.a ${GLOO_LIBRARY_DIR}
-    COMMAND ${CMAKE_COMMAND} -E copy_directory "${GLOO_SOURCE_DIR}/gloo/"
-      "${GLOO_INCLUDE_DIR}/gloo"
-    BUILD_BYPRODUCTS ${GLOO_LIBRARIES})
-else()
-  ExternalProject_Add(
-    ${GLOO_PROJECT}
-    ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE}
-    GIT_REPOSITORY ${GLOO_REPOSITORY}
-    GIT_TAG ${GLOO_TAG}
-    PREFIX "${GLOO_PREFIX_DIR}"
-    UPDATE_COMMAND ""
-    PATCH_COMMAND ${GLOO_PATCH_COMMAND}
-    CONFIGURE_COMMAND ""
-    BUILD_COMMAND
-      mkdir -p ${GLOO_SOURCE_DIR}/build && cd ${GLOO_SOURCE_DIR}/build && cmake
-      .. -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} && ${CMAKE_COMMAND} --build . &&
-      mkdir -p ${GLOO_LIBRARY_DIR} ${GLOO_INCLUDE_DIR}/gloo
-    INSTALL_COMMAND ${CMAKE_COMMAND} -E copy
-      ${GLOO_SOURCE_DIR}/build/gloo/libgloo.a ${GLOO_LIBRARY_DIR}
-    COMMAND ${CMAKE_COMMAND} -E copy_directory "${GLOO_SOURCE_DIR}/gloo/"
-      "${GLOO_INCLUDE_DIR}/gloo"
-    BUILD_BYPRODUCTS ${GLOO_LIBRARIES})
-endif()
+ExternalProject_Add(
+  ${GLOO_PROJECT}
+  ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE}
+  GIT_REPOSITORY ${GLOO_REPOSITORY}
+  GIT_TAG ${GLOO_TAG}
+  PREFIX "${GLOO_PREFIX_DIR}"
+  UPDATE_COMMAND ""
+  PATCH_COMMAND ${GLOO_PATCH_COMMAND}
+  CONFIGURE_COMMAND ""
+  BUILD_COMMAND
+    mkdir -p ${GLOO_SOURCE_DIR}/build && cd ${GLOO_SOURCE_DIR}/build && cmake ..
+    -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} && ${CMAKE_COMMAND} --build . && mkdir
+    -p ${GLOO_LIBRARY_DIR} ${GLOO_INCLUDE_DIR}/gloo
+  INSTALL_COMMAND ${CMAKE_COMMAND} -E copy
+    ${GLOO_SOURCE_DIR}/build/gloo/libgloo.a ${GLOO_LIBRARY_DIR}
+  COMMAND ${CMAKE_COMMAND} -E copy_directory "${GLOO_SOURCE_DIR}/gloo/"
+    "${GLOO_INCLUDE_DIR}/gloo"
+  BUILD_BYPRODUCTS ${GLOO_LIBRARIES})
add_library(gloo STATIC IMPORTED GLOBAL)
set_property(TARGET gloo PROPERTY IMPORTED_LOCATION ${GLOO_LIBRARIES})
......
@@ -237,9 +237,6 @@ function(build_protobuf TARGET_NAME BUILD_FOR_HOST)
  if(WITH_ASCEND AND NOT WITH_ASCEND_CXX11)
    set(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git)
    set(PROTOBUF_TAG v21.12)
elseif(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11)
set(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git)
set(PROTOBUF_TAG v21.12)
  elseif(WITH_IPU)
    set(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git)
    set(PROTOBUF_TAG v21.12)
@@ -325,9 +322,7 @@ function(build_protobuf TARGET_NAME BUILD_FOR_HOST)
  endif()
endfunction()
-if(WITH_ASCEND OR WITH_ASCEND_CL)
-  set(PROTOBUF_VERSION 21.12)
-elseif(WITH_IPU)
+if(WITH_IPU)
  set(PROTOBUF_VERSION 21.12)
elseif(WITH_ARM_BRPC)
  set(PROTOBUF_VERSION 21.12-baidu-ee-common)
......
@@ -15,11 +15,7 @@
include(ExternalProject)
set(THREADPOOL_PREFIX_DIR ${THIRD_PARTY_PATH}/threadpool)
-if(WITH_ASCEND OR WITH_ASCEND_CL)
-  set(THREADPOOL_REPOSITORY https://gitee.com/tianjianhe/ThreadPool.git)
-else()
-  set(THREADPOOL_REPOSITORY ${GIT_URL}/progschj/ThreadPool.git)
-endif()
+set(THREADPOOL_REPOSITORY ${GIT_URL}/progschj/ThreadPool.git)
set(THREADPOOL_TAG 9a42ec1329f259a5f4881a291db1dcb8f2ad9040)
set(THREADPOOL_INCLUDE_DIR ${THIRD_PARTY_PATH}/threadpool/src/extern_threadpool)
......
@@ -64,96 +64,59 @@ else()
  set(USE_OMP ON)
endif()
-if(WITH_ASCEND OR WITH_ASCEND_CL)
-  ExternalProject_Add(
-    extern_warpctc
-    ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE}
-    GIT_REPOSITORY ${WARPCTC_REPOSITORY}
-    GIT_TAG ${WARPCTC_TAG}
-    PREFIX ${WARPCTC_PREFIX_DIR}
-    #UPDATE_COMMAND ""
-    PATCH_COMMAND ""
-    BUILD_ALWAYS 1
-    CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-               -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-               -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-               -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-               -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-               -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-               -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-               -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-               -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR}
-               -DWITH_GPU=${WITH_GPU}
-               -DWITH_ROCM=${WITH_ROCM}
-               -DWITH_OMP=${USE_OMP}
-               -DWITH_TORCH=OFF
-               -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON
-               -DBUILD_SHARED=ON
-               -DBUILD_TESTS=OFF
-               -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-               -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-               ${EXTERNAL_OPTIONAL_ARGS}
-    CMAKE_CACHE_ARGS
-      -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
-      -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-      -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR}
-    BUILD_BYPRODUCTS ${WARPCTC_LIBRARIES})
-else()
-  if(WIN32)
-    set(WARPCTC_C_FLAGS $<FILTER:${CMAKE_C_FLAGS},EXCLUDE,/Zc:inline>)
-    set(WARPCTC_C_FLAGS_DEBUG
-        $<FILTER:${CMAKE_C_FLAGS_DEBUG},EXCLUDE,/Zc:inline>)
-    set(WARPCTC_C_FLAGS_RELEASE
-        $<FILTER:${CMAKE_C_FLAGS_RELEASE},EXCLUDE,/Zc:inline>)
-    set(WARPCTC_CXX_FLAGS $<FILTER:${CMAKE_CXX_FLAGS},EXCLUDE,/Zc:inline>)
-    set(WARPCTC_CXX_FLAGS_RELEASE
-        $<FILTER:${CMAKE_CXX_FLAGS_RELEASE},EXCLUDE,/Zc:inline>)
-    set(WARPCTC_CXX_FLAGS_DEBUG
-        $<FILTER:${CMAKE_CXX_FLAGS_DEBUG},EXCLUDE,/Zc:inline>)
-  else()
-    set(WARPCTC_C_FLAGS ${CMAKE_C_FLAGS})
-    set(WARPCTC_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG})
-    set(WARPCTC_C_FLAGS_RELEASE ${CMAKE_C_FLAGS_RELEASE})
-    set(WARPCTC_CXX_FLAGS ${CMAKE_CXX_FLAGS})
-    set(WARPCTC_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE})
-    set(WARPCTC_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG})
-  endif()
-  ExternalProject_Add(
-    extern_warpctc
-    ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE}
-    GIT_REPOSITORY ${WARPCTC_REPOSITORY}
-    GIT_TAG ${WARPCTC_TAG}
-    PREFIX ${WARPCTC_PREFIX_DIR}
-    UPDATE_COMMAND ""
-    PATCH_COMMAND ${WARPCTC_PATCH_COMMAND}
-    #BUILD_ALWAYS 1
-    CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-               -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-               -DCMAKE_C_FLAGS=${WARPCTC_C_FLAGS}
-               -DCMAKE_C_FLAGS_DEBUG=${WARPCTC_C_FLAGS_DEBUG}
-               -DCMAKE_C_FLAGS_RELEASE=${WARPCTC_C_FLAGS_RELEASE}
-               -DCMAKE_CXX_FLAGS=${WARPCTC_CXX_FLAGS}
-               -DCMAKE_CXX_FLAGS_RELEASE=${WARPCTC_CXX_FLAGS_RELEASE}
-               -DCMAKE_CXX_FLAGS_DEBUG=${WARPCTC_CXX_FLAGS_DEBUG}
-               -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR}
-               -DWITH_GPU=${WITH_GPU}
-               -DWITH_ROCM=${WITH_ROCM}
-               -DWITH_OMP=${USE_OMP}
-               -DWITH_TORCH=OFF
-               -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON
-               -DBUILD_SHARED=ON
-               -DBUILD_TESTS=OFF
-               -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-               -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-               -DCUDA_TOOLKIT_ROOT_DIR=${CUDA_TOOLKIT_ROOT_DIR}
-               ${EXTERNAL_OPTIONAL_ARGS}
-               ${WARPCTC_CCBIN_OPTION}
-    CMAKE_CACHE_ARGS
-      -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
-      -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-      -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR}
-    BUILD_BYPRODUCTS ${WARPCTC_LIBRARIES})
-endif()
+if(WIN32)
+  set(WARPCTC_C_FLAGS $<FILTER:${CMAKE_C_FLAGS},EXCLUDE,/Zc:inline>)
+  set(WARPCTC_C_FLAGS_DEBUG $<FILTER:${CMAKE_C_FLAGS_DEBUG},EXCLUDE,/Zc:inline>)
+  set(WARPCTC_C_FLAGS_RELEASE
+      $<FILTER:${CMAKE_C_FLAGS_RELEASE},EXCLUDE,/Zc:inline>)
+  set(WARPCTC_CXX_FLAGS $<FILTER:${CMAKE_CXX_FLAGS},EXCLUDE,/Zc:inline>)
+  set(WARPCTC_CXX_FLAGS_RELEASE
+      $<FILTER:${CMAKE_CXX_FLAGS_RELEASE},EXCLUDE,/Zc:inline>)
+  set(WARPCTC_CXX_FLAGS_DEBUG
+      $<FILTER:${CMAKE_CXX_FLAGS_DEBUG},EXCLUDE,/Zc:inline>)
+else()
+  set(WARPCTC_C_FLAGS ${CMAKE_C_FLAGS})
+  set(WARPCTC_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG})
+  set(WARPCTC_C_FLAGS_RELEASE ${CMAKE_C_FLAGS_RELEASE})
+  set(WARPCTC_CXX_FLAGS ${CMAKE_CXX_FLAGS})
+  set(WARPCTC_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE})
+  set(WARPCTC_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG})
+endif()
+ExternalProject_Add(
+  extern_warpctc
+  ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE}
+  GIT_REPOSITORY ${WARPCTC_REPOSITORY}
+  GIT_TAG ${WARPCTC_TAG}
+  PREFIX ${WARPCTC_PREFIX_DIR}
+  UPDATE_COMMAND ""
+  PATCH_COMMAND ${WARPCTC_PATCH_COMMAND}
+  #BUILD_ALWAYS 1
+  CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+             -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+             -DCMAKE_C_FLAGS=${WARPCTC_C_FLAGS}
+             -DCMAKE_C_FLAGS_DEBUG=${WARPCTC_C_FLAGS_DEBUG}
+             -DCMAKE_C_FLAGS_RELEASE=${WARPCTC_C_FLAGS_RELEASE}
+             -DCMAKE_CXX_FLAGS=${WARPCTC_CXX_FLAGS}
+             -DCMAKE_CXX_FLAGS_RELEASE=${WARPCTC_CXX_FLAGS_RELEASE}
+             -DCMAKE_CXX_FLAGS_DEBUG=${WARPCTC_CXX_FLAGS_DEBUG}
+             -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR}
+             -DWITH_GPU=${WITH_GPU}
+             -DWITH_ROCM=${WITH_ROCM}
+             -DWITH_OMP=${USE_OMP}
+             -DWITH_TORCH=OFF
+             -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON
+             -DBUILD_SHARED=ON
+             -DBUILD_TESTS=OFF
+             -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+             -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
+             -DCUDA_TOOLKIT_ROOT_DIR=${CUDA_TOOLKIT_ROOT_DIR}
+             ${EXTERNAL_OPTIONAL_ARGS}
+             ${WARPCTC_CCBIN_OPTION}
+  CMAKE_CACHE_ARGS
+    -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
+    -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+    -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR}
+  BUILD_BYPRODUCTS ${WARPCTC_LIBRARIES})
message(STATUS "warp-ctc library: ${WARPCTC_LIBRARIES}") message(STATUS "warp-ctc library: ${WARPCTC_LIBRARIES}")
get_filename_component(WARPCTC_LIBRARY_PATH ${WARPCTC_LIBRARIES} DIRECTORY) get_filename_component(WARPCTC_LIBRARY_PATH ${WARPCTC_LIBRARIES} DIRECTORY)
......
@@ -167,10 +167,6 @@ if(NOT WIN32)
  set(COMMON_FLAGS ${COMMON_FLAGS} -Wno-sign-compare -Wno-non-virtual-dtor)
endif()
if(WITH_ASCEND_CL AND WITH_ARM_BRPC)
set(COMMON_FLAGS ${COMMON_FLAGS} -faligned-new)
endif()
if(NOT APPLE)
  if((${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER 8.0) OR (WITH_ROCM))
    set(COMMON_FLAGS
......
@@ -508,14 +508,9 @@ function(version version_file)
    OUTPUT_VARIABLE PADDLE_GIT_COMMIT)
  file(
    WRITE ${version_file}
"GIT COMMIT ID: ${PADDLE_GIT_COMMIT}\n" "GIT COMMIT ID: ${PADDLE_GIT_COMMIT}\n" "WITH_MKL: ${WITH_MKL}\n"
"WITH_MKL: ${WITH_MKL}\n" "WITH_MKLDNN: ${WITH_MKLDNN}\n" "WITH_GPU: ${WITH_GPU}\n"
"WITH_MKLDNN: ${WITH_MKLDNN}\n" "WITH_ROCM: ${WITH_ROCM}\n" "WITH_IPU: ${WITH_IPU}\n")
"WITH_GPU: ${WITH_GPU}\n"
"WITH_ROCM: ${WITH_ROCM}\n"
"WITH_ASCEND_CL: ${WITH_ASCEND_CL}\n"
"WITH_ASCEND_CXX11: ${WITH_ASCEND_CXX11}\n"
"WITH_IPU: ${WITH_IPU}\n")
  if(WITH_GPU)
    file(APPEND ${version_file}
         "CUDA version: ${CUDA_VERSION}\n"
@@ -526,11 +521,6 @@ function(version version_file)
         "HIP version: v${HIP_MAJOR_VERSION}.${HIP_MINOR_VERSION}\n"
         "MIOpen version: v${MIOPEN_MAJOR_VERSION}.${MIOPEN_MINOR_VERSION}\n")
  endif()
if(WITH_ASCEND_CL)
file(APPEND ${version_file}
"Ascend Toolkit version: ${ASCEND_TOOLKIT_VERSION}\n"
"Ascend Driver version: ${ASCEND_DRIVER_VERSION}\n")
endif()
  if(WITH_IPU)
    file(APPEND ${version_file} "PopART version: ${POPART_VERSION}\n")
  endif()
......
@@ -74,9 +74,6 @@ function(op_library TARGET)
  set(MKLDNN_FILE)
  set(op_common_deps operator op_registry math_function layer
                     common_infer_shape_functions)
if(WITH_ASCEND_CL)
set(op_common_deps ${op_common_deps} npu_op_runner)
endif()
  if(WITH_MLU)
    set(op_common_deps ${op_common_deps} mlu_baseop)
  endif()
@@ -175,12 +172,6 @@ function(op_library TARGET)
      list(APPEND xpu_kp_cc_srcs ${TARGET}.kps)
    endif()
  endif()
if(WITH_ASCEND_CL)
string(REPLACE "_op" "_op_npu" NPU_FILE "${TARGET}")
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${NPU_FILE}.cc)
list(APPEND npu_cc_srcs ${NPU_FILE}.cc)
endif()
endif()
  if(WITH_MLU)
    string(REPLACE "_op" "_op_mlu" MLU_FILE "${TARGET}")
    if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MLU_FILE}.cc)
@@ -213,8 +204,6 @@ function(op_library TARGET)
      list(APPEND xpu_kp_cc_srcs ${src})
    elseif(WITH_XPU_KP AND ${src} MATCHES ".*\\.kps$")
      list(APPEND xpu_kp_cc_srcs ${src})
elseif(WITH_ASCEND_CL AND ${src} MATCHES ".*_op_npu.cc$")
list(APPEND npu_cc_srcs ${src})
    elseif(WITH_MLU AND ${src} MATCHES ".*_op_mlu.cc$")
      list(APPEND mlu_cc_srcs ${src})
    elseif(${src} MATCHES ".*\\.cc$")
@@ -331,13 +320,6 @@ function(op_library TARGET)
      SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} ${xpu_kp_cc_srcs}
      DEPS ${op_library_DEPS} ${op_common_deps})
  else()
# deal with CANN version control while registering NPU operators before build
if(WITH_ASCEND_CL)
if(CANN_VERSION LESS 504000)
list(REMOVE_ITEM npu_cc_srcs "multinomial_op_npu.cc")
list(REMOVE_ITEM npu_cc_srcs "take_along_axis_op_npu.cc")
endif()
endif()
    # Unity Build relies on global option `WITH_UNITY_BUILD` and local option `UNITY`.
    if(WITH_UNITY_BUILD AND op_library_UNITY)
      # Combine the cc source files.
@@ -541,18 +523,6 @@ function(op_library TARGET)
    endforeach()
  endif()
# pybind USE_OP_DEVICE_KERNEL for NPU
if(WITH_ASCEND_CL AND ${npu_cc_srcs_len} GREATER 0)
foreach(npu_src ${npu_cc_srcs})
set(op_name "")
find_register(${npu_src} "REGISTER_OP_NPU_KERNEL" op_name)
if(NOT ${op_name} EQUAL "")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, NPU);\n")
set(pybind_flag 1)
endif()
endforeach()
endif()
  # pybind USE_OP_DEVICE_KERNEL for MLU
  if(WITH_MLU AND ${mlu_cc_srcs_len} GREATER 0)
    foreach(mlu_src ${mlu_cc_srcs})
......
@@ -394,16 +394,6 @@ if(WITH_BOX_PS)
  list(APPEND third_party_deps extern_box_ps)
endif()
if(WITH_ASCEND OR WITH_ASCEND_CL)
include(external/ascend)
if(WITH_ASCEND OR WITH_ASCEND_CL)
list(APPEND third_party_deps extern_ascend)
endif()
if(WITH_ASCEND_CL)
list(APPEND third_party_deps extern_ascend_cl)
endif()
endif()
if(WITH_PSCORE)
  include(external/snappy)
  list(APPEND third_party_deps extern_snappy)
......
@@ -205,17 +205,10 @@ elseif(WITH_ROCM)
    SRCS fused_broadcast_op_handle.cc
    DEPS broadcast_op_handle)
else()
-  if(WITH_ASCEND_CL)
-    cc_library(
-      nan_inf_utils
-      SRCS nan_inf_utils_detail.cc
-      DEPS npu_op_runner framework_proto scope place)
-  else()
-    cc_library(
-      nan_inf_utils
-      SRCS nan_inf_utils_detail.cc
-      DEPS framework_proto scope place)
-  endif()
+  cc_library(
+    nan_inf_utils
+    SRCS nan_inf_utils_detail.cc
+    DEPS framework_proto scope place)
  cc_library(
    all_reduce_op_handle
    SRCS all_reduce_op_handle.cc
......
@@ -54,12 +54,6 @@ void CheckOpHasNanOrInfInDygraph(const std::string& op_type,
  }
}
#ifdef PADDLE_WITH_ASCEND_CL
void NPUAllocAndClearFloatStatus(const framework::OperatorBase& op,
const framework::Scope& scope,
const platform::Place& place);
#endif
}  // namespace details
}  // namespace framework
}  // namespace paddle
@@ -19,8 +19,6 @@
#include "paddle/fluid/framework/scope.h"
#include "paddle/phi/common/amp_type_traits.h"
#ifdef PADDLE_WITH_ASCEND_CL
#endif
#include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/convert_utils.h"
#include "paddle/phi/kernels/funcs/eigen/extensions.h" #include "paddle/phi/kernels/funcs/eigen/extensions.h"
...@@ -243,40 +241,6 @@ void CheckVarHasNanOrInf(const std::string& op_type, ...@@ -243,40 +241,6 @@ void CheckVarHasNanOrInf(const std::string& op_type,
"phi::DenseTensor[%s] use xpu place. PaddlePaddle must compile " "phi::DenseTensor[%s] use xpu place. PaddlePaddle must compile "
"with XPU.", "with XPU.",
var_name)); var_name));
#endif
return;
} else if (platform::is_npu_place(tensor->place())) {
#ifdef PADDLE_WITH_ASCEND_CL
if (framework::TransToProtoVarType(tensor->dtype()) !=
proto::VarType::FP32) {
return;
}
phi::DenseTensor cpu_tensor;
cpu_tensor.Resize(tensor->dims());
float* cpu_data = static_cast<float*>(
cpu_tensor.mutable_data(platform::CPUPlace(), tensor->dtype()));
framework::TensorCopySync(*tensor, platform::CPUPlace(), &cpu_tensor);
bool flag = false;
for (int i = 0; i < cpu_tensor.numel(); i++) {
if (isnan(cpu_data[i]) || isinf(cpu_data[i])) {
flag = true;
break;
}
}
PADDLE_ENFORCE_NE(
flag,
true,
platform::errors::Fatal(
"Operator %s output phi::DenseTensor %s contains Inf.",
op_type,
var_name));
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"phi::DenseTensor[%s] use npu place. PaddlePaddle must compile "
"with NPU.",
var_name));
#endif
    return;
  }
@@ -309,139 +273,6 @@ bool IsSkipOp(const framework::OperatorBase& op) {
  return false;
}
#ifdef PADDLE_WITH_ASCEND_CL
using NpuOpRunner = paddle::operators::NpuOpRunner;
constexpr int FLOAT_STATUS_SIZE = 8;
static phi::DenseTensor& npu_float_status() {
static phi::DenseTensor float_status;
return float_status;
}
void NPUAllocAndClearFloatStatus(const framework::OperatorBase& op,
const framework::Scope& scope,
const platform::Place& place) {
if (!platform::is_npu_place(place)) return;
std::call_once(white_list_init_flag, InitWhiteListFormEnv);
if (IsSkipOp(op)) return;
auto* dev_ctx = reinterpret_cast<platform::NPUDeviceContext*>(
platform::DeviceContextPool::Instance().Get(place));
auto stream = dev_ctx->stream();
auto& flag = npu_float_status();
flag.mutable_data<float>({FLOAT_STATUS_SIZE}, place);
NpuOpRunner("NPUAllocFloatStatus", {}, {flag}).Run(stream);
phi::DenseTensor tmp;
tmp.mutable_data<float>({FLOAT_STATUS_SIZE}, place);
NpuOpRunner("NPUClearFloatStatus", {tmp}, {flag}).Run(stream);
}
void PrintNpuVarInfo(const std::string& op_type,
const std::string& var_name,
const framework::Variable* var,
const platform::Place& place) {
const phi::DenseTensor* tensor{nullptr};
if (var->IsType<phi::DenseTensor>()) {
tensor = &var->Get<phi::DenseTensor>();
} else if (var->IsType<phi::SelectedRows>()) {
tensor = &var->Get<phi::SelectedRows>().value();
} else {
VLOG(10) << var_name << " var_name need not to check";
return;
}
if ((framework::TransToProtoVarType(tensor->dtype()) !=
proto::VarType::FP32) &&
(framework::TransToProtoVarType(tensor->dtype()) !=
proto::VarType::FP16)) {
return;
}
if (tensor->memory_size() == 0) {
VLOG(10) << var_name << " var_name need not to check, size == 0";
return;
}
VLOG(10) << "begin check " << op_type << " var_name:" << var_name
<< ", place:" << tensor->place() << ", numel:" << tensor->numel();
phi::DenseTensor cpu_tensor;
cpu_tensor.Resize(tensor->dims());
cpu_tensor.mutable_data(platform::CPUPlace(), tensor->dtype());
framework::TensorCopySync(*tensor, platform::CPUPlace(), &cpu_tensor);
LOG(WARNING) << "print [" << var_name << "] tensor info:";
// use env strategy control in future, -1=print_all.
int print_num = 3;
if (framework::TransToProtoVarType(tensor->dtype()) == proto::VarType::FP32) {
const float* value = cpu_tensor.data<float>();
PrintNanInf(value, tensor->numel(), print_num, op_type, var_name, false);
} else if (framework::TransToProtoVarType(tensor->dtype()) ==
proto::VarType::FP16) {
const paddle::platform::float16* value =
cpu_tensor.data<paddle::platform::float16>();
PrintNanInf(value, tensor->numel(), print_num, op_type, var_name, false);
}
}
void PrintNPUOpValueInfo(const framework::OperatorBase& op,
const framework::Scope& scope,
const platform::Place& place) {
LOG(WARNING) << "There are `nan` or `inf` in operator (" << op.Type()
<< "), here we print some tensor value info of this op.";
for (auto& vname : op.InputVars()) {
auto* var = scope.FindVar(vname);
if (var == nullptr) continue;
PrintNpuVarInfo(op.Type(), vname, var, place);
}
for (auto& vname : op.OutputVars(true)) {
auto* var = scope.FindVar(vname);
if (var == nullptr) continue;
PrintNpuVarInfo(op.Type(), vname, var, place);
}
}
static void NPUCheckOpHasNanOrInf(const framework::OperatorBase& op,
const framework::Scope& scope,
const platform::Place& place) {
if (!platform::is_npu_place(place)) return;
auto* dev_ctx = reinterpret_cast<platform::NPUDeviceContext*>(
platform::DeviceContextPool::Instance().Get(place));
auto stream = dev_ctx->stream();
auto& flag = npu_float_status();
phi::DenseTensor tmp;
tmp.mutable_data<float>({FLOAT_STATUS_SIZE}, place);
// NPUGetFloatStatus updates data on input in-place.
// tmp is only placeholder.
NpuOpRunner("NPUGetFloatStatus", {flag}, {tmp}).Run(stream);
phi::DenseTensor cpu_tensor;
auto cpu_place = platform::CPUPlace();
float* cpu_data = static_cast<float*>(
cpu_tensor.mutable_data<float>({FLOAT_STATUS_SIZE}, cpu_place));
framework::TensorCopySync(flag, cpu_place, &cpu_tensor);
float sum = 0.0;
for (int i = 0; i < FLOAT_STATUS_SIZE; ++i) {
sum += cpu_data[i];
}
if (sum >= 1.0) PrintNPUOpValueInfo(op, scope, place);
PADDLE_ENFORCE_LT(sum,
1.0,
platform::errors::PreconditionNotMet(
"Operator %s contains Nan/Inf.", op.Type()));
}
#endif
void CheckOpHasNanOrInf(const framework::OperatorBase& op,
                        const framework::Scope& exec_scope,
                        const platform::Place& place) {
@@ -449,13 +280,6 @@ void CheckOpHasNanOrInf(const framework::OperatorBase& op,
  if (IsSkipOp(op)) return;
#ifdef PADDLE_WITH_ASCEND_CL
if (platform::is_npu_place(place)) {
NPUCheckOpHasNanOrInf(op, exec_scope, place);
return;
}
#endif
  if (op_var_nan_inf_white_list().count(op.Type()) == 0) {
    // NOTE. vname may destruct in the end of this func.
    for (auto& vname : op.OutputVars(true)) {
......
@@ -674,8 +674,7 @@ class PSGPUWorker : public HogwildWorker {
};
#endif
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
-    defined(PADDLE_WITH_ASCEND_CL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
class SectionWorker : public DeviceWorker {
 public:
  SectionWorker() {}
......
@@ -83,8 +83,7 @@ REGISTER_DEVICE_WORKER_CLASS(HeterCpuWorker);
REGISTER_DEVICE_WORKER_CLASS(PSGPUWorker);
#endif
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
-    defined(PADDLE_WITH_ASCEND_CL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
REGISTER_DEVICE_WORKER_CLASS(SectionWorker);
#endif
}  // namespace framework
......
@@ -516,23 +516,6 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx,
#else
    PADDLE_THROW(
        platform::errors::Unimplemented("No IPU gc found in CPU/IPU paddle"));
#endif
} else if (platform::is_npu_place(place_)) {
#ifdef PADDLE_WITH_ASCEND_CL
if (IsFastEagerDeletionModeEnabled()) {
VLOG(4) << "Use unsafe fast gc for NPU.";
gc.reset(new NPUUnsafeFastGarbageCollector(place_, max_memory_size));
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Please set FLAGS_fast_eager_deletion_mode=true to use "
"GarbageCollector on NPU."));
// TODO(zhiqiu): fix bugs and enable NPUDefaultStreamGarbageCollector.
VLOG(4) << "Use default stream gc for NPU.";
gc.reset(new NPUDefaultStreamGarbageCollector(place_, max_memory_size));
}
#else
PADDLE_THROW(
platform::errors::Unimplemented("No NPU gc found in CPU/NPU paddle"));
#endif
  } else if (platform::is_mlu_place(place_)) {
#ifdef PADDLE_WITH_MLU
......
@@ -124,10 +124,3 @@ cc_test(
  test_fleet_cc
  SRCS test_fleet.cc
  DEPS fleet_wrapper gloo_wrapper fs shell)
if(WITH_ASCEND OR WITH_ASCEND_CL)
cc_library(
ascend_wrapper
SRCS ascend_wrapper.cc
DEPS framework_proto lod_tensor ascend_ge ascend_graph)
endif()
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/framework/fleet/ascend_wrapper.h"
namespace paddle {
namespace framework {
std::shared_ptr<AscendInstance> AscendInstance::ascend_instance_ = nullptr;
} // end namespace framework
} // end namespace paddle
#endif
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_ASCEND_CL
#include <glog/logging.h>
#include <map>
#include <memory>
#include <string>
#include <vector>
#include "ge/ge_api.h"
#include "graph/attr_value.h"
#include "graph/tensor.h"
#include "graph/types.h"
#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/timer.h"
namespace paddle {
namespace framework {
typedef ge::Graph AscendGraphDesc;
#ifdef PADDLE_WITH_ASCEND_STRING
using AscendString = ge::AscendString;
#else
using AscendString = std::string;
#endif
class AscendInstance {
public:
virtual ~AscendInstance() {}
AscendInstance() {}
std::map<AscendString, AscendString> _GetDefaultInitOptions() {
std::map<AscendString, AscendString> init_options;
init_options["ge.exec.deviceId"] = "0";
init_options["ge.graphRunMode"] = "1";
return init_options;
}
std::map<AscendString, AscendString> _GetDefaultInitSessionOptions() {
std::map<AscendString, AscendString> init_options;
// init_options["a"] = "b";
// init_options["ge.trainFlag"] = "1";
return init_options;
}
ge::Status InitGEForUT() {
return ge::GEInitialize(_GetDefaultInitOptions());
}
void InitGlobalResouces() {
LOG(INFO) << "Begin ascend InitGlobalResouces";
session_.reset(new ge::Session(_GetDefaultInitSessionOptions()));
if (session_ == nullptr) {
PADDLE_THROW(platform::errors::Fatal("new session error: nullptr"));
}
LOG(INFO) << "End ascend InitGlobalResouces";
}
void DestroyGlobalResouces() {
LOG(INFO) << "Begin ascend DestroyGlobalResouces";
session_ = nullptr;
LOG(INFO) << "Begin ascend DestroyGlobalResouces";
}
static std::shared_ptr<AscendInstance> GetInstance() {
if (nullptr == ascend_instance_) {
ascend_instance_.reset(new paddle::framework::AscendInstance());
VLOG(1) << "Initialize AscendInstance Done";
}
return ascend_instance_;
}
void AddAscendSubgraph(int graph_idx, const AscendGraphDesc &graph) {
ge::Status status = session_->AddGraph(graph_idx, graph);
PADDLE_ENFORCE_EQ(status,
ge::SUCCESS,
paddle::platform::errors::PreconditionNotMet(
"Calling addGraph of graph engine failed, please "
"check Ascend Log."));
VLOG(1) << "AddAscendSubgraph " << graph_idx << " Done";
}
ge::DataType VarTypeToGeType(proto::VarType::Type type) {
if (type == proto::VarType::FP16) {
return ge::DataType::DT_FLOAT16;
} else if (type == proto::VarType::FP32) {
return ge::DataType::DT_FLOAT;
} else if (type == proto::VarType::FP64) {
return ge::DataType::DT_DOUBLE;
} else if (type == proto::VarType::INT32) {
return ge::DataType::DT_INT32;
} else if (type == proto::VarType::INT64) {
return ge::DataType::DT_INT64;
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Not support %s as tensor type.", DataTypeToString(type)));
}
}
int GeTypeSize(proto::VarType::Type type) {
if (type == proto::VarType::FP16) {
return 2;
} else if (type == proto::VarType::FP32) {
return 4;
} else if (type == proto::VarType::FP64) {
return 8;
} else if (type == proto::VarType::INT32) {
return 4;
} else if (type == proto::VarType::INT64) {
return 8;
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Not support %s as tensor type.", DataTypeToString(type)));
}
}
ge::Tensor ConvertToGeTensor(const phi::DenseTensor *tensor) {
auto numel = tensor->numel();
std::vector<int64_t> vec_dim;
auto dimen = arity(tensor->dims());
for (auto i = 0; i < dimen; ++i) {
vec_dim.push_back(tensor->dims()[i]);
}
// For Debug
// VLOG(1) << "input numel: " << numel << ", dimen is " << vec_dim.size() <<
// ", and shape is";
// for (const auto e : vec_dim) {
// VLOG(0) << e;
// }
ge::Shape shape(vec_dim);
ge::TensorDesc tensor_desc(
shape,
ge::Format::FORMAT_ND,
VarTypeToGeType(framework::TransToProtoVarType(tensor->dtype())));
tensor_desc.SetRealDimCnt(vec_dim.size());
const uint8_t *data = reinterpret_cast<const uint8_t *>(tensor->data());
std::vector<uint8_t> dst(
numel * GeTypeSize(framework::TransToProtoVarType(tensor->dtype())));
memcpy(dst.data(),
data,
GeTypeSize(framework::TransToProtoVarType(tensor->dtype())) * numel);
ge::Tensor ge_tensor(tensor_desc, dst);
return ge_tensor;
}
void RunAscendSubgraph(int graph_idx,
const std::vector<const phi::DenseTensor *> &inputs,
std::vector<phi::DenseTensor *> *outputs) {
VLOG(1) << "Ascend Graph[" << graph_idx << "] is about to run.";
// Convert paddle phi::DenseTensor to GE phi::DenseTensor
std::vector<ge::Tensor> ge_inputs;
for (const auto &e : inputs) {
ge_inputs.push_back(ConvertToGeTensor(e));
}
// Run Graph
std::vector<ge::Tensor> ge_outputs;
ge::Status status = session_->RunGraph(graph_idx, ge_inputs, ge_outputs);
PADDLE_ENFORCE_EQ(status,
ge::SUCCESS,
paddle::platform::errors::PreconditionNotMet(
"Calling RunGraph of graph engine failed, please "
"check Ascend Log."));
VLOG(1) << "Run Ascend Graph[" << graph_idx << "] Done";
// change tensor back, note all tensor's type computed in GE is uint8
for (size_t i = 0; i < ge_outputs.size(); ++i) {
const uint8_t *ret_data = ge_outputs[i].GetData();
size_t size = ge_outputs[i].GetSize();
VLOG(1) << "GE phi::DenseTensor size of the " << i << "th output var is "
<< size;
auto *dst = (*outputs)[i]->mutable_data<uint8_t>({(int64_t)size},
platform::CPUPlace());
memcpy(dst, ret_data, size);
// Following for debug:
// VLOG(0) << "output for " << i << " var: ";
// float *tmp = reinterpret_cast<float*>(dst);
// for (size_t j = 0; j < size / 4; ++j) {
// printf("%f ", tmp[j]);
// }
// printf("\n");
}
}
protected:
std::shared_ptr<ge::Session> session_;
private:
static std::shared_ptr<AscendInstance> ascend_instance_;
};
} // namespace framework
} // namespace paddle
#endif
@@ -125,32 +125,6 @@ void CUDAPinnedGarbageCollector::ClearCallback(
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
NPUDefaultStreamGarbageCollector::NPUDefaultStreamGarbageCollector(
const platform::NPUPlace &place, size_t max_memory_size)
: GarbageCollector(place, max_memory_size) {}
void NPUDefaultStreamGarbageCollector::Wait() const {
static_cast<platform::NPUDeviceContext *>(this->dev_ctx_)
->WaitStreamCallback();
}
void NPUDefaultStreamGarbageCollector::ClearCallback(
const std::function<void()> &callback) {
static_cast<platform::NPUDeviceContext *>(this->dev_ctx_)
->AddStreamCallback(callback);
}
NPUUnsafeFastGarbageCollector::NPUUnsafeFastGarbageCollector(
const platform::NPUPlace &place, size_t max_memory_size)
: GarbageCollector(place, max_memory_size) {}
void NPUUnsafeFastGarbageCollector::ClearCallback(
const std::function<void()> &callback) {
callback();
}
#endif
#ifdef PADDLE_WITH_MLU
MLUDefaultStreamGarbageCollector::MLUDefaultStreamGarbageCollector(
    const platform::MLUPlace &place, size_t max_memory_size)
......
@@ -139,28 +139,6 @@ class CUDAPinnedGarbageCollector : public GarbageCollector {
};
#endif
#ifdef PADDLE_WITH_ASCEND_CL
class NPUDefaultStreamGarbageCollector : public GarbageCollector {
public:
NPUDefaultStreamGarbageCollector(const platform::NPUPlace &place,
size_t max_memory_size);
void Wait() const override;
protected:
void ClearCallback(const std::function<void()> &callback) override;
};
class NPUUnsafeFastGarbageCollector : public GarbageCollector {
public:
NPUUnsafeFastGarbageCollector(const platform::NPUPlace &place,
size_t max_memory_size);
protected:
void ClearCallback(const std::function<void()> &callback) override;
};
#endif
#ifdef PADDLE_WITH_MLU
class MLUDefaultStreamGarbageCollector : public GarbageCollector {
 public:
......
@@ -60,11 +60,6 @@ inline std::tuple<int, int> GetThreadPoolConfig(const phi::Place& place,
  if (platform::is_xpu_place(place)) {
#if defined(PADDLE_WITH_XPU)
    device_count = phi::backends::xpu::GetXPUDeviceCount();
#endif
}
if (platform::is_npu_place(place)) {
#if defined(PADDLE_WITH_ASCEND_CL)
device_count = platform::GetNPUDeviceCount();
#endif
  }
  if (platform::is_ipu_place(place)) {
......
@@ -631,16 +631,6 @@ void BuildOpFuncList(const platform::Place& place,
    VLOG(4) << "Start run " << place << " " << op->DebugStringEx(local_scope);
#ifdef PADDLE_WITH_ASCEND_CL
// NOTE(wangxi): nan/inf cannot be detected on NPU by checking the variable
// values, but only through special `float_status` to checks whether
// the operation is overflow. More about `float_status`, see:
// https://gitee.com/ascend/modelzoo/issues/I3NF8V?from=project-issue
if (FLAGS_check_nan_inf) {
framework::details::NPUAllocAndClearFloatStatus(*op, *local_scope, place);
}
#endif
    try {
      if (dynamic_cast<framework::OperatorWithKernel*>(op) == nullptr) {
        VLOG(4) << "HandleOperatorBase";
......
@@ -87,16 +87,6 @@ inline void SetDeviceId(const platform::Place& place) {
#else
    auto dev_id = place.device;
    platform::SetXPUDeviceId(dev_id);
#endif
} else if (platform::is_npu_place(place)) {
#ifndef PADDLE_WITH_ASCEND_CL
PADDLE_THROW(platform::errors::Unavailable(
"Cannot run operator on place %s, please recompile paddle or "
"reinstall Paddle with NPU support.",
place));
#else
auto dev_id = place.device;
platform::SetNPUDeviceId(dev_id);
#endif
  } else if (platform::is_custom_place(place)) {
#ifndef PADDLE_WITH_CUSTOM_DEVICE
@@ -218,11 +208,6 @@ void InterpreterCore::RunImpl() {
    async_work_queue_ = GetWorkQueue();
    ExecuteInstructionList(vec_instruction_);
  }
#ifdef PADDLE_WITH_ASCEND_CL
if (platform::is_npu_place(place_)) {
platform::DeviceContextPool::Instance().Get(place_)->Wait();
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
  if (platform::is_custom_place(place_)) {
    platform::DeviceContextPool::Instance().Get(place_)->Wait();
@@ -893,18 +878,6 @@ void InterpreterCore::RunOperator(const Instruction& instr_node) {
                    : var_scope_.GetMutableScope();
  VLOG(4) << "Start run " << place << " " << op->DebugStringEx(local_scope);
#ifdef PADDLE_WITH_ASCEND_CL
if (platform::is_npu_place(place)) {
// NOTE(wangxi): nan/inf cannot be detected on NPU by checking the
// variable values, but only through special `float_status` to checks
// whether the operation is overflow. More about `float_status`, see:
// https://gitee.com/ascend/modelzoo/issues/I3NF8V?from=project-issue
if (FLAGS_check_nan_inf) {
framework::details::NPUAllocAndClearFloatStatus(*op, *local_scope, place);
}
}
#endif
  auto op_with_kernel = dynamic_cast<const framework::OperatorWithKernel*>(op);
  {
    // If it is OperatorBase, InferShape do nothing.
......
@@ -770,16 +770,6 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
#else
    auto dev_id = place.device;
    platform::SetXPUDeviceId(dev_id);
#endif
} else if (platform::is_npu_place(place)) {
#ifndef PADDLE_WITH_ASCEND_CL
PADDLE_THROW(platform::errors::Unavailable(
"Cannot run operator on place %s, please recompile paddle or "
"reinstall Paddle with NPU support.",
place));
#else
auto dev_id = place.device;
platform::SetNPUDeviceId(dev_id);
#endif
  } else if (platform::is_mlu_place(place)) {
#ifndef PADDLE_WITH_MLU
@@ -1692,17 +1682,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
  bool fallback_to_cpu = false;
  auto* dev_ctx = pool.Get(place);
#ifdef PADDLE_WITH_ASCEND_CL
// NOTE(wangxi): nan/inf cannot be detected on NPU by checking the variable
// values, but only through special `float_status` to checks whether
// the operation is overflow. More about `float_status`, see:
// https://gitee.com/ascend/modelzoo/issues/I3NF8V?from=project-issue
if (FLAGS_check_nan_inf) {
framework::details::NPUAllocAndClearFloatStatus(*this, scope, place);
}
#endif
  // using cache
  if (kernel_type_.get()) {
    dev_ctx = pool.Get(kernel_type_->place_);
......
@@ -553,20 +553,6 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
      PADDLE_THROW(platform::errors::PermissionDenied(
          "Paddle can't use IPU device since it's not compiled with IPU,"
          "Please recompile or reinstall Paddle with IPU support."));
#endif
} else if (platform::is_npu_place(place)) {
#if defined(PADDLE_WITH_ASCEND_CL)
if (IsFastEagerDeletionModeEnabled()) {
gc.reset(new NPUUnsafeFastGarbageCollector(place, max_memory_size));
} else {
gc.reset(new NPUUnsafeFastGarbageCollector(place, max_memory_size));
}
VLOG(10) << "Created " << i << "-th GarbageCollector at " << place;
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"Paddle can't use NPU device since it's not compiled with "
"NPU,"
"Please recompile or reinstall Paddle with NPU support."));
#endif
    } else if (platform::is_custom_place(place)) {
#if defined(PADDLE_WITH_CUSTOM_DEVICE)
......
@@ -112,15 +112,6 @@ phi::KernelKey FallBackToCpu(const phi::KernelKey& kernel_key,
        phi::Backend::CPU, kernel_key.layout(), kernel_key.dtype());
  }
#endif
#ifdef PADDLE_WITH_ASCEND_CL
if (kernel_key.backend() == phi::Backend::NPU) {
VLOG(3) << "phi missing NPU kernel: " << op.Type()
<< ", expected_kernel_key:" << kernel_key
<< ", fallback to CPU one!";
return phi::KernelKey(
phi::Backend::CPU, kernel_key.layout(), kernel_key.dtype());
}
#endif
#ifdef PADDLE_WITH_MLU
  if (kernel_key.backend() == phi::Backend::MLU) {
    VLOG(3) << "phi missing MLU kernel: " << op.Type()
......
@@ -12,8 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
-    defined(PADDLE_WITH_ASCEND_CL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/framework/data_feed_factory.h"
#include "paddle/fluid/framework/device_worker_factory.h"
#include "paddle/fluid/framework/trainer.h"
@@ -37,8 +36,6 @@ void PipelineTrainer::Initialize(const TrainerDesc& trainer_desc,
  int place_id = section_config.place_id();
#if (defined PADDLE_WITH_NCCL) || (defined PADDLE_WITH_RCCL)
  place_ = platform::CUDAPlace(place_id);
#elif (defined PADDLE_WITH_ASCEND_CL) // NOLINT
place_ = platform::NPUPlace(place_id);
#endif
  worker_ = DeviceWorkerFactory::CreateDeviceWorker(
      trainer_desc.device_worker_name());
......
@@ -9,8 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
-    defined(PADDLE_WITH_ASCEND_CL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include <float.h>
#include "paddle/fluid/framework/device_worker.h"
@@ -235,18 +234,6 @@ void SectionWorker::TrainFiles() {
      gc.reset(new UnsafeFastGPUGarbageCollector(place_, max_memory_size));
    }
  }
#elif defined(PADDLE_WITH_ASCEND_CL)
if (IsFastEagerDeletionModeEnabled()) {
VLOG(4) << "Use unsafe fast gc for NPU.";
gc.reset(new NPUUnsafeFastGarbageCollector(place_, max_memory_size));
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Please set FLAGS_fast_eager_deletion_mode=true to use "
"GarbageCollector on NPU."));
// TODO(zhiqiu): fix bugs and enable NPUDefaultStreamGarbageCollector.
VLOG(4) << "Use default stream gc for NPU.";
gc.reset(new NPUDefaultStreamGarbageCollector(place_, max_memory_size));
}
#endif
  }  // max_memory_size >= 0
......
@@ -143,35 +143,6 @@ TEST(DenseTensor, MutableData) {
    EXPECT_EQ(p1, p2);
  }
#endif
#ifdef PADDLE_WITH_ASCEND_CL
{
phi::DenseTensor src_tensor;
float* p1 = nullptr;
float* p2 = nullptr;
// initialization
p1 = src_tensor.mutable_data<float>(phi::make_ddim({1, 2, 3}),
platform::NPUPlace(0));
auto p1_holder = src_tensor.Holder();
EXPECT_NE(p1, nullptr);
// set src_tensor a new dim with large size
// momery is supposed to be re-allocated
p2 = src_tensor.mutable_data<float>(phi::make_ddim({3, 1024}),
platform::NPUPlace(0));
auto p2_holder = src_tensor.Holder();
EXPECT_NE(p2, nullptr);
EXPECT_NE(p1_holder.get(), p2_holder.get());
// set src_tensor a new dim with same size
// momery block is supposed to be unchanged
p1 = src_tensor.mutable_data<float>(phi::make_ddim({2, 2, 3}),
platform::NPUPlace(0));
EXPECT_EQ(p1, p2);
// set src_tensor a new dim with smaller size
// momery block is supposed to be unchanged
p2 = src_tensor.mutable_data<float>(phi::make_ddim({2, 2}),
platform::NPUPlace(0));
EXPECT_EQ(p1, p2);
}
#endif
} }
TEST(DenseTensor, ShareDataWith) { TEST(DenseTensor, ShareDataWith) {
...@@ -207,16 +178,6 @@ TEST(DenseTensor, ShareDataWith) { ...@@ -207,16 +178,6 @@ TEST(DenseTensor, ShareDataWith) {
ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>()); ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
} }
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
{
phi::DenseTensor src_tensor;
phi::DenseTensor dst_tensor;
src_tensor.mutable_data<int>(phi::make_ddim({2, 3, 4}),
platform::NPUPlace(0));
dst_tensor.ShareDataWith(src_tensor);
ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
}
#endif
} }
TEST(DenseTensor, Slice) { TEST(DenseTensor, Slice) {
...@@ -271,33 +232,6 @@ TEST(DenseTensor, Slice) { ...@@ -271,33 +232,6 @@ TEST(DenseTensor, Slice) {
EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address); EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address);
} }
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
{
phi::DenseTensor src_tensor;
src_tensor.mutable_data<double>(phi::make_ddim({6, 9}),
platform::NPUPlace(0));
phi::DenseTensor slice_tensor = src_tensor.Slice(2, 6);
phi::DDim slice_dims = slice_tensor.dims();
ASSERT_EQ(arity(slice_dims), 2);
EXPECT_EQ(slice_dims[0], 4);
EXPECT_EQ(slice_dims[1], 9);
uintptr_t src_data_address =
reinterpret_cast<uintptr_t>(src_tensor.data<double>());
uintptr_t src_mutable_data_address =
reinterpret_cast<uintptr_t>(src_tensor.mutable_data<double>(
src_tensor.dims(), platform::NPUPlace(0)));
uintptr_t slice_data_address =
reinterpret_cast<uintptr_t>(slice_tensor.data<double>());
uintptr_t slice_mutable_data_address =
reinterpret_cast<uintptr_t>(slice_tensor.mutable_data<double>(
slice_tensor.dims(), platform::NPUPlace(0)));
EXPECT_EQ(src_data_address, src_mutable_data_address);
EXPECT_EQ(slice_data_address, slice_mutable_data_address);
EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address);
}
#endif
} }
TEST(DenseTensor, ReshapeToMatrix) { TEST(DenseTensor, ReshapeToMatrix) {
......
...@@ -125,112 +125,6 @@ void TensorCopyImpl(const TENSOR& src, ...@@ -125,112 +125,6 @@ void TensorCopyImpl(const TENSOR& src,
"Copy from %s to %s is not supported.", src_place, dst_place)); "Copy from %s to %s is not supported.", src_place, dst_place));
} }
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
// TODO(zhiqiu): handle different condition like CUDA code below
else if (platform::is_npu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
auto stream =
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream();
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
}
else if (platform::is_cpu_place(src_place) && // NOLINT
platform::is_npu_place(dst_place)) {
// 1. cpu tensor -> npu pinned tensor
platform::NPUPinnedPlace npu_pinned_place;
phi::DenseTensor npu_pinned_tensor;
npu_pinned_tensor.Resize(src.dims());
auto npu_pinned_ptr =
npu_pinned_tensor.mutable_data(npu_pinned_place, src.dtype());
memory::Copy(npu_pinned_place, npu_pinned_ptr, src_place, src_ptr, size);
// 2. async copy npu pinned tensor -> npu tensor
memory::Copy(
dst_place,
dst_ptr,
npu_pinned_place,
npu_pinned_ptr,
size,
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
// 3. record event
auto npu_pinned_allocator =
static_cast<paddle::memory::allocation::NPUPinnedAllocator*>(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(npu_pinned_place)
.get());
phi::Allocation* allocation = npu_pinned_tensor.Holder().get();
npu_pinned_allocator->RecordEvent(
allocation,
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
}
else if (platform::is_npu_place(src_place) && // NOLINT
platform::is_npu_place(dst_place)) {
if (src_ptr == dst_ptr) {
VLOG(3) << "Skip copy the same data async from " << src_place << " to "
<< dst_place;
return;
}
auto stream =
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream();
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
}
else if (platform::is_npu_pinned_place(src_place) && // NOLINT
platform::is_npu_place(dst_place)) { /* npu_pinned->npu */
auto src_npu_pinned_place = src_place;
auto dst_npu_place = dst_place;
auto ctx_place = ctx.GetPlace();
PADDLE_ENFORCE_EQ(
platform::is_npu_place(ctx_place),
true,
platform::errors::PreconditionNotMet(
"Device context place mismatch. When copying phi::DenseTensor "
"data from NPU Pinned memory to NPU memory, current "
"device context place should be NPU."));
auto ctx_npu_place = ctx_place;
PADDLE_ENFORCE_EQ(dst_npu_place,
ctx_npu_place,
platform::errors::PreconditionNotMet(
"The target NPU device and current device context do "
"not match. The target NPU device number is %d, but "
"device context NPU number is %d.",
dst_npu_place.device,
ctx_npu_place.device));
auto stream =
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream();
memory::Copy(
dst_npu_place, dst_ptr, src_npu_pinned_place, src_ptr, size, stream);
}
else if (platform::is_npu_place(src_place) && // NOLINT
platform::is_npu_pinned_place(dst_place)) { /* npu->npu_pinned */
auto src_npu_place = src_place;
auto dst_npu_pinned_place = dst_place;
auto ctx_place = ctx.GetPlace();
PADDLE_ENFORCE_EQ(
platform::is_npu_place(ctx_place),
true,
platform::errors::PreconditionNotMet(
"Device context place mismatch. When copying phi::DenseTensor "
"data from NPU memory to NPU Pinned memory, current "
"device context place should be NPU."));
auto ctx_npu_place = ctx_place;
PADDLE_ENFORCE_EQ(src_place,
ctx_npu_place,
platform::errors::PreconditionNotMet(
"The source NPU device and current device context do "
"not match. The source NPU device number is %d, but "
"device context NPU number is %d.",
src_npu_place.device,
ctx_npu_place.device));
auto stream =
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream();
memory::Copy(
dst_npu_pinned_place, dst_ptr, src_npu_place, src_ptr, size, stream);
}
else { // NOLINT
PADDLE_THROW(platform::errors::Unimplemented(
"Copy from %s to %s is not supported.", src_place, dst_place));
}
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
else if (platform::is_cuda_pinned_place(src_place) && // NOLINT else if (platform::is_cuda_pinned_place(src_place) && // NOLINT
platform::is_cuda_pinned_place(dst_place)) { platform::is_cuda_pinned_place(dst_place)) {
...@@ -539,29 +433,6 @@ void TensorCopySync(const phi::DenseTensor& src, ...@@ -539,29 +433,6 @@ void TensorCopySync(const phi::DenseTensor& src,
"Copy from %s to %s is not supported.", src_place, dst_place)); "Copy from %s to %s is not supported.", src_place, dst_place));
} }
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
else if (platform::is_npu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) { /* npu -> cpu*/
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
}
else if (platform::is_cpu_place(src_place) && // NOLINT
platform::is_npu_place(dst_place)) { /* cpu -> npu*/
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
}
else if (platform::is_npu_place(src_place) && // NOLINT
platform::is_npu_place(dst_place)) { /* npu -> npu*/
if (src_ptr == dst_ptr) {
VLOG(3) << "Skip copy the same data sync from " << src_place << " to "
<< dst_place;
return;
}
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
}
else { // NOLINT
PADDLE_THROW(platform::errors::Unimplemented(
"Copy from %s to %s is not supported.", src_place, dst_place));
}
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
else if (platform::is_cuda_pinned_place(src_place) && // NOLINT else if (platform::is_cuda_pinned_place(src_place) && // NOLINT
platform::is_cuda_pinned_place(dst_place)) { platform::is_cuda_pinned_place(dst_place)) {
...@@ -758,31 +629,6 @@ void TensorToStream(std::ostream& os, ...@@ -758,31 +629,6 @@ void TensorToStream(std::ostream& os,
#else #else
PADDLE_THROW(platform::errors::Unimplemented( PADDLE_THROW(platform::errors::Unimplemented(
"MLUPlace is not supported when not compiled with MLU")); "MLUPlace is not supported when not compiled with MLU"));
#endif
} else if (platform::is_npu_place(tensor.place())) {
#ifdef PADDLE_WITH_ASCEND_CL
constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB
std::unique_ptr<char[]> buf(new char[kBufSize]);
auto& npu_dev_ctx =
static_cast<const platform::NPUDeviceContext&>(dev_ctx);
platform::CPUPlace cpu;
uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
while (size != 0) {
size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
memory::Copy(cpu,
buf.get(),
tensor.place(),
reinterpret_cast<const void*>(data),
size_to_write,
npu_dev_ctx.stream());
npu_dev_ctx.Wait();
os.write(buf.get(), size_to_write);
data += size_to_write;
size -= size_to_write;
}
#else
PADDLE_THROW(platform::errors::Unimplemented(
"NPUPlace is not supported when not compiled with NPU"));
#endif #endif
} else if (platform::is_custom_place(tensor.place())) { } else if (platform::is_custom_place(tensor.place())) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE #ifdef PADDLE_WITH_CUSTOM_DEVICE
...@@ -875,7 +721,7 @@ void TensorFromStream(std::istream& is, ...@@ -875,7 +721,7 @@ void TensorFromStream(std::istream& is,
platform::is_custom_place(dev_ctx.GetPlace())) { platform::is_custom_place(dev_ctx.GetPlace())) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_MLU) || \ defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_MLU) || \
defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_CUSTOM_DEVICE) defined(PADDLE_WITH_CUSTOM_DEVICE)
phi::DenseTensor cpu_tensor; phi::DenseTensor cpu_tensor;
cpu_tensor.Resize(phi::make_ddim(shape)); cpu_tensor.Resize(phi::make_ddim(shape));
framework::VisitDataType( framework::VisitDataType(
...@@ -958,7 +804,7 @@ void TensorFromStream(std::istream& is, ...@@ -958,7 +804,7 @@ void TensorFromStream(std::istream& is,
platform::is_custom_place(dev_ctx.GetPlace())) { platform::is_custom_place(dev_ctx.GetPlace())) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_MLU) || \ defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_MLU) || \
defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_CUSTOM_DEVICE) defined(PADDLE_WITH_CUSTOM_DEVICE)
phi::DenseTensor cpu_tensor; phi::DenseTensor cpu_tensor;
cpu_tensor.Resize(phi::make_ddim(dims)); cpu_tensor.Resize(phi::make_ddim(dims));
framework::VisitDataType( framework::VisitDataType(
......
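For reference, the TensorToStream branch removed a few hunks above drains device memory into the output stream through a fixed-size staging buffer (64 MB in the original). A host-only sketch of that chunking loop, with plain memcpy standing in for memory::Copy plus the device-context Wait; all names here are illustrative, not Paddle APIs:

#include <algorithm>
#include <cstring>
#include <fstream>
#include <memory>
#include <vector>

int main() {
  std::vector<char> device_data(10 * 1024 * 1024, 'x');  // pretend device buffer
  std::ofstream os("tensor.bin", std::ios::binary);

  constexpr size_t kBufSize = 1024 * 1024;                // bounded host staging buffer
  std::unique_ptr<char[]> buf(new char[kBufSize]);

  const char* data = device_data.data();
  size_t size = device_data.size();
  while (size != 0) {
    size_t size_to_write = std::min(kBufSize, size);
    std::memcpy(buf.get(), data, size_to_write);  // device -> host chunk
    os.write(buf.get(), size_to_write);           // host chunk -> stream
    data += size_to_write;
    size -= size_to_write;
  }
  return 0;
}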
...@@ -25,9 +25,6 @@ limitations under the License. */ ...@@ -25,9 +25,6 @@ limitations under the License. */
#include "paddle/fluid/framework/string_array.h" #include "paddle/fluid/framework/string_array.h"
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/fluid/memory/allocation/allocator_facade.h"
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h"
#endif
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#ifdef PADDLE_WITH_MLU #ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/device_context.h" #include "paddle/fluid/platform/device/mlu/device_context.h"
...@@ -145,37 +142,6 @@ void TensorFromArray(const T* src, ...@@ -145,37 +142,6 @@ void TensorFromArray(const T* src,
reinterpret_cast<const phi::GPUContext&>(ctx).stream()); reinterpret_cast<const phi::GPUContext&>(ctx).stream());
} }
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
else if (platform::is_npu_place(dst_place)) { // NOLINT
// 1. vector -> npu pinned tensor
platform::NPUPinnedPlace npu_pinned_place;
phi::DenseTensor npu_pinned_tensor;
npu_pinned_tensor.Resize(dst->dims());
auto npu_pinned_ptr =
npu_pinned_tensor.mutable_data(npu_pinned_place, dst->dtype());
memory::Copy(npu_pinned_place, npu_pinned_ptr, src_place, src_ptr, size);
// 2. async copy npu pinned tensor -> npu tensor
memory::Copy(
dst_place,
dst_ptr,
npu_pinned_place,
npu_pinned_ptr,
size,
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
// 3. record event
auto npu_pinned_allocator =
static_cast<paddle::memory::allocation::NPUPinnedAllocator*>(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(npu_pinned_place)
.get());
phi::Allocation* allocation = npu_pinned_tensor.Holder().get();
npu_pinned_allocator->RecordEvent(
allocation,
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
}
#endif
#ifdef PADDLE_WITH_MLU #ifdef PADDLE_WITH_MLU
else if (platform::is_mlu_place(dst_place)) { // NOLINT else if (platform::is_mlu_place(dst_place)) { // NOLINT
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr); memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
...@@ -227,42 +193,6 @@ void TensorFromVector(const std::vector<T>& src, ...@@ -227,42 +193,6 @@ void TensorFromVector(const std::vector<T>& src,
reinterpret_cast<const phi::GPUContext&>(ctx).stream()); reinterpret_cast<const phi::GPUContext&>(ctx).stream());
} }
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
// NOTE(zhiqiu): Be careful that aclrtMemcpyAsync is different from
// cudaMemcpyAsync.
// cudaMemcpyAsync is actually "sync" between cpu <-> gpu.
// aclrtMemcpyAsync is really "async" between cpu <-> npu.
// Since vector is on cpu, I think this function should be a "sync" operation,
// so pass nullptr as stream to memory::Copy().
else if (platform::is_npu_place(dst_place)) { // NOLINT
// 1. vector -> npu pinned tensor
phi::DenseTensor npu_pinned_tensor(dst->dtype());
platform::NPUPinnedPlace npu_pinned_place;
auto npu_pinned_ptr =
npu_pinned_tensor.mutable_data<T>(dst->dims(), npu_pinned_place);
memory::Copy(npu_pinned_place, npu_pinned_ptr, src_place, src_ptr, size);
// 2. async copy npu pinned tensor -> npu tensor
memory::Copy(
dst_place,
dst_ptr,
npu_pinned_place,
npu_pinned_ptr,
size,
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
// 3. record event
auto npu_pinned_allocator =
static_cast<paddle::memory::allocation::NPUPinnedAllocator*>(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(npu_pinned_place)
.get());
phi::Allocation* allocation = npu_pinned_tensor.Holder().get();
npu_pinned_allocator->RecordEvent(
allocation,
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
}
#endif
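The branches removed above all follow the same staging pattern: copy the host data into an NPU-pinned buffer, issue the pinned-to-device copy asynchronously on the device stream, and record an event on the pinned allocation so it is not reclaimed before the copy finishes. Below is a minimal, self-contained sketch of that lifetime rule in plain C++; the thread and promise merely stand in for the ACL stream and event, and none of the names are Paddle or ACL APIs.

#include <cstring>
#include <future>
#include <memory>
#include <thread>
#include <vector>

int main() {
  std::vector<float> host_src(1024, 1.0f);

  // "Pinned" staging buffer: it must outlive the asynchronous copy, which is
  // exactly what RecordEvent guarantees in the removed code.
  auto staging = std::make_shared<std::vector<float>>(host_src);

  std::vector<float> device_dst(host_src.size());  // stands in for NPU memory

  std::promise<void> copy_done;                    // stands in for the recorded event
  std::future<void> event = copy_done.get_future();

  // Stands in for the asynchronous copy issued on the device stream.
  std::thread stream([staging, &device_dst, &copy_done]() {
    std::memcpy(device_dst.data(), staging->data(),
                staging->size() * sizeof(float));
    copy_done.set_value();
  });

  event.wait();  // only after the "event" completes may the staging buffer go away
  stream.join();
  return 0;
}

The final wait plays the same role as RecordEvent in the removed branches: the staging buffer may only be released once the asynchronous copy has observably completed.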
#ifdef PADDLE_WITH_MLU #ifdef PADDLE_WITH_MLU
else if (platform::is_mlu_place(dst_place)) { // NOLINT else if (platform::is_mlu_place(dst_place)) { // NOLINT
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr); memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
...@@ -324,37 +254,6 @@ inline void TensorFromVector(const std::vector<bool>& src, ...@@ -324,37 +254,6 @@ inline void TensorFromVector(const std::vector<bool>& src,
reinterpret_cast<const phi::GPUContext&>(ctx).stream()); reinterpret_cast<const phi::GPUContext&>(ctx).stream());
} }
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
else if (platform::is_npu_place(dst_place)) { // NOLINT
// 1. vector -> npu pinned tensor
platform::NPUPinnedPlace npu_pinned_place;
phi::DenseTensor npu_pinned_tensor;
npu_pinned_tensor.Resize(dst->dims());
auto npu_pinned_ptr =
npu_pinned_tensor.mutable_data(npu_pinned_place, dst->dtype());
memory::Copy(npu_pinned_place, npu_pinned_ptr, src_place, src_ptr, size);
// 2. async copy npu pinned tensor -> npu tensor
memory::Copy(
dst_place,
dst_ptr,
npu_pinned_place,
npu_pinned_ptr,
size,
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
// 3. record event
auto npu_pinned_allocator =
static_cast<paddle::memory::allocation::NPUPinnedAllocator*>(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(npu_pinned_place)
.get());
phi::Allocation* allocation = npu_pinned_tensor.Holder().get();
npu_pinned_allocator->RecordEvent(
allocation,
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE #ifdef PADDLE_WITH_CUSTOM_DEVICE
else if (platform::is_custom_place(dst_place)) { // NOLINT else if (platform::is_custom_place(dst_place)) { // NOLINT
auto stream = auto stream =
...@@ -433,11 +332,6 @@ void TensorToVector(const phi::DenseTensor& src, ...@@ -433,11 +332,6 @@ void TensorToVector(const phi::DenseTensor& src,
memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size); memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
} }
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
else if (platform::is_npu_place(src.place())) { // NOLINT
memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr);
}
#endif
#ifdef PADDLE_WITH_MLU #ifdef PADDLE_WITH_MLU
else if (platform::is_mlu_place(src.place())) { // NOLINT else if (platform::is_mlu_place(src.place())) { // NOLINT
memory::Copy( memory::Copy(
...@@ -491,11 +385,6 @@ inline void TensorToVector(const phi::DenseTensor& src, ...@@ -491,11 +385,6 @@ inline void TensorToVector(const phi::DenseTensor& src,
memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size); memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
} }
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
else if (platform::is_npu_place(src.place())) { // NOLINT
memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr);
}
#endif
#ifdef PADDLE_WITH_MLU #ifdef PADDLE_WITH_MLU
else if (platform::is_mlu_place(src.place())) { // NOLINT else if (platform::is_mlu_place(src.place())) { // NOLINT
memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr); memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr);
...@@ -566,11 +455,6 @@ inline T GetValue(const phi::DenseTensor* x) { ...@@ -566,11 +455,6 @@ inline T GetValue(const phi::DenseTensor* x) {
if (!platform::is_cpu_place(x->place())) { if (!platform::is_cpu_place(x->place())) {
phi::DenseTensor cpu_x; phi::DenseTensor cpu_x;
framework::TensorCopy(*x, platform::CPUPlace(), &cpu_x); framework::TensorCopy(*x, platform::CPUPlace(), &cpu_x);
#if defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU)
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
const platform::DeviceContext* dev_ctx = pool.Get(x->place());
dev_ctx->Wait();
#endif
value = cpu_x.data<T>()[0]; value = cpu_x.data<T>()[0];
} else { } else {
value = x->data<T>()[0]; value = x->data<T>()[0];
......
...@@ -299,32 +299,6 @@ TEST(TensorToVector, Tensor_bool) { ...@@ -299,32 +299,6 @@ TEST(TensorToVector, Tensor_bool) {
} }
} }
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
{
std::vector<bool> src_vec = {
false,
true,
false,
true,
false,
true,
false,
true,
false,
};
phi::DenseTensor npu_tensor;
paddle::platform::NPUPlace place(0);
paddle::platform::NPUDeviceContext npu_ctx(place);
paddle::framework::TensorFromVector<bool>(src_vec, npu_ctx, &npu_tensor);
std::vector<bool> dst;
paddle::framework::TensorToVector<bool>(npu_tensor, npu_ctx, &dst);
for (int i = 0; i < 3 * 3; ++i) {
EXPECT_EQ(src_vec[i], dst[i]);
}
}
#endif
} }
TEST(TensorFromDLPack, Tensor) { TEST(TensorFromDLPack, Tensor) {
......
...@@ -302,8 +302,7 @@ class PSGPUTrainer : public TrainerBase { ...@@ -302,8 +302,7 @@ class PSGPUTrainer : public TrainerBase {
}; };
#endif #endif
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
defined(PADDLE_WITH_ASCEND_CL)
class PipelineTrainer : public TrainerBase { class PipelineTrainer : public TrainerBase {
public: public:
PipelineTrainer() {} PipelineTrainer() {}
......
...@@ -82,8 +82,7 @@ REGISTER_TRAINER_CLASS(HeterXpuTrainer); ...@@ -82,8 +82,7 @@ REGISTER_TRAINER_CLASS(HeterXpuTrainer);
(defined PADDLE_WITH_PSLIB) (defined PADDLE_WITH_PSLIB)
REGISTER_TRAINER_CLASS(PSGPUTrainer); REGISTER_TRAINER_CLASS(PSGPUTrainer);
#endif #endif
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
defined(PADDLE_WITH_ASCEND_CL)
REGISTER_TRAINER_CLASS(PipelineTrainer); REGISTER_TRAINER_CLASS(PipelineTrainer);
#endif #endif
} // namespace framework } // namespace framework
......
...@@ -65,28 +65,6 @@ using Attribute = paddle::variant<paddle::blank, ...@@ -65,28 +65,6 @@ using Attribute = paddle::variant<paddle::blank,
std::vector<paddle::experimental::Scalar>>; std::vector<paddle::experimental::Scalar>>;
using AttributeMap = std::unordered_map<std::string, Attribute>; using AttributeMap = std::unordered_map<std::string, Attribute>;
#ifdef PADDLE_WITH_ASCEND_CL
using NPUAttribute = paddle::variant<paddle::blank,
int,
float,
std::string,
std::vector<int>,
std::vector<float>,
std::vector<std::string>,
bool,
std::vector<bool>,
BlockDesc*,
int64_t,
std::vector<BlockDesc*>,
std::vector<int64_t>,
std::vector<double>,
VarDesc*,
std::vector<VarDesc*>,
std::vector<std::vector<int64_t>>>;
using NPUAttributeMap = std::unordered_map<std::string, NPUAttribute>;
#endif
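The NPUAttribute alias deleted above is a wider instantiation of the variant-based attribute map the framework already uses for Attribute. A small sketch of the idea with std::variant, using a few placeholder attribute types rather than Paddle's actual list:

#include <iostream>
#include <string>
#include <unordered_map>
#include <variant>
#include <vector>

using Attribute = std::variant<int, float, std::string, std::vector<int>, bool>;
using AttributeMap = std::unordered_map<std::string, Attribute>;

int main() {
  AttributeMap attrs;
  attrs["axis"] = 1;
  attrs["scale"] = 0.5f;
  attrs["shape"] = std::vector<int>{2, 3};

  // Typed access: std::get throws std::bad_variant_access on a type mismatch.
  std::cout << "axis = " << std::get<int>(attrs["axis"]) << "\n";
  std::cout << "holds float? " << std::holds_alternative<float>(attrs["scale"]) << "\n";
  return 0;
}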
using OpCreator = using OpCreator =
std::function<OperatorBase*(const std::string& /*type*/, std::function<OperatorBase*(const std::string& /*type*/,
const VariableNameMap& /*inputs*/, const VariableNameMap& /*inputs*/,
......
...@@ -39,11 +39,6 @@ ...@@ -39,11 +39,6 @@
#endif #endif
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
#include <hccl/hccl.h>
#include <hccl/hccl_types.h>
#endif
#if defined(PADDLE_WITH_XPU_BKCL) #if defined(PADDLE_WITH_XPU_BKCL)
#include "xpu/bkcl.h" #include "xpu/bkcl.h"
#endif #endif
...@@ -69,10 +64,6 @@ class Communicator; ...@@ -69,10 +64,6 @@ class Communicator;
class NCCLCommunicator; class NCCLCommunicator;
#endif #endif
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
class Communicator;
class HCCLCommunicator;
#endif
#if defined(PADDLE_WITH_XPU_BKCL) #if defined(PADDLE_WITH_XPU_BKCL)
class BKCLCommunicator; class BKCLCommunicator;
...@@ -205,9 +196,6 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl< ...@@ -205,9 +196,6 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl<
#endif #endif
operators::CudnnRNNCache, operators::CudnnRNNCache,
#endif #endif
#if defined(PADDLE_WITH_ASCEND_CL)
HcclRootInfo,
#endif
#if defined(PADDLE_WITH_XPU_BKCL) #if defined(PADDLE_WITH_XPU_BKCL)
BKCLUniqueId, BKCLUniqueId,
platform::BKCLCommunicator, platform::BKCLCommunicator,
......
...@@ -36,49 +36,6 @@ namespace paddle { ...@@ -36,49 +36,6 @@ namespace paddle {
namespace inference { namespace inference {
namespace analysis { namespace analysis {
#ifdef PADDLE_WITH_ASCEND_CL
void IrParamsSyncAmongDevicesPass::CopyParamsToNpu(Argument *argument) {
if (!argument->use_npu()) return;
auto &graph = argument->main_graph();
std::vector<std::string> repetitive_params;
if (graph.Has(framework::ir::kRepetitiveParamAttr))
repetitive_params = graph.Get<std::vector<std::string>>(
framework::ir::kRepetitiveParamAttr);
LOG(INFO) << "Sync params from CPU to NPU";
PADDLE_ENFORCE_EQ(argument->npu_device_id_valid(),
true,
platform::errors::PreconditionNotMet(
"The npu_device_id field should be valid"));
platform::Place place = platform::NPUPlace(argument->npu_device_id());
auto *scope = argument->scope_ptr();
std::vector<std::string> all_vars = scope->LocalVarNames();
for (auto &var_name : all_vars) {
auto *var = scope->FindLocalVar(var_name);
PADDLE_ENFORCE_NOT_NULL(
var,
platform::errors::PreconditionNotMet("The var should not be nullptr"));
if (var->IsType<phi::DenseTensor>()) {
auto *t = var->GetMutable<phi::DenseTensor>();
platform::CPUPlace cpu_place;
phi::DenseTensor temp_tensor;
temp_tensor.Resize(t->dims());
temp_tensor.mutable_data<float>(cpu_place);
paddle::framework::TensorCopySync(*t, cpu_place, &temp_tensor);
t->clear();
paddle::framework::TensorCopySync(temp_tensor, place, t);
}
}
}
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) { void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) {
// The parameters are on the cpu, therefore, synchronization is not necessary. // The parameters are on the cpu, therefore, synchronization is not necessary.
...@@ -253,11 +210,6 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) { ...@@ -253,11 +210,6 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
argument->scope_valid(), argument->scope_valid(),
true, true,
platform::errors::PreconditionNotMet("The scope field should be valid")); platform::errors::PreconditionNotMet("The scope field should be valid"));
#ifdef PADDLE_WITH_ASCEND_CL
if (argument->use_npu_valid()) {
CopyParamsToNpu(argument);
}
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (argument->use_gpu_valid()) { if (argument->use_gpu_valid()) {
CopyParamsToGpu(argument); CopyParamsToGpu(argument);
......
...@@ -35,10 +35,6 @@ class IrParamsSyncAmongDevicesPass : public AnalysisPass { ...@@ -35,10 +35,6 @@ class IrParamsSyncAmongDevicesPass : public AnalysisPass {
std::string repr() const override; std::string repr() const override;
private: private:
#ifdef PADDLE_WITH_ASCEND_CL
void CopyParamsToNpu(Argument *argument);
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void CopyParamsToGpu(Argument *argument); void CopyParamsToGpu(Argument *argument);
#endif #endif
......
...@@ -195,21 +195,6 @@ void AnalysisConfig::SetXpuDeviceId(int device_id) { ...@@ -195,21 +195,6 @@ void AnalysisConfig::SetXpuDeviceId(int device_id) {
Update(); Update();
} }
void AnalysisConfig::EnableNpu(int device_id) {
#if defined(PADDLE_WITH_ASCEND_CL)
use_npu_ = true;
npu_device_id_ = device_id;
#elif defined(PADDLE_WITH_CUSTOM_DEVICE)
use_custom_device_ = true;
custom_device_id_ = device_id;
custom_device_type_ = "npu";
#else
LOG(ERROR) << "Please compile with npu to EnableNpu()";
use_npu_ = false;
#endif
Update();
}
void AnalysisConfig::EnableCustomDevice(const std::string &device_type, void AnalysisConfig::EnableCustomDevice(const std::string &device_type,
int device_id, int device_id,
Precision precision_mode) { Precision precision_mode) {
...@@ -1023,20 +1008,6 @@ void AnalysisConfig::Update() { ...@@ -1023,20 +1008,6 @@ void AnalysisConfig::Update() {
"with XPU-runtime.")); "with XPU-runtime."));
#endif #endif
} }
if (use_npu_) {
#if defined(PADDLE_WITH_ASCEND_CL) || defined(LITE_SUBGRAPH_WITH_NPU)
PADDLE_ENFORCE_EQ(use_gpu_,
false,
platform::errors::Unavailable(
"Currently, NPU and GPU cannot be enabled in the "
"same analysis configuration."));
#else
PADDLE_THROW(platform::errors::Unavailable(
"You tried to use an NPU device, but Paddle was not compiled "
"with NPU-runtime."));
#endif
}
if (use_ipu_) { if (use_ipu_) {
#ifndef PADDLE_WITH_IPU #ifndef PADDLE_WITH_IPU
PADDLE_THROW(platform::errors::Unavailable( PADDLE_THROW(platform::errors::Unavailable(
......
...@@ -376,14 +376,6 @@ void AnalysisPredictor::InitPlace() { ...@@ -376,14 +376,6 @@ void AnalysisPredictor::InitPlace() {
"with WITH_XPU.")); "with WITH_XPU."));
#endif // PADDLE_WITH_XPU #endif // PADDLE_WITH_XPU
} }
} else if (config_.use_npu()) {
#ifdef PADDLE_WITH_ASCEND_CL
place_ = paddle::platform::NPUPlace(config_.npu_device_id());
#else
PADDLE_THROW(platform::errors::Unavailable(
"You tried to use NPU forward propagation, but Paddle was not compiled "
"with WITH_ASCEND_CL."));
#endif
} else if (config_.NNAdapter().use_nnadapter) { } else if (config_.NNAdapter().use_nnadapter) {
if (config_.lite_engine_enabled()) { if (config_.lite_engine_enabled()) {
place_ = paddle::platform::CPUPlace(); place_ = paddle::platform::CPUPlace();
......
...@@ -278,23 +278,6 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs, ...@@ -278,23 +278,6 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
#else #else
PADDLE_THROW(platform::errors::Unavailable( PADDLE_THROW(platform::errors::Unavailable(
"Not compile with XPU, should not reach here.")); "Not compile with XPU, should not reach here."));
#endif
} else {
#ifdef PADDLE_WITH_ASCEND_CL
platform::DeviceContextPool &pool =
platform::DeviceContextPool::Instance();
auto *dev_ctx =
static_cast<const platform::NPUDeviceContext *>(pool.Get(place_));
auto dst_npu_place = place_;
memory::Copy(dst_npu_place,
static_cast<void *>(input_ptr),
platform::CPUPlace(),
inputs[i].data.data(),
inputs[i].data.length(),
dev_ctx->stream());
#else
PADDLE_THROW(platform::errors::Unavailable(
"Not compile with NPU, should not reach here."));
#endif #endif
} }
......
...@@ -305,15 +305,6 @@ TEST(inference_api_native, image_classification_xpu) { ...@@ -305,15 +305,6 @@ TEST(inference_api_native, image_classification_xpu) {
} }
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
TEST(inference_api_native, word2vec_npu) {
MainWord2Vec(paddle::PaddlePlace::kNPU);
}
// TEST(inference_api_native, image_classification_npu) {
// MainImageClassification(paddle::PaddlePlace::kNPU);
// }
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
TEST(inference_api_native, word2vec_gpu) { TEST(inference_api_native, word2vec_gpu) {
MainWord2Vec(paddle::PaddlePlace::kGPU); MainWord2Vec(paddle::PaddlePlace::kGPU);
......
...@@ -244,25 +244,6 @@ void Tensor::CopyFromCpu(const T *data) { ...@@ -244,25 +244,6 @@ void Tensor::CopyFromCpu(const T *data) {
PADDLE_THROW(paddle::platform::errors::Unavailable( PADDLE_THROW(paddle::platform::errors::Unavailable(
"Can not create tensor with XPU place because paddle is not compiled " "Can not create tensor with XPU place because paddle is not compiled "
"with XPU.")); "with XPU."));
#endif
} else if (place_ == PlaceType::kNPU) {
#ifdef PADDLE_WITH_ASCEND_CL
paddle::platform::DeviceContextPool &pool =
paddle::platform::DeviceContextPool::Instance();
paddle::platform::NPUPlace npu_place(device_);
auto *t_data = tensor->mutable_data<T>(npu_place);
auto *dev_ctx = static_cast<const paddle::platform::NPUDeviceContext *>(
pool.Get(npu_place));
paddle::memory::Copy(npu_place,
static_cast<void *>(t_data),
paddle::platform::CPUPlace(),
data,
ele_size,
dev_ctx->stream());
#else
PADDLE_THROW(paddle::platform::errors::Unavailable(
"Can not create tensor with NPU place because paddle is not compiled "
"with NPU."));
#endif #endif
} else { } else {
#ifdef PADDLE_WITH_CUSTOM_DEVICE #ifdef PADDLE_WITH_CUSTOM_DEVICE
...@@ -468,25 +449,6 @@ void Tensor::CopyToCpuImpl(T *data, ...@@ -468,25 +449,6 @@ void Tensor::CopyToCpuImpl(T *data,
PADDLE_THROW(paddle::platform::errors::Unavailable( PADDLE_THROW(paddle::platform::errors::Unavailable(
"Can not create tensor with XPU place because paddle is not compiled " "Can not create tensor with XPU place because paddle is not compiled "
"with XPU.")); "with XPU."));
#endif
} else if (place_ == PlaceType::kNPU) {
#ifdef PADDLE_WITH_ASCEND_CL
paddle::platform::DeviceContextPool &pool =
paddle::platform::DeviceContextPool::Instance();
auto npu_place = t_place;
auto *dev_ctx = static_cast<const paddle::platform::NPUDeviceContext *>(
pool.Get(npu_place));
paddle::memory::Copy(paddle::platform::CPUPlace(),
static_cast<void *>(data),
npu_place,
t_data,
ele_num * sizeof(T),
dev_ctx->stream());
paddle::platform::NPUStreamSync(dev_ctx->stream());
#else
PADDLE_THROW(paddle::platform::errors::Unavailable(
"Can not create tensor with NPU place because paddle is not compiled "
"with NPU."));
#endif #endif
} else { } else {
#ifdef PADDLE_WITH_CUSTOM_DEVICE #ifdef PADDLE_WITH_CUSTOM_DEVICE
......
...@@ -146,10 +146,6 @@ TEST(Tensor, FillRandomDataAndCheck) { ...@@ -146,10 +146,6 @@ TEST(Tensor, FillRandomDataAndCheck) {
ASSERT_TRUE(FillRandomDataAndCheck(PlaceType::kGPU)); ASSERT_TRUE(FillRandomDataAndCheck(PlaceType::kGPU));
ASSERT_TRUE(SetPlaceAndCheck(PlaceType::kGPU)); ASSERT_TRUE(SetPlaceAndCheck(PlaceType::kGPU));
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
ASSERT_TRUE(FillRandomDataAndCheck(PlaceType::kNPU));
ASSERT_TRUE(SetPlaceAndCheck(PlaceType::kNPU));
#endif
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
ASSERT_TRUE(FillRandomDataAndCheck(PlaceType::kXPU)); ASSERT_TRUE(FillRandomDataAndCheck(PlaceType::kXPU));
ASSERT_TRUE(SetPlaceAndCheck(PlaceType::kXPU)); ASSERT_TRUE(SetPlaceAndCheck(PlaceType::kXPU));
......
...@@ -363,12 +363,6 @@ struct PD_INFER_DECL AnalysisConfig { ...@@ -363,12 +363,6 @@ struct PD_INFER_DECL AnalysisConfig {
/// ///
void SetXpuDeviceId(int device_id = 0); void SetXpuDeviceId(int device_id = 0);
/// ///
/// \brief Turn on NPU.
///
/// \param device_id device_id the NPU card to use (default is 0).
///
void EnableNpu(int device_id = 0);
///
/// \brief Turn on CustomDevice. /// \brief Turn on CustomDevice.
/// ///
/// \param device_type device_type the custom device to use. /// \param device_type device_type the custom device to use.
......
...@@ -171,11 +171,6 @@ void PD_ConfigEnableXpu(__pd_keep PD_Config* pd_config, ...@@ -171,11 +171,6 @@ void PD_ConfigEnableXpu(__pd_keep PD_Config* pd_config,
enable_multi_stream); enable_multi_stream);
} }
void PD_ConfigEnableNpu(__pd_keep PD_Config* pd_config, int32_t device_id) {
CHECK_AND_CONVERT_PD_CONFIG;
config->EnableNpu(device_id);
}
PD_Bool PD_ConfigUseXpu(__pd_keep PD_Config* pd_config) { PD_Bool PD_ConfigUseXpu(__pd_keep PD_Config* pd_config) {
CHECK_AND_CONVERT_PD_CONFIG; CHECK_AND_CONVERT_PD_CONFIG;
return config->use_xpu(); return config->use_xpu();
......
...@@ -214,14 +214,6 @@ PADDLE_CAPI_EXPORT extern void PD_ConfigEnableXpu( ...@@ -214,14 +214,6 @@ PADDLE_CAPI_EXPORT extern void PD_ConfigEnableXpu(
PD_Bool adaptive_seqlen, PD_Bool adaptive_seqlen,
PD_Bool enable_multi_stream); PD_Bool enable_multi_stream);
/// ///
/// \brief Turn on NPU.
///
/// \param[in] pd_config config
/// \param[in] device_id device_id the NPU card to use.
///
PADDLE_CAPI_EXPORT extern void PD_ConfigEnableNpu(
__pd_keep PD_Config* pd_config, int32_t device_id);
///
/// \brief A boolean state telling whether the XPU is turned on. /// \brief A boolean state telling whether the XPU is turned on.
/// ///
/// \param[in] pd_config config /// \param[in] pd_config config
......
...@@ -212,15 +212,6 @@ func (config *Config) EnableXpu(l3WorkspaceSize int32, locked bool, autotune boo ...@@ -212,15 +212,6 @@ func (config *Config) EnableXpu(l3WorkspaceSize int32, locked bool, autotune boo
cAutotuneFile, cPrecision, cvtGoBoolToPD(adaptiveSeqlen), cvtGoBoolToPD(enableMultiStream)) cAutotuneFile, cPrecision, cvtGoBoolToPD(adaptiveSeqlen), cvtGoBoolToPD(enableMultiStream))
} }
///
/// \brief Turn on NPU.
///
/// \param deviceId the NPU card to use.
///
func (config *Config) EnableNpu(deviceId int32) {
C.PD_ConfigEnableNpu(config.c, C.int32_t(deviceId))
}
/// ///
/// \brief A boolean state telling whether the GPU is turned on. /// \brief A boolean state telling whether the GPU is turned on.
/// ///
......
...@@ -50,11 +50,6 @@ if(UNIX AND NOT APPLE) ...@@ -50,11 +50,6 @@ if(UNIX AND NOT APPLE)
list(APPEND ALLOCATOR_DEPS rt) list(APPEND ALLOCATOR_DEPS rt)
endif() endif()
if(WITH_ASCEND_CL)
list(APPEND ALLOCATOR_SRCS npu_allocator.cc npu_pinned_allocator.cc)
list(APPEND ALLOCATOR_DEPS npu_info)
endif()
if(WITH_CUSTOM_DEVICE) if(WITH_CUSTOM_DEVICE)
list(APPEND ALLOCATOR_SRCS custom_allocator.cc) list(APPEND ALLOCATOR_SRCS custom_allocator.cc)
endif() endif()
......
...@@ -54,10 +54,6 @@ ...@@ -54,10 +54,6 @@
#include "paddle/fluid/platform/device/xpu/xpu_info.h" #include "paddle/fluid/platform/device/xpu/xpu_info.h"
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h"
#endif
#ifdef PADDLE_WITH_IPU #ifdef PADDLE_WITH_IPU
#include "paddle/fluid/platform/device/ipu/ipu_info.h" #include "paddle/fluid/platform/device/ipu/ipu_info.h"
#endif #endif
...@@ -198,12 +194,6 @@ class AllocatorFacadePrivate { ...@@ -198,12 +194,6 @@ class AllocatorFacadePrivate {
InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id)); InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
} }
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
for (int dev_id = 0; dev_id < platform::GetNPUDeviceCount(); ++dev_id) {
InitNaiveBestFitNPUAllocator(platform::NPUPlace(dev_id));
}
InitNaiveBestFitNPUPinnedAllocator();
#endif
#ifdef PADDLE_WITH_MLU #ifdef PADDLE_WITH_MLU
for (int dev_id = 0; dev_id < platform::GetMLUDeviceCount(); ++dev_id) { for (int dev_id = 0; dev_id < platform::GetMLUDeviceCount(); ++dev_id) {
InitNaiveBestFitMLUAllocator(platform::MLUPlace(dev_id)); InitNaiveBestFitMLUAllocator(platform::MLUPlace(dev_id));
...@@ -254,12 +244,6 @@ class AllocatorFacadePrivate { ...@@ -254,12 +244,6 @@ class AllocatorFacadePrivate {
InitNaiveBestFitCUDAPinnedAllocator(); InitNaiveBestFitCUDAPinnedAllocator();
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
for (int dev_id = 0; dev_id < platform::GetNPUDeviceCount(); ++dev_id) {
InitNaiveBestFitNPUAllocator(platform::NPUPlace(dev_id));
}
InitNaiveBestFitNPUPinnedAllocator();
#endif
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) { for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) {
InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id)); InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
...@@ -823,17 +807,6 @@ class AllocatorFacadePrivate { ...@@ -823,17 +807,6 @@ class AllocatorFacadePrivate {
} }
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
void InitNaiveBestFitNPUAllocator(platform::NPUPlace p) {
allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
}
void InitNaiveBestFitNPUPinnedAllocator() {
allocators_[platform::NPUPinnedPlace()] =
std::make_shared<paddle::memory::allocation::NPUPinnedAllocator>();
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE #ifdef PADDLE_WITH_CUSTOM_DEVICE
void InitNaiveBestFitCustomDeviceAllocator(platform::CustomPlace p) { void InitNaiveBestFitCustomDeviceAllocator(platform::CustomPlace p) {
allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p); allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
...@@ -915,12 +888,6 @@ class AllocatorFacadePrivate { ...@@ -915,12 +888,6 @@ class AllocatorFacadePrivate {
places.emplace_back(platform::XPUPlace(dev_id)); places.emplace_back(platform::XPUPlace(dev_id));
} }
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
int device_count = platform::GetNPUDeviceCount();
for (int dev_id = 0; dev_id < device_count; ++dev_id) {
places.emplace_back(platform::NPUPlace(dev_id));
}
#endif
#ifdef PADDLE_WITH_IPU #ifdef PADDLE_WITH_IPU
int device_count = platform::GetIPUDeviceCount(); int device_count = platform::GetIPUDeviceCount();
for (int dev_id = 0; dev_id < device_count; ++dev_id) { for (int dev_id = 0; dev_id < device_count; ++dev_id) {
...@@ -1107,7 +1074,7 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, ...@@ -1107,7 +1074,7 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place,
} else { } else {
return m->GetAllocator(p, size)->Allocate(size); return m->GetAllocator(p, size)->Allocate(size);
} }
#elif defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_ASCEND_CL) #elif defined(PADDLE_WITH_XPU)
return GetAllocator(place)->Allocate(size); return GetAllocator(place)->Allocate(size);
#else #else
PADDLE_THROW(platform::errors::PreconditionNotMet( PADDLE_THROW(platform::errors::PreconditionNotMet(
......
...@@ -16,9 +16,6 @@ ...@@ -16,9 +16,6 @@
#include <memory> #include <memory>
#include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/memory/allocation/allocator.h"
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h"
#endif
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h"
#endif #endif
...@@ -29,10 +26,6 @@ namespace paddle { ...@@ -29,10 +26,6 @@ namespace paddle {
namespace memory { namespace memory {
namespace allocation { namespace allocation {
#ifdef PADDLE_WITH_ASCEND_CL
using NPUPinnedAllocator = paddle::memory::allocation::NPUPinnedAllocator;
#endif
// Allocator Facade is the interface exposed to other modules. // Allocator Facade is the interface exposed to other modules.
// All the configuration or dirty code under development should // All the configuration or dirty code under development should
// be hidden behind this facade. // be hidden behind this facade.
......
...@@ -19,8 +19,7 @@ limitations under the License. */ ...@@ -19,8 +19,7 @@ limitations under the License. */
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "glog/logging.h" #include "glog/logging.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
defined(PADDLE_WITH_MLU) || defined(PADDLE_WITH_ASCEND_CL)
#define USE_DEVICE #define USE_DEVICE
DECLARE_uint64(reallocate_gpu_memory_in_mb); DECLARE_uint64(reallocate_gpu_memory_in_mb);
#endif #endif
...@@ -57,9 +56,6 @@ BuddyAllocator::BuddyAllocator( ...@@ -57,9 +56,6 @@ BuddyAllocator::BuddyAllocator(
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
init_allocate_size_func_ = &platform::GpuInitAllocSize; init_allocate_size_func_ = &platform::GpuInitAllocSize;
re_allocate_size_func_ = &platform::GpuReallocSize; re_allocate_size_func_ = &platform::GpuReallocSize;
#elif defined(PADDLE_WITH_ASCEND_CL)
init_allocate_size_func_ = &platform::NPUInitAllocSize;
re_allocate_size_func_ = &platform::NPUReallocSize;
#elif defined(PADDLE_WITH_MLU) #elif defined(PADDLE_WITH_MLU)
init_allocate_size_func_ = &platform::MLUInitAllocSize; init_allocate_size_func_ = &platform::MLUInitAllocSize;
re_allocate_size_func_ = &platform::MLUReallocSize; re_allocate_size_func_ = &platform::MLUReallocSize;
...@@ -257,9 +253,6 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool( ...@@ -257,9 +253,6 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool(
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
allocate_bytes = DeviceAllocateSize( allocate_bytes = DeviceAllocateSize(
&platform::GpuInitAllocSize, &platform::GpuReallocSize, request_bytes); &platform::GpuInitAllocSize, &platform::GpuReallocSize, request_bytes);
#elif defined(PADDLE_WITH_ASCEND_CL)
allocate_bytes = DeviceAllocateSize(
&platform::NPUInitAllocSize, &platform::NPUReallocSize, request_bytes);
#elif defined(PADDLE_WITH_MLU) #elif defined(PADDLE_WITH_MLU)
allocate_bytes = DeviceAllocateSize( allocate_bytes = DeviceAllocateSize(
&platform::MLUInitAllocSize, &platform::MLUReallocSize, request_bytes); &platform::MLUInitAllocSize, &platform::MLUReallocSize, request_bytes);
......
...@@ -29,8 +29,7 @@ limitations under the License. */ ...@@ -29,8 +29,7 @@ limitations under the License. */
#include "paddle/fluid/platform/device/mlu/mlu_info.h" #include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif #endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU)
DECLARE_double(fraction_of_gpu_memory_to_use); DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_uint64(initial_gpu_memory_in_mb); DECLARE_uint64(initial_gpu_memory_in_mb);
DECLARE_uint64(reallocate_gpu_memory_in_mb); DECLARE_uint64(reallocate_gpu_memory_in_mb);
...@@ -396,34 +395,6 @@ TEST(BuddyAllocator, Release) { ...@@ -396,34 +395,6 @@ TEST(BuddyAllocator, Release) {
} }
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
TEST(BuddyAllocator, NpuFraction) {
// In a 16 GB machine, the pool size will be about 160 MB
FLAGS_fraction_of_gpu_memory_to_use = 0.92;
FLAGS_initial_gpu_memory_in_mb = 0;
FLAGS_reallocate_gpu_memory_in_mb = 0;
BuddyAllocator buddy_allocator(
std::unique_ptr<SystemAllocator>(new NPUAllocator(0)),
platform::NPUMinChunkSize(),
platform::NPUMaxChunkSize());
// Less than pool size
TestBuddyAllocator(&buddy_allocator, 10);
TestBuddyAllocator(&buddy_allocator, 10 << 10);
TestBuddyAllocator(&buddy_allocator, 10 << 20);
buddy_allocator.Release();
// Greater than max chunk size
TestBuddyAllocator(&buddy_allocator,
300 << 20,
/* use_system_allocator = */ true);
TestBuddyAllocator(&buddy_allocator,
1 * static_cast<size_t>(1 << 30),
/* use_system_allocator = */ true);
}
#endif
#ifdef PADDLE_WITH_MLU #ifdef PADDLE_WITH_MLU
TEST(BuddyAllocator, MluFraction) { TEST(BuddyAllocator, MluFraction) {
// In a 16 GB machine, the pool size will be about 160 MB // In a 16 GB machine, the pool size will be about 160 MB
......
...@@ -213,210 +213,6 @@ size_t Used<platform::XPUPlace>(const platform::XPUPlace &place) { ...@@ -213,210 +213,6 @@ size_t Used<platform::XPUPlace>(const platform::XPUPlace &place) {
#endif #endif
} }
// For Ascend NPU
#ifdef PADDLE_WITH_ASCEND_CL
constexpr int EXTRA_PADDING_SIZE = 32;
class NPUBuddyAllocatorList {
private:
NPUBuddyAllocatorList() : devices_(platform::GetSelectedNPUDevices()) {
auto npu_num = devices_.size();
allocators_.resize(npu_num);
init_flags_.reserve(npu_num);
for (size_t i = 0; i < npu_num; ++i) {
init_flags_.emplace_back(new std::once_flag());
}
}
static NPUBuddyAllocatorList *CreateNewInstance() {
return new NPUBuddyAllocatorList();
}
public:
static NPUBuddyAllocatorList *Instance() {
static auto *instance = CreateNewInstance();
return instance;
}
BuddyAllocator *Get(int npu_id) {
auto pos = std::distance(
devices_.begin(), std::find(devices_.begin(), devices_.end(), npu_id));
PADDLE_ENFORCE_LT(pos,
devices_.size(),
platform::errors::OutOfRange(
"The index exceeds the size of devices, the size of "
"devices is %d, the index is %d",
devices_.size(),
pos));
std::call_once(*init_flags_[pos], [this, pos] {
platform::SetNPUDeviceId(devices_[pos]);
allocators_[pos].reset(
new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
new detail::NPUAllocator(devices_[pos])),
platform::NPUMinChunkSize(),
platform::NPUMaxChunkSize(),
EXTRA_PADDING_SIZE));
VLOG(10) << "\n\nNOTE:\n"
<< "You can set GFlags environment variable "
<< "'FLAGS_fraction_of_gpu_memory_to_use' "
<< "or 'FLAGS_initial_gpu_memory_in_mb' "
<< "or 'FLAGS_reallocate_gpu_memory_in_mb' "
<< "to change the memory size for GPU usage.\n"
<< "Current 'FLAGS_fraction_of_gpu_memory_to_use' value is "
<< FLAGS_fraction_of_gpu_memory_to_use
<< ". Current 'FLAGS_initial_gpu_memory_in_mb' value is "
<< FLAGS_initial_gpu_memory_in_mb
<< ". Current 'FLAGS_reallocate_gpu_memory_in_mb' value is "
<< FLAGS_reallocate_gpu_memory_in_mb << "\n\n";
});
return allocators_[pos].get();
}
private:
std::vector<int> devices_;
std::vector<std::unique_ptr<std::once_flag>> init_flags_;
std::vector<std::unique_ptr<BuddyAllocator>> allocators_;
};
BuddyAllocator *GetNPUBuddyAllocator(int npu_id) {
return NPUBuddyAllocatorList::Instance()->Get(npu_id);
}
BuddyAllocator *GetNPUPinnedBuddyAllocator() {
static std::once_flag init_flag;
static BuddyAllocator *ba = nullptr;
std::call_once(init_flag, []() {
ba = new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
new detail::NPUPinnedAllocator),
phi::backends::cpu::NPUPinnedMinChunkSize(),
phi::backends::cpu::NPUPinnedMaxChunkSize());
});
return ba;
}
#endif
template <>
size_t Used<platform::NPUPlace>(const platform::NPUPlace &place) {
#ifdef PADDLE_WITH_ASCEND_CL
return GetNPUBuddyAllocator(place.device)->Used();
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"'NPUPlace' is not supported in CPU only device."));
#endif
}
template <>
void *Alloc<platform::NPUPlace>(const platform::NPUPlace &place, size_t size) {
#ifdef PADDLE_WITH_ASCEND_CL
auto *buddy_allocator = GetNPUBuddyAllocator(place.device);
auto *ptr = buddy_allocator->Alloc(size);
if (ptr == nullptr) {
platform::NPUDeviceGuard(place.device);
size_t avail, total;
platform::NPUMemoryUsage(&avail, &total);
PADDLE_THROW(platform::errors::ResourceExhausted(
"Cannot allocate %s in NPU %d, avaliable %s, total %s, NpuMinChunkSize "
"%s, NpuMaxChunkSize %s, NPU memory used: %s.",
string::HumanReadableSize(size),
place.device,
string::HumanReadableSize(avail),
string::HumanReadableSize(total),
string::HumanReadableSize(buddy_allocator->GetMinChunkSize()),
string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()),
string::HumanReadableSize(Used<platform::NPUPlace>(place))));
} else {
if (FLAGS_init_allocated_mem) {
platform::NPUMemsetSync(ptr, 0xEF, size, size);
}
}
VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
return ptr;
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"'NPUPlace' is not supported in CPU only device."));
#endif
}
template <>
void Free<platform::NPUPlace>(const platform::NPUPlace &place,
void *p,
size_t size) {
#ifdef PADDLE_WITH_ASCEND_CL
VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
GetNPUBuddyAllocator(place.device)->Free(p);
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"'NPUPlace' is not supported in CPU only device."));
#endif
}
template <>
uint64_t Release<platform::NPUPlace>(const platform::NPUPlace &place) {
#ifdef PADDLE_WITH_ASCEND_CL
return GetNPUBuddyAllocator(place.device)->Release();
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"'NPUPlace' is not supported in CPU only device."));
#endif
}
template <>
size_t Used<platform::NPUPinnedPlace>(const platform::NPUPinnedPlace &place) {
#ifdef PADDLE_WITH_ASCEND_CL
return GetNPUPinnedBuddyAllocator()->Used();
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"'NPUPinnedPlace' is not supported in CPU only device."));
#endif
}
template <>
void *Alloc<platform::NPUPinnedPlace>(const platform::NPUPinnedPlace &place,
size_t size) {
#ifdef PADDLE_WITH_ASCEND_CL
auto *buddy_allocator = GetNPUPinnedBuddyAllocator();
void *ptr = buddy_allocator->Alloc(size);
if (ptr == nullptr) {
LOG(WARNING) << "Cannot allocate " << size << " bytes in NPUPinnedPlace";
}
if (FLAGS_init_allocated_mem) {
memset(ptr, 0xEF, size);
}
return ptr;
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"'NPUPinnedPlace' is not supported in CPU only device."));
#endif
}
template <>
void Free<platform::NPUPinnedPlace>(const platform::NPUPinnedPlace &place,
void *p,
size_t size) {
#ifdef PADDLE_WITH_ASCEND_CL
GetNPUPinnedBuddyAllocator()->Free(p);
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"'NPUPinnedPlace' is not supported in CPU only device."));
#endif
}
template <>
uint64_t Release<platform::NPUPinnedPlace>(
const platform::NPUPinnedPlace &place) {
#ifdef PADDLE_WITH_ASCEND_CL
return GetNPUPinnedBuddyAllocator()->Release();
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"'NPUPinnedPlace' is not supported in CPU only device."));
#endif
}
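The NPUBuddyAllocatorList removed above is an instance of a common pattern: a process-wide singleton that lazily constructs one allocator per selected device, guarded by a std::once_flag per slot so concurrent first calls stay safe. A reduced, self-contained sketch of that pattern; the Allocator type and device ids below are placeholders, not Paddle's BuddyAllocator:

#include <cstdio>
#include <memory>
#include <mutex>
#include <utility>
#include <vector>

struct Allocator {
  explicit Allocator(int dev) { std::printf("init allocator for device %d\n", dev); }
};

class AllocatorList {
 public:
  static AllocatorList& Instance() {
    static AllocatorList instance({0, 1});  // selected device ids
    return instance;
  }

  Allocator* Get(size_t pos) {
    // Build the allocator for this slot exactly once, even under concurrent calls.
    std::call_once(*flags_[pos], [this, pos] {
      allocators_[pos] = std::make_unique<Allocator>(devices_[pos]);
    });
    return allocators_[pos].get();
  }

 private:
  explicit AllocatorList(std::vector<int> devices) : devices_(std::move(devices)) {
    allocators_.resize(devices_.size());
    for (size_t i = 0; i < devices_.size(); ++i) {
      flags_.emplace_back(new std::once_flag());
    }
  }

  std::vector<int> devices_;
  std::vector<std::unique_ptr<std::once_flag>> flags_;
  std::vector<std::unique_ptr<Allocator>> allocators_;
};

int main() {
  AllocatorList::Instance().Get(1);
  AllocatorList::Instance().Get(1);  // reuses the allocator built above
  return 0;
}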
// For CUDA // For CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
class GPUBuddyAllocatorList { class GPUBuddyAllocatorList {
......
...@@ -61,22 +61,6 @@ TEST(NaiveBestFitAllocatorTest, CudaPinnedAlloc) { ...@@ -61,22 +61,6 @@ TEST(NaiveBestFitAllocatorTest, CudaPinnedAlloc) {
} }
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
TEST(NaiveBestFitAllocatorTest, NpuAlloc) {
NaiveBestFitAllocator alloc{platform::NPUPlace(0)};
{
size_t size = (1 << 20);
auto allocation = alloc.Allocate(size);
}
sleep(10);
alloc.Release(platform::NPUPlace(0));
size_t size = (1 << 20);
auto allocation = alloc.Allocate(size);
alloc.Release(platform::NPUPlace(0));
}
#endif
#ifdef PADDLE_WITH_MLU #ifdef PADDLE_WITH_MLU
TEST(NaiveBestFitAllocatorTest, MluAlloc) { TEST(NaiveBestFitAllocatorTest, MluAlloc) {
NaiveBestFitAllocator alloc{platform::MLUPlace(0)}; NaiveBestFitAllocator alloc{platform::MLUPlace(0)};
......
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/memory/allocation/npu_allocator.h"
#include <string>
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace memory {
namespace allocation {
bool NPUAllocator::IsAllocThreadSafe() const { return true; }
void NPUAllocator::FreeImpl(phi::Allocation* allocation) {
PADDLE_ENFORCE_EQ(
allocation->place(),
place_,
platform::errors::PermissionDenied(
"NPU memory is freed in incorrect device. This may be a bug"));
platform::RecordedNPUFree(
allocation->ptr(), allocation->size(), place_.device);
delete allocation;
}
phi::Allocation* NPUAllocator::AllocateImpl(size_t size) {
std::call_once(once_flag_,
[this] { platform::SetNPUDeviceId(place_.device); });
void* ptr;
auto result = platform::RecordedNPUMalloc(&ptr, size, place_.device);
if (LIKELY(result == ACL_ERROR_NONE)) {
return new Allocation(ptr, size, platform::Place(place_));
}
size_t avail, total, actual_avail, actual_total;
bool is_limited = platform::RecordedNPUMemGetInfo(
&avail, &total, &actual_avail, &actual_total, place_.device);
std::string err_msg;
if (is_limited) {
auto limit_size = (total >> 20);
err_msg = string::Sprintf(
"Or set environment variable `FLAGS_gpu_memory_limit_mb` to a larger "
"value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the maximum "
"GPU memory usage is limited to %d MB.\n"
" The command is `export FLAGS_gpu_memory_limit_mb=xxx`.",
limit_size,
limit_size);
}
PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
"\n\nOut of memory error on NPU %d. "
"Cannot allocate %s memory on NPU %d, "
"available memory is only %s.\n\n"
"Please check whether there is any other process using NPU %d.\n"
"1. If yes, please stop them, or start PaddlePaddle on another NPU.\n"
"2. If no, please decrease the batch size of your model. %s\n\n",
place_.device,
string::HumanReadableSize(size),
place_.device,
string::HumanReadableSize(avail),
place_.device,
err_msg));
}
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <mutex> // NOLINT
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace memory {
namespace allocation {
class NPUAllocator : public Allocator {
public:
explicit NPUAllocator(const platform::NPUPlace& place) : place_(place) {}
bool IsAllocThreadSafe() const override;
protected:
void FreeImpl(phi::Allocation* allocation) override;
phi::Allocation* AllocateImpl(size_t size) override;
private:
platform::NPUPlace place_;
std::once_flag once_flag_;
};
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h"
namespace paddle {
namespace memory {
namespace allocation {
void NPUPinnedAllocator::ProcessEventsAndFree() {
for (auto it = npu_events_.begin(); it != npu_events_.end();) {
aclrtEvent event = it->second;
aclrtEventStatus status = ACL_EVENT_STATUS_COMPLETE;
platform::NPUEventQuery(event, &status);
if (status == ACL_EVENT_STATUS_COMPLETE) {
auto *allocation = it->first;
void *ptr = allocation->ptr();
free(ptr);
npu_events_.erase(it++);
delete allocation;
platform::NPUEventDestroy(event);
} else {
++it;
}
}
}
phi::Allocation *NPUPinnedAllocator::AllocateImpl(size_t size) {
std::lock_guard<std::mutex> lock(mtx_);
ProcessEventsAndFree();
void *ptr;
int error = posix_memalign(&ptr, kAlignment, size);
PADDLE_ENFORCE_EQ(
error,
0,
platform::errors::ResourceExhausted(
"Fail to alloc memory of %ld size, error code is %d.", size, error));
return new Allocation(ptr, size, platform::NPUPinnedPlace());
}
void NPUPinnedAllocator::FreeImpl(phi::Allocation *allocation) {
std::lock_guard<std::mutex> lock(mtx_);
void *ptr = allocation->ptr();
auto iter = npu_events_.find(allocation);
// Managed by GC if RecordEvent was not called.
if (iter == npu_events_.end()) {
// Double free? No such problem has been found so far.
// Or maybe we need a set<Allocation*> to record which
// Allocations are managed by GC.
free(ptr);
delete allocation;
return;
}
aclrtEvent event = iter->second;
aclrtEventStatus status = ACL_EVENT_STATUS_COMPLETE;
platform::NPUEventQuery(event, &status);
if (status == ACL_EVENT_STATUS_COMPLETE) {
free(ptr);
npu_events_.erase(allocation);
delete allocation;
platform::NPUEventDestroy(event);
}
return;
}
uint64_t NPUPinnedAllocator::ReleaseImpl(const platform::Place &place) {
std::lock_guard<std::mutex> lock(mtx_);
// Empty implementation
return static_cast<uint64_t>(0);
}
void NPUPinnedAllocator::RecordEvent(phi::Allocation *allocation,
aclrtStream stream) {
std::lock_guard<std::mutex> lock(mtx_);
aclrtEvent event = nullptr;
platform::NPUEventCreate(&event);
platform::NPUEventRecord(event, stream);
npu_events_.insert({allocation, event});
}
} // namespace allocation
} // namespace memory
} // namespace paddle
#endif
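For orientation: the NPUPinnedAllocator removed above defers freeing pinned host memory until the aclrtEvent recorded after the buffer's last asynchronous use has completed, and reaps completed buffers on the next allocation via ProcessEventsAndFree(). Below is a minimal, self-contained C++ sketch of that deferred-free idea only; the class and member names are hypothetical, and a std::atomic<bool> stands in for the ACL event.

#include <atomic>
#include <cstdlib>
#include <memory>
#include <unordered_map>

// Toy deferred-free pool: buffers handed back while an "event" is still
// pending stay alive until ProcessEventsAndFree() observes completion.
class DeferredFreePool {
 public:
  // Called after enqueueing async work that reads the buffer; the "stream"
  // flips the flag to true when that work finishes.
  std::shared_ptr<std::atomic<bool>> RecordEvent(void* ptr) {
    auto flag = std::make_shared<std::atomic<bool>>(false);
    events_[ptr] = flag;
    return flag;
  }

  // Called instead of free(): release immediately only if no event was
  // recorded, or if the recorded event has already completed.
  void Free(void* ptr) {
    auto it = events_.find(ptr);
    if (it == events_.end()) {
      std::free(ptr);
      return;
    }
    if (it->second->load()) {
      std::free(ptr);
      events_.erase(it);
    }
    // Otherwise the buffer is reaped later by ProcessEventsAndFree().
  }

  // Called on every allocation to reap buffers whose events have completed
  // since the last call.
  void ProcessEventsAndFree() {
    for (auto it = events_.begin(); it != events_.end();) {
      if (it->second->load()) {
        std::free(it->first);
        it = events_.erase(it);
      } else {
        ++it;
      }
    }
  }

 private:
  std::unordered_map<void*, std::shared_ptr<std::atomic<bool>>> events_;
};

int main() {
  DeferredFreePool pool;
  void* buf = std::malloc(256);
  auto done = pool.RecordEvent(buf);  // async copy from buf is enqueued
  pool.Free(buf);                     // not released yet: event still pending
  done->store(true);                  // the stream signals completion
  pool.ProcessEventsAndFree();        // now the buffer is actually freed
  return 0;
}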
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifdef PADDLE_WITH_ASCEND_CL
#include <mutex> // NOLINT
#include <string>
#include <unordered_map>
#include "acl/acl.h"
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace memory {
namespace allocation {
class NPUPinnedAllocator : public Allocator {
public:
bool IsAllocThreadSafe() const override { return true; }
void ProcessEventsAndFree();
void RecordEvent(phi::Allocation *allocation, aclrtStream stream);
constexpr static size_t kAlignment = 4096UL;
protected:
phi::Allocation *AllocateImpl(size_t size) override;
void FreeImpl(phi::Allocation *allocation) override;
uint64_t ReleaseImpl(const platform::Place &place) override;
private:
std::unordered_map<phi::Allocation *, aclrtEvent> npu_events_;
mutable std::mutex mtx_;
};
} // namespace allocation
} // namespace memory
} // namespace paddle
#endif
...@@ -287,135 +287,6 @@ bool CUDAPinnedAllocator::UseGpu() const { return false; } ...@@ -287,135 +287,6 @@ bool CUDAPinnedAllocator::UseGpu() const { return false; }
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
void* NPUAllocator::Alloc(size_t* index, size_t size) {
if (size <= 0) return nullptr;
void* p;
auto result = platform::RecordedNPUMalloc(&p, size, npu_id_);
if (result == ACL_ERROR_NONE) {
*index = 0;
npu_alloc_size_ += size;
return p;
} else {
size_t avail, total, actual_avail, actual_total;
bool is_limited = platform::RecordedNPUMemGetInfo(
&avail, &total, &actual_avail, &actual_total, npu_id_);
std::string err_msg;
if (is_limited) {
auto limit_size = (total >> 20);
err_msg = string::Sprintf(
"\n 3) Set environment variable `FLAGS_gpu_memory_limit_mb` to a "
"larger value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the "
"maximum GPU memory usage is limited to %d MB.\n"
" The command is `export FLAGS_gpu_memory_limit_mb=xxx`.",
limit_size,
limit_size);
}
PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
"\n\nOut of memory error on NPU %d. "
"Cannot allocate %s memory on NPU %d, "
"available memory is only %s.\n\n"
"Please check whether there is any other process using NPU %d.\n"
"1. If yes, please stop them, or start PaddlePaddle on another NPU.\n"
"2. If no, please try one of the following suggestions:\n"
" 1) Decrease the batch size of your model.\n"
" 2) FLAGS_fraction_of_gpu_memory_to_use is %.2lf now, "
"please set it to a higher value but less than 1.0.\n"
" The command is "
"`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n",
npu_id_,
string::HumanReadableSize(size),
npu_id_,
string::HumanReadableSize(avail),
npu_id_,
FLAGS_fraction_of_gpu_memory_to_use,
err_msg));
}
}
void NPUAllocator::Free(void* p, size_t size, size_t index) {
VLOG(4) << "Free " << p << " size " << size;
PADDLE_ENFORCE_EQ(index,
0,
platform::errors::InvalidArgument(
"The index should be 0, index is %d", index));
PADDLE_ENFORCE_GE(npu_alloc_size_,
size,
platform::errors::InvalidArgument(
"The size of memory (%d) to free exceeds the size of "
"allocated gpu memory (%d)",
size,
npu_alloc_size_));
npu_alloc_size_ -= size;
platform::RecordedNPUFree(p, size, npu_id_);
}
bool NPUAllocator::UseGpu() const { return true; }
void* NPUPinnedAllocator::Alloc(size_t* index, size_t size) {
if (size <= 0) return nullptr;
size_t usable =
phi::backends::cpu::NPUPinnedMaxAllocSize() - npu_pinnd_alloc_size_;
if (size > usable) {
LOG(WARNING) << "Cannot malloc " << size / 1024.0 / 1024.0
<< " MB pinned memory."
<< ", available " << usable / 1024.0 / 1024.0 << " MB";
return nullptr;
}
void* p;
// PINNED memory is visible to all NPU contexts.
auto result = platform::NPUHostMalloc(&p, size);
if (result == ACL_ERROR_NONE) {
*index = 1; // PINNED memory
npu_pinnd_alloc_size_ += size;
return p;
} else {
LOG(WARNING) << "NPUHostMalloc failed.";
return nullptr;
}
return nullptr;
}
void NPUPinnedAllocator::Free(void* p, size_t size, size_t index) {
aclError err;
PADDLE_ENFORCE_EQ(index,
1,
platform::errors::InvalidArgument(
"The index should be 1, but got %d", index));
PADDLE_ENFORCE_GE(npu_pinnd_alloc_size_,
size,
platform::errors::InvalidArgument(
"The size of memory (%d) to free exceeds the size of "
"allocated npu pinned memory (%d)",
size,
npu_pinnd_alloc_size_));
npu_pinnd_alloc_size_ -= size;
err = platform::NPUHostFree(p);
if (err != ACL_ERROR_NONE) {
PADDLE_ENFORCE_EQ(
err,
0,
platform::errors::Fatal(
"NPUHostFree failed in NPUPinnedAllocator, error code is %d", err));
}
}
bool NPUPinnedAllocator::UseGpu() const { return false; }
#endif
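For orientation: the removed NPUAllocator and NPUPinnedAllocator above follow the legacy SystemAllocator contract, in which Alloc() reports through *index which pool a block came from (0 for device memory, 1 for pinned host memory) and Free() checks both the index and the accounted size. A minimal, self-contained sketch of that contract follows; the toy class name is hypothetical and std::malloc stands in for the device runtime.

#include <cassert>
#include <cstddef>
#include <cstdlib>

// Toy allocator following the same (index, size) contract as the removed
// NPUAllocator: *index tells Free() which pool a block came from, and the
// running total guards against freeing more than was allocated.
class ToySystemAllocator {
 public:
  void* Alloc(std::size_t* index, std::size_t size) {
    if (size == 0) return nullptr;
    *index = 0;  // 0 = device pool in the removed code (1 = pinned host pool)
    alloc_size_ += size;
    return std::malloc(size);  // stand-in for RecordedNPUMalloc
  }

  void Free(void* p, std::size_t size, std::size_t index) {
    assert(index == 0);           // mirrors PADDLE_ENFORCE_EQ(index, 0, ...)
    assert(alloc_size_ >= size);  // mirrors PADDLE_ENFORCE_GE(...)
    alloc_size_ -= size;
    std::free(p);  // stand-in for RecordedNPUFree
  }

 private:
  std::size_t alloc_size_ = 0;
};

int main() {
  ToySystemAllocator a;
  std::size_t index = 0;
  void* p = a.Alloc(&index, 1 << 20);
  a.Free(p, 1 << 20, index);
  return 0;
}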
#ifdef PADDLE_WITH_MLU #ifdef PADDLE_WITH_MLU
void* MLUAllocator::Alloc(size_t* index, size_t size) { void* MLUAllocator::Alloc(size_t* index, size_t size) {
if (size <= 0) return nullptr; if (size <= 0) return nullptr;
......
...@@ -68,32 +68,6 @@ class CUDAPinnedAllocator : public SystemAllocator { ...@@ -68,32 +68,6 @@ class CUDAPinnedAllocator : public SystemAllocator {
}; };
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
class NPUAllocator : public SystemAllocator {
public:
explicit NPUAllocator(int npu_id) : npu_id_(npu_id) {}
virtual void* Alloc(size_t* index, size_t size);
virtual void Free(void* p, size_t size, size_t index);
virtual bool UseGpu() const;
private:
size_t npu_alloc_size_ = 0;
int npu_id_;
};
class NPUPinnedAllocator : public SystemAllocator {
public:
virtual void* Alloc(size_t* index, size_t size);
virtual void Free(void* p, size_t size, size_t index);
virtual bool UseGpu() const;
private:
size_t npu_pinnd_alloc_size_ = 0;
};
#endif
#ifdef PADDLE_WITH_MLU #ifdef PADDLE_WITH_MLU
class MLUAllocator : public SystemAllocator { class MLUAllocator : public SystemAllocator {
public: public:
......
...@@ -83,14 +83,6 @@ TEST(GPUAllocator, AllocFailure) { ...@@ -83,14 +83,6 @@ TEST(GPUAllocator, AllocFailure) {
} }
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
TEST(NPUAllocator, Alloc) {
paddle::memory::detail::NPUAllocator a(0);
TestAllocator(&a, 1 << 20);
TestAllocator(&a, 1);
}
#endif
#ifdef PADDLE_WITH_MLU #ifdef PADDLE_WITH_MLU
TEST(MLUAllocator, Alloc) { TEST(MLUAllocator, Alloc) {
paddle::memory::detail::MLUAllocator a(0); paddle::memory::detail::MLUAllocator a(0);
......
...@@ -260,415 +260,6 @@ void Copy<phi::Place, phi::XPUPlace>(phi::Place dst_place, ...@@ -260,415 +260,6 @@ void Copy<phi::Place, phi::XPUPlace>(phi::Place dst_place,
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
template <>
void Copy<platform::NPUPlace, platform::CPUPlace>(platform::NPUPlace dst_place,
void* dst,
platform::CPUPlace src_place,
const void* src,
size_t num,
void* stream) {
if (UNLIKELY(num == 0)) return;
platform::SetNPUDeviceId(dst_place.device);
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
<< dst_place << " by thream(" << stream << ")";
if (stream) {
platform::RecordEvent record_event(
"NpuMemcpyAsync:CPU->NPU", platform::TracerEventType::UserDefined, 1);
platform::NPUMemcpyAsync(dst,
src,
num,
ACL_MEMCPY_HOST_TO_DEVICE,
reinterpret_cast<aclrtStream>(stream));
} else {
// On NPU, an async operation after a sync operation is OK, while a sync
// operation after an async one is not, since the async operation may not
// be done yet. So we need to wait before the sync operation.
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
static_cast<platform::NPUDeviceContext*>(pool.Get(dst_place))->Wait();
platform::RecordEvent record_event(
"NpuMemcpySync:CPU->NPU", platform::TracerEventType::UserDefined, 1);
platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE);
}
}
template <>
void Copy<platform::CPUPlace, platform::NPUPlace>(platform::CPUPlace dst_place,
void* dst,
platform::NPUPlace src_place,
const void* src,
size_t num,
void* stream) {
if (UNLIKELY(num == 0)) return;
platform::SetNPUDeviceId(src_place.device);
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
<< dst_place << " by thream(" << stream << ")";
if (stream) {
platform::RecordEvent record_event(
"NpuMemcpyAsync:NPU->CPU", platform::TracerEventType::UserDefined, 1);
platform::NPUMemcpyAsync(dst,
src,
num,
ACL_MEMCPY_DEVICE_TO_HOST,
reinterpret_cast<aclrtStream>(stream));
} else {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
static_cast<platform::NPUDeviceContext*>(pool.Get(src_place))->Wait();
platform::RecordEvent record_event(
"NpuMemcpySync:NPU->CPU", platform::TracerEventType::UserDefined, 1);
platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST);
}
}
template <>
void Copy<platform::NPUPlace, platform::NPUPlace>(platform::NPUPlace dst_place,
void* dst,
platform::NPUPlace src_place,
const void* src,
size_t num,
void* stream) {
if (UNLIKELY(num == 0)) return;
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
<< dst_place << " by stream(" << stream << ")";
if (dst_place == src_place) {
platform::SetNPUDeviceId(src_place.device);
if (stream) {
platform::RecordEvent record_event("NpuMemcpyAsync(same_npu):NPU->NPU",
platform::TracerEventType::UserDefined,
1);
platform::NPUMemcpyAsync(dst,
src,
num,
ACL_MEMCPY_DEVICE_TO_DEVICE,
reinterpret_cast<aclrtStream>(stream));
} else {
platform::DeviceContextPool& pool =
platform::DeviceContextPool::Instance();
static_cast<platform::NPUDeviceContext*>(pool.Get(dst_place))->Wait();
platform::RecordEvent record_event("NpuMemcpySync(same_npu):NPU->NPU",
platform::TracerEventType::UserDefined,
1);
platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE);
}
} else {
if (!platform::NPUCanAccessPeer(dst_place.device, src_place.device)) {
PADDLE_THROW(platform::errors::Unavailable(
"Peer access between NPU places is not allowed."));
}
if (stream) {
// TODO(zhiqiu): support peer access?
platform::RecordEvent record_event("NpuMemcpyPeerAsync:NPU->NPU",
platform::TracerEventType::UserDefined,
1);
platform::NPUMemcpyAsync(dst,
src,
num,
ACL_MEMCPY_DEVICE_TO_DEVICE,
reinterpret_cast<aclrtStream>(stream));
} else {
platform::DeviceContextPool& pool =
platform::DeviceContextPool::Instance();
static_cast<platform::NPUDeviceContext*>(pool.Get(dst_place))->Wait();
platform::RecordEvent record_event("NpuMemcpyPeerSync:NPU->NPU",
platform::TracerEventType::UserDefined,
1);
platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE);
}
}
}
template <>
void Copy<platform::CPUPlace, platform::NPUPinnedPlace>(
platform::CPUPlace dst_place,
void* dst,
platform::NPUPinnedPlace src_place,
const void* src,
size_t num) {
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
<< dst_place;
if (UNLIKELY(num == 0)) return;
std::memcpy(dst, src, num);
}
template <>
void Copy<platform::NPUPinnedPlace, platform::CPUPlace>(
platform::NPUPinnedPlace dst_place,
void* dst,
platform::CPUPlace src_place,
const void* src,
size_t num) {
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
<< dst_place;
if (UNLIKELY(num == 0)) return;
std::memcpy(dst, src, num);
}
template <>
void Copy<platform::NPUPinnedPlace, platform::NPUPinnedPlace>(
platform::NPUPinnedPlace dst_place,
void* dst,
platform::NPUPinnedPlace src_place,
const void* src,
size_t num) {
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
<< dst_place;
if (UNLIKELY(num == 0)) return;
std::memcpy(dst, src, num);
}
template <>
void Copy<platform::NPUPinnedPlace, platform::NPUPlace>(
platform::NPUPinnedPlace dst_place,
void* dst,
platform::NPUPlace src_place,
const void* src,
size_t num,
void* stream) {
if (UNLIKELY(num == 0)) return;
platform::SetNPUDeviceId(src_place.device);
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
<< dst_place << " by thream(" << stream << ")";
if (stream) {
platform::RecordEvent record_event("NpuMemcpyAsync:NPU->NPUPinned",
platform::TracerEventType::UserDefined,
1);
platform::NPUMemcpyAsync(dst,
src,
num,
ACL_MEMCPY_DEVICE_TO_HOST,
reinterpret_cast<aclrtStream>(stream));
} else {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
static_cast<platform::NPUDeviceContext*>(pool.Get(src_place))->Wait();
platform::RecordEvent record_event("NpuMemcpySync:NPU->NPUPinned",
platform::TracerEventType::UserDefined,
1);
platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST);
}
}
template <>
void Copy<platform::NPUPlace, platform::NPUPinnedPlace>(
platform::NPUPlace dst_place,
void* dst,
platform::NPUPinnedPlace src_place,
const void* src,
size_t num,
void* stream) {
if (UNLIKELY(num == 0)) return;
platform::SetNPUDeviceId(dst_place.device);
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
<< dst_place << " by thream(" << stream << ")";
if (stream) {
platform::RecordEvent record_event("NpuMemcpyAsync:NPUPinned->NPU",
platform::TracerEventType::UserDefined,
1);
platform::NPUMemcpyAsync(dst,
src,
num,
ACL_MEMCPY_HOST_TO_DEVICE,
reinterpret_cast<aclrtStream>(stream));
} else {
// On NPU, an async operation after a sync operation is OK, while a sync
// operation after an async one is not, since the async operation may not
// be done yet. So we need to wait before the sync operation.
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
static_cast<platform::NPUDeviceContext*>(pool.Get(dst_place))->Wait();
platform::RecordEvent record_event("NpuMemcpySync:NPUPinned->NPU",
platform::TracerEventType::UserDefined,
1);
platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE);
}
}
// NOTE: only for CPUPlace, NPUPlace and NPUPinnedPlace.
template <>
void Copy<phi::Place, phi::Place>(phi::Place dst_place,
void* dst,
phi::Place src_place,
const void* src,
size_t num,
aclrtStream stream) {
if (src_place.GetType() == phi::AllocationType::CPU &&
dst_place.GetType() == phi::AllocationType::CPU) {
platform::CPUPlace place_dst, place_src;
return Copy(place_dst, dst, place_src, src, num);
} else if (src_place.GetType() == phi::AllocationType::CPU &&
dst_place.GetType() == phi::AllocationType::NPU) {
platform::NPUPlace place_dst(dst_place.GetDeviceId());
platform::CPUPlace place_src;
return Copy(place_dst, dst, place_src, src, num, stream);
} else if (src_place.GetType() == phi::AllocationType::NPU &&
dst_place.GetType() == phi::AllocationType::CPU) {
platform::NPUPlace place_src(src_place.GetDeviceId());
platform::CPUPlace place_dst;
return Copy(place_dst, dst, place_src, src, num, stream);
} else if (src_place.GetType() == phi::AllocationType::NPU &&
dst_place.GetType() == phi::AllocationType::NPU) {
platform::NPUPlace place_src(src_place.GetDeviceId());
platform::NPUPlace place_dst(dst_place.GetDeviceId());
return Copy(place_dst, dst, place_src, src, num, stream);
} else if (src_place.GetType() == phi::AllocationType::CPU &&
dst_place.GetType() == phi::AllocationType::NPUPINNED) {
platform::CPUPlace place_src;
platform::NPUPinnedPlace place_dst;
return Copy(place_dst, dst, place_src, src, num);
} else if (src_place.GetType() == phi::AllocationType::NPUPINNED &&
dst_place.GetType() == phi::AllocationType::CPU) {
platform::CPUPlace place_dst;
platform::NPUPinnedPlace place_src;
return Copy(place_dst, dst, place_src, src, num);
} else if (src_place.GetType() == phi::AllocationType::NPUPINNED &&
dst_place.GetType() == phi::AllocationType::NPUPINNED) {
platform::NPUPinnedPlace place_dst;
platform::NPUPinnedPlace place_src;
return Copy(place_dst, dst, place_src, src, num);
} else if (src_place.GetType() == phi::AllocationType::NPUPINNED &&
dst_place.GetType() == phi::AllocationType::NPU) {
platform::NPUPinnedPlace place_src;
platform::NPUPlace place_dst(dst_place.GetDeviceId());
return Copy(place_dst, dst, place_src, src, num, stream);
} else if (src_place.GetType() == phi::AllocationType::NPU &&
dst_place.GetType() == phi::AllocationType::NPUPINNED) {
platform::NPUPinnedPlace place_dst;
platform::NPUPlace place_src(src_place.GetDeviceId());
return Copy(place_dst, dst, place_src, src, num, stream);
#ifdef PADDLE_WITH_CUSTOM_DEVICE
} else if (src_place.GetType() == phi::AllocationType::CPU && // NOLINT
dst_place.GetType() == phi::AllocationType::CUSTOM) {
platform::CPUPlace place_src;
platform::CustomPlace place_dst(dst_place);
return Copy(place_dst, dst, place_src, src, num, stream);
} else if (src_place.GetType() == phi::AllocationType::CUSTOM && // NOLINT
dst_place.GetType() == phi::AllocationType::CPU) {
platform::CustomPlace place_src(src_place);
platform::CPUPlace place_dst;
return Copy(place_dst, dst, place_src, src, num, stream);
} else if (src_place.GetType() == phi::AllocationType::CUSTOM && // NOLINT
dst_place.GetType() == phi::AllocationType::CUSTOM) {
platform::CustomPlace place_src(src_place);
platform::CustomPlace place_dst(dst_place);
return Copy(place_dst, dst, place_src, src, num, stream);
#endif
}
}
// NOTE: only for (CPUPlace, NPUPlace and NPUPinnedPlace) -> (CPUPlace).
template <>
void Copy<phi::CPUPlace, phi::Place>(phi::CPUPlace dst_place,
void* dst,
phi::Place src_place,
const void* src,
size_t num,
aclrtStream stream) {
Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, stream);
}
// NOTE: only for (CPUPlace) -> (CPUPlace, NPUPlace and NPUPinnedPlace).
template <>
void Copy<phi::Place, phi::CPUPlace>(phi::Place dst_place,
void* dst,
phi::CPUPlace src_place,
const void* src,
size_t num,
aclrtStream stream) {
Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, stream);
}
// NOTE: only for (CPUPlace, NPUPlace and NPUPinnedPlace) -> (NPUPlace)
template <>
void Copy<phi::NPUPlace, phi::Place>(phi::NPUPlace dst_place,
void* dst,
phi::Place src_place,
const void* src,
size_t num,
aclrtStream stream) {
Copy(phi::Place(dst_place.GetType(), dst_place.GetDeviceId()),
dst,
src_place,
src,
num,
stream);
}
// NOTE: only for (NPUPlace) -> (CPUPlace, NPUPlace and NPUPinnedPlace)
template <>
void Copy<phi::Place, phi::NPUPlace>(phi::Place dst_place,
void* dst,
phi::NPUPlace src_place,
const void* src,
size_t num,
aclrtStream stream) {
Copy(dst_place,
dst,
phi::Place(src_place.GetType(), src_place.GetDeviceId()),
src,
num,
stream);
}
// NOTE: only for (CPUPlace, NPUPlace and NPUPinnedPlace) -> (NPUPinnedPlace)
template <>
void Copy<phi::NPUPinnedPlace, phi::Place>(phi::NPUPinnedPlace dst_place,
void* dst,
phi::Place src_place,
const void* src,
size_t num,
aclrtStream stream) {
Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, stream);
}
// NOTE: only for (NPUPinnedPlace) -> (CPUPlace, NPUPlace and NPUPinnedPlace)
template <>
void Copy<phi::Place, phi::NPUPinnedPlace>(phi::Place dst_place,
void* dst,
phi::NPUPinnedPlace src_place,
const void* src,
size_t num,
aclrtStream stream) {
Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, stream);
}
// NOTE: only for (CPUPlace) -> (NPUPinnedPlace)
template <>
void Copy<phi::NPUPinnedPlace, phi::Place>(phi::NPUPinnedPlace dst_place,
void* dst,
phi::Place src_place,
const void* src,
size_t num) {
Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, nullptr);
}
// NOTE: only for (NPUPinnedPlace) -> (CPUPlace)
template <>
void Copy<phi::Place, phi::NPUPinnedPlace>(phi::Place dst_place,
void* dst,
phi::NPUPinnedPlace src_place,
const void* src,
size_t num) {
Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, nullptr);
}
#endif
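For orientation: several of the removed Copy<> specializations above call dev_ctx->Wait() before a synchronous memcpy because, as their comments note, a sync operation issued after pending async work may read incomplete data. A minimal, self-contained analogy of that rule follows; a std::thread stands in for work enqueued on the device stream, and no Paddle or ACL APIs are used.

#include <algorithm>
#include <cassert>
#include <cstring>
#include <thread>
#include <vector>

int main() {
  std::vector<int> device(1024, 0);  // stand-in for a device buffer
  std::vector<int> host(1024, 0);

  // "Async" work still in flight on the stream.
  std::thread pending([&] { std::fill(device.begin(), device.end(), 42); });

  // A synchronous copy issued right now could observe partially written data,
  // which is why the removed code called dev_ctx->Wait() first; join() plays
  // that role here.
  pending.join();                          // dev_ctx->Wait()
  std::memcpy(host.data(), device.data(),  // NPUMemcpySync(...)
              host.size() * sizeof(int));
  assert(host[0] == 42);
  return 0;
}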
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024; // 64K static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024; // 64K
...@@ -1391,18 +982,6 @@ void Copy<phi::Place, phi::Place>(phi::Place dst_place, ...@@ -1391,18 +982,6 @@ void Copy<phi::Place, phi::Place>(phi::Place dst_place,
std::memcpy(dst, src, num); std::memcpy(dst, src, num);
} }
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
else if (src_place.GetType() == phi::AllocationType::CPU && // NOLINT
dst_place.GetType() == phi::AllocationType::NPUPINNED) {
std::memcpy(dst, src, num);
} else if (src_place.GetType() == phi::AllocationType::NPUPINNED &&
dst_place.GetType() == phi::AllocationType::CPU) {
std::memcpy(dst, src, num);
} else if (src_place.GetType() == phi::AllocationType::NPUPINNED &&
dst_place.GetType() == phi::AllocationType::NPUPINNED) {
std::memcpy(dst, src, num);
}
#endif
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
else if (src_place.GetType() == phi::AllocationType::CPU && // NOLINT else if (src_place.GetType() == phi::AllocationType::CPU && // NOLINT
dst_place.GetType() == phi::AllocationType::CPU) { dst_place.GetType() == phi::AllocationType::CPU) {
...@@ -1488,8 +1067,7 @@ void Copy<phi::CPUPlace, phi::Place>(phi::CPUPlace dst_place, ...@@ -1488,8 +1067,7 @@ void Copy<phi::CPUPlace, phi::Place>(phi::CPUPlace dst_place,
} }
#if defined(PADDLE_WITH_CUSTOM_DEVICE) && !defined(PADDLE_WITH_CUDA) && \ #if defined(PADDLE_WITH_CUSTOM_DEVICE) && !defined(PADDLE_WITH_CUDA) && \
!defined(PADDLE_WITH_ASCEND_CL) && !defined(PADDLE_WITH_HIP) && \ !defined(PADDLE_WITH_HIP)
!defined(PADDLE_WITH_MLU)
template <> template <>
void Copy<phi::Place, phi::Place>(phi::Place dst_place, void Copy<phi::Place, phi::Place>(phi::Place dst_place,
......
...@@ -21,8 +21,7 @@ ...@@ -21,8 +21,7 @@
#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/var_type.h"
#include "paddle/phi/backends/device_memory_aligment.h" #include "paddle/phi/backends/device_memory_aligment.h"
#include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/math_function.h"
#ifdef PADDLE_WITH_ASCEND_CL
#endif
#include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/convert_utils.h"
#ifdef PADDLE_WITH_MLU #ifdef PADDLE_WITH_MLU
#include "paddle/fluid/operators/mlu/mlu_baseop.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h"
......
...@@ -148,16 +148,4 @@ TEST(copy_cross_scope_to_main_scope, CUDA_fp32) { ...@@ -148,16 +148,4 @@ TEST(copy_cross_scope_to_main_scope, CUDA_fp32) {
ctx.PartialInitWithAllocator(); ctx.PartialInitWithAllocator();
Compare2<float>(&scope, ctx, "copy_cross_scope"); Compare2<float>(&scope, ctx, "copy_cross_scope");
} }
#elif PADDLE_WITH_ASCEND_CL
TEST(copy_cross_scope, NPU_fp32) {
f::Scope scope;
p::NPUDeviceContext ctx(p::NPUPlace(0));
Compare1<float>(&scope, ctx, "copy_cross_scope");
}
TEST(copy_cross_scope_to_main_scope, NPU_fp32) {
f::Scope scope;
p::NPUDeviceContext ctx(p::NPUPlace(0));
Compare2<float>(&scope, ctx, "copy_cross_scope");
}
#endif #endif
...@@ -28,15 +28,9 @@ function(detection_library TARGET_NAME) ...@@ -28,15 +28,9 @@ function(detection_library TARGET_NAME)
PARENT_SCOPE) PARENT_SCOPE)
endfunction() endfunction()
if(WITH_ASCEND_CL) detection_library(box_coder_op SRCS box_coder_op.cc)
detection_library(box_coder_op SRCS box_coder_op.cc box_coder_op_npu.cc) detection_library(density_prior_box_op SRCS density_prior_box_op.cc
detection_library(density_prior_box_op SRCS density_prior_box_op.cc density_prior_box_op.cu)
density_prior_box_op.cu density_prior_box_op_npu.cc)
else()
detection_library(box_coder_op SRCS box_coder_op.cc)
detection_library(density_prior_box_op SRCS density_prior_box_op.cc
density_prior_box_op.cu)
endif()
if(WITH_XPU) if(WITH_XPU)
detection_library(iou_similarity_op SRCS iou_similarity_op.cc detection_library(iou_similarity_op SRCS iou_similarity_op.cc
...@@ -49,11 +43,6 @@ elseif(WITH_MLU) ...@@ -49,11 +43,6 @@ elseif(WITH_MLU)
iou_similarity_op_mlu.cc) iou_similarity_op_mlu.cc)
detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op_mlu.cc) detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op_mlu.cc)
detection_library(yolo_box_op SRCS yolo_box_op.cc yolo_box_op_mlu.cc) detection_library(yolo_box_op SRCS yolo_box_op.cc yolo_box_op_mlu.cc)
elseif(WITH_ASCEND_CL)
detection_library(iou_similarity_op SRCS iou_similarity_op.cc
iou_similarity_op_npu.cc)
detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op_npu.cc)
detection_library(yolo_box_op SRCS yolo_box_op.cc)
else() else()
detection_library(iou_similarity_op SRCS iou_similarity_op.cc detection_library(iou_similarity_op SRCS iou_similarity_op.cc
iou_similarity_op.cu) iou_similarity_op.cu)
......
...@@ -36,13 +36,6 @@ inline std::vector<int> get_expand_times( ...@@ -36,13 +36,6 @@ inline std::vector<int> get_expand_times(
*expand_tensor, platform::CPUPlace(), &cpu_expand_tensor); *expand_tensor, platform::CPUPlace(), &cpu_expand_tensor);
expand_data = cpu_expand_tensor.data<int>(); expand_data = cpu_expand_tensor.data<int>();
} }
#ifdef PADDLE_WITH_ASCEND_CL
if (platform::is_npu_place(expand_tensor->place())) {
paddle::framework::TensorCopySync(
*expand_tensor, platform::CPUPlace(), &cpu_expand_tensor);
expand_data = cpu_expand_tensor.data<int>();
}
#endif
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
if (platform::is_xpu_place(expand_tensor->place())) { if (platform::is_xpu_place(expand_tensor->place())) {
paddle::framework::TensorCopySync( paddle::framework::TensorCopySync(
......
...@@ -37,13 +37,6 @@ inline std::vector<int> get_expand_shape( ...@@ -37,13 +37,6 @@ inline std::vector<int> get_expand_shape(
*shape_tensor, platform::CPUPlace(), &cpu_shape_tensor); *shape_tensor, platform::CPUPlace(), &cpu_shape_tensor);
shape_data = cpu_shape_tensor.data<int>(); shape_data = cpu_shape_tensor.data<int>();
} }
#ifdef PADDLE_WITH_ASCEND_CL
if (platform::is_npu_place(shape_tensor->place())) {
paddle::framework::TensorCopySync(
*shape_tensor, platform::CPUPlace(), &cpu_shape_tensor);
shape_data = cpu_shape_tensor.data<int>();
}
#endif
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
if (platform::is_xpu_place(shape_tensor->place())) { if (platform::is_xpu_place(shape_tensor->place())) {
paddle::framework::TensorCopySync( paddle::framework::TensorCopySync(
...@@ -75,13 +68,6 @@ inline std::vector<int> get_expand_shape( ...@@ -75,13 +68,6 @@ inline std::vector<int> get_expand_shape(
paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp);
vec_epxand_shape.push_back(*temp.data<int32_t>()); vec_epxand_shape.push_back(*temp.data<int32_t>());
} }
#ifdef PADDLE_WITH_ASCEND_CL
else if (platform::is_npu_place(tensor->place())) { // NOLINT
phi::DenseTensor temp;
paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp);
vec_epxand_shape.push_back(*temp.data<int32_t>());
}
#endif
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
else if (platform::is_xpu_place(tensor->place())) { // NOLINT else if (platform::is_xpu_place(tensor->place())) { // NOLINT
phi::DenseTensor temp; phi::DenseTensor temp;
......
if(WITH_ASCEND_CL)
cc_library(
beam_search_npu
SRCS beam_search_npu.cc
DEPS npu_op_runner)
endif()
if(WITH_XPU) if(WITH_XPU)
cc_library( cc_library(
beam_search_xpu beam_search_xpu
...@@ -13,9 +6,7 @@ if(WITH_XPU) ...@@ -13,9 +6,7 @@ if(WITH_XPU)
endif() endif()
# please add new math_library in alphabetical order # please add new math_library in alphabetical order
if(WITH_ASCEND_CL) if(WITH_MLU)
math_library(concat_and_split DEPS concat_and_split_functor npu_op_runner)
elseif(WITH_MLU)
math_library(concat_and_split DEPS concat_and_split_functor mlu_baseop) math_library(concat_and_split DEPS concat_and_split_functor mlu_baseop)
else() else()
math_library(concat_and_split DEPS concat_and_split_functor) math_library(concat_and_split DEPS concat_and_split_functor)
......
...@@ -122,34 +122,6 @@ REGISTER_OPERATOR( ...@@ -122,34 +122,6 @@ REGISTER_OPERATOR(
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>, paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
MemcpyD2HInferShapeFunctor); MemcpyD2HInferShapeFunctor);
#ifdef PADDLE_WITH_ASCEND_CL
REGISTER_OP_NPU_KERNEL_FUNCTOR(memcpy_d2h,
float,
ops::MemcpyD2HKernel,
double,
ops::MemcpyD2HKernel,
int8_t,
ops::MemcpyD2HKernel,
uint8_t,
ops::MemcpyD2HKernel,
int,
ops::MemcpyD2HKernel,
int64_t,
ops::MemcpyD2HKernel,
bool,
ops::MemcpyD2HKernel,
paddle::platform::bfloat16,
ops::MemcpyD2HKernel,
paddle::platform::complex<float>,
ops::MemcpyD2HKernel,
paddle::platform::complex<double>,
ops::MemcpyD2HKernel,
plat::float16,
ops::MemcpyD2HKernel,
int16_t,
ops::MemcpyD2HKernel);
#endif
#ifdef PADDLE_WITH_IPU #ifdef PADDLE_WITH_IPU
REGISTER_OP_IPU_KERNEL_FUNCTOR(memcpy_d2h, REGISTER_OP_IPU_KERNEL_FUNCTOR(memcpy_d2h,
float, float,
......
...@@ -87,11 +87,7 @@ class NormOpGradOpMaker : public framework::SingleGradOpMaker<T> { ...@@ -87,11 +87,7 @@ class NormOpGradOpMaker : public framework::SingleGradOpMaker<T> {
op->SetAttrMap(this->Attrs()); op->SetAttrMap(this->Attrs());
op->SetInput("X", this->Input("X")); op->SetInput("X", this->Input("X"));
op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
#ifndef PADDLE_WITH_ASCEND_CL
op->SetInput("Norm", this->Output("Norm")); op->SetInput("Norm", this->Output("Norm"));
#else
op->SetInput("Out", this->Output("Out"));
#endif
op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
} }
}; };
......
...@@ -25,9 +25,6 @@ limitations under the License. */ ...@@ -25,9 +25,6 @@ limitations under the License. */
#include "paddle/fluid/platform/device/xpu/xpu_info.h" #include "paddle/fluid/platform/device/xpu/xpu_info.h"
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
#endif
#ifdef PADDLE_WITH_MLU #ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/enforce.h" #include "paddle/fluid/platform/device/mlu/enforce.h"
#include "paddle/fluid/platform/device/mlu/mlu_info.h" #include "paddle/fluid/platform/device/mlu/mlu_info.h"
......
...@@ -248,31 +248,6 @@ void EmplaceDeviceContexts( ...@@ -248,31 +248,6 @@ void EmplaceDeviceContexts(
PADDLE_THROW( PADDLE_THROW(
platform::errors::Unimplemented("IPUPlace is not supported. Please " platform::errors::Unimplemented("IPUPlace is not supported. Please "
"re-compile with WITH_IPU option.")); "re-compile with WITH_IPU option."));
#endif
} else if (platform::is_npu_place(place)) {
#ifdef PADDLE_WITH_ASCEND_CL
EmplaceDeviceContext<NPUDeviceContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#else
PADDLE_THROW(platform::errors::Unimplemented(
"NPUPlace is not supported. Please "
"re-compile with WITH_ASCEND_CL option."));
#endif
} else if (platform::is_npu_pinned_place(place)) {
#ifdef PADDLE_WITH_ASCEND_CL
EmplaceDeviceContext<NPUPinnedDeviceContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#else
PADDLE_THROW(platform::errors::Unimplemented(
"NPUPinnedPlace is not supported. Please re-compile with "
"WITH_ASCEND_CL "
"option."));
#endif #endif
} }
} }
......
...@@ -68,8 +68,6 @@ limitations under the License. */ ...@@ -68,8 +68,6 @@ limitations under the License. */
#include "glog/logging.h" #include "glog/logging.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
#ifdef PADDLE_WITH_ASCEND_CL
#endif
#include "paddle/phi/backends/device_ext.h" #include "paddle/phi/backends/device_ext.h"
#include "paddle/phi/backends/stream.h" #include "paddle/phi/backends/stream.h"
...@@ -89,10 +87,6 @@ struct GpuDevice; ...@@ -89,10 +87,6 @@ struct GpuDevice;
#include "paddle/phi/backends/xpu/xpu_context.h" #include "paddle/phi/backends/xpu/xpu_context.h"
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
#include "acl/acl.h"
#endif
namespace paddle { namespace paddle {
namespace platform { namespace platform {
...@@ -150,86 +144,6 @@ namespace xpu = baidu::xpu::api; ...@@ -150,86 +144,6 @@ namespace xpu = baidu::xpu::api;
using XPUDeviceContext = phi::XPUContext; using XPUDeviceContext = phi::XPUContext;
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
class NPUDeviceContext
: public DeviceContext,
public phi::TypeInfoTraits<DeviceContext, NPUDeviceContext> {
public:
explicit NPUDeviceContext(NPUPlace place);
virtual ~NPUDeviceContext();
Eigen::DefaultDevice* eigen_device() const { return nullptr; }
const Place& GetPlace() const override;
aclrtContext context() const;
/*! \brief Wait for all operations completion in the stream. */
void Wait() const override;
/*! \brief Return npu stream in the device context. */
aclrtStream stream() const;
template <typename Callback>
void AddStreamCallback(Callback&& callback) const {
return stream_->AddCallback(callback);
}
void WaitStreamCallback() const { return stream_->WaitCallback(); }
#if defined(PADDLE_WITH_ASCEND_CL)
/*! \brief Return hccl communicators. */
HcclComm hccl_comm() const { return hccl_comm_; }
/*! \brief Set hccl communicators. */
void set_hccl_comm(HcclComm comm) { hccl_comm_ = comm; }
#endif
// template <typename Callback>
// void AddStreamCallback(Callback&& callback) const {
// return stream_->AddCallback(callback);
// }
// void WaitStreamCallback() const { return stream_->WaitCallback(); }
static const char* name() { return "NPUDeviceContext"; }
private:
NPUPlace place_;
aclrtContext context_;
#ifdef PADDLE_WITH_ASCEND_CL
// HCCLContext_t hccl_context_;
HcclComm hccl_comm_{nullptr};
#endif
// Needs to be the same as other DeviceContexts,
// even though eigen_device_ is not used on NPU.
// NOTE(zhiqiu): why need?
std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
std::shared_ptr<stream::NPUStream> stream_;
DISABLE_COPY_AND_ASSIGN(NPUDeviceContext);
};
// Currently, NPUPinnedDeviceContext is only used for data copying.
class NPUPinnedDeviceContext
: public DeviceContext,
public phi::TypeInfoTraits<DeviceContext, NPUPinnedDeviceContext> {
public:
NPUPinnedDeviceContext();
explicit NPUPinnedDeviceContext(NPUPinnedPlace place);
const Place& GetPlace() const override;
Eigen::DefaultDevice* eigen_device() const;
static const char* name() { return "NPUPinnedDeviceContext"; }
private:
NPUPinnedPlace place_;
std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
};
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
using CUDAPinnedDeviceContext = phi::GPUPinnedContext; using CUDAPinnedDeviceContext = phi::GPUPinnedContext;
#endif #endif
...@@ -264,18 +178,6 @@ template <> ...@@ -264,18 +178,6 @@ template <>
struct DefaultDeviceContextType<phi::MLUPlace>; struct DefaultDeviceContextType<phi::MLUPlace>;
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
template <>
struct DefaultDeviceContextType<phi::NPUPlace> {
using TYPE = paddle::platform::NPUDeviceContext;
};
template <>
struct DefaultDeviceContextType<phi::NPUPinnedPlace> {
using TYPE = paddle::platform::NPUPinnedDeviceContext;
};
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <> template <>
struct DefaultDeviceContextType<phi::GPUPinnedPlace> { struct DefaultDeviceContextType<phi::GPUPinnedPlace> {
......
...@@ -38,12 +38,6 @@ USE_EVENT_WAIT(kCUDA, kCUDA) ...@@ -38,12 +38,6 @@ USE_EVENT_WAIT(kCUDA, kCUDA)
USE_EVENT_WAIT(kCPU, kCUDA) USE_EVENT_WAIT(kCPU, kCUDA)
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
USE_EVENT(kNPU);
USE_EVENT_WAIT(kNPU, kNPU)
USE_EVENT_WAIT(kCPU, kNPU)
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE #ifdef PADDLE_WITH_CUSTOM_DEVICE
USE_EVENT(kCUSTOM_DEVICE); USE_EVENT(kCUSTOM_DEVICE);
USE_EVENT_WAIT(kCUSTOM_DEVICE, kCUSTOM_DEVICE) USE_EVENT_WAIT(kCUSTOM_DEVICE, kCUSTOM_DEVICE)
......
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/platform/device_event_base.h"
#include "paddle/fluid/platform/event.h"
namespace paddle {
namespace platform {
struct NPUDeviceEventWrapper {
explicit NPUDeviceEventWrapper(const platform::Place& place) {
PADDLE_ENFORCE_EQ(
platform::is_npu_place(place),
true,
platform::errors::PreconditionNotMet(
"Required device shall be NPUPlace, but received %d. ", place));
device_id_ = place.device;
PADDLE_ENFORCE_GT(
device_id_,
-1,
platform::errors::PreconditionNotMet(
"Required DeviceOption.device_id > -1, but received %d. ",
device_id_));
inner_event_ = NpuEventResourcePool::Instance().New(device_id_);
}
std::shared_ptr<NpuEventObject> inner_event_;
int device_id_;
};
void DeviceEventCreateNPU(DeviceEvent* event,
const platform::Place& place,
unsigned int) {
event->InitEvent(std::make_shared<NPUDeviceEventWrapper>(place));
}
void DeviceEventRecordNPU(DeviceEvent* event, const DeviceContext* context) {
auto* wrapper = static_cast<NPUDeviceEventWrapper*>(event->GetEvent().get());
auto* npu_dev_ctx = dynamic_cast<const platform::NPUDeviceContext*>(context);
PADDLE_ENFORCE_NOT_NULL(
npu_dev_ctx,
platform::errors::PreconditionNotMet(
"Failed to dynamic_cast context into NPUDeviceContext."));
NPUEventRecord(wrapper->inner_event_.get(), npu_dev_ctx->stream());
}
bool DeviceEventQueryNPU(const DeviceEvent* event) {
auto* wrapper = static_cast<NPUDeviceEventWrapper*>(event->GetEvent().get());
PADDLE_ENFORCE_NOT_NULL(
wrapper,
platform::errors::PreconditionNotMet(
"Failed to dynamic_cast event into NPUDeviceEventWrapper."));
aclrtEventStatus status = ACL_EVENT_STATUS_COMPLETE;
platform::NPUEventQuery(wrapper->inner_event_.get(), &status);
return ACL_EVENT_STATUS_COMPLETE == status;
}
void DeviceEventFinishNPU(const DeviceEvent* event) {
auto* wrapper = static_cast<NPUDeviceEventWrapper*>(event->GetEvent().get());
NPUEventSynchronize(wrapper->inner_event_.get());
}
void DeviceEventNPUWaitNPU(const DeviceEvent* event,
const DeviceContext* context) {
auto* wrapper = static_cast<NPUDeviceEventWrapper*>(event->GetEvent().get());
auto* npu_dev_ctx = dynamic_cast<const platform::NPUDeviceContext*>(context);
PADDLE_ENFORCE_NOT_NULL(
npu_dev_ctx,
platform::errors::PreconditionNotMet(
"Failed to dynamic_cast context into NPUDeviceContext."));
NPUStreamWaitEvent(npu_dev_ctx->stream(), wrapper->inner_event_.get());
}
void DeviceEventCPUWaitNPU(const DeviceEvent* event,
const DeviceContext* context) {
DeviceEventFinishNPU(event);
}
void DeviceEventSetFinishedNPU(const DeviceEvent* event) {
// do nothing
}
void EventResetNPU(const DeviceEvent* event) {
// do nothing
}
} // namespace platform
} // namespace paddle
using ::paddle::platform::kCPU;
using ::paddle::platform::kNPU;
REGISTER_EVENT_CREATE_FUNCTION(kNPU, paddle::platform::DeviceEventCreateNPU)
REGISTER_EVENT_RECORD_FUNCTION(kNPU, paddle::platform::DeviceEventRecordNPU)
REGISTER_EVENT_QUERY_FUNCTION(kNPU, paddle::platform::DeviceEventQueryNPU)
REGISTER_EVENT_FINISH_FUNCTION(kNPU, paddle::platform::DeviceEventFinishNPU)
REGISTER_EVENT_SET_FINISHED_FUNCTION(
kNPU, paddle::platform::DeviceEventSetFinishedNPU)
REGISTER_EVENT_WAIT_FUNCTION(kNPU,
kNPU,
paddle::platform::DeviceEventNPUWaitNPU)
REGISTER_EVENT_WAIT_FUNCTION(kCPU,
kNPU,
paddle::platform::DeviceEventCPUWaitNPU)
REGISTER_EVENT_RESET_FUNCTION(kNPU, paddle::platform::EventResetNPU)
#endif
...@@ -54,7 +54,6 @@ void* GetCUDADsoHandle() { return phi::dynload::GetCUDADsoHandle(); } ...@@ -54,7 +54,6 @@ void* GetCUDADsoHandle() { return phi::dynload::GetCUDADsoHandle(); }
void* GetWarpCTCDsoHandle() { return phi::dynload::GetWarpCTCDsoHandle(); } void* GetWarpCTCDsoHandle() { return phi::dynload::GetWarpCTCDsoHandle(); }
void* GetNCCLDsoHandle() { return phi::dynload::GetNCCLDsoHandle(); } void* GetNCCLDsoHandle() { return phi::dynload::GetNCCLDsoHandle(); }
void* GetHCCLDsoHandle() { return phi::dynload::GetHCCLDsoHandle(); }
void* GetTensorRtDsoHandle() { return phi::dynload::GetTensorRtDsoHandle(); } void* GetTensorRtDsoHandle() { return phi::dynload::GetTensorRtDsoHandle(); }
......
...@@ -37,7 +37,6 @@ void* GetNVRTCDsoHandle(); ...@@ -37,7 +37,6 @@ void* GetNVRTCDsoHandle();
void* GetCUDADsoHandle(); void* GetCUDADsoHandle();
void* GetWarpCTCDsoHandle(); void* GetWarpCTCDsoHandle();
void* GetNCCLDsoHandle(); void* GetNCCLDsoHandle();
void* GetHCCLDsoHandle();
void* GetTensorRtDsoHandle(); void* GetTensorRtDsoHandle();
void* GetMKLMLDsoHandle(); void* GetMKLMLDsoHandle();
void* GetLAPACKDsoHandle(); void* GetLAPACKDsoHandle();
......
...@@ -12,9 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,9 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_ASCEND_CL) || \ defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_CNCL)
defined(PADDLE_WITH_CNCL)
#include "paddle/fluid/platform/gen_comm_id_helper.h" #include "paddle/fluid/platform/gen_comm_id_helper.h"
#include <arpa/inet.h> #include <arpa/inet.h>
......
...@@ -14,9 +14,8 @@ limitations under the License. */ ...@@ -14,9 +14,8 @@ limitations under the License. */
#pragma once #pragma once
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_ASCEND_CL) || \ defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_CNCL)
defined(PADDLE_WITH_CNCL)
#include <functional> #include <functional>
#include <memory> #include <memory>
#include <mutex> #include <mutex>
......
...@@ -187,17 +187,6 @@ void InitDevices() { ...@@ -187,17 +187,6 @@ void InitDevices() {
LOG(WARNING) << "Compiled with WITH_XPU, but no XPU found in runtime."; LOG(WARNING) << "Compiled with WITH_XPU, but no XPU found in runtime.";
} }
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
// NOTE(zhiqiu): use singleton to explicitly init and finalize ACL
platform::AclInstance::Instance(); // NOLINT
try {
// use user specified NPUs in single-node multi-process mode.
devices = platform::GetSelectedNPUDevices();
} catch (const std::exception &exp) {
LOG(WARNING) << "Compiled with PADDLE_WITH_ASCEND_CL, but no NPU found "
"in runtime.";
}
#endif
#ifdef PADDLE_WITH_IPU #ifdef PADDLE_WITH_IPU
try { try {
// use user specified IPUs. // use user specified IPUs.
......
This diff has been collapsed.
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifdef PADDLE_WITH_ASCEND_CL
#include "pybind11/pybind11.h"
#include "pybind11/stl.h"
namespace py = pybind11;
namespace paddle {
namespace pybind {
void BindAscendGraph(py::module* m);
void BindAscendWrapper(py::module* m);
void BindAscendDevice(py::module* m);
} // namespace pybind
} // namespace paddle
#endif
...@@ -2616,19 +2616,6 @@ void BindImperative(py::module *m_ptr) { ...@@ -2616,19 +2616,6 @@ void BindImperative(py::module *m_ptr) {
py::arg("ring_id")); py::arg("ring_id"));
#endif #endif
#if defined(PADDLE_WITH_ASCEND_CL)
py::class_<imperative::HCCLParallelContext,
imperative::ParallelContext,
std::shared_ptr<imperative::HCCLParallelContext>>(
m, "HCCLParallelContext")
.def(py::init<const imperative::ParallelStrategy &,
const platform::NPUPlace &>())
.def("init", [](imperative::HCCLParallelContext &self) { self.Init(); })
.def("init_with_ring_id",
&imperative::HCCLParallelContext::InitWithRingID,
py::arg("ring_id"));
#endif
#if defined(PADDLE_WITH_CNCL) #if defined(PADDLE_WITH_CNCL)
py::class_<imperative::CNCLParallelContext, py::class_<imperative::CNCLParallelContext,
imperative::ParallelContext, imperative::ParallelContext,
......
...@@ -772,7 +772,6 @@ void BindAnalysisConfig(py::module *m) { ...@@ -772,7 +772,6 @@ void BindAnalysisConfig(py::module *m) {
py::arg("device_type"), py::arg("device_type"),
py::arg("device_id") = 0, py::arg("device_id") = 0,
py::arg("precision") = AnalysisConfig::Precision::kFloat32) py::arg("precision") = AnalysisConfig::Precision::kFloat32)
.def("enable_npu", &AnalysisConfig::EnableNpu, py::arg("device_id") = 0)
.def("enable_ipu", .def("enable_ipu",
&AnalysisConfig::EnableIpu, &AnalysisConfig::EnableIpu,
py::arg("ipu_device_num") = 1, py::arg("ipu_device_num") = 1,
...@@ -1063,13 +1062,7 @@ void BindPaddleInferPredictor(py::module *m) { ...@@ -1063,13 +1062,7 @@ void BindPaddleInferPredictor(py::module *m) {
.def("get_output_names", &paddle_infer::Predictor::GetOutputNames) .def("get_output_names", &paddle_infer::Predictor::GetOutputNames)
.def("get_input_handle", &paddle_infer::Predictor::GetInputHandle) .def("get_input_handle", &paddle_infer::Predictor::GetInputHandle)
.def("get_output_handle", &paddle_infer::Predictor::GetOutputHandle) .def("get_output_handle", &paddle_infer::Predictor::GetOutputHandle)
.def("run", .def("run", [](paddle_infer::Predictor &self) { self.Run(); })
[](paddle_infer::Predictor &self) {
#ifdef PADDLE_WITH_ASCEND_CL
pybind11::gil_scoped_release release;
#endif
self.Run();
})
.def("clone", .def("clone",
[](paddle_infer::Predictor &self) { return self.Clone(nullptr); }) [](paddle_infer::Predictor &self) { return self.Clone(nullptr); })
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
......
...@@ -139,10 +139,6 @@ limitations under the License. */ ...@@ -139,10 +139,6 @@ limitations under the License. */
#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h"
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/platform/collective_helper.h"
#endif
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/device/xpu/xpu_info.h" #include "paddle/fluid/platform/device/xpu/xpu_info.h"
#include "paddle/fluid/platform/device/xpu/xpu_op_list.h" #include "paddle/fluid/platform/device/xpu/xpu_op_list.h"
......
...@@ -139,10 +139,6 @@ limitations under the License. */ ...@@ -139,10 +139,6 @@ limitations under the License. */
#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h"
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/platform/collective_helper.h"
#endif
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/device/xpu/xpu_info.h" #include "paddle/fluid/platform/device/xpu/xpu_info.h"
#include "paddle/fluid/platform/device/xpu/xpu_op_list.h" #include "paddle/fluid/platform/device/xpu/xpu_op_list.h"
...@@ -553,57 +549,14 @@ void BindPlace(pybind11::module &m) { // NOLINT ...@@ -553,57 +549,14 @@ void BindPlace(pybind11::module &m) { // NOLINT
py::class_<platform::NPUPlace> npuplace(m, "NPUPlace", R"DOC( py::class_<platform::NPUPlace> npuplace(m, "NPUPlace", R"DOC(
NPUPlace is a descriptor of a device. NPUPlace is a descriptor of a device.
It represents a NPU device on which a tensor will be allocated and a model will run. It represents a NPU device on which a tensor will be allocated and a model will run.
Examples: Examples:
.. code-block:: python .. code-block:: python
# required: npu # required: npu
import paddle import paddle
place = paddle.NPUPlace(0) place = paddle.NPUPlace(0)
)DOC"); )DOC");
g_npuplace_pytype = reinterpret_cast<PyTypeObject *>(npuplace.ptr()); g_npuplace_pytype = reinterpret_cast<PyTypeObject *>(npuplace.ptr());
npuplace npuplace.def("__init__", [](platform::NPUPlace &self, int dev_id) {})
.def("__init__",
[](platform::NPUPlace &self, int dev_id) {
#ifdef PADDLE_WITH_ASCEND_CL
if (UNLIKELY(dev_id < 0)) {
LOG(ERROR) << string::Sprintf(
"Invalid NPUPlace(%d), device id must be 0 or "
"positive integer",
dev_id);
std::exit(-1);
}
if (UNLIKELY(dev_id >= platform::GetNPUDeviceCount())) {
if (platform::GetNPUDeviceCount() == 0) {
LOG(ERROR) << "Cannot use NPU because there is no NPU "
"detected on your "
"machine.";
std::exit(-1);
} else {
LOG(ERROR) << string::Sprintf(
"Invalid NPUPlace(%d), must inside [0, %d), because NPU "
"number on your machine is %d",
dev_id,
platform::GetNPUDeviceCount(),
platform::GetNPUDeviceCount());
std::exit(-1);
}
}
new (&self) platform::NPUPlace(dev_id);
#else
LOG(ERROR) << string::Sprintf(
"Cannot use NPU because you have installed CPU/GPU version "
"PaddlePaddle.\n"
"If you want to use NPU, please try to install NPU version "
"PaddlePaddle by: pip install paddlepaddle-npu\n"
"If you only have CPU, please change NPUPlace(%d) to be "
"CPUPlace().\n",
dev_id);
std::exit(-1);
#endif
})
.def("_type", &PlaceIndex<platform::NPUPlace>) .def("_type", &PlaceIndex<platform::NPUPlace>)
.def("_equals", &IsSamePlace<platform::NPUPlace, platform::Place>) .def("_equals", &IsSamePlace<platform::NPUPlace, platform::Place>)
.def("_equals", &IsSamePlace<platform::NPUPlace, platform::CUDAPlace>) .def("_equals", &IsSamePlace<platform::NPUPlace, platform::CUDAPlace>)
......
This diff has been collapsed.
...@@ -139,10 +139,6 @@ limitations under the License. */ ...@@ -139,10 +139,6 @@ limitations under the License. */
#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h"
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/platform/collective_helper.h"
#endif
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/device/xpu/xpu_info.h" #include "paddle/fluid/platform/device/xpu/xpu_info.h"
#include "paddle/fluid/platform/device/xpu/xpu_op_list.h" #include "paddle/fluid/platform/device/xpu/xpu_op_list.h"
......
This diff has been collapsed.
...@@ -19,9 +19,7 @@ limitations under the License. */ ...@@ -19,9 +19,7 @@ limitations under the License. */
#include "paddle/phi/common/place.h" #include "paddle/phi/common/place.h"
#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/errors.h" #include "paddle/phi/core/errors.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/phi/backends/npu/npu_info.h"
#endif
#include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_info.h"
#ifdef PADDLE_WITH_MLU #ifdef PADDLE_WITH_MLU
#include "paddle/phi/backends/mlu/mlu_info.h" #include "paddle/phi/backends/mlu/mlu_info.h"
...@@ -44,8 +42,6 @@ inline size_t Alignment(size_t size, ...@@ -44,8 +42,6 @@ inline size_t Alignment(size_t size,
alignment = phi::backends::gpu::GpuMinChunkSize(); alignment = phi::backends::gpu::GpuMinChunkSize();
#elif defined(PADDLE_WITH_XPU) #elif defined(PADDLE_WITH_XPU)
alignment = alignment; alignment = alignment;
#elif defined(PADDLE_WITH_ASCEND_CL)
alignment = phi::backends::npu::NPUMinChunkSize();
#elif defined(PADDLE_WITH_MLU) #elif defined(PADDLE_WITH_MLU)
alignment = phi::backends::mlu::MLUMinChunkSize(); alignment = phi::backends::mlu::MLUMinChunkSize();
#else #else
......
This diff has been collapsed.
This diff has been collapsed.
This diff has been collapsed.
This diff has been collapsed.
This diff has been collapsed.
This diff has been collapsed.