Unverified · Commit 7976e2a3 · Authored by: Kim Yann · Committed by: GitHub

rem is_compiled_with_npu (#52385)

* rem is_compiled_with_npu

* rem npu related code

* make lint happy

* rem test

* remove some tests

* Update grad_scaler.py

* fix an error
Parent 2acc2b14
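Note: after this removal, the Python-side check for Ascend NPU support goes through the custom-device plugin API instead of a dedicated flag. A minimal sketch of the replacement calls (the plugin name 'npu' is an assumption; on builds without the plugin the code simply falls back to CPU):

```python
import paddle

# Replacement for the removed paddle.device.is_compiled_with_npu():
if paddle.device.is_compiled_with_custom_device('npu'):
    # Replacement for the removed paddle.NPUPlace(0):
    place = paddle.CustomPlace('npu', 0)
else:
    place = paddle.CPUPlace()
print(place)
```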
@@ -8,9 +8,6 @@ exclude =
     ./python/paddle/fluid/tra**,
     # Exclude third-party libraries
     ./python/paddle/utils/gast/**,
-    # Exclude files that will be removed in the future, see more at
-    # https://github.com/PaddlePaddle/Paddle/pull/46782#issuecomment-1273033731
-    ./python/paddle/fluid/tests/unittests/npu/**,
 ignore =
     # Whitespace before ‘,’, ‘;’, or ‘:’, it is not compatible with black
     E203,
......
@@ -4,8 +4,7 @@ exclude: |
     patches/.+|
     paddle/fluid/framework/fleet/heter_ps/cudf/.+|
     paddle/fluid/distributed/ps/thirdparty/round_robin.h|
-    python/paddle/utils/gast/.+|
-    python/paddle/fluid/tests/unittests/npu/.+
+    python/paddle/utils/gast/.+
     )$
 repos:
 # Common hooks
......
@@ -265,14 +265,6 @@ bool IsCompiledWithROCM() {
 #endif
 }
-bool IsCompiledWithAscend() {
-#ifndef PADDLE_WITH_ASCEND
-  return false;
-#else
-  return true;
-#endif
-}
 bool IsCompiledWithXPU() {
 #ifndef PADDLE_WITH_XPU
   return false;
@@ -281,8 +273,6 @@ bool IsCompiledWithXPU() {
 #endif
 }
-bool IsCompiledWithNPU() { return false; }
 bool IsCompiledWithCustomDevice(std::string device_type) {
 #ifndef PADDLE_WITH_CUSTOM_DEVICE
   return false;
@@ -1592,14 +1582,6 @@ All parameter, weight, gradient are variables in Paddle.
         return context;
 #endif
       })
-      .def_static(
-          "create",
-          [](paddle::platform::NPUPlace &place)
-              -> paddle::platform::DeviceContext * {
-            PADDLE_THROW(platform::errors::PermissionDenied(
-                "Cannot use NPUPlace in CPU/GPU/XPU version, "
-                "Please recompile or reinstall Paddle with NPU support."));
-          })
       .def_static("create",
                   [](paddle::platform::CustomPlace &place)
                       -> paddle::platform::DeviceContext * {
@@ -1769,13 +1751,6 @@ All parameter, weight, gradient are variables in Paddle.
             pybind11::gil_scoped_release release;
             self.Run(scope, place);
           })
-      .def("run",
-           [](OperatorBase &self,
-              const Scope &scope,
-              const platform::NPUPlace &place) {
-             pybind11::gil_scoped_release release;
-             self.Run(scope, place);
-           })
      .def("run",
           [](OperatorBase &self,
             const Scope &scope,
@@ -1985,9 +1960,7 @@ All parameter, weight, gradient are variables in Paddle.
       });
   m.def("is_compiled_with_avx", IsCompiledWithAVX);
   m.def("is_compiled_with_cuda", IsCompiledWithCUDA);
-  m.def("is_compiled_with_ascend", IsCompiledWithAscend);
   m.def("is_compiled_with_rocm", IsCompiledWithROCM);
-  m.def("is_compiled_with_npu", IsCompiledWithNPU);
   m.def("is_compiled_with_custom_device", IsCompiledWithCustomDevice);
   m.def("is_compiled_with_ipu", IsCompiledWithIPU);
   m.def("is_compiled_with_xpu", IsCompiledWithXPU);
......
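With the `IsCompiledWithNPU`/`IsCompiledWithAscend` bindings gone, `is_compiled_with_custom_device` is the remaining query point in `core`. A hedged check from the Python side (assuming the plugin device type name 'npu'):

```python
from paddle.fluid import core

# The legacy binding no longer exists after this change; guard with hasattr
# instead of calling it directly.
print('legacy is_compiled_with_npu binding present:', hasattr(core, 'is_compiled_with_npu'))
# The surviving binding takes the custom device type name.
print('npu custom device compiled in:', core.is_compiled_with_custom_device('npu'))
```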
@@ -14,7 +14,6 @@ extend_skip_glob = [
     "python/paddle/fluid/[!t]**",
     "python/paddle/fluid/tra**",
     "python/paddle/utils/gast/**",
-    "python/paddle/fluid/tests/unittests/npu/**",
 ]
 [tool.ruff]
@@ -23,7 +22,6 @@ exclude = [
     "./python/paddle/fluid/[!t]**",
     "./python/paddle/fluid/tra**",
     "./python/paddle/utils/gast/**",
-    "./python/paddle/fluid/tests/unittests/npu/**",
 ]
 target-version = "py37"
 select = [
......
@@ -334,7 +334,6 @@ from .framework import ParamAttr  # noqa: F401
 from .framework import CPUPlace  # noqa: F401
 from .framework import IPUPlace  # noqa: F401
 from .framework import CUDAPlace  # noqa: F401
-from .framework import NPUPlace  # noqa: F401
 from .framework import CUDAPinnedPlace  # noqa: F401
 from .framework import CustomPlace  # noqa: F401
@@ -363,7 +362,6 @@ from .device import get_cudnn_version  # noqa: F401
 from .device import set_device  # noqa: F401
 from .device import get_device  # noqa: F401
 from .device import is_compiled_with_xpu  # noqa: F401
-from .device import is_compiled_with_npu  # noqa: F401
 from .device import is_compiled_with_ipu  # noqa: F401
 from .device import is_compiled_with_cinn  # noqa: F401
 from .device import is_compiled_with_cuda  # noqa: F401
@@ -512,7 +510,6 @@ __all__ = [  # noqa
     'histogram',
     'multiplex',
     'CUDAPlace',
-    'NPUPlace',
     'empty',
     'shape',
     'real',
......
@@ -344,7 +344,6 @@ def amp_guard(
     if enable and not (
         tracer._expected_place.is_gpu_place()
         or tracer._expected_place.is_xpu_place()
-        or tracer._expected_place.is_npu_place()
         or tracer._expected_place.is_custom_place()
     ):
         warnings.warn(
@@ -352,10 +351,6 @@ def amp_guard(
             % tracer._expected_place
         )
         enable = False
-    # For npu:
-    if tracer._expected_place.is_npu_place() and (dtype == 'bfloat16'):
-        warnings.warn('NPUPlace only support float16 amp.')
-        enable = False
     # For xpu:
     if tracer._expected_place.is_xpu_place() and (dtype == 'bfloat16'):
         warnings.warn('XPUPlace only support float16 amp.')
......
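With the NPU branch dropped, `amp_guard` enables AMP only on GPU, XPU and custom-device places, and the bfloat16 restriction is kept for XPU only. A hedged usage sketch of the public wrapper (on an unsupported place it warns and silently runs in FP32, so this is runnable anywhere):

```python
import paddle

model = paddle.nn.Linear(4, 4)
x = paddle.rand([2, 4])
# auto_cast enables AMP only on supported places; elsewhere it is a no-op.
with paddle.amp.auto_cast(enable=True, dtype='float16'):
    y = model(x)
print(y.dtype)
```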
@@ -105,11 +105,10 @@ class AmpScaler:
         if enable and not (
             tracer._expected_place.is_gpu_place()
             or tracer._expected_place.is_xpu_place()
-            or tracer._expected_place.is_npu_place()
             or tracer._expected_place.is_custom_place()
         ):
             warnings.warn(
-                'AmpScaler can only be enabled on CUDAPlace, XPUPlace, MLUPlace, NPUPlace and CustomPlace, current place is %s, so it makes no effect.'
+                'AmpScaler can only be enabled on CUDAPlace, XPUPlace, MLUPlace and CustomPlace, current place is %s, so it makes no effect.'
                 % tracer._expected_place
             )
             enable = False
@@ -326,74 +325,36 @@ class AmpScaler:
             if param.dtype == core.VarDesc.VarType.FP32
         ]
         self._found_inf = self._temp_found_inf_value_false
-        if core.is_compiled_with_npu():
-            float_status = _legacy_C_ops.alloc_float_status()
-            _legacy_C_ops.clear_float_status(float_status, float_status)
-            if len(param_grads_fp16):
-                _legacy_C_ops.check_finite_and_unscale(
-                    param_grads_fp16,
-                    self._scale,
-                    float_status,
-                    param_grads_fp16,
-                    self._temp_found_inf_fp16,
-                )
-                self._found_inf = _C_ops.bitwise_or(
-                    self._found_inf, self._temp_found_inf_fp16
-                )
-            if len(param_grads_bf16):
-                _legacy_C_ops.check_finite_and_unscale(
-                    param_grads_bf16,
-                    self._scale,
-                    float_status,
-                    param_grads_bf16,
-                    self._temp_found_inf_bf16,
-                )
-                self._found_inf = _C_ops.bitwise_or(
-                    self._found_inf, self._temp_found_inf_bf16
-                )
-            if len(param_grads_fp32):
-                _legacy_C_ops.check_finite_and_unscale(
-                    param_grads_fp32,
-                    self._scale,
-                    float_status,
-                    param_grads_fp32,
-                    self._temp_found_inf_fp32,
-                )
-                self._found_inf = _C_ops.bitwise_or(
-                    self._found_inf, self._temp_found_inf_fp32
-                )
-        else:
-            if len(param_grads_fp16):
-                _legacy_C_ops.check_finite_and_unscale(
-                    param_grads_fp16,
-                    self._scale,
-                    param_grads_fp16,
-                    self._temp_found_inf_fp16,
-                )
-                self._found_inf = _C_ops.bitwise_or(
-                    self._found_inf, self._temp_found_inf_fp16
-                )
-            if len(param_grads_bf16):
-                _legacy_C_ops.check_finite_and_unscale(
-                    param_grads_bf16,
-                    self._scale,
-                    param_grads_bf16,
-                    self._temp_found_inf_bf16,
-                )
-                self._found_inf = _C_ops.bitwise_or(
-                    self._found_inf, self._temp_found_inf_bf16
-                )
-            if len(param_grads_fp32):
-                _legacy_C_ops.check_finite_and_unscale(
-                    param_grads_fp32,
-                    self._scale,
-                    param_grads_fp32,
-                    self._temp_found_inf_fp32,
-                )
-                self._found_inf = _C_ops.bitwise_or(
-                    self._found_inf, self._temp_found_inf_fp32
-                )
+        if len(param_grads_fp16):
+            _legacy_C_ops.check_finite_and_unscale(
+                param_grads_fp16,
+                self._scale,
+                param_grads_fp16,
+                self._temp_found_inf_fp16,
+            )
+            self._found_inf = _C_ops.bitwise_or(
+                self._found_inf, self._temp_found_inf_fp16
+            )
+        if len(param_grads_bf16):
+            _legacy_C_ops.check_finite_and_unscale(
+                param_grads_bf16,
+                self._scale,
+                param_grads_bf16,
+                self._temp_found_inf_bf16,
+            )
+            self._found_inf = _C_ops.bitwise_or(
+                self._found_inf, self._temp_found_inf_bf16
+            )
+        if len(param_grads_fp32):
+            _legacy_C_ops.check_finite_and_unscale(
+                param_grads_fp32,
+                self._scale,
+                param_grads_fp32,
+                self._temp_found_inf_fp32,
+            )
+            self._found_inf = _C_ops.bitwise_or(
+                self._found_inf, self._temp_found_inf_fp32
+            )
         optimizer_state["state"] = OptimizerState.UNSCALED
......
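The removed branch allocated and cleared an NPU float-status tensor before every `check_finite_and_unscale` call; the unified path above simply runs the op per dtype group and ORs the found-inf flags. A hedged usage sketch of the public wrapper around this scaler (when the place is unsupported it warns and degrades to a plain FP32 step, so this runs on any build):

```python
import paddle

model = paddle.nn.Linear(4, 2)
opt = paddle.optimizer.SGD(parameters=model.parameters())
scaler = paddle.amp.GradScaler(init_loss_scaling=1024)

x = paddle.rand([8, 4])
with paddle.amp.auto_cast():
    loss = model(x).mean()
scaled = scaler.scale(loss)   # scale the loss before backward
scaled.backward()
scaler.minimize(opt, scaled)  # unscale grads, skip the step on inf/nan
opt.clear_grad()
```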
@@ -36,7 +36,6 @@ __all__ = [  # noqa
     'is_compiled_with_cinn',
     'is_compiled_with_cuda',
     'is_compiled_with_rocm',
-    'is_compiled_with_npu',
     'is_compiled_with_custom_device',
     'get_all_device_type',
     'get_all_custom_device_type',
@@ -53,24 +52,6 @@ __all__ = [  # noqa
 _cudnn_version = None
-# TODO: WITH_ASCEND_CL may changed to WITH_NPU or others in the future
-# for consistent.
-def is_compiled_with_npu():
-    """
-    Whether paddle was built with WITH_ASCEND_CL=ON to support Ascend NPU.
-    Return:
-        bool, ``True`` if NPU is supported, otherwise ``False``.
-    Examples:
-        .. code-block:: python
-            import paddle
-            support_npu = paddle.device.is_compiled_with_npu()
-    """
-    return core.is_compiled_with_npu()
 def is_compiled_with_custom_device(device_type):
     """
     Whether paddle was built with Paddle_CUSTOM_DEVICE .
@@ -210,15 +191,6 @@ def _convert_to_place(device):
         selected_xpus = os.getenv("FLAGS_selected_xpus", "0").split(",")
         device_id = int(selected_xpus[0])
         place = core.XPUPlace(device_id)
-    elif lower_device == 'npu':
-        if not core.is_compiled_with_npu():
-            raise ValueError(
-                "The device should not be 'npu', "
-                "since PaddlePaddle is not compiled with NPU"
-            )
-        selected_npus = os.getenv("FLAGS_selected_npus", "0").split(",")
-        device_id = int(selected_npus[0])
-        place = core.NPUPlace(device_id)
     elif lower_device == 'ipu':
         if not core.is_compiled_with_ipu():
             raise ValueError(
@@ -229,7 +201,6 @@ def _convert_to_place(device):
     else:
         avaliable_gpu_device = re.match(r'gpu:\d+', lower_device)
         avaliable_xpu_device = re.match(r'xpu:\d+', lower_device)
-        avaliable_npu_device = re.match(r'npu:\d+', lower_device)
         if avaliable_gpu_device:
             if not core.is_compiled_with_cuda():
                 raise ValueError(
@@ -250,31 +221,7 @@ def _convert_to_place(device):
             device_id = device_info_list[1]
             device_id = int(device_id)
             place = core.XPUPlace(device_id)
-        if avaliable_npu_device:
-            if not core.is_compiled_with_npu():
-                device_info_list = device.split(':', 1)
-                device_type = device_info_list[0]
-                if device_type in core.get_all_custom_device_type():
-                    device_id = device_info_list[1]
-                    device_id = int(device_id)
-                    place = core.CustomPlace(device_type, device_id)
-                    return place
-                else:
-                    raise ValueError(
-                        "The device should not be {}, since PaddlePaddle is "
-                        "not compiled with NPU or compiled with custom device".format(
-                            avaliable_npu_device
-                        )
-                    )
-            device_info_list = device.split(':', 1)
-            device_id = device_info_list[1]
-            device_id = int(device_id)
-            place = core.NPUPlace(device_id)
-        if (
-            not avaliable_gpu_device
-            and not avaliable_xpu_device
-            and not avaliable_npu_device
-        ):
+        if not avaliable_gpu_device and not avaliable_xpu_device:
             device_info_list = device.split(':', 1)
             device_type = device_info_list[0]
             if device_type in core.get_all_custom_device_type():
@@ -346,9 +293,6 @@ def get_device():
     elif isinstance(place, core.XPUPlace):
         device_id = place.get_device_id()
         device = 'xpu:' + str(device_id)
-    elif isinstance(place, core.NPUPlace):
-        device_id = place.get_device_id()
-        device = 'npu:' + str(device_id)
     elif isinstance(place, core.IPUPlace):
         num_devices = core.get_ipu_device_count()
         device = f"ipus:{{0-{num_devices - 1}}}"
@@ -469,7 +413,7 @@ class Event:
     Parameters:
         device(str|paddle.CUDAPlace(n)|paddle.CustomPlace(n)): Which device the stream runn on. If device is None, the device is the current device. Default: None.
             It can be ``gpu``, ``gpu:x``,``custom_device``, ``custom_device:x``, where ``custom_device`` is the name of CustomDevicec,
-            where ``x`` is the index of the GPUs, XPUs or NPUs. And it can be paddle.CUDAPlace(n) or paddle.CustomPlace(n).
+            where ``x`` is the index of the GPUs, XPUs. And it can be paddle.CUDAPlace(n) or paddle.CustomPlace(n).
         enable_timing (bool, optional): indicates if the event should measure time, default is False
         blocking (bool, optional): if True, ``wait`` will be blocking, default is False
         interprocess (bool): if True, the event can be shared between processes, default is False
@@ -614,7 +558,7 @@ class Stream:
     Parameters:
         device(str|paddle.CUDAPlace(n)|paddle.CustomPlace(n)): Which device the stream runn on. If device is None, the device is the current device. Default: None.
             It can be ``gpu``, ``gpu:x``,``custom_device``, ``custom_device:x``, where ``custom_device`` is the name of CustomDevicec,
-            where ``x`` is the index of the GPUs, XPUs or NPUs. And it can be paddle.CUDAPlace(n) or paddle.CustomPlace(n).
+            where ``x`` is the index of the GPUs, XPUs. And it can be paddle.CUDAPlace(n) or paddle.CustomPlace(n).
         priority(int, optional): priority of the CUDA stream. Can be either
             1 (high priority) or 2 (low priority). By default, streams have
             priority 2.
@@ -936,7 +880,7 @@ def synchronize(device=None):
     Parameters:
         device(str|paddle.CUDAPlace(n)|paddle.XPUPlace(n)|paddle.CustomPlace(n)): The device which want to wait for. If device is None, the device is the current device. Default: None.
             It can be ``gpu``, ``gpu:x``, ``xpu``, ``xpu:x``, ``custom_device``, ``custom_device:x``, where ``custom_device`` is the name of CustomDevicec,
-            where ``x`` is the index of the GPUs, XPUs or NPUs. And it can be paddle.CUDAPlace(n) or paddle.XPUPlace(n) or paddle.CustomPlace(n).
+            where ``x`` is the index of the GPUs, XPUs. And it can be paddle.CUDAPlace(n) or paddle.XPUPlace(n) or paddle.CustomPlace(n).
     Examples:
         .. code-block:: python
             # required: custom_device
......
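`_convert_to_place` no longer recognizes `'npu'`/`'npu:x'` as built-in device strings; such strings now resolve only through the custom-device branch. A hedged sketch of device selection after the change (the plugin name 'npu' is an assumption; the CPU fallback keeps it runnable on any build):

```python
import paddle

# 'npu:0' is handled by the custom-device path now, so check the registry first.
if 'npu' in paddle.device.get_all_custom_device_type():
    paddle.set_device('npu:0')   # resolves to paddle.CustomPlace('npu', 0)
else:
    paddle.set_device('cpu')
print(paddle.get_device())
```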
@@ -288,11 +288,6 @@ def new_group(ranks=None, backend=None, timeout=_default_timeout):
             core.NCCLParallelContext(strategy, place).init_with_ring_id(
                 ring_id
             )
-        elif core.is_compiled_with_npu():
-            place = core.NPUPlace(genv.device_id)
-            core.HCCLParallelContext(strategy, place).init_with_ring_id(
-                ring_id
-            )
         elif core.is_compiled_with_xpu():
             place = core.XPUPlace(genv.device_id)
             core.BKCLParallelContext(strategy, place).init_with_ring_id(
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
from paddle.distributed.fleet.launch_utils import (
DeviceMode,
get_cluster,
get_host_name_ip,
)
__all__ = []
def _get_ascend_rankfile(rank_table_file_path):
"""
Args:
rank_table_file_path: ascend npu rank file json
{
"status": "completed",
"version": "1.0",
"server_count": "2",
"server_list": [
{
"server_id": "192.168.24.217",
"device": [
{
"device_id": "0",
"device_ip": "192.1.184.23",
"rank_id": "0"
},
{
"device_id": "1",
"device_ip": "192.2.21.93",
"rank_id": "1"
}
]
},
{
"server_id": "192.168.26.177",
"device": [
{
"device_id": "0",
"device_ip": "192.1.94.132",
"rank_id": "2"
},
{
"device_id": "1",
"device_ip": "192.2.94.30",
"rank_id": "3"
}
]
}
]
}
Returns:
node_ips: node ip list
device_count: number of npu per machine
"""
json_data = None
with open(rank_table_file_path) as json_file:
json_data = json.load(json_file)
node_ips = []
device_count = 0
server_list = json_data['server_list']
for server in server_list:
device_list = server['device']
device_count = len(device_list)
if os.getenv("FLAGS_MODELARTS", None):
nodes = os.getenv("DLS_TASK_NUMBER", None)
assert nodes is not None, "DLS_TASK_NUMBER didn't set!"
for node in range(int(nodes)):
node_ip = os.getenv(f"VC_CUSTOM{node}_HOSTS", None)
assert node_ip is not None, f"VC_CUSTOM{node}_HOSTS didn't set!"
node_ips.append(node_ip)
return node_ips, device_count
node_ips.append(server['server_id'])
return node_ips, device_count
def get_cloud_cluster(
rank_table_file=None, device_mode=DeviceMode.ASCEND_NPU, start_port=6070
):
"""
Args:
rank_table_file: string, ascend npu rank file path
device_mode: DeviceMode(Int)
start_port: the start port of current runtime env
"""
if rank_table_file:
# multi trainers
node_ips, device_count = _get_ascend_rankfile(rank_table_file)
if len(node_ips) == 1:
node_ip = node_ips[0]
else:
node_index = os.environ.get("PADDLE_TRAINER_ID")
node_ip = None
if node_index:
node_ip = node_ips[int(node_index)]
else:
_, node_ip = get_host_name_ip()
assert (
node_ip in node_ips
), "Can't find your local ip {{{}}} in node_ips: {{{}}}".format(
node_ip,
node_ips,
)
else:
# single trainer (single ascend card)
node_ips = ["127.0.0.1"]
node_ip = node_ips[0]
device_count = 1
devices_per_proc = [str(x) for x in range(device_count)]
free_ports = list(range(start_port, start_port + len(devices_per_proc)))
trainer_endpoints = []
for ip in node_ips:
trainer_endpoints.append(["%s:%d" % (ip, port) for port in free_ports])
return get_cluster(
node_ips, node_ip, trainer_endpoints, device_mode, devices_per_proc
)
@@ -64,7 +64,7 @@ import time
 from argparse import REMAINDER, ArgumentParser
 from paddle import framework
-from paddle.distributed.fleet import ascend_utils, cloud_utils, launch_utils
+from paddle.distributed.fleet import cloud_utils, launch_utils
 from paddle.distributed.fleet.elastic import enable_elastic, launch_elastic
 from paddle.distributed.fleet.launch_utils import (
     DeviceMode,
@@ -155,16 +155,6 @@ see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/tra
     )
     base_group.add_argument("--selected_xpus", dest="xpus")
-    if framework.core.is_compiled_with_npu():
-        base_group.add_argument(
-            "--npus",
-            type=str,
-            default=None,
-            help="It's for xpu training. For example: "
-            "--npus=\"0,1,2,3\" will launch four training processes each bound to one npu.",
-        )
-        base_group.add_argument("--selected_npus", dest="npus")
     base_group.add_argument(
         "training_script",
         type=str,
@@ -407,13 +397,6 @@ def get_cluster_info(args):
             args.ips, device_mode, devices_per_proc, start_port
         )
         logger.debug(f"get cluster from cloud:{cluster}")
-    elif device_mode == DeviceMode.ASCEND_NPU:
-        # for ascend
-        cluster, pod = ascend_utils.get_cloud_cluster(
-            rank_table_file=os.getenv("RANK_TABLE_FILE", None),
-            device_mode=device_mode,
-            start_port=start_port,
-        )
     else:
         # trainers_num = 1 or not use paddlecloud ips="a,b"
         cluster, pod = get_cluster_from_args(
@@ -493,8 +476,6 @@ def infer_backend(args):
         return
     if framework.core.is_compiled_with_cuda():
         args.backend = 'nccl'
-    elif framework.core.is_compiled_with_npu():
-        args.backend = 'unknown'
     elif framework.core.is_compiled_with_xpu():
         args.backend = 'bkcl'
     else:
@@ -545,8 +526,6 @@ def which_distributed_mode(args):
     if framework.core.is_compiled_with_cuda():
         accelerators = framework.core.get_cuda_device_count()
-    elif framework.core.is_compiled_with_npu():
-        accelerators = framework.core.get_npu_device_count()
     elif framework.core.is_compiled_with_xpu():
         accelerators = framework.core.get_xpu_device_count()
     else:
@@ -578,7 +557,7 @@ def which_distributed_mode(args):
     ):
         if args.servers:
             logger.warning(
-                "Not found distinct arguments and not compiled with cuda or xpu or npu. "
+                "Not found distinct arguments and not compiled with cuda or xpu. "
                 "But found args.servers not empty, default use ps mode"
             )
             return DistributeMode.PS
@@ -586,7 +565,7 @@ def which_distributed_mode(args):
         return DistributeMode.COLLECTIVE
     else:
         logger.warning(
-            "Not found distinct arguments and compiled with cuda or xpu or npu. "
+            "Not found distinct arguments and compiled with cuda or xpu. "
            "Default use collective mode"
        )
        return DistributeMode.COLLECTIVE
......
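After this change the launcher can only infer the `nccl`, `bkcl` or `gloo` backends; `hccl` is neither inferred nor accepted. A hedged sketch mirroring the simplified `infer_backend` branch order:

```python
from paddle import framework

# Mirrors the surviving branch order above: CUDA -> nccl, XPU -> bkcl, else gloo.
if framework.core.is_compiled_with_cuda():
    backend = 'nccl'
elif framework.core.is_compiled_with_xpu():
    backend = 'bkcl'
else:
    backend = 'gloo'
print(backend)
```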
@@ -55,7 +55,6 @@ class DeviceMode:
     GPU = 1
     KUNLUN = 2
     XPU = 2
-    ASCEND_NPU = 3
     UNKNOWN = 3
@@ -299,10 +298,7 @@ def get_cluster(
     ), "current trainer_endpoints size should be greater equal than acclerators size."
     for i in range(len(devices_per_proc)):
         trainer = Trainer()
-        if (
-            device_mode == DeviceMode.GPU
-            or device_mode == DeviceMode.ASCEND_NPU
-        ):
+        if device_mode == DeviceMode.GPU:
             if isinstance(devices_per_proc[i], (list, tuple)):
                 trainer.accelerators.extend(devices_per_proc[i])
                 pod.accelerators.extend(devices_per_proc[i])
@@ -546,13 +542,6 @@ def start_local_trainers(
                 [str(g) for g in t.accelerators]
             )
-        elif (
-            len(t.accelerators) > 0 and pod.device_mode == DeviceMode.ASCEND_NPU
-        ):
-            proc_env["FLAGS_selected_npus"] = "%s" % ",".join(
-                [str(g) for g in t.accelerators]
-            )
         if len(t.accelerators) > 0:
             proc_env["FLAGS_selected_accelerators"] = "%s" % ",".join(
                 [str(g) for g in t.accelerators]
@@ -760,40 +749,6 @@ def get_xpus(xpus):
     return res_xpus
-def get_npus(npus):
-    if npus is None:
-        npus_num = framework.core.get_npu_device_count()
-        res_npus = [str(x) for x in range(0, npus_num)]
-    else:
-        npu_visible_devices = os.getenv("ASCEND_VISIBLE_DEVICES")
-        if npu_visible_devices is None or npu_visible_devices == "":
-            res_npus = [x.strip() for x in npus.split(',')]
-        else:
-            # change npus into relative values
-            # e.g. ASCEND_VISIBLE_DEVICES=4,5,6,7; args.npus=4,5,6,7;
-            # therefore npus=0,1,2,3
-            npu_visible_devices_list = npu_visible_devices.split(',')
-            for x in npus.split(','):
-                assert x in npu_visible_devices_list, (
-                    "Can't find "
-                    "your npus %s in ASCEND_VISIBLE_DEVICES[%s]."
-                    % (x, npu_visible_devices)
-                )
-            res_npus = [
-                npu_visible_devices_list.index(x.strip())
-                for x in npus.split(',')
-            ]
-            logger.info(
-                "Change selected_npus into reletive values. --ips:{} "
-                "will change into relative_ips:{} according to your "
-                "ASCEND_VISIBLE_DEVICES:{}".format(
-                    npus, res_npus, npu_visible_devices_list
-                )
-            )
-    return res_npus
 def get_device_mode(backend):
     if backend == 'heter':
         if (
@@ -808,16 +763,6 @@ def get_device_mode(backend):
     ):
         print("launch train in heter mode with XPU device.")
         return DeviceMode.XPU
-    if (
-        framework.core.is_compiled_with_npu()
-        and framework.core.get_npu_device_count() > 0
-    ):
-        print("launch train in heter mode with NPU device.")
-        return DeviceMode.ASCEND_NPU
-    if backend == 'hccl' and framework.core.get_npu_device_count() > 0:
-        print("launch train in ascend npu mode!")
-        return DeviceMode.ASCEND_NPU
     if backend == 'nccl' and framework.core.get_cuda_device_count() > 0:
         print("launch train in GPU mode!")
@@ -853,19 +798,6 @@ def get_device_proc_info(args):
             devices_per_proc = [gpus[i : i + n] for i in range(0, len(gpus), n)]
         else:
             devices_per_proc = gpus
-    elif device_mode == DeviceMode.ASCEND_NPU:
-        npus = get_npus(args.npus)
-        if args.nproc_per_node is not None:
-            assert (
-                len(npus) % int(args.nproc_per_node)
-            ) == 0, "npus' number:{} mod args.nproc_per_node:{} must == 0".format(
-                len(npus), args.nproc_per_node
-            )
-            n = int(len(npus) / int(args.nproc_per_node))
-            devices_per_proc = [npus[i : i + n] for i in range(0, len(npus), n)]
-        else:
-            devices_per_proc = npus
     elif device_mode == DeviceMode.XPU:
         xpus = get_xpus(args.xpus)
         if args.nproc_per_node is not None:
@@ -2079,12 +2011,6 @@ def check_backend(backend):
             "your paddle is not compiled with xpu but you assign 'bkcl' as backend."
         )
-    if backend == 'hccl' and not framework.core.is_compiled_with_npu():
-        raise ValueError(
-            "paddle.distributed initialize error, "
-            "your paddle is not compiled with npu but you assign 'hccl' as backend."
-        )
 def block_windows_and_macos(backend):
     if backend != 'gloo':
@@ -2106,7 +2032,4 @@ def get_backend_by_compile_flag():
     if framework.core.is_compiled_with_xpu():
         return 'bkcl'
-    if framework.core.is_compiled_with_npu():
-        return 'hccl'
     return 'gloo'
...@@ -536,7 +536,9 @@ def _parallel_linear( ...@@ -536,7 +536,9 @@ def _parallel_linear(
# NOTE: npu linear function use matmul_v2 but linear use matmul # NOTE: npu linear function use matmul_v2 but linear use matmul
linear_function = ( linear_function = (
_linear if core.is_compiled_with_npu() else paddle.nn.functional.linear _linear
if core.is_compiled_with_custom_device('npu')
else paddle.nn.functional.linear
) )
linear_out = linear_function( linear_out = linear_function(
x, x,
......
@@ -196,7 +196,7 @@ class CollectiveHelper:
                     OP_ROLE_KEY: OpRole.Forward,
                 },
             )
-        elif core.is_compiled_with_npu():
+        elif core.is_compiled_with_custom_device('npu'):
             block.append_op(
                 type='c_gen_hccl_id',
                 inputs={},
......
@@ -26,23 +26,17 @@ class PlaceType:
     CUDA = 1
     CUDA_PINNED = 2
     XPU = 3  # unsupport for now
-    NPU = 4
-    NPU_PINNED = 5
     @staticmethod
     def default_device():
         if core.is_compiled_with_cuda():
             return PlaceType.CUDA
-        elif core.is_compiled_with_npu():
-            return PlaceType.NPU
         return PlaceType.CPU
     @staticmethod
     def default_pinned():
         if core.is_compiled_with_cuda():
             return PlaceType.CUDA_PINNED
-        elif core.is_compiled_with_npu():
-            return PlaceType.NPU_PINNED
         return PlaceType.CPU
......
@@ -596,7 +596,7 @@ class ShardingOptimizer(MetaOptimizerBase):
             rings = [self.mp_ring_id, self.pp_ring_id]
             # FIXME(wangxi): some problem with NPU found_finite, need sync with DP
-            if core.is_compiled_with_npu():
+            if core.is_compiled_with_custom_device('npu'):
                 rings += [self.dp_ring_id]
             FP16Utils.sync_amp_check_nan_inf(main_block, rings)
@@ -721,7 +721,7 @@ class ShardingOptimizer(MetaOptimizerBase):
         self._dump_program_for_debug()
         # GPU need to wait server ready, GPU and NPU is Layered connection
-        if not core.is_compiled_with_npu():
+        if not core.is_compiled_with_custom_device('npu'):
             self._wait()
         return optimize_ops, params_grads
@@ -839,7 +839,7 @@ class ShardingOptimizer(MetaOptimizerBase):
             sync=False,
         )
-        if core.is_compiled_with_npu():
+        if core.is_compiled_with_custom_device('npu'):
             self._init_npu_pipeline_comm(startup_block)
             return
......
@@ -78,7 +78,7 @@ class InternalStorage:
         if self._device != device:
             tmp_buffer = (
                 cvt_to_device(self.buffer, self.dev_id)
-                if device in ["gpu", "xpu", "npu"]
+                if device in ["gpu", "xpu"]
                 else self.buffer.cpu()
             )
             for param in self._params:
......
@@ -200,8 +200,9 @@ def device_guard(dev_id=0, device="cpu"):
     origin_device = paddle.device.get_device()
     if device == "cpu":
         paddle.set_device(device)
-    elif device in ["gpu", "xpu", "npu"]:
+    elif device in ["gpu", "xpu"]:
         paddle.set_device(f"{device}:{dev_id}")
     try:
         yield
     finally:
@@ -313,8 +314,6 @@ def cvt_to_device(x, dev_id, blocking=True):
     """
     if paddle.is_compiled_with_cuda():
         place = paddle.CUDAPlace(dev_id)
-    elif paddle.is_compiled_with_npu():
-        place = paddle.NPUPlace(dev_id)
     elif paddle.is_compiled_with_xpu():
         place = paddle.XPUPlace(dev_id)
     else:
......
@@ -201,11 +201,9 @@ class HybridParallelInferenceHelper:
         assert isinstance(main_program, Program)
         self._device = None
-        if core.is_compiled_with_npu():
-            self._device = "npu"
-        elif core.is_compiled_with_cuda():
+        if core.is_compiled_with_cuda():
             self._device = "gpu"
-        assert self._device, "Only gpu and npu are supported."
+        assert self._device, "Only gpu are supported."
         assert not in_dygraph_mode(), "Only static graph mode is supported."
......
@@ -24,7 +24,6 @@ class DeviceType:
     CPU = 'cpu'
     GPU = 'gpu'
     XPU = 'xpu'
-    NPU = 'npu'
     IPU = 'ipu'
     CUSTOM_DEVICE = 'custom_device'
@@ -68,8 +67,6 @@ class Device:
             return 'FLAGS_selected_cpus'
         if self._dtype == DeviceType.GPU:
            return 'FLAGS_selected_gpus'
-        if self._dtype == DeviceType.NPU:
-            return 'FLAGS_selected_npus'
         if self._dtype == DeviceType.XPU:
             return 'FLAGS_selected_xpus'
         if self._dtype == DeviceType.IPU:
@@ -111,9 +108,6 @@ class Device:
         elif 'XPU_VISIBLE_DEVICES' in os.environ:
             dev._dtype = DeviceType.XPU
             visible_devices = os.getenv("XPU_VISIBLE_DEVICES")
-        elif 'ASCEND_VISIBLE_DEVICES' in os.environ:
-            dev._dtype = DeviceType.NPU
-            visible_devices = os.getenv("ASCEND_VISIBLE_DEVICES")
         if visible_devices is not None and visible_devices != 'all':
             dev._labels = visible_devices.split(',')
@@ -152,10 +146,6 @@ class Device:
             dev._dtype = DeviceType.XPU
             num = core.get_xpu_device_count()
             visible_devices = os.getenv("XPU_VISIBLE_DEVICES")
-        elif core.is_compiled_with_npu():
-            dev._dtype = DeviceType.NPU
-            num = core.get_npu_device_count()
-            visible_devices = os.getenv("ASCEND_VISIBLE_DEVICES")
         elif core.is_compiled_with_ipu():
             dev._dtype = DeviceType.IPU
             num = core.get_ipu_device_count()
......
@@ -721,9 +721,6 @@ class ParallelEnv:
         elif core.is_compiled_with_xpu():
             selected_xpus = os.getenv("FLAGS_selected_xpus", "0").split(",")
             self._device_id = int(selected_xpus[0])
-        elif core.is_compiled_with_npu():
-            selected_npus = os.getenv("FLAGS_selected_npus", "0").split(",")
-            self._device_id = int(selected_npus[0])
         self._trainer_endpoints = os.getenv(
             "PADDLE_TRAINER_ENDPOINTS", ""
@@ -889,12 +886,8 @@ def _start_kv_server(port, http_server_d, size):
 def _is_cpuonly(backend):
     check_backend(backend)
     if (
-        backend in ['auto', 'nccl', 'bkcl', 'hccl', 'heter', 'cncl']
-        and (
-            core.is_compiled_with_cuda()
-            or core.is_compiled_with_xpu()
-            or core.is_compiled_with_npu()
-        )
+        backend in ['auto', 'nccl', 'bkcl', 'heter', 'cncl']
+        and (core.is_compiled_with_cuda() or core.is_compiled_with_xpu())
     ) or backend == 'xccl':
         # passes 'auto' and can use cuda or xpu, use the default logics. so return False
@@ -994,7 +987,6 @@ def init_parallel_env():
         is_cpu_only
         or core.is_compiled_with_cuda()
         or core.is_compiled_with_xpu()
-        or core.is_compiled_with_npu()
         or backend == "xccl"
     ):
         raise NotImplementedError(
@@ -1013,9 +1005,6 @@ def init_parallel_env():
     elif not is_cpu_only and core.is_compiled_with_xpu():
         _check_var_exists('FLAGS_selected_xpus')
         backend = "bkcl" if backend == "auto" else backend
-    elif not is_cpu_only and core.is_compiled_with_npu():
-        _check_var_exists('FLAGS_selected_npus')
-        backend = "hccl" if backend == "auto" else backend
     _check_var_exists("PADDLE_TRAINER_ID")
     _check_var_exists("PADDLE_CURRENT_ENDPOINT")
@@ -1038,9 +1027,6 @@ def init_parallel_env():
             place = core.CUDAPlace(parallel_env.device_id)
         elif core.is_compiled_with_xpu():
             place = core.XPUPlace(parallel_env.device_id)
-        elif core.is_compiled_with_npu():
-            place = core.NPUPlace(parallel_env.device_id)
         _set_expected_place(place)
         group = None
@@ -1136,7 +1122,7 @@ def init_parallel_env():
         strategy.current_endpoint = parallel_env.current_endpoint
         strategy.nrings = parallel_env.nrings
-        # init nccl or hccl or bkcl or heter context
+        # init nccl or bkcl or heter context
         if is_cpu_only:
             parallel_helper._set_parallel_ctx(
                 core.GLOOParallelContext(strategy, place)
@@ -1153,10 +1139,7 @@ def init_parallel_env():
             parallel_helper._set_parallel_ctx(
                 core.BKCLParallelContext(strategy, place)
             )
-        elif core.is_compiled_with_npu():
-            parallel_helper._set_parallel_ctx(
-                core.HCCLParallelContext(strategy, place)
-            )
         if backend != "heter":
             other_endpoints = strategy.trainer_endpoints[:]
             other_endpoints.remove(strategy.current_endpoint)
......
@@ -133,37 +133,8 @@ class Collective:
         wait_server_ready(other_endpoints)
         block = program.global_block()
-        if core.is_compiled_with_npu():
-            hccl_id_var = block.create_var(
-                name=unique_name.generate('hccl_id'),
-                persistable=True,
-                type=core.VarDesc.VarType.RAW,
-            )
-            endpoint_to_index_map = {e: idx for idx, e in enumerate(endpoints)}
-            block.append_op(
-                type='c_gen_hccl_id',
-                inputs={},
-                outputs={'Out': hccl_id_var},
-                attrs={
-                    'rank': rank,
-                    'endpoint': current_endpoint,
-                    'other_endpoints': other_endpoints,
-                    self.op_role_key: OpRole.Forward,
-                },
-            )
-            block.append_op(
-                type='c_comm_init_hccl',
-                inputs={'X': hccl_id_var},
-                outputs={},
-                attrs={
-                    'rank': rank,
-                    'ring_id': ring_id,
-                    'device_id': int(os.getenv("FLAGS_selected_npus")),
-                    'rank_ids': nranks,
-                    self.op_role_key: OpRole.Forward,
-                },
-            )
-        elif core.is_compiled_with_xpu():
+        if core.is_compiled_with_xpu():
             bkcl_id_var = block.create_var(
                 name=unique_name.generate('bkcl_id'),
                 persistable=True,
......
@@ -131,37 +131,7 @@ class Collective:
         wait_server_ready(other_endpoints)
         block = program.global_block()
-        if core.is_compiled_with_npu():
-            hccl_id_var = block.create_var(
-                name=unique_name.generate('hccl_id'),
-                persistable=True,
-                type=core.VarDesc.VarType.RAW,
-            )
-            endpoint_to_index_map = {e: idx for idx, e in enumerate(endpoints)}
-            block.append_op(
-                type='c_gen_hccl_id',
-                inputs={},
-                outputs={'Out': hccl_id_var},
-                attrs={
-                    'rank': rank,
-                    'endpoint': current_endpoint,
-                    'other_endpoints': other_endpoints,
-                    self.op_role_key: OpRole.Forward,
-                },
-            )
-            block.append_op(
-                type='c_comm_init_hccl',
-                inputs={'X': hccl_id_var},
-                outputs={},
-                attrs={
-                    'rank': rank,
-                    'ring_id': ring_id,
-                    'device_id': int(os.getenv("FLAGS_selected_npus")),
-                    'rank_ids': nranks,
-                    self.op_role_key: OpRole.Forward,
-                },
-            )
-        elif core.is_compiled_with_cuda():
+        if core.is_compiled_with_cuda():
             nccl_id_var = block.create_var(
                 name=unique_name.generate('nccl_id'),
                 persistable=True,
......
@@ -71,7 +71,6 @@ from .core import (
     XPUPlace,
     CUDAPlace,
     CUDAPinnedPlace,
-    NPUPlace,
     IPUPlace,
     MLUPlace,
     CustomPlace,
@@ -127,7 +126,6 @@ __all__ = (
     'XPUPlace',
     'CUDAPlace',
     'CUDAPinnedPlace',
-    'NPUPlace',
     'IPUPlace',
     'MLUPlace',
     'Tensor',
@@ -220,10 +218,6 @@ monkey_patch_variable()
 __bootstrap__()
 monkey_patch_varbase()
-# NOTE(zhiqiu): register npu_finalize on the exit of Python,
-# do some clean up manually.
-if core.is_compiled_with_npu():
-    atexit.register(core.npu_finalize)
 # NOTE(Aurelius84): clean up ExecutorCacheInfo in advance manually.
 atexit.register(core.clear_executor_cache)
......
@@ -654,8 +654,6 @@ class Section(DeviceWorker):
         place_id = pipeline_opt["place_id"]
         if core.is_compiled_with_cuda():
             assert isinstance(place, core.CUDAPlace)
-        elif core.is_compiled_with_npu():
-            assert isinstance(place, core.NPUPlace)
         cfg.place = cfg.CUDAPlace
         cfg.place_id = place_id
......
@@ -306,7 +306,7 @@ def monkey_patch_varbase():
             if _grad_scalar:
                 # When using amp with Fleet DistributedStrategy, we do loss scaling implicitly.
                 self = _grad_scalar.scale(self)
-            if paddle.is_compiled_with_xpu() or paddle.is_compiled_with_npu():
+            if paddle.is_compiled_with_xpu():
                 # TODO(liuyuhui): Currently only for xpu. Will be removed in the future.
                 scaled_loss = scale_loss(self)
                 if framework.global_var._in_eager_mode_:
......
@@ -2107,7 +2107,7 @@ class Executor:
             for var in program.global_block().vars.values():
                 if var.is_data:
                     data_vars.append(var)
-            if core.is_compiled_with_npu():
+            if core.is_compiled_with_custom_device('npu'):
                 dataset = paddle.fluid.DatasetFactory().create_dataset(
                     'InMemoryDataset'
                 )
@@ -2284,7 +2284,7 @@ class Executor:
             for var in program.global_block().vars.values():
                 if var.is_data:
                     data_vars.append(var)
-            if core.is_compiled_with_npu():
+            if core.is_compiled_with_custom_device('npu'):
                 dataset = paddle.fluid.DatasetFactory().create_dataset(
                     'InMemoryDataset'
                 )
......
@@ -58,7 +58,6 @@ __all__ = [
     'is_compiled_with_cuda',
     'is_compiled_with_rocm',
     'is_compiled_with_xpu',
-    'is_compiled_with_npu',
     'Variable',
     'require_version',
     'device_guard',
@@ -224,7 +223,7 @@ def _in_eager_without_dygraph_check():
     return global_var._in_eager_mode_
-# FIXME(dev): We haven't fully verified eager mode on XPU/NPU et.al but
+# FIXME(dev): We haven't fully verified eager mode on XPU et.al but
 # only GPU/CPU. Remove this after we improve this feature.
 _is_first_import_ = True
@@ -715,15 +714,6 @@ def _xpu_ids():
     return device_ids
-def _npu_ids():
-    npus_env = os.getenv("FLAGS_selected_npus")
-    if npus_env:
-        device_ids = [int(s) for s in npus_env.split(",")]
-    else:
-        device_ids = range(core.get_npu_device_count())
-    return device_ids
 def _custom_device_ids(device_type):
     custom_devices_env = os.getenv("FLAGS_selected_" + device_type + "s")
     if custom_devices_env:
@@ -748,21 +738,6 @@ def is_compiled_with_xpu():
     return core.is_compiled_with_xpu()
-def is_compiled_with_npu():
-    """
-    Whether this whl package can be used to run the model on NPU.
-    Returns (bool): support npu or not.
-    Examples:
-        .. code-block:: python
-            import paddle.fluid as fluid
-            support_npu = fluid.is_compiled_with_npu()
-    """
-    return core.is_compiled_with_npu()
 def disable_signal_handler():
     """
     Reset signal handler registered by Paddle.
@@ -921,47 +896,6 @@ def xpu_places(device_ids=None):
     return [core.XPUPlace(dev_id) for dev_id in device_ids]
-def npu_places(device_ids=None):
-    """
-    Note:
-        For multi-card tasks, please use `FLAGS_selected_npus` environment variable to set the visible NPU device.
-    This function creates a list of :code:`paddle.NPUPlace` objects.
-    If :code:`device_ids` is None, environment variable of
-    :code:`FLAGS_selected_npus` would be checked first. For example, if
-    :code:`FLAGS_selected_npus=0,1,2`, the returned list would
-    be [paddle.NPUPlace(0), paddle.NPUPlace(1), paddle.NPUPlace(2)].
-    If :code:`FLAGS_selected_npus` is not set, all visible
-    npu places would be returned.
-    If :code:`device_ids` is not None, it should be the device
-    ids of NPUs. For example, if :code:`device_ids=[0,1,2]`,
-    the returned list would be
-    [paddle.NPUPlace(0), paddle.NPUPlace(1), paddle.NPUPlace(2)].
-    Parameters:
-        device_ids (list or tuple of int, optional): list of NPU device ids.
-    Returns:
-        list of paddle.NPUPlace: Created NPU place list.
-    Examples:
-        .. code-block:: python
-            # required: npu
-            import paddle
-            import paddle.static as static
-            paddle.enable_static()
-            npu_places = static.npu_places()
-    """
-    assert core.is_compiled_with_npu(), "Not compiled with NPU"
-    if device_ids is None:
-        device_ids = _npu_ids()
-    elif not isinstance(device_ids, (list, tuple)):
-        device_ids = [device_ids]
-    return [core.NPUPlace(dev_id) for dev_id in device_ids]
 def cpu_places(device_count=None):
     """
     This function creates a list of :code:`paddle.CPUPlace` objects, and returns the created list.
@@ -2587,10 +2521,6 @@ class Variable(metaclass=VariableMetaClass):
             p = core.Place()
             p.set_place(t._place())
             place = core.XPUPlace(p.xpu_device_id())
-        elif p.is_npu_place():
-            p = core.Place()
-            p.set_place(t._place())
-            place = core.NPUPlace(p.npu_device_id())
         else:
             p = core.Place()
             p.set_place(t._place())
@@ -7520,9 +7450,9 @@ def device_guard(device=None):
         device, index = device.split(':')
         if device == 'cpu':
             raise ValueError("Should not set device id for cpu.")
-    if device not in ['cpu', 'gpu', 'npu', 'xpu', '', None]:
+    if device not in ['cpu', 'gpu', 'xpu', '', None]:
         raise ValueError(
-            "The Attr(device) should be 'cpu' 'npu' 'xpu' or 'gpu', and it can also be empty string or None "
+            "The Attr(device) should be 'cpu' 'npu' or 'gpu', and it can also be empty string or None "
             "when there is no need to specify device. But received %s" % device
         )
     if index:
@@ -7651,7 +7581,6 @@ def _get_paddle_place(place):
             core.CPUPlace,
             core.CUDAPinnedPlace,
             core.CUDAPlace,
-            core.NPUPlace,
             core.IPUPlace,
             core.CustomPlace,
         ),
@@ -7701,19 +7630,6 @@ def _get_paddle_place(place):
         device_id = int(device_id)
         return core.XPUPlace(device_id)
-    # NPU
-    avaliable_npu_place = re.match(r'npu:\d+', place)
-    if avaliable_npu_place:
-        if not core.is_compiled_with_npu():
-            raise ValueError(
-                "The device should not be {}, since PaddlePaddle is "
-                "not compiled with NPU".format(avaliable_npu_place.group())
-            )
-        place_info_list = place.split(':', 1)
-        device_id = place_info_list[1]
-        device_id = int(device_id)
-        return core.NPUPlace(device_id)
     # IPU
     avaliable_ipu_place = re.match(r'ipu:\d+', place)
     if avaliable_ipu_place:
@@ -7728,9 +7644,7 @@ def _get_paddle_place(place):
         return core.IPUPlace(device_id)
     raise ValueError(
-        "Paddle supports CPUPlace, CUDAPlace,CUDAPinnedPlace, XPUPlace, IPUPlace and NPUPlace, but received {}.".format(
-            place
-        )
+        f"Paddle supports CPUPlace, CUDAPlace, CUDAPinnedPlace, XPUPlace and IPUPlace, but received {place}."
     )
......
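`_get_paddle_place` no longer parses `'npu:x'` strings, so code that previously passed `'npu:0'` to fluid APIs should construct the place explicitly. A hedged sketch (the plugin name 'npu' is an assumption; the CPU fallback keeps it runnable on any build):

```python
import paddle
from paddle.fluid import core

paddle.enable_static()
# Build the place directly instead of relying on the removed 'npu:0' parsing.
if core.is_compiled_with_custom_device('npu'):
    place = paddle.CustomPlace('npu', 0)
else:
    place = paddle.CPUPlace()
exe = paddle.static.Executor(place)
```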
@@ -4553,7 +4553,7 @@ class PipelineOptimizer:
     def __init__(self, optimizer, num_microbatches=1, start_cpu_core_id=0):
         self._device = 'cpu'
-        if core.is_compiled_with_npu():
+        if core.is_compiled_with_custom_device('npu'):
             self._device = "npu"
         elif core.is_compiled_with_cuda():
             self._device = "gpu"
@@ -5770,7 +5770,7 @@ class PipelineOptimizer:
                 # If there are some not initialized sections in the fused var,
                 # and the value in those sections are nan/inf, it will trigger the nan/inf check.
                 # To avoid these problematic triggers, set constant is needed for npu
-                "set_constant": core.is_compiled_with_npu(),
+                "set_constant": core.is_compiled_with_custom_device('npu'),
                 "constant": float(0.0),
             },
         )
@@ -6387,8 +6387,8 @@ class PipelineOptimizer:
             dev_index = int(dev.split(":")[1])
             if core.is_compiled_with_cuda():
                 place_list.append(core.CUDAPlace(dev_index % 1))
-            elif core.is_compiled_with_npu():
-                place_list.append(core.NPUPlace(dev_index % 1))
+            elif paddle.is_compiled_with_custom_device('npu'):
+                place_list.append(paddle.CustomPlace('npu', dev_index % 1))
         # Step6: Split startup program
         new_startup_program = self._split_startup_program(
@@ -6411,7 +6411,7 @@ class PipelineOptimizer:
         if core.is_compiled_with_cuda():
             place_id = int(os.getenv("FLAGS_selected_gpus", "0"))
-        elif core.is_compiled_with_npu():
+        elif core.is_compiled_with_custom_device('npu'):
             place_id = int(os.getenv("FLAGS_selected_npus", "0"))
         # A pass to move the recv op to the beginning of
         # the forward/backward phase
......
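The replacement pattern used throughout PipelineOptimizer above, pulled out as a standalone sketch (the helper name is illustrative, not an actual method): compile-time NPU checks become custom-device queries, and core.NPUPlace becomes paddle.CustomPlace('npu', ...).

import os

import paddle
from paddle.fluid import core


def pick_pipeline_place():
    # GPU builds keep their behaviour; NPU is now reached through the
    # plug-in custom-device API instead of a dedicated compile flag.
    if core.is_compiled_with_cuda():
        device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
        return core.CUDAPlace(device_id)
    if core.is_compiled_with_custom_device('npu'):
        device_id = int(os.getenv("FLAGS_selected_npus", "0"))
        return paddle.CustomPlace('npu', device_id)
    return core.CPUPlace()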
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
def train(prefix):
selected_accelerators = os.getenv("FLAGS_selected_accelerators")
selected_npus = os.getenv("FLAGS_selected_npus")
trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
worker_endpoints = worker_endpoints_env
trainers_num = len(worker_endpoints.split(','))
device_ids = os.getenv("PADDLE_WORLD_DEVICE_IDS")
current_device_id = os.getenv("PADDLE_LOCAL_DEVICE_IDS")
details = "selected_accelerators:{} selected_npus:{} worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{} device_ids:{} device_id:{}".format(
selected_accelerators,
selected_npus,
worker_endpoints,
trainers_num,
current_endpoint,
trainer_id,
device_ids,
current_device_id,
)
print(details)
with open(f"multi_process_{prefix}.check_{trainer_id}.log", "w") as f:
f.write(details)
if __name__ == '__main__':
prefix = sys.argv[1]
train(prefix)
...@@ -37,9 +37,7 @@ class TestCEmbeddingCPU(OpTest): ...@@ -37,9 +37,7 @@ class TestCEmbeddingCPU(OpTest):
def setUp(self): def setUp(self):
self.init_dtype() self.init_dtype()
self.initcase() self.initcase()
if core.is_compiled_with_npu(): if core.is_compiled_with_xpu():
self.__class__.use_npu = True
elif core.is_compiled_with_xpu():
self.__class__.use_xpu = True self.__class__.use_xpu = True
elif core.is_compiled_with_cuda(): elif core.is_compiled_with_cuda():
self.__class__.exist_fp64_check_grad = True self.__class__.exist_fp64_check_grad = True
...@@ -57,9 +55,7 @@ class TestCEmbeddingCPU(OpTest): ...@@ -57,9 +55,7 @@ class TestCEmbeddingCPU(OpTest):
np_out = get_c_embedding(self.start_index, self.end_index, table, ids) np_out = get_c_embedding(self.start_index, self.end_index, table, ids)
self.outputs = {'Out': np_out.reshape((2, 4, 64))} self.outputs = {'Out': np_out.reshape((2, 4, 64))}
self.attrs = {'start_index': self.start_index} self.attrs = {'start_index': self.start_index}
if core.is_compiled_with_npu(): if core.is_compiled_with_xpu():
self.__class__.use_npu = True
elif core.is_compiled_with_xpu():
self.__class__.use_xpu = True self.__class__.use_xpu = True
def test_check_cpu(self): def test_check_cpu(self):
...@@ -81,16 +77,12 @@ class TestCEmbeddingOpBase(TestCEmbeddingCPU): ...@@ -81,16 +77,12 @@ class TestCEmbeddingOpBase(TestCEmbeddingCPU):
def test_check_output(self): def test_check_output(self):
if core.is_compiled_with_cuda(): if core.is_compiled_with_cuda():
self.check_output_with_place(core.CUDAPlace(0)) self.check_output_with_place(core.CUDAPlace(0))
elif core.is_compiled_with_npu():
self.check_output_with_place(core.NPUPlace(0))
elif core.is_compiled_with_xpu(): elif core.is_compiled_with_xpu():
self.check_output_with_place(core.XPUPlace(0)) self.check_output_with_place(core.XPUPlace(0))
def test_check_grad(self): def test_check_grad(self):
if core.is_compiled_with_cuda(): if core.is_compiled_with_cuda():
self.check_grad_with_place(core.CUDAPlace(0), ['W'], 'Out') self.check_grad_with_place(core.CUDAPlace(0), ['W'], 'Out')
elif core.is_compiled_with_npu():
self.check_grad_with_place(core.NPUPlace(0), ['W'], 'Out')
elif core.is_compiled_with_xpu(): elif core.is_compiled_with_xpu():
self.check_grad_with_place(core.XPUPlace(0), ['W'], 'Out') self.check_grad_with_place(core.XPUPlace(0), ['W'], 'Out')
...@@ -98,9 +90,6 @@ class TestCEmbeddingOpBase(TestCEmbeddingCPU): ...@@ -98,9 +90,6 @@ class TestCEmbeddingOpBase(TestCEmbeddingCPU):
if core.is_compiled_with_cuda(): if core.is_compiled_with_cuda():
self.dtype = "float64" self.dtype = "float64"
self.ids_dtype = "int64" self.ids_dtype = "int64"
elif core.is_compiled_with_npu():
self.dtype = "float32"
self.ids_dtype = "int32"
elif core.is_compiled_with_xpu(): elif core.is_compiled_with_xpu():
self.dtype = "float32" self.dtype = "float32"
self.ids_dtype = "int64" self.ids_dtype = "int64"
...@@ -129,9 +118,7 @@ class TestCEmbeddingOpFP32(TestCEmbeddingOpBase): ...@@ -129,9 +118,7 @@ class TestCEmbeddingOpFP32(TestCEmbeddingOpBase):
self.outputs = {'Out': np_out.reshape((2, 4, 64))} self.outputs = {'Out': np_out.reshape((2, 4, 64))}
self.attrs = {'start_index': self.start_index} self.attrs = {'start_index': self.start_index}
if core.is_compiled_with_npu(): if core.is_compiled_with_xpu():
self.__class__.use_npu = True
elif core.is_compiled_with_xpu():
self.__class__.use_xpu = True self.__class__.use_xpu = True
elif core.is_compiled_with_cuda(): elif core.is_compiled_with_cuda():
self.__class__.exist_fp64_check_grad = True self.__class__.exist_fp64_check_grad = True
......
...@@ -24,6 +24,16 @@ from copy import copy ...@@ -24,6 +24,16 @@ from copy import copy
import numpy as np import numpy as np
from op import Operator from op import Operator
from prim_op_test import OpTestUtils, PrimForwardChecker, PrimGradChecker
from testsuite import append_input_output, append_loss_ops, create_op, set_input
from white_list import (
check_shape_white_list,
compile_vs_runtime_white_list,
no_check_set_white_list,
no_grad_set_white_list,
op_accuracy_white_list,
op_threshold_white_list,
)
import paddle import paddle
from paddle import fluid from paddle import fluid
...@@ -36,20 +46,9 @@ from paddle.fluid.framework import ( ...@@ -36,20 +46,9 @@ from paddle.fluid.framework import (
_current_expected_place, _current_expected_place,
canonicalize_attrs, canonicalize_attrs,
) )
from paddle.fluid.wrapped_decorator import signature_safe_contextmanager
sys.path.append(os.path.abspath(os.path.dirname(__file__))) sys.path.append(os.path.abspath(os.path.dirname(__file__)))
from prim_op_test import OpTestUtils, PrimForwardChecker, PrimGradChecker
from testsuite import append_input_output, append_loss_ops, create_op, set_input
from white_list import (
check_shape_white_list,
compile_vs_runtime_white_list,
no_check_set_white_list,
no_grad_set_white_list,
op_accuracy_white_list,
op_threshold_white_list,
)
from paddle.fluid.wrapped_decorator import signature_safe_contextmanager
@signature_safe_contextmanager @signature_safe_contextmanager
...@@ -338,10 +337,7 @@ class OpTest(unittest.TestCase): ...@@ -338,10 +337,7 @@ class OpTest(unittest.TestCase):
np.random.seed(123) np.random.seed(123)
random.seed(124) random.seed(124)
if paddle.is_compiled_with_npu(): cls._use_system_allocator = _set_use_system_allocator(True)
cls._use_system_allocator = _set_use_system_allocator(False)
else:
cls._use_system_allocator = _set_use_system_allocator(True)
@classmethod @classmethod
def tearDownClass(cls): def tearDownClass(cls):
...@@ -376,9 +372,6 @@ class OpTest(unittest.TestCase): ...@@ -376,9 +372,6 @@ class OpTest(unittest.TestCase):
def is_rocm_op_test(): def is_rocm_op_test():
return core.is_compiled_with_rocm() return core.is_compiled_with_rocm()
def is_npu_op_test():
return hasattr(cls, "use_npu") and cls.use_npu
def is_custom_device_op_test(): def is_custom_device_op_test():
return hasattr(cls, "use_custom_device") and cls.use_custom_device return hasattr(cls, "use_custom_device") and cls.use_custom_device
...@@ -411,7 +404,6 @@ class OpTest(unittest.TestCase): ...@@ -411,7 +404,6 @@ class OpTest(unittest.TestCase):
and not is_xpu_op_test() and not is_xpu_op_test()
and not is_mkldnn_op_test() and not is_mkldnn_op_test()
and not is_rocm_op_test() and not is_rocm_op_test()
and not is_npu_op_test()
and not is_custom_device_op_test() and not is_custom_device_op_test()
and not cls.check_prim and not cls.check_prim
): ):
...@@ -1965,10 +1957,8 @@ class OpTest(unittest.TestCase): ...@@ -1965,10 +1957,8 @@ class OpTest(unittest.TestCase):
# Check inplace for given op, its grad op, its grad_grad op, etc. # Check inplace for given op, its grad op, its grad_grad op, etc.
# No effect on original OpTest # No effect on original OpTest
# Currently not support ParallelExecutor on XPUPlace. # Currently not support ParallelExecutor on XPUPlace.
if ( if not paddle.is_compiled_with_xpu() and not isinstance(
not paddle.is_compiled_with_xpu() place, core.CustomPlace
and not paddle.is_compiled_with_npu()
and not isinstance(place, core.CustomPlace)
): ):
self.check_inplace_output_with_place( self.check_inplace_output_with_place(
place, no_check_set=no_check_set, inplace_atol=inplace_atol place, no_check_set=no_check_set, inplace_atol=inplace_atol
......
...@@ -59,7 +59,6 @@ class TestMaxMemoryAllocated(unittest.TestCase): ...@@ -59,7 +59,6 @@ class TestMaxMemoryAllocated(unittest.TestCase):
-2, -2,
0.5, 0.5,
"gpu1", "gpu1",
"npu",
] ]
for device in wrong_device: for device in wrong_device:
with self.assertRaises(BaseException): with self.assertRaises(BaseException):
......
...@@ -59,7 +59,6 @@ class TestMaxMemoryreserved(unittest.TestCase): ...@@ -59,7 +59,6 @@ class TestMaxMemoryreserved(unittest.TestCase):
-2, -2,
0.5, 0.5,
"gpu1", "gpu1",
"npu",
] ]
for device in wrong_device: for device in wrong_device:
with self.assertRaises(BaseException): with self.assertRaises(BaseException):
......
...@@ -44,7 +44,6 @@ class TestMemoryAllocated(unittest.TestCase): ...@@ -44,7 +44,6 @@ class TestMemoryAllocated(unittest.TestCase):
-2, -2,
0.5, 0.5,
"gpu1", "gpu1",
"npu",
] ]
for device in wrong_device: for device in wrong_device:
with self.assertRaises(BaseException): with self.assertRaises(BaseException):
......
...@@ -44,7 +44,6 @@ class TestMemoryreserved(unittest.TestCase): ...@@ -44,7 +44,6 @@ class TestMemoryreserved(unittest.TestCase):
-2, -2,
0.5, 0.5,
"gpu1", "gpu1",
"npu",
] ]
for device in wrong_device: for device in wrong_device:
with self.assertRaises(BaseException): with self.assertRaises(BaseException):
......
...@@ -46,10 +46,6 @@ class TestStaticDeviceManage(unittest.TestCase): ...@@ -46,10 +46,6 @@ class TestStaticDeviceManage(unittest.TestCase):
if core.is_compiled_with_xpu(): if core.is_compiled_with_xpu():
self._test_device("xpu:0", core.XPUPlace) self._test_device("xpu:0", core.XPUPlace)
def test_npu_device(self):
if core.is_compiled_with_npu():
self._test_device("npu:0", core.NPUPlace)
class TestImperativeDeviceManage(unittest.TestCase): class TestImperativeDeviceManage(unittest.TestCase):
def test_cpu(self): def test_cpu(self):
...@@ -95,25 +91,6 @@ class TestImperativeDeviceManage(unittest.TestCase): ...@@ -95,25 +91,6 @@ class TestImperativeDeviceManage(unittest.TestCase):
self.assertTrue(out.place.is_xpu_place()) self.assertTrue(out.place.is_xpu_place())
self.assertEqual(device, "xpu:0") self.assertEqual(device, "xpu:0")
def test_npu(self):
if core.is_compiled_with_npu():
with fluid.dygraph.guard():
paddle.set_device('npu:0')
out1 = paddle.zeros(shape=[1, 3], dtype='float32')
out2 = paddle.ones(shape=[1, 3], dtype='float32')
out3 = paddle.concat(x=[out1, out2], axis=0)
device = paddle.get_device()
self.assertEqual(
isinstance(
framework._current_expected_place(), core.NPUPlace
),
True,
)
self.assertTrue(out1.place.is_npu_place())
self.assertTrue(out2.place.is_npu_place())
self.assertTrue(out3.place.is_npu_place())
self.assertEqual(device, "npu:0")
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -17,11 +17,13 @@ import ast ...@@ -17,11 +17,13 @@ import ast
import os import os
import pickle import pickle
import random import random
import socket
import subprocess import subprocess
import sys import sys
import tempfile import tempfile
import time import time
import unittest import unittest
from contextlib import closing
import numpy as np import numpy as np
...@@ -684,9 +686,6 @@ class TestParallelDyGraphRunnerBase: ...@@ -684,9 +686,6 @@ class TestParallelDyGraphRunnerBase:
elif fluid.core.is_compiled_with_xpu(): elif fluid.core.is_compiled_with_xpu():
device_id = int(os.getenv("FLAGS_selected_xpus", "0")) device_id = int(os.getenv("FLAGS_selected_xpus", "0"))
place = fluid.XPUPlace(device_id) place = fluid.XPUPlace(device_id)
elif fluid.core.is_compiled_with_npu():
device_id = int(os.getenv("FLAGS_selected_npus", "0"))
place = fluid.NPUPlace(device_id)
else: else:
assert "Only support CUDAPlace or XPUPlace or CPU(Gloo) for now." assert "Only support CUDAPlace or XPUPlace or CPU(Gloo) for now."
...@@ -888,7 +887,6 @@ def runtime_main(test_class): ...@@ -888,7 +887,6 @@ def runtime_main(test_class):
parser.add_argument('--use_cpu', action='store_true') parser.add_argument('--use_cpu', action='store_true')
parser.add_argument('--use_xpu', action='store_true') parser.add_argument('--use_xpu', action='store_true')
parser.add_argument('--use_dgc', action='store_true') parser.add_argument('--use_dgc', action='store_true')
parser.add_argument('--use_npu', action='store_true')
parser.add_argument('--accumulate_gradient', action='store_true') parser.add_argument('--accumulate_gradient', action='store_true')
parser.add_argument('--find_unused_parameters', action='store_true') parser.add_argument('--find_unused_parameters', action='store_true')
parser.add_argument('--use_reduce', action='store_true') parser.add_argument('--use_reduce', action='store_true')
...@@ -932,10 +930,6 @@ def runtime_main(test_class): ...@@ -932,10 +930,6 @@ def runtime_main(test_class):
model.run_trainer(args) model.run_trainer(args)
import socket
from contextlib import closing
class TestDistBase(unittest.TestCase): class TestDistBase(unittest.TestCase):
def _setup_config(self): def _setup_config(self):
raise NotImplementedError("tests should have _setup_config implemented") raise NotImplementedError("tests should have _setup_config implemented")
...@@ -945,21 +939,13 @@ class TestDistBase(unittest.TestCase): ...@@ -945,21 +939,13 @@ class TestDistBase(unittest.TestCase):
self.__use_cuda = False self.__use_cuda = False
self.__use_xpu = False self.__use_xpu = False
self._use_dgc = False self._use_dgc = False
self.__use_npu = False
elif self._enforce_place == "GPU": elif self._enforce_place == "GPU":
self.__use_cuda = True self.__use_cuda = True
self.__use_xpu = False self.__use_xpu = False
self.__use_npu = False
elif self._enforce_place == "XPU": elif self._enforce_place == "XPU":
self.__use_cuda = False self.__use_cuda = False
self.__use_xpu = True self.__use_xpu = True
self._use_dgc = False self._use_dgc = False
self.__use_npu = False
elif self._enforce_place == "NPU":
self.__use_cuda = False
self.__use_xpu = False
self._use_dgc = False
self.__use_npu = True
else: else:
if fluid.core.is_compiled_with_cuda(): if fluid.core.is_compiled_with_cuda():
self.__use_cuda = True self.__use_cuda = True
...@@ -1149,13 +1135,6 @@ class TestDistBase(unittest.TestCase): ...@@ -1149,13 +1135,6 @@ class TestDistBase(unittest.TestCase):
"PADDLE_TRAINERS_NUM": "1", "PADDLE_TRAINERS_NUM": "1",
"PADDLE_TRAINER_ID": "0", "PADDLE_TRAINER_ID": "0",
} }
elif self.__use_npu:
cmd += " --use_npu"
env_local = {
"FLAGS_selected_npus": devices,
"PADDLE_TRAINERS_NUM": "1",
"PADDLE_TRAINER_ID": "0",
}
else: else:
env_local = {'CPU_NUM': '1'} env_local = {'CPU_NUM': '1'}
...@@ -1447,18 +1426,6 @@ class TestDistBase(unittest.TestCase): ...@@ -1447,18 +1426,6 @@ class TestDistBase(unittest.TestCase):
"GLOG_v": "2", "GLOG_v": "2",
} }
) )
elif self.__use_npu:
tr_cmd += " --use_npu"
env.update(
{
"FLAGS_selected_npus": f"{trainer_id}",
"PADDLE_TRAINERS_NUM": f"{trainer_num}",
"PADDLE_TRAINER_ID": f"{trainer_id}",
"PADDLE_TRAINER_ENDPOINTS": self._ps_endpoints,
"PADDLE_CURRENT_ENDPOINT": ep,
"GLOG_v": "2",
}
)
else: else:
env.update({'CPU_NUM': '1'}) env.update({'CPU_NUM': '1'})
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import unittest
from paddle.distributed.fleet import ascend_utils
RANK_TABLE_JSON = {
"status": "completed",
"version": "1.0",
"server_count": "1",
"server_list": [
{
"server_id": "127.0.0.1",
"device": [
{"device_id": "0", "device_ip": "192.1.184.23", "rank_id": "0"},
{"device_id": "1", "device_ip": "192.2.21.93", "rank_id": "1"},
],
}
],
}
class TestAscendUtil(unittest.TestCase):
def test_get_cloud_cluster(self):
cluster, pod = ascend_utils.get_cloud_cluster()
self.assertTrue(cluster)
self.assertTrue(pod)
with open('rank_table_file.json', 'w') as f:
json.dump(RANK_TABLE_JSON, f)
rank_table_file = "./rank_table_file.json"
cluster, pod = ascend_utils.get_cloud_cluster(
rank_table_file=rank_table_file
)
self.assertTrue(cluster)
self.assertTrue(pod)
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import paddle
class TestNPUIdentityOp(unittest.TestCase):
def setUp(self):
self.op_type = "npu_identity"
self.shape = [64, 6, 28, 28]
self.x = np.random.random(self.shape).astype(np.float32)
self.format = 3 # ACL_FORMAT_NC1HWC0 = 3
self.place = paddle.CPUPlace()
def test_api_static(self):
paddle.enable_static()
main_program = paddle.static.default_main_program()
startup_program = paddle.static.default_startup_program()
with paddle.static.program_guard(main_program, startup_program):
x_data = paddle.static.data(
shape=self.shape, name="data", dtype='float32'
)
output = paddle.incubate._npu_identity(x=x_data, format=self.format)
exe = paddle.static.Executor()
exe.run(startup_program)
result = exe.run(
main_program, feed={x_data.name: self.x}, fetch_list=[output]
)
np.testing.assert_allclose(result[0], self.x, rtol=1e-08)
def test_api_dygraph(self):
paddle.disable_static(self.place)
x = paddle.to_tensor(self.x)
out = paddle.incubate._npu_identity(x, self.format)
np.testing.assert_allclose(out.numpy(), self.x, rtol=1e-08)
paddle.enable_static()
if __name__ == "__main__":
unittest.main()
...@@ -272,9 +272,6 @@ class TestVarBase(unittest.TestCase): ...@@ -272,9 +272,6 @@ class TestVarBase(unittest.TestCase):
check_with_place("gpu_pinned") check_with_place("gpu_pinned")
check_with_place(core.CUDAPlace(0)) check_with_place(core.CUDAPlace(0))
check_with_place("gpu:0") check_with_place("gpu:0")
if core.is_compiled_with_npu():
check_with_place(core.NPUPlace(0))
check_with_place("npu:0")
def test_to_tensor_not_change_input_stop_gradient(self): def test_to_tensor_not_change_input_stop_gradient(self):
with paddle.fluid.dygraph.guard(core.CPUPlace()): with paddle.fluid.dygraph.guard(core.CPUPlace()):
......
...@@ -156,33 +156,6 @@ def init_communicator( ...@@ -156,33 +156,6 @@ def init_communicator(
'ring_id': 0, 'ring_id': 0,
}, },
) )
elif core.is_compiled_with_npu():
hccl_id_var = block.create_var(
name=fluid.unique_name.generate('hccl_id'),
persistable=True,
type=core.VarDesc.VarType.RAW,
)
block.append_op(
type='c_gen_hccl_id',
inputs={},
outputs={'Out': hccl_id_var},
attrs={
'rank': rank,
'endpoint': current_endpoint,
'other_endpoints': other_endpoints,
},
)
block.append_op(
type='c_comm_init_hccl',
inputs={'X': hccl_id_var},
outputs={},
attrs={
'rank': rank,
'ring_id': 0,
'device_id': int(os.getenv("FLAGS_selected_npus")),
'rank_ids': nranks,
},
)
elif core.is_compiled_with_xpu(): elif core.is_compiled_with_xpu():
bkcl_id_var = block.create_var( bkcl_id_var = block.create_var(
name=fluid.unique_name.generate('bkcl_id'), name=fluid.unique_name.generate('bkcl_id'),
......
...@@ -16,7 +16,7 @@ from paddle import _C_ops, _legacy_C_ops, get_flags, in_dynamic_mode ...@@ -16,7 +16,7 @@ from paddle import _C_ops, _legacy_C_ops, get_flags, in_dynamic_mode
from paddle.device import ( from paddle.device import (
get_all_custom_device_type, get_all_custom_device_type,
is_compiled_with_cuda, is_compiled_with_cuda,
is_compiled_with_npu, is_compiled_with_custom_device,
is_compiled_with_rocm, is_compiled_with_rocm,
) )
from paddle.fluid.framework import _global_flags, in_dygraph_mode from paddle.fluid.framework import _global_flags, in_dygraph_mode
...@@ -466,7 +466,7 @@ def conv1d( ...@@ -466,7 +466,7 @@ def conv1d(
use_cudnn = False use_cudnn = False
# NPU only supports depthwise_conv2d when "input_channel = output_channel = groups" # NPU only supports depthwise_conv2d when "input_channel = output_channel = groups"
if is_compiled_with_npu(): if is_compiled_with_custom_device('npu'):
if num_channels == groups and num_channels == num_filters: if num_channels == groups and num_channels == num_filters:
l_type = 'depthwise_conv2d' l_type = 'depthwise_conv2d'
else: else:
...@@ -756,7 +756,7 @@ def conv2d( ...@@ -756,7 +756,7 @@ def conv2d(
use_mkldnn = _global_flags()["FLAGS_use_mkldnn"] use_mkldnn = _global_flags()["FLAGS_use_mkldnn"]
# NPU only supports depthwise_conv2d when "input_channel = output_channel = groups" # NPU only supports depthwise_conv2d when "input_channel = output_channel = groups"
if is_compiled_with_npu(): if is_compiled_with_custom_device('npu'):
if num_channels == groups and num_channels == num_filters: if num_channels == groups and num_channels == num_filters:
l_type = 'depthwise_conv2d' l_type = 'depthwise_conv2d'
else: else:
......
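The depthwise special case noted in the comments above, written out as a small helper for clarity; this is an illustrative sketch of the condition only, not the framework's dispatch code.

from paddle.device import is_compiled_with_custom_device


def choose_conv2d_op(num_channels, num_filters, groups):
    # On an NPU custom-device build, depthwise_conv2d is only usable when the
    # input channels, output channels and groups all coincide.
    if (
        is_compiled_with_custom_device('npu')
        and num_channels == groups
        and num_channels == num_filters
    ):
        return 'depthwise_conv2d'
    return 'conv2d'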
...@@ -60,7 +60,6 @@ from ..fluid.framework import program_guard # noqa: F401 ...@@ -60,7 +60,6 @@ from ..fluid.framework import program_guard # noqa: F401
from ..fluid.framework import cpu_places # noqa: F401 from ..fluid.framework import cpu_places # noqa: F401
from ..fluid.framework import cuda_places # noqa: F401 from ..fluid.framework import cuda_places # noqa: F401
from ..fluid.framework import xpu_places # noqa: F401 from ..fluid.framework import xpu_places # noqa: F401
from ..fluid.framework import npu_places # noqa: F401
from ..fluid.framework import Variable # noqa: F401 from ..fluid.framework import Variable # noqa: F401
from ..fluid.framework import Operator # noqa: F401 from ..fluid.framework import Operator # noqa: F401
from ..fluid.framework import Parameter # noqa: F401 from ..fluid.framework import Parameter # noqa: F401
...@@ -118,7 +117,6 @@ __all__ = [ # noqa ...@@ -118,7 +117,6 @@ __all__ = [ # noqa
'cpu_places', 'cpu_places',
'cuda_places', 'cuda_places',
'xpu_places', 'xpu_places',
'npu_places',
'Variable', 'Variable',
'create_global_var', 'create_global_var',
'accuracy', 'accuracy',
......
...@@ -54,7 +54,7 @@ def check_finite_and_unscale(x, scale, name=None, float_status=None): ...@@ -54,7 +54,7 @@ def check_finite_and_unscale(x, scale, name=None, float_status=None):
) )
inputs = {'X': x, 'Scale': scale} inputs = {'X': x, 'Scale': scale}
if core.is_compiled_with_npu(): if core.is_compiled_with_custom_device('npu'):
check_variable_and_dtype( check_variable_and_dtype(
float_status, float_status,
"float_status", "float_status",
......
...@@ -187,7 +187,7 @@ class OptimizerWithMixedPrecision: ...@@ -187,7 +187,7 @@ class OptimizerWithMixedPrecision:
self._train_program = train_program self._train_program = train_program
# NOTE(zhiqiu): _float_status is only used for NPU. # NOTE(zhiqiu): _float_status is only used for NPU.
if core.is_compiled_with_npu(): if core.is_compiled_with_custom_device('npu'):
float_status = paddle.static.data( float_status = paddle.static.data(
name="float_status", shape=[8], dtype='float32' name="float_status", shape=[8], dtype='float32'
) )
...@@ -408,7 +408,7 @@ class OptimizerWithMixedPrecision: ...@@ -408,7 +408,7 @@ class OptimizerWithMixedPrecision:
if self._is_distributed: if self._is_distributed:
# if distributed, split check_finite_and_unscale to overlap # if distributed, split check_finite_and_unscale to overlap
# unscale with communication # unscale with communication
if core.is_compiled_with_npu(): if core.is_compiled_with_custom_device('npu'):
with self._train_program._optimized_guard(grads): with self._train_program._optimized_guard(grads):
_, found_inf = check_finite_and_unscale( _, found_inf = check_finite_and_unscale(
grads, grads,
......
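A hedged sketch of the guard the AMP code above now relies on: the NPU-only float_status buffer is created only when the custom 'npu' device is compiled in. The wrapper name is illustrative; only the guarded paddle.static.data call mirrors the diff.

import paddle
from paddle.fluid import core

paddle.enable_static()


def maybe_create_float_status():
    # float_status is an NPU-specific 8-element buffer consumed by
    # check_finite_and_unscale; other backends simply get None.
    if core.is_compiled_with_custom_device('npu'):
        return paddle.static.data(
            name="float_status", shape=[8], dtype='float32'
        )
    return None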
...@@ -182,7 +182,7 @@ if core.is_compiled_with_xpu(): ...@@ -182,7 +182,7 @@ if core.is_compiled_with_xpu():
_, _, _sys_unsupported_fp16_list = core.op_supported_infos( _, _, _sys_unsupported_fp16_list = core.op_supported_infos(
'XPU', core.VarDesc.VarType.FP16 'XPU', core.VarDesc.VarType.FP16
) )
elif core.is_compiled_with_npu(): elif core.is_compiled_with_custom_device('npu'):
_, _, _sys_unsupported_fp16_list = core.op_supported_infos( _, _, _sys_unsupported_fp16_list = core.op_supported_infos(
'NPU', core.VarDesc.VarType.FP16 'NPU', core.VarDesc.VarType.FP16
) )
......
...@@ -1536,10 +1536,7 @@ def load(program, model_path, executor=None, var_list=None): ...@@ -1536,10 +1536,7 @@ def load(program, model_path, executor=None, var_list=None):
p = paddle.fluid.core.Place() p = paddle.fluid.core.Place()
p.set_place(t._place()) p.set_place(t._place())
place = paddle.fluid.XPUPlace(p.xpu_device_id()) place = paddle.fluid.XPUPlace(p.xpu_device_id())
elif p.is_npu_place():
p = paddle.fluid.core.Place()
p.set_place(t._place())
place = paddle.fluid.NPUPlace(p.npu_device_id())
else: else:
p = paddle.fluid.core.Place() p = paddle.fluid.core.Place()
p.set_place(t._place()) p.set_place(t._place())
...@@ -1676,10 +1673,6 @@ def set_program_state(program, state_dict): ...@@ -1676,10 +1673,6 @@ def set_program_state(program, state_dict):
p = paddle.fluid.core.Place() p = paddle.fluid.core.Place()
p.set_place(ten_place) p.set_place(ten_place)
py_place = paddle.fluid.XPUPlace(p.xpu_device_id()) py_place = paddle.fluid.XPUPlace(p.xpu_device_id())
elif ten_place.is_npu_place():
p = paddle.fluid.core.Place()
p.set_place(ten_place)
py_place = paddle.fluid.NPUPlace(p.npu_device_id())
ten.set(new_para_np, py_place) ten.set(new_para_np, py_place)
......
...@@ -946,7 +946,7 @@ def conv2d( ...@@ -946,7 +946,7 @@ def conv2d(
l_type = 'depthwise_conv2d' l_type = 'depthwise_conv2d'
# NPU only supports depthwise_conv2d when "input_channel = output_channel = groups" # NPU only supports depthwise_conv2d when "input_channel = output_channel = groups"
if core.is_compiled_with_npu(): if core.is_compiled_with_custom_device('npu'):
if num_channels == groups and num_channels == num_filters: if num_channels == groups and num_channels == num_filters:
l_type = 'depthwise_conv2d' l_type = 'depthwise_conv2d'
else: else:
......
...@@ -2246,8 +2246,6 @@ def _memcpy(input, place=None, output=None): ...@@ -2246,8 +2246,6 @@ def _memcpy(input, place=None, output=None):
dst_place_type = 2 dst_place_type = 2
elif p.is_xpu_place(): elif p.is_xpu_place():
dst_place_type = 3 dst_place_type = 3
elif p.is_npu_place():
dst_place_type = 4
attrs = {'dst_place_type': dst_place_type} attrs = {'dst_place_type': dst_place_type}
helper.append_op( helper.append_op(
......
...@@ -65,22 +65,6 @@ def _is_cuda_available(): ...@@ -65,22 +65,6 @@ def _is_cuda_available():
return False return False
def _is_npu_available():
"""
Check whether NPU is avaiable.
"""
try:
assert len(paddle.static.npu_places()) > 0
return True
except Exception as e:
logging.warning(
"You are using NPU version PaddlePaddle, but there is no NPU "
"detected on your machine. Maybe NPU devices is not set properly."
"\n Original Error is {}".format(e)
)
return False
def _is_xpu_available(): def _is_xpu_available():
""" """
Check whether XPU is avaiable. Check whether XPU is avaiable.
...@@ -97,22 +81,19 @@ def _is_xpu_available(): ...@@ -97,22 +81,19 @@ def _is_xpu_available():
return False return False
def _run_dygraph_single(use_cuda, use_xpu, use_npu): def _run_dygraph_single(use_cuda, use_xpu):
""" """
Testing the simple network in dygraph mode using one CPU/GPU/XPU/NPU. Testing the simple network in dygraph mode using one CPU/GPU/XPU.
Args: Args:
use_cuda (bool): Whether running with CUDA. use_cuda (bool): Whether running with CUDA.
use_xpu (bool): Whether running with XPU. use_xpu (bool): Whether running with XPU.
use_npu (bool): Whether running with NPU.
""" """
paddle.disable_static() paddle.disable_static()
if use_cuda: if use_cuda:
paddle.set_device('gpu') paddle.set_device('gpu')
elif use_xpu: elif use_xpu:
paddle.set_device('xpu') paddle.set_device('xpu')
elif use_npu:
paddle.set_device('npu')
else: else:
paddle.set_device('cpu') paddle.set_device('cpu')
weight_attr = paddle.ParamAttr( weight_attr = paddle.ParamAttr(
...@@ -135,14 +116,13 @@ def _run_dygraph_single(use_cuda, use_xpu, use_npu): ...@@ -135,14 +116,13 @@ def _run_dygraph_single(use_cuda, use_xpu, use_npu):
opt.step() opt.step()
def _run_static_single(use_cuda, use_xpu, use_npu): def _run_static_single(use_cuda, use_xpu):
""" """
Testing the simple network with executor running directly, using one CPU/GPU/XPU/NPU. Testing the simple network with executor running directly, using one CPU/GPU/XPU.
Args: Args:
use_cuda (bool): Whether running with CUDA. use_cuda (bool): Whether running with CUDA.
use_xpu (bool): Whether running with XPU. use_xpu (bool): Whether running with XPU.
use_npu (bool): Whether running with NPU.
""" """
paddle.enable_static() paddle.enable_static()
with paddle.static.scope_guard(paddle.static.Scope()): with paddle.static.scope_guard(paddle.static.Scope()):
...@@ -159,8 +139,6 @@ def _run_static_single(use_cuda, use_xpu, use_npu): ...@@ -159,8 +139,6 @@ def _run_static_single(use_cuda, use_xpu, use_npu):
place = paddle.CUDAPlace(0) place = paddle.CUDAPlace(0)
elif use_xpu: elif use_xpu:
place = paddle.XPUPlace(0) place = paddle.XPUPlace(0)
elif use_npu:
place = paddle.NPUPlace(0)
else: else:
place = paddle.CPUPlace() place = paddle.CPUPlace()
...@@ -223,7 +201,6 @@ def _run_parallel(device_list): ...@@ -223,7 +201,6 @@ def _run_parallel(device_list):
Args: Args:
use_cuda (bool): Whether running with CUDA. use_cuda (bool): Whether running with CUDA.
use_xpu (bool): Whether running with XPU. use_xpu (bool): Whether running with XPU.
use_npu (bool): Whether running with NPU.
device_list (int): The specified devices. device_list (int): The specified devices.
""" """
paddle.distributed.spawn(train_for_run_parallel, nprocs=len(device_list)) paddle.distributed.spawn(train_for_run_parallel, nprocs=len(device_list))
...@@ -252,14 +229,11 @@ def run_check(): ...@@ -252,14 +229,11 @@ def run_check():
use_cuda = False use_cuda = False
use_xpu = False use_xpu = False
use_npu = False
if paddle.is_compiled_with_cuda(): if paddle.is_compiled_with_cuda():
use_cuda = _is_cuda_available() use_cuda = _is_cuda_available()
elif paddle.is_compiled_with_xpu(): elif paddle.is_compiled_with_xpu():
use_xpu = _is_xpu_available() use_xpu = _is_xpu_available()
elif paddle.is_compiled_with_npu():
use_npu = _is_npu_available()
if use_cuda: if use_cuda:
device_str = "GPU" device_str = "GPU"
...@@ -267,16 +241,13 @@ def run_check(): ...@@ -267,16 +241,13 @@ def run_check():
elif use_xpu: elif use_xpu:
device_str = "XPU" device_str = "XPU"
device_list = paddle.static.xpu_places() device_list = paddle.static.xpu_places()
elif use_npu:
device_str = "NPU"
device_list = paddle.static.npu_places()
else: else:
device_str = "CPU" device_str = "CPU"
device_list = paddle.static.cpu_places(device_count=1) device_list = paddle.static.cpu_places(device_count=1)
device_count = len(device_list) device_count = len(device_list)
_run_static_single(use_cuda, use_xpu, use_npu) _run_static_single(use_cuda, use_xpu)
_run_dygraph_single(use_cuda, use_xpu, use_npu) _run_dygraph_single(use_cuda, use_xpu)
print(f"PaddlePaddle works well on 1 {device_str}.") print(f"PaddlePaddle works well on 1 {device_str}.")
try: try:
......
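With the NPU probing removed, run_check reports CPU, GPU or XPU depending on the build; a quick way to exercise the updated path:

import paddle

paddle.utils.run_check()
# Prints, for example on a single-GPU build:
#   PaddlePaddle works well on 1 GPU.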
...@@ -18,8 +18,6 @@ from distutils.sysconfig import get_python_lib ...@@ -18,8 +18,6 @@ from distutils.sysconfig import get_python_lib
from setuptools import Extension, setup from setuptools import Extension, setup
from setuptools.command.build_ext import build_ext from setuptools.command.build_ext import build_ext
from paddle.fluid import core
# refer: https://note.qidong.name/2018/03/setup-warning-strict-prototypes # refer: https://note.qidong.name/2018/03/setup-warning-strict-prototypes
# Avoid a gcc warning below: # Avoid a gcc warning below:
...@@ -40,8 +38,6 @@ paddle_extra_compile_args = [ ...@@ -40,8 +38,6 @@ paddle_extra_compile_args = [
'-Wno-parentheses', '-Wno-parentheses',
'-DPADDLE_WITH_CUSTOM_KERNEL', '-DPADDLE_WITH_CUSTOM_KERNEL',
] ]
if core.is_compiled_with_npu():
paddle_extra_compile_args += ['-D_GLIBCXX_USE_CXX11_ABI=0']
# include path # include path
site_packages_path = get_python_lib() site_packages_path = get_python_lib()
......
...@@ -18,8 +18,6 @@ import site ...@@ -18,8 +18,6 @@ import site
from setuptools import Extension, setup from setuptools import Extension, setup
from setuptools.command.build_ext import build_ext from setuptools.command.build_ext import build_ext
from paddle.fluid import core
# refer: https://note.qidong.name/2018/03/setup-warning-strict-prototypes # refer: https://note.qidong.name/2018/03/setup-warning-strict-prototypes
# Avoid a gcc warning below: # Avoid a gcc warning below:
...@@ -40,8 +38,6 @@ paddle_extra_compile_args = [ ...@@ -40,8 +38,6 @@ paddle_extra_compile_args = [
'-Wno-parentheses', '-Wno-parentheses',
'-DPADDLE_WITH_CUSTOM_KERNEL', '-DPADDLE_WITH_CUSTOM_KERNEL',
] ]
if core.is_compiled_with_npu():
paddle_extra_compile_args += ['-D_GLIBCXX_USE_CXX11_ABI=0']
# include path # include path
site_packages_path = site.getsitepackages() site_packages_path = site.getsitepackages()
......
...@@ -32,9 +32,6 @@ def download_file(): ...@@ -32,9 +32,6 @@ def download_file():
if paddle.is_compiled_with_rocm(): if paddle.is_compiled_with_rocm():
url = "https://sys-p0.bj.bcebos.com/prec/{}".format('disable_ut_rocm') url = "https://sys-p0.bj.bcebos.com/prec/{}".format('disable_ut_rocm')
if paddle.is_compiled_with_npu():
url = "https://sys-p0.bj.bcebos.com/prec/{}".format('disable_ut_npu')
f = requests.get(url) f = requests.get(url)
data = f.text data = f.text
status_code = f.status_code status_code = f.status_code
......