Unverified commit 7976e2a3, authored by Kim Yann, committed by GitHub

rem is_compiled_with_npu (#52385)

* rem is_compiled_with_npu

* rem npu related code

* make lint happy

* rem test

* remove some tests

* Update grad_scaler.py

* fix an error
Parent 2acc2b14
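Every change in this commit follows the same substitution: the dedicated NPU query and place type are dropped in favor of the custom-device plugin path. As a minimal sketch of that pattern (assuming a build where an 'npu' custom-device plugin is registered; the CPU fallback is illustrative only):

    import paddle
    from paddle.fluid import core

    # Removed in this commit:
    #   if core.is_compiled_with_npu():
    #       place = core.NPUPlace(0)
    # Replacement used throughout the diff:
    if core.is_compiled_with_custom_device('npu'):
        place = paddle.CustomPlace('npu', 0)
    else:
        place = paddle.CPUPlace()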
......@@ -8,9 +8,6 @@ exclude =
./python/paddle/fluid/tra**,
# Exclude third-party libraries
./python/paddle/utils/gast/**,
# Exclude files that will be removed in the future, see more at
# https://github.com/PaddlePaddle/Paddle/pull/46782#issuecomment-1273033731
./python/paddle/fluid/tests/unittests/npu/**,
ignore =
# Whitespace before ‘,’, ‘;’, or ‘:’, it is not compatible with black
E203,
......
......@@ -4,8 +4,7 @@ exclude: |
patches/.+|
paddle/fluid/framework/fleet/heter_ps/cudf/.+|
paddle/fluid/distributed/ps/thirdparty/round_robin.h|
python/paddle/utils/gast/.+|
python/paddle/fluid/tests/unittests/npu/.+
python/paddle/utils/gast/.+
)$
repos:
# Common hooks
......
......@@ -265,14 +265,6 @@ bool IsCompiledWithROCM() {
#endif
}
bool IsCompiledWithAscend() {
#ifndef PADDLE_WITH_ASCEND
return false;
#else
return true;
#endif
}
bool IsCompiledWithXPU() {
#ifndef PADDLE_WITH_XPU
return false;
......@@ -281,8 +273,6 @@ bool IsCompiledWithXPU() {
#endif
}
bool IsCompiledWithNPU() { return false; }
bool IsCompiledWithCustomDevice(std::string device_type) {
#ifndef PADDLE_WITH_CUSTOM_DEVICE
return false;
......@@ -1592,14 +1582,6 @@ All parameter, weight, gradient are variables in Paddle.
return context;
#endif
})
.def_static(
"create",
[](paddle::platform::NPUPlace &place)
-> paddle::platform::DeviceContext * {
PADDLE_THROW(platform::errors::PermissionDenied(
"Cannot use NPUPlace in CPU/GPU/XPU version, "
"Please recompile or reinstall Paddle with NPU support."));
})
.def_static("create",
[](paddle::platform::CustomPlace &place)
-> paddle::platform::DeviceContext * {
......@@ -1769,13 +1751,6 @@ All parameter, weight, gradient are variables in Paddle.
pybind11::gil_scoped_release release;
self.Run(scope, place);
})
.def("run",
[](OperatorBase &self,
const Scope &scope,
const platform::NPUPlace &place) {
pybind11::gil_scoped_release release;
self.Run(scope, place);
})
.def("run",
[](OperatorBase &self,
const Scope &scope,
......@@ -1985,9 +1960,7 @@ All parameter, weight, gradient are variables in Paddle.
});
m.def("is_compiled_with_avx", IsCompiledWithAVX);
m.def("is_compiled_with_cuda", IsCompiledWithCUDA);
m.def("is_compiled_with_ascend", IsCompiledWithAscend);
m.def("is_compiled_with_rocm", IsCompiledWithROCM);
m.def("is_compiled_with_npu", IsCompiledWithNPU);
m.def("is_compiled_with_custom_device", IsCompiledWithCustomDevice);
m.def("is_compiled_with_ipu", IsCompiledWithIPU);
m.def("is_compiled_with_xpu", IsCompiledWithXPU);
......
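With the `is_compiled_with_ascend` and `is_compiled_with_npu` bindings removed, build capabilities are reported only through the remaining queries. A hedged sketch of the Python-side checks that are still available:

    import paddle

    print(paddle.is_compiled_with_cuda())
    print(paddle.device.is_compiled_with_xpu())
    # NPU support is now reported via the custom-device query:
    print(paddle.device.is_compiled_with_custom_device('npu'))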
......@@ -14,7 +14,6 @@ extend_skip_glob = [
"python/paddle/fluid/[!t]**",
"python/paddle/fluid/tra**",
"python/paddle/utils/gast/**",
"python/paddle/fluid/tests/unittests/npu/**",
]
[tool.ruff]
......@@ -23,7 +22,6 @@ exclude = [
"./python/paddle/fluid/[!t]**",
"./python/paddle/fluid/tra**",
"./python/paddle/utils/gast/**",
"./python/paddle/fluid/tests/unittests/npu/**",
]
target-version = "py37"
select = [
......
......@@ -334,7 +334,6 @@ from .framework import ParamAttr # noqa: F401
from .framework import CPUPlace # noqa: F401
from .framework import IPUPlace # noqa: F401
from .framework import CUDAPlace # noqa: F401
from .framework import NPUPlace # noqa: F401
from .framework import CUDAPinnedPlace # noqa: F401
from .framework import CustomPlace # noqa: F401
......@@ -363,7 +362,6 @@ from .device import get_cudnn_version # noqa: F401
from .device import set_device # noqa: F401
from .device import get_device # noqa: F401
from .device import is_compiled_with_xpu # noqa: F401
from .device import is_compiled_with_npu # noqa: F401
from .device import is_compiled_with_ipu # noqa: F401
from .device import is_compiled_with_cinn # noqa: F401
from .device import is_compiled_with_cuda # noqa: F401
......@@ -512,7 +510,6 @@ __all__ = [ # noqa
'histogram',
'multiplex',
'CUDAPlace',
'NPUPlace',
'empty',
'shape',
'real',
......
......@@ -344,7 +344,6 @@ def amp_guard(
if enable and not (
tracer._expected_place.is_gpu_place()
or tracer._expected_place.is_xpu_place()
or tracer._expected_place.is_npu_place()
or tracer._expected_place.is_custom_place()
):
warnings.warn(
......@@ -352,10 +351,6 @@ def amp_guard(
% tracer._expected_place
)
enable = False
# For npu:
if tracer._expected_place.is_npu_place() and (dtype == 'bfloat16'):
warnings.warn('NPUPlace only support float16 amp.')
enable = False
# For xpu:
if tracer._expected_place.is_xpu_place() and (dtype == 'bfloat16'):
warnings.warn('XPUPlace only support float16 amp.')
......
......@@ -105,11 +105,10 @@ class AmpScaler:
if enable and not (
tracer._expected_place.is_gpu_place()
or tracer._expected_place.is_xpu_place()
or tracer._expected_place.is_npu_place()
or tracer._expected_place.is_custom_place()
):
warnings.warn(
'AmpScaler can only be enabled on CUDAPlace, XPUPlace, MLUPlace, NPUPlace and CustomPlace, current place is %s, so it makes no effect.'
'AmpScaler can only be enabled on CUDAPlace, XPUPlace, MLUPlace and CustomPlace, current place is %s, so it makes no effect.'
% tracer._expected_place
)
enable = False
......@@ -326,44 +325,6 @@ class AmpScaler:
if param.dtype == core.VarDesc.VarType.FP32
]
self._found_inf = self._temp_found_inf_value_false
if core.is_compiled_with_npu():
float_status = _legacy_C_ops.alloc_float_status()
_legacy_C_ops.clear_float_status(float_status, float_status)
if len(param_grads_fp16):
_legacy_C_ops.check_finite_and_unscale(
param_grads_fp16,
self._scale,
float_status,
param_grads_fp16,
self._temp_found_inf_fp16,
)
self._found_inf = _C_ops.bitwise_or(
self._found_inf, self._temp_found_inf_fp16
)
if len(param_grads_bf16):
_legacy_C_ops.check_finite_and_unscale(
param_grads_bf16,
self._scale,
float_status,
param_grads_bf16,
self._temp_found_inf_bf16,
)
self._found_inf = _C_ops.bitwise_or(
self._found_inf, self._temp_found_inf_bf16
)
if len(param_grads_fp32):
_legacy_C_ops.check_finite_and_unscale(
param_grads_fp32,
self._scale,
float_status,
param_grads_fp32,
self._temp_found_inf_fp32,
)
self._found_inf = _C_ops.bitwise_or(
self._found_inf, self._temp_found_inf_fp32
)
else:
if len(param_grads_fp16):
_legacy_C_ops.check_finite_and_unscale(
param_grads_fp16,
......
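The NPU-specific float_status handling is gone, but the scaler's public flow is unchanged for the remaining backends. A minimal dynamic-graph sketch of the usage this class backs (standard GradScaler calls; it assumes a GPU, XPU or custom-device build, since the scaler is a no-op elsewhere):

    import paddle

    model = paddle.nn.Linear(4, 4)
    opt = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters())
    scaler = paddle.amp.GradScaler(init_loss_scaling=1024)

    x = paddle.randn([2, 4])
    with paddle.amp.auto_cast():
        loss = model(x).mean()
    scaled = scaler.scale(loss)   # scale the loss before backward
    scaled.backward()
    scaler.step(opt)              # unscale gradients, skip the step on inf/nan
    scaler.update()               # adjust the loss scaling factor
    opt.clear_grad()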
......@@ -36,7 +36,6 @@ __all__ = [ # noqa
'is_compiled_with_cinn',
'is_compiled_with_cuda',
'is_compiled_with_rocm',
'is_compiled_with_npu',
'is_compiled_with_custom_device',
'get_all_device_type',
'get_all_custom_device_type',
......@@ -53,24 +52,6 @@ __all__ = [ # noqa
_cudnn_version = None
# TODO: WITH_ASCEND_CL may changed to WITH_NPU or others in the future
# for consistent.
def is_compiled_with_npu():
"""
Whether paddle was built with WITH_ASCEND_CL=ON to support Ascend NPU.
Return:
bool, ``True`` if NPU is supported, otherwise ``False``.
Examples:
.. code-block:: python
import paddle
support_npu = paddle.device.is_compiled_with_npu()
"""
return core.is_compiled_with_npu()
def is_compiled_with_custom_device(device_type):
"""
Whether paddle was built with Paddle_CUSTOM_DEVICE .
......@@ -210,15 +191,6 @@ def _convert_to_place(device):
selected_xpus = os.getenv("FLAGS_selected_xpus", "0").split(",")
device_id = int(selected_xpus[0])
place = core.XPUPlace(device_id)
elif lower_device == 'npu':
if not core.is_compiled_with_npu():
raise ValueError(
"The device should not be 'npu', "
"since PaddlePaddle is not compiled with NPU"
)
selected_npus = os.getenv("FLAGS_selected_npus", "0").split(",")
device_id = int(selected_npus[0])
place = core.NPUPlace(device_id)
elif lower_device == 'ipu':
if not core.is_compiled_with_ipu():
raise ValueError(
......@@ -229,7 +201,6 @@ def _convert_to_place(device):
else:
avaliable_gpu_device = re.match(r'gpu:\d+', lower_device)
avaliable_xpu_device = re.match(r'xpu:\d+', lower_device)
avaliable_npu_device = re.match(r'npu:\d+', lower_device)
if avaliable_gpu_device:
if not core.is_compiled_with_cuda():
raise ValueError(
......@@ -250,31 +221,7 @@ def _convert_to_place(device):
device_id = device_info_list[1]
device_id = int(device_id)
place = core.XPUPlace(device_id)
if avaliable_npu_device:
if not core.is_compiled_with_npu():
device_info_list = device.split(':', 1)
device_type = device_info_list[0]
if device_type in core.get_all_custom_device_type():
device_id = device_info_list[1]
device_id = int(device_id)
place = core.CustomPlace(device_type, device_id)
return place
else:
raise ValueError(
"The device should not be {}, since PaddlePaddle is "
"not compiled with NPU or compiled with custom device".format(
avaliable_npu_device
)
)
device_info_list = device.split(':', 1)
device_id = device_info_list[1]
device_id = int(device_id)
place = core.NPUPlace(device_id)
if (
not avaliable_gpu_device
and not avaliable_xpu_device
and not avaliable_npu_device
):
if not avaliable_gpu_device and not avaliable_xpu_device:
device_info_list = device.split(':', 1)
device_type = device_info_list[0]
if device_type in core.get_all_custom_device_type():
......@@ -346,9 +293,6 @@ def get_device():
elif isinstance(place, core.XPUPlace):
device_id = place.get_device_id()
device = 'xpu:' + str(device_id)
elif isinstance(place, core.NPUPlace):
device_id = place.get_device_id()
device = 'npu:' + str(device_id)
elif isinstance(place, core.IPUPlace):
num_devices = core.get_ipu_device_count()
device = f"ipus:{{0-{num_devices - 1}}}"
......@@ -469,7 +413,7 @@ class Event:
Parameters:
device(str|paddle.CUDAPlace(n)|paddle.CustomPlace(n)): Which device the stream runs on. If device is None, the device is the current device. Default: None.
It can be ``gpu``, ``gpu:x``, ``custom_device``, ``custom_device:x``, where ``custom_device`` is the name of CustomDevice,
where ``x`` is the index of the GPUs, XPUs or NPUs. And it can be paddle.CUDAPlace(n) or paddle.CustomPlace(n).
where ``x`` is the index of the GPUs or XPUs. And it can be paddle.CUDAPlace(n) or paddle.CustomPlace(n).
enable_timing (bool, optional): indicates if the event should measure time, default is False
blocking (bool, optional): if True, ``wait`` will be blocking, default is False
interprocess (bool): if True, the event can be shared between processes, default is False
......@@ -614,7 +558,7 @@ class Stream:
Parameters:
device(str|paddle.CUDAPlace(n)|paddle.CustomPlace(n)): Which device the stream runs on. If device is None, the device is the current device. Default: None.
It can be ``gpu``, ``gpu:x``, ``custom_device``, ``custom_device:x``, where ``custom_device`` is the name of CustomDevice,
where ``x`` is the index of the GPUs, XPUs or NPUs. And it can be paddle.CUDAPlace(n) or paddle.CustomPlace(n).
where ``x`` is the index of the GPUs or XPUs. And it can be paddle.CUDAPlace(n) or paddle.CustomPlace(n).
priority(int, optional): priority of the CUDA stream. Can be either
1 (high priority) or 2 (low priority). By default, streams have
priority 2.
......@@ -936,7 +880,7 @@ def synchronize(device=None):
Parameters:
device(str|paddle.CUDAPlace(n)|paddle.XPUPlace(n)|paddle.CustomPlace(n)): The device to wait for. If device is None, the device is the current device. Default: None.
It can be ``gpu``, ``gpu:x``, ``xpu``, ``xpu:x``, ``custom_device``, ``custom_device:x``, where ``custom_device`` is the name of CustomDevice,
where ``x`` is the index of the GPUs, XPUs or NPUs. And it can be paddle.CUDAPlace(n) or paddle.XPUPlace(n) or paddle.CustomPlace(n).
where ``x`` is the index of the GPUs or XPUs. And it can be paddle.CUDAPlace(n) or paddle.XPUPlace(n) or paddle.CustomPlace(n).
Examples:
.. code-block:: python
# required: custom_device
......
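'npu' and 'npu:x' are no longer special-cased device strings; NPUs are addressed through whatever custom-device types the runtime reports. A short, hedged sketch of the selection API that remains (the custom-device list depends on which plugins are installed):

    import paddle

    paddle.set_device('cpu')
    print(paddle.get_device())                          # 'cpu'
    # e.g. ['npu'] when an NPU plugin is installed, [] otherwise:
    print(paddle.device.get_all_custom_device_type())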
......@@ -288,11 +288,6 @@ def new_group(ranks=None, backend=None, timeout=_default_timeout):
core.NCCLParallelContext(strategy, place).init_with_ring_id(
ring_id
)
elif core.is_compiled_with_npu():
place = core.NPUPlace(genv.device_id)
core.HCCLParallelContext(strategy, place).init_with_ring_id(
ring_id
)
elif core.is_compiled_with_xpu():
place = core.XPUPlace(genv.device_id)
core.BKCLParallelContext(strategy, place).init_with_ring_id(
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
from paddle.distributed.fleet.launch_utils import (
DeviceMode,
get_cluster,
get_host_name_ip,
)
__all__ = []
def _get_ascend_rankfile(rank_table_file_path):
"""
Args:
rank_table_file_path: ascend npu rank file json
{
"status": "completed",
"version": "1.0",
"server_count": "2",
"server_list": [
{
"server_id": "192.168.24.217",
"device": [
{
"device_id": "0",
"device_ip": "192.1.184.23",
"rank_id": "0"
},
{
"device_id": "1",
"device_ip": "192.2.21.93",
"rank_id": "1"
}
]
},
{
"server_id": "192.168.26.177",
"device": [
{
"device_id": "0",
"device_ip": "192.1.94.132",
"rank_id": "2"
},
{
"device_id": "1",
"device_ip": "192.2.94.30",
"rank_id": "3"
}
]
}
]
}
Returns:
node_ips: node ip list
device_count: number of npu per machine
"""
json_data = None
with open(rank_table_file_path) as json_file:
json_data = json.load(json_file)
node_ips = []
device_count = 0
server_list = json_data['server_list']
for server in server_list:
device_list = server['device']
device_count = len(device_list)
if os.getenv("FLAGS_MODELARTS", None):
nodes = os.getenv("DLS_TASK_NUMBER", None)
assert nodes is not None, "DLS_TASK_NUMBER didn't set!"
for node in range(int(nodes)):
node_ip = os.getenv(f"VC_CUSTOM{node}_HOSTS", None)
assert node_ip is not None, f"VC_CUSTOM{node}_HOSTS didn't set!"
node_ips.append(node_ip)
return node_ips, device_count
node_ips.append(server['server_id'])
return node_ips, device_count
def get_cloud_cluster(
rank_table_file=None, device_mode=DeviceMode.ASCEND_NPU, start_port=6070
):
"""
Args:
rank_table_file: string, ascend npu rank file path
device_mode: DeviceMode(Int)
start_port: the start port of current runtime env
"""
if rank_table_file:
# multi trainers
node_ips, device_count = _get_ascend_rankfile(rank_table_file)
if len(node_ips) == 1:
node_ip = node_ips[0]
else:
node_index = os.environ.get("PADDLE_TRAINER_ID")
node_ip = None
if node_index:
node_ip = node_ips[int(node_index)]
else:
_, node_ip = get_host_name_ip()
assert (
node_ip in node_ips
), "Can't find your local ip {{{}}} in node_ips: {{{}}}".format(
node_ip,
node_ips,
)
else:
# single trainer (single ascend card)
node_ips = ["127.0.0.1"]
node_ip = node_ips[0]
device_count = 1
devices_per_proc = [str(x) for x in range(device_count)]
free_ports = list(range(start_port, start_port + len(devices_per_proc)))
trainer_endpoints = []
for ip in node_ips:
trainer_endpoints.append(["%s:%d" % (ip, port) for port in free_ports])
return get_cluster(
node_ips, node_ip, trainer_endpoints, device_mode, devices_per_proc
)
......@@ -64,7 +64,7 @@ import time
from argparse import REMAINDER, ArgumentParser
from paddle import framework
from paddle.distributed.fleet import ascend_utils, cloud_utils, launch_utils
from paddle.distributed.fleet import cloud_utils, launch_utils
from paddle.distributed.fleet.elastic import enable_elastic, launch_elastic
from paddle.distributed.fleet.launch_utils import (
DeviceMode,
......@@ -155,16 +155,6 @@ see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/tra
)
base_group.add_argument("--selected_xpus", dest="xpus")
if framework.core.is_compiled_with_npu():
base_group.add_argument(
"--npus",
type=str,
default=None,
help="It's for xpu training. For example: "
"--npus=\"0,1,2,3\" will launch four training processes each bound to one npu.",
)
base_group.add_argument("--selected_npus", dest="npus")
base_group.add_argument(
"training_script",
type=str,
......@@ -407,13 +397,6 @@ def get_cluster_info(args):
args.ips, device_mode, devices_per_proc, start_port
)
logger.debug(f"get cluster from cloud:{cluster}")
elif device_mode == DeviceMode.ASCEND_NPU:
# for ascend
cluster, pod = ascend_utils.get_cloud_cluster(
rank_table_file=os.getenv("RANK_TABLE_FILE", None),
device_mode=device_mode,
start_port=start_port,
)
else:
# trainers_num = 1 or not use paddlecloud ips="a,b"
cluster, pod = get_cluster_from_args(
......@@ -493,8 +476,6 @@ def infer_backend(args):
return
if framework.core.is_compiled_with_cuda():
args.backend = 'nccl'
elif framework.core.is_compiled_with_npu():
args.backend = 'unknown'
elif framework.core.is_compiled_with_xpu():
args.backend = 'bkcl'
else:
......@@ -545,8 +526,6 @@ def which_distributed_mode(args):
if framework.core.is_compiled_with_cuda():
accelerators = framework.core.get_cuda_device_count()
elif framework.core.is_compiled_with_npu():
accelerators = framework.core.get_npu_device_count()
elif framework.core.is_compiled_with_xpu():
accelerators = framework.core.get_xpu_device_count()
else:
......@@ -578,7 +557,7 @@ def which_distributed_mode(args):
):
if args.servers:
logger.warning(
"Not found distinct arguments and not compiled with cuda or xpu or npu. "
"Not found distinct arguments and not compiled with cuda or xpu. "
"But found args.servers not empty, default use ps mode"
)
return DistributeMode.PS
......@@ -586,7 +565,7 @@ def which_distributed_mode(args):
return DistributeMode.COLLECTIVE
else:
logger.warning(
"Not found distinct arguments and compiled with cuda or xpu or npu. "
"Not found distinct arguments and compiled with cuda or xpu. "
"Default use collective mode"
)
return DistributeMode.COLLECTIVE
......
......@@ -55,7 +55,6 @@ class DeviceMode:
GPU = 1
KUNLUN = 2
XPU = 2
ASCEND_NPU = 3
UNKNOWN = 3
......@@ -299,10 +298,7 @@ def get_cluster(
), "current trainer_endpoints size should be greater equal than acclerators size."
for i in range(len(devices_per_proc)):
trainer = Trainer()
if (
device_mode == DeviceMode.GPU
or device_mode == DeviceMode.ASCEND_NPU
):
if device_mode == DeviceMode.GPU:
if isinstance(devices_per_proc[i], (list, tuple)):
trainer.accelerators.extend(devices_per_proc[i])
pod.accelerators.extend(devices_per_proc[i])
......@@ -546,13 +542,6 @@ def start_local_trainers(
[str(g) for g in t.accelerators]
)
elif (
len(t.accelerators) > 0 and pod.device_mode == DeviceMode.ASCEND_NPU
):
proc_env["FLAGS_selected_npus"] = "%s" % ",".join(
[str(g) for g in t.accelerators]
)
if len(t.accelerators) > 0:
proc_env["FLAGS_selected_accelerators"] = "%s" % ",".join(
[str(g) for g in t.accelerators]
......@@ -760,40 +749,6 @@ def get_xpus(xpus):
return res_xpus
def get_npus(npus):
if npus is None:
npus_num = framework.core.get_npu_device_count()
res_npus = [str(x) for x in range(0, npus_num)]
else:
npu_visible_devices = os.getenv("ASCEND_VISIBLE_DEVICES")
if npu_visible_devices is None or npu_visible_devices == "":
res_npus = [x.strip() for x in npus.split(',')]
else:
# change npus into relative values
# e.g. ASCEND_VISIBLE_DEVICES=4,5,6,7; args.npus=4,5,6,7;
# therefore npus=0,1,2,3
npu_visible_devices_list = npu_visible_devices.split(',')
for x in npus.split(','):
assert x in npu_visible_devices_list, (
"Can't find "
"your npus %s in ASCEND_VISIBLE_DEVICES[%s]."
% (x, npu_visible_devices)
)
res_npus = [
npu_visible_devices_list.index(x.strip())
for x in npus.split(',')
]
logger.info(
"Change selected_npus into reletive values. --ips:{} "
"will change into relative_ips:{} according to your "
"ASCEND_VISIBLE_DEVICES:{}".format(
npus, res_npus, npu_visible_devices_list
)
)
return res_npus
def get_device_mode(backend):
if backend == 'heter':
if (
......@@ -808,16 +763,6 @@ def get_device_mode(backend):
):
print("launch train in heter mode with XPU device.")
return DeviceMode.XPU
if (
framework.core.is_compiled_with_npu()
and framework.core.get_npu_device_count() > 0
):
print("launch train in heter mode with NPU device.")
return DeviceMode.ASCEND_NPU
if backend == 'hccl' and framework.core.get_npu_device_count() > 0:
print("launch train in ascend npu mode!")
return DeviceMode.ASCEND_NPU
if backend == 'nccl' and framework.core.get_cuda_device_count() > 0:
print("launch train in GPU mode!")
......@@ -853,19 +798,6 @@ def get_device_proc_info(args):
devices_per_proc = [gpus[i : i + n] for i in range(0, len(gpus), n)]
else:
devices_per_proc = gpus
elif device_mode == DeviceMode.ASCEND_NPU:
npus = get_npus(args.npus)
if args.nproc_per_node is not None:
assert (
len(npus) % int(args.nproc_per_node)
) == 0, "npus' number:{} mod args.nproc_per_node:{} must == 0".format(
len(npus), args.nproc_per_node
)
n = int(len(npus) / int(args.nproc_per_node))
devices_per_proc = [npus[i : i + n] for i in range(0, len(npus), n)]
else:
devices_per_proc = npus
elif device_mode == DeviceMode.XPU:
xpus = get_xpus(args.xpus)
if args.nproc_per_node is not None:
......@@ -2079,12 +2011,6 @@ def check_backend(backend):
"your paddle is not compiled with xpu but you assign 'bkcl' as backend."
)
if backend == 'hccl' and not framework.core.is_compiled_with_npu():
raise ValueError(
"paddle.distributed initialize error, "
"your paddle is not compiled with npu but you assign 'hccl' as backend."
)
def block_windows_and_macos(backend):
if backend != 'gloo':
......@@ -2106,7 +2032,4 @@ def get_backend_by_compile_flag():
if framework.core.is_compiled_with_xpu():
return 'bkcl'
if framework.core.is_compiled_with_npu():
return 'hccl'
return 'gloo'
......@@ -536,7 +536,9 @@ def _parallel_linear(
# NOTE: npu linear function use matmul_v2 but linear use matmul
linear_function = (
_linear if core.is_compiled_with_npu() else paddle.nn.functional.linear
_linear
if core.is_compiled_with_custom_device('npu')
else paddle.nn.functional.linear
)
linear_out = linear_function(
x,
......
......@@ -196,7 +196,7 @@ class CollectiveHelper:
OP_ROLE_KEY: OpRole.Forward,
},
)
elif core.is_compiled_with_npu():
elif core.is_compiled_with_custom_device('npu'):
block.append_op(
type='c_gen_hccl_id',
inputs={},
......
......@@ -26,23 +26,17 @@ class PlaceType:
CUDA = 1
CUDA_PINNED = 2
XPU = 3 # unsupport for now
NPU = 4
NPU_PINNED = 5
@staticmethod
def default_device():
if core.is_compiled_with_cuda():
return PlaceType.CUDA
elif core.is_compiled_with_npu():
return PlaceType.NPU
return PlaceType.CPU
@staticmethod
def default_pinned():
if core.is_compiled_with_cuda():
return PlaceType.CUDA_PINNED
elif core.is_compiled_with_npu():
return PlaceType.NPU_PINNED
return PlaceType.CPU
......
......@@ -596,7 +596,7 @@ class ShardingOptimizer(MetaOptimizerBase):
rings = [self.mp_ring_id, self.pp_ring_id]
# FIXME(wangxi): some problem with NPU found_finite, need sync with DP
if core.is_compiled_with_npu():
if core.is_compiled_with_custom_device('npu'):
rings += [self.dp_ring_id]
FP16Utils.sync_amp_check_nan_inf(main_block, rings)
......@@ -721,7 +721,7 @@ class ShardingOptimizer(MetaOptimizerBase):
self._dump_program_for_debug()
# GPU need to wait server ready, GPU and NPU is Layered connection
if not core.is_compiled_with_npu():
if not core.is_compiled_with_custom_device('npu'):
self._wait()
return optimize_ops, params_grads
......@@ -839,7 +839,7 @@ class ShardingOptimizer(MetaOptimizerBase):
sync=False,
)
if core.is_compiled_with_npu():
if core.is_compiled_with_custom_device('npu'):
self._init_npu_pipeline_comm(startup_block)
return
......
......@@ -78,7 +78,7 @@ class InternalStorage:
if self._device != device:
tmp_buffer = (
cvt_to_device(self.buffer, self.dev_id)
if device in ["gpu", "xpu", "npu"]
if device in ["gpu", "xpu"]
else self.buffer.cpu()
)
for param in self._params:
......
......@@ -200,8 +200,9 @@ def device_guard(dev_id=0, device="cpu"):
origin_device = paddle.device.get_device()
if device == "cpu":
paddle.set_device(device)
elif device in ["gpu", "xpu", "npu"]:
elif device in ["gpu", "xpu"]:
paddle.set_device(f"{device}:{dev_id}")
try:
yield
finally:
......@@ -313,8 +314,6 @@ def cvt_to_device(x, dev_id, blocking=True):
"""
if paddle.is_compiled_with_cuda():
place = paddle.CUDAPlace(dev_id)
elif paddle.is_compiled_with_npu():
place = paddle.NPUPlace(dev_id)
elif paddle.is_compiled_with_xpu():
place = paddle.XPUPlace(dev_id)
else:
......
......@@ -201,11 +201,9 @@ class HybridParallelInferenceHelper:
assert isinstance(main_program, Program)
self._device = None
if core.is_compiled_with_npu():
self._device = "npu"
elif core.is_compiled_with_cuda():
if core.is_compiled_with_cuda():
self._device = "gpu"
assert self._device, "Only gpu and npu are supported."
assert self._device, "Only gpu are supported."
assert not in_dygraph_mode(), "Only static graph mode is supported."
......
......@@ -24,7 +24,6 @@ class DeviceType:
CPU = 'cpu'
GPU = 'gpu'
XPU = 'xpu'
NPU = 'npu'
IPU = 'ipu'
CUSTOM_DEVICE = 'custom_device'
......@@ -68,8 +67,6 @@ class Device:
return 'FLAGS_selected_cpus'
if self._dtype == DeviceType.GPU:
return 'FLAGS_selected_gpus'
if self._dtype == DeviceType.NPU:
return 'FLAGS_selected_npus'
if self._dtype == DeviceType.XPU:
return 'FLAGS_selected_xpus'
if self._dtype == DeviceType.IPU:
......@@ -111,9 +108,6 @@ class Device:
elif 'XPU_VISIBLE_DEVICES' in os.environ:
dev._dtype = DeviceType.XPU
visible_devices = os.getenv("XPU_VISIBLE_DEVICES")
elif 'ASCEND_VISIBLE_DEVICES' in os.environ:
dev._dtype = DeviceType.NPU
visible_devices = os.getenv("ASCEND_VISIBLE_DEVICES")
if visible_devices is not None and visible_devices != 'all':
dev._labels = visible_devices.split(',')
......@@ -152,10 +146,6 @@ class Device:
dev._dtype = DeviceType.XPU
num = core.get_xpu_device_count()
visible_devices = os.getenv("XPU_VISIBLE_DEVICES")
elif core.is_compiled_with_npu():
dev._dtype = DeviceType.NPU
num = core.get_npu_device_count()
visible_devices = os.getenv("ASCEND_VISIBLE_DEVICES")
elif core.is_compiled_with_ipu():
dev._dtype = DeviceType.IPU
num = core.get_ipu_device_count()
......
......@@ -721,9 +721,6 @@ class ParallelEnv:
elif core.is_compiled_with_xpu():
selected_xpus = os.getenv("FLAGS_selected_xpus", "0").split(",")
self._device_id = int(selected_xpus[0])
elif core.is_compiled_with_npu():
selected_npus = os.getenv("FLAGS_selected_npus", "0").split(",")
self._device_id = int(selected_npus[0])
self._trainer_endpoints = os.getenv(
"PADDLE_TRAINER_ENDPOINTS", ""
......@@ -889,12 +886,8 @@ def _start_kv_server(port, http_server_d, size):
def _is_cpuonly(backend):
check_backend(backend)
if (
backend in ['auto', 'nccl', 'bkcl', 'hccl', 'heter', 'cncl']
and (
core.is_compiled_with_cuda()
or core.is_compiled_with_xpu()
or core.is_compiled_with_npu()
)
backend in ['auto', 'nccl', 'bkcl', 'heter', 'cncl']
and (core.is_compiled_with_cuda() or core.is_compiled_with_xpu())
) or backend == 'xccl':
# passes 'auto' and can use cuda or xpu, use the default logics. so return False
......@@ -994,7 +987,6 @@ def init_parallel_env():
is_cpu_only
or core.is_compiled_with_cuda()
or core.is_compiled_with_xpu()
or core.is_compiled_with_npu()
or backend == "xccl"
):
raise NotImplementedError(
......@@ -1013,9 +1005,6 @@ def init_parallel_env():
elif not is_cpu_only and core.is_compiled_with_xpu():
_check_var_exists('FLAGS_selected_xpus')
backend = "bkcl" if backend == "auto" else backend
elif not is_cpu_only and core.is_compiled_with_npu():
_check_var_exists('FLAGS_selected_npus')
backend = "hccl" if backend == "auto" else backend
_check_var_exists("PADDLE_TRAINER_ID")
_check_var_exists("PADDLE_CURRENT_ENDPOINT")
......@@ -1038,9 +1027,6 @@ def init_parallel_env():
place = core.CUDAPlace(parallel_env.device_id)
elif core.is_compiled_with_xpu():
place = core.XPUPlace(parallel_env.device_id)
elif core.is_compiled_with_npu():
place = core.NPUPlace(parallel_env.device_id)
_set_expected_place(place)
group = None
......@@ -1136,7 +1122,7 @@ def init_parallel_env():
strategy.current_endpoint = parallel_env.current_endpoint
strategy.nrings = parallel_env.nrings
# init nccl or hccl or bkcl or heter context
# init nccl or bkcl or heter context
if is_cpu_only:
parallel_helper._set_parallel_ctx(
core.GLOOParallelContext(strategy, place)
......@@ -1153,10 +1139,7 @@ def init_parallel_env():
parallel_helper._set_parallel_ctx(
core.BKCLParallelContext(strategy, place)
)
elif core.is_compiled_with_npu():
parallel_helper._set_parallel_ctx(
core.HCCLParallelContext(strategy, place)
)
if backend != "heter":
other_endpoints = strategy.trainer_endpoints[:]
other_endpoints.remove(strategy.current_endpoint)
......
......@@ -133,37 +133,8 @@ class Collective:
wait_server_ready(other_endpoints)
block = program.global_block()
if core.is_compiled_with_npu():
hccl_id_var = block.create_var(
name=unique_name.generate('hccl_id'),
persistable=True,
type=core.VarDesc.VarType.RAW,
)
endpoint_to_index_map = {e: idx for idx, e in enumerate(endpoints)}
block.append_op(
type='c_gen_hccl_id',
inputs={},
outputs={'Out': hccl_id_var},
attrs={
'rank': rank,
'endpoint': current_endpoint,
'other_endpoints': other_endpoints,
self.op_role_key: OpRole.Forward,
},
)
block.append_op(
type='c_comm_init_hccl',
inputs={'X': hccl_id_var},
outputs={},
attrs={
'rank': rank,
'ring_id': ring_id,
'device_id': int(os.getenv("FLAGS_selected_npus")),
'rank_ids': nranks,
self.op_role_key: OpRole.Forward,
},
)
elif core.is_compiled_with_xpu():
if core.is_compiled_with_xpu():
bkcl_id_var = block.create_var(
name=unique_name.generate('bkcl_id'),
persistable=True,
......
......@@ -131,37 +131,7 @@ class Collective:
wait_server_ready(other_endpoints)
block = program.global_block()
if core.is_compiled_with_npu():
hccl_id_var = block.create_var(
name=unique_name.generate('hccl_id'),
persistable=True,
type=core.VarDesc.VarType.RAW,
)
endpoint_to_index_map = {e: idx for idx, e in enumerate(endpoints)}
block.append_op(
type='c_gen_hccl_id',
inputs={},
outputs={'Out': hccl_id_var},
attrs={
'rank': rank,
'endpoint': current_endpoint,
'other_endpoints': other_endpoints,
self.op_role_key: OpRole.Forward,
},
)
block.append_op(
type='c_comm_init_hccl',
inputs={'X': hccl_id_var},
outputs={},
attrs={
'rank': rank,
'ring_id': ring_id,
'device_id': int(os.getenv("FLAGS_selected_npus")),
'rank_ids': nranks,
self.op_role_key: OpRole.Forward,
},
)
elif core.is_compiled_with_cuda():
if core.is_compiled_with_cuda():
nccl_id_var = block.create_var(
name=unique_name.generate('nccl_id'),
persistable=True,
......
......@@ -71,7 +71,6 @@ from .core import (
XPUPlace,
CUDAPlace,
CUDAPinnedPlace,
NPUPlace,
IPUPlace,
MLUPlace,
CustomPlace,
......@@ -127,7 +126,6 @@ __all__ = (
'XPUPlace',
'CUDAPlace',
'CUDAPinnedPlace',
'NPUPlace',
'IPUPlace',
'MLUPlace',
'Tensor',
......@@ -220,10 +218,6 @@ monkey_patch_variable()
__bootstrap__()
monkey_patch_varbase()
# NOTE(zhiqiu): register npu_finalize on the exit of Python,
# do some clean up manually.
if core.is_compiled_with_npu():
atexit.register(core.npu_finalize)
# NOTE(Aurelius84): clean up ExecutorCacheInfo in advance manually.
atexit.register(core.clear_executor_cache)
......
......@@ -654,8 +654,6 @@ class Section(DeviceWorker):
place_id = pipeline_opt["place_id"]
if core.is_compiled_with_cuda():
assert isinstance(place, core.CUDAPlace)
elif core.is_compiled_with_npu():
assert isinstance(place, core.NPUPlace)
cfg.place = cfg.CUDAPlace
cfg.place_id = place_id
......
......@@ -306,7 +306,7 @@ def monkey_patch_varbase():
if _grad_scalar:
# When using amp with Fleet DistributedStrategy, we do loss scaling implicitly.
self = _grad_scalar.scale(self)
if paddle.is_compiled_with_xpu() or paddle.is_compiled_with_npu():
if paddle.is_compiled_with_xpu():
# TODO(liuyuhui): Currently only for xpu. Will be removed in the future.
scaled_loss = scale_loss(self)
if framework.global_var._in_eager_mode_:
......
......@@ -2107,7 +2107,7 @@ class Executor:
for var in program.global_block().vars.values():
if var.is_data:
data_vars.append(var)
if core.is_compiled_with_npu():
if core.is_compiled_with_custom_device('npu'):
dataset = paddle.fluid.DatasetFactory().create_dataset(
'InMemoryDataset'
)
......@@ -2284,7 +2284,7 @@ class Executor:
for var in program.global_block().vars.values():
if var.is_data:
data_vars.append(var)
if core.is_compiled_with_npu():
if core.is_compiled_with_custom_device('npu'):
dataset = paddle.fluid.DatasetFactory().create_dataset(
'InMemoryDataset'
)
......
......@@ -58,7 +58,6 @@ __all__ = [
'is_compiled_with_cuda',
'is_compiled_with_rocm',
'is_compiled_with_xpu',
'is_compiled_with_npu',
'Variable',
'require_version',
'device_guard',
......@@ -224,7 +223,7 @@ def _in_eager_without_dygraph_check():
return global_var._in_eager_mode_
# FIXME(dev): We haven't fully verified eager mode on XPU/NPU et.al but
# FIXME(dev): We haven't fully verified eager mode on XPU et.al but
# only GPU/CPU. Remove this after we improve this feature.
_is_first_import_ = True
......@@ -715,15 +714,6 @@ def _xpu_ids():
return device_ids
def _npu_ids():
npus_env = os.getenv("FLAGS_selected_npus")
if npus_env:
device_ids = [int(s) for s in npus_env.split(",")]
else:
device_ids = range(core.get_npu_device_count())
return device_ids
def _custom_device_ids(device_type):
custom_devices_env = os.getenv("FLAGS_selected_" + device_type + "s")
if custom_devices_env:
......@@ -748,21 +738,6 @@ def is_compiled_with_xpu():
return core.is_compiled_with_xpu()
def is_compiled_with_npu():
"""
Whether this whl package can be used to run the model on NPU.
Returns (bool): support npu or not.
Examples:
.. code-block:: python
import paddle.fluid as fluid
support_npu = fluid.is_compiled_with_npu()
"""
return core.is_compiled_with_npu()
def disable_signal_handler():
"""
Reset signal handler registered by Paddle.
......@@ -921,47 +896,6 @@ def xpu_places(device_ids=None):
return [core.XPUPlace(dev_id) for dev_id in device_ids]
def npu_places(device_ids=None):
"""
Note:
For multi-card tasks, please use `FLAGS_selected_npus` environment variable to set the visible NPU device.
This function creates a list of :code:`paddle.NPUPlace` objects.
If :code:`device_ids` is None, environment variable of
:code:`FLAGS_selected_npus` would be checked first. For example, if
:code:`FLAGS_selected_npus=0,1,2`, the returned list would
be [paddle.NPUPlace(0), paddle.NPUPlace(1), paddle.NPUPlace(2)].
If :code:`FLAGS_selected_npus` is not set, all visible
npu places would be returned.
If :code:`device_ids` is not None, it should be the device
ids of NPUs. For example, if :code:`device_ids=[0,1,2]`,
the returned list would be
[paddle.NPUPlace(0), paddle.NPUPlace(1), paddle.NPUPlace(2)].
Parameters:
device_ids (list or tuple of int, optional): list of NPU device ids.
Returns:
list of paddle.NPUPlace: Created NPU place list.
Examples:
.. code-block:: python
# required: npu
import paddle
import paddle.static as static
paddle.enable_static()
npu_places = static.npu_places()
"""
assert core.is_compiled_with_npu(), "Not compiled with NPU"
if device_ids is None:
device_ids = _npu_ids()
elif not isinstance(device_ids, (list, tuple)):
device_ids = [device_ids]
return [core.NPUPlace(dev_id) for dev_id in device_ids]
def cpu_places(device_count=None):
"""
This function creates a list of :code:`paddle.CPUPlace` objects, and returns the created list.
......@@ -2587,10 +2521,6 @@ class Variable(metaclass=VariableMetaClass):
p = core.Place()
p.set_place(t._place())
place = core.XPUPlace(p.xpu_device_id())
elif p.is_npu_place():
p = core.Place()
p.set_place(t._place())
place = core.NPUPlace(p.npu_device_id())
else:
p = core.Place()
p.set_place(t._place())
......@@ -7520,9 +7450,9 @@ def device_guard(device=None):
device, index = device.split(':')
if device == 'cpu':
raise ValueError("Should not set device id for cpu.")
if device not in ['cpu', 'gpu', 'npu', 'xpu', '', None]:
if device not in ['cpu', 'gpu', 'xpu', '', None]:
raise ValueError(
"The Attr(device) should be 'cpu' 'npu' 'xpu' or 'gpu', and it can also be empty string or None "
"The Attr(device) should be 'cpu' 'npu' or 'gpu', and it can also be empty string or None "
"when there is no need to specify device. But received %s" % device
)
if index:
......@@ -7651,7 +7581,6 @@ def _get_paddle_place(place):
core.CPUPlace,
core.CUDAPinnedPlace,
core.CUDAPlace,
core.NPUPlace,
core.IPUPlace,
core.CustomPlace,
),
......@@ -7701,19 +7630,6 @@ def _get_paddle_place(place):
device_id = int(device_id)
return core.XPUPlace(device_id)
# NPU
avaliable_npu_place = re.match(r'npu:\d+', place)
if avaliable_npu_place:
if not core.is_compiled_with_npu():
raise ValueError(
"The device should not be {}, since PaddlePaddle is "
"not compiled with NPU".format(avaliable_npu_place.group())
)
place_info_list = place.split(':', 1)
device_id = place_info_list[1]
device_id = int(device_id)
return core.NPUPlace(device_id)
# IPU
avaliable_ipu_place = re.match(r'ipu:\d+', place)
if avaliable_ipu_place:
......@@ -7728,9 +7644,7 @@ def _get_paddle_place(place):
return core.IPUPlace(device_id)
raise ValueError(
"Paddle supports CPUPlace, CUDAPlace,CUDAPinnedPlace, XPUPlace, IPUPlace and NPUPlace, but received {}.".format(
place
)
f"Paddle supports CPUPlace, CUDAPlace, CUDAPinnedPlace, XPUPlace and IPUPlace, but received {place}."
)
......
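On the static-graph side, `device_guard` now accepts only `cpu`, `gpu[:x]` and `xpu[:x]`. A brief usage sketch (program construction only; the `gpu:0` scope assumes a CUDA build):

    import paddle

    paddle.enable_static()
    main, startup = paddle.static.Program(), paddle.static.Program()
    with paddle.static.program_guard(main, startup):
        with paddle.static.device_guard('cpu'):
            x = paddle.full(shape=[2, 2], fill_value=1.0, dtype='float32')
        with paddle.static.device_guard('gpu:0'):
            y = x + 1.0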
......@@ -4553,7 +4553,7 @@ class PipelineOptimizer:
def __init__(self, optimizer, num_microbatches=1, start_cpu_core_id=0):
self._device = 'cpu'
if core.is_compiled_with_npu():
if core.is_compiled_with_custom_device('npu'):
self._device = "npu"
elif core.is_compiled_with_cuda():
self._device = "gpu"
......@@ -5770,7 +5770,7 @@ class PipelineOptimizer:
# If there are some not initialized sections in the fused var,
# and the value in those sections are nan/inf, it will trigger the nan/inf check.
# To avoid these problematic triggers, set constant is needed for npu
"set_constant": core.is_compiled_with_npu(),
"set_constant": core.is_compiled_with_custom_device('npu'),
"constant": float(0.0),
},
)
......@@ -6387,8 +6387,8 @@ class PipelineOptimizer:
dev_index = int(dev.split(":")[1])
if core.is_compiled_with_cuda():
place_list.append(core.CUDAPlace(dev_index % 1))
elif core.is_compiled_with_npu():
place_list.append(core.NPUPlace(dev_index % 1))
elif paddle.is_compiled_with_custom_device('npu'):
place_list.append(paddle.CustomPlace('npu', dev_index % 1))
# Step6: Split startup program
new_startup_program = self._split_startup_program(
......@@ -6411,7 +6411,7 @@ class PipelineOptimizer:
if core.is_compiled_with_cuda():
place_id = int(os.getenv("FLAGS_selected_gpus", "0"))
elif core.is_compiled_with_npu():
elif core.is_compiled_with_custom_device('npu'):
place_id = int(os.getenv("FLAGS_selected_npus", "0"))
# A pass to move the recv op to the beginning of
# the forward/backward phase
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
def train(prefix):
selected_accelerators = os.getenv("FLAGS_selected_accelerators")
selected_npus = os.getenv("FLAGS_selected_npus")
trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
worker_endpoints = worker_endpoints_env
trainers_num = len(worker_endpoints.split(','))
device_ids = os.getenv("PADDLE_WORLD_DEVICE_IDS")
current_device_id = os.getenv("PADDLE_LOCAL_DEVICE_IDS")
details = "selected_accelerators:{} selected_npus:{} worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{} device_ids:{} device_id:{}".format(
selected_accelerators,
selected_npus,
worker_endpoints,
trainers_num,
current_endpoint,
trainer_id,
device_ids,
current_device_id,
)
print(details)
with open(f"multi_process_{prefix}.check_{trainer_id}.log", "w") as f:
f.write(details)
if __name__ == '__main__':
prefix = sys.argv[1]
train(prefix)
......@@ -37,9 +37,7 @@ class TestCEmbeddingCPU(OpTest):
def setUp(self):
self.init_dtype()
self.initcase()
if core.is_compiled_with_npu():
self.__class__.use_npu = True
elif core.is_compiled_with_xpu():
if core.is_compiled_with_xpu():
self.__class__.use_xpu = True
elif core.is_compiled_with_cuda():
self.__class__.exist_fp64_check_grad = True
......@@ -57,9 +55,7 @@ class TestCEmbeddingCPU(OpTest):
np_out = get_c_embedding(self.start_index, self.end_index, table, ids)
self.outputs = {'Out': np_out.reshape((2, 4, 64))}
self.attrs = {'start_index': self.start_index}
if core.is_compiled_with_npu():
self.__class__.use_npu = True
elif core.is_compiled_with_xpu():
if core.is_compiled_with_xpu():
self.__class__.use_xpu = True
def test_check_cpu(self):
......@@ -81,16 +77,12 @@ class TestCEmbeddingOpBase(TestCEmbeddingCPU):
def test_check_output(self):
if core.is_compiled_with_cuda():
self.check_output_with_place(core.CUDAPlace(0))
elif core.is_compiled_with_npu():
self.check_output_with_place(core.NPUPlace(0))
elif core.is_compiled_with_xpu():
self.check_output_with_place(core.XPUPlace(0))
def test_check_grad(self):
if core.is_compiled_with_cuda():
self.check_grad_with_place(core.CUDAPlace(0), ['W'], 'Out')
elif core.is_compiled_with_npu():
self.check_grad_with_place(core.NPUPlace(0), ['W'], 'Out')
elif core.is_compiled_with_xpu():
self.check_grad_with_place(core.XPUPlace(0), ['W'], 'Out')
......@@ -98,9 +90,6 @@ class TestCEmbeddingOpBase(TestCEmbeddingCPU):
if core.is_compiled_with_cuda():
self.dtype = "float64"
self.ids_dtype = "int64"
elif core.is_compiled_with_npu():
self.dtype = "float32"
self.ids_dtype = "int32"
elif core.is_compiled_with_xpu():
self.dtype = "float32"
self.ids_dtype = "int64"
......@@ -129,9 +118,7 @@ class TestCEmbeddingOpFP32(TestCEmbeddingOpBase):
self.outputs = {'Out': np_out.reshape((2, 4, 64))}
self.attrs = {'start_index': self.start_index}
if core.is_compiled_with_npu():
self.__class__.use_npu = True
elif core.is_compiled_with_xpu():
if core.is_compiled_with_xpu():
self.__class__.use_xpu = True
elif core.is_compiled_with_cuda():
self.__class__.exist_fp64_check_grad = True
......
......@@ -24,6 +24,16 @@ from copy import copy
import numpy as np
from op import Operator
from prim_op_test import OpTestUtils, PrimForwardChecker, PrimGradChecker
from testsuite import append_input_output, append_loss_ops, create_op, set_input
from white_list import (
check_shape_white_list,
compile_vs_runtime_white_list,
no_check_set_white_list,
no_grad_set_white_list,
op_accuracy_white_list,
op_threshold_white_list,
)
import paddle
from paddle import fluid
......@@ -36,20 +46,9 @@ from paddle.fluid.framework import (
_current_expected_place,
canonicalize_attrs,
)
from paddle.fluid.wrapped_decorator import signature_safe_contextmanager
sys.path.append(os.path.abspath(os.path.dirname(__file__)))
from prim_op_test import OpTestUtils, PrimForwardChecker, PrimGradChecker
from testsuite import append_input_output, append_loss_ops, create_op, set_input
from white_list import (
check_shape_white_list,
compile_vs_runtime_white_list,
no_check_set_white_list,
no_grad_set_white_list,
op_accuracy_white_list,
op_threshold_white_list,
)
from paddle.fluid.wrapped_decorator import signature_safe_contextmanager
@signature_safe_contextmanager
......@@ -338,9 +337,6 @@ class OpTest(unittest.TestCase):
np.random.seed(123)
random.seed(124)
if paddle.is_compiled_with_npu():
cls._use_system_allocator = _set_use_system_allocator(False)
else:
cls._use_system_allocator = _set_use_system_allocator(True)
@classmethod
......@@ -376,9 +372,6 @@ class OpTest(unittest.TestCase):
def is_rocm_op_test():
return core.is_compiled_with_rocm()
def is_npu_op_test():
return hasattr(cls, "use_npu") and cls.use_npu
def is_custom_device_op_test():
return hasattr(cls, "use_custom_device") and cls.use_custom_device
......@@ -411,7 +404,6 @@ class OpTest(unittest.TestCase):
and not is_xpu_op_test()
and not is_mkldnn_op_test()
and not is_rocm_op_test()
and not is_npu_op_test()
and not is_custom_device_op_test()
and not cls.check_prim
):
......@@ -1965,10 +1957,8 @@ class OpTest(unittest.TestCase):
# Check inplace for given op, its grad op, its grad_grad op, etc.
# No effect on original OpTest
# Currently not support ParallelExecutor on XPUPlace.
if (
not paddle.is_compiled_with_xpu()
and not paddle.is_compiled_with_npu()
and not isinstance(place, core.CustomPlace)
if not paddle.is_compiled_with_xpu() and not isinstance(
place, core.CustomPlace
):
self.check_inplace_output_with_place(
place, no_check_set=no_check_set, inplace_atol=inplace_atol
......
......@@ -59,7 +59,6 @@ class TestMaxMemoryAllocated(unittest.TestCase):
-2,
0.5,
"gpu1",
"npu",
]
for device in wrong_device:
with self.assertRaises(BaseException):
......
......@@ -59,7 +59,6 @@ class TestMaxMemoryreserved(unittest.TestCase):
-2,
0.5,
"gpu1",
"npu",
]
for device in wrong_device:
with self.assertRaises(BaseException):
......
......@@ -44,7 +44,6 @@ class TestMemoryAllocated(unittest.TestCase):
-2,
0.5,
"gpu1",
"npu",
]
for device in wrong_device:
with self.assertRaises(BaseException):
......
......@@ -44,7 +44,6 @@ class TestMemoryreserved(unittest.TestCase):
-2,
0.5,
"gpu1",
"npu",
]
for device in wrong_device:
with self.assertRaises(BaseException):
......
......@@ -46,10 +46,6 @@ class TestStaticDeviceManage(unittest.TestCase):
if core.is_compiled_with_xpu():
self._test_device("xpu:0", core.XPUPlace)
def test_npu_device(self):
if core.is_compiled_with_npu():
self._test_device("npu:0", core.NPUPlace)
class TestImperativeDeviceManage(unittest.TestCase):
def test_cpu(self):
......@@ -95,25 +91,6 @@ class TestImperativeDeviceManage(unittest.TestCase):
self.assertTrue(out.place.is_xpu_place())
self.assertEqual(device, "xpu:0")
def test_npu(self):
if core.is_compiled_with_npu():
with fluid.dygraph.guard():
paddle.set_device('npu:0')
out1 = paddle.zeros(shape=[1, 3], dtype='float32')
out2 = paddle.ones(shape=[1, 3], dtype='float32')
out3 = paddle.concat(x=[out1, out2], axis=0)
device = paddle.get_device()
self.assertEqual(
isinstance(
framework._current_expected_place(), core.NPUPlace
),
True,
)
self.assertTrue(out1.place.is_npu_place())
self.assertTrue(out2.place.is_npu_place())
self.assertTrue(out3.place.is_npu_place())
self.assertEqual(device, "npu:0")
if __name__ == '__main__':
unittest.main()
......@@ -17,11 +17,13 @@ import ast
import os
import pickle
import random
import socket
import subprocess
import sys
import tempfile
import time
import unittest
from contextlib import closing
import numpy as np
......@@ -684,9 +686,6 @@ class TestParallelDyGraphRunnerBase:
elif fluid.core.is_compiled_with_xpu():
device_id = int(os.getenv("FLAGS_selected_xpus", "0"))
place = fluid.XPUPlace(device_id)
elif fluid.core.is_compiled_with_npu():
device_id = int(os.getenv("FLAGS_selected_npus", "0"))
place = fluid.NPUPlace(device_id)
else:
assert "Only support CUDAPlace or XPUPlace or CPU(Gloo) for now."
......@@ -888,7 +887,6 @@ def runtime_main(test_class):
parser.add_argument('--use_cpu', action='store_true')
parser.add_argument('--use_xpu', action='store_true')
parser.add_argument('--use_dgc', action='store_true')
parser.add_argument('--use_npu', action='store_true')
parser.add_argument('--accumulate_gradient', action='store_true')
parser.add_argument('--find_unused_parameters', action='store_true')
parser.add_argument('--use_reduce', action='store_true')
......@@ -932,10 +930,6 @@ def runtime_main(test_class):
model.run_trainer(args)
import socket
from contextlib import closing
class TestDistBase(unittest.TestCase):
def _setup_config(self):
raise NotImplementedError("tests should have _setup_config implemented")
......@@ -945,21 +939,13 @@ class TestDistBase(unittest.TestCase):
self.__use_cuda = False
self.__use_xpu = False
self._use_dgc = False
self.__use_npu = False
elif self._enforce_place == "GPU":
self.__use_cuda = True
self.__use_xpu = False
self.__use_npu = False
elif self._enforce_place == "XPU":
self.__use_cuda = False
self.__use_xpu = True
self._use_dgc = False
self.__use_npu = False
elif self._enforce_place == "NPU":
self.__use_cuda = False
self.__use_xpu = False
self._use_dgc = False
self.__use_npu = True
else:
if fluid.core.is_compiled_with_cuda():
self.__use_cuda = True
......@@ -1149,13 +1135,6 @@ class TestDistBase(unittest.TestCase):
"PADDLE_TRAINERS_NUM": "1",
"PADDLE_TRAINER_ID": "0",
}
elif self.__use_npu:
cmd += " --use_npu"
env_local = {
"FLAGS_selected_npus": devices,
"PADDLE_TRAINERS_NUM": "1",
"PADDLE_TRAINER_ID": "0",
}
else:
env_local = {'CPU_NUM': '1'}
......@@ -1447,18 +1426,6 @@ class TestDistBase(unittest.TestCase):
"GLOG_v": "2",
}
)
elif self.__use_npu:
tr_cmd += " --use_npu"
env.update(
{
"FLAGS_selected_npus": f"{trainer_id}",
"PADDLE_TRAINERS_NUM": f"{trainer_num}",
"PADDLE_TRAINER_ID": f"{trainer_id}",
"PADDLE_TRAINER_ENDPOINTS": self._ps_endpoints,
"PADDLE_CURRENT_ENDPOINT": ep,
"GLOG_v": "2",
}
)
else:
env.update({'CPU_NUM': '1'})
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import unittest
from paddle.distributed.fleet import ascend_utils
RANK_TABLE_JSON = {
"status": "completed",
"version": "1.0",
"server_count": "1",
"server_list": [
{
"server_id": "127.0.0.1",
"device": [
{"device_id": "0", "device_ip": "192.1.184.23", "rank_id": "0"},
{"device_id": "1", "device_ip": "192.2.21.93", "rank_id": "1"},
],
}
],
}
class TestAscendUtil(unittest.TestCase):
def test_get_cloud_cluster(self):
cluster, pod = ascend_utils.get_cloud_cluster()
self.assertTrue(cluster)
self.assertTrue(pod)
with open('rank_table_file.json', 'w') as f:
json.dump(RANK_TABLE_JSON, f)
rank_table_file = "./rank_table_file.json"
cluster, pod = ascend_utils.get_cloud_cluster(
rank_table_file=rank_table_file
)
self.assertTrue(cluster)
self.assertTrue(pod)
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import paddle
class TestNPUIdentityOp(unittest.TestCase):
def setUp(self):
self.op_type = "npu_identity"
self.shape = [64, 6, 28, 28]
self.x = np.random.random(self.shape).astype(np.float32)
self.format = 3 # ACL_FORMAT_NC1HWC0 = 3
self.place = paddle.CPUPlace()
def test_api_static(self):
paddle.enable_static()
main_program = paddle.static.default_main_program()
startup_program = paddle.static.default_startup_program()
with paddle.static.program_guard(main_program, startup_program):
x_data = paddle.static.data(
shape=self.shape, name="data", dtype='float32'
)
output = paddle.incubate._npu_identity(x=x_data, format=self.format)
exe = paddle.static.Executor()
exe.run(startup_program)
result = exe.run(
main_program, feed={x_data.name: self.x}, fetch_list=[output]
)
np.testing.assert_allclose(result[0], self.x, rtol=1e-08)
def test_api_dygraph(self):
paddle.disable_static(self.place)
x = paddle.to_tensor(self.x)
out = paddle.incubate._npu_identity(x, self.format)
np.testing.assert_allclose(out.numpy(), self.x, rtol=1e-08)
paddle.enable_static()
if __name__ == "__main__":
unittest.main()
......@@ -272,9 +272,6 @@ class TestVarBase(unittest.TestCase):
check_with_place("gpu_pinned")
check_with_place(core.CUDAPlace(0))
check_with_place("gpu:0")
if core.is_compiled_with_npu():
check_with_place(core.NPUPlace(0))
check_with_place("npu:0")
def test_to_tensor_not_change_input_stop_gradient(self):
with paddle.fluid.dygraph.guard(core.CPUPlace()):
......
......@@ -156,33 +156,6 @@ def init_communicator(
'ring_id': 0,
},
)
elif core.is_compiled_with_npu():
hccl_id_var = block.create_var(
name=fluid.unique_name.generate('hccl_id'),
persistable=True,
type=core.VarDesc.VarType.RAW,
)
block.append_op(
type='c_gen_hccl_id',
inputs={},
outputs={'Out': hccl_id_var},
attrs={
'rank': rank,
'endpoint': current_endpoint,
'other_endpoints': other_endpoints,
},
)
block.append_op(
type='c_comm_init_hccl',
inputs={'X': hccl_id_var},
outputs={},
attrs={
'rank': rank,
'ring_id': 0,
'device_id': int(os.getenv("FLAGS_selected_npus")),
'rank_ids': nranks,
},
)
elif core.is_compiled_with_xpu():
bkcl_id_var = block.create_var(
name=fluid.unique_name.generate('bkcl_id'),
......
......@@ -16,7 +16,7 @@ from paddle import _C_ops, _legacy_C_ops, get_flags, in_dynamic_mode
from paddle.device import (
get_all_custom_device_type,
is_compiled_with_cuda,
is_compiled_with_npu,
is_compiled_with_custom_device,
is_compiled_with_rocm,
)
from paddle.fluid.framework import _global_flags, in_dygraph_mode
......@@ -466,7 +466,7 @@ def conv1d(
use_cudnn = False
# NPU only supports depthwise_conv2d when "input_channel = output_channel = groups"
if is_compiled_with_npu():
if is_compiled_with_custom_device('npu'):
if num_channels == groups and num_channels == num_filters:
l_type = 'depthwise_conv2d'
else:
......@@ -756,7 +756,7 @@ def conv2d(
use_mkldnn = _global_flags()["FLAGS_use_mkldnn"]
# NPU only supports depthwise_conv2d when "input_channel = output_channel = groups"
if is_compiled_with_npu():
if is_compiled_with_custom_device('npu'):
if num_channels == groups and num_channels == num_filters:
l_type = 'depthwise_conv2d'
else:
......
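The depthwise special case called out in these comments (input_channel == output_channel == groups) is plain grouped convolution at the Python API level; which kernel it lowers to is a backend detail. A hedged illustration:

    import paddle
    import paddle.nn.functional as F

    x = paddle.randn([1, 8, 32, 32])
    w = paddle.randn([8, 1, 3, 3])        # [out_channels, in_channels // groups, kH, kW]
    # in_channels == out_channels == groups == 8: some backends pick a
    # depthwise_conv2d kernel here, but the call is identical either way.
    y = F.conv2d(x, w, groups=8, padding=1)
    print(y.shape)                        # [1, 8, 32, 32]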
......@@ -60,7 +60,6 @@ from ..fluid.framework import program_guard # noqa: F401
from ..fluid.framework import cpu_places # noqa: F401
from ..fluid.framework import cuda_places # noqa: F401
from ..fluid.framework import xpu_places # noqa: F401
from ..fluid.framework import npu_places # noqa: F401
from ..fluid.framework import Variable # noqa: F401
from ..fluid.framework import Operator # noqa: F401
from ..fluid.framework import Parameter # noqa: F401
......@@ -118,7 +117,6 @@ __all__ = [ # noqa
'cpu_places',
'cuda_places',
'xpu_places',
'npu_places',
'Variable',
'create_global_var',
'accuracy',
......
......@@ -54,7 +54,7 @@ def check_finite_and_unscale(x, scale, name=None, float_status=None):
)
inputs = {'X': x, 'Scale': scale}
if core.is_compiled_with_npu():
if core.is_compiled_with_custom_device('npu'):
check_variable_and_dtype(
float_status,
"float_status",
......
......@@ -187,7 +187,7 @@ class OptimizerWithMixedPrecision:
self._train_program = train_program
# NOTE(zhiqiu): _float_status is only used for NPU.
if core.is_compiled_with_npu():
if core.is_compiled_with_custom_device('npu'):
float_status = paddle.static.data(
name="float_status", shape=[8], dtype='float32'
)
......@@ -408,7 +408,7 @@ class OptimizerWithMixedPrecision:
if self._is_distributed:
# if distributed, split check_finite_and_unscale to overlap
# unscale with communication
if core.is_compiled_with_npu():
if core.is_compiled_with_custom_device('npu'):
with self._train_program._optimized_guard(grads):
_, found_inf = check_finite_and_unscale(
grads,
......
......@@ -182,7 +182,7 @@ if core.is_compiled_with_xpu():
_, _, _sys_unsupported_fp16_list = core.op_supported_infos(
'XPU', core.VarDesc.VarType.FP16
)
elif core.is_compiled_with_npu():
elif core.is_compiled_with_custom_device('npu'):
_, _, _sys_unsupported_fp16_list = core.op_supported_infos(
'NPU', core.VarDesc.VarType.FP16
)
......
......@@ -1536,10 +1536,7 @@ def load(program, model_path, executor=None, var_list=None):
p = paddle.fluid.core.Place()
p.set_place(t._place())
place = paddle.fluid.XPUPlace(p.xpu_device_id())
elif p.is_npu_place():
p = paddle.fluid.core.Place()
p.set_place(t._place())
place = paddle.fluid.NPUPlace(p.npu_device_id())
else:
p = paddle.fluid.core.Place()
p.set_place(t._place())
......@@ -1676,10 +1673,6 @@ def set_program_state(program, state_dict):
p = paddle.fluid.core.Place()
p.set_place(ten_place)
py_place = paddle.fluid.XPUPlace(p.xpu_device_id())
elif ten_place.is_npu_place():
p = paddle.fluid.core.Place()
p.set_place(ten_place)
py_place = paddle.fluid.NPUPlace(p.npu_device_id())
ten.set(new_para_np, py_place)
......
......@@ -946,7 +946,7 @@ def conv2d(
l_type = 'depthwise_conv2d'
# NPU only supports depthwise_conv2d when "input_channel = output_channel = groups"
if core.is_compiled_with_npu():
if core.is_compiled_with_custom_device('npu'):
if num_channels == groups and num_channels == num_filters:
l_type = 'depthwise_conv2d'
else:
......
......@@ -2246,8 +2246,6 @@ def _memcpy(input, place=None, output=None):
dst_place_type = 2
elif p.is_xpu_place():
dst_place_type = 3
elif p.is_npu_place():
dst_place_type = 4
attrs = {'dst_place_type': dst_place_type}
helper.append_op(
......
......@@ -65,22 +65,6 @@ def _is_cuda_available():
return False
def _is_npu_available():
"""
Check whether NPU is avaiable.
"""
try:
assert len(paddle.static.npu_places()) > 0
return True
except Exception as e:
logging.warning(
"You are using NPU version PaddlePaddle, but there is no NPU "
"detected on your machine. Maybe NPU devices is not set properly."
"\n Original Error is {}".format(e)
)
return False
def _is_xpu_available():
"""
Check whether XPU is avaiable.
......@@ -97,22 +81,19 @@ def _is_xpu_available():
return False
def _run_dygraph_single(use_cuda, use_xpu, use_npu):
def _run_dygraph_single(use_cuda, use_xpu):
"""
Testing the simple network in dygraph mode using one CPU/GPU/XPU/NPU.
Testing the simple network in dygraph mode using one CPU/GPU/XPU.
Args:
use_cuda (bool): Whether running with CUDA.
use_xpu (bool): Whether running with XPU.
use_npu (bool): Whether running with NPU.
"""
paddle.disable_static()
if use_cuda:
paddle.set_device('gpu')
elif use_xpu:
paddle.set_device('xpu')
elif use_npu:
paddle.set_device('npu')
else:
paddle.set_device('cpu')
weight_attr = paddle.ParamAttr(
......@@ -135,14 +116,13 @@ def _run_dygraph_single(use_cuda, use_xpu, use_npu):
opt.step()
def _run_static_single(use_cuda, use_xpu, use_npu):
def _run_static_single(use_cuda, use_xpu):
"""
Testing the simple network with executor running directly, using one CPU/GPU/XPU/NPU.
Testing the simple network with executor running directly, using one CPU/GPU/XPU.
Args:
use_cuda (bool): Whether running with CUDA.
use_xpu (bool): Whether running with XPU.
use_npu (bool): Whether running with NPU.
"""
paddle.enable_static()
with paddle.static.scope_guard(paddle.static.Scope()):
......@@ -159,8 +139,6 @@ def _run_static_single(use_cuda, use_xpu, use_npu):
place = paddle.CUDAPlace(0)
elif use_xpu:
place = paddle.XPUPlace(0)
elif use_npu:
place = paddle.NPUPlace(0)
else:
place = paddle.CPUPlace()
......@@ -223,7 +201,6 @@ def _run_parallel(device_list):
Args:
use_cuda (bool): Whether running with CUDA.
use_xpu (bool): Whether running with XPU.
use_npu (bool): Whether running with NPU.
device_list (int): The specified devices.
"""
paddle.distributed.spawn(train_for_run_parallel, nprocs=len(device_list))
......@@ -252,14 +229,11 @@ def run_check():
use_cuda = False
use_xpu = False
use_npu = False
if paddle.is_compiled_with_cuda():
use_cuda = _is_cuda_available()
elif paddle.is_compiled_with_xpu():
use_xpu = _is_xpu_available()
elif paddle.is_compiled_with_npu():
use_npu = _is_npu_available()
if use_cuda:
device_str = "GPU"
......@@ -267,16 +241,13 @@ def run_check():
elif use_xpu:
device_str = "XPU"
device_list = paddle.static.xpu_places()
elif use_npu:
device_str = "NPU"
device_list = paddle.static.npu_places()
else:
device_str = "CPU"
device_list = paddle.static.cpu_places(device_count=1)
device_count = len(device_list)
_run_static_single(use_cuda, use_xpu, use_npu)
_run_dygraph_single(use_cuda, use_xpu, use_npu)
_run_static_single(use_cuda, use_xpu)
_run_dygraph_single(use_cuda, use_xpu)
print(f"PaddlePaddle works well on 1 {device_str}.")
try:
......
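The installation check now probes CPU, GPU and XPU builds only; invoking it is unchanged:

    import paddle

    paddle.utils.run_check()   # e.g. "PaddlePaddle works well on 1 CPU."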
......@@ -18,8 +18,6 @@ from distutils.sysconfig import get_python_lib
from setuptools import Extension, setup
from setuptools.command.build_ext import build_ext
from paddle.fluid import core
# refer: https://note.qidong.name/2018/03/setup-warning-strict-prototypes
# Avoid a gcc warning below:
......@@ -40,8 +38,6 @@ paddle_extra_compile_args = [
'-Wno-parentheses',
'-DPADDLE_WITH_CUSTOM_KERNEL',
]
if core.is_compiled_with_npu():
paddle_extra_compile_args += ['-D_GLIBCXX_USE_CXX11_ABI=0']
# include path
site_packages_path = get_python_lib()
......
......@@ -18,8 +18,6 @@ import site
from setuptools import Extension, setup
from setuptools.command.build_ext import build_ext
from paddle.fluid import core
# refer: https://note.qidong.name/2018/03/setup-warning-strict-prototypes
# Avoid a gcc warning below:
......@@ -40,8 +38,6 @@ paddle_extra_compile_args = [
'-Wno-parentheses',
'-DPADDLE_WITH_CUSTOM_KERNEL',
]
if core.is_compiled_with_npu():
paddle_extra_compile_args += ['-D_GLIBCXX_USE_CXX11_ABI=0']
# include path
site_packages_path = site.getsitepackages()
......
......@@ -32,9 +32,6 @@ def download_file():
if paddle.is_compiled_with_rocm():
url = "https://sys-p0.bj.bcebos.com/prec/{}".format('disable_ut_rocm')
if paddle.is_compiled_with_npu():
url = "https://sys-p0.bj.bcebos.com/prec/{}".format('disable_ut_npu')
f = requests.get(url)
data = f.text
status_code = f.status_code
......