Unverified · Commit 7976e2a3 · Authored by: Kim Yann · Committed by: GitHub

rem is_compiled_with_npu (#52385)

* rem is_compiled_with_npu

* rem npu related code

* make lint happy

* rem test

* remove some tests

* Update grad_scaler.py

* fix an error
Parent 2acc2b14
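Note: after this removal, the Python-side check for Ascend NPU support goes through the custom-device plugin API instead of a dedicated flag. A minimal sketch of the replacement calls (the plugin name 'npu' is an assumption; on builds without the plugin the code simply falls back to CPU):

```python
import paddle

# Replacement for the removed paddle.device.is_compiled_with_npu():
if paddle.device.is_compiled_with_custom_device('npu'):
    # Replacement for the removed paddle.NPUPlace(0):
    place = paddle.CustomPlace('npu', 0)
else:
    place = paddle.CPUPlace()
print(place)
```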
@@ -8,9 +8,6 @@ exclude =
     ./python/paddle/fluid/tra**,
     # Exclude third-party libraries
     ./python/paddle/utils/gast/**,
-    # Exclude files that will be removed in the future, see more at
-    # https://github.com/PaddlePaddle/Paddle/pull/46782#issuecomment-1273033731
-    ./python/paddle/fluid/tests/unittests/npu/**,
 ignore =
     # Whitespace before ‘,’, ‘;’, or ‘:’, it is not compatible with black
     E203,
......
@@ -4,8 +4,7 @@ exclude: |
     patches/.+|
     paddle/fluid/framework/fleet/heter_ps/cudf/.+|
     paddle/fluid/distributed/ps/thirdparty/round_robin.h|
-    python/paddle/utils/gast/.+|
-    python/paddle/fluid/tests/unittests/npu/.+
+    python/paddle/utils/gast/.+
     )$
 repos:
 # Common hooks
......
@@ -265,14 +265,6 @@ bool IsCompiledWithROCM() {
 #endif
 }
-bool IsCompiledWithAscend() {
-#ifndef PADDLE_WITH_ASCEND
-  return false;
-#else
-  return true;
-#endif
-}
 bool IsCompiledWithXPU() {
 #ifndef PADDLE_WITH_XPU
   return false;
@@ -281,8 +273,6 @@ bool IsCompiledWithXPU() {
 #endif
 }
-bool IsCompiledWithNPU() { return false; }
 bool IsCompiledWithCustomDevice(std::string device_type) {
 #ifndef PADDLE_WITH_CUSTOM_DEVICE
   return false;
@@ -1592,14 +1582,6 @@ All parameter, weight, gradient are variables in Paddle.
         return context;
 #endif
       })
-      .def_static(
-          "create",
-          [](paddle::platform::NPUPlace &place)
-              -> paddle::platform::DeviceContext * {
-            PADDLE_THROW(platform::errors::PermissionDenied(
-                "Cannot use NPUPlace in CPU/GPU/XPU version, "
-                "Please recompile or reinstall Paddle with NPU support."));
-          })
       .def_static("create",
                   [](paddle::platform::CustomPlace &place)
                       -> paddle::platform::DeviceContext * {
@@ -1769,13 +1751,6 @@ All parameter, weight, gradient are variables in Paddle.
             pybind11::gil_scoped_release release;
             self.Run(scope, place);
           })
-      .def("run",
-           [](OperatorBase &self,
-              const Scope &scope,
-              const platform::NPUPlace &place) {
-             pybind11::gil_scoped_release release;
-             self.Run(scope, place);
-           })
      .def("run",
           [](OperatorBase &self,
             const Scope &scope,
@@ -1985,9 +1960,7 @@ All parameter, weight, gradient are variables in Paddle.
       });
   m.def("is_compiled_with_avx", IsCompiledWithAVX);
   m.def("is_compiled_with_cuda", IsCompiledWithCUDA);
-  m.def("is_compiled_with_ascend", IsCompiledWithAscend);
   m.def("is_compiled_with_rocm", IsCompiledWithROCM);
-  m.def("is_compiled_with_npu", IsCompiledWithNPU);
   m.def("is_compiled_with_custom_device", IsCompiledWithCustomDevice);
   m.def("is_compiled_with_ipu", IsCompiledWithIPU);
   m.def("is_compiled_with_xpu", IsCompiledWithXPU);
......
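With the `IsCompiledWithNPU`/`IsCompiledWithAscend` bindings gone, `is_compiled_with_custom_device` is the remaining query point in `core`. A hedged check from the Python side (assuming the plugin device type name 'npu'):

```python
from paddle.fluid import core

# The legacy binding no longer exists after this change; guard with hasattr
# instead of calling it directly.
print('legacy is_compiled_with_npu binding present:', hasattr(core, 'is_compiled_with_npu'))
# The surviving binding takes the custom device type name.
print('npu custom device compiled in:', core.is_compiled_with_custom_device('npu'))
```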
@@ -14,7 +14,6 @@ extend_skip_glob = [
     "python/paddle/fluid/[!t]**",
     "python/paddle/fluid/tra**",
     "python/paddle/utils/gast/**",
-    "python/paddle/fluid/tests/unittests/npu/**",
 ]
 [tool.ruff]
@@ -23,7 +22,6 @@ exclude = [
     "./python/paddle/fluid/[!t]**",
     "./python/paddle/fluid/tra**",
     "./python/paddle/utils/gast/**",
-    "./python/paddle/fluid/tests/unittests/npu/**",
 ]
 target-version = "py37"
 select = [
......
@@ -334,7 +334,6 @@ from .framework import ParamAttr  # noqa: F401
 from .framework import CPUPlace  # noqa: F401
 from .framework import IPUPlace  # noqa: F401
 from .framework import CUDAPlace  # noqa: F401
-from .framework import NPUPlace  # noqa: F401
 from .framework import CUDAPinnedPlace  # noqa: F401
 from .framework import CustomPlace  # noqa: F401
@@ -363,7 +362,6 @@ from .device import get_cudnn_version  # noqa: F401
 from .device import set_device  # noqa: F401
 from .device import get_device  # noqa: F401
 from .device import is_compiled_with_xpu  # noqa: F401
-from .device import is_compiled_with_npu  # noqa: F401
 from .device import is_compiled_with_ipu  # noqa: F401
 from .device import is_compiled_with_cinn  # noqa: F401
 from .device import is_compiled_with_cuda  # noqa: F401
@@ -512,7 +510,6 @@ __all__ = [  # noqa
     'histogram',
     'multiplex',
     'CUDAPlace',
-    'NPUPlace',
     'empty',
     'shape',
     'real',
......
@@ -344,7 +344,6 @@ def amp_guard(
     if enable and not (
         tracer._expected_place.is_gpu_place()
         or tracer._expected_place.is_xpu_place()
-        or tracer._expected_place.is_npu_place()
         or tracer._expected_place.is_custom_place()
     ):
         warnings.warn(
@@ -352,10 +351,6 @@ def amp_guard(
             % tracer._expected_place
         )
         enable = False
-    # For npu:
-    if tracer._expected_place.is_npu_place() and (dtype == 'bfloat16'):
-        warnings.warn('NPUPlace only support float16 amp.')
-        enable = False
     # For xpu:
     if tracer._expected_place.is_xpu_place() and (dtype == 'bfloat16'):
         warnings.warn('XPUPlace only support float16 amp.')
......
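With the NPU branch dropped, `amp_guard` enables AMP only on GPU, XPU and custom-device places, and the bfloat16 restriction is kept for XPU only. A hedged usage sketch of the public wrapper (on an unsupported place it warns and silently runs in FP32, so this is runnable anywhere):

```python
import paddle

model = paddle.nn.Linear(4, 4)
x = paddle.rand([2, 4])
# auto_cast enables AMP only on supported places; elsewhere it is a no-op.
with paddle.amp.auto_cast(enable=True, dtype='float16'):
    y = model(x)
print(y.dtype)
```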
@@ -105,11 +105,10 @@ class AmpScaler:
         if enable and not (
             tracer._expected_place.is_gpu_place()
             or tracer._expected_place.is_xpu_place()
-            or tracer._expected_place.is_npu_place()
             or tracer._expected_place.is_custom_place()
         ):
             warnings.warn(
-                'AmpScaler can only be enabled on CUDAPlace, XPUPlace, MLUPlace, NPUPlace and CustomPlace, current place is %s, so it makes no effect.'
+                'AmpScaler can only be enabled on CUDAPlace, XPUPlace, MLUPlace and CustomPlace, current place is %s, so it makes no effect.'
                 % tracer._expected_place
             )
             enable = False
@@ -326,74 +325,36 @@ class AmpScaler:
             if param.dtype == core.VarDesc.VarType.FP32
         ]
         self._found_inf = self._temp_found_inf_value_false
-        if core.is_compiled_with_npu():
-            float_status = _legacy_C_ops.alloc_float_status()
-            _legacy_C_ops.clear_float_status(float_status, float_status)
-            if len(param_grads_fp16):
-                _legacy_C_ops.check_finite_and_unscale(
-                    param_grads_fp16,
-                    self._scale,
-                    float_status,
-                    param_grads_fp16,
-                    self._temp_found_inf_fp16,
-                )
-                self._found_inf = _C_ops.bitwise_or(
-                    self._found_inf, self._temp_found_inf_fp16
-                )
-            if len(param_grads_bf16):
-                _legacy_C_ops.check_finite_and_unscale(
-                    param_grads_bf16,
-                    self._scale,
-                    float_status,
-                    param_grads_bf16,
-                    self._temp_found_inf_bf16,
-                )
-                self._found_inf = _C_ops.bitwise_or(
-                    self._found_inf, self._temp_found_inf_bf16
-                )
-            if len(param_grads_fp32):
-                _legacy_C_ops.check_finite_and_unscale(
-                    param_grads_fp32,
-                    self._scale,
-                    float_status,
-                    param_grads_fp32,
-                    self._temp_found_inf_fp32,
-                )
-                self._found_inf = _C_ops.bitwise_or(
-                    self._found_inf, self._temp_found_inf_fp32
-                )
-        else:
-            if len(param_grads_fp16):
-                _legacy_C_ops.check_finite_and_unscale(
-                    param_grads_fp16,
-                    self._scale,
-                    param_grads_fp16,
-                    self._temp_found_inf_fp16,
-                )
-                self._found_inf = _C_ops.bitwise_or(
-                    self._found_inf, self._temp_found_inf_fp16
-                )
-            if len(param_grads_bf16):
-                _legacy_C_ops.check_finite_and_unscale(
-                    param_grads_bf16,
-                    self._scale,
-                    param_grads_bf16,
-                    self._temp_found_inf_bf16,
-                )
-                self._found_inf = _C_ops.bitwise_or(
-                    self._found_inf, self._temp_found_inf_bf16
-                )
-            if len(param_grads_fp32):
-                _legacy_C_ops.check_finite_and_unscale(
-                    param_grads_fp32,
-                    self._scale,
-                    param_grads_fp32,
-                    self._temp_found_inf_fp32,
-                )
-                self._found_inf = _C_ops.bitwise_or(
-                    self._found_inf, self._temp_found_inf_fp32
-                )
+        if len(param_grads_fp16):
+            _legacy_C_ops.check_finite_and_unscale(
+                param_grads_fp16,
+                self._scale,
+                param_grads_fp16,
+                self._temp_found_inf_fp16,
+            )
+            self._found_inf = _C_ops.bitwise_or(
+                self._found_inf, self._temp_found_inf_fp16
+            )
+        if len(param_grads_bf16):
+            _legacy_C_ops.check_finite_and_unscale(
+                param_grads_bf16,
+                self._scale,
+                param_grads_bf16,
+                self._temp_found_inf_bf16,
+            )
+            self._found_inf = _C_ops.bitwise_or(
+                self._found_inf, self._temp_found_inf_bf16
+            )
+        if len(param_grads_fp32):
+            _legacy_C_ops.check_finite_and_unscale(
+                param_grads_fp32,
+                self._scale,
+                param_grads_fp32,
+                self._temp_found_inf_fp32,
+            )
+            self._found_inf = _C_ops.bitwise_or(
+                self._found_inf, self._temp_found_inf_fp32
+            )
         optimizer_state["state"] = OptimizerState.UNSCALED
......
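The removed branch allocated and cleared an NPU float-status tensor before every `check_finite_and_unscale` call; the unified path above simply runs the op per dtype group and ORs the found-inf flags. A hedged usage sketch of the public wrapper around this scaler (when the place is unsupported it warns and degrades to a plain FP32 step, so this runs on any build):

```python
import paddle

model = paddle.nn.Linear(4, 2)
opt = paddle.optimizer.SGD(parameters=model.parameters())
scaler = paddle.amp.GradScaler(init_loss_scaling=1024)

x = paddle.rand([8, 4])
with paddle.amp.auto_cast():
    loss = model(x).mean()
scaled = scaler.scale(loss)   # scale the loss before backward
scaled.backward()
scaler.minimize(opt, scaled)  # unscale grads, skip the step on inf/nan
opt.clear_grad()
```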
@@ -36,7 +36,6 @@ __all__ = [  # noqa
     'is_compiled_with_cinn',
     'is_compiled_with_cuda',
     'is_compiled_with_rocm',
-    'is_compiled_with_npu',
     'is_compiled_with_custom_device',
     'get_all_device_type',
     'get_all_custom_device_type',
@@ -53,24 +52,6 @@ __all__ = [  # noqa
 _cudnn_version = None
-# TODO: WITH_ASCEND_CL may changed to WITH_NPU or others in the future
-# for consistent.
-def is_compiled_with_npu():
-    """
-    Whether paddle was built with WITH_ASCEND_CL=ON to support Ascend NPU.
-    Return:
-        bool, ``True`` if NPU is supported, otherwise ``False``.
-    Examples:
-        .. code-block:: python
-            import paddle
-            support_npu = paddle.device.is_compiled_with_npu()
-    """
-    return core.is_compiled_with_npu()
 def is_compiled_with_custom_device(device_type):
     """
     Whether paddle was built with Paddle_CUSTOM_DEVICE .
@@ -210,15 +191,6 @@ def _convert_to_place(device):
         selected_xpus = os.getenv("FLAGS_selected_xpus", "0").split(",")
         device_id = int(selected_xpus[0])
         place = core.XPUPlace(device_id)
-    elif lower_device == 'npu':
-        if not core.is_compiled_with_npu():
-            raise ValueError(
-                "The device should not be 'npu', "
-                "since PaddlePaddle is not compiled with NPU"
-            )
-        selected_npus = os.getenv("FLAGS_selected_npus", "0").split(",")
-        device_id = int(selected_npus[0])
-        place = core.NPUPlace(device_id)
     elif lower_device == 'ipu':
         if not core.is_compiled_with_ipu():
             raise ValueError(
@@ -229,7 +201,6 @@ def _convert_to_place(device):
     else:
         avaliable_gpu_device = re.match(r'gpu:\d+', lower_device)
         avaliable_xpu_device = re.match(r'xpu:\d+', lower_device)
-        avaliable_npu_device = re.match(r'npu:\d+', lower_device)
         if avaliable_gpu_device:
             if not core.is_compiled_with_cuda():
                 raise ValueError(
@@ -250,31 +221,7 @@ def _convert_to_place(device):
             device_id = device_info_list[1]
             device_id = int(device_id)
             place = core.XPUPlace(device_id)
-        if avaliable_npu_device:
-            if not core.is_compiled_with_npu():
-                device_info_list = device.split(':', 1)
-                device_type = device_info_list[0]
-                if device_type in core.get_all_custom_device_type():
-                    device_id = device_info_list[1]
-                    device_id = int(device_id)
-                    place = core.CustomPlace(device_type, device_id)
-                    return place
-                else:
-                    raise ValueError(
-                        "The device should not be {}, since PaddlePaddle is "
-                        "not compiled with NPU or compiled with custom device".format(
-                            avaliable_npu_device
-                        )
-                    )
-            device_info_list = device.split(':', 1)
-            device_id = device_info_list[1]
-            device_id = int(device_id)
-            place = core.NPUPlace(device_id)
-        if (
-            not avaliable_gpu_device
-            and not avaliable_xpu_device
-            and not avaliable_npu_device
-        ):
+        if not avaliable_gpu_device and not avaliable_xpu_device:
             device_info_list = device.split(':', 1)
             device_type = device_info_list[0]
             if device_type in core.get_all_custom_device_type():
@@ -346,9 +293,6 @@ def get_device():
     elif isinstance(place, core.XPUPlace):
         device_id = place.get_device_id()
         device = 'xpu:' + str(device_id)
-    elif isinstance(place, core.NPUPlace):
-        device_id = place.get_device_id()
-        device = 'npu:' + str(device_id)
     elif isinstance(place, core.IPUPlace):
         num_devices = core.get_ipu_device_count()
         device = f"ipus:{{0-{num_devices - 1}}}"
@@ -469,7 +413,7 @@ class Event:
     Parameters:
         device(str|paddle.CUDAPlace(n)|paddle.CustomPlace(n)): Which device the stream runn on. If device is None, the device is the current device. Default: None.
             It can be ``gpu``, ``gpu:x``,``custom_device``, ``custom_device:x``, where ``custom_device`` is the name of CustomDevicec,
-            where ``x`` is the index of the GPUs, XPUs or NPUs. And it can be paddle.CUDAPlace(n) or paddle.CustomPlace(n).
+            where ``x`` is the index of the GPUs, XPUs. And it can be paddle.CUDAPlace(n) or paddle.CustomPlace(n).
         enable_timing (bool, optional): indicates if the event should measure time, default is False
         blocking (bool, optional): if True, ``wait`` will be blocking, default is False
         interprocess (bool): if True, the event can be shared between processes, default is False
@@ -614,7 +558,7 @@ class Stream:
     Parameters:
         device(str|paddle.CUDAPlace(n)|paddle.CustomPlace(n)): Which device the stream runn on. If device is None, the device is the current device. Default: None.
             It can be ``gpu``, ``gpu:x``,``custom_device``, ``custom_device:x``, where ``custom_device`` is the name of CustomDevicec,
-            where ``x`` is the index of the GPUs, XPUs or NPUs. And it can be paddle.CUDAPlace(n) or paddle.CustomPlace(n).
+            where ``x`` is the index of the GPUs, XPUs. And it can be paddle.CUDAPlace(n) or paddle.CustomPlace(n).
         priority(int, optional): priority of the CUDA stream. Can be either
             1 (high priority) or 2 (low priority). By default, streams have
             priority 2.
@@ -936,7 +880,7 @@ def synchronize(device=None):
     Parameters:
         device(str|paddle.CUDAPlace(n)|paddle.XPUPlace(n)|paddle.CustomPlace(n)): The device which want to wait for. If device is None, the device is the current device. Default: None.
             It can be ``gpu``, ``gpu:x``, ``xpu``, ``xpu:x``, ``custom_device``, ``custom_device:x``, where ``custom_device`` is the name of CustomDevicec,
-            where ``x`` is the index of the GPUs, XPUs or NPUs. And it can be paddle.CUDAPlace(n) or paddle.XPUPlace(n) or paddle.CustomPlace(n).
+            where ``x`` is the index of the GPUs, XPUs. And it can be paddle.CUDAPlace(n) or paddle.XPUPlace(n) or paddle.CustomPlace(n).
     Examples:
         .. code-block:: python
             # required: custom_device
......
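`_convert_to_place` no longer recognizes `'npu'`/`'npu:x'` as built-in device strings; such strings now resolve only through the custom-device branch. A hedged sketch of device selection after the change (the plugin name 'npu' is an assumption; the CPU fallback keeps it runnable on any build):

```python
import paddle

# 'npu:0' is handled by the custom-device path now, so check the registry first.
if 'npu' in paddle.device.get_all_custom_device_type():
    paddle.set_device('npu:0')   # resolves to paddle.CustomPlace('npu', 0)
else:
    paddle.set_device('cpu')
print(paddle.get_device())
```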
@@ -288,11 +288,6 @@ def new_group(ranks=None, backend=None, timeout=_default_timeout):
             core.NCCLParallelContext(strategy, place).init_with_ring_id(
                 ring_id
             )
-        elif core.is_compiled_with_npu():
-            place = core.NPUPlace(genv.device_id)
-            core.HCCLParallelContext(strategy, place).init_with_ring_id(
-                ring_id
-            )
         elif core.is_compiled_with_xpu():
             place = core.XPUPlace(genv.device_id)
             core.BKCLParallelContext(strategy, place).init_with_ring_id(
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
from paddle.distributed.fleet.launch_utils import (
DeviceMode,
get_cluster,
get_host_name_ip,
)
__all__ = []
def _get_ascend_rankfile(rank_table_file_path):
"""
Args:
rank_table_file_path: ascend npu rank file json
{
"status": "completed",
"version": "1.0",
"server_count": "2",
"server_list": [
{
"server_id": "192.168.24.217",
"device": [
{
"device_id": "0",
"device_ip": "192.1.184.23",
"rank_id": "0"
},
{
"device_id": "1",
"device_ip": "192.2.21.93",
"rank_id": "1"
}
]
},
{
"server_id": "192.168.26.177",
"device": [
{
"device_id": "0",
"device_ip": "192.1.94.132",
"rank_id": "2"
},
{
"device_id": "1",
"device_ip": "192.2.94.30",
"rank_id": "3"
}
]
}
]
}
Returns:
node_ips: node ip list
device_count: number of npu per machine
"""
json_data = None
with open(rank_table_file_path) as json_file:
json_data = json.load(json_file)
node_ips = []
device_count = 0
server_list = json_data['server_list']
for server in server_list:
device_list = server['device']
device_count = len(device_list)
if os.getenv("FLAGS_MODELARTS", None):
nodes = os.getenv("DLS_TASK_NUMBER", None)
assert nodes is not None, "DLS_TASK_NUMBER didn't set!"
for node in range(int(nodes)):
node_ip = os.getenv(f"VC_CUSTOM{node}_HOSTS", None)
assert node_ip is not None, f"VC_CUSTOM{node}_HOSTS didn't set!"
node_ips.append(node_ip)
return node_ips, device_count
node_ips.append(server['server_id'])
return node_ips, device_count
def get_cloud_cluster(
rank_table_file=None, device_mode=DeviceMode.ASCEND_NPU, start_port=6070
):
"""
Args:
rank_table_file: string, ascend npu rank file path
device_mode: DeviceMode(Int)
start_port: the start port of current runtime env
"""
if rank_table_file:
# multi trainers
node_ips, device_count = _get_ascend_rankfile(rank_table_file)
if len(node_ips) == 1:
node_ip = node_ips[0]
else:
node_index = os.environ.get("PADDLE_TRAINER_ID")
node_ip = None
if node_index:
node_ip = node_ips[int(node_index)]
else:
_, node_ip = get_host_name_ip()
assert (
node_ip in node_ips
), "Can't find your local ip {{{}}} in node_ips: {{{}}}".format(
node_ip,
node_ips,
)
else:
# single trainer (single ascend card)
node_ips = ["127.0.0.1"]
node_ip = node_ips[0]
device_count = 1
devices_per_proc = [str(x) for x in range(device_count)]
free_ports = list(range(start_port, start_port + len(devices_per_proc)))
trainer_endpoints = []
for ip in node_ips:
trainer_endpoints.append(["%s:%d" % (ip, port) for port in free_ports])
return get_cluster(
node_ips, node_ip, trainer_endpoints, device_mode, devices_per_proc
)
@@ -64,7 +64,7 @@ import time
 from argparse import REMAINDER, ArgumentParser
 from paddle import framework
-from paddle.distributed.fleet import ascend_utils, cloud_utils, launch_utils
+from paddle.distributed.fleet import cloud_utils, launch_utils
 from paddle.distributed.fleet.elastic import enable_elastic, launch_elastic
 from paddle.distributed.fleet.launch_utils import (
     DeviceMode,
@@ -155,16 +155,6 @@ see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/tra
     )
     base_group.add_argument("--selected_xpus", dest="xpus")
-    if framework.core.is_compiled_with_npu():
-        base_group.add_argument(
-            "--npus",
-            type=str,
-            default=None,
-            help="It's for xpu training. For example: "
-            "--npus=\"0,1,2,3\" will launch four training processes each bound to one npu.",
-        )
-        base_group.add_argument("--selected_npus", dest="npus")
     base_group.add_argument(
         "training_script",
         type=str,
@@ -407,13 +397,6 @@ def get_cluster_info(args):
             args.ips, device_mode, devices_per_proc, start_port
         )
         logger.debug(f"get cluster from cloud:{cluster}")
-    elif device_mode == DeviceMode.ASCEND_NPU:
-        # for ascend
-        cluster, pod = ascend_utils.get_cloud_cluster(
-            rank_table_file=os.getenv("RANK_TABLE_FILE", None),
-            device_mode=device_mode,
-            start_port=start_port,
-        )
     else:
         # trainers_num = 1 or not use paddlecloud ips="a,b"
         cluster, pod = get_cluster_from_args(
@@ -493,8 +476,6 @@ def infer_backend(args):
         return
     if framework.core.is_compiled_with_cuda():
         args.backend = 'nccl'
-    elif framework.core.is_compiled_with_npu():
-        args.backend = 'unknown'
     elif framework.core.is_compiled_with_xpu():
         args.backend = 'bkcl'
     else:
@@ -545,8 +526,6 @@ def which_distributed_mode(args):
     if framework.core.is_compiled_with_cuda():
         accelerators = framework.core.get_cuda_device_count()
-    elif framework.core.is_compiled_with_npu():
-        accelerators = framework.core.get_npu_device_count()
     elif framework.core.is_compiled_with_xpu():
         accelerators = framework.core.get_xpu_device_count()
     else:
@@ -578,7 +557,7 @@ def which_distributed_mode(args):
     ):
         if args.servers:
             logger.warning(
-                "Not found distinct arguments and not compiled with cuda or xpu or npu. "
+                "Not found distinct arguments and not compiled with cuda or xpu. "
                 "But found args.servers not empty, default use ps mode"
             )
             return DistributeMode.PS
@@ -586,7 +565,7 @@ def which_distributed_mode(args):
         return DistributeMode.COLLECTIVE
     else:
         logger.warning(
-            "Not found distinct arguments and compiled with cuda or xpu or npu. "
+            "Not found distinct arguments and compiled with cuda or xpu. "
            "Default use collective mode"
        )
        return DistributeMode.COLLECTIVE
......
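After this change the launcher can only infer the `nccl`, `bkcl` or `gloo` backends; `hccl` is neither inferred nor accepted. A hedged sketch mirroring the simplified `infer_backend` branch order:

```python
from paddle import framework

# Mirrors the surviving branch order above: CUDA -> nccl, XPU -> bkcl, else gloo.
if framework.core.is_compiled_with_cuda():
    backend = 'nccl'
elif framework.core.is_compiled_with_xpu():
    backend = 'bkcl'
else:
    backend = 'gloo'
print(backend)
```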
@@ -55,7 +55,6 @@ class DeviceMode:
     GPU = 1
     KUNLUN = 2
     XPU = 2
-    ASCEND_NPU = 3
     UNKNOWN = 3
@@ -299,10 +298,7 @@ def get_cluster(
     ), "current trainer_endpoints size should be greater equal than acclerators size."
     for i in range(len(devices_per_proc)):
         trainer = Trainer()
-        if (
-            device_mode == DeviceMode.GPU
-            or device_mode == DeviceMode.ASCEND_NPU
-        ):
+        if device_mode == DeviceMode.GPU:
             if isinstance(devices_per_proc[i], (list, tuple)):
                 trainer.accelerators.extend(devices_per_proc[i])
                 pod.accelerators.extend(devices_per_proc[i])
@@ -546,13 +542,6 @@ def start_local_trainers(
                 [str(g) for g in t.accelerators]
             )
-        elif (
-            len(t.accelerators) > 0 and pod.device_mode == DeviceMode.ASCEND_NPU
-        ):
-            proc_env["FLAGS_selected_npus"] = "%s" % ",".join(
-                [str(g) for g in t.accelerators]
-            )
         if len(t.accelerators) > 0:
             proc_env["FLAGS_selected_accelerators"] = "%s" % ",".join(
                 [str(g) for g in t.accelerators]
@@ -760,40 +749,6 @@ def get_xpus(xpus):
     return res_xpus
-def get_npus(npus):
-    if npus is None:
-        npus_num = framework.core.get_npu_device_count()
-        res_npus = [str(x) for x in range(0, npus_num)]
-    else:
-        npu_visible_devices = os.getenv("ASCEND_VISIBLE_DEVICES")
-        if npu_visible_devices is None or npu_visible_devices == "":
-            res_npus = [x.strip() for x in npus.split(',')]
-        else:
-            # change npus into relative values
-            # e.g. ASCEND_VISIBLE_DEVICES=4,5,6,7; args.npus=4,5,6,7;
-            # therefore npus=0,1,2,3
-            npu_visible_devices_list = npu_visible_devices.split(',')
-            for x in npus.split(','):
-                assert x in npu_visible_devices_list, (
-                    "Can't find "
-                    "your npus %s in ASCEND_VISIBLE_DEVICES[%s]."
-                    % (x, npu_visible_devices)
-                )
-            res_npus = [
-                npu_visible_devices_list.index(x.strip())
-                for x in npus.split(',')
-            ]
-            logger.info(
-                "Change selected_npus into reletive values. --ips:{} "
-                "will change into relative_ips:{} according to your "
-                "ASCEND_VISIBLE_DEVICES:{}".format(
-                    npus, res_npus, npu_visible_devices_list
-                )
-            )
-    return res_npus
 def get_device_mode(backend):
     if backend == 'heter':
         if (
@@ -808,16 +763,6 @@ def get_device_mode(backend):
     ):
         print("launch train in heter mode with XPU device.")
         return DeviceMode.XPU
-    if (
-        framework.core.is_compiled_with_npu()
-        and framework.core.get_npu_device_count() > 0
-    ):
-        print("launch train in heter mode with NPU device.")
-        return DeviceMode.ASCEND_NPU
-    if backend == 'hccl' and framework.core.get_npu_device_count() > 0:
-        print("launch train in ascend npu mode!")
-        return DeviceMode.ASCEND_NPU
     if backend == 'nccl' and framework.core.get_cuda_device_count() > 0:
         print("launch train in GPU mode!")
@@ -853,19 +798,6 @@ def get_device_proc_info(args):
             devices_per_proc = [gpus[i : i + n] for i in range(0, len(gpus), n)]
         else:
             devices_per_proc = gpus
-    elif device_mode == DeviceMode.ASCEND_NPU:
-        npus = get_npus(args.npus)
-        if args.nproc_per_node is not None:
-            assert (
-                len(npus) % int(args.nproc_per_node)
-            ) == 0, "npus' number:{} mod args.nproc_per_node:{} must == 0".format(
-                len(npus), args.nproc_per_node
-            )
-            n = int(len(npus) / int(args.nproc_per_node))
-            devices_per_proc = [npus[i : i + n] for i in range(0, len(npus), n)]
-        else:
-            devices_per_proc = npus
     elif device_mode == DeviceMode.XPU:
         xpus = get_xpus(args.xpus)
         if args.nproc_per_node is not None:
@@ -2079,12 +2011,6 @@ def check_backend(backend):
             "your paddle is not compiled with xpu but you assign 'bkcl' as backend."
         )
-    if backend == 'hccl' and not framework.core.is_compiled_with_npu():
-        raise ValueError(
-            "paddle.distributed initialize error, "
-            "your paddle is not compiled with npu but you assign 'hccl' as backend."
-        )
 def block_windows_and_macos(backend):
     if backend != 'gloo':
@@ -2106,7 +2032,4 @@ def get_backend_by_compile_flag():
     if framework.core.is_compiled_with_xpu():
         return 'bkcl'
-    if framework.core.is_compiled_with_npu():
-        return 'hccl'
     return 'gloo'
...@@ -536,7 +536,9 @@ def _parallel_linear( ...@@ -536,7 +536,9 @@ def _parallel_linear(
# NOTE: npu linear function use matmul_v2 but linear use matmul # NOTE: npu linear function use matmul_v2 but linear use matmul
linear_function = ( linear_function = (
_linear if core.is_compiled_with_npu() else paddle.nn.functional.linear _linear
if core.is_compiled_with_custom_device('npu')
else paddle.nn.functional.linear
) )
linear_out = linear_function( linear_out = linear_function(
x, x,
......
@@ -196,7 +196,7 @@ class CollectiveHelper:
                     OP_ROLE_KEY: OpRole.Forward,
                 },
             )
-        elif core.is_compiled_with_npu():
+        elif core.is_compiled_with_custom_device('npu'):
             block.append_op(
                 type='c_gen_hccl_id',
                 inputs={},
......
@@ -26,23 +26,17 @@ class PlaceType:
     CUDA = 1
     CUDA_PINNED = 2
     XPU = 3  # unsupport for now
-    NPU = 4
-    NPU_PINNED = 5
     @staticmethod
     def default_device():
         if core.is_compiled_with_cuda():
             return PlaceType.CUDA
-        elif core.is_compiled_with_npu():
-            return PlaceType.NPU
         return PlaceType.CPU
     @staticmethod
     def default_pinned():
         if core.is_compiled_with_cuda():
             return PlaceType.CUDA_PINNED
-        elif core.is_compiled_with_npu():
-            return PlaceType.NPU_PINNED
         return PlaceType.CPU
......
@@ -596,7 +596,7 @@ class ShardingOptimizer(MetaOptimizerBase):
             rings = [self.mp_ring_id, self.pp_ring_id]
             # FIXME(wangxi): some problem with NPU found_finite, need sync with DP
-            if core.is_compiled_with_npu():
+            if core.is_compiled_with_custom_device('npu'):
                 rings += [self.dp_ring_id]
             FP16Utils.sync_amp_check_nan_inf(main_block, rings)
@@ -721,7 +721,7 @@ class ShardingOptimizer(MetaOptimizerBase):
         self._dump_program_for_debug()
         # GPU need to wait server ready, GPU and NPU is Layered connection
-        if not core.is_compiled_with_npu():
+        if not core.is_compiled_with_custom_device('npu'):
             self._wait()
         return optimize_ops, params_grads
@@ -839,7 +839,7 @@ class ShardingOptimizer(MetaOptimizerBase):
             sync=False,
         )
-        if core.is_compiled_with_npu():
+        if core.is_compiled_with_custom_device('npu'):
             self._init_npu_pipeline_comm(startup_block)
             return
......
@@ -78,7 +78,7 @@ class InternalStorage:
         if self._device != device:
             tmp_buffer = (
                 cvt_to_device(self.buffer, self.dev_id)
-                if device in ["gpu", "xpu", "npu"]
+                if device in ["gpu", "xpu"]
                 else self.buffer.cpu()
             )
             for param in self._params:
......
@@ -200,8 +200,9 @@ def device_guard(dev_id=0, device="cpu"):
     origin_device = paddle.device.get_device()
     if device == "cpu":
         paddle.set_device(device)
-    elif device in ["gpu", "xpu", "npu"]:
+    elif device in ["gpu", "xpu"]:
         paddle.set_device(f"{device}:{dev_id}")
     try:
         yield
     finally:
@@ -313,8 +314,6 @@ def cvt_to_device(x, dev_id, blocking=True):
     """
     if paddle.is_compiled_with_cuda():
         place = paddle.CUDAPlace(dev_id)
-    elif paddle.is_compiled_with_npu():
-        place = paddle.NPUPlace(dev_id)
     elif paddle.is_compiled_with_xpu():
         place = paddle.XPUPlace(dev_id)
     else:
......
@@ -201,11 +201,9 @@ class HybridParallelInferenceHelper:
         assert isinstance(main_program, Program)
         self._device = None
-        if core.is_compiled_with_npu():
-            self._device = "npu"
-        elif core.is_compiled_with_cuda():
+        if core.is_compiled_with_cuda():
             self._device = "gpu"
-        assert self._device, "Only gpu and npu are supported."
+        assert self._device, "Only gpu are supported."
         assert not in_dygraph_mode(), "Only static graph mode is supported."
......
@@ -24,7 +24,6 @@ class DeviceType:
     CPU = 'cpu'
     GPU = 'gpu'
     XPU = 'xpu'
-    NPU = 'npu'
     IPU = 'ipu'
     CUSTOM_DEVICE = 'custom_device'
@@ -68,8 +67,6 @@ class Device:
             return 'FLAGS_selected_cpus'
         if self._dtype == DeviceType.GPU:
            return 'FLAGS_selected_gpus'
-        if self._dtype == DeviceType.NPU:
-            return 'FLAGS_selected_npus'
         if self._dtype == DeviceType.XPU:
             return 'FLAGS_selected_xpus'
         if self._dtype == DeviceType.IPU:
@@ -111,9 +108,6 @@ class Device:
         elif 'XPU_VISIBLE_DEVICES' in os.environ:
             dev._dtype = DeviceType.XPU
             visible_devices = os.getenv("XPU_VISIBLE_DEVICES")
-        elif 'ASCEND_VISIBLE_DEVICES' in os.environ:
-            dev._dtype = DeviceType.NPU
-            visible_devices = os.getenv("ASCEND_VISIBLE_DEVICES")
         if visible_devices is not None and visible_devices != 'all':
             dev._labels = visible_devices.split(',')
@@ -152,10 +146,6 @@ class Device:
             dev._dtype = DeviceType.XPU
             num = core.get_xpu_device_count()
             visible_devices = os.getenv("XPU_VISIBLE_DEVICES")
-        elif core.is_compiled_with_npu():
-            dev._dtype = DeviceType.NPU
-            num = core.get_npu_device_count()
-            visible_devices = os.getenv("ASCEND_VISIBLE_DEVICES")
         elif core.is_compiled_with_ipu():
             dev._dtype = DeviceType.IPU
             num = core.get_ipu_device_count()
......
@@ -721,9 +721,6 @@ class ParallelEnv:
         elif core.is_compiled_with_xpu():
             selected_xpus = os.getenv("FLAGS_selected_xpus", "0").split(",")
             self._device_id = int(selected_xpus[0])
-        elif core.is_compiled_with_npu():
-            selected_npus = os.getenv("FLAGS_selected_npus", "0").split(",")
-            self._device_id = int(selected_npus[0])
         self._trainer_endpoints = os.getenv(
             "PADDLE_TRAINER_ENDPOINTS", ""
@@ -889,12 +886,8 @@ def _start_kv_server(port, http_server_d, size):
 def _is_cpuonly(backend):
     check_backend(backend)
     if (
-        backend in ['auto', 'nccl', 'bkcl', 'hccl', 'heter', 'cncl']
-        and (
-            core.is_compiled_with_cuda()
-            or core.is_compiled_with_xpu()
-            or core.is_compiled_with_npu()
-        )
+        backend in ['auto', 'nccl', 'bkcl', 'heter', 'cncl']
+        and (core.is_compiled_with_cuda() or core.is_compiled_with_xpu())
     ) or backend == 'xccl':
         # passes 'auto' and can use cuda or xpu, use the default logics. so return False
@@ -994,7 +987,6 @@ def init_parallel_env():
         is_cpu_only
         or core.is_compiled_with_cuda()
         or core.is_compiled_with_xpu()
-        or core.is_compiled_with_npu()
         or backend == "xccl"
     ):
         raise NotImplementedError(
@@ -1013,9 +1005,6 @@ def init_parallel_env():
     elif not is_cpu_only and core.is_compiled_with_xpu():
         _check_var_exists('FLAGS_selected_xpus')
         backend = "bkcl" if backend == "auto" else backend
-    elif not is_cpu_only and core.is_compiled_with_npu():
-        _check_var_exists('FLAGS_selected_npus')
-        backend = "hccl" if backend == "auto" else backend
     _check_var_exists("PADDLE_TRAINER_ID")
     _check_var_exists("PADDLE_CURRENT_ENDPOINT")
@@ -1038,9 +1027,6 @@ def init_parallel_env():
             place = core.CUDAPlace(parallel_env.device_id)
         elif core.is_compiled_with_xpu():
             place = core.XPUPlace(parallel_env.device_id)
-        elif core.is_compiled_with_npu():
-            place = core.NPUPlace(parallel_env.device_id)
         _set_expected_place(place)
         group = None
@@ -1136,7 +1122,7 @@ def init_parallel_env():
         strategy.current_endpoint = parallel_env.current_endpoint
         strategy.nrings = parallel_env.nrings
-        # init nccl or hccl or bkcl or heter context
+        # init nccl or bkcl or heter context
         if is_cpu_only:
             parallel_helper._set_parallel_ctx(
                 core.GLOOParallelContext(strategy, place)
@@ -1153,10 +1139,7 @@ def init_parallel_env():
             parallel_helper._set_parallel_ctx(
                 core.BKCLParallelContext(strategy, place)
             )
-        elif core.is_compiled_with_npu():
-            parallel_helper._set_parallel_ctx(
-                core.HCCLParallelContext(strategy, place)
-            )
         if backend != "heter":
             other_endpoints = strategy.trainer_endpoints[:]
             other_endpoints.remove(strategy.current_endpoint)
......
@@ -133,37 +133,8 @@ class Collective:
         wait_server_ready(other_endpoints)
         block = program.global_block()
-        if core.is_compiled_with_npu():
-            hccl_id_var = block.create_var(
-                name=unique_name.generate('hccl_id'),
-                persistable=True,
-                type=core.VarDesc.VarType.RAW,
-            )
-            endpoint_to_index_map = {e: idx for idx, e in enumerate(endpoints)}
-            block.append_op(
-                type='c_gen_hccl_id',
-                inputs={},
-                outputs={'Out': hccl_id_var},
-                attrs={
-                    'rank': rank,
-                    'endpoint': current_endpoint,
-                    'other_endpoints': other_endpoints,
-                    self.op_role_key: OpRole.Forward,
-                },
-            )
-            block.append_op(
-                type='c_comm_init_hccl',
-                inputs={'X': hccl_id_var},
-                outputs={},
-                attrs={
-                    'rank': rank,
-                    'ring_id': ring_id,
-                    'device_id': int(os.getenv("FLAGS_selected_npus")),
-                    'rank_ids': nranks,
-                    self.op_role_key: OpRole.Forward,
-                },
-            )
-        elif core.is_compiled_with_xpu():
+        if core.is_compiled_with_xpu():
             bkcl_id_var = block.create_var(
                 name=unique_name.generate('bkcl_id'),
                 persistable=True,
......
@@ -131,37 +131,7 @@ class Collective:
         wait_server_ready(other_endpoints)
         block = program.global_block()
-        if core.is_compiled_with_npu():
-            hccl_id_var = block.create_var(
-                name=unique_name.generate('hccl_id'),
-                persistable=True,
-                type=core.VarDesc.VarType.RAW,
-            )
-            endpoint_to_index_map = {e: idx for idx, e in enumerate(endpoints)}
-            block.append_op(
-                type='c_gen_hccl_id',
-                inputs={},
-                outputs={'Out': hccl_id_var},
-                attrs={
-                    'rank': rank,
-                    'endpoint': current_endpoint,
-                    'other_endpoints': other_endpoints,
-                    self.op_role_key: OpRole.Forward,
-                },
-            )
-            block.append_op(
-                type='c_comm_init_hccl',
-                inputs={'X': hccl_id_var},
-                outputs={},
-                attrs={
-                    'rank': rank,
-                    'ring_id': ring_id,
-                    'device_id': int(os.getenv("FLAGS_selected_npus")),
-                    'rank_ids': nranks,
-                    self.op_role_key: OpRole.Forward,
-                },
-            )
-        elif core.is_compiled_with_cuda():
+        if core.is_compiled_with_cuda():
             nccl_id_var = block.create_var(
                 name=unique_name.generate('nccl_id'),
                 persistable=True,
......
@@ -71,7 +71,6 @@ from .core import (
     XPUPlace,
     CUDAPlace,
     CUDAPinnedPlace,
-    NPUPlace,
     IPUPlace,
     MLUPlace,
     CustomPlace,
@@ -127,7 +126,6 @@ __all__ = (
     'XPUPlace',
     'CUDAPlace',
     'CUDAPinnedPlace',
-    'NPUPlace',
     'IPUPlace',
     'MLUPlace',
     'Tensor',
@@ -220,10 +218,6 @@ monkey_patch_variable()
 __bootstrap__()
 monkey_patch_varbase()
-# NOTE(zhiqiu): register npu_finalize on the exit of Python,
-# do some clean up manually.
-if core.is_compiled_with_npu():
-    atexit.register(core.npu_finalize)
 # NOTE(Aurelius84): clean up ExecutorCacheInfo in advance manually.
 atexit.register(core.clear_executor_cache)
......
@@ -654,8 +654,6 @@ class Section(DeviceWorker):
         place_id = pipeline_opt["place_id"]
         if core.is_compiled_with_cuda():
             assert isinstance(place, core.CUDAPlace)
-        elif core.is_compiled_with_npu():
-            assert isinstance(place, core.NPUPlace)
         cfg.place = cfg.CUDAPlace
         cfg.place_id = place_id
......
@@ -306,7 +306,7 @@ def monkey_patch_varbase():
             if _grad_scalar:
                 # When using amp with Fleet DistributedStrategy, we do loss scaling implicitly.
                 self = _grad_scalar.scale(self)
-            if paddle.is_compiled_with_xpu() or paddle.is_compiled_with_npu():
+            if paddle.is_compiled_with_xpu():
                 # TODO(liuyuhui): Currently only for xpu. Will be removed in the future.
                 scaled_loss = scale_loss(self)
                 if framework.global_var._in_eager_mode_:
......
@@ -2107,7 +2107,7 @@ class Executor:
             for var in program.global_block().vars.values():
                 if var.is_data:
                     data_vars.append(var)
-            if core.is_compiled_with_npu():
+            if core.is_compiled_with_custom_device('npu'):
                 dataset = paddle.fluid.DatasetFactory().create_dataset(
                     'InMemoryDataset'
                 )
@@ -2284,7 +2284,7 @@ class Executor:
             for var in program.global_block().vars.values():
                 if var.is_data:
                     data_vars.append(var)
-            if core.is_compiled_with_npu():
+            if core.is_compiled_with_custom_device('npu'):
                 dataset = paddle.fluid.DatasetFactory().create_dataset(
                     'InMemoryDataset'
                 )
......
@@ -58,7 +58,6 @@ __all__ = [
     'is_compiled_with_cuda',
     'is_compiled_with_rocm',
     'is_compiled_with_xpu',
-    'is_compiled_with_npu',
     'Variable',
     'require_version',
     'device_guard',
@@ -224,7 +223,7 @@ def _in_eager_without_dygraph_check():
     return global_var._in_eager_mode_
-# FIXME(dev): We haven't fully verified eager mode on XPU/NPU et.al but
+# FIXME(dev): We haven't fully verified eager mode on XPU et.al but
 # only GPU/CPU. Remove this after we improve this feature.
 _is_first_import_ = True
@@ -715,15 +714,6 @@ def _xpu_ids():
     return device_ids
-def _npu_ids():
-    npus_env = os.getenv("FLAGS_selected_npus")
-    if npus_env:
-        device_ids = [int(s) for s in npus_env.split(",")]
-    else:
-        device_ids = range(core.get_npu_device_count())
-    return device_ids
 def _custom_device_ids(device_type):
     custom_devices_env = os.getenv("FLAGS_selected_" + device_type + "s")
     if custom_devices_env:
@@ -748,21 +738,6 @@ def is_compiled_with_xpu():
     return core.is_compiled_with_xpu()
-def is_compiled_with_npu():
-    """
-    Whether this whl package can be used to run the model on NPU.
-    Returns (bool): support npu or not.
-    Examples:
-        .. code-block:: python
-            import paddle.fluid as fluid
-            support_npu = fluid.is_compiled_with_npu()
-    """
-    return core.is_compiled_with_npu()
 def disable_signal_handler():
     """
     Reset signal handler registered by Paddle.
@@ -921,47 +896,6 @@ def xpu_places(device_ids=None):
     return [core.XPUPlace(dev_id) for dev_id in device_ids]
-def npu_places(device_ids=None):
-    """
-    Note:
-        For multi-card tasks, please use `FLAGS_selected_npus` environment variable to set the visible NPU device.
-    This function creates a list of :code:`paddle.NPUPlace` objects.
-    If :code:`device_ids` is None, environment variable of
-    :code:`FLAGS_selected_npus` would be checked first. For example, if
-    :code:`FLAGS_selected_npus=0,1,2`, the returned list would
-    be [paddle.NPUPlace(0), paddle.NPUPlace(1), paddle.NPUPlace(2)].
-    If :code:`FLAGS_selected_npus` is not set, all visible
-    npu places would be returned.
-    If :code:`device_ids` is not None, it should be the device
-    ids of NPUs. For example, if :code:`device_ids=[0,1,2]`,
-    the returned list would be
-    [paddle.NPUPlace(0), paddle.NPUPlace(1), paddle.NPUPlace(2)].
-    Parameters:
-        device_ids (list or tuple of int, optional): list of NPU device ids.
-    Returns:
-        list of paddle.NPUPlace: Created NPU place list.
-    Examples:
-        .. code-block:: python
-            # required: npu
-            import paddle
-            import paddle.static as static
-            paddle.enable_static()
-            npu_places = static.npu_places()
-    """
-    assert core.is_compiled_with_npu(), "Not compiled with NPU"
-    if device_ids is None:
-        device_ids = _npu_ids()
-    elif not isinstance(device_ids, (list, tuple)):
-        device_ids = [device_ids]
-    return [core.NPUPlace(dev_id) for dev_id in device_ids]
 def cpu_places(device_count=None):
     """
     This function creates a list of :code:`paddle.CPUPlace` objects, and returns the created list.
@@ -2587,10 +2521,6 @@ class Variable(metaclass=VariableMetaClass):
             p = core.Place()
             p.set_place(t._place())
             place = core.XPUPlace(p.xpu_device_id())
-        elif p.is_npu_place():
-            p = core.Place()
-            p.set_place(t._place())
-            place = core.NPUPlace(p.npu_device_id())
         else:
             p = core.Place()
             p.set_place(t._place())
@@ -7520,9 +7450,9 @@ def device_guard(device=None):
         device, index = device.split(':')
         if device == 'cpu':
             raise ValueError("Should not set device id for cpu.")
-    if device not in ['cpu', 'gpu', 'npu', 'xpu', '', None]:
+    if device not in ['cpu', 'gpu', 'xpu', '', None]:
         raise ValueError(
-            "The Attr(device) should be 'cpu' 'npu' 'xpu' or 'gpu', and it can also be empty string or None "
+            "The Attr(device) should be 'cpu' 'npu' or 'gpu', and it can also be empty string or None "
             "when there is no need to specify device. But received %s" % device
         )
     if index:
@@ -7651,7 +7581,6 @@ def _get_paddle_place(place):
             core.CPUPlace,
             core.CUDAPinnedPlace,
             core.CUDAPlace,
-            core.NPUPlace,
             core.IPUPlace,
             core.CustomPlace,
         ),
@@ -7701,19 +7630,6 @@ def _get_paddle_place(place):
         device_id = int(device_id)
         return core.XPUPlace(device_id)
-    # NPU
-    avaliable_npu_place = re.match(r'npu:\d+', place)
-    if avaliable_npu_place:
-        if not core.is_compiled_with_npu():
-            raise ValueError(
-                "The device should not be {}, since PaddlePaddle is "
-                "not compiled with NPU".format(avaliable_npu_place.group())
-            )
-        place_info_list = place.split(':', 1)
-        device_id = place_info_list[1]
-        device_id = int(device_id)
-        return core.NPUPlace(device_id)
     # IPU
     avaliable_ipu_place = re.match(r'ipu:\d+', place)
     if avaliable_ipu_place:
@@ -7728,9 +7644,7 @@ def _get_paddle_place(place):
         return core.IPUPlace(device_id)
     raise ValueError(
-        "Paddle supports CPUPlace, CUDAPlace,CUDAPinnedPlace, XPUPlace, IPUPlace and NPUPlace, but received {}.".format(
-            place
-        )
+        f"Paddle supports CPUPlace, CUDAPlace, CUDAPinnedPlace, XPUPlace and IPUPlace, but received {place}."
     )
......
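`_get_paddle_place` no longer parses `'npu:x'` strings, so code that previously passed `'npu:0'` to fluid APIs should construct the place explicitly. A hedged sketch (the plugin name 'npu' is an assumption; the CPU fallback keeps it runnable on any build):

```python
import paddle
from paddle.fluid import core

paddle.enable_static()
# Build the place directly instead of relying on the removed 'npu:0' parsing.
if core.is_compiled_with_custom_device('npu'):
    place = paddle.CustomPlace('npu', 0)
else:
    place = paddle.CPUPlace()
exe = paddle.static.Executor(place)
```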
@@ -4553,7 +4553,7 @@ class PipelineOptimizer:
     def __init__(self, optimizer, num_microbatches=1, start_cpu_core_id=0):
         self._device = 'cpu'
-        if core.is_compiled_with_npu():
+        if core.is_compiled_with_custom_device('npu'):
             self._device = "npu"
         elif core.is_compiled_with_cuda():
             self._device = "gpu"
@@ -5770,7 +5770,7 @@ class PipelineOptimizer:
                 # If there are some not initialized sections in the fused var,
                 # and the value in those sections are nan/inf, it will trigger the nan/inf check.
                 # To avoid these problematic triggers, set constant is needed for npu
-                "set_constant": core.is_compiled_with_npu(),
+                "set_constant": core.is_compiled_with_custom_device('npu'),
                 "constant": float(0.0),
             },
         )
@@ -6387,8 +6387,8 @@ class PipelineOptimizer:
             dev_index = int(dev.split(":")[1])
             if core.is_compiled_with_cuda():
                 place_list.append(core.CUDAPlace(dev_index % 1))
-            elif core.is_compiled_with_npu():
-                place_list.append(core.NPUPlace(dev_index % 1))
+            elif paddle.is_compiled_with_custom_device('npu'):
+                place_list.append(paddle.CustomPlace('npu', dev_index % 1))
         # Step6: Split startup program
         new_startup_program = self._split_startup_program(
@@ -6411,7 +6411,7 @@ class PipelineOptimizer:
         if core.is_compiled_with_cuda():
             place_id = int(os.getenv("FLAGS_selected_gpus", "0"))
-        elif core.is_compiled_with_npu():
+        elif core.is_compiled_with_custom_device('npu'):
             place_id = int(os.getenv("FLAGS_selected_npus", "0"))
         # A pass to move the recv op to the beginning of
         # the forward/backward phase
......
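The replacement pattern used throughout PipelineOptimizer above, pulled out as a standalone sketch (the helper name is illustrative, not an actual method): compile-time NPU checks become custom-device queries, and core.NPUPlace becomes paddle.CustomPlace('npu', ...).

import os

import paddle
from paddle.fluid import core


def pick_pipeline_place():
    # GPU builds keep their behaviour; NPU is now reached through the
    # plug-in custom-device API instead of a dedicated compile flag.
    if core.is_compiled_with_cuda():
        device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
        return core.CUDAPlace(device_id)
    if core.is_compiled_with_custom_device('npu'):
        device_id = int(os.getenv("FLAGS_selected_npus", "0"))
        return paddle.CustomPlace('npu', device_id)
    return core.CPUPlace()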
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
def train(prefix):
selected_accelerators = os.getenv("FLAGS_selected_accelerators")
selected_npus = os.getenv("FLAGS_selected_npus")
trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
worker_endpoints = worker_endpoints_env
trainers_num = len(worker_endpoints.split(','))
device_ids = os.getenv("PADDLE_WORLD_DEVICE_IDS")
current_device_id = os.getenv("PADDLE_LOCAL_DEVICE_IDS")
details = "selected_accelerators:{} selected_npus:{} worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{} device_ids:{} device_id:{}".format(
selected_accelerators,
selected_npus,
worker_endpoints,
trainers_num,
current_endpoint,
trainer_id,
device_ids,
current_device_id,
)
print(details)
with open(f"multi_process_{prefix}.check_{trainer_id}.log", "w") as f:
f.write(details)
if __name__ == '__main__':
prefix = sys.argv[1]
train(prefix)
...@@ -37,9 +37,7 @@ class TestCEmbeddingCPU(OpTest): ...@@ -37,9 +37,7 @@ class TestCEmbeddingCPU(OpTest):
def setUp(self): def setUp(self):
self.init_dtype() self.init_dtype()
self.initcase() self.initcase()
if core.is_compiled_with_npu(): if core.is_compiled_with_xpu():
self.__class__.use_npu = True
elif core.is_compiled_with_xpu():
self.__class__.use_xpu = True self.__class__.use_xpu = True
elif core.is_compiled_with_cuda(): elif core.is_compiled_with_cuda():
self.__class__.exist_fp64_check_grad = True self.__class__.exist_fp64_check_grad = True
...@@ -57,9 +55,7 @@ class TestCEmbeddingCPU(OpTest): ...@@ -57,9 +55,7 @@ class TestCEmbeddingCPU(OpTest):
np_out = get_c_embedding(self.start_index, self.end_index, table, ids) np_out = get_c_embedding(self.start_index, self.end_index, table, ids)
self.outputs = {'Out': np_out.reshape((2, 4, 64))} self.outputs = {'Out': np_out.reshape((2, 4, 64))}
self.attrs = {'start_index': self.start_index} self.attrs = {'start_index': self.start_index}
if core.is_compiled_with_npu(): if core.is_compiled_with_xpu():
self.__class__.use_npu = True
elif core.is_compiled_with_xpu():
self.__class__.use_xpu = True self.__class__.use_xpu = True
def test_check_cpu(self): def test_check_cpu(self):
...@@ -81,16 +77,12 @@ class TestCEmbeddingOpBase(TestCEmbeddingCPU): ...@@ -81,16 +77,12 @@ class TestCEmbeddingOpBase(TestCEmbeddingCPU):
def test_check_output(self): def test_check_output(self):
if core.is_compiled_with_cuda(): if core.is_compiled_with_cuda():
self.check_output_with_place(core.CUDAPlace(0)) self.check_output_with_place(core.CUDAPlace(0))
elif core.is_compiled_with_npu():
self.check_output_with_place(core.NPUPlace(0))
elif core.is_compiled_with_xpu(): elif core.is_compiled_with_xpu():
self.check_output_with_place(core.XPUPlace(0)) self.check_output_with_place(core.XPUPlace(0))
def test_check_grad(self): def test_check_grad(self):
if core.is_compiled_with_cuda(): if core.is_compiled_with_cuda():
self.check_grad_with_place(core.CUDAPlace(0), ['W'], 'Out') self.check_grad_with_place(core.CUDAPlace(0), ['W'], 'Out')
elif core.is_compiled_with_npu():
self.check_grad_with_place(core.NPUPlace(0), ['W'], 'Out')
elif core.is_compiled_with_xpu(): elif core.is_compiled_with_xpu():
self.check_grad_with_place(core.XPUPlace(0), ['W'], 'Out') self.check_grad_with_place(core.XPUPlace(0), ['W'], 'Out')
...@@ -98,9 +90,6 @@ class TestCEmbeddingOpBase(TestCEmbeddingCPU): ...@@ -98,9 +90,6 @@ class TestCEmbeddingOpBase(TestCEmbeddingCPU):
if core.is_compiled_with_cuda(): if core.is_compiled_with_cuda():
self.dtype = "float64" self.dtype = "float64"
self.ids_dtype = "int64" self.ids_dtype = "int64"
elif core.is_compiled_with_npu():
self.dtype = "float32"
self.ids_dtype = "int32"
elif core.is_compiled_with_xpu(): elif core.is_compiled_with_xpu():
self.dtype = "float32" self.dtype = "float32"
self.ids_dtype = "int64" self.ids_dtype = "int64"
...@@ -129,9 +118,7 @@ class TestCEmbeddingOpFP32(TestCEmbeddingOpBase): ...@@ -129,9 +118,7 @@ class TestCEmbeddingOpFP32(TestCEmbeddingOpBase):
self.outputs = {'Out': np_out.reshape((2, 4, 64))} self.outputs = {'Out': np_out.reshape((2, 4, 64))}
self.attrs = {'start_index': self.start_index} self.attrs = {'start_index': self.start_index}
if core.is_compiled_with_npu(): if core.is_compiled_with_xpu():
self.__class__.use_npu = True
elif core.is_compiled_with_xpu():
self.__class__.use_xpu = True self.__class__.use_xpu = True
elif core.is_compiled_with_cuda(): elif core.is_compiled_with_cuda():
self.__class__.exist_fp64_check_grad = True self.__class__.exist_fp64_check_grad = True
......
...@@ -24,6 +24,16 @@ from copy import copy ...@@ -24,6 +24,16 @@ from copy import copy
import numpy as np import numpy as np
from op import Operator from op import Operator
from prim_op_test import OpTestUtils, PrimForwardChecker, PrimGradChecker
from testsuite import append_input_output, append_loss_ops, create_op, set_input
from white_list import (
check_shape_white_list,
compile_vs_runtime_white_list,
no_check_set_white_list,
no_grad_set_white_list,
op_accuracy_white_list,
op_threshold_white_list,
)
import paddle import paddle
from paddle import fluid from paddle import fluid
...@@ -36,20 +46,9 @@ from paddle.fluid.framework import ( ...@@ -36,20 +46,9 @@ from paddle.fluid.framework import (
_current_expected_place, _current_expected_place,
canonicalize_attrs, canonicalize_attrs,
) )
from paddle.fluid.wrapped_decorator import signature_safe_contextmanager
sys.path.append(os.path.abspath(os.path.dirname(__file__))) sys.path.append(os.path.abspath(os.path.dirname(__file__)))
from prim_op_test import OpTestUtils, PrimForwardChecker, PrimGradChecker
from testsuite import append_input_output, append_loss_ops, create_op, set_input
from white_list import (
check_shape_white_list,
compile_vs_runtime_white_list,
no_check_set_white_list,
no_grad_set_white_list,
op_accuracy_white_list,
op_threshold_white_list,
)
from paddle.fluid.wrapped_decorator import signature_safe_contextmanager
@signature_safe_contextmanager @signature_safe_contextmanager
...@@ -338,10 +337,7 @@ class OpTest(unittest.TestCase): ...@@ -338,10 +337,7 @@ class OpTest(unittest.TestCase):
np.random.seed(123) np.random.seed(123)
random.seed(124) random.seed(124)
if paddle.is_compiled_with_npu(): cls._use_system_allocator = _set_use_system_allocator(True)
cls._use_system_allocator = _set_use_system_allocator(False)
else:
cls._use_system_allocator = _set_use_system_allocator(True)
@classmethod @classmethod
def tearDownClass(cls): def tearDownClass(cls):
...@@ -376,9 +372,6 @@ class OpTest(unittest.TestCase): ...@@ -376,9 +372,6 @@ class OpTest(unittest.TestCase):
def is_rocm_op_test(): def is_rocm_op_test():
return core.is_compiled_with_rocm() return core.is_compiled_with_rocm()
def is_npu_op_test():
return hasattr(cls, "use_npu") and cls.use_npu
def is_custom_device_op_test(): def is_custom_device_op_test():
return hasattr(cls, "use_custom_device") and cls.use_custom_device return hasattr(cls, "use_custom_device") and cls.use_custom_device
...@@ -411,7 +404,6 @@ class OpTest(unittest.TestCase): ...@@ -411,7 +404,6 @@ class OpTest(unittest.TestCase):
and not is_xpu_op_test() and not is_xpu_op_test()
and not is_mkldnn_op_test() and not is_mkldnn_op_test()
and not is_rocm_op_test() and not is_rocm_op_test()
and not is_npu_op_test()
and not is_custom_device_op_test() and not is_custom_device_op_test()
and not cls.check_prim and not cls.check_prim
): ):
...@@ -1965,10 +1957,8 @@ class OpTest(unittest.TestCase): ...@@ -1965,10 +1957,8 @@ class OpTest(unittest.TestCase):
# Check inplace for given op, its grad op, its grad_grad op, etc. # Check inplace for given op, its grad op, its grad_grad op, etc.
# No effect on original OpTest # No effect on original OpTest
# Currently not support ParallelExecutor on XPUPlace. # Currently not support ParallelExecutor on XPUPlace.
if ( if not paddle.is_compiled_with_xpu() and not isinstance(
not paddle.is_compiled_with_xpu() place, core.CustomPlace
and not paddle.is_compiled_with_npu()
and not isinstance(place, core.CustomPlace)
): ):
self.check_inplace_output_with_place( self.check_inplace_output_with_place(
place, no_check_set=no_check_set, inplace_atol=inplace_atol place, no_check_set=no_check_set, inplace_atol=inplace_atol
......
...@@ -59,7 +59,6 @@ class TestMaxMemoryAllocated(unittest.TestCase): ...@@ -59,7 +59,6 @@ class TestMaxMemoryAllocated(unittest.TestCase):
-2, -2,
0.5, 0.5,
"gpu1", "gpu1",
"npu",
] ]
for device in wrong_device: for device in wrong_device:
with self.assertRaises(BaseException): with self.assertRaises(BaseException):
......
...@@ -59,7 +59,6 @@ class TestMaxMemoryreserved(unittest.TestCase): ...@@ -59,7 +59,6 @@ class TestMaxMemoryreserved(unittest.TestCase):
-2, -2,
0.5, 0.5,
"gpu1", "gpu1",
"npu",
] ]
for device in wrong_device: for device in wrong_device:
with self.assertRaises(BaseException): with self.assertRaises(BaseException):
......
...@@ -44,7 +44,6 @@ class TestMemoryAllocated(unittest.TestCase): ...@@ -44,7 +44,6 @@ class TestMemoryAllocated(unittest.TestCase):
-2, -2,
0.5, 0.5,
"gpu1", "gpu1",
"npu",
] ]
for device in wrong_device: for device in wrong_device:
with self.assertRaises(BaseException): with self.assertRaises(BaseException):
......
...@@ -44,7 +44,6 @@ class TestMemoryreserved(unittest.TestCase): ...@@ -44,7 +44,6 @@ class TestMemoryreserved(unittest.TestCase):
-2, -2,
0.5, 0.5,
"gpu1", "gpu1",
"npu",
] ]
for device in wrong_device: for device in wrong_device:
with self.assertRaises(BaseException): with self.assertRaises(BaseException):
......
...@@ -46,10 +46,6 @@ class TestStaticDeviceManage(unittest.TestCase): ...@@ -46,10 +46,6 @@ class TestStaticDeviceManage(unittest.TestCase):
if core.is_compiled_with_xpu(): if core.is_compiled_with_xpu():
self._test_device("xpu:0", core.XPUPlace) self._test_device("xpu:0", core.XPUPlace)
def test_npu_device(self):
if core.is_compiled_with_npu():
self._test_device("npu:0", core.NPUPlace)
class TestImperativeDeviceManage(unittest.TestCase): class TestImperativeDeviceManage(unittest.TestCase):
def test_cpu(self): def test_cpu(self):
...@@ -95,25 +91,6 @@ class TestImperativeDeviceManage(unittest.TestCase): ...@@ -95,25 +91,6 @@ class TestImperativeDeviceManage(unittest.TestCase):
self.assertTrue(out.place.is_xpu_place()) self.assertTrue(out.place.is_xpu_place())
self.assertEqual(device, "xpu:0") self.assertEqual(device, "xpu:0")
def test_npu(self):
if core.is_compiled_with_npu():
with fluid.dygraph.guard():
paddle.set_device('npu:0')
out1 = paddle.zeros(shape=[1, 3], dtype='float32')
out2 = paddle.ones(shape=[1, 3], dtype='float32')
out3 = paddle.concat(x=[out1, out2], axis=0)
device = paddle.get_device()
self.assertEqual(
isinstance(
framework._current_expected_place(), core.NPUPlace
),
True,
)
self.assertTrue(out1.place.is_npu_place())
self.assertTrue(out2.place.is_npu_place())
self.assertTrue(out3.place.is_npu_place())
self.assertEqual(device, "npu:0")
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -17,11 +17,13 @@ import ast ...@@ -17,11 +17,13 @@ import ast
import os import os
import pickle import pickle
import random import random
import socket
import subprocess import subprocess
import sys import sys
import tempfile import tempfile
import time import time
import unittest import unittest
from contextlib import closing
import numpy as np import numpy as np
...@@ -684,9 +686,6 @@ class TestParallelDyGraphRunnerBase: ...@@ -684,9 +686,6 @@ class TestParallelDyGraphRunnerBase:
elif fluid.core.is_compiled_with_xpu(): elif fluid.core.is_compiled_with_xpu():
device_id = int(os.getenv("FLAGS_selected_xpus", "0")) device_id = int(os.getenv("FLAGS_selected_xpus", "0"))
place = fluid.XPUPlace(device_id) place = fluid.XPUPlace(device_id)
elif fluid.core.is_compiled_with_npu():
device_id = int(os.getenv("FLAGS_selected_npus", "0"))
place = fluid.NPUPlace(device_id)
else: else:
assert "Only support CUDAPlace or XPUPlace or CPU(Gloo) for now." assert "Only support CUDAPlace or XPUPlace or CPU(Gloo) for now."
...@@ -888,7 +887,6 @@ def runtime_main(test_class): ...@@ -888,7 +887,6 @@ def runtime_main(test_class):
parser.add_argument('--use_cpu', action='store_true') parser.add_argument('--use_cpu', action='store_true')
parser.add_argument('--use_xpu', action='store_true') parser.add_argument('--use_xpu', action='store_true')
parser.add_argument('--use_dgc', action='store_true') parser.add_argument('--use_dgc', action='store_true')
parser.add_argument('--use_npu', action='store_true')
parser.add_argument('--accumulate_gradient', action='store_true') parser.add_argument('--accumulate_gradient', action='store_true')
parser.add_argument('--find_unused_parameters', action='store_true') parser.add_argument('--find_unused_parameters', action='store_true')
parser.add_argument('--use_reduce', action='store_true') parser.add_argument('--use_reduce', action='store_true')
...@@ -932,10 +930,6 @@ def runtime_main(test_class): ...@@ -932,10 +930,6 @@ def runtime_main(test_class):
model.run_trainer(args) model.run_trainer(args)
import socket
from contextlib import closing
class TestDistBase(unittest.TestCase): class TestDistBase(unittest.TestCase):
def _setup_config(self): def _setup_config(self):
raise NotImplementedError("tests should have _setup_config implemented") raise NotImplementedError("tests should have _setup_config implemented")
...@@ -945,21 +939,13 @@ class TestDistBase(unittest.TestCase): ...@@ -945,21 +939,13 @@ class TestDistBase(unittest.TestCase):
self.__use_cuda = False self.__use_cuda = False
self.__use_xpu = False self.__use_xpu = False
self._use_dgc = False self._use_dgc = False
self.__use_npu = False
elif self._enforce_place == "GPU": elif self._enforce_place == "GPU":
self.__use_cuda = True self.__use_cuda = True
self.__use_xpu = False self.__use_xpu = False
self.__use_npu = False
elif self._enforce_place == "XPU": elif self._enforce_place == "XPU":
self.__use_cuda = False self.__use_cuda = False
self.__use_xpu = True self.__use_xpu = True
self._use_dgc = False self._use_dgc = False
self.__use_npu = False
elif self._enforce_place == "NPU":
self.__use_cuda = False
self.__use_xpu = False
self._use_dgc = False
self.__use_npu = True
else: else:
if fluid.core.is_compiled_with_cuda(): if fluid.core.is_compiled_with_cuda():
self.__use_cuda = True self.__use_cuda = True
...@@ -1149,13 +1135,6 @@ class TestDistBase(unittest.TestCase): ...@@ -1149,13 +1135,6 @@ class TestDistBase(unittest.TestCase):
"PADDLE_TRAINERS_NUM": "1", "PADDLE_TRAINERS_NUM": "1",
"PADDLE_TRAINER_ID": "0", "PADDLE_TRAINER_ID": "0",
} }
elif self.__use_npu:
cmd += " --use_npu"
env_local = {
"FLAGS_selected_npus": devices,
"PADDLE_TRAINERS_NUM": "1",
"PADDLE_TRAINER_ID": "0",
}
else: else:
env_local = {'CPU_NUM': '1'} env_local = {'CPU_NUM': '1'}
...@@ -1447,18 +1426,6 @@ class TestDistBase(unittest.TestCase): ...@@ -1447,18 +1426,6 @@ class TestDistBase(unittest.TestCase):
"GLOG_v": "2", "GLOG_v": "2",
} }
) )
elif self.__use_npu:
tr_cmd += " --use_npu"
env.update(
{
"FLAGS_selected_npus": f"{trainer_id}",
"PADDLE_TRAINERS_NUM": f"{trainer_num}",
"PADDLE_TRAINER_ID": f"{trainer_id}",
"PADDLE_TRAINER_ENDPOINTS": self._ps_endpoints,
"PADDLE_CURRENT_ENDPOINT": ep,
"GLOG_v": "2",
}
)
else: else:
env.update({'CPU_NUM': '1'}) env.update({'CPU_NUM': '1'})
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import unittest
from paddle.distributed.fleet import ascend_utils
RANK_TABLE_JSON = {
"status": "completed",
"version": "1.0",
"server_count": "1",
"server_list": [
{
"server_id": "127.0.0.1",
"device": [
{"device_id": "0", "device_ip": "192.1.184.23", "rank_id": "0"},
{"device_id": "1", "device_ip": "192.2.21.93", "rank_id": "1"},
],
}
],
}
class TestAscendUtil(unittest.TestCase):
def test_get_cloud_cluster(self):
cluster, pod = ascend_utils.get_cloud_cluster()
self.assertTrue(cluster)
self.assertTrue(pod)
with open('rank_table_file.json', 'w') as f:
json.dump(RANK_TABLE_JSON, f)
rank_table_file = "./rank_table_file.json"
cluster, pod = ascend_utils.get_cloud_cluster(
rank_table_file=rank_table_file
)
self.assertTrue(cluster)
self.assertTrue(pod)
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import paddle
class TestNPUIdentityOp(unittest.TestCase):
def setUp(self):
self.op_type = "npu_identity"
self.shape = [64, 6, 28, 28]
self.x = np.random.random(self.shape).astype(np.float32)
self.format = 3 # ACL_FORMAT_NC1HWC0 = 3
self.place = paddle.CPUPlace()
def test_api_static(self):
paddle.enable_static()
main_program = paddle.static.default_main_program()
startup_program = paddle.static.default_startup_program()
with paddle.static.program_guard(main_program, startup_program):
x_data = paddle.static.data(
shape=self.shape, name="data", dtype='float32'
)
output = paddle.incubate._npu_identity(x=x_data, format=self.format)
exe = paddle.static.Executor()
exe.run(startup_program)
result = exe.run(
main_program, feed={x_data.name: self.x}, fetch_list=[output]
)
np.testing.assert_allclose(result[0], self.x, rtol=1e-08)
def test_api_dygraph(self):
paddle.disable_static(self.place)
x = paddle.to_tensor(self.x)
out = paddle.incubate._npu_identity(x, self.format)
np.testing.assert_allclose(out.numpy(), self.x, rtol=1e-08)
paddle.enable_static()
if __name__ == "__main__":
unittest.main()
...@@ -272,9 +272,6 @@ class TestVarBase(unittest.TestCase): ...@@ -272,9 +272,6 @@ class TestVarBase(unittest.TestCase):
check_with_place("gpu_pinned") check_with_place("gpu_pinned")
check_with_place(core.CUDAPlace(0)) check_with_place(core.CUDAPlace(0))
check_with_place("gpu:0") check_with_place("gpu:0")
if core.is_compiled_with_npu():
check_with_place(core.NPUPlace(0))
check_with_place("npu:0")
def test_to_tensor_not_change_input_stop_gradient(self): def test_to_tensor_not_change_input_stop_gradient(self):
with paddle.fluid.dygraph.guard(core.CPUPlace()): with paddle.fluid.dygraph.guard(core.CPUPlace()):
......
...@@ -156,33 +156,6 @@ def init_communicator( ...@@ -156,33 +156,6 @@ def init_communicator(
'ring_id': 0, 'ring_id': 0,
}, },
) )
elif core.is_compiled_with_npu():
hccl_id_var = block.create_var(
name=fluid.unique_name.generate('hccl_id'),
persistable=True,
type=core.VarDesc.VarType.RAW,
)
block.append_op(
type='c_gen_hccl_id',
inputs={},
outputs={'Out': hccl_id_var},
attrs={
'rank': rank,
'endpoint': current_endpoint,
'other_endpoints': other_endpoints,
},
)
block.append_op(
type='c_comm_init_hccl',
inputs={'X': hccl_id_var},
outputs={},
attrs={
'rank': rank,
'ring_id': 0,
'device_id': int(os.getenv("FLAGS_selected_npus")),
'rank_ids': nranks,
},
)
elif core.is_compiled_with_xpu(): elif core.is_compiled_with_xpu():
bkcl_id_var = block.create_var( bkcl_id_var = block.create_var(
name=fluid.unique_name.generate('bkcl_id'), name=fluid.unique_name.generate('bkcl_id'),
......
...@@ -16,7 +16,7 @@ from paddle import _C_ops, _legacy_C_ops, get_flags, in_dynamic_mode ...@@ -16,7 +16,7 @@ from paddle import _C_ops, _legacy_C_ops, get_flags, in_dynamic_mode
from paddle.device import ( from paddle.device import (
get_all_custom_device_type, get_all_custom_device_type,
is_compiled_with_cuda, is_compiled_with_cuda,
is_compiled_with_npu, is_compiled_with_custom_device,
is_compiled_with_rocm, is_compiled_with_rocm,
) )
from paddle.fluid.framework import _global_flags, in_dygraph_mode from paddle.fluid.framework import _global_flags, in_dygraph_mode
...@@ -466,7 +466,7 @@ def conv1d( ...@@ -466,7 +466,7 @@ def conv1d(
use_cudnn = False use_cudnn = False
# NPU only supports depthwise_conv2d when "input_channel = output_channel = groups" # NPU only supports depthwise_conv2d when "input_channel = output_channel = groups"
if is_compiled_with_npu(): if is_compiled_with_custom_device('npu'):
if num_channels == groups and num_channels == num_filters: if num_channels == groups and num_channels == num_filters:
l_type = 'depthwise_conv2d' l_type = 'depthwise_conv2d'
else: else:
...@@ -756,7 +756,7 @@ def conv2d( ...@@ -756,7 +756,7 @@ def conv2d(
use_mkldnn = _global_flags()["FLAGS_use_mkldnn"] use_mkldnn = _global_flags()["FLAGS_use_mkldnn"]
# NPU only supports depthwise_conv2d when "input_channel = output_channel = groups" # NPU only supports depthwise_conv2d when "input_channel = output_channel = groups"
if is_compiled_with_npu(): if is_compiled_with_custom_device('npu'):
if num_channels == groups and num_channels == num_filters: if num_channels == groups and num_channels == num_filters:
l_type = 'depthwise_conv2d' l_type = 'depthwise_conv2d'
else: else:
......
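The depthwise special case noted in the comments above, written out as a small helper for clarity; this is an illustrative sketch of the condition only, not the framework's dispatch code.

from paddle.device import is_compiled_with_custom_device


def choose_conv2d_op(num_channels, num_filters, groups):
    # On an NPU custom-device build, depthwise_conv2d is only usable when the
    # input channels, output channels and groups all coincide.
    if (
        is_compiled_with_custom_device('npu')
        and num_channels == groups
        and num_channels == num_filters
    ):
        return 'depthwise_conv2d'
    return 'conv2d'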
...@@ -60,7 +60,6 @@ from ..fluid.framework import program_guard # noqa: F401 ...@@ -60,7 +60,6 @@ from ..fluid.framework import program_guard # noqa: F401
from ..fluid.framework import cpu_places # noqa: F401 from ..fluid.framework import cpu_places # noqa: F401
from ..fluid.framework import cuda_places # noqa: F401 from ..fluid.framework import cuda_places # noqa: F401
from ..fluid.framework import xpu_places # noqa: F401 from ..fluid.framework import xpu_places # noqa: F401
from ..fluid.framework import npu_places # noqa: F401
from ..fluid.framework import Variable # noqa: F401 from ..fluid.framework import Variable # noqa: F401
from ..fluid.framework import Operator # noqa: F401 from ..fluid.framework import Operator # noqa: F401
from ..fluid.framework import Parameter # noqa: F401 from ..fluid.framework import Parameter # noqa: F401
...@@ -118,7 +117,6 @@ __all__ = [ # noqa ...@@ -118,7 +117,6 @@ __all__ = [ # noqa
'cpu_places', 'cpu_places',
'cuda_places', 'cuda_places',
'xpu_places', 'xpu_places',
'npu_places',
'Variable', 'Variable',
'create_global_var', 'create_global_var',
'accuracy', 'accuracy',
......
...@@ -54,7 +54,7 @@ def check_finite_and_unscale(x, scale, name=None, float_status=None): ...@@ -54,7 +54,7 @@ def check_finite_and_unscale(x, scale, name=None, float_status=None):
) )
inputs = {'X': x, 'Scale': scale} inputs = {'X': x, 'Scale': scale}
if core.is_compiled_with_npu(): if core.is_compiled_with_custom_device('npu'):
check_variable_and_dtype( check_variable_and_dtype(
float_status, float_status,
"float_status", "float_status",
......
...@@ -187,7 +187,7 @@ class OptimizerWithMixedPrecision: ...@@ -187,7 +187,7 @@ class OptimizerWithMixedPrecision:
self._train_program = train_program self._train_program = train_program
# NOTE(zhiqiu): _float_status is only used for NPU. # NOTE(zhiqiu): _float_status is only used for NPU.
if core.is_compiled_with_npu(): if core.is_compiled_with_custom_device('npu'):
float_status = paddle.static.data( float_status = paddle.static.data(
name="float_status", shape=[8], dtype='float32' name="float_status", shape=[8], dtype='float32'
) )
...@@ -408,7 +408,7 @@ class OptimizerWithMixedPrecision: ...@@ -408,7 +408,7 @@ class OptimizerWithMixedPrecision:
if self._is_distributed: if self._is_distributed:
# if distributed, split check_finite_and_unscale to overlap # if distributed, split check_finite_and_unscale to overlap
# unscale with communication # unscale with communication
if core.is_compiled_with_npu(): if core.is_compiled_with_custom_device('npu'):
with self._train_program._optimized_guard(grads): with self._train_program._optimized_guard(grads):
_, found_inf = check_finite_and_unscale( _, found_inf = check_finite_and_unscale(
grads, grads,
......
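A hedged sketch of the guard the AMP code above now relies on: the NPU-only float_status buffer is created only when the custom 'npu' device is compiled in. The wrapper name is illustrative; only the guarded paddle.static.data call mirrors the diff.

import paddle
from paddle.fluid import core

paddle.enable_static()


def maybe_create_float_status():
    # float_status is an NPU-specific 8-element buffer consumed by
    # check_finite_and_unscale; other backends simply get None.
    if core.is_compiled_with_custom_device('npu'):
        return paddle.static.data(
            name="float_status", shape=[8], dtype='float32'
        )
    return None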
...@@ -182,7 +182,7 @@ if core.is_compiled_with_xpu(): ...@@ -182,7 +182,7 @@ if core.is_compiled_with_xpu():
_, _, _sys_unsupported_fp16_list = core.op_supported_infos( _, _, _sys_unsupported_fp16_list = core.op_supported_infos(
'XPU', core.VarDesc.VarType.FP16 'XPU', core.VarDesc.VarType.FP16
) )
elif core.is_compiled_with_npu(): elif core.is_compiled_with_custom_device('npu'):
_, _, _sys_unsupported_fp16_list = core.op_supported_infos( _, _, _sys_unsupported_fp16_list = core.op_supported_infos(
'NPU', core.VarDesc.VarType.FP16 'NPU', core.VarDesc.VarType.FP16
) )
......
...@@ -1536,10 +1536,7 @@ def load(program, model_path, executor=None, var_list=None): ...@@ -1536,10 +1536,7 @@ def load(program, model_path, executor=None, var_list=None):
p = paddle.fluid.core.Place() p = paddle.fluid.core.Place()
p.set_place(t._place()) p.set_place(t._place())
place = paddle.fluid.XPUPlace(p.xpu_device_id()) place = paddle.fluid.XPUPlace(p.xpu_device_id())
elif p.is_npu_place():
p = paddle.fluid.core.Place()
p.set_place(t._place())
place = paddle.fluid.NPUPlace(p.npu_device_id())
else: else:
p = paddle.fluid.core.Place() p = paddle.fluid.core.Place()
p.set_place(t._place()) p.set_place(t._place())
...@@ -1676,10 +1673,6 @@ def set_program_state(program, state_dict): ...@@ -1676,10 +1673,6 @@ def set_program_state(program, state_dict):
p = paddle.fluid.core.Place() p = paddle.fluid.core.Place()
p.set_place(ten_place) p.set_place(ten_place)
py_place = paddle.fluid.XPUPlace(p.xpu_device_id()) py_place = paddle.fluid.XPUPlace(p.xpu_device_id())
elif ten_place.is_npu_place():
p = paddle.fluid.core.Place()
p.set_place(ten_place)
py_place = paddle.fluid.NPUPlace(p.npu_device_id())
ten.set(new_para_np, py_place) ten.set(new_para_np, py_place)
......
...@@ -946,7 +946,7 @@ def conv2d( ...@@ -946,7 +946,7 @@ def conv2d(
l_type = 'depthwise_conv2d' l_type = 'depthwise_conv2d'
# NPU only supports depthwise_conv2d when "input_channel = output_channel = groups" # NPU only supports depthwise_conv2d when "input_channel = output_channel = groups"
if core.is_compiled_with_npu(): if core.is_compiled_with_custom_device('npu'):
if num_channels == groups and num_channels == num_filters: if num_channels == groups and num_channels == num_filters:
l_type = 'depthwise_conv2d' l_type = 'depthwise_conv2d'
else: else:
......
...@@ -2246,8 +2246,6 @@ def _memcpy(input, place=None, output=None): ...@@ -2246,8 +2246,6 @@ def _memcpy(input, place=None, output=None):
dst_place_type = 2 dst_place_type = 2
elif p.is_xpu_place(): elif p.is_xpu_place():
dst_place_type = 3 dst_place_type = 3
elif p.is_npu_place():
dst_place_type = 4
attrs = {'dst_place_type': dst_place_type} attrs = {'dst_place_type': dst_place_type}
helper.append_op( helper.append_op(
......
...@@ -65,22 +65,6 @@ def _is_cuda_available(): ...@@ -65,22 +65,6 @@ def _is_cuda_available():
return False return False
def _is_npu_available():
"""
Check whether NPU is avaiable.
"""
try:
assert len(paddle.static.npu_places()) > 0
return True
except Exception as e:
logging.warning(
"You are using NPU version PaddlePaddle, but there is no NPU "
"detected on your machine. Maybe NPU devices is not set properly."
"\n Original Error is {}".format(e)
)
return False
def _is_xpu_available(): def _is_xpu_available():
""" """
Check whether XPU is avaiable. Check whether XPU is avaiable.
...@@ -97,22 +81,19 @@ def _is_xpu_available(): ...@@ -97,22 +81,19 @@ def _is_xpu_available():
return False return False
def _run_dygraph_single(use_cuda, use_xpu, use_npu): def _run_dygraph_single(use_cuda, use_xpu):
""" """
Testing the simple network in dygraph mode using one CPU/GPU/XPU/NPU. Testing the simple network in dygraph mode using one CPU/GPU/XPU.
Args: Args:
use_cuda (bool): Whether running with CUDA. use_cuda (bool): Whether running with CUDA.
use_xpu (bool): Whether running with XPU. use_xpu (bool): Whether running with XPU.
use_npu (bool): Whether running with NPU.
""" """
paddle.disable_static() paddle.disable_static()
if use_cuda: if use_cuda:
paddle.set_device('gpu') paddle.set_device('gpu')
elif use_xpu: elif use_xpu:
paddle.set_device('xpu') paddle.set_device('xpu')
elif use_npu:
paddle.set_device('npu')
else: else:
paddle.set_device('cpu') paddle.set_device('cpu')
weight_attr = paddle.ParamAttr( weight_attr = paddle.ParamAttr(
...@@ -135,14 +116,13 @@ def _run_dygraph_single(use_cuda, use_xpu, use_npu): ...@@ -135,14 +116,13 @@ def _run_dygraph_single(use_cuda, use_xpu, use_npu):
opt.step() opt.step()
def _run_static_single(use_cuda, use_xpu, use_npu): def _run_static_single(use_cuda, use_xpu):
""" """
Testing the simple network with executor running directly, using one CPU/GPU/XPU/NPU. Testing the simple network with executor running directly, using one CPU/GPU/XPU.
Args: Args:
use_cuda (bool): Whether running with CUDA. use_cuda (bool): Whether running with CUDA.
use_xpu (bool): Whether running with XPU. use_xpu (bool): Whether running with XPU.
use_npu (bool): Whether running with NPU.
""" """
paddle.enable_static() paddle.enable_static()
with paddle.static.scope_guard(paddle.static.Scope()): with paddle.static.scope_guard(paddle.static.Scope()):
...@@ -159,8 +139,6 @@ def _run_static_single(use_cuda, use_xpu, use_npu): ...@@ -159,8 +139,6 @@ def _run_static_single(use_cuda, use_xpu, use_npu):
place = paddle.CUDAPlace(0) place = paddle.CUDAPlace(0)
elif use_xpu: elif use_xpu:
place = paddle.XPUPlace(0) place = paddle.XPUPlace(0)
elif use_npu:
place = paddle.NPUPlace(0)
else: else:
place = paddle.CPUPlace() place = paddle.CPUPlace()
...@@ -223,7 +201,6 @@ def _run_parallel(device_list): ...@@ -223,7 +201,6 @@ def _run_parallel(device_list):
Args: Args:
use_cuda (bool): Whether running with CUDA. use_cuda (bool): Whether running with CUDA.
use_xpu (bool): Whether running with XPU. use_xpu (bool): Whether running with XPU.
use_npu (bool): Whether running with NPU.
device_list (int): The specified devices. device_list (int): The specified devices.
""" """
paddle.distributed.spawn(train_for_run_parallel, nprocs=len(device_list)) paddle.distributed.spawn(train_for_run_parallel, nprocs=len(device_list))
...@@ -252,14 +229,11 @@ def run_check(): ...@@ -252,14 +229,11 @@ def run_check():
use_cuda = False use_cuda = False
use_xpu = False use_xpu = False
use_npu = False
if paddle.is_compiled_with_cuda(): if paddle.is_compiled_with_cuda():
use_cuda = _is_cuda_available() use_cuda = _is_cuda_available()
elif paddle.is_compiled_with_xpu(): elif paddle.is_compiled_with_xpu():
use_xpu = _is_xpu_available() use_xpu = _is_xpu_available()
elif paddle.is_compiled_with_npu():
use_npu = _is_npu_available()
if use_cuda: if use_cuda:
device_str = "GPU" device_str = "GPU"
...@@ -267,16 +241,13 @@ def run_check(): ...@@ -267,16 +241,13 @@ def run_check():
elif use_xpu: elif use_xpu:
device_str = "XPU" device_str = "XPU"
device_list = paddle.static.xpu_places() device_list = paddle.static.xpu_places()
elif use_npu:
device_str = "NPU"
device_list = paddle.static.npu_places()
else: else:
device_str = "CPU" device_str = "CPU"
device_list = paddle.static.cpu_places(device_count=1) device_list = paddle.static.cpu_places(device_count=1)
device_count = len(device_list) device_count = len(device_list)
_run_static_single(use_cuda, use_xpu, use_npu) _run_static_single(use_cuda, use_xpu)
_run_dygraph_single(use_cuda, use_xpu, use_npu) _run_dygraph_single(use_cuda, use_xpu)
print(f"PaddlePaddle works well on 1 {device_str}.") print(f"PaddlePaddle works well on 1 {device_str}.")
try: try:
......
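With the NPU probing removed, run_check reports CPU, GPU or XPU depending on the build; a quick way to exercise the updated path:

import paddle

paddle.utils.run_check()
# Prints, for example on a single-GPU build:
#   PaddlePaddle works well on 1 GPU.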
...@@ -18,8 +18,6 @@ from distutils.sysconfig import get_python_lib ...@@ -18,8 +18,6 @@ from distutils.sysconfig import get_python_lib
from setuptools import Extension, setup from setuptools import Extension, setup
from setuptools.command.build_ext import build_ext from setuptools.command.build_ext import build_ext
from paddle.fluid import core
# refer: https://note.qidong.name/2018/03/setup-warning-strict-prototypes # refer: https://note.qidong.name/2018/03/setup-warning-strict-prototypes
# Avoid a gcc warning below: # Avoid a gcc warning below:
...@@ -40,8 +38,6 @@ paddle_extra_compile_args = [ ...@@ -40,8 +38,6 @@ paddle_extra_compile_args = [
'-Wno-parentheses', '-Wno-parentheses',
'-DPADDLE_WITH_CUSTOM_KERNEL', '-DPADDLE_WITH_CUSTOM_KERNEL',
] ]
if core.is_compiled_with_npu():
paddle_extra_compile_args += ['-D_GLIBCXX_USE_CXX11_ABI=0']
# include path # include path
site_packages_path = get_python_lib() site_packages_path = get_python_lib()
......
...@@ -18,8 +18,6 @@ import site ...@@ -18,8 +18,6 @@ import site
from setuptools import Extension, setup from setuptools import Extension, setup
from setuptools.command.build_ext import build_ext from setuptools.command.build_ext import build_ext
from paddle.fluid import core
# refer: https://note.qidong.name/2018/03/setup-warning-strict-prototypes # refer: https://note.qidong.name/2018/03/setup-warning-strict-prototypes
# Avoid a gcc warning below: # Avoid a gcc warning below:
...@@ -40,8 +38,6 @@ paddle_extra_compile_args = [ ...@@ -40,8 +38,6 @@ paddle_extra_compile_args = [
'-Wno-parentheses', '-Wno-parentheses',
'-DPADDLE_WITH_CUSTOM_KERNEL', '-DPADDLE_WITH_CUSTOM_KERNEL',
] ]
if core.is_compiled_with_npu():
paddle_extra_compile_args += ['-D_GLIBCXX_USE_CXX11_ABI=0']
# include path # include path
site_packages_path = site.getsitepackages() site_packages_path = site.getsitepackages()
......
...@@ -32,9 +32,6 @@ def download_file(): ...@@ -32,9 +32,6 @@ def download_file():
if paddle.is_compiled_with_rocm(): if paddle.is_compiled_with_rocm():
url = "https://sys-p0.bj.bcebos.com/prec/{}".format('disable_ut_rocm') url = "https://sys-p0.bj.bcebos.com/prec/{}".format('disable_ut_rocm')
if paddle.is_compiled_with_npu():
url = "https://sys-p0.bj.bcebos.com/prec/{}".format('disable_ut_npu')
f = requests.get(url) f = requests.get(url)
data = f.text data = f.text
status_code = f.status_code status_code = f.status_code
......