From 1435b4c0961a6d6206904a315cb6bfbabfbe6f72 Mon Sep 17 00:00:00 2001 From: liym27 <33742067+liym27@users.noreply.github.com> Date: Tue, 23 Feb 2021 19:57:15 +0800 Subject: [PATCH] [NPU] Support executor with NPU (#31057) * [NPU] Support executor with NPU * Fix code according to reviews * Fix code * Add unittest for sub op npu --- paddle/fluid/framework/executor.cc | 8 + paddle/fluid/framework/operator.cc | 10 + paddle/fluid/framework/parallel_executor.cc | 3 + paddle/fluid/framework/tensor_util.cc | 33 ++- paddle/fluid/pybind/pybind.cc | 112 ++++++++- paddle/fluid/pybind/tensor_py.h | 16 ++ python/paddle/__init__.py | 2 + python/paddle/device.py | 19 +- python/paddle/fluid/__init__.py | 4 +- python/paddle/fluid/executor.py | 1 + python/paddle/fluid/framework.py | 23 +- .../fluid/tests/unittests/CMakeLists.txt | 4 + .../fluid/tests/unittests/npu/CMakeLists.txt | 6 + .../npu/test_elementwise_add_op_npu.py | 162 +++++++++++++ .../npu/test_elementwise_sub_op_npu.py | 224 ++++++++++++++++++ .../tests/unittests/npu/test_npu_place.py | 61 +++++ .../paddle/fluid/tests/unittests/op_test.py | 14 +- .../fluid/tests/unittests/test_device.py | 43 ++-- python/paddle/framework/__init__.py | 5 +- 19 files changed, 698 insertions(+), 52 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/npu/CMakeLists.txt create mode 100644 python/paddle/fluid/tests/unittests/npu/test_elementwise_add_op_npu.py create mode 100644 python/paddle/fluid/tests/unittests/npu/test_elementwise_sub_op_npu.py create mode 100644 python/paddle/fluid/tests/unittests/npu/test_npu_place.py diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 755b3bff763..32ceb7d7903 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -466,6 +466,14 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx, #else PADDLE_THROW( platform::errors::Unimplemented("No XPU gc found in CPU/GPU paddle")); +#endif + } else if 
(platform::is_npu_place(place_)) { +#ifdef PADDLE_WITH_ASCEND_CL + // TODO(ascendrc): Support garbage collector on NPUPlace + VLOG(4) << "Skip NPU gc because it is not implemented now."; +#else + PADDLE_THROW(platform::errors::Unimplemented( + "No NPU gc found in CPU/GPU/XPU paddle")); #endif } } diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 3af0d7fcf5d..665e7b2fcf8 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -1275,6 +1275,16 @@ void OperatorWithKernel::ChooseKernel(const RuntimeContext& ctx, expected_kernel_key.place_ = platform::CPUPlace(); kernel_iter = kernels.find(expected_kernel_key); } +#endif +#ifdef PADDLE_WITH_ASCEND_CL + if (kernel_iter == kernels.end() && + is_npu_place(expected_kernel_key.place_)) { + VLOG(3) << "missing NPU kernel: " << type_ + << ", expected_kernel_key:" << expected_kernel_key + << ", fallbacking to CPU one!"; + expected_kernel_key.place_ = platform::CPUPlace(); + kernel_iter = kernels.find(expected_kernel_key); + } #endif PADDLE_ENFORCE_NE(kernel_iter, kernels.end(), platform::errors::NotFound( diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index bfc3b7c7017..af048194178 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -614,6 +614,9 @@ ParallelExecutor::ParallelExecutor(const std::vector &places, const BuildStrategy &build_strategy, ir::Graph *graph) : member_(new ParallelExecutorPrivate(places, scope)) { + PADDLE_ENFORCE(places.size() > 0 && !is_npu_place(places[0]), + platform::errors::Unavailable( + "NPU is not supported in ParallelExecutor")); InitP2P(places); ir::InitReaderQueueDeviceCount(graph, *(member_->global_scope_), member_->places_.size()); diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 7344bcfb6b8..d8e79d40c23 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ 
b/paddle/fluid/framework/tensor_util.cc @@ -101,15 +101,19 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, // TODO(zhiqiu): handle different condition like CUDA code below else if (platform::is_npu_place(src_place) && // NOLINT platform::is_cpu_place(dst_place)) { - auto stream = reinterpret_cast(ctx).stream(); + auto stream = + reinterpret_cast(ctx).stream(); memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, - BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size, stream); + BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size, + stream); } else if (platform::is_cpu_place(src_place) && // NOLINT platform::is_npu_place(dst_place)) { - auto stream = reinterpret_cast(ctx).stream(); + auto stream = + reinterpret_cast(ctx).stream(); memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, - BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size, stream); + BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size, + stream); } else if (platform::is_npu_place(src_place) && // NOLINT platform::is_npu_place(dst_place)) { @@ -118,9 +122,11 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, << dst_place; return; } - auto stream = reinterpret_cast(ctx).stream(); + auto stream = + reinterpret_cast(ctx).stream(); memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, - BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size, stream); + BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size, + stream); } else { // NOLINT PADDLE_THROW(platform::errors::Unimplemented( @@ -336,24 +342,27 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, #endif #ifdef PADDLE_WITH_ASCEND_CL else if (platform::is_npu_place(src_place) && // NOLINT - platform::is_cpu_place(dst_place)) { + platform::is_cpu_place(dst_place)) { /* npu -> cpu*/ memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, - 
BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size, nullptr); + BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size, + nullptr); } else if (platform::is_cpu_place(src_place) && // NOLINT - platform::is_npu_place(dst_place)) { + platform::is_npu_place(dst_place)) { /* cpu -> npu*/ memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, - BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size, nullptr); + BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size, + nullptr); } else if (platform::is_npu_place(src_place) && // NOLINT - platform::is_npu_place(dst_place)) { + platform::is_npu_place(dst_place)) { /* npu -> npu*/ if (src_ptr == dst_ptr) { VLOG(3) << "Skip copy the same data sync from " << src_place << " to " << dst_place; return; } memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, - BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size, nullptr); + BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size, + nullptr); } else { // NOLINT PADDLE_THROW(platform::errors::Unimplemented( diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 0fa50a8cd36..b3d9e22dba8 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -154,6 +154,14 @@ bool IsCompiledWithXPU() { #endif } +bool IsCompiledWithNPU() { +#ifndef PADDLE_WITH_ASCEND_CL + return false; +#else + return true; +#endif +} + bool IsCompiledWithMKLDNN() { #ifndef PADDLE_WITH_MKLDNN return false; @@ -567,6 +575,10 @@ PYBIND11_MODULE(core_noavx, m) { [](Tensor &self, paddle::platform::CPUPlace &place) { self.mutable_data(place); }) + .def("_alloc_float", + [](Tensor &self, paddle::platform::NPUPlace &place) { + self.mutable_data(place); + }) .def("_alloc_double", [](Tensor &self, paddle::platform::CPUPlace &place) { self.mutable_data(place); @@ -611,6 +623,11 @@ PYBIND11_MODULE(core_noavx, m) { paddle::framework::proto::VarType::Type type) { return 
reinterpret_cast(self.mutable_data(place, type)); }) + .def("_mutable_data", + [](Tensor &self, paddle::platform::NPUPlace &place, + paddle::framework::proto::VarType::Type type) { + return reinterpret_cast(self.mutable_data(place, type)); + }) .def("_clear", &Tensor::clear) .def("set", SetTensorFromPyArray, py::arg("array"), py::arg("place"), py::arg("zero_copy") = false) @@ -618,6 +635,8 @@ PYBIND11_MODULE(core_noavx, m) { py::arg("array"), py::arg("place"), py::arg("zero_copy") = false) .def("set", SetTensorFromPyArray, py::arg("array"), py::arg("place"), py::arg("zero_copy") = false) + .def("set", SetTensorFromPyArray, + py::arg("array"), py::arg("place"), py::arg("zero_copy") = false) .def("set", SetTensorFromPyArray, py::arg("array"), py::arg("place"), py::arg("zero_copy") = false, R"DOC( @@ -625,7 +644,7 @@ PYBIND11_MODULE(core_noavx, m) { Args: lod (numpy.ndarray): The data to set. - place (CPUPlace|CUDAPlace|XPUPlace|CUDAPinnedPlace): The place where the + place (CPUPlace|CUDAPlace|XPUPlace|CUDAPinnedPlace|NPUPlace): The place where the LoDTensor is to be set. zero_copy (bool, optional): Whether to share memory with the input numpy array. This parameter only works with CPUPlace. Default: False. @@ -1348,6 +1367,18 @@ All parameter, weight, gradient are variables in Paddle. return new paddle::platform::XPUDeviceContext(place); #endif }) + .def_static("create", + [](paddle::platform::NPUPlace& place) + -> paddle::platform::DeviceContext* { +#ifndef PADDLE_WITH_ASCEND_CL + PADDLE_THROW( + platform::errors::PermissionDenied( + "Cannot use NPUPlace in CPU/GPU/XPU version, " + "Please recompile or reinstall Paddle with NPU support.")); +#else + return new paddle::platform::NPUDeviceContext(place); +#endif + }) .def_static("create", [](paddle::platform::CUDAPlace& place) -> paddle::platform::DeviceContext* { @@ -1448,6 +1479,7 @@ All parameter, weight, gradient are variables in Paddle. 
.def("_equals", &IsSamePlace) .def("_equals", &IsSamePlace) .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) .def("_equals", &IsSamePlace) .def("_get_device_id", @@ -1517,6 +1549,7 @@ All parameter, weight, gradient are variables in Paddle. #ifdef PADDLE_WITH_XPU m.def("get_xpu_device_count", platform::GetXPUDeviceCount); #endif + py::class_(m, "CPUPlace", R"DOC( CPUPlace is a descriptor of a device. It represents a CPU device on which a tensor will be allocated and a model will run. @@ -1532,6 +1565,7 @@ All parameter, weight, gradient are variables in Paddle. .def("_type", &PlaceIndex) .def("_equals", &IsSamePlace) .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) .def("_equals", &IsSamePlace) .def("_equals", &IsSamePlace) .def("_equals", @@ -1569,6 +1603,8 @@ All parameter, weight, gradient are variables in Paddle. &IsSamePlace) .def("_equals", &IsSamePlace) + .def("_equals", + &IsSamePlace) .def("_equals", &IsSamePlace) .def("_equals", @@ -1576,6 +1612,65 @@ All parameter, weight, gradient are variables in Paddle. .def("__repr__", string::to_string) .def("__str__", string::to_string); + // NPUPlace + py::class_(m, "NPUPlace", R"DOC( + NPUPlace is a descriptor of a device. + It represents a NPU device on which a tensor will be allocated and a model will run. + + Examples: + .. 
code-block:: python + import paddle + npu_place = paddle.NPUPlace(0) + + )DOC") + .def("__init__", + [](platform::NPUPlace &self, int dev_id) { +#ifdef PADDLE_WITH_ASCEND_CL + if (UNLIKELY(dev_id < 0)) { + LOG(ERROR) << string::Sprintf( + "Invalid NPUPlace(%d), device id must be 0 or " + "positive integer", + dev_id); + std::exit(-1); + } + if (UNLIKELY(dev_id >= platform::GetNPUDeviceCount())) { + if (platform::GetNPUDeviceCount() == 0) { + LOG(ERROR) << "Cannot use NPU because there is no NPU " + "detected on your " + "machine."; + std::exit(-1); + } else { + LOG(ERROR) << string::Sprintf( + "Invalid NPUPlace(%d), must inside [0, %d), because NPU " + "number on your machine is %d", + dev_id, platform::GetNPUDeviceCount(), + platform::GetNPUDeviceCount()); + std::exit(-1); + } + } + new (&self) platform::NPUPlace(dev_id); +#else + LOG(ERROR) << string::Sprintf( + "Cannot use NPU because you have installed CPU/GPU version " + "PaddlePaddle.\n" + "If you want to use NPU, please try to install NPU version " + "PaddlePaddle by: pip install paddlepaddle-npu\n" + "If you only have CPU, please change NPUPlace(%d) to be " + "CPUPlace().\n", + dev_id); + std::exit(-1); +#endif + }) + .def("_type", &PlaceIndex) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", + &IsSamePlace) + .def("__str__", string::to_string); + py::class_(m, "Place") .def(py::init<>()) .def("_type", &PlaceIndex) @@ -1583,6 +1678,7 @@ All parameter, weight, gradient are variables in Paddle. .def("_equals", &IsSamePlace) .def("_equals", &IsSamePlace) .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) .def("_equals", &IsSamePlace) .def("is_gpu_place", [](platform::Place &self) { return platform::is_gpu_place(self); }) @@ -1590,6 +1686,8 @@ All parameter, weight, gradient are variables in Paddle.
[](platform::Place &self) { return platform::is_cpu_place(self); }) .def("is_xpu_place", [](platform::Place &self) { return platform::is_xpu_place(self); }) + .def("is_npu_place", + [](platform::Place &self) { return platform::is_npu_place(self); }) .def("is_cuda_pinned_place", [](platform::Place &self) { return platform::is_cuda_pinned_place(self); @@ -1602,6 +1700,10 @@ All parameter, weight, gradient are variables in Paddle. [](platform::Place &self) { return BOOST_GET_CONST(platform::XPUPlace, self).device; }) + .def("npu_device_id", + [](platform::Place &self) { + return BOOST_GET_CONST(platform::NPUPlace, self).device; + }) .def("set_place", [](platform::Place &self, const platform::Place &other) { self = other; }) .def("set_place", @@ -1621,6 +1723,10 @@ All parameter, weight, gradient are variables in Paddle. const platform::CUDAPinnedPlace &cuda_pinned_place) { self = cuda_pinned_place; }) + .def("set_place", + [](platform::Place &self, const platform::NPUPlace &npu_place) { + self = npu_place; + }) .def("__repr__", string::to_string) .def("__str__", string::to_string); @@ -1645,6 +1751,9 @@ All parameter, weight, gradient are variables in Paddle. .def("run", [](OperatorBase &self, const Scope &scope, const platform::XPUPlace &place) { self.Run(scope, place); }) + .def("run", + [](OperatorBase &self, const Scope &scope, + const platform::NPUPlace &place) { self.Run(scope, place); }) .def("run", [](OperatorBase &self, const Scope &scope, const platform::CUDAPlace &place) { self.Run(scope, place); }) @@ -1745,6 +1854,7 @@ All parameter, weight, gradient are variables in Paddle. 
m.def("is_compiled_with_cuda", IsCompiledWithCUDA); m.def("is_compiled_with_ascend", IsCompiledWithAscend); + m.def("is_compiled_with_npu", IsCompiledWithNPU); m.def("is_compiled_with_xpu", IsCompiledWithXPU); m.def("is_compiled_with_mkldnn", IsCompiledWithMKLDNN); m.def("supports_bfloat16", SupportsBfloat16); diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index 7e60c98dc18..51fc3439c9a 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -285,6 +285,22 @@ void SetTensorFromPyArrayT( PADDLE_THROW(platform::errors::PermissionDenied( "Cannot use XPUPlace in CPU/GPU version, " "Please recompile or reinstall Paddle with XPU support.")); +#endif + } else if (paddle::platform::is_npu_place(place)) { +#ifdef PADDLE_WITH_ASCEND_CL + platform::Place tmp_place = place; + platform::NPUDeviceGuard guard( + BOOST_GET_CONST(platform::NPUPlace, tmp_place).device); + auto dst = self->mutable_data(place); + platform::NPUMemcpySync(dst, array.data(), array.nbytes(), + ACL_MEMCPY_HOST_TO_DEVICE); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &ctx = *pool.Get(place); + ctx.Wait(); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Cannot use NPUPlace in CPU/GPU/XPU version. 
" + "Please recompile or reinstall Paddle with NPU support.")); #endif } else { #ifdef PADDLE_WITH_CUDA diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 50043a9b3cf..2c13e6da83b 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -232,6 +232,7 @@ from .framework import ParamAttr #DEFINE_ALIAS from .framework import create_parameter #DEFINE_ALIAS from .framework import CPUPlace #DEFINE_ALIAS from .framework import CUDAPlace #DEFINE_ALIAS +from .framework import NPUPlace #DEFINE_ALIAS from .framework import CUDAPinnedPlace #DEFINE_ALIAS from .framework import grad #DEFINE_ALIAS @@ -256,6 +257,7 @@ from .device import set_device from .device import get_device from .device import is_compiled_with_cuda #DEFINE_ALIAS from .device import is_compiled_with_xpu +from .device import is_compiled_with_npu from .device import XPUPlace # from .tensor.tensor import Tensor #DEFINE_ALIAS # from .tensor.tensor import LoDTensor #DEFINE_ALIAS diff --git a/python/paddle/device.py b/python/paddle/device.py index 2beb92f2c3a..d0bca3df896 100644 --- a/python/paddle/device.py +++ b/python/paddle/device.py @@ -32,12 +32,28 @@ __all__ = [ # 'cuda_places', # 'CUDAPinnedPlace', # 'CUDAPlace', - 'is_compiled_with_cuda' + 'is_compiled_with_cuda', + 'is_compiled_with_npu' ] _cudnn_version = None +def is_compiled_with_npu(): + """ + Whether this whl package can be used to run the model on NPU. + + Returns (bool): `True` if NPU is supported, otherwise `False`. + + Examples: + .. 
code-block:: python + + import paddle + support_npu = paddle.is_compiled_with_npu() + """ + return core.is_compiled_with_npu() + + def is_compiled_with_xpu(): """ Whether paddle was built with WITH_XPU=ON to support Baidu Kunlun @@ -163,6 +179,7 @@ def set_device(device): device_id = device_info_list[1] device_id = int(device_id) place = core.XPUPlace(device_id) + framework._set_expected_place(place) return place diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 1a88d3512ea..5c6ce1dc17a 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -68,7 +68,8 @@ from .input import embedding, one_hot from . import distribute_lookup_table from .param_attr import ParamAttr, WeightNormParamAttr from .data_feeder import DataFeeder -from .core import LoDTensor, LoDTensorArray, CPUPlace, XPUPlace, CUDAPlace, CUDAPinnedPlace, Scope, _Scope +from .core import LoDTensor, LoDTensorArray, Scope, _Scope +from .core import CPUPlace, XPUPlace, CUDAPlace, CUDAPinnedPlace, NPUPlace from .incubate import fleet from .incubate import data_generator from .transpiler import DistributeTranspiler, \ @@ -124,6 +125,7 @@ __all__ = framework.__all__ + executor.__all__ + \ 'XPUPlace', 'CUDAPlace', 'CUDAPinnedPlace', + 'NPUPlace', 'Tensor', 'ParamAttr', 'WeightNormParamAttr', diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 9b0b04a6ea7..acbd00c336b 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -1213,6 +1213,7 @@ class Executor(object): # In distributed training, the compiled program is saved in Program._graph has_compiled_graph = isinstance(program._graph, compiler.CompiledProgram) + if has_compiled_graph: program._graph._compile(scope, self.place) # _graph in program does not support inference since the _graph is optimized diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 08ea46e6961..e17527a3293 100644 --- 
a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -5854,7 +5854,7 @@ def _get_paddle_place(place): if place is None: return place if isinstance(place, (core.Place, core.XPUPlace, core.CPUPlace, - core.CUDAPinnedPlace, core.CUDAPlace)): + core.CUDAPinnedPlace, core.CUDAPlace, core.NPUPlace)): return place if not isinstance(place, str): @@ -5864,9 +5864,11 @@ def _get_paddle_place(place): place = place.lower() if (place == "cpu"): return core.CPUPlace() + if (place == "device"): return core.Place() + # GPU avaliable_gpu_place = re.match(r'gpu:\d+', place) if place == "gpu_pinned" or place == "gpu" or avaliable_gpu_place: if not core.is_compiled_with_cuda(): @@ -5882,6 +5884,8 @@ def _get_paddle_place(place): device_id = place_info_list[1] device_id = int(device_id) return core.CUDAPlace(device_id) + + # XPU avaliable_xpu_place = re.match(r'xpu:\d+', place) if avaliable_xpu_place: if not core.is_compiled_with_xpu(): @@ -5892,9 +5896,22 @@ def _get_paddle_place(place): device_id = place_info_list[1] device_id = int(device_id) return core.XPUPlace(device_id) + + # NPU + avaliable_npu_place = re.match(r'npu:\d+', place) + if avaliable_npu_place: + if not core.is_compiled_with_npu(): + raise ValueError( + "The device should not be {}, since PaddlePaddle is " \ + "not compiled with NPU".format(avaliable_npu_place)) + place_info_list = place.split(':', 1) + device_id = place_info_list[1] + device_id = int(device_id) + return core.NPUPlace(device_id) + raise ValueError( - "paddle support CPUPlace, CUDAPlace,CUDAPinnedPlace and XPUPlace, Please check your Place Input" - ) + "Paddle supports CPUPlace, CUDAPlace,CUDAPinnedPlace, XPUPlace and NPUPlace, but received {}.". 
+ format(place)) def _get_paddle_place_list(places): diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 1f4648f7963..93c15ba6cb7 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -608,6 +608,10 @@ if (WITH_XPU) add_subdirectory(xpu) endif() +if (WITH_ASCEND_CL) + add_subdirectory(npu) +endif() + if (WITH_MKLDNN) add_subdirectory(mkldnn) endif() diff --git a/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt b/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt new file mode 100644 index 00000000000..f71e04c09aa --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt @@ -0,0 +1,6 @@ +file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") +string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") + +foreach(TEST_OP ${TEST_OPS}) + py_test_modules(${TEST_OP} MODULES ${TEST_OP}) +endforeach(TEST_OP) diff --git a/python/paddle/fluid/tests/unittests/npu/test_elementwise_add_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_elementwise_add_op_npu.py new file mode 100644 index 00000000000..47da4fdb23e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_elementwise_add_op_npu.py @@ -0,0 +1,162 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest, _set_use_system_allocator +import paddle +import paddle.fluid as fluid + +paddle.enable_static() + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestElementwiseAddOp(OpTest): + def setUp(self): + self.set_npu() + self.op_type = "elementwise_add" + self.place = paddle.NPUPlace(0) + self.init_dtype() + self.init_input_output() + self.init_kernel_type() + self.init_axis() + + self.inputs = { + 'X': OpTest.np_dtype_to_fluid_dtype(self.x), + 'Y': OpTest.np_dtype_to_fluid_dtype(self.y) + } + self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn} + self.outputs = {'Out': self.out} + + def set_npu(self): + self.__class__.use_npu = True + + def init_kernel_type(self): + self.use_mkldnn = False + + def init_input_output(self): + self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) + self.y = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) + self.out = np.add(self.x, self.y) + + def init_dtype(self): + self.dtype = np.float32 + + def init_axis(self): + self.axis = -1 + + def test_check_output(self): + self.check_output_with_place(self.place, check_dygraph=False) + + # TODO(ascendrc): Test grad op after it is implemented. 
+ # def test_check_grad_normal(self): + # self.check_grad_with_place( + # self.place, ['X', 'Y'], + # 'Out', + # max_relative_error=0.006, + # check_dygraph=False) + # + # def test_check_grad_ingore_x(self): + # self.check_grad_with_place( + # self.place, ['Y'], + # 'Out', + # no_grad_set=set("X"), + # max_relative_error=0.006, + # check_dygraph=False) + # + # def test_check_grad_ingore_y(self): + # self.check_grad_with_place( + # self.place, ['X'], + # 'Out', + # no_grad_set=set("Y"), + # max_relative_error=0.006,check_dygraph=False) + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestAddAPI(unittest.TestCase): + def test_name(self): + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.static.data(name="x", shape=[2, 3], dtype="float32") + y = paddle.static.data(name='y', shape=[2, 3], dtype='float32') + + y_1 = paddle.add(x, y, name='add_res') + self.assertEqual(('add_res' in y_1.name), True) + + def test_static(self): + with paddle.static.program_guard(paddle.static.Program()): + + x_np = np.array([2, 3, 4]).astype('float32') + y_np = np.array([1, 5, 2]).astype('float32') + + x = paddle.static.data(name="x", shape=[3], dtype='float32') + y = paddle.static.data(name="y", shape=[3], dtype='float32') + + x_reshape = paddle.reshape(x, [3, 1]) + y_reshape = paddle.reshape(y, [3, 1]) + z = paddle.add(x_reshape, y_reshape) + z = paddle.reshape(z, shape=[3]) + + place = paddle.NPUPlace(0) + exe = paddle.static.Executor(place) + x_value, y_value, z_value = exe.run(feed={"x": x_np, + "y": y_np}, + fetch_list=[x, y, z]) + + z_expected = np.array([3., 8., 6.]) + self.assertEqual( + (x_value == x_np).all(), + True, + msg="x_value = {}, but expected {}".format(x_value, x_np)) + self.assertEqual( + (y_value == y_np).all(), + True, + msg="y_value = {}, but expected {}".format(y_value, y_np)) + self.assertEqual( + (z_value == z_expected).all(), + True, + msg="z_value = {}, but expected {}".format(z_value, 
z_expected)) + + def test_backward(self): + # TODO(ascendrc): Test backward after add grad npu op implemented. + pass + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestAddError(unittest.TestCase): + def test_errors(self): + with paddle.static.program_guard(paddle.static.Program()): + # the input of elementwise_add must be Variable. + x1 = fluid.create_lod_tensor( + np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.NPUPlace(0)) + y1 = fluid.create_lod_tensor( + np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.NPUPlace(0)) + self.assertRaises(TypeError, paddle.add, x1, y1) + + # the input dtype must be float16 or float32 or float64 or int32 or int64 + x2 = paddle.static.data( + name='x2', shape=[3, 4, 5, 6], dtype="uint8") + y2 = paddle.static.data( + name='y2', shape=[3, 4, 5, 6], dtype="uint8") + self.assertRaises(TypeError, paddle.add, x2, y2) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_elementwise_sub_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_elementwise_sub_op_npu.py new file mode 100644 index 00000000000..8c6c7b46f49 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_elementwise_sub_op_npu.py @@ -0,0 +1,224 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid + +paddle.enable_static() + +SEED = 2021 + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestElementwiseSubOp(OpTest): + def setUp(self): + self.set_npu() + self.op_type = "elementwise_sub" + self.place = paddle.NPUPlace(0) + self.init_dtype() + self.init_input_output() + self.init_kernel_type() + self.init_axis() + + self.inputs = { + 'X': OpTest.np_dtype_to_fluid_dtype(self.x), + 'Y': OpTest.np_dtype_to_fluid_dtype(self.y) + } + self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn} + self.outputs = {'Out': self.out} + + def set_npu(self): + self.__class__.use_npu = True + + def init_kernel_type(self): + self.use_mkldnn = False + + def init_input_output(self): + self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) + self.y = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) + self.out = np.subtract(self.x, self.y) + + def init_dtype(self): + self.dtype = np.float32 + + def init_axis(self): + self.axis = 0 + + def test_check_output(self): + self.check_output_with_place(self.place, check_dygraph=False) + + # TODO(ascendrc): For grad tests, OpTest raises FatalError:Segmentation fault + # when calling op.run, which may be caused by system environment exception + # and the exact cause has not been located.
+ # def test_check_grad_normal(self): + # self.check_grad_with_place( + # self.place, ['X', 'Y'], + # 'Out', + # max_relative_error=0.006, + # check_dygraph=False) + # + # def test_check_grad_ingore_x(self): + # self.check_grad_with_place( + # self.place, ['Y'], + # 'Out', + # no_grad_set=set("X"), + # max_relative_error=0.006, + # check_dygraph=False) + # + # def test_check_grad_ingore_y(self): + # self.check_grad_with_place( + # self.place, ['X'], + # 'Out', + # no_grad_set=set("Y"), + # max_relative_error=0.006,check_dygraph=False) + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestSubtractAPI(unittest.TestCase): + def test_name(self): + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.static.data(name="x", shape=[2, 3], dtype="float32") + y = paddle.static.data(name='y', shape=[2, 3], dtype='float32') + + y_1 = paddle.subtract(x, y, name='add_res') + self.assertEqual(('add_res' in y_1.name), True) + + def test_static(self): + with paddle.static.program_guard(paddle.static.Program()): + + x_np = np.array([2, 3, 4]).astype('float32') + y_np = np.array([1, 5, 2]).astype('float32') + + x = paddle.static.data(name="x", shape=[3], dtype='float32') + y = paddle.static.data(name="y", shape=[3], dtype='float32') + + x_reshape = paddle.reshape(x, [3, 1]) + y_reshape = paddle.reshape(y, [3, 1]) + z = paddle.subtract(x_reshape, y_reshape) + z = paddle.reshape(z, shape=[3]) + + place = paddle.NPUPlace(0) + exe = paddle.static.Executor(place) + x_value, y_value, z_value = exe.run(feed={"x": x_np, + "y": y_np}, + fetch_list=[x, y, z]) + + z_expected = np.array([1., -2., 2.]) + self.assertEqual( + (x_value == x_np).all(), + True, + msg="x_value = {}, but expected {}".format(x_value, x_np)) + self.assertEqual( + (y_value == y_np).all(), + True, + msg="y_value = {}, but expected {}".format(y_value, y_np)) + self.assertEqual( + (z_value == z_expected).all(), + True, + msg="z_value = {}, but expected 
{}".format(z_value, z_expected)) + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestSubtractError(unittest.TestCase): + def test_errors(self): + with paddle.static.program_guard(paddle.static.Program()): + # the input of elementwise_sub must be Variable. + x1 = fluid.create_lod_tensor( + np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.NPUPlace(0)) + y1 = fluid.create_lod_tensor( + np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.NPUPlace(0)) + self.assertRaises(TypeError, paddle.subtract, x1, y1) + + # the input dtype must be float16 or float32 or float64 or int32 or int64 + x2 = paddle.static.data( + name='x2', shape=[3, 4, 5, 6], dtype="uint8") + y2 = paddle.static.data( + name='y2', shape=[3, 4, 5, 6], dtype="uint8") + self.assertRaises(TypeError, paddle.subtract, x2, y2) + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestSubtractNet(unittest.TestCase): + def _test(self, run_npu=True): + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = SEED + startup_prog.random_seed = SEED + np.random.seed(SEED) + + a_np = np.random.random(size=(32, 32)).astype('float32') + b_np = np.random.random(size=(32, 32)).astype('float32') + label_np = np.random.randint(2, size=(32, 1)).astype('int64') + + with paddle.static.program_guard(main_prog, startup_prog): + a = paddle.static.data(name="a", shape=[32, 32], dtype='float32') + b = paddle.static.data(name="b", shape=[32, 32], dtype='float32') + label = paddle.static.data( + name="label", shape=[32, 1], dtype='int64') + + sum = paddle.add(a, b) + c = paddle.assign(b) + z = paddle.subtract(sum, c) + + fc_1 = fluid.layers.fc(input=z, size=128) + prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') + + cost = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.reduce_mean(cost) + sgd = fluid.optimizer.SGD(learning_rate=0.01) + sgd.minimize(loss)
+ + if run_npu: + place = paddle.NPUPlace(0) + else: + place = paddle.CPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + for epoch in range(100): + + pred_res, loss_res = exe.run( + main_prog, + feed={"a": a_np, + "b": b_np, + "label": label_np}, + fetch_list=[prediction, loss]) + if epoch % 10 == 0: + print("Epoch {} | Prediction[0]: {}, Loss: {}".format( + epoch, pred_res[0], loss_res)) + + return pred_res, loss_res + + def test_npu(self): + npu_pred, npu_loss = self._test(True) + cpu_pred, cpu_loos = self._test(False) + + self.assertTrue(np.allclose(npu_pred, cpu_pred)) + self.assertTrue(np.allclose(npu_loss, cpu_loos)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_npu_place.py b/python/paddle/fluid/tests/unittests/npu/test_npu_place.py new file mode 100644 index 00000000000..3f71fad2b9c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_npu_place.py @@ -0,0 +1,61 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import paddle +import numpy as np +from paddle.fluid import core + +paddle.enable_static() + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestNpuPlace(unittest.TestCase): + def test(self): + p = core.Place() + p.set_place(paddle.NPUPlace(0)) + + self.assertTrue(p.is_npu_place()) + self.assertEqual(p.npu_device_id(), 0) + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestNpuPlaceError(unittest.TestCase): + def test_static(self): + # NPU is not supported in ParallelExecutor + prog = paddle.static.Program() + with paddle.static.program_guard(prog): + + x_np = np.array([2, 3, 4]).astype('float32') + y_np = np.array([1, 5, 2]).astype('float32') + + x = paddle.static.data(name="x", shape=[3], dtype='float32') + y = paddle.static.data(name="y", shape=[3], dtype='float32') + z = paddle.add(x, y) + + compiled_prog = paddle.static.CompiledProgram(prog) + place = paddle.NPUPlace(0) + exe = paddle.static.Executor(place) + + with self.assertRaisesRegex(RuntimeError, + "NPU is not supported in ParallelExecutor"): + exe.run(compiled_prog, feed={"x": x_np, "y": y_np}) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index e3e84a73301..efce2e770b1 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -243,7 +243,10 @@ class OpTest(unittest.TestCase): np.random.seed(123) random.seed(124) - cls._use_system_allocator = _set_use_system_allocator(True) + if paddle.is_compiled_with_npu(): + cls._use_system_allocator = _set_use_system_allocator(False) + else: + cls._use_system_allocator = _set_use_system_allocator(True) @classmethod def tearDownClass(cls): @@ -272,6 +275,9 @@ class OpTest(unittest.TestCase): def is_mkldnn_op_test(): return hasattr(cls, 
"use_mkldnn") and cls.use_mkldnn == True + def is_npu_op_test(): + return hasattr(cls, "use_npu") and cls.use_npu == True + if not hasattr(cls, "op_type"): raise AssertionError( "This test do not have op_type in class attrs, " @@ -292,7 +298,8 @@ class OpTest(unittest.TestCase): and cls.op_type not in op_accuracy_white_list.NO_FP64_CHECK_GRAD_OP_LIST \ and not hasattr(cls, 'exist_fp64_check_grad') \ and not is_xpu_op_test() \ - and not is_mkldnn_op_test(): + and not is_mkldnn_op_test() \ + and not is_npu_op_test(): raise AssertionError( "This test of %s op needs check_grad with fp64 precision." % cls.op_type) @@ -1183,7 +1190,8 @@ class OpTest(unittest.TestCase): # Check inplace for given op, its grad op, its grad_grad op, etc. # No effect on original OpTest # Currently not support ParallelExecutor on XPUPlace. - if not paddle.is_compiled_with_xpu(): + if not paddle.is_compiled_with_xpu( + ) and not paddle.is_compiled_with_npu(): self.check_inplace_output_with_place( place, no_check_set=no_check_set, inplace_atol=inplace_atol) diff --git a/python/paddle/fluid/tests/unittests/test_device.py b/python/paddle/fluid/tests/unittests/test_device.py index 195337e80de..08697a08044 100644 --- a/python/paddle/fluid/tests/unittests/test_device.py +++ b/python/paddle/fluid/tests/unittests/test_device.py @@ -15,54 +15,39 @@ from __future__ import print_function import unittest -from op_test import OpTest -import numpy as np +import paddle import paddle.fluid as fluid import paddle.fluid.core as core import paddle.fluid.framework as framework -import warnings -import paddle class TestStaticDeviceManage(unittest.TestCase): - def test_cpu_device(self): - paddle.set_device('cpu') + def _test_device(self, device_name, device_class): + paddle.set_device(device_name) + out1 = paddle.zeros(shape=[1, 3], dtype='float32') out2 = paddle.ones(shape=[1, 3], dtype='float32') out3 = paddle.concat(x=[out1, out2], axis=0) - exe = paddle.fluid.Executor() + + exe = paddle.static.Executor() 
exe.run(paddle.fluid.default_startup_program()) res = exe.run(fetch_list=[out3]) + device = paddle.get_device() - self.assertEqual(isinstance(exe.place, core.CPUPlace), True) - self.assertEqual(device, "cpu") + self.assertEqual(isinstance(exe.place, device_class), True) + self.assertEqual(device, device_name) + + def test_cpu_device(self): + self._test_device("cpu", core.CPUPlace) def test_gpu_device(self): if core.is_compiled_with_cuda(): - out1 = paddle.zeros(shape=[1, 3], dtype='float32') - out2 = paddle.ones(shape=[1, 3], dtype='float32') - out3 = paddle.concat(x=[out1, out2], axis=0) - paddle.set_device('gpu:0') - exe = paddle.fluid.Executor() - exe.run(paddle.fluid.default_startup_program()) - res = exe.run(fetch_list=[out3]) - device = paddle.get_device() - self.assertEqual(isinstance(exe.place, core.CUDAPlace), True) - self.assertEqual(device, "gpu:0") + self._test_device("gpu:0", core.CUDAPlace) def test_xpu_device(self): if core.is_compiled_with_xpu(): - out1 = paddle.zeros(shape=[1, 3], dtype='float32') - out2 = paddle.ones(shape=[1, 3], dtype='float32') - out3 = paddle.concat(x=[out1, out2], axis=0) - paddle.set_device('xpu:0') - exe = paddle.fluid.Executor() - exe.run(paddle.fluid.default_startup_program()) - res = exe.run(fetch_list=[out3]) - device = paddle.get_device() - self.assertEqual(isinstance(exe.place, core.XPUPlace), True) - self.assertEqual(device, "xpu:0") + self._test_device("xpu:0", core.XPUPlace) class TestImperativeDeviceManage(unittest.TestCase): diff --git a/python/paddle/framework/__init__.py b/python/paddle/framework/__init__.py index f2b6888d7a7..5a616d81659 100644 --- a/python/paddle/framework/__init__.py +++ b/python/paddle/framework/__init__.py @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-# TODO: import framework api under this directory +# TODO: import framework api under this directory __all__ = [ 'create_parameter', 'ParamAttr', 'CPUPlace', 'CUDAPlace', 'CUDAPinnedPlace', - 'get_default_dtype', 'set_default_dtype' + 'NPUPlace', 'get_default_dtype', 'set_default_dtype' ] __all__ += ['grad', 'LayerList', 'load', 'save', 'no_grad', 'DataParallel'] @@ -31,6 +31,7 @@ from ..fluid.layers.tensor import create_parameter #DEFINE_ALIAS from ..fluid.core import CPUPlace #DEFINE_ALIAS from ..fluid.core import CUDAPlace #DEFINE_ALIAS from ..fluid.core import CUDAPinnedPlace #DEFINE_ALIAS +from ..fluid.core import NPUPlace #DEFINE_ALIAS from ..fluid.core import VarBase #DEFINE_ALIAS from paddle.fluid import core #DEFINE_ALIAS -- GitLab