未验证 提交 1435b4c0 编写于 作者: L liym27 提交者: GitHub

[NPU] Support executor with NPU (#31057)

* [NPU] Support executor with NPU

* Fix code according to reviews

* Fix code

* Add unittest for sub op npu
上级 678a3e8f
...@@ -466,6 +466,14 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx, ...@@ -466,6 +466,14 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx,
#else #else
PADDLE_THROW( PADDLE_THROW(
platform::errors::Unimplemented("No XPU gc found in CPU/GPU paddle")); platform::errors::Unimplemented("No XPU gc found in CPU/GPU paddle"));
#endif
} else if (platform::is_npu_place(place_)) {
#ifdef PADDLE_WITH_ASCEND_CL
// TODO(ascendrc): Support garbage collector on NPUPlace
VLOG(4) << "Skip NPU gc because it is not implemented now.";
#else
PADDLE_THROW(platform::errors::Unimplemented(
"No NPU gc found in CPU/GPU/XPU paddle"));
#endif #endif
} }
} }
......
...@@ -1275,6 +1275,16 @@ void OperatorWithKernel::ChooseKernel(const RuntimeContext& ctx, ...@@ -1275,6 +1275,16 @@ void OperatorWithKernel::ChooseKernel(const RuntimeContext& ctx,
expected_kernel_key.place_ = platform::CPUPlace(); expected_kernel_key.place_ = platform::CPUPlace();
kernel_iter = kernels.find(expected_kernel_key); kernel_iter = kernels.find(expected_kernel_key);
} }
#endif
#ifdef PADDLE_WITH_ASCEND_CL
if (kernel_iter == kernels.end() &&
is_npu_place(expected_kernel_key.place_)) {
VLOG(3) << "missing NPU kernel: " << type_
<< ", expected_kernel_key:" << expected_kernel_key
<< ", fallbacking to CPU one!";
expected_kernel_key.place_ = platform::CPUPlace();
kernel_iter = kernels.find(expected_kernel_key);
}
#endif #endif
PADDLE_ENFORCE_NE(kernel_iter, kernels.end(), PADDLE_ENFORCE_NE(kernel_iter, kernels.end(),
platform::errors::NotFound( platform::errors::NotFound(
......
...@@ -614,6 +614,9 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places, ...@@ -614,6 +614,9 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
const BuildStrategy &build_strategy, const BuildStrategy &build_strategy,
ir::Graph *graph) ir::Graph *graph)
: member_(new ParallelExecutorPrivate(places, scope)) { : member_(new ParallelExecutorPrivate(places, scope)) {
PADDLE_ENFORCE(places.size() > 0 && !is_npu_place(places[0]),
platform::errors::Unavailable(
"NPU is not supported in ParallelExecutor"));
InitP2P(places); InitP2P(places);
ir::InitReaderQueueDeviceCount(graph, *(member_->global_scope_), ir::InitReaderQueueDeviceCount(graph, *(member_->global_scope_),
member_->places_.size()); member_->places_.size());
......
...@@ -101,15 +101,19 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, ...@@ -101,15 +101,19 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
// TODO(zhiqiu): handle different condition like CUDA code below // TODO(zhiqiu): handle different condition like CUDA code below
else if (platform::is_npu_place(src_place) && // NOLINT else if (platform::is_npu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) { platform::is_cpu_place(dst_place)) {
auto stream = reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream(); auto stream =
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream();
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size, stream); BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size,
stream);
} }
else if (platform::is_cpu_place(src_place) && // NOLINT else if (platform::is_cpu_place(src_place) && // NOLINT
platform::is_npu_place(dst_place)) { platform::is_npu_place(dst_place)) {
auto stream = reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream(); auto stream =
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream();
memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size, stream); BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size,
stream);
} }
else if (platform::is_npu_place(src_place) && // NOLINT else if (platform::is_npu_place(src_place) && // NOLINT
platform::is_npu_place(dst_place)) { platform::is_npu_place(dst_place)) {
...@@ -118,9 +122,11 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, ...@@ -118,9 +122,11 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
<< dst_place; << dst_place;
return; return;
} }
auto stream = reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream(); auto stream =
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream();
memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size, stream); BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size,
stream);
} }
else { // NOLINT else { // NOLINT
PADDLE_THROW(platform::errors::Unimplemented( PADDLE_THROW(platform::errors::Unimplemented(
...@@ -336,24 +342,27 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, ...@@ -336,24 +342,27 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL #ifdef PADDLE_WITH_ASCEND_CL
else if (platform::is_npu_place(src_place) && // NOLINT else if (platform::is_npu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) { platform::is_cpu_place(dst_place)) { /* npu -> cpu*/
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size, nullptr); BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size,
nullptr);
} }
else if (platform::is_cpu_place(src_place) && // NOLINT else if (platform::is_cpu_place(src_place) && // NOLINT
platform::is_npu_place(dst_place)) { platform::is_npu_place(dst_place)) { /* cpu -> npu*/
memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size, nullptr); BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size,
nullptr);
} }
else if (platform::is_npu_place(src_place) && // NOLINT else if (platform::is_npu_place(src_place) && // NOLINT
platform::is_npu_place(dst_place)) { platform::is_npu_place(dst_place)) { /* npu -> npu*/
if (src_ptr == dst_ptr) { if (src_ptr == dst_ptr) {
VLOG(3) << "Skip copy the same data sync from " << src_place << " to " VLOG(3) << "Skip copy the same data sync from " << src_place << " to "
<< dst_place; << dst_place;
return; return;
} }
memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size, nullptr); BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size,
nullptr);
} }
else { // NOLINT else { // NOLINT
PADDLE_THROW(platform::errors::Unimplemented( PADDLE_THROW(platform::errors::Unimplemented(
......
...@@ -154,6 +154,14 @@ bool IsCompiledWithXPU() { ...@@ -154,6 +154,14 @@ bool IsCompiledWithXPU() {
#endif #endif
} }
// Reports whether this Paddle binary was built with Ascend NPU support
// (i.e. compiled with PADDLE_WITH_ASCEND_CL defined).
bool IsCompiledWithNPU() {
#ifdef PADDLE_WITH_ASCEND_CL
  return true;
#else
  return false;
#endif
}
bool IsCompiledWithMKLDNN() { bool IsCompiledWithMKLDNN() {
#ifndef PADDLE_WITH_MKLDNN #ifndef PADDLE_WITH_MKLDNN
return false; return false;
...@@ -567,6 +575,10 @@ PYBIND11_MODULE(core_noavx, m) { ...@@ -567,6 +575,10 @@ PYBIND11_MODULE(core_noavx, m) {
[](Tensor &self, paddle::platform::CPUPlace &place) { [](Tensor &self, paddle::platform::CPUPlace &place) {
self.mutable_data<float>(place); self.mutable_data<float>(place);
}) })
.def("_alloc_float",
[](Tensor &self, paddle::platform::NPUPlace &place) {
self.mutable_data<float>(place);
})
.def("_alloc_double", .def("_alloc_double",
[](Tensor &self, paddle::platform::CPUPlace &place) { [](Tensor &self, paddle::platform::CPUPlace &place) {
self.mutable_data<double>(place); self.mutable_data<double>(place);
...@@ -611,6 +623,11 @@ PYBIND11_MODULE(core_noavx, m) { ...@@ -611,6 +623,11 @@ PYBIND11_MODULE(core_noavx, m) {
paddle::framework::proto::VarType::Type type) { paddle::framework::proto::VarType::Type type) {
return reinterpret_cast<uintptr_t>(self.mutable_data(place, type)); return reinterpret_cast<uintptr_t>(self.mutable_data(place, type));
}) })
.def("_mutable_data",
[](Tensor &self, paddle::platform::NPUPlace &place,
paddle::framework::proto::VarType::Type type) {
return reinterpret_cast<uintptr_t>(self.mutable_data(place, type));
})
.def("_clear", &Tensor::clear) .def("_clear", &Tensor::clear)
.def("set", SetTensorFromPyArray<paddle::platform::CPUPlace>, .def("set", SetTensorFromPyArray<paddle::platform::CPUPlace>,
py::arg("array"), py::arg("place"), py::arg("zero_copy") = false) py::arg("array"), py::arg("place"), py::arg("zero_copy") = false)
...@@ -618,6 +635,8 @@ PYBIND11_MODULE(core_noavx, m) { ...@@ -618,6 +635,8 @@ PYBIND11_MODULE(core_noavx, m) {
py::arg("array"), py::arg("place"), py::arg("zero_copy") = false) py::arg("array"), py::arg("place"), py::arg("zero_copy") = false)
.def("set", SetTensorFromPyArray<paddle::platform::CUDAPlace>, .def("set", SetTensorFromPyArray<paddle::platform::CUDAPlace>,
py::arg("array"), py::arg("place"), py::arg("zero_copy") = false) py::arg("array"), py::arg("place"), py::arg("zero_copy") = false)
.def("set", SetTensorFromPyArray<paddle::platform::NPUPlace>,
py::arg("array"), py::arg("place"), py::arg("zero_copy") = false)
.def("set", SetTensorFromPyArray<paddle::platform::CUDAPinnedPlace>, .def("set", SetTensorFromPyArray<paddle::platform::CUDAPinnedPlace>,
py::arg("array"), py::arg("place"), py::arg("zero_copy") = false, py::arg("array"), py::arg("place"), py::arg("zero_copy") = false,
R"DOC( R"DOC(
...@@ -625,7 +644,7 @@ PYBIND11_MODULE(core_noavx, m) { ...@@ -625,7 +644,7 @@ PYBIND11_MODULE(core_noavx, m) {
Args: Args:
lod (numpy.ndarray): The data to set. lod (numpy.ndarray): The data to set.
place (CPUPlace|CUDAPlace|XPUPlace|CUDAPinnedPlace): The place where the place (CPUPlace|CUDAPlace|XPUPlace|CUDAPinnedPlace|NPUPlace): The place where the
LoDTensor is to be set. LoDTensor is to be set.
zero_copy (bool, optional): Whether to share memory with the input numpy array. zero_copy (bool, optional): Whether to share memory with the input numpy array.
This parameter only works with CPUPlace. Default: False. This parameter only works with CPUPlace. Default: False.
...@@ -1348,6 +1367,18 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -1348,6 +1367,18 @@ All parameter, weight, gradient are variables in Paddle.
return new paddle::platform::XPUDeviceContext(place); return new paddle::platform::XPUDeviceContext(place);
#endif #endif
}) })
.def_static("create",
[](paddle::platform::NPUPlace& place)
-> paddle::platform::DeviceContext* {
#ifndef PADDLE_WITH_ASCEND_CL
PADDLE_THROW(
platform::errors::PermissionDenied(
"Cannot use NPUPlace in CPU/GPU/XPU version, "
"Please recompile or reinstall Paddle with NPU support."));
#else
return new paddle::platform::NPUDeviceContext(place);
#endif
})
.def_static("create", .def_static("create",
[](paddle::platform::CUDAPlace& place) [](paddle::platform::CUDAPlace& place)
-> paddle::platform::DeviceContext* { -> paddle::platform::DeviceContext* {
...@@ -1448,6 +1479,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -1448,6 +1479,7 @@ All parameter, weight, gradient are variables in Paddle.
.def("_equals", &IsSamePlace<platform::CUDAPlace, platform::CUDAPlace>) .def("_equals", &IsSamePlace<platform::CUDAPlace, platform::CUDAPlace>)
.def("_equals", &IsSamePlace<platform::CUDAPlace, platform::CPUPlace>) .def("_equals", &IsSamePlace<platform::CUDAPlace, platform::CPUPlace>)
.def("_equals", &IsSamePlace<platform::CUDAPlace, platform::XPUPlace>) .def("_equals", &IsSamePlace<platform::CUDAPlace, platform::XPUPlace>)
.def("_equals", &IsSamePlace<platform::CUDAPlace, platform::NPUPlace>)
.def("_equals", .def("_equals",
&IsSamePlace<platform::CUDAPlace, platform::CUDAPinnedPlace>) &IsSamePlace<platform::CUDAPlace, platform::CUDAPinnedPlace>)
.def("_get_device_id", .def("_get_device_id",
...@@ -1517,6 +1549,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -1517,6 +1549,7 @@ All parameter, weight, gradient are variables in Paddle.
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
m.def("get_xpu_device_count", platform::GetXPUDeviceCount); m.def("get_xpu_device_count", platform::GetXPUDeviceCount);
#endif #endif
py::class_<paddle::platform::CPUPlace>(m, "CPUPlace", R"DOC( py::class_<paddle::platform::CPUPlace>(m, "CPUPlace", R"DOC(
CPUPlace is a descriptor of a device. CPUPlace is a descriptor of a device.
It represents a CPU device on which a tensor will be allocated and a model will run. It represents a CPU device on which a tensor will be allocated and a model will run.
...@@ -1532,6 +1565,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -1532,6 +1565,7 @@ All parameter, weight, gradient are variables in Paddle.
.def("_type", &PlaceIndex<platform::CPUPlace>) .def("_type", &PlaceIndex<platform::CPUPlace>)
.def("_equals", &IsSamePlace<platform::CPUPlace, platform::Place>) .def("_equals", &IsSamePlace<platform::CPUPlace, platform::Place>)
.def("_equals", &IsSamePlace<platform::CPUPlace, platform::XPUPlace>) .def("_equals", &IsSamePlace<platform::CPUPlace, platform::XPUPlace>)
.def("_equals", &IsSamePlace<platform::CPUPlace, platform::NPUPlace>)
.def("_equals", &IsSamePlace<platform::CPUPlace, platform::CUDAPlace>) .def("_equals", &IsSamePlace<platform::CPUPlace, platform::CUDAPlace>)
.def("_equals", &IsSamePlace<platform::CPUPlace, platform::CPUPlace>) .def("_equals", &IsSamePlace<platform::CPUPlace, platform::CPUPlace>)
.def("_equals", .def("_equals",
...@@ -1569,6 +1603,8 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -1569,6 +1603,8 @@ All parameter, weight, gradient are variables in Paddle.
&IsSamePlace<platform::CUDAPinnedPlace, platform::CUDAPlace>) &IsSamePlace<platform::CUDAPinnedPlace, platform::CUDAPlace>)
.def("_equals", .def("_equals",
&IsSamePlace<platform::CUDAPinnedPlace, platform::XPUPlace>) &IsSamePlace<platform::CUDAPinnedPlace, platform::XPUPlace>)
.def("_equals",
&IsSamePlace<platform::CUDAPinnedPlace, platform::NPUPlace>)
.def("_equals", .def("_equals",
&IsSamePlace<platform::CUDAPinnedPlace, platform::CPUPlace>) &IsSamePlace<platform::CUDAPinnedPlace, platform::CPUPlace>)
.def("_equals", .def("_equals",
...@@ -1576,6 +1612,65 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -1576,6 +1612,65 @@ All parameter, weight, gradient are variables in Paddle.
.def("__repr__", string::to_string<const platform::CUDAPinnedPlace &>) .def("__repr__", string::to_string<const platform::CUDAPinnedPlace &>)
.def("__str__", string::to_string<const platform::CUDAPinnedPlace &>); .def("__str__", string::to_string<const platform::CUDAPinnedPlace &>);
// NPUPlace: Python binding for the Ascend NPU device descriptor.
// The __init__ lambda validates the device id eagerly (logging and exiting on
// error) so Python users get an actionable message instead of a later crash.
py::class_<platform::NPUPlace>(m, "NPUPlace", R"DOC(
    NPUPlace is a descriptor of a device.
    It represents a NPU device on which a tensor will be allocated and a model will run.

    Examples:
        .. code-block:: python

            import paddle
            npu_place = paddle.NPUPlace(0)
)DOC")
    .def("__init__",
         [](platform::NPUPlace &self, int dev_id) {
#ifdef PADDLE_WITH_ASCEND_CL
           // Device id must be a non-negative integer.
           if (UNLIKELY(dev_id < 0)) {
             LOG(ERROR) << string::Sprintf(
                 "Invalid NPUPlace(%d), device id must be 0 or "
                 "positive integer",
                 dev_id);
             std::exit(-1);
           }
           // Device id must refer to an NPU that is actually present.
           if (UNLIKELY(dev_id >= platform::GetNPUDeviceCount())) {
             if (platform::GetNPUDeviceCount() == 0) {
               LOG(ERROR) << "Cannot use NPU because there is no NPU "
                             "detected on your "
                             "machine.";
               std::exit(-1);
             } else {
               LOG(ERROR) << string::Sprintf(
                   "Invalid NPUPlace(%d), must inside [0, %d), because NPU "
                   "number on your machine is %d",
                   dev_id, platform::GetNPUDeviceCount(),
                   platform::GetNPUDeviceCount());
               std::exit(-1);
             }
           }
           // Placement-construct into the pybind-allocated storage.
           new (&self) platform::NPUPlace(dev_id);
#else
           // Fixed: the install hint previously said "paddlepaddle-xpu",
           // which is the XPU package, not the NPU one.
           LOG(ERROR) << string::Sprintf(
               "Cannot use NPU because you have installed CPU/GPU version "
               "PaddlePaddle.\n"
               "If you want to use NPU, please try to install NPU version "
               "PaddlePaddle by: pip install paddlepaddle-npu\n"
               "If you only have CPU, please change NPUPlace(%d) to be "
               "CPUPlace().\n",
               dev_id);
           std::exit(-1);
#endif
         })
    .def("_type", &PlaceIndex<platform::NPUPlace>)
    .def("_equals", &IsSamePlace<platform::NPUPlace, platform::Place>)
    .def("_equals", &IsSamePlace<platform::NPUPlace, platform::CUDAPlace>)
    .def("_equals", &IsSamePlace<platform::NPUPlace, platform::CPUPlace>)
    .def("_equals", &IsSamePlace<platform::NPUPlace, platform::XPUPlace>)
    .def("_equals", &IsSamePlace<platform::NPUPlace, platform::NPUPlace>)
    .def("_equals",
         &IsSamePlace<platform::NPUPlace, platform::CUDAPinnedPlace>)
    // Bind __repr__ as well, consistent with the other place classes
    // (CPUPlace/CUDAPlace/CUDAPinnedPlace all expose both).
    .def("__repr__", string::to_string<const platform::NPUPlace &>)
    .def("__str__", string::to_string<const platform::NPUPlace &>);
py::class_<platform::Place>(m, "Place") py::class_<platform::Place>(m, "Place")
.def(py::init<>()) .def(py::init<>())
.def("_type", &PlaceIndex<platform::Place>) .def("_type", &PlaceIndex<platform::Place>)
...@@ -1583,6 +1678,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -1583,6 +1678,7 @@ All parameter, weight, gradient are variables in Paddle.
.def("_equals", &IsSamePlace<platform::Place, platform::CUDAPlace>) .def("_equals", &IsSamePlace<platform::Place, platform::CUDAPlace>)
.def("_equals", &IsSamePlace<platform::Place, platform::CPUPlace>) .def("_equals", &IsSamePlace<platform::Place, platform::CPUPlace>)
.def("_equals", &IsSamePlace<platform::Place, platform::XPUPlace>) .def("_equals", &IsSamePlace<platform::Place, platform::XPUPlace>)
.def("_equals", &IsSamePlace<platform::Place, platform::NPUPlace>)
.def("_equals", &IsSamePlace<platform::Place, platform::CUDAPinnedPlace>) .def("_equals", &IsSamePlace<platform::Place, platform::CUDAPinnedPlace>)
.def("is_gpu_place", .def("is_gpu_place",
[](platform::Place &self) { return platform::is_gpu_place(self); }) [](platform::Place &self) { return platform::is_gpu_place(self); })
...@@ -1590,6 +1686,8 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -1590,6 +1686,8 @@ All parameter, weight, gradient are variables in Paddle.
[](platform::Place &self) { return platform::is_cpu_place(self); }) [](platform::Place &self) { return platform::is_cpu_place(self); })
.def("is_xpu_place", .def("is_xpu_place",
[](platform::Place &self) { return platform::is_xpu_place(self); }) [](platform::Place &self) { return platform::is_xpu_place(self); })
.def("is_npu_place",
[](platform::Place &self) { return platform::is_npu_place(self); })
.def("is_cuda_pinned_place", .def("is_cuda_pinned_place",
[](platform::Place &self) { [](platform::Place &self) {
return platform::is_cuda_pinned_place(self); return platform::is_cuda_pinned_place(self);
...@@ -1602,6 +1700,10 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -1602,6 +1700,10 @@ All parameter, weight, gradient are variables in Paddle.
[](platform::Place &self) { [](platform::Place &self) {
return BOOST_GET_CONST(platform::XPUPlace, self).device; return BOOST_GET_CONST(platform::XPUPlace, self).device;
}) })
.def("npu_device_id",
[](platform::Place &self) {
return BOOST_GET_CONST(platform::NPUPlace, self).device;
})
.def("set_place", [](platform::Place &self, .def("set_place", [](platform::Place &self,
const platform::Place &other) { self = other; }) const platform::Place &other) { self = other; })
.def("set_place", .def("set_place",
...@@ -1621,6 +1723,10 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -1621,6 +1723,10 @@ All parameter, weight, gradient are variables in Paddle.
const platform::CUDAPinnedPlace &cuda_pinned_place) { const platform::CUDAPinnedPlace &cuda_pinned_place) {
self = cuda_pinned_place; self = cuda_pinned_place;
}) })
.def("set_place",
[](platform::Place &self, const platform::NPUPlace &npu_place) {
self = npu_place;
})
.def("__repr__", string::to_string<const platform::Place &>) .def("__repr__", string::to_string<const platform::Place &>)
.def("__str__", string::to_string<const platform::Place &>); .def("__str__", string::to_string<const platform::Place &>);
...@@ -1645,6 +1751,9 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -1645,6 +1751,9 @@ All parameter, weight, gradient are variables in Paddle.
.def("run", .def("run",
[](OperatorBase &self, const Scope &scope, [](OperatorBase &self, const Scope &scope,
const platform::XPUPlace &place) { self.Run(scope, place); }) const platform::XPUPlace &place) { self.Run(scope, place); })
.def("run",
[](OperatorBase &self, const Scope &scope,
const platform::NPUPlace &place) { self.Run(scope, place); })
.def("run", .def("run",
[](OperatorBase &self, const Scope &scope, [](OperatorBase &self, const Scope &scope,
const platform::CUDAPlace &place) { self.Run(scope, place); }) const platform::CUDAPlace &place) { self.Run(scope, place); })
...@@ -1745,6 +1854,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -1745,6 +1854,7 @@ All parameter, weight, gradient are variables in Paddle.
m.def("is_compiled_with_cuda", IsCompiledWithCUDA); m.def("is_compiled_with_cuda", IsCompiledWithCUDA);
m.def("is_compiled_with_ascend", IsCompiledWithAscend); m.def("is_compiled_with_ascend", IsCompiledWithAscend);
m.def("is_compiled_with_npu", IsCompiledWithNPU);
m.def("is_compiled_with_xpu", IsCompiledWithXPU); m.def("is_compiled_with_xpu", IsCompiledWithXPU);
m.def("is_compiled_with_mkldnn", IsCompiledWithMKLDNN); m.def("is_compiled_with_mkldnn", IsCompiledWithMKLDNN);
m.def("supports_bfloat16", SupportsBfloat16); m.def("supports_bfloat16", SupportsBfloat16);
......
...@@ -285,6 +285,22 @@ void SetTensorFromPyArrayT( ...@@ -285,6 +285,22 @@ void SetTensorFromPyArrayT(
PADDLE_THROW(platform::errors::PermissionDenied( PADDLE_THROW(platform::errors::PermissionDenied(
"Cannot use XPUPlace in CPU/GPU version, " "Cannot use XPUPlace in CPU/GPU version, "
"Please recompile or reinstall Paddle with XPU support.")); "Please recompile or reinstall Paddle with XPU support."));
#endif
} else if (paddle::platform::is_npu_place(place)) {
#ifdef PADDLE_WITH_ASCEND_CL
platform::Place tmp_place = place;
platform::NPUDeviceGuard guard(
BOOST_GET_CONST(platform::NPUPlace, tmp_place).device);
auto dst = self->mutable_data<T>(place);
platform::NPUMemcpySync(dst, array.data(), array.nbytes(),
ACL_MEMCPY_HOST_TO_DEVICE);
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &ctx = *pool.Get(place);
ctx.Wait();
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"Cannot use NPUPlace in CPU/GPU/XPU version. "
"Please recompile or reinstall Paddle with NPU support."));
#endif #endif
} else { } else {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
......
...@@ -232,6 +232,7 @@ from .framework import ParamAttr #DEFINE_ALIAS ...@@ -232,6 +232,7 @@ from .framework import ParamAttr #DEFINE_ALIAS
from .framework import create_parameter #DEFINE_ALIAS from .framework import create_parameter #DEFINE_ALIAS
from .framework import CPUPlace #DEFINE_ALIAS from .framework import CPUPlace #DEFINE_ALIAS
from .framework import CUDAPlace #DEFINE_ALIAS from .framework import CUDAPlace #DEFINE_ALIAS
from .framework import NPUPlace #DEFINE_ALIAS
from .framework import CUDAPinnedPlace #DEFINE_ALIAS from .framework import CUDAPinnedPlace #DEFINE_ALIAS
from .framework import grad #DEFINE_ALIAS from .framework import grad #DEFINE_ALIAS
...@@ -256,6 +257,7 @@ from .device import set_device ...@@ -256,6 +257,7 @@ from .device import set_device
from .device import get_device from .device import get_device
from .device import is_compiled_with_cuda #DEFINE_ALIAS from .device import is_compiled_with_cuda #DEFINE_ALIAS
from .device import is_compiled_with_xpu from .device import is_compiled_with_xpu
from .device import is_compiled_with_npu
from .device import XPUPlace from .device import XPUPlace
# from .tensor.tensor import Tensor #DEFINE_ALIAS # from .tensor.tensor import Tensor #DEFINE_ALIAS
# from .tensor.tensor import LoDTensor #DEFINE_ALIAS # from .tensor.tensor import LoDTensor #DEFINE_ALIAS
......
...@@ -32,12 +32,28 @@ __all__ = [ ...@@ -32,12 +32,28 @@ __all__ = [
# 'cuda_places', # 'cuda_places',
# 'CUDAPinnedPlace', # 'CUDAPinnedPlace',
# 'CUDAPlace', # 'CUDAPlace',
'is_compiled_with_cuda' 'is_compiled_with_cuda',
'is_compiled_with_npu'
] ]
_cudnn_version = None _cudnn_version = None
def is_compiled_with_npu():
    """
    Whether this whl package can be used to run the model on NPU.

    Returns (bool): `True` if NPU is supported, otherwise `False`.

    Examples:
        .. code-block:: python

            import paddle
            support_npu = paddle.is_compiled_with_npu()
    """
    # Delegate to the C++ core, which answers based on compile-time flags.
    return core.is_compiled_with_npu()
def is_compiled_with_xpu(): def is_compiled_with_xpu():
""" """
Whether paddle was built with WITH_XPU=ON to support Baidu Kunlun Whether paddle was built with WITH_XPU=ON to support Baidu Kunlun
...@@ -163,6 +179,7 @@ def set_device(device): ...@@ -163,6 +179,7 @@ def set_device(device):
device_id = device_info_list[1] device_id = device_info_list[1]
device_id = int(device_id) device_id = int(device_id)
place = core.XPUPlace(device_id) place = core.XPUPlace(device_id)
framework._set_expected_place(place) framework._set_expected_place(place)
return place return place
......
...@@ -68,7 +68,8 @@ from .input import embedding, one_hot ...@@ -68,7 +68,8 @@ from .input import embedding, one_hot
from . import distribute_lookup_table from . import distribute_lookup_table
from .param_attr import ParamAttr, WeightNormParamAttr from .param_attr import ParamAttr, WeightNormParamAttr
from .data_feeder import DataFeeder from .data_feeder import DataFeeder
from .core import LoDTensor, LoDTensorArray, CPUPlace, XPUPlace, CUDAPlace, CUDAPinnedPlace, Scope, _Scope from .core import LoDTensor, LoDTensorArray, Scope, _Scope
from .core import CPUPlace, XPUPlace, CUDAPlace, CUDAPinnedPlace, NPUPlace
from .incubate import fleet from .incubate import fleet
from .incubate import data_generator from .incubate import data_generator
from .transpiler import DistributeTranspiler, \ from .transpiler import DistributeTranspiler, \
...@@ -124,6 +125,7 @@ __all__ = framework.__all__ + executor.__all__ + \ ...@@ -124,6 +125,7 @@ __all__ = framework.__all__ + executor.__all__ + \
'XPUPlace', 'XPUPlace',
'CUDAPlace', 'CUDAPlace',
'CUDAPinnedPlace', 'CUDAPinnedPlace',
'NPUPlace',
'Tensor', 'Tensor',
'ParamAttr', 'ParamAttr',
'WeightNormParamAttr', 'WeightNormParamAttr',
......
...@@ -1213,6 +1213,7 @@ class Executor(object): ...@@ -1213,6 +1213,7 @@ class Executor(object):
# In distributed training, the compiled program is saved in Program._graph # In distributed training, the compiled program is saved in Program._graph
has_compiled_graph = isinstance(program._graph, has_compiled_graph = isinstance(program._graph,
compiler.CompiledProgram) compiler.CompiledProgram)
if has_compiled_graph: if has_compiled_graph:
program._graph._compile(scope, self.place) program._graph._compile(scope, self.place)
# _graph in program does not support inference since the _graph is optimized # _graph in program does not support inference since the _graph is optimized
......
...@@ -5854,7 +5854,7 @@ def _get_paddle_place(place): ...@@ -5854,7 +5854,7 @@ def _get_paddle_place(place):
if place is None: if place is None:
return place return place
if isinstance(place, (core.Place, core.XPUPlace, core.CPUPlace, if isinstance(place, (core.Place, core.XPUPlace, core.CPUPlace,
core.CUDAPinnedPlace, core.CUDAPlace)): core.CUDAPinnedPlace, core.CUDAPlace, core.NPUPlace)):
return place return place
if not isinstance(place, str): if not isinstance(place, str):
...@@ -5864,9 +5864,11 @@ def _get_paddle_place(place): ...@@ -5864,9 +5864,11 @@ def _get_paddle_place(place):
place = place.lower() place = place.lower()
if (place == "cpu"): if (place == "cpu"):
return core.CPUPlace() return core.CPUPlace()
if (place == "device"): if (place == "device"):
return core.Place() return core.Place()
# GPU
avaliable_gpu_place = re.match(r'gpu:\d+', place) avaliable_gpu_place = re.match(r'gpu:\d+', place)
if place == "gpu_pinned" or place == "gpu" or avaliable_gpu_place: if place == "gpu_pinned" or place == "gpu" or avaliable_gpu_place:
if not core.is_compiled_with_cuda(): if not core.is_compiled_with_cuda():
...@@ -5882,6 +5884,8 @@ def _get_paddle_place(place): ...@@ -5882,6 +5884,8 @@ def _get_paddle_place(place):
device_id = place_info_list[1] device_id = place_info_list[1]
device_id = int(device_id) device_id = int(device_id)
return core.CUDAPlace(device_id) return core.CUDAPlace(device_id)
# XPU
avaliable_xpu_place = re.match(r'xpu:\d+', place) avaliable_xpu_place = re.match(r'xpu:\d+', place)
if avaliable_xpu_place: if avaliable_xpu_place:
if not core.is_compiled_with_xpu(): if not core.is_compiled_with_xpu():
...@@ -5892,9 +5896,22 @@ def _get_paddle_place(place): ...@@ -5892,9 +5896,22 @@ def _get_paddle_place(place):
device_id = place_info_list[1] device_id = place_info_list[1]
device_id = int(device_id) device_id = int(device_id)
return core.XPUPlace(device_id) return core.XPUPlace(device_id)
# NPU
avaliable_npu_place = re.match(r'npu:\d+', place)
if avaliable_npu_place:
if not core.is_compiled_with_npu():
raise ValueError(
"The device should not be {}, since PaddlePaddle is " \
"not compiled with NPU".format(avaliable_npu_place))
place_info_list = place.split(':', 1)
device_id = place_info_list[1]
device_id = int(device_id)
return core.NPUPlace(device_id)
raise ValueError( raise ValueError(
"paddle support CPUPlace, CUDAPlace,CUDAPinnedPlace and XPUPlace, Please check your Place Input" "Paddle supports CPUPlace, CUDAPlace,CUDAPinnedPlace, XPUPlace and NPUPlace, but received {}.".
) format(place))
def _get_paddle_place_list(places): def _get_paddle_place_list(places):
......
...@@ -608,6 +608,10 @@ if (WITH_XPU) ...@@ -608,6 +608,10 @@ if (WITH_XPU)
add_subdirectory(xpu) add_subdirectory(xpu)
endif() endif()
if (WITH_ASCEND_CL)
add_subdirectory(npu)
endif()
if (WITH_MKLDNN) if (WITH_MKLDNN)
add_subdirectory(mkldnn) add_subdirectory(mkldnn)
endif() endif()
......
# Collect every Python unittest file (test_*.py) in this directory.
file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
# Strip the ".py" suffix so each entry is an importable module name.
string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
# Register each discovered module as a ctest target.
foreach(TEST_OP ${TEST_OPS})
py_test_modules(${TEST_OP} MODULES ${TEST_OP})
endforeach(TEST_OP)
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import numpy as np
import unittest
import sys
sys.path.append("..")
from op_test import OpTest, _set_use_system_allocator
import paddle
import paddle.fluid as fluid
paddle.enable_static()
@unittest.skipIf(not paddle.is_compiled_with_npu(),
                 "core is not compiled with NPU")
class TestElementwiseAddOp(OpTest):
    """Checks the NPU elementwise_add kernel output against numpy."""

    def setUp(self):
        # Mark this test as NPU-targeted and describe the op under test.
        self.set_npu()
        self.op_type = "elementwise_add"
        self.place = paddle.NPUPlace(0)

        # Hooks below may be overridden by subclasses to vary the config.
        self.init_dtype()
        self.init_input_output()
        self.init_kernel_type()
        self.init_axis()

        self.inputs = {
            'X': OpTest.np_dtype_to_fluid_dtype(self.x),
            'Y': OpTest.np_dtype_to_fluid_dtype(self.y),
        }
        self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn}
        self.outputs = {'Out': self.out}

    def set_npu(self):
        # Flag consumed by the OpTest machinery to pick the NPU path.
        self.__class__.use_npu = True

    def init_kernel_type(self):
        self.use_mkldnn = False

    def init_input_output(self):
        # Two same-shape operands in (0.1, 1) and their elementwise sum.
        self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
        self.y = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
        self.out = np.add(self.x, self.y)

    def init_dtype(self):
        self.dtype = np.float32

    def init_axis(self):
        # -1 means "broadcast along trailing dimensions" (default).
        self.axis = -1

    def test_check_output(self):
        # Dygraph checking is disabled; only the static kernel is verified.
        self.check_output_with_place(self.place, check_dygraph=False)

    # TODO(ascendrc): enable gradient checks (check_grad_with_place for
    # ['X', 'Y'], ignore-X, and ignore-Y variants) once the NPU grad op
    # for elementwise_add is implemented.
@unittest.skipIf(not paddle.is_compiled_with_npu(),
                 "core is not compiled with NPU")
class TestAddAPI(unittest.TestCase):
    """Static-graph API tests for paddle.add on NPUPlace."""

    def test_name(self):
        # The user-supplied name should be embedded in the output var's name.
        with paddle.static.program_guard(paddle.static.Program()):
            lhs = paddle.static.data(name="x", shape=[2, 3], dtype="float32")
            rhs = paddle.static.data(name='y', shape=[2, 3], dtype='float32')
            result = paddle.add(lhs, rhs, name='add_res')
            self.assertEqual(('add_res' in result.name), True)

    def test_static(self):
        # Build a small program (reshape -> add -> reshape back), run it on
        # the NPU, and compare every fetched value against numpy references.
        with paddle.static.program_guard(paddle.static.Program()):
            lhs_np = np.array([2, 3, 4]).astype('float32')
            rhs_np = np.array([1, 5, 2]).astype('float32')
            lhs = paddle.static.data(name="x", shape=[3], dtype='float32')
            rhs = paddle.static.data(name="y", shape=[3], dtype='float32')
            lhs_col = paddle.reshape(lhs, [3, 1])
            rhs_col = paddle.reshape(rhs, [3, 1])
            total = paddle.add(lhs_col, rhs_col)
            total = paddle.reshape(total, shape=[3])

            exe = paddle.static.Executor(paddle.NPUPlace(0))
            x_value, y_value, z_value = exe.run(
                feed={"x": lhs_np,
                      "y": rhs_np},
                fetch_list=[lhs, rhs, total])

            z_expected = np.array([3., 8., 6.])
            self.assertEqual(
                (x_value == lhs_np).all(),
                True,
                msg="x_value = {}, but expected {}".format(x_value, lhs_np))
            self.assertEqual(
                (y_value == rhs_np).all(),
                True,
                msg="y_value = {}, but expected {}".format(y_value, rhs_np))
            self.assertEqual(
                (z_value == z_expected).all(),
                True,
                msg="z_value = {}, but expected {}".format(z_value,
                                                           z_expected))

    def test_backward(self):
        # TODO(ascendrc): Test backward after add grad npu op implemented.
        pass
@unittest.skipIf(not paddle.is_compiled_with_npu(),
                 "core is not compiled with NPU")
class TestAddError(unittest.TestCase):
    """Invalid-input tests for paddle.add on NPU."""

    def test_errors(self):
        with paddle.static.program_guard(paddle.static.Program()):
            # the input of elementwise_add must be Variable.
            lod_x = fluid.create_lod_tensor(
                np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.NPUPlace(0))
            lod_y = fluid.create_lod_tensor(
                np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.NPUPlace(0))
            self.assertRaises(TypeError, paddle.add, lod_x, lod_y)

            # the input dtype must be float16 or float32 or float64 or int32 or int64
            bad_x = paddle.static.data(
                name='x2', shape=[3, 4, 5, 6], dtype="uint8")
            bad_y = paddle.static.data(
                name='y2', shape=[3, 4, 5, 6], dtype="uint8")
            self.assertRaises(TypeError, paddle.add, bad_x, bad_y)
# Run the tests in this file when executed directly as a script.
if __name__ == '__main__':
    unittest.main()
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import numpy as np
import unittest
import sys
sys.path.append("..")
from op_test import OpTest
import paddle
import paddle.fluid as fluid
paddle.enable_static()
SEED = 2021
@unittest.skipIf(not paddle.is_compiled_with_npu(),
                 "core is not compiled with NPU")
class TestElementwiseSubOp(OpTest):
    # Forward-only correctness test for the elementwise_sub NPU kernel.
    # The OpTest base class introspects the instance attributes assigned in
    # setUp (op_type, place, inputs, attrs, outputs), so those attribute
    # names must not be changed.
    def setUp(self):
        # NOTE: order is significant -- init_dtype() must run before
        # init_input_output() (which reads self.dtype), and
        # init_kernel_type()/init_axis() must run before the attrs dict
        # is assembled below.
        self.set_npu()
        self.op_type = "elementwise_sub"
        self.place = paddle.NPUPlace(0)
        self.init_dtype()
        self.init_input_output()
        self.init_kernel_type()
        self.init_axis()
        self.inputs = {
            'X': OpTest.np_dtype_to_fluid_dtype(self.x),
            'Y': OpTest.np_dtype_to_fluid_dtype(self.y)
        }
        self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn}
        self.outputs = {'Out': self.out}

    def set_npu(self):
        # Class-level flag read by the OpTest machinery to select NPU code
        # paths (see the use_npu handling added to op_test.py in this PR).
        self.__class__.use_npu = True

    def init_kernel_type(self):
        self.use_mkldnn = False

    def init_input_output(self):
        # Same-shape operands drawn from [0.1, 1); the expected output is
        # the numpy reference result.
        self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
        self.y = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
        self.out = np.subtract(self.x, self.y)

    def init_dtype(self):
        self.dtype = np.float32

    def init_axis(self):
        # NOTE(review): the elementwise_add test uses axis = -1; 0 happens
        # to be equivalent here because X and Y have identical shapes --
        # confirm the difference is intentional.
        self.axis = 0

    def test_check_output(self):
        # check_dygraph=False: only the static-graph path is exercised on NPU.
        self.check_output_with_place(self.place, check_dygraph=False)

    # TODO(ascendrc): For grad tests, OpTest raises FatalError:Segmentation fault
    # when call op.run, which may be caused by system environment exception
    # and the exact cause has not be located.
    # def test_check_grad_normal(self):
    #     self.check_grad_with_place(
    #         self.place, ['X', 'Y'],
    #         'Out',
    #         max_relative_error=0.006,
    #         check_dygraph=False)
    #
    # def test_check_grad_ingore_x(self):
    #     self.check_grad_with_place(
    #         self.place, ['Y'],
    #         'Out',
    #         no_grad_set=set("X"),
    #         max_relative_error=0.006,
    #         check_dygraph=False)
    #
    # def test_check_grad_ingore_y(self):
    #     self.check_grad_with_place(
    #         self.place, ['X'],
    #         'Out',
    #         no_grad_set=set("Y"),
    #         max_relative_error=0.006,check_dygraph=False)
@unittest.skipIf(not paddle.is_compiled_with_npu(),
                 "core is not compiled with NPU")
class TestSubtractAPI(unittest.TestCase):
    """Static-graph API tests for paddle.subtract on NPUPlace."""

    def test_name(self):
        # The user-supplied name should be embedded in the output var's
        # name (the 'add_res' literal matches the original test exactly).
        with paddle.static.program_guard(paddle.static.Program()):
            lhs = paddle.static.data(name="x", shape=[2, 3], dtype="float32")
            rhs = paddle.static.data(name='y', shape=[2, 3], dtype='float32')
            result = paddle.subtract(lhs, rhs, name='add_res')
            self.assertEqual(('add_res' in result.name), True)

    def test_static(self):
        # Build a small program (reshape -> subtract -> reshape back), run
        # it on the NPU, and compare every fetched value against numpy.
        with paddle.static.program_guard(paddle.static.Program()):
            lhs_np = np.array([2, 3, 4]).astype('float32')
            rhs_np = np.array([1, 5, 2]).astype('float32')
            lhs = paddle.static.data(name="x", shape=[3], dtype='float32')
            rhs = paddle.static.data(name="y", shape=[3], dtype='float32')
            lhs_col = paddle.reshape(lhs, [3, 1])
            rhs_col = paddle.reshape(rhs, [3, 1])
            diff = paddle.subtract(lhs_col, rhs_col)
            diff = paddle.reshape(diff, shape=[3])

            exe = paddle.static.Executor(paddle.NPUPlace(0))
            x_value, y_value, z_value = exe.run(
                feed={"x": lhs_np,
                      "y": rhs_np},
                fetch_list=[lhs, rhs, diff])

            z_expected = np.array([1., -2., 2.])
            self.assertEqual(
                (x_value == lhs_np).all(),
                True,
                msg="x_value = {}, but expected {}".format(x_value, lhs_np))
            self.assertEqual(
                (y_value == rhs_np).all(),
                True,
                msg="y_value = {}, but expected {}".format(y_value, rhs_np))
            self.assertEqual(
                (z_value == z_expected).all(),
                True,
                msg="z_value = {}, but expected {}".format(z_value,
                                                           z_expected))
@unittest.skipIf(not paddle.is_compiled_with_npu(),
                 "core is not compiled with NPU")
class TestSubtractError(unittest.TestCase):
    """Invalid-input tests for paddle.subtract on NPU."""

    def test_errors(self):
        with paddle.static.program_guard(paddle.static.Program()):
            # the input of elementwise_add must be Variable.
            lod_x = fluid.create_lod_tensor(
                np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.NPUPlace(0))
            lod_y = fluid.create_lod_tensor(
                np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.NPUPlace(0))
            self.assertRaises(TypeError, paddle.subtract, lod_x, lod_y)

            # the input dtype must be float16 or float32 or float64 or int32 or int64
            bad_x = paddle.static.data(
                name='x2', shape=[3, 4, 5, 6], dtype="uint8")
            bad_y = paddle.static.data(
                name='y2', shape=[3, 4, 5, 6], dtype="uint8")
            self.assertRaises(TypeError, paddle.subtract, bad_x, bad_y)
@unittest.skipIf(not paddle.is_compiled_with_npu(),
                 "core is not compiled with NPU")
class TestSubtractNet(unittest.TestCase):
    """End-to-end check: a tiny network containing paddle.subtract must
    produce the same predictions and loss on NPU as on CPU."""

    def _test(self, run_npu=True):
        # Build and train a small classifier whose forward pass includes a
        # subtract, then return the final prediction and loss values.
        main_prog = paddle.static.Program()
        startup_prog = paddle.static.Program()
        main_prog.random_seed = SEED
        startup_prog.random_seed = SEED
        np.random.seed(SEED)

        a_np = np.random.random(size=(32, 32)).astype('float32')
        b_np = np.random.random(size=(32, 32)).astype('float32')
        label_np = np.random.randint(2, size=(32, 1)).astype('int64')

        with paddle.static.program_guard(main_prog, startup_prog):
            a = paddle.static.data(name="a", shape=[32, 32], dtype='float32')
            b = paddle.static.data(name="b", shape=[32, 32], dtype='float32')
            label = paddle.static.data(
                name="label", shape=[32, 1], dtype='int64')

            # z = (a + b) - b, exercising add, assign and subtract together.
            total = paddle.add(a, b)
            c = paddle.assign(b)
            z = paddle.subtract(total, c)

            fc_1 = fluid.layers.fc(input=z, size=128)
            prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax')

            cost = fluid.layers.cross_entropy(input=prediction, label=label)
            loss = fluid.layers.reduce_mean(cost)
            sgd = fluid.optimizer.SGD(learning_rate=0.01)
            sgd.minimize(loss)

        place = paddle.NPUPlace(0) if run_npu else paddle.CPUPlace()
        exe = paddle.static.Executor(place)
        exe.run(startup_prog)

        for epoch in range(100):
            pred_res, loss_res = exe.run(
                main_prog,
                feed={"a": a_np,
                      "b": b_np,
                      "label": label_np},
                fetch_list=[prediction, loss])
            if epoch % 10 == 0:
                print("Epoch {} | Prediction[0]: {}, Loss: {}".format(
                    epoch, pred_res[0], loss_res))

        return pred_res, loss_res

    def test_npu(self):
        # NPU results must agree with the CPU reference within allclose
        # tolerance.
        npu_pred, npu_loss = self._test(True)
        cpu_pred, cpu_loss = self._test(False)
        self.assertTrue(np.allclose(npu_pred, cpu_pred))
        self.assertTrue(np.allclose(npu_loss, cpu_loss))
# Run the tests in this file when executed directly as a script.
if __name__ == '__main__':
    unittest.main()
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import paddle
import numpy as np
from paddle.fluid import core
paddle.enable_static()
@unittest.skipIf(not paddle.is_compiled_with_npu(),
                 "core is not compiled with NPU")
class TestNpuPlace(unittest.TestCase):
    """Check that paddle.NPUPlace round-trips through core.Place."""

    def test(self):
        plc = core.Place()
        plc.set_place(paddle.NPUPlace(0))
        self.assertTrue(plc.is_npu_place())
        self.assertEqual(plc.npu_device_id(), 0)
@unittest.skipIf(not paddle.is_compiled_with_npu(),
                 "core is not compiled with NPU")
class TestNpuPlaceError(unittest.TestCase):
    """Running a CompiledProgram (ParallelExecutor path) on NPU must fail
    with a clear error message."""

    def test_static(self):
        # NPU is not supported in ParallelExecutor
        prog = paddle.static.Program()
        with paddle.static.program_guard(prog):
            lhs_np = np.array([2, 3, 4]).astype('float32')
            rhs_np = np.array([1, 5, 2]).astype('float32')
            lhs = paddle.static.data(name="x", shape=[3], dtype='float32')
            rhs = paddle.static.data(name="y", shape=[3], dtype='float32')
            out = paddle.add(lhs, rhs)

        compiled_prog = paddle.static.CompiledProgram(prog)
        exe = paddle.static.Executor(paddle.NPUPlace(0))

        with self.assertRaisesRegex(RuntimeError,
                                    "NPU is not supported in ParallelExecutor"):
            exe.run(compiled_prog, feed={"x": lhs_np, "y": rhs_np})
# Run the tests in this file when executed directly as a script.
if __name__ == '__main__':
    unittest.main()
...@@ -243,7 +243,10 @@ class OpTest(unittest.TestCase): ...@@ -243,7 +243,10 @@ class OpTest(unittest.TestCase):
np.random.seed(123) np.random.seed(123)
random.seed(124) random.seed(124)
cls._use_system_allocator = _set_use_system_allocator(True) if paddle.is_compiled_with_npu():
cls._use_system_allocator = _set_use_system_allocator(False)
else:
cls._use_system_allocator = _set_use_system_allocator(True)
@classmethod @classmethod
def tearDownClass(cls): def tearDownClass(cls):
...@@ -272,6 +275,9 @@ class OpTest(unittest.TestCase): ...@@ -272,6 +275,9 @@ class OpTest(unittest.TestCase):
def is_mkldnn_op_test(): def is_mkldnn_op_test():
return hasattr(cls, "use_mkldnn") and cls.use_mkldnn == True return hasattr(cls, "use_mkldnn") and cls.use_mkldnn == True
def is_npu_op_test():
return hasattr(cls, "use_npu") and cls.use_npu == True
if not hasattr(cls, "op_type"): if not hasattr(cls, "op_type"):
raise AssertionError( raise AssertionError(
"This test do not have op_type in class attrs, " "This test do not have op_type in class attrs, "
...@@ -292,7 +298,8 @@ class OpTest(unittest.TestCase): ...@@ -292,7 +298,8 @@ class OpTest(unittest.TestCase):
and cls.op_type not in op_accuracy_white_list.NO_FP64_CHECK_GRAD_OP_LIST \ and cls.op_type not in op_accuracy_white_list.NO_FP64_CHECK_GRAD_OP_LIST \
and not hasattr(cls, 'exist_fp64_check_grad') \ and not hasattr(cls, 'exist_fp64_check_grad') \
and not is_xpu_op_test() \ and not is_xpu_op_test() \
and not is_mkldnn_op_test(): and not is_mkldnn_op_test() \
and not is_npu_op_test():
raise AssertionError( raise AssertionError(
"This test of %s op needs check_grad with fp64 precision." % "This test of %s op needs check_grad with fp64 precision." %
cls.op_type) cls.op_type)
...@@ -1183,7 +1190,8 @@ class OpTest(unittest.TestCase): ...@@ -1183,7 +1190,8 @@ class OpTest(unittest.TestCase):
# Check inplace for given op, its grad op, its grad_grad op, etc. # Check inplace for given op, its grad op, its grad_grad op, etc.
# No effect on original OpTest # No effect on original OpTest
# Currently not support ParallelExecutor on XPUPlace. # Currently not support ParallelExecutor on XPUPlace.
if not paddle.is_compiled_with_xpu(): if not paddle.is_compiled_with_xpu(
) and not paddle.is_compiled_with_npu():
self.check_inplace_output_with_place( self.check_inplace_output_with_place(
place, no_check_set=no_check_set, inplace_atol=inplace_atol) place, no_check_set=no_check_set, inplace_atol=inplace_atol)
......
...@@ -15,54 +15,39 @@ ...@@ -15,54 +15,39 @@
from __future__ import print_function from __future__ import print_function
import unittest import unittest
from op_test import OpTest
import numpy as np import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.core as core import paddle.fluid.core as core
import paddle.fluid.framework as framework import paddle.fluid.framework as framework
import warnings
import paddle
class TestStaticDeviceManage(unittest.TestCase): class TestStaticDeviceManage(unittest.TestCase):
def test_cpu_device(self): def _test_device(self, device_name, device_class):
paddle.set_device('cpu') paddle.set_device(device_name)
out1 = paddle.zeros(shape=[1, 3], dtype='float32') out1 = paddle.zeros(shape=[1, 3], dtype='float32')
out2 = paddle.ones(shape=[1, 3], dtype='float32') out2 = paddle.ones(shape=[1, 3], dtype='float32')
out3 = paddle.concat(x=[out1, out2], axis=0) out3 = paddle.concat(x=[out1, out2], axis=0)
exe = paddle.fluid.Executor()
exe = paddle.static.Executor()
exe.run(paddle.fluid.default_startup_program()) exe.run(paddle.fluid.default_startup_program())
res = exe.run(fetch_list=[out3]) res = exe.run(fetch_list=[out3])
device = paddle.get_device() device = paddle.get_device()
self.assertEqual(isinstance(exe.place, core.CPUPlace), True) self.assertEqual(isinstance(exe.place, device_class), True)
self.assertEqual(device, "cpu") self.assertEqual(device, device_name)
def test_cpu_device(self):
self._test_device("cpu", core.CPUPlace)
def test_gpu_device(self): def test_gpu_device(self):
if core.is_compiled_with_cuda(): if core.is_compiled_with_cuda():
out1 = paddle.zeros(shape=[1, 3], dtype='float32') self._test_device("gpu:0", core.CUDAPlace)
out2 = paddle.ones(shape=[1, 3], dtype='float32')
out3 = paddle.concat(x=[out1, out2], axis=0)
paddle.set_device('gpu:0')
exe = paddle.fluid.Executor()
exe.run(paddle.fluid.default_startup_program())
res = exe.run(fetch_list=[out3])
device = paddle.get_device()
self.assertEqual(isinstance(exe.place, core.CUDAPlace), True)
self.assertEqual(device, "gpu:0")
def test_xpu_device(self): def test_xpu_device(self):
if core.is_compiled_with_xpu(): if core.is_compiled_with_xpu():
out1 = paddle.zeros(shape=[1, 3], dtype='float32') self._test_device("xpu:0", core.XPUPlace)
out2 = paddle.ones(shape=[1, 3], dtype='float32')
out3 = paddle.concat(x=[out1, out2], axis=0)
paddle.set_device('xpu:0')
exe = paddle.fluid.Executor()
exe.run(paddle.fluid.default_startup_program())
res = exe.run(fetch_list=[out3])
device = paddle.get_device()
self.assertEqual(isinstance(exe.place, core.XPUPlace), True)
self.assertEqual(device, "xpu:0")
class TestImperativeDeviceManage(unittest.TestCase): class TestImperativeDeviceManage(unittest.TestCase):
......
...@@ -12,10 +12,10 @@ ...@@ -12,10 +12,10 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
# TODO: import framework api under this directory # TODO: import framework api under this directory
__all__ = [ __all__ = [
'create_parameter', 'ParamAttr', 'CPUPlace', 'CUDAPlace', 'CUDAPinnedPlace', 'create_parameter', 'ParamAttr', 'CPUPlace', 'CUDAPlace', 'CUDAPinnedPlace',
'get_default_dtype', 'set_default_dtype' 'NPUPlace', 'get_default_dtype', 'set_default_dtype'
] ]
__all__ += ['grad', 'LayerList', 'load', 'save', 'no_grad', 'DataParallel'] __all__ += ['grad', 'LayerList', 'load', 'save', 'no_grad', 'DataParallel']
...@@ -31,6 +31,7 @@ from ..fluid.layers.tensor import create_parameter #DEFINE_ALIAS ...@@ -31,6 +31,7 @@ from ..fluid.layers.tensor import create_parameter #DEFINE_ALIAS
from ..fluid.core import CPUPlace #DEFINE_ALIAS from ..fluid.core import CPUPlace #DEFINE_ALIAS
from ..fluid.core import CUDAPlace #DEFINE_ALIAS from ..fluid.core import CUDAPlace #DEFINE_ALIAS
from ..fluid.core import CUDAPinnedPlace #DEFINE_ALIAS from ..fluid.core import CUDAPinnedPlace #DEFINE_ALIAS
from ..fluid.core import NPUPlace #DEFINE_ALIAS
from ..fluid.core import VarBase #DEFINE_ALIAS from ..fluid.core import VarBase #DEFINE_ALIAS
from paddle.fluid import core #DEFINE_ALIAS from paddle.fluid import core #DEFINE_ALIAS
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册