未验证 提交 1435b4c0 编写于 作者: L liym27 提交者: GitHub

[NPU] Support executor with NPU (#31057)

* [NPU] Support executor with NPU

* Fix code according to reviews

* Fix code

* Add unittest for sub op npu
上级 678a3e8f
...@@ -466,6 +466,14 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx, ...@@ -466,6 +466,14 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx,
#else #else
PADDLE_THROW( PADDLE_THROW(
platform::errors::Unimplemented("No XPU gc found in CPU/GPU paddle")); platform::errors::Unimplemented("No XPU gc found in CPU/GPU paddle"));
#endif
} else if (platform::is_npu_place(place_)) {
#ifdef PADDLE_WITH_ASCEND_CL
// TODO(ascendrc): Support garbage collector on NPUPlace
VLOG(4) << "Skip NPU gc because it is not implemented now.";
#else
PADDLE_THROW(platform::errors::Unimplemented(
"No NPU gc found in CPU/GPU/XPU paddle"));
#endif #endif
} }
} }
......
...@@ -1275,6 +1275,16 @@ void OperatorWithKernel::ChooseKernel(const RuntimeContext& ctx, ...@@ -1275,6 +1275,16 @@ void OperatorWithKernel::ChooseKernel(const RuntimeContext& ctx,
expected_kernel_key.place_ = platform::CPUPlace(); expected_kernel_key.place_ = platform::CPUPlace();
kernel_iter = kernels.find(expected_kernel_key); kernel_iter = kernels.find(expected_kernel_key);
} }
#endif
#ifdef PADDLE_WITH_ASCEND_CL
if (kernel_iter == kernels.end() &&
is_npu_place(expected_kernel_key.place_)) {
VLOG(3) << "missing NPU kernel: " << type_
<< ", expected_kernel_key:" << expected_kernel_key
<< ", fallbacking to CPU one!";
expected_kernel_key.place_ = platform::CPUPlace();
kernel_iter = kernels.find(expected_kernel_key);
}
#endif #endif
PADDLE_ENFORCE_NE(kernel_iter, kernels.end(), PADDLE_ENFORCE_NE(kernel_iter, kernels.end(),
platform::errors::NotFound( platform::errors::NotFound(
......
...@@ -614,6 +614,9 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places, ...@@ -614,6 +614,9 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
const BuildStrategy &build_strategy, const BuildStrategy &build_strategy,
ir::Graph *graph) ir::Graph *graph)
: member_(new ParallelExecutorPrivate(places, scope)) { : member_(new ParallelExecutorPrivate(places, scope)) {
PADDLE_ENFORCE(places.size() > 0 && !is_npu_place(places[0]),
platform::errors::Unavailable(
"NPU is not supported in ParallelExecutor"));
InitP2P(places); InitP2P(places);
ir::InitReaderQueueDeviceCount(graph, *(member_->global_scope_), ir::InitReaderQueueDeviceCount(graph, *(member_->global_scope_),
member_->places_.size()); member_->places_.size());
......
...@@ -101,15 +101,19 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, ...@@ -101,15 +101,19 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
// TODO(zhiqiu): handle different condition like CUDA code below // TODO(zhiqiu): handle different condition like CUDA code below
else if (platform::is_npu_place(src_place) && // NOLINT else if (platform::is_npu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) { platform::is_cpu_place(dst_place)) {
auto stream = reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream(); auto stream =
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream();
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size, stream); BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size,
stream);
} }
else if (platform::is_cpu_place(src_place) && // NOLINT else if (platform::is_cpu_place(src_place) && // NOLINT
platform::is_npu_place(dst_place)) { platform::is_npu_place(dst_place)) {
auto stream = reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream(); auto stream =
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream();
memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size, stream); BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size,
stream);
} }
else if (platform::is_npu_place(src_place) && // NOLINT else if (platform::is_npu_place(src_place) && // NOLINT
platform::is_npu_place(dst_place)) { platform::is_npu_place(dst_place)) {
...@@ -118,9 +122,11 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, ...@@ -118,9 +122,11 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
<< dst_place; << dst_place;
return; return;
} }
auto stream = reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream(); auto stream =
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream();
memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size, stream); BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size,
stream);
} }
else { // NOLINT else { // NOLINT
PADDLE_THROW(platform::errors::Unimplemented( PADDLE_THROW(platform::errors::Unimplemented(
...@@ -336,24 +342,27 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, ...@@ -336,24 +342,27 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL #ifdef PADDLE_WITH_ASCEND_CL
else if (platform::is_npu_place(src_place) && // NOLINT else if (platform::is_npu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) { platform::is_cpu_place(dst_place)) { /* npu -> cpu*/
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size, nullptr); BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size,
nullptr);
} }
else if (platform::is_cpu_place(src_place) && // NOLINT else if (platform::is_cpu_place(src_place) && // NOLINT
platform::is_npu_place(dst_place)) { platform::is_npu_place(dst_place)) { /* cpu -> npu*/
memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size, nullptr); BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size,
nullptr);
} }
else if (platform::is_npu_place(src_place) && // NOLINT else if (platform::is_npu_place(src_place) && // NOLINT
platform::is_npu_place(dst_place)) { platform::is_npu_place(dst_place)) { /* npu -> npu*/
if (src_ptr == dst_ptr) { if (src_ptr == dst_ptr) {
VLOG(3) << "Skip copy the same data sync from " << src_place << " to " VLOG(3) << "Skip copy the same data sync from " << src_place << " to "
<< dst_place; << dst_place;
return; return;
} }
memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size, nullptr); BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size,
nullptr);
} }
else { // NOLINT else { // NOLINT
PADDLE_THROW(platform::errors::Unimplemented( PADDLE_THROW(platform::errors::Unimplemented(
......
...@@ -154,6 +154,14 @@ bool IsCompiledWithXPU() { ...@@ -154,6 +154,14 @@ bool IsCompiledWithXPU() {
#endif #endif
} }
// Reports whether this Paddle binary was built with Ascend NPU support
// (i.e. compiled with PADDLE_WITH_ASCEND_CL defined).
bool IsCompiledWithNPU() {
#ifdef PADDLE_WITH_ASCEND_CL
  return true;
#else
  return false;
#endif
}
bool IsCompiledWithMKLDNN() { bool IsCompiledWithMKLDNN() {
#ifndef PADDLE_WITH_MKLDNN #ifndef PADDLE_WITH_MKLDNN
return false; return false;
...@@ -567,6 +575,10 @@ PYBIND11_MODULE(core_noavx, m) { ...@@ -567,6 +575,10 @@ PYBIND11_MODULE(core_noavx, m) {
[](Tensor &self, paddle::platform::CPUPlace &place) { [](Tensor &self, paddle::platform::CPUPlace &place) {
self.mutable_data<float>(place); self.mutable_data<float>(place);
}) })
.def("_alloc_float",
[](Tensor &self, paddle::platform::NPUPlace &place) {
self.mutable_data<float>(place);
})
.def("_alloc_double", .def("_alloc_double",
[](Tensor &self, paddle::platform::CPUPlace &place) { [](Tensor &self, paddle::platform::CPUPlace &place) {
self.mutable_data<double>(place); self.mutable_data<double>(place);
...@@ -611,6 +623,11 @@ PYBIND11_MODULE(core_noavx, m) { ...@@ -611,6 +623,11 @@ PYBIND11_MODULE(core_noavx, m) {
paddle::framework::proto::VarType::Type type) { paddle::framework::proto::VarType::Type type) {
return reinterpret_cast<uintptr_t>(self.mutable_data(place, type)); return reinterpret_cast<uintptr_t>(self.mutable_data(place, type));
}) })
.def("_mutable_data",
[](Tensor &self, paddle::platform::NPUPlace &place,
paddle::framework::proto::VarType::Type type) {
return reinterpret_cast<uintptr_t>(self.mutable_data(place, type));
})
.def("_clear", &Tensor::clear) .def("_clear", &Tensor::clear)
.def("set", SetTensorFromPyArray<paddle::platform::CPUPlace>, .def("set", SetTensorFromPyArray<paddle::platform::CPUPlace>,
py::arg("array"), py::arg("place"), py::arg("zero_copy") = false) py::arg("array"), py::arg("place"), py::arg("zero_copy") = false)
...@@ -618,6 +635,8 @@ PYBIND11_MODULE(core_noavx, m) { ...@@ -618,6 +635,8 @@ PYBIND11_MODULE(core_noavx, m) {
py::arg("array"), py::arg("place"), py::arg("zero_copy") = false) py::arg("array"), py::arg("place"), py::arg("zero_copy") = false)
.def("set", SetTensorFromPyArray<paddle::platform::CUDAPlace>, .def("set", SetTensorFromPyArray<paddle::platform::CUDAPlace>,
py::arg("array"), py::arg("place"), py::arg("zero_copy") = false) py::arg("array"), py::arg("place"), py::arg("zero_copy") = false)
.def("set", SetTensorFromPyArray<paddle::platform::NPUPlace>,
py::arg("array"), py::arg("place"), py::arg("zero_copy") = false)
.def("set", SetTensorFromPyArray<paddle::platform::CUDAPinnedPlace>, .def("set", SetTensorFromPyArray<paddle::platform::CUDAPinnedPlace>,
py::arg("array"), py::arg("place"), py::arg("zero_copy") = false, py::arg("array"), py::arg("place"), py::arg("zero_copy") = false,
R"DOC( R"DOC(
...@@ -625,7 +644,7 @@ PYBIND11_MODULE(core_noavx, m) { ...@@ -625,7 +644,7 @@ PYBIND11_MODULE(core_noavx, m) {
Args: Args:
lod (numpy.ndarray): The data to set. lod (numpy.ndarray): The data to set.
place (CPUPlace|CUDAPlace|XPUPlace|CUDAPinnedPlace): The place where the place (CPUPlace|CUDAPlace|XPUPlace|CUDAPinnedPlace|NPUPlace): The place where the
LoDTensor is to be set. LoDTensor is to be set.
zero_copy (bool, optional): Whether to share memory with the input numpy array. zero_copy (bool, optional): Whether to share memory with the input numpy array.
This parameter only works with CPUPlace. Default: False. This parameter only works with CPUPlace. Default: False.
...@@ -1348,6 +1367,18 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -1348,6 +1367,18 @@ All parameter, weight, gradient are variables in Paddle.
return new paddle::platform::XPUDeviceContext(place); return new paddle::platform::XPUDeviceContext(place);
#endif #endif
}) })
.def_static("create",
[](paddle::platform::NPUPlace& place)
-> paddle::platform::DeviceContext* {
#ifndef PADDLE_WITH_ASCEND_CL
PADDLE_THROW(
platform::errors::PermissionDenied(
"Cannot use NPUPlace in CPU/GPU/XPU version, "
"Please recompile or reinstall Paddle with NPU support."));
#else
return new paddle::platform::NPUDeviceContext(place);
#endif
})
.def_static("create", .def_static("create",
[](paddle::platform::CUDAPlace& place) [](paddle::platform::CUDAPlace& place)
-> paddle::platform::DeviceContext* { -> paddle::platform::DeviceContext* {
...@@ -1448,6 +1479,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -1448,6 +1479,7 @@ All parameter, weight, gradient are variables in Paddle.
.def("_equals", &IsSamePlace<platform::CUDAPlace, platform::CUDAPlace>) .def("_equals", &IsSamePlace<platform::CUDAPlace, platform::CUDAPlace>)
.def("_equals", &IsSamePlace<platform::CUDAPlace, platform::CPUPlace>) .def("_equals", &IsSamePlace<platform::CUDAPlace, platform::CPUPlace>)
.def("_equals", &IsSamePlace<platform::CUDAPlace, platform::XPUPlace>) .def("_equals", &IsSamePlace<platform::CUDAPlace, platform::XPUPlace>)
.def("_equals", &IsSamePlace<platform::CUDAPlace, platform::NPUPlace>)
.def("_equals", .def("_equals",
&IsSamePlace<platform::CUDAPlace, platform::CUDAPinnedPlace>) &IsSamePlace<platform::CUDAPlace, platform::CUDAPinnedPlace>)
.def("_get_device_id", .def("_get_device_id",
...@@ -1517,6 +1549,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -1517,6 +1549,7 @@ All parameter, weight, gradient are variables in Paddle.
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
m.def("get_xpu_device_count", platform::GetXPUDeviceCount); m.def("get_xpu_device_count", platform::GetXPUDeviceCount);
#endif #endif
py::class_<paddle::platform::CPUPlace>(m, "CPUPlace", R"DOC( py::class_<paddle::platform::CPUPlace>(m, "CPUPlace", R"DOC(
CPUPlace is a descriptor of a device. CPUPlace is a descriptor of a device.
It represents a CPU device on which a tensor will be allocated and a model will run. It represents a CPU device on which a tensor will be allocated and a model will run.
...@@ -1532,6 +1565,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -1532,6 +1565,7 @@ All parameter, weight, gradient are variables in Paddle.
.def("_type", &PlaceIndex<platform::CPUPlace>) .def("_type", &PlaceIndex<platform::CPUPlace>)
.def("_equals", &IsSamePlace<platform::CPUPlace, platform::Place>) .def("_equals", &IsSamePlace<platform::CPUPlace, platform::Place>)
.def("_equals", &IsSamePlace<platform::CPUPlace, platform::XPUPlace>) .def("_equals", &IsSamePlace<platform::CPUPlace, platform::XPUPlace>)
.def("_equals", &IsSamePlace<platform::CPUPlace, platform::NPUPlace>)
.def("_equals", &IsSamePlace<platform::CPUPlace, platform::CUDAPlace>) .def("_equals", &IsSamePlace<platform::CPUPlace, platform::CUDAPlace>)
.def("_equals", &IsSamePlace<platform::CPUPlace, platform::CPUPlace>) .def("_equals", &IsSamePlace<platform::CPUPlace, platform::CPUPlace>)
.def("_equals", .def("_equals",
...@@ -1569,6 +1603,8 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -1569,6 +1603,8 @@ All parameter, weight, gradient are variables in Paddle.
&IsSamePlace<platform::CUDAPinnedPlace, platform::CUDAPlace>) &IsSamePlace<platform::CUDAPinnedPlace, platform::CUDAPlace>)
.def("_equals", .def("_equals",
&IsSamePlace<platform::CUDAPinnedPlace, platform::XPUPlace>) &IsSamePlace<platform::CUDAPinnedPlace, platform::XPUPlace>)
.def("_equals",
&IsSamePlace<platform::CUDAPinnedPlace, platform::NPUPlace>)
.def("_equals", .def("_equals",
&IsSamePlace<platform::CUDAPinnedPlace, platform::CPUPlace>) &IsSamePlace<platform::CUDAPinnedPlace, platform::CPUPlace>)
.def("_equals", .def("_equals",
...@@ -1576,6 +1612,65 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -1576,6 +1612,65 @@ All parameter, weight, gradient are variables in Paddle.
.def("__repr__", string::to_string<const platform::CUDAPinnedPlace &>) .def("__repr__", string::to_string<const platform::CUDAPinnedPlace &>)
.def("__str__", string::to_string<const platform::CUDAPinnedPlace &>); .def("__str__", string::to_string<const platform::CUDAPinnedPlace &>);
// NPUPlace: Python binding for the Ascend NPU device descriptor.
// The __init__ lambda validates the device id eagerly (logging and exiting on
// error) so Python users get an actionable message instead of a later crash.
py::class_<platform::NPUPlace>(m, "NPUPlace", R"DOC(
    NPUPlace is a descriptor of a device.
    It represents a NPU device on which a tensor will be allocated and a model will run.

    Examples:
        .. code-block:: python

            import paddle
            npu_place = paddle.NPUPlace(0)
)DOC")
    .def("__init__",
         [](platform::NPUPlace &self, int dev_id) {
#ifdef PADDLE_WITH_ASCEND_CL
           // Device id must be a non-negative integer.
           if (UNLIKELY(dev_id < 0)) {
             LOG(ERROR) << string::Sprintf(
                 "Invalid NPUPlace(%d), device id must be 0 or "
                 "positive integer",
                 dev_id);
             std::exit(-1);
           }
           // Device id must refer to an NPU that is actually present.
           if (UNLIKELY(dev_id >= platform::GetNPUDeviceCount())) {
             if (platform::GetNPUDeviceCount() == 0) {
               LOG(ERROR) << "Cannot use NPU because there is no NPU "
                             "detected on your "
                             "machine.";
               std::exit(-1);
             } else {
               LOG(ERROR) << string::Sprintf(
                   "Invalid NPUPlace(%d), must inside [0, %d), because NPU "
                   "number on your machine is %d",
                   dev_id, platform::GetNPUDeviceCount(),
                   platform::GetNPUDeviceCount());
               std::exit(-1);
             }
           }
           // Placement-construct into the pybind-allocated storage.
           new (&self) platform::NPUPlace(dev_id);
#else
           // Fixed: the install hint previously said "paddlepaddle-xpu",
           // which is the XPU package, not the NPU one.
           LOG(ERROR) << string::Sprintf(
               "Cannot use NPU because you have installed CPU/GPU version "
               "PaddlePaddle.\n"
               "If you want to use NPU, please try to install NPU version "
               "PaddlePaddle by: pip install paddlepaddle-npu\n"
               "If you only have CPU, please change NPUPlace(%d) to be "
               "CPUPlace().\n",
               dev_id);
           std::exit(-1);
#endif
         })
    .def("_type", &PlaceIndex<platform::NPUPlace>)
    .def("_equals", &IsSamePlace<platform::NPUPlace, platform::Place>)
    .def("_equals", &IsSamePlace<platform::NPUPlace, platform::CUDAPlace>)
    .def("_equals", &IsSamePlace<platform::NPUPlace, platform::CPUPlace>)
    .def("_equals", &IsSamePlace<platform::NPUPlace, platform::XPUPlace>)
    .def("_equals", &IsSamePlace<platform::NPUPlace, platform::NPUPlace>)
    .def("_equals",
         &IsSamePlace<platform::NPUPlace, platform::CUDAPinnedPlace>)
    // Bind __repr__ as well, consistent with the other place classes
    // (CPUPlace/CUDAPlace/CUDAPinnedPlace all expose both).
    .def("__repr__", string::to_string<const platform::NPUPlace &>)
    .def("__str__", string::to_string<const platform::NPUPlace &>);
py::class_<platform::Place>(m, "Place") py::class_<platform::Place>(m, "Place")
.def(py::init<>()) .def(py::init<>())
.def("_type", &PlaceIndex<platform::Place>) .def("_type", &PlaceIndex<platform::Place>)
...@@ -1583,6 +1678,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -1583,6 +1678,7 @@ All parameter, weight, gradient are variables in Paddle.
.def("_equals", &IsSamePlace<platform::Place, platform::CUDAPlace>) .def("_equals", &IsSamePlace<platform::Place, platform::CUDAPlace>)
.def("_equals", &IsSamePlace<platform::Place, platform::CPUPlace>) .def("_equals", &IsSamePlace<platform::Place, platform::CPUPlace>)
.def("_equals", &IsSamePlace<platform::Place, platform::XPUPlace>) .def("_equals", &IsSamePlace<platform::Place, platform::XPUPlace>)
.def("_equals", &IsSamePlace<platform::Place, platform::NPUPlace>)
.def("_equals", &IsSamePlace<platform::Place, platform::CUDAPinnedPlace>) .def("_equals", &IsSamePlace<platform::Place, platform::CUDAPinnedPlace>)
.def("is_gpu_place", .def("is_gpu_place",
[](platform::Place &self) { return platform::is_gpu_place(self); }) [](platform::Place &self) { return platform::is_gpu_place(self); })
...@@ -1590,6 +1686,8 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -1590,6 +1686,8 @@ All parameter, weight, gradient are variables in Paddle.
[](platform::Place &self) { return platform::is_cpu_place(self); }) [](platform::Place &self) { return platform::is_cpu_place(self); })
.def("is_xpu_place", .def("is_xpu_place",
[](platform::Place &self) { return platform::is_xpu_place(self); }) [](platform::Place &self) { return platform::is_xpu_place(self); })
.def("is_npu_place",
[](platform::Place &self) { return platform::is_npu_place(self); })
.def("is_cuda_pinned_place", .def("is_cuda_pinned_place",
[](platform::Place &self) { [](platform::Place &self) {
return platform::is_cuda_pinned_place(self); return platform::is_cuda_pinned_place(self);
...@@ -1602,6 +1700,10 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -1602,6 +1700,10 @@ All parameter, weight, gradient are variables in Paddle.
[](platform::Place &self) { [](platform::Place &self) {
return BOOST_GET_CONST(platform::XPUPlace, self).device; return BOOST_GET_CONST(platform::XPUPlace, self).device;
}) })
.def("npu_device_id",
[](platform::Place &self) {
return BOOST_GET_CONST(platform::NPUPlace, self).device;
})
.def("set_place", [](platform::Place &self, .def("set_place", [](platform::Place &self,
const platform::Place &other) { self = other; }) const platform::Place &other) { self = other; })
.def("set_place", .def("set_place",
...@@ -1621,6 +1723,10 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -1621,6 +1723,10 @@ All parameter, weight, gradient are variables in Paddle.
const platform::CUDAPinnedPlace &cuda_pinned_place) { const platform::CUDAPinnedPlace &cuda_pinned_place) {
self = cuda_pinned_place; self = cuda_pinned_place;
}) })
.def("set_place",
[](platform::Place &self, const platform::NPUPlace &npu_place) {
self = npu_place;
})
.def("__repr__", string::to_string<const platform::Place &>) .def("__repr__", string::to_string<const platform::Place &>)
.def("__str__", string::to_string<const platform::Place &>); .def("__str__", string::to_string<const platform::Place &>);
...@@ -1645,6 +1751,9 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -1645,6 +1751,9 @@ All parameter, weight, gradient are variables in Paddle.
.def("run", .def("run",
[](OperatorBase &self, const Scope &scope, [](OperatorBase &self, const Scope &scope,
const platform::XPUPlace &place) { self.Run(scope, place); }) const platform::XPUPlace &place) { self.Run(scope, place); })
.def("run",
[](OperatorBase &self, const Scope &scope,
const platform::NPUPlace &place) { self.Run(scope, place); })
.def("run", .def("run",
[](OperatorBase &self, const Scope &scope, [](OperatorBase &self, const Scope &scope,
const platform::CUDAPlace &place) { self.Run(scope, place); }) const platform::CUDAPlace &place) { self.Run(scope, place); })
...@@ -1745,6 +1854,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -1745,6 +1854,7 @@ All parameter, weight, gradient are variables in Paddle.
m.def("is_compiled_with_cuda", IsCompiledWithCUDA); m.def("is_compiled_with_cuda", IsCompiledWithCUDA);
m.def("is_compiled_with_ascend", IsCompiledWithAscend); m.def("is_compiled_with_ascend", IsCompiledWithAscend);
m.def("is_compiled_with_npu", IsCompiledWithNPU);
m.def("is_compiled_with_xpu", IsCompiledWithXPU); m.def("is_compiled_with_xpu", IsCompiledWithXPU);
m.def("is_compiled_with_mkldnn", IsCompiledWithMKLDNN); m.def("is_compiled_with_mkldnn", IsCompiledWithMKLDNN);
m.def("supports_bfloat16", SupportsBfloat16); m.def("supports_bfloat16", SupportsBfloat16);
......
...@@ -285,6 +285,22 @@ void SetTensorFromPyArrayT( ...@@ -285,6 +285,22 @@ void SetTensorFromPyArrayT(
PADDLE_THROW(platform::errors::PermissionDenied( PADDLE_THROW(platform::errors::PermissionDenied(
"Cannot use XPUPlace in CPU/GPU version, " "Cannot use XPUPlace in CPU/GPU version, "
"Please recompile or reinstall Paddle with XPU support.")); "Please recompile or reinstall Paddle with XPU support."));
#endif
} else if (paddle::platform::is_npu_place(place)) {
#ifdef PADDLE_WITH_ASCEND_CL
platform::Place tmp_place = place;
platform::NPUDeviceGuard guard(
BOOST_GET_CONST(platform::NPUPlace, tmp_place).device);
auto dst = self->mutable_data<T>(place);
platform::NPUMemcpySync(dst, array.data(), array.nbytes(),
ACL_MEMCPY_HOST_TO_DEVICE);
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &ctx = *pool.Get(place);
ctx.Wait();
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"Cannot use NPUPlace in CPU/GPU/XPU version. "
"Please recompile or reinstall Paddle with NPU support."));
#endif #endif
} else { } else {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
......
...@@ -232,6 +232,7 @@ from .framework import ParamAttr #DEFINE_ALIAS ...@@ -232,6 +232,7 @@ from .framework import ParamAttr #DEFINE_ALIAS
from .framework import create_parameter #DEFINE_ALIAS from .framework import create_parameter #DEFINE_ALIAS
from .framework import CPUPlace #DEFINE_ALIAS from .framework import CPUPlace #DEFINE_ALIAS
from .framework import CUDAPlace #DEFINE_ALIAS from .framework import CUDAPlace #DEFINE_ALIAS
from .framework import NPUPlace #DEFINE_ALIAS
from .framework import CUDAPinnedPlace #DEFINE_ALIAS from .framework import CUDAPinnedPlace #DEFINE_ALIAS
from .framework import grad #DEFINE_ALIAS from .framework import grad #DEFINE_ALIAS
...@@ -256,6 +257,7 @@ from .device import set_device ...@@ -256,6 +257,7 @@ from .device import set_device
from .device import get_device from .device import get_device
from .device import is_compiled_with_cuda #DEFINE_ALIAS from .device import is_compiled_with_cuda #DEFINE_ALIAS
from .device import is_compiled_with_xpu from .device import is_compiled_with_xpu
from .device import is_compiled_with_npu
from .device import XPUPlace from .device import XPUPlace
# from .tensor.tensor import Tensor #DEFINE_ALIAS # from .tensor.tensor import Tensor #DEFINE_ALIAS
# from .tensor.tensor import LoDTensor #DEFINE_ALIAS # from .tensor.tensor import LoDTensor #DEFINE_ALIAS
......
...@@ -32,12 +32,28 @@ __all__ = [ ...@@ -32,12 +32,28 @@ __all__ = [
# 'cuda_places', # 'cuda_places',
# 'CUDAPinnedPlace', # 'CUDAPinnedPlace',
# 'CUDAPlace', # 'CUDAPlace',
'is_compiled_with_cuda' 'is_compiled_with_cuda',
'is_compiled_with_npu'
] ]
_cudnn_version = None _cudnn_version = None
def is_compiled_with_npu():
    """
    Whether this whl package can be used to run the model on NPU.

    Returns (bool): `True` if NPU is supported, otherwise `False`.

    Examples:
        .. code-block:: python

            import paddle
            support_npu = paddle.is_compiled_with_npu()
    """
    # Delegate to the C++ core, which answers based on compile-time flags.
    return core.is_compiled_with_npu()
def is_compiled_with_xpu(): def is_compiled_with_xpu():
""" """
Whether paddle was built with WITH_XPU=ON to support Baidu Kunlun Whether paddle was built with WITH_XPU=ON to support Baidu Kunlun
...@@ -163,6 +179,7 @@ def set_device(device): ...@@ -163,6 +179,7 @@ def set_device(device):
device_id = device_info_list[1] device_id = device_info_list[1]
device_id = int(device_id) device_id = int(device_id)
place = core.XPUPlace(device_id) place = core.XPUPlace(device_id)
framework._set_expected_place(place) framework._set_expected_place(place)
return place return place
......
...@@ -68,7 +68,8 @@ from .input import embedding, one_hot ...@@ -68,7 +68,8 @@ from .input import embedding, one_hot
from . import distribute_lookup_table from . import distribute_lookup_table
from .param_attr import ParamAttr, WeightNormParamAttr from .param_attr import ParamAttr, WeightNormParamAttr
from .data_feeder import DataFeeder from .data_feeder import DataFeeder
from .core import LoDTensor, LoDTensorArray, CPUPlace, XPUPlace, CUDAPlace, CUDAPinnedPlace, Scope, _Scope from .core import LoDTensor, LoDTensorArray, Scope, _Scope
from .core import CPUPlace, XPUPlace, CUDAPlace, CUDAPinnedPlace, NPUPlace
from .incubate import fleet from .incubate import fleet
from .incubate import data_generator from .incubate import data_generator
from .transpiler import DistributeTranspiler, \ from .transpiler import DistributeTranspiler, \
...@@ -124,6 +125,7 @@ __all__ = framework.__all__ + executor.__all__ + \ ...@@ -124,6 +125,7 @@ __all__ = framework.__all__ + executor.__all__ + \
'XPUPlace', 'XPUPlace',
'CUDAPlace', 'CUDAPlace',
'CUDAPinnedPlace', 'CUDAPinnedPlace',
'NPUPlace',
'Tensor', 'Tensor',
'ParamAttr', 'ParamAttr',
'WeightNormParamAttr', 'WeightNormParamAttr',
......
...@@ -1213,6 +1213,7 @@ class Executor(object): ...@@ -1213,6 +1213,7 @@ class Executor(object):
# In distributed training, the compiled program is saved in Program._graph # In distributed training, the compiled program is saved in Program._graph
has_compiled_graph = isinstance(program._graph, has_compiled_graph = isinstance(program._graph,
compiler.CompiledProgram) compiler.CompiledProgram)
if has_compiled_graph: if has_compiled_graph:
program._graph._compile(scope, self.place) program._graph._compile(scope, self.place)
# _graph in program does not support inference since the _graph is optimized # _graph in program does not support inference since the _graph is optimized
......
...@@ -5854,7 +5854,7 @@ def _get_paddle_place(place): ...@@ -5854,7 +5854,7 @@ def _get_paddle_place(place):
if place is None: if place is None:
return place return place
if isinstance(place, (core.Place, core.XPUPlace, core.CPUPlace, if isinstance(place, (core.Place, core.XPUPlace, core.CPUPlace,
core.CUDAPinnedPlace, core.CUDAPlace)): core.CUDAPinnedPlace, core.CUDAPlace, core.NPUPlace)):
return place return place
if not isinstance(place, str): if not isinstance(place, str):
...@@ -5864,9 +5864,11 @@ def _get_paddle_place(place): ...@@ -5864,9 +5864,11 @@ def _get_paddle_place(place):
place = place.lower() place = place.lower()
if (place == "cpu"): if (place == "cpu"):
return core.CPUPlace() return core.CPUPlace()
if (place == "device"): if (place == "device"):
return core.Place() return core.Place()
# GPU
avaliable_gpu_place = re.match(r'gpu:\d+', place) avaliable_gpu_place = re.match(r'gpu:\d+', place)
if place == "gpu_pinned" or place == "gpu" or avaliable_gpu_place: if place == "gpu_pinned" or place == "gpu" or avaliable_gpu_place:
if not core.is_compiled_with_cuda(): if not core.is_compiled_with_cuda():
...@@ -5882,6 +5884,8 @@ def _get_paddle_place(place): ...@@ -5882,6 +5884,8 @@ def _get_paddle_place(place):
device_id = place_info_list[1] device_id = place_info_list[1]
device_id = int(device_id) device_id = int(device_id)
return core.CUDAPlace(device_id) return core.CUDAPlace(device_id)
# XPU
avaliable_xpu_place = re.match(r'xpu:\d+', place) avaliable_xpu_place = re.match(r'xpu:\d+', place)
if avaliable_xpu_place: if avaliable_xpu_place:
if not core.is_compiled_with_xpu(): if not core.is_compiled_with_xpu():
...@@ -5892,9 +5896,22 @@ def _get_paddle_place(place): ...@@ -5892,9 +5896,22 @@ def _get_paddle_place(place):
device_id = place_info_list[1] device_id = place_info_list[1]
device_id = int(device_id) device_id = int(device_id)
return core.XPUPlace(device_id) return core.XPUPlace(device_id)
# NPU
avaliable_npu_place = re.match(r'npu:\d+', place)
if avaliable_npu_place:
if not core.is_compiled_with_npu():
raise ValueError(
"The device should not be {}, since PaddlePaddle is " \
"not compiled with NPU".format(avaliable_npu_place))
place_info_list = place.split(':', 1)
device_id = place_info_list[1]
device_id = int(device_id)
return core.NPUPlace(device_id)
raise ValueError( raise ValueError(
"paddle support CPUPlace, CUDAPlace,CUDAPinnedPlace and XPUPlace, Please check your Place Input" "Paddle supports CPUPlace, CUDAPlace,CUDAPinnedPlace, XPUPlace and NPUPlace, but received {}.".
) format(place))
def _get_paddle_place_list(places): def _get_paddle_place_list(places):
......
...@@ -608,6 +608,10 @@ if (WITH_XPU) ...@@ -608,6 +608,10 @@ if (WITH_XPU)
add_subdirectory(xpu) add_subdirectory(xpu)
endif() endif()
if (WITH_ASCEND_CL)
add_subdirectory(npu)
endif()
if (WITH_MKLDNN) if (WITH_MKLDNN)
add_subdirectory(mkldnn) add_subdirectory(mkldnn)
endif() endif()
......
# Collect every Python unittest file (test_*.py) in this directory.
file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
# Strip the ".py" suffix so each entry is an importable module name.
string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
# Register each discovered module as a ctest target.
foreach(TEST_OP ${TEST_OPS})
py_test_modules(${TEST_OP} MODULES ${TEST_OP})
endforeach(TEST_OP)
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import numpy as np
import unittest
import sys
sys.path.append("..")
from op_test import OpTest, _set_use_system_allocator
import paddle
import paddle.fluid as fluid
paddle.enable_static()
@unittest.skipIf(not paddle.is_compiled_with_npu(),
                 "core is not compiled with NPU")
class TestElementwiseAddOp(OpTest):
    """Checks the NPU elementwise_add kernel output against numpy."""

    def setUp(self):
        # Mark this test as NPU-targeted and describe the op under test.
        self.set_npu()
        self.op_type = "elementwise_add"
        self.place = paddle.NPUPlace(0)

        # Hooks below may be overridden by subclasses to vary the config.
        self.init_dtype()
        self.init_input_output()
        self.init_kernel_type()
        self.init_axis()

        self.inputs = {
            'X': OpTest.np_dtype_to_fluid_dtype(self.x),
            'Y': OpTest.np_dtype_to_fluid_dtype(self.y),
        }
        self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn}
        self.outputs = {'Out': self.out}

    def set_npu(self):
        # Flag consumed by the OpTest machinery to pick the NPU path.
        self.__class__.use_npu = True

    def init_kernel_type(self):
        self.use_mkldnn = False

    def init_input_output(self):
        # Two same-shape operands in (0.1, 1) and their elementwise sum.
        self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
        self.y = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
        self.out = np.add(self.x, self.y)

    def init_dtype(self):
        self.dtype = np.float32

    def init_axis(self):
        # -1 means "broadcast along trailing dimensions" (default).
        self.axis = -1

    def test_check_output(self):
        # Dygraph checking is disabled; only the static kernel is verified.
        self.check_output_with_place(self.place, check_dygraph=False)

    # TODO(ascendrc): enable gradient checks (check_grad_with_place for
    # ['X', 'Y'], ignore-X, and ignore-Y variants) once the NPU grad op
    # for elementwise_add is implemented.
@unittest.skipIf(not paddle.is_compiled_with_npu(),
                 "core is not compiled with NPU")
class TestAddAPI(unittest.TestCase):
    """Static-graph API tests for paddle.add on NPUPlace."""

    def test_name(self):
        # The user-supplied name should be embedded in the output var's name.
        with paddle.static.program_guard(paddle.static.Program()):
            lhs = paddle.static.data(name="x", shape=[2, 3], dtype="float32")
            rhs = paddle.static.data(name='y', shape=[2, 3], dtype='float32')
            result = paddle.add(lhs, rhs, name='add_res')
            self.assertEqual(('add_res' in result.name), True)

    def test_static(self):
        # Build a small program (reshape -> add -> reshape back), run it on
        # the NPU, and compare every fetched value against numpy references.
        with paddle.static.program_guard(paddle.static.Program()):
            lhs_np = np.array([2, 3, 4]).astype('float32')
            rhs_np = np.array([1, 5, 2]).astype('float32')
            lhs = paddle.static.data(name="x", shape=[3], dtype='float32')
            rhs = paddle.static.data(name="y", shape=[3], dtype='float32')
            lhs_col = paddle.reshape(lhs, [3, 1])
            rhs_col = paddle.reshape(rhs, [3, 1])
            total = paddle.add(lhs_col, rhs_col)
            total = paddle.reshape(total, shape=[3])

            exe = paddle.static.Executor(paddle.NPUPlace(0))
            x_value, y_value, z_value = exe.run(
                feed={"x": lhs_np,
                      "y": rhs_np},
                fetch_list=[lhs, rhs, total])

            z_expected = np.array([3., 8., 6.])
            self.assertEqual(
                (x_value == lhs_np).all(),
                True,
                msg="x_value = {}, but expected {}".format(x_value, lhs_np))
            self.assertEqual(
                (y_value == rhs_np).all(),
                True,
                msg="y_value = {}, but expected {}".format(y_value, rhs_np))
            self.assertEqual(
                (z_value == z_expected).all(),
                True,
                msg="z_value = {}, but expected {}".format(z_value,
                                                           z_expected))

    def test_backward(self):
        # TODO(ascendrc): Test backward after add grad npu op implemented.
        pass
@unittest.skipIf(not paddle.is_compiled_with_npu(),
                 "core is not compiled with NPU")
class TestAddError(unittest.TestCase):
    """Invalid-input tests for paddle.add on NPU."""

    def test_errors(self):
        with paddle.static.program_guard(paddle.static.Program()):
            # the input of elementwise_add must be Variable.
            lod_x = fluid.create_lod_tensor(
                np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.NPUPlace(0))
            lod_y = fluid.create_lod_tensor(
                np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.NPUPlace(0))
            self.assertRaises(TypeError, paddle.add, lod_x, lod_y)

            # the input dtype must be float16 or float32 or float64 or int32 or int64
            bad_x = paddle.static.data(
                name='x2', shape=[3, 4, 5, 6], dtype="uint8")
            bad_y = paddle.static.data(
                name='y2', shape=[3, 4, 5, 6], dtype="uint8")
            self.assertRaises(TypeError, paddle.add, bad_x, bad_y)
# Run the tests in this file when executed directly as a script.
if __name__ == '__main__':
    unittest.main()
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import numpy as np
import unittest
import sys
sys.path.append("..")
from op_test import OpTest
import paddle
import paddle.fluid as fluid
paddle.enable_static()
SEED = 2021
@unittest.skipIf(not paddle.is_compiled_with_npu(),
                 "core is not compiled with NPU")
class TestElementwiseSubOp(OpTest):
    # Forward-only correctness test for the elementwise_sub NPU kernel.
    # The OpTest base class introspects the instance attributes assigned in
    # setUp (op_type, place, inputs, attrs, outputs), so those attribute
    # names must not be changed.
    def setUp(self):
        # NOTE: order is significant -- init_dtype() must run before
        # init_input_output() (which reads self.dtype), and
        # init_kernel_type()/init_axis() must run before the attrs dict
        # is assembled below.
        self.set_npu()
        self.op_type = "elementwise_sub"
        self.place = paddle.NPUPlace(0)
        self.init_dtype()
        self.init_input_output()
        self.init_kernel_type()
        self.init_axis()
        self.inputs = {
            'X': OpTest.np_dtype_to_fluid_dtype(self.x),
            'Y': OpTest.np_dtype_to_fluid_dtype(self.y)
        }
        self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn}
        self.outputs = {'Out': self.out}

    def set_npu(self):
        # Class-level flag read by the OpTest machinery to select NPU code
        # paths (see the use_npu handling added to op_test.py in this PR).
        self.__class__.use_npu = True

    def init_kernel_type(self):
        self.use_mkldnn = False

    def init_input_output(self):
        # Same-shape operands drawn from [0.1, 1); the expected output is
        # the numpy reference result.
        self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
        self.y = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
        self.out = np.subtract(self.x, self.y)

    def init_dtype(self):
        self.dtype = np.float32

    def init_axis(self):
        # NOTE(review): the elementwise_add test uses axis = -1; 0 happens
        # to be equivalent here because X and Y have identical shapes --
        # confirm the difference is intentional.
        self.axis = 0

    def test_check_output(self):
        # check_dygraph=False: only the static-graph path is exercised on NPU.
        self.check_output_with_place(self.place, check_dygraph=False)

    # TODO(ascendrc): For grad tests, OpTest raises FatalError:Segmentation fault
    # when call op.run, which may be caused by system environment exception
    # and the exact cause has not be located.
    # def test_check_grad_normal(self):
    #     self.check_grad_with_place(
    #         self.place, ['X', 'Y'],
    #         'Out',
    #         max_relative_error=0.006,
    #         check_dygraph=False)
    #
    # def test_check_grad_ingore_x(self):
    #     self.check_grad_with_place(
    #         self.place, ['Y'],
    #         'Out',
    #         no_grad_set=set("X"),
    #         max_relative_error=0.006,
    #         check_dygraph=False)
    #
    # def test_check_grad_ingore_y(self):
    #     self.check_grad_with_place(
    #         self.place, ['X'],
    #         'Out',
    #         no_grad_set=set("Y"),
    #         max_relative_error=0.006,check_dygraph=False)
@unittest.skipIf(not paddle.is_compiled_with_npu(),
                 "core is not compiled with NPU")
class TestSubtractAPI(unittest.TestCase):
    """Static-graph API tests for paddle.subtract on NPUPlace."""

    def test_name(self):
        # The user-supplied name should be embedded in the output var's
        # name (the 'add_res' literal matches the original test exactly).
        with paddle.static.program_guard(paddle.static.Program()):
            lhs = paddle.static.data(name="x", shape=[2, 3], dtype="float32")
            rhs = paddle.static.data(name='y', shape=[2, 3], dtype='float32')
            result = paddle.subtract(lhs, rhs, name='add_res')
            self.assertEqual(('add_res' in result.name), True)

    def test_static(self):
        # Build a small program (reshape -> subtract -> reshape back), run
        # it on the NPU, and compare every fetched value against numpy.
        with paddle.static.program_guard(paddle.static.Program()):
            lhs_np = np.array([2, 3, 4]).astype('float32')
            rhs_np = np.array([1, 5, 2]).astype('float32')
            lhs = paddle.static.data(name="x", shape=[3], dtype='float32')
            rhs = paddle.static.data(name="y", shape=[3], dtype='float32')
            lhs_col = paddle.reshape(lhs, [3, 1])
            rhs_col = paddle.reshape(rhs, [3, 1])
            diff = paddle.subtract(lhs_col, rhs_col)
            diff = paddle.reshape(diff, shape=[3])

            exe = paddle.static.Executor(paddle.NPUPlace(0))
            x_value, y_value, z_value = exe.run(
                feed={"x": lhs_np,
                      "y": rhs_np},
                fetch_list=[lhs, rhs, diff])

            z_expected = np.array([1., -2., 2.])
            self.assertEqual(
                (x_value == lhs_np).all(),
                True,
                msg="x_value = {}, but expected {}".format(x_value, lhs_np))
            self.assertEqual(
                (y_value == rhs_np).all(),
                True,
                msg="y_value = {}, but expected {}".format(y_value, rhs_np))
            self.assertEqual(
                (z_value == z_expected).all(),
                True,
                msg="z_value = {}, but expected {}".format(z_value,
                                                           z_expected))
@unittest.skipIf(not paddle.is_compiled_with_npu(),
                 "core is not compiled with NPU")
class TestSubtractError(unittest.TestCase):
    """Invalid-input tests for paddle.subtract on NPU."""

    def test_errors(self):
        with paddle.static.program_guard(paddle.static.Program()):
            # the input of elementwise_add must be Variable.
            lod_x = fluid.create_lod_tensor(
                np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.NPUPlace(0))
            lod_y = fluid.create_lod_tensor(
                np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.NPUPlace(0))
            self.assertRaises(TypeError, paddle.subtract, lod_x, lod_y)

            # the input dtype must be float16 or float32 or float64 or int32 or int64
            bad_x = paddle.static.data(
                name='x2', shape=[3, 4, 5, 6], dtype="uint8")
            bad_y = paddle.static.data(
                name='y2', shape=[3, 4, 5, 6], dtype="uint8")
            self.assertRaises(TypeError, paddle.subtract, bad_x, bad_y)
@unittest.skipIf(not paddle.is_compiled_with_npu(),
                 "core is not compiled with NPU")
class TestSubtractNet(unittest.TestCase):
    """End-to-end check: a tiny network containing paddle.subtract must
    produce the same predictions and loss on NPU as on CPU."""

    def _test(self, run_npu=True):
        # Build and train a small classifier whose forward pass includes a
        # subtract, then return the final prediction and loss values.
        main_prog = paddle.static.Program()
        startup_prog = paddle.static.Program()
        main_prog.random_seed = SEED
        startup_prog.random_seed = SEED
        np.random.seed(SEED)

        a_np = np.random.random(size=(32, 32)).astype('float32')
        b_np = np.random.random(size=(32, 32)).astype('float32')
        label_np = np.random.randint(2, size=(32, 1)).astype('int64')

        with paddle.static.program_guard(main_prog, startup_prog):
            a = paddle.static.data(name="a", shape=[32, 32], dtype='float32')
            b = paddle.static.data(name="b", shape=[32, 32], dtype='float32')
            label = paddle.static.data(
                name="label", shape=[32, 1], dtype='int64')

            # z = (a + b) - b, exercising add, assign and subtract together.
            total = paddle.add(a, b)
            c = paddle.assign(b)
            z = paddle.subtract(total, c)

            fc_1 = fluid.layers.fc(input=z, size=128)
            prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax')

            cost = fluid.layers.cross_entropy(input=prediction, label=label)
            loss = fluid.layers.reduce_mean(cost)
            sgd = fluid.optimizer.SGD(learning_rate=0.01)
            sgd.minimize(loss)

        place = paddle.NPUPlace(0) if run_npu else paddle.CPUPlace()
        exe = paddle.static.Executor(place)
        exe.run(startup_prog)

        for epoch in range(100):
            pred_res, loss_res = exe.run(
                main_prog,
                feed={"a": a_np,
                      "b": b_np,
                      "label": label_np},
                fetch_list=[prediction, loss])
            if epoch % 10 == 0:
                print("Epoch {} | Prediction[0]: {}, Loss: {}".format(
                    epoch, pred_res[0], loss_res))

        return pred_res, loss_res

    def test_npu(self):
        # NPU results must agree with the CPU reference within allclose
        # tolerance.
        npu_pred, npu_loss = self._test(True)
        cpu_pred, cpu_loss = self._test(False)
        self.assertTrue(np.allclose(npu_pred, cpu_pred))
        self.assertTrue(np.allclose(npu_loss, cpu_loss))
# Run the tests in this file when executed directly as a script.
if __name__ == '__main__':
    unittest.main()
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import paddle
import numpy as np
from paddle.fluid import core
paddle.enable_static()
@unittest.skipIf(not paddle.is_compiled_with_npu(),
                 "core is not compiled with NPU")
class TestNpuPlace(unittest.TestCase):
    """Check that paddle.NPUPlace round-trips through core.Place."""

    def test(self):
        plc = core.Place()
        plc.set_place(paddle.NPUPlace(0))
        self.assertTrue(plc.is_npu_place())
        self.assertEqual(plc.npu_device_id(), 0)
@unittest.skipIf(not paddle.is_compiled_with_npu(),
                 "core is not compiled with NPU")
class TestNpuPlaceError(unittest.TestCase):
    """Running a CompiledProgram (ParallelExecutor path) on NPU must fail
    with a clear error message."""

    def test_static(self):
        # NPU is not supported in ParallelExecutor
        prog = paddle.static.Program()
        with paddle.static.program_guard(prog):
            lhs_np = np.array([2, 3, 4]).astype('float32')
            rhs_np = np.array([1, 5, 2]).astype('float32')
            lhs = paddle.static.data(name="x", shape=[3], dtype='float32')
            rhs = paddle.static.data(name="y", shape=[3], dtype='float32')
            out = paddle.add(lhs, rhs)

        compiled_prog = paddle.static.CompiledProgram(prog)
        exe = paddle.static.Executor(paddle.NPUPlace(0))

        with self.assertRaisesRegex(RuntimeError,
                                    "NPU is not supported in ParallelExecutor"):
            exe.run(compiled_prog, feed={"x": lhs_np, "y": rhs_np})
# Run the tests in this file when executed directly as a script.
if __name__ == '__main__':
    unittest.main()
...@@ -243,7 +243,10 @@ class OpTest(unittest.TestCase): ...@@ -243,7 +243,10 @@ class OpTest(unittest.TestCase):
np.random.seed(123) np.random.seed(123)
random.seed(124) random.seed(124)
cls._use_system_allocator = _set_use_system_allocator(True) if paddle.is_compiled_with_npu():
cls._use_system_allocator = _set_use_system_allocator(False)
else:
cls._use_system_allocator = _set_use_system_allocator(True)
@classmethod @classmethod
def tearDownClass(cls): def tearDownClass(cls):
...@@ -272,6 +275,9 @@ class OpTest(unittest.TestCase): ...@@ -272,6 +275,9 @@ class OpTest(unittest.TestCase):
def is_mkldnn_op_test(): def is_mkldnn_op_test():
return hasattr(cls, "use_mkldnn") and cls.use_mkldnn == True return hasattr(cls, "use_mkldnn") and cls.use_mkldnn == True
def is_npu_op_test():
return hasattr(cls, "use_npu") and cls.use_npu == True
if not hasattr(cls, "op_type"): if not hasattr(cls, "op_type"):
raise AssertionError( raise AssertionError(
"This test do not have op_type in class attrs, " "This test do not have op_type in class attrs, "
...@@ -292,7 +298,8 @@ class OpTest(unittest.TestCase): ...@@ -292,7 +298,8 @@ class OpTest(unittest.TestCase):
and cls.op_type not in op_accuracy_white_list.NO_FP64_CHECK_GRAD_OP_LIST \ and cls.op_type not in op_accuracy_white_list.NO_FP64_CHECK_GRAD_OP_LIST \
and not hasattr(cls, 'exist_fp64_check_grad') \ and not hasattr(cls, 'exist_fp64_check_grad') \
and not is_xpu_op_test() \ and not is_xpu_op_test() \
and not is_mkldnn_op_test(): and not is_mkldnn_op_test() \
and not is_npu_op_test():
raise AssertionError( raise AssertionError(
"This test of %s op needs check_grad with fp64 precision." % "This test of %s op needs check_grad with fp64 precision." %
cls.op_type) cls.op_type)
...@@ -1183,7 +1190,8 @@ class OpTest(unittest.TestCase): ...@@ -1183,7 +1190,8 @@ class OpTest(unittest.TestCase):
# Check inplace for given op, its grad op, its grad_grad op, etc. # Check inplace for given op, its grad op, its grad_grad op, etc.
# No effect on original OpTest # No effect on original OpTest
# Currently not support ParallelExecutor on XPUPlace. # Currently not support ParallelExecutor on XPUPlace.
if not paddle.is_compiled_with_xpu(): if not paddle.is_compiled_with_xpu(
) and not paddle.is_compiled_with_npu():
self.check_inplace_output_with_place( self.check_inplace_output_with_place(
place, no_check_set=no_check_set, inplace_atol=inplace_atol) place, no_check_set=no_check_set, inplace_atol=inplace_atol)
......
...@@ -15,54 +15,39 @@ ...@@ -15,54 +15,39 @@
from __future__ import print_function from __future__ import print_function
import unittest import unittest
from op_test import OpTest
import numpy as np import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.core as core import paddle.fluid.core as core
import paddle.fluid.framework as framework import paddle.fluid.framework as framework
import warnings
import paddle
class TestStaticDeviceManage(unittest.TestCase): class TestStaticDeviceManage(unittest.TestCase):
def test_cpu_device(self): def _test_device(self, device_name, device_class):
paddle.set_device('cpu') paddle.set_device(device_name)
out1 = paddle.zeros(shape=[1, 3], dtype='float32') out1 = paddle.zeros(shape=[1, 3], dtype='float32')
out2 = paddle.ones(shape=[1, 3], dtype='float32') out2 = paddle.ones(shape=[1, 3], dtype='float32')
out3 = paddle.concat(x=[out1, out2], axis=0) out3 = paddle.concat(x=[out1, out2], axis=0)
exe = paddle.fluid.Executor()
exe = paddle.static.Executor()
exe.run(paddle.fluid.default_startup_program()) exe.run(paddle.fluid.default_startup_program())
res = exe.run(fetch_list=[out3]) res = exe.run(fetch_list=[out3])
device = paddle.get_device() device = paddle.get_device()
self.assertEqual(isinstance(exe.place, core.CPUPlace), True) self.assertEqual(isinstance(exe.place, device_class), True)
self.assertEqual(device, "cpu") self.assertEqual(device, device_name)
def test_cpu_device(self):
self._test_device("cpu", core.CPUPlace)
def test_gpu_device(self): def test_gpu_device(self):
if core.is_compiled_with_cuda(): if core.is_compiled_with_cuda():
out1 = paddle.zeros(shape=[1, 3], dtype='float32') self._test_device("gpu:0", core.CUDAPlace)
out2 = paddle.ones(shape=[1, 3], dtype='float32')
out3 = paddle.concat(x=[out1, out2], axis=0)
paddle.set_device('gpu:0')
exe = paddle.fluid.Executor()
exe.run(paddle.fluid.default_startup_program())
res = exe.run(fetch_list=[out3])
device = paddle.get_device()
self.assertEqual(isinstance(exe.place, core.CUDAPlace), True)
self.assertEqual(device, "gpu:0")
def test_xpu_device(self): def test_xpu_device(self):
if core.is_compiled_with_xpu(): if core.is_compiled_with_xpu():
out1 = paddle.zeros(shape=[1, 3], dtype='float32') self._test_device("xpu:0", core.XPUPlace)
out2 = paddle.ones(shape=[1, 3], dtype='float32')
out3 = paddle.concat(x=[out1, out2], axis=0)
paddle.set_device('xpu:0')
exe = paddle.fluid.Executor()
exe.run(paddle.fluid.default_startup_program())
res = exe.run(fetch_list=[out3])
device = paddle.get_device()
self.assertEqual(isinstance(exe.place, core.XPUPlace), True)
self.assertEqual(device, "xpu:0")
class TestImperativeDeviceManage(unittest.TestCase): class TestImperativeDeviceManage(unittest.TestCase):
......
...@@ -12,10 +12,10 @@ ...@@ -12,10 +12,10 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
# TODO: import framework api under this directory # TODO: import framework api under this directory
__all__ = [ __all__ = [
'create_parameter', 'ParamAttr', 'CPUPlace', 'CUDAPlace', 'CUDAPinnedPlace', 'create_parameter', 'ParamAttr', 'CPUPlace', 'CUDAPlace', 'CUDAPinnedPlace',
'get_default_dtype', 'set_default_dtype' 'NPUPlace', 'get_default_dtype', 'set_default_dtype'
] ]
__all__ += ['grad', 'LayerList', 'load', 'save', 'no_grad', 'DataParallel'] __all__ += ['grad', 'LayerList', 'load', 'save', 'no_grad', 'DataParallel']
...@@ -31,6 +31,7 @@ from ..fluid.layers.tensor import create_parameter #DEFINE_ALIAS ...@@ -31,6 +31,7 @@ from ..fluid.layers.tensor import create_parameter #DEFINE_ALIAS
from ..fluid.core import CPUPlace #DEFINE_ALIAS from ..fluid.core import CPUPlace #DEFINE_ALIAS
from ..fluid.core import CUDAPlace #DEFINE_ALIAS from ..fluid.core import CUDAPlace #DEFINE_ALIAS
from ..fluid.core import CUDAPinnedPlace #DEFINE_ALIAS from ..fluid.core import CUDAPinnedPlace #DEFINE_ALIAS
from ..fluid.core import NPUPlace #DEFINE_ALIAS
from ..fluid.core import VarBase #DEFINE_ALIAS from ..fluid.core import VarBase #DEFINE_ALIAS
from paddle.fluid import core #DEFINE_ALIAS from paddle.fluid import core #DEFINE_ALIAS
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册