diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 755b3bff763973143fd2102a3717a4fa85f206a0..32ceb7d7903a3ce70c60ecf80725762a48030168 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -466,6 +466,14 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx, #else PADDLE_THROW( platform::errors::Unimplemented("No XPU gc found in CPU/GPU paddle")); +#endif + } else if (platform::is_npu_place(place_)) { +#ifdef PADDLE_WITH_ASCEND_CL + // TODO(ascendrc): Support garbage collector on NPUPlace + VLOG(4) << "Skip NPU gc because it is not implemented now."; +#else + PADDLE_THROW(platform::errors::Unimplemented( + "No NPU gc found in CPU/GPU/XPU paddle")); #endif } } diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 3af0d7fcf5de5bd60f824103b1a2fcd992239261..665e7b2fcf8bc6c4082aa38d6124b1fdd8f93eff 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -1275,6 +1275,16 @@ void OperatorWithKernel::ChooseKernel(const RuntimeContext& ctx, expected_kernel_key.place_ = platform::CPUPlace(); kernel_iter = kernels.find(expected_kernel_key); } +#endif +#ifdef PADDLE_WITH_ASCEND_CL + if (kernel_iter == kernels.end() && + is_npu_place(expected_kernel_key.place_)) { + VLOG(3) << "missing NPU kernel: " << type_ + << ", expected_kernel_key:" << expected_kernel_key + << ", fallbacking to CPU one!"; + expected_kernel_key.place_ = platform::CPUPlace(); + kernel_iter = kernels.find(expected_kernel_key); + } #endif PADDLE_ENFORCE_NE(kernel_iter, kernels.end(), platform::errors::NotFound( diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index bfc3b7c70177b8b45ec328710fadc41e73d69889..af04819417868d0f9325061426d485410610a7a9 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -614,6 +614,9 @@ 
ParallelExecutor::ParallelExecutor(const std::vector &places, const BuildStrategy &build_strategy, ir::Graph *graph) : member_(new ParallelExecutorPrivate(places, scope)) { + PADDLE_ENFORCE(places.size() > 0 && !is_npu_place(places[0]), + platform::errors::Unavailable( + "NPU is not supported in ParallelExecutor")); InitP2P(places); ir::InitReaderQueueDeviceCount(graph, *(member_->global_scope_), member_->places_.size()); diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 7344bcfb6b8b27384ccc4198c55868efa4455a14..d8e79d40c23eb24671192a9bd368a25b7ba27ac4 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -101,15 +101,19 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, // TODO(zhiqiu): handle different condition like CUDA code below else if (platform::is_npu_place(src_place) && // NOLINT platform::is_cpu_place(dst_place)) { - auto stream = reinterpret_cast(ctx).stream(); + auto stream = + reinterpret_cast(ctx).stream(); memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, - BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size, stream); + BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size, + stream); } else if (platform::is_cpu_place(src_place) && // NOLINT platform::is_npu_place(dst_place)) { - auto stream = reinterpret_cast(ctx).stream(); + auto stream = + reinterpret_cast(ctx).stream(); memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, - BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size, stream); + BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size, + stream); } else if (platform::is_npu_place(src_place) && // NOLINT platform::is_npu_place(dst_place)) { @@ -118,9 +122,11 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, << dst_place; return; } - auto stream = reinterpret_cast(ctx).stream(); + auto stream = + reinterpret_cast(ctx).stream(); 
memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, - BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size, stream); + BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size, + stream); } else { // NOLINT PADDLE_THROW(platform::errors::Unimplemented( @@ -336,24 +342,27 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, #endif #ifdef PADDLE_WITH_ASCEND_CL else if (platform::is_npu_place(src_place) && // NOLINT - platform::is_cpu_place(dst_place)) { + platform::is_cpu_place(dst_place)) { /* npu -> cpu*/ memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, - BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size, nullptr); + BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size, + nullptr); } else if (platform::is_cpu_place(src_place) && // NOLINT - platform::is_npu_place(dst_place)) { + platform::is_npu_place(dst_place)) { /* cpu -> npu*/ memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, - BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size, nullptr); + BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size, + nullptr); } else if (platform::is_npu_place(src_place) && // NOLINT - platform::is_npu_place(dst_place)) { + platform::is_npu_place(dst_place)) { /* npu -> npu*/ if (src_ptr == dst_ptr) { VLOG(3) << "Skip copy the same data sync from " << src_place << " to " << dst_place; return; } memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, - BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size, nullptr); + BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size, + nullptr); } else { // NOLINT PADDLE_THROW(platform::errors::Unimplemented( diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 0fa50a8cd362b1ddaece4796bc243d751a60aab3..b3d9e22dba8d244f5e57267a527de0bbdc534996 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -154,6 +154,14 @@ bool 
IsCompiledWithXPU() { #endif } +bool IsCompiledWithNPU() { +#ifndef PADDLE_WITH_ASCEND_CL + return false; +#else + return true; +#endif +} + bool IsCompiledWithMKLDNN() { #ifndef PADDLE_WITH_MKLDNN return false; @@ -567,6 +575,10 @@ PYBIND11_MODULE(core_noavx, m) { [](Tensor &self, paddle::platform::CPUPlace &place) { self.mutable_data(place); }) + .def("_alloc_float", + [](Tensor &self, paddle::platform::NPUPlace &place) { + self.mutable_data(place); + }) .def("_alloc_double", [](Tensor &self, paddle::platform::CPUPlace &place) { self.mutable_data(place); @@ -611,6 +623,11 @@ PYBIND11_MODULE(core_noavx, m) { paddle::framework::proto::VarType::Type type) { return reinterpret_cast(self.mutable_data(place, type)); }) + .def("_mutable_data", + [](Tensor &self, paddle::platform::NPUPlace &place, + paddle::framework::proto::VarType::Type type) { + return reinterpret_cast(self.mutable_data(place, type)); + }) .def("_clear", &Tensor::clear) .def("set", SetTensorFromPyArray, py::arg("array"), py::arg("place"), py::arg("zero_copy") = false) @@ -618,6 +635,8 @@ PYBIND11_MODULE(core_noavx, m) { py::arg("array"), py::arg("place"), py::arg("zero_copy") = false) .def("set", SetTensorFromPyArray, py::arg("array"), py::arg("place"), py::arg("zero_copy") = false) + .def("set", SetTensorFromPyArray, + py::arg("array"), py::arg("place"), py::arg("zero_copy") = false) .def("set", SetTensorFromPyArray, py::arg("array"), py::arg("place"), py::arg("zero_copy") = false, R"DOC( @@ -625,7 +644,7 @@ PYBIND11_MODULE(core_noavx, m) { Args: lod (numpy.ndarray): The data to set. - place (CPUPlace|CUDAPlace|XPUPlace|CUDAPinnedPlace): The place where the + place (CPUPlace|CUDAPlace|XPUPlace|CUDAPinnedPlace|NPUPlace): The place where the LoDTensor is to be set. zero_copy (bool, optional): Whether to share memory with the input numpy array. This parameter only works with CPUPlace. Default: False. @@ -1348,6 +1367,18 @@ All parameter, weight, gradient are variables in Paddle. 
return new paddle::platform::XPUDeviceContext(place); #endif }) + .def_static("create", + [](paddle::platform::NPUPlace& place) + -> paddle::platform::DeviceContext* { +#ifndef PADDLE_WITH_ASCEND_CL + PADDLE_THROW( + platform::errors::PermissionDenied( + "Cannot use NPUPlace in CPU/GPU/XPU version, " + "Please recompile or reinstall Paddle with NPU support.")); +#else + return new paddle::platform::NPUDeviceContext(place); +#endif + }) .def_static("create", [](paddle::platform::CUDAPlace& place) -> paddle::platform::DeviceContext* { @@ -1448,6 +1479,7 @@ All parameter, weight, gradient are variables in Paddle. .def("_equals", &IsSamePlace) .def("_equals", &IsSamePlace) .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) .def("_equals", &IsSamePlace) .def("_get_device_id", @@ -1517,6 +1549,7 @@ All parameter, weight, gradient are variables in Paddle. #ifdef PADDLE_WITH_XPU m.def("get_xpu_device_count", platform::GetXPUDeviceCount); #endif + py::class_(m, "CPUPlace", R"DOC( CPUPlace is a descriptor of a device. It represents a CPU device on which a tensor will be allocated and a model will run. @@ -1532,6 +1565,7 @@ All parameter, weight, gradient are variables in Paddle. .def("_type", &PlaceIndex) .def("_equals", &IsSamePlace) .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) .def("_equals", &IsSamePlace) .def("_equals", &IsSamePlace) .def("_equals", @@ -1569,6 +1603,8 @@ All parameter, weight, gradient are variables in Paddle. &IsSamePlace) .def("_equals", &IsSamePlace) + .def("_equals", + &IsSamePlace) .def("_equals", &IsSamePlace) .def("_equals", @@ -1576,6 +1612,65 @@ All parameter, weight, gradient are variables in Paddle. .def("__repr__", string::to_string) .def("__str__", string::to_string); + // NPUPlace + py::class_(m, "NPUPlace", R"DOC( + NPUPlace is a descriptor of a device. + It represents a NPU device on which a tensor will be allocated and a model will run. + + Examples: + .. 
code-block:: python + import paddle + npu_place = paddle.NPUPlace(0) + + )DOC") + .def("__init__", + [](platform::NPUPlace &self, int dev_id) { +#ifdef PADDLE_WITH_ASCEND_CL + if (UNLIKELY(dev_id < 0)) { + LOG(ERROR) << string::Sprintf( + "Invalid NPUPlace(%d), device id must be 0 or " + "positive integer", + dev_id); + std::exit(-1); + } + if (UNLIKELY(dev_id >= platform::GetNPUDeviceCount())) { + if (platform::GetNPUDeviceCount() == 0) { + LOG(ERROR) << "Cannot use NPU because there is no NPU " + "detected on your " + "machine."; + std::exit(-1); + } else { + LOG(ERROR) << string::Sprintf( + "Invalid NPUPlace(%d), must inside [0, %d), because NPU " + "number on your machine is %d", + dev_id, platform::GetNPUDeviceCount(), + platform::GetNPUDeviceCount()); + std::exit(-1); + } + } + new (&self) platform::NPUPlace(dev_id); +#else + LOG(ERROR) << string::Sprintf( + "Cannot use NPU because you have installed CPU/GPU version " + "PaddlePaddle.\n" + "If you want to use NPU, please try to install NPU version " + "PaddlePaddle by: pip install paddlepaddle-npu\n" + "If you only have CPU, please change NPUPlace(%d) to be " + "CPUPlace().\n", + dev_id); + std::exit(-1); +#endif + }) + .def("_type", &PlaceIndex) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", + &IsSamePlace) + .def("__str__", string::to_string); + py::class_(m, "Place") .def(py::init<>()) .def("_type", &PlaceIndex) @@ -1583,6 +1678,7 @@ All parameter, weight, gradient are variables in Paddle. .def("_equals", &IsSamePlace) .def("_equals", &IsSamePlace) .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) .def("_equals", &IsSamePlace) .def("is_gpu_place", [](platform::Place &self) { return platform::is_gpu_place(self); }) @@ -1590,6 +1686,8 @@ 
[](platform::Place &self) { return platform::is_cpu_place(self); }) .def("is_xpu_place", [](platform::Place &self) { return platform::is_xpu_place(self); }) + .def("is_npu_place", + [](platform::Place &self) { return platform::is_npu_place(self); }) .def("is_cuda_pinned_place", [](platform::Place &self) { return platform::is_cuda_pinned_place(self); @@ -1602,6 +1700,10 @@ All parameter, weight, gradient are variables in Paddle. [](platform::Place &self) { return BOOST_GET_CONST(platform::XPUPlace, self).device; }) + .def("npu_device_id", + [](platform::Place &self) { + return BOOST_GET_CONST(platform::NPUPlace, self).device; + }) .def("set_place", [](platform::Place &self, const platform::Place &other) { self = other; }) .def("set_place", @@ -1621,6 +1723,10 @@ All parameter, weight, gradient are variables in Paddle. const platform::CUDAPinnedPlace &cuda_pinned_place) { self = cuda_pinned_place; }) + .def("set_place", + [](platform::Place &self, const platform::NPUPlace &npu_place) { + self = npu_place; + }) .def("__repr__", string::to_string) .def("__str__", string::to_string); @@ -1645,6 +1751,9 @@ All parameter, weight, gradient are variables in Paddle. .def("run", [](OperatorBase &self, const Scope &scope, const platform::XPUPlace &place) { self.Run(scope, place); }) + .def("run", + [](OperatorBase &self, const Scope &scope, + const platform::NPUPlace &place) { self.Run(scope, place); }) .def("run", [](OperatorBase &self, const Scope &scope, const platform::CUDAPlace &place) { self.Run(scope, place); }) @@ -1745,6 +1854,7 @@ All parameter, weight, gradient are variables in Paddle. 
m.def("is_compiled_with_cuda", IsCompiledWithCUDA); m.def("is_compiled_with_ascend", IsCompiledWithAscend); + m.def("is_compiled_with_npu", IsCompiledWithNPU); m.def("is_compiled_with_xpu", IsCompiledWithXPU); m.def("is_compiled_with_mkldnn", IsCompiledWithMKLDNN); m.def("supports_bfloat16", SupportsBfloat16); diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index 7e60c98dc183276836fba4024b5b1d6ec3a865a5..51fc3439c9a5998c77478089d52027848c915a03 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -285,6 +285,22 @@ void SetTensorFromPyArrayT( PADDLE_THROW(platform::errors::PermissionDenied( "Cannot use XPUPlace in CPU/GPU version, " "Please recompile or reinstall Paddle with XPU support.")); +#endif + } else if (paddle::platform::is_npu_place(place)) { +#ifdef PADDLE_WITH_ASCEND_CL + platform::Place tmp_place = place; + platform::NPUDeviceGuard guard( + BOOST_GET_CONST(platform::NPUPlace, tmp_place).device); + auto dst = self->mutable_data(place); + platform::NPUMemcpySync(dst, array.data(), array.nbytes(), + ACL_MEMCPY_HOST_TO_DEVICE); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &ctx = *pool.Get(place); + ctx.Wait(); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Cannot use NPUPlace in CPU/GPU/XPU version. 
" + "Please recompile or reinstall Paddle with NPU support.")); #endif } else { #ifdef PADDLE_WITH_CUDA diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 50043a9b3cf4ffdcb2bad720a4491cfe8e6ac620..2c13e6da83b619671fd9cf46755389d9684c4483 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -232,6 +232,7 @@ from .framework import ParamAttr #DEFINE_ALIAS from .framework import create_parameter #DEFINE_ALIAS from .framework import CPUPlace #DEFINE_ALIAS from .framework import CUDAPlace #DEFINE_ALIAS +from .framework import NPUPlace #DEFINE_ALIAS from .framework import CUDAPinnedPlace #DEFINE_ALIAS from .framework import grad #DEFINE_ALIAS @@ -256,6 +257,7 @@ from .device import set_device from .device import get_device from .device import is_compiled_with_cuda #DEFINE_ALIAS from .device import is_compiled_with_xpu +from .device import is_compiled_with_npu from .device import XPUPlace # from .tensor.tensor import Tensor #DEFINE_ALIAS # from .tensor.tensor import LoDTensor #DEFINE_ALIAS diff --git a/python/paddle/device.py b/python/paddle/device.py index 2beb92f2c3a75f52fe29ce2408433efec0354bd9..d0bca3df8963bee0e8439e50ce8da6f32205a815 100644 --- a/python/paddle/device.py +++ b/python/paddle/device.py @@ -32,12 +32,28 @@ __all__ = [ # 'cuda_places', # 'CUDAPinnedPlace', # 'CUDAPlace', - 'is_compiled_with_cuda' + 'is_compiled_with_cuda', + 'is_compiled_with_npu' ] _cudnn_version = None +def is_compiled_with_npu(): + """ + Whether this whl package can be used to run the model on NPU. + + Returns (bool): `True` if NPU is supported, otherwise `False`. + + Examples: + .. 
code-block:: python + + import paddle + support_npu = paddle.is_compiled_with_npu() + """ + return core.is_compiled_with_npu() + + def is_compiled_with_xpu(): """ Whether paddle was built with WITH_XPU=ON to support Baidu Kunlun @@ -163,6 +179,7 @@ def set_device(device): device_id = device_info_list[1] device_id = int(device_id) place = core.XPUPlace(device_id) + framework._set_expected_place(place) return place diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 1a88d3512eaaaaa9e05a70231d169449c87f8ca5..5c6ce1dc17abd904ab179e38e25db71a5a84d71a 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -68,7 +68,8 @@ from .input import embedding, one_hot from . import distribute_lookup_table from .param_attr import ParamAttr, WeightNormParamAttr from .data_feeder import DataFeeder -from .core import LoDTensor, LoDTensorArray, CPUPlace, XPUPlace, CUDAPlace, CUDAPinnedPlace, Scope, _Scope +from .core import LoDTensor, LoDTensorArray, Scope, _Scope +from .core import CPUPlace, XPUPlace, CUDAPlace, CUDAPinnedPlace, NPUPlace from .incubate import fleet from .incubate import data_generator from .transpiler import DistributeTranspiler, \ @@ -124,6 +125,7 @@ __all__ = framework.__all__ + executor.__all__ + \ 'XPUPlace', 'CUDAPlace', 'CUDAPinnedPlace', + 'NPUPlace', 'Tensor', 'ParamAttr', 'WeightNormParamAttr', diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 9b0b04a6ea7164501ff5af6caace2bd4a6315cf4..acbd00c336bc380c5346040711a23342ee12eb5b 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -1213,6 +1213,7 @@ class Executor(object): # In distributed training, the compiled program is saved in Program._graph has_compiled_graph = isinstance(program._graph, compiler.CompiledProgram) + if has_compiled_graph: program._graph._compile(scope, self.place) # _graph in program does not support inference since the _graph is optimized diff --git 
a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 08ea46e69619ac50384b5a7439cd1b931ad10ddb..e17527a329309327addd43b7763879cf70b72033 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -5854,7 +5854,7 @@ def _get_paddle_place(place): if place is None: return place if isinstance(place, (core.Place, core.XPUPlace, core.CPUPlace, - core.CUDAPinnedPlace, core.CUDAPlace)): + core.CUDAPinnedPlace, core.CUDAPlace, core.NPUPlace)): return place if not isinstance(place, str): @@ -5864,9 +5864,11 @@ def _get_paddle_place(place): place = place.lower() if (place == "cpu"): return core.CPUPlace() + if (place == "device"): return core.Place() + # GPU avaliable_gpu_place = re.match(r'gpu:\d+', place) if place == "gpu_pinned" or place == "gpu" or avaliable_gpu_place: if not core.is_compiled_with_cuda(): @@ -5882,6 +5884,8 @@ def _get_paddle_place(place): device_id = place_info_list[1] device_id = int(device_id) return core.CUDAPlace(device_id) + + # XPU avaliable_xpu_place = re.match(r'xpu:\d+', place) if avaliable_xpu_place: if not core.is_compiled_with_xpu(): @@ -5892,9 +5896,22 @@ def _get_paddle_place(place): device_id = place_info_list[1] device_id = int(device_id) return core.XPUPlace(device_id) + + # NPU + avaliable_npu_place = re.match(r'npu:\d+', place) + if avaliable_npu_place: + if not core.is_compiled_with_npu(): + raise ValueError( + "The device should not be {}, since PaddlePaddle is " \ + "not compiled with NPU".format(avaliable_npu_place)) + place_info_list = place.split(':', 1) + device_id = place_info_list[1] + device_id = int(device_id) + return core.NPUPlace(device_id) + raise ValueError( - "paddle support CPUPlace, CUDAPlace,CUDAPinnedPlace and XPUPlace, Please check your Place Input" - ) + "Paddle supports CPUPlace, CUDAPlace,CUDAPinnedPlace, XPUPlace and NPUPlace, but received {}.". 
+ format(place)) def _get_paddle_place_list(places): diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 1f4648f7963811e50d890a60aa10a399e7681f84..93c15ba6cb77949ddfb2ba17f837cc419b1366dc 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -608,6 +608,10 @@ if (WITH_XPU) add_subdirectory(xpu) endif() +if (WITH_ASCEND_CL) + add_subdirectory(npu) +endif() + if (WITH_MKLDNN) add_subdirectory(mkldnn) endif() diff --git a/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt b/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..f71e04c09aa38b8cf7b3a167b84d4dc0e6cc3ec7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt @@ -0,0 +1,6 @@ +file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") +string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") + +foreach(TEST_OP ${TEST_OPS}) + py_test_modules(${TEST_OP} MODULES ${TEST_OP}) +endforeach(TEST_OP) diff --git a/python/paddle/fluid/tests/unittests/npu/test_elementwise_add_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_elementwise_add_op_npu.py new file mode 100644 index 0000000000000000000000000000000000000000..47da4fdb23ec49924fbfb1b5cc4b02e2355d287e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_elementwise_add_op_npu.py @@ -0,0 +1,162 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest, _set_use_system_allocator +import paddle +import paddle.fluid as fluid + +paddle.enable_static() + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestElementwiseAddOp(OpTest): + def setUp(self): + self.set_npu() + self.op_type = "elementwise_add" + self.place = paddle.NPUPlace(0) + self.init_dtype() + self.init_input_output() + self.init_kernel_type() + self.init_axis() + + self.inputs = { + 'X': OpTest.np_dtype_to_fluid_dtype(self.x), + 'Y': OpTest.np_dtype_to_fluid_dtype(self.y) + } + self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn} + self.outputs = {'Out': self.out} + + def set_npu(self): + self.__class__.use_npu = True + + def init_kernel_type(self): + self.use_mkldnn = False + + def init_input_output(self): + self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) + self.y = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) + self.out = np.add(self.x, self.y) + + def init_dtype(self): + self.dtype = np.float32 + + def init_axis(self): + self.axis = -1 + + def test_check_output(self): + self.check_output_with_place(self.place, check_dygraph=False) + + # TODO(ascendrc): Test grad op after it is implemented. 
+ # def test_check_grad_normal(self): + # self.check_grad_with_place( + # self.place, ['X', 'Y'], + # 'Out', + # max_relative_error=0.006, + # check_dygraph=False) + # + # def test_check_grad_ingore_x(self): + # self.check_grad_with_place( + # self.place, ['Y'], + # 'Out', + # no_grad_set=set("X"), + # max_relative_error=0.006, + # check_dygraph=False) + # + # def test_check_grad_ingore_y(self): + # self.check_grad_with_place( + # self.place, ['X'], + # 'Out', + # no_grad_set=set("Y"), + # max_relative_error=0.006,check_dygraph=False) + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestAddAPI(unittest.TestCase): + def test_name(self): + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.static.data(name="x", shape=[2, 3], dtype="float32") + y = paddle.static.data(name='y', shape=[2, 3], dtype='float32') + + y_1 = paddle.add(x, y, name='add_res') + self.assertEqual(('add_res' in y_1.name), True) + + def test_static(self): + with paddle.static.program_guard(paddle.static.Program()): + + x_np = np.array([2, 3, 4]).astype('float32') + y_np = np.array([1, 5, 2]).astype('float32') + + x = paddle.static.data(name="x", shape=[3], dtype='float32') + y = paddle.static.data(name="y", shape=[3], dtype='float32') + + x_reshape = paddle.reshape(x, [3, 1]) + y_reshape = paddle.reshape(y, [3, 1]) + z = paddle.add(x_reshape, y_reshape) + z = paddle.reshape(z, shape=[3]) + + place = paddle.NPUPlace(0) + exe = paddle.static.Executor(place) + x_value, y_value, z_value = exe.run(feed={"x": x_np, + "y": y_np}, + fetch_list=[x, y, z]) + + z_expected = np.array([3., 8., 6.]) + self.assertEqual( + (x_value == x_np).all(), + True, + msg="x_value = {}, but expected {}".format(x_value, x_np)) + self.assertEqual( + (y_value == y_np).all(), + True, + msg="y_value = {}, but expected {}".format(y_value, y_np)) + self.assertEqual( + (z_value == z_expected).all(), + True, + msg="z_value = {}, but expected {}".format(z_value, 
z_expected)) + + def test_backward(self): + # TODO(ascendrc): Test backward after add grad npu op implemented. + pass + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestAddError(unittest.TestCase): + def test_errors(self): + with paddle.static.program_guard(paddle.static.Program()): + # the input of elementwise_add must be Variable. + x1 = fluid.create_lod_tensor( + np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.NPUPlace(0)) + y1 = fluid.create_lod_tensor( + np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.NPUPlace(0)) + self.assertRaises(TypeError, paddle.add, x1, y1) + + # the input dtype must be float16 or float32 or float64 or int32 or int64 + x2 = paddle.static.data( + name='x2', shape=[3, 4, 5, 6], dtype="uint8") + y2 = paddle.static.data( + name='y2', shape=[3, 4, 5, 6], dtype="uint8") + self.assertRaises(TypeError, paddle.add, x2, y2) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_elementwise_sub_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_elementwise_sub_op_npu.py new file mode 100644 index 0000000000000000000000000000000000000000..8c6c7b46f49f2725b646202998095adef3a65e63 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_elementwise_sub_op_npu.py @@ -0,0 +1,224 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid + +paddle.enable_static() + +SEED = 2021 + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestElementwiseSubOp(OpTest): + def setUp(self): + self.set_npu() + self.op_type = "elementwise_sub" + self.place = paddle.NPUPlace(0) + self.init_dtype() + self.init_input_output() + self.init_kernel_type() + self.init_axis() + + self.inputs = { + 'X': OpTest.np_dtype_to_fluid_dtype(self.x), + 'Y': OpTest.np_dtype_to_fluid_dtype(self.y) + } + self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn} + self.outputs = {'Out': self.out} + + def set_npu(self): + self.__class__.use_npu = True + + def init_kernel_type(self): + self.use_mkldnn = False + + def init_input_output(self): + self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) + self.y = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) + self.out = np.subtract(self.x, self.y) + + def init_dtype(self): + self.dtype = np.float32 + + def init_axis(self): + self.axis = 0 + + def test_check_output(self): + self.check_output_with_place(self.place, check_dygraph=False) + + # TODO(ascendrc): For grad tests, OpTest raises FatalError:Segmentation fault + # when call op.run, which may be caused by system environment exception + # and the exact cause has not be located. 
+ # def test_check_grad_normal(self): + # self.check_grad_with_place( + # self.place, ['X', 'Y'], + # 'Out', + # max_relative_error=0.006, + # check_dygraph=False) + # + # def test_check_grad_ingore_x(self): + # self.check_grad_with_place( + # self.place, ['Y'], + # 'Out', + # no_grad_set=set("X"), + # max_relative_error=0.006, + # check_dygraph=False) + # + # def test_check_grad_ingore_y(self): + # self.check_grad_with_place( + # self.place, ['X'], + # 'Out', + # no_grad_set=set("Y"), + # max_relative_error=0.006,check_dygraph=False) + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestSubtractAPI(unittest.TestCase): + def test_name(self): + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.static.data(name="x", shape=[2, 3], dtype="float32") + y = paddle.static.data(name='y', shape=[2, 3], dtype='float32') + + y_1 = paddle.subtract(x, y, name='add_res') + self.assertEqual(('add_res' in y_1.name), True) + + def test_static(self): + with paddle.static.program_guard(paddle.static.Program()): + + x_np = np.array([2, 3, 4]).astype('float32') + y_np = np.array([1, 5, 2]).astype('float32') + + x = paddle.static.data(name="x", shape=[3], dtype='float32') + y = paddle.static.data(name="y", shape=[3], dtype='float32') + + x_reshape = paddle.reshape(x, [3, 1]) + y_reshape = paddle.reshape(y, [3, 1]) + z = paddle.subtract(x_reshape, y_reshape) + z = paddle.reshape(z, shape=[3]) + + place = paddle.NPUPlace(0) + exe = paddle.static.Executor(place) + x_value, y_value, z_value = exe.run(feed={"x": x_np, + "y": y_np}, + fetch_list=[x, y, z]) + + z_expected = np.array([1., -2., 2.]) + self.assertEqual( + (x_value == x_np).all(), + True, + msg="x_value = {}, but expected {}".format(x_value, x_np)) + self.assertEqual( + (y_value == y_np).all(), + True, + msg="y_value = {}, but expected {}".format(y_value, y_np)) + self.assertEqual( + (z_value == z_expected).all(), + True, + msg="z_value = {}, but expected 
{}".format(z_value, z_expected)) + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestSubtractError(unittest.TestCase): + def test_errors(self): + with paddle.static.program_guard(paddle.static.Program()): + # the input of elementwise_add must be Variable. + x1 = fluid.create_lod_tensor( + np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.NPUPlace(0)) + y1 = fluid.create_lod_tensor( + np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.NPUPlace(0)) + self.assertRaises(TypeError, paddle.subtract, x1, y1) + + # the input dtype must be float16 or float32 or float64 or int32 or int64 + x2 = paddle.static.data( + name='x2', shape=[3, 4, 5, 6], dtype="uint8") + y2 = paddle.static.data( + name='y2', shape=[3, 4, 5, 6], dtype="uint8") + self.assertRaises(TypeError, paddle.subtract, x2, y2) + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestSubtractNet(unittest.TestCase): + def _test(self, run_npu=True): + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = SEED + startup_prog.random_seed = SEED + np.random.seed(SEED) + + a_np = np.random.random(size=(32, 32)).astype('float32') + b_np = np.random.random(size=(32, 32)).astype('float32') + label_np = np.random.randint(2, size=(32, 1)).astype('int64') + + with paddle.static.program_guard(main_prog, startup_prog): + a = paddle.static.data(name="a", shape=[32, 32], dtype='float32') + b = paddle.static.data(name="b", shape=[32, 32], dtype='float32') + label = paddle.static.data( + name="label", shape=[32, 1], dtype='int64') + + sum = paddle.add(a, b) + c = paddle.assign(b) + z = paddle.subtract(sum, c) + + fc_1 = fluid.layers.fc(input=z, size=128) + prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') + + cost = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.reduce_mean(cost) + sgd = fluid.optimizer.SGD(learning_rate=0.01) + sgd.minimize(loss) 
+ + if run_npu: + place = paddle.NPUPlace(0) + else: + place = paddle.CPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + for epoch in range(100): + + pred_res, loss_res = exe.run( + main_prog, + feed={"a": a_np, + "b": b_np, + "label": label_np}, + fetch_list=[prediction, loss]) + if epoch % 10 == 0: + print("Epoch {} | Prediction[0]: {}, Loss: {}".format( + epoch, pred_res[0], loss_res)) + + return pred_res, loss_res + + def test_npu(self): + npu_pred, npu_loss = self._test(True) + cpu_pred, cpu_loos = self._test(False) + + self.assertTrue(np.allclose(npu_pred, cpu_pred)) + self.assertTrue(np.allclose(npu_loss, cpu_loos)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_npu_place.py b/python/paddle/fluid/tests/unittests/npu/test_npu_place.py new file mode 100644 index 0000000000000000000000000000000000000000..3f71fad2b9c1084148d8b0a28e556cc0bf5f366e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_npu_place.py @@ -0,0 +1,61 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import paddle +import numpy as np +from paddle.fluid import core + +paddle.enable_static() + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestNpuPlace(unittest.TestCase): + def test(self): + p = core.Place() + p.set_place(paddle.NPUPlace(0)) + + self.assertTrue(p.is_npu_place()) + self.assertEqual(p.npu_device_id(), 0) + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestNpuPlaceError(unittest.TestCase): + def test_static(self): + # NPU is not supported in ParallelExecutor + prog = paddle.static.Program() + with paddle.static.program_guard(prog): + + x_np = np.array([2, 3, 4]).astype('float32') + y_np = np.array([1, 5, 2]).astype('float32') + + x = paddle.static.data(name="x", shape=[3], dtype='float32') + y = paddle.static.data(name="y", shape=[3], dtype='float32') + z = paddle.add(x, y) + + compiled_prog = paddle.static.CompiledProgram(prog) + place = paddle.NPUPlace(0) + exe = paddle.static.Executor(place) + + with self.assertRaisesRegex(RuntimeError, + "NPU is not supported in ParallelExecutor"): + exe.run(compiled_prog, feed={"x": x_np, "y": y_np}) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index e3e84a73301a0b296e3cb4cdf951b6f8c845bfff..efce2e770b1b16d1d70c86211b396b0a060b520e 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -243,7 +243,10 @@ class OpTest(unittest.TestCase): np.random.seed(123) random.seed(124) - cls._use_system_allocator = _set_use_system_allocator(True) + if paddle.is_compiled_with_npu(): + cls._use_system_allocator = _set_use_system_allocator(False) + else: + cls._use_system_allocator = _set_use_system_allocator(True) @classmethod def tearDownClass(cls): @@ -272,6 +275,9 @@ class 
OpTest(unittest.TestCase): def is_mkldnn_op_test(): return hasattr(cls, "use_mkldnn") and cls.use_mkldnn == True + def is_npu_op_test(): + return hasattr(cls, "use_npu") and cls.use_npu == True + if not hasattr(cls, "op_type"): raise AssertionError( "This test do not have op_type in class attrs, " @@ -292,7 +298,8 @@ class OpTest(unittest.TestCase): and cls.op_type not in op_accuracy_white_list.NO_FP64_CHECK_GRAD_OP_LIST \ and not hasattr(cls, 'exist_fp64_check_grad') \ and not is_xpu_op_test() \ - and not is_mkldnn_op_test(): + and not is_mkldnn_op_test() \ + and not is_npu_op_test(): raise AssertionError( "This test of %s op needs check_grad with fp64 precision." % cls.op_type) @@ -1183,7 +1190,8 @@ class OpTest(unittest.TestCase): # Check inplace for given op, its grad op, its grad_grad op, etc. # No effect on original OpTest # Currently not support ParallelExecutor on XPUPlace. - if not paddle.is_compiled_with_xpu(): + if not paddle.is_compiled_with_xpu( + ) and not paddle.is_compiled_with_npu(): self.check_inplace_output_with_place( place, no_check_set=no_check_set, inplace_atol=inplace_atol) diff --git a/python/paddle/fluid/tests/unittests/test_device.py b/python/paddle/fluid/tests/unittests/test_device.py index 195337e80defa930a67d6d9e08dc585d07cdb6fa..08697a080445e606f17bdde83384eef391713721 100644 --- a/python/paddle/fluid/tests/unittests/test_device.py +++ b/python/paddle/fluid/tests/unittests/test_device.py @@ -15,54 +15,39 @@ from __future__ import print_function import unittest -from op_test import OpTest -import numpy as np +import paddle import paddle.fluid as fluid import paddle.fluid.core as core import paddle.fluid.framework as framework -import warnings -import paddle class TestStaticDeviceManage(unittest.TestCase): - def test_cpu_device(self): - paddle.set_device('cpu') + def _test_device(self, device_name, device_class): + paddle.set_device(device_name) + out1 = paddle.zeros(shape=[1, 3], dtype='float32') out2 = paddle.ones(shape=[1, 3], 
dtype='float32') out3 = paddle.concat(x=[out1, out2], axis=0) - exe = paddle.fluid.Executor() + + exe = paddle.static.Executor() exe.run(paddle.fluid.default_startup_program()) res = exe.run(fetch_list=[out3]) + device = paddle.get_device() - self.assertEqual(isinstance(exe.place, core.CPUPlace), True) - self.assertEqual(device, "cpu") + self.assertEqual(isinstance(exe.place, device_class), True) + self.assertEqual(device, device_name) + + def test_cpu_device(self): + self._test_device("cpu", core.CPUPlace) def test_gpu_device(self): if core.is_compiled_with_cuda(): - out1 = paddle.zeros(shape=[1, 3], dtype='float32') - out2 = paddle.ones(shape=[1, 3], dtype='float32') - out3 = paddle.concat(x=[out1, out2], axis=0) - paddle.set_device('gpu:0') - exe = paddle.fluid.Executor() - exe.run(paddle.fluid.default_startup_program()) - res = exe.run(fetch_list=[out3]) - device = paddle.get_device() - self.assertEqual(isinstance(exe.place, core.CUDAPlace), True) - self.assertEqual(device, "gpu:0") + self._test_device("gpu:0", core.CUDAPlace) def test_xpu_device(self): if core.is_compiled_with_xpu(): - out1 = paddle.zeros(shape=[1, 3], dtype='float32') - out2 = paddle.ones(shape=[1, 3], dtype='float32') - out3 = paddle.concat(x=[out1, out2], axis=0) - paddle.set_device('xpu:0') - exe = paddle.fluid.Executor() - exe.run(paddle.fluid.default_startup_program()) - res = exe.run(fetch_list=[out3]) - device = paddle.get_device() - self.assertEqual(isinstance(exe.place, core.XPUPlace), True) - self.assertEqual(device, "xpu:0") + self._test_device("xpu:0", core.XPUPlace) class TestImperativeDeviceManage(unittest.TestCase): diff --git a/python/paddle/framework/__init__.py b/python/paddle/framework/__init__.py index f2b6888d7a753d0e0bda7a0c01139e0f222a89cb..5a616d81659b2361266542ca9e6f5ce40503506d 100644 --- a/python/paddle/framework/__init__.py +++ b/python/paddle/framework/__init__.py @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # 
limitations under the License. -# TODO: import framework api under this directory +# TODO: import framework api under this directory __all__ = [ 'create_parameter', 'ParamAttr', 'CPUPlace', 'CUDAPlace', 'CUDAPinnedPlace', - 'get_default_dtype', 'set_default_dtype' + 'NPUPlace', 'get_default_dtype', 'set_default_dtype' ] __all__ += ['grad', 'LayerList', 'load', 'save', 'no_grad', 'DataParallel'] @@ -31,6 +31,7 @@ from ..fluid.layers.tensor import create_parameter #DEFINE_ALIAS from ..fluid.core import CPUPlace #DEFINE_ALIAS from ..fluid.core import CUDAPlace #DEFINE_ALIAS from ..fluid.core import CUDAPinnedPlace #DEFINE_ALIAS +from ..fluid.core import NPUPlace #DEFINE_ALIAS from ..fluid.core import VarBase #DEFINE_ALIAS from paddle.fluid import core #DEFINE_ALIAS