Unverified commit 6aea6be2, authored by houj04, committed by GitHub

[NPU] support dygraph execution on NPU place (#33579)

* In an NPU environment, fall back to CPUPlace for operators that have no NPU kernel.

* Fix a TensorCopy bug and add a unit test.

* Fix code style.

* Add more unit tests.
Parent 049dd853
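Taken together, the hunks below wire NPUPlace through dygraph: tensor copy, kernel dispatch with CPU fallback, garbage collection, and the Python bindings. A minimal usage sketch, assuming a build compiled with PADDLE_WITH_ASCEND_CL and one visible Ascend device:

```python
# Minimal sketch, assuming a Paddle build compiled with PADDLE_WITH_ASCEND_CL
# and at least one Ascend device visible; matmul here is illustrative.
import paddle
import paddle.fluid.core as core

if core.is_compiled_with_npu():
    with paddle.fluid.dygraph.guard(core.NPUPlace(0)):
        x = paddle.to_tensor([[1.0, 2.0], [3.0, 4.0]])
        y = paddle.matmul(x, x)   # runs the NPU kernel when one is registered
        print(y.numpy())
```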
```diff
@@ -278,7 +278,7 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
                 Tensor* dst) {
   platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
   const platform::DeviceContext* dev_ctx;
-  if (platform::is_gpu_place(dst_place)) {
+  if (platform::is_gpu_place(dst_place) || platform::is_npu_place(dst_place)) {
     dev_ctx = pool.Get(dst_place);
   } else {
     dev_ctx = pool.Get(src.place());
```
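The TensorCopy fix mirrors the GPU branch: when the destination is an NPU place, the copy must run on the destination's device context rather than the source's (previously only GPU destinations were handled, so a host-to-NPU copy picked the CPU context). A hedged round-trip check at the Python level, using the `_copy_to` binding added later in this diff:

```python
# Hedged round-trip through TensorCopy (assumes an NPU build); `_copy_to`
# is the VarBase binding shown further down in this diff.
import paddle
import paddle.fluid.core as core

if core.is_compiled_with_npu():
    x = paddle.to_tensor([[1.0, 2.0], [3.0, 4.0]])      # on the default place
    x_npu = x._copy_to(core.NPUPlace(0), True)          # blocking H2D copy
    x_back = x_npu._copy_to(paddle.CPUPlace(), True)    # blocking D2H copy
    assert (x.numpy() == x_back.numpy()).all()
```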
```diff
@@ -131,6 +131,13 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
     expected_kernel_key.place_ = platform::CPUPlace();
     kernel_iter = kernels.find(expected_kernel_key);
   }
+#endif
+#ifdef PADDLE_WITH_ASCEND_CL
+  if (kernel_iter == kernels.end() &&
+      is_npu_place(expected_kernel_key.place_)) {
+    expected_kernel_key.place_ = platform::CPUPlace();
+    kernel_iter = kernels.find(expected_kernel_key);
+  }
 #endif
   // TODO(jiabin): Add operator.cc's line 1000 part back when we need that case
   PADDLE_ENFORCE_NE(kernel_iter, kernels.end(),
```
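This is the dygraph side of the commit message's first bullet: if kernel lookup fails and the expected kernel key points at an NPU place, `PrepareImpl` rewrites the key to CPUPlace and retries, so operators without NPU kernels run on the CPU instead of erroring out. An illustrative sketch (which ops actually lack NPU kernels depends on the build):

```python
# Illustrative sketch: under an NPU place, an op with no registered NPU
# kernel is re-dispatched to its CPU kernel instead of raising an error.
import paddle
import paddle.fluid.core as core

if core.is_compiled_with_npu():
    with paddle.fluid.dygraph.guard(core.NPUPlace(0)):
        x = paddle.to_tensor([3.0, 1.0, 2.0])
        # Even if, say, argsort has no NPU kernel in this build, PrepareImpl
        # falls back to the CPUPlace kernel and execution proceeds.
        y = paddle.argsort(x)
        print(y.numpy())
```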
```diff
@@ -120,6 +120,17 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists(
       gc.reset(new framework::CPUGarbageCollector(
           BOOST_GET_CONST(platform::CPUPlace, place), 0));
       VLOG(10) << "Created GarbageCollector at " << place;
+    } else if (platform::is_npu_place(place)) {
+#if defined(PADDLE_WITH_ASCEND_CL)
+      // TODO(zhiqiu): fix bugs and enable NPUDefaultStreamGarbageCollector.
+      gc.reset(new framework::NPUUnsafeFastGarbageCollector(
+          BOOST_GET_CONST(platform::NPUPlace, place), 0));
+      VLOG(10) << "Created GarbageCollector at " << place;
+#else
+      PADDLE_THROW(platform::errors::PermissionDenied(
+          "Paddle can't use NPU device since it's not compiled with NPU, "
+          "please recompile or reinstall Paddle with NPU support."));
+#endif
     } else {
       PADDLE_THROW(platform::errors::PreconditionNotMet(
           "Unsupported place for garbage collection"));
```
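The tracer now registers an NPUUnsafeFastGarbageCollector for NPU places (the stream-ordered default collector stays disabled per the TODO), so temporaries created while tracing on an NPU are reclaimed. A rough sketch of the kind of workload this serves, assuming an NPU build:

```python
# Rough sketch (assumes an NPU build): intermediate buffers created in a
# dygraph loop on the NPU are reclaimed by the collector registered above
# rather than accumulating.
import paddle
import paddle.fluid.core as core

if core.is_compiled_with_npu():
    with paddle.fluid.dygraph.guard(core.NPUPlace(0)):
        x = paddle.to_tensor([1.0] * 1024)
        for _ in range(100):
            x = x * 1.01   # each iteration drops the previous buffer
        print(float(x.sum()))
```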
```diff
@@ -135,12 +135,14 @@ static const platform::Place PyObjectToPlace(const py::object &place_obj) {
     return place_obj.cast<platform::XPUPlace>();
   } else if (py::isinstance<platform::CUDAPinnedPlace>(place_obj)) {
     return place_obj.cast<platform::CUDAPinnedPlace>();
+  } else if (py::isinstance<platform::NPUPlace>(place_obj)) {
+    return place_obj.cast<platform::NPUPlace>();
   } else if (py::isinstance<platform::Place>(place_obj)) {
     return place_obj.cast<platform::Place>();
   } else {
     PADDLE_THROW(platform::errors::InvalidArgument(
         "Place should be one of "
-        "Place/CPUPlace/XPUPlace/CUDAPlace/CUDAPinnedPlace"));
+        "Place/CPUPlace/XPUPlace/CUDAPlace/CUDAPinnedPlace/NPUPlace"));
   }
 }
```
```diff
@@ -172,9 +174,13 @@ static void InitTensorForVarBase(imperative::VarBase *self,
     SetTensorFromPyArray<platform::CUDAPinnedPlace>(
         tensor, array, BOOST_GET_CONST(platform::CUDAPinnedPlace, place),
         zero_copy);
+  } else if (platform::is_npu_place(place)) {
+    SetTensorFromPyArray<platform::NPUPlace>(
+        tensor, array, BOOST_GET_CONST(platform::NPUPlace, place), zero_copy);
   } else {
     PADDLE_THROW(platform::errors::InvalidArgument(
-        "Place should be one of CPUPlace/XPUPlace/CUDAPlace/CUDAPinnedPlace"));
+        "Place should be one of "
+        "CPUPlace/XPUPlace/CUDAPlace/CUDAPinnedPlace/NPUPlace"));
   }
   if (stop_gradient != -1) {
     self->SetOverridedStopGradient(stop_gradient);
```
```diff
@@ -718,6 +724,10 @@ void BindImperative(py::module *m_ptr) {
            py::arg("value"), py::arg("place"), py::arg("persistable") = false,
            py::arg("zero_copy") = false, py::arg("name") = "",
            py::arg("stop_gradient") = -1)
+      .def("__init__", &InitVarBaseFromNumpyWithArg<platform::NPUPlace>,
+           py::arg("value"), py::arg("place"), py::arg("persistable") = false,
+           py::arg("zero_copy") = false, py::arg("name") = "",
+           py::arg("stop_gradient") = -1)
       .def("__init__", &InitVarBaseFromNumpyWithArgDefault, py::arg("value"))
       .def("__init__", &InitVarBaseFromTensorWithArgDefault, py::arg("tensor"))
       .def("__init__", &InitVarBaseFromNumpyWithKwargs)
```
```diff
@@ -1452,6 +1462,16 @@ void BindImperative(py::module *m_ptr) {
             return new_var;
           },
           py::return_value_policy::copy)
+      .def("_copy_to",
+           [](const std::shared_ptr<imperative::VarBase> &self,
+              const platform::NPUPlace &place, bool blocking) {
+             auto new_var = self->NewVarBase(place, blocking);
+             if (!blocking) {
+               IncreaseVarbaseReferenceCountUntilCopyComplete(self, place);
+             }
+             return new_var;
+           },
+           py::return_value_policy::copy)
       .def("_copy_to",
           [](const std::shared_ptr<imperative::VarBase> &self,
              const platform::Place &place, bool blocking) {
```
```diff
@@ -1578,6 +1598,11 @@ void BindImperative(py::module *m_ptr) {
             self.SetExpectedPlace(*p);
             VLOG(4) << "Tracer(" << &self << ")"
                     << " set expected place " << *p;
+          } else if (py::isinstance<platform::NPUPlace>(obj)) {
+            auto p = obj.cast<platform::NPUPlace *>();
+            self.SetExpectedPlace(*p);
+            VLOG(4) << "Tracer(" << &self << ")"
+                    << " set expected place " << *p;
           } else if (py::isinstance<platform::Place>(obj)) {
             auto p = obj.cast<platform::Place *>();
             self.SetExpectedPlace(*p);
```
```diff
@@ -1586,7 +1611,7 @@ void BindImperative(py::module *m_ptr) {
           } else {
             PADDLE_THROW(platform::errors::InvalidArgument(
                 "Incompatible Place Type: supports XPUPlace, CUDAPlace, "
-                "CPUPlace, "
+                "CPUPlace, NPUPlace, "
                 "and CUDAPinnedPlace, "
                 "but got Unknown Type!"));
           }
```
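On the Python side, this setter is what the dygraph guard goes through, so an NPU place can now be installed as the tracer's expected place:

```python
# The dygraph guard routes through the expected-place setter above;
# core.NPUPlace is now a valid value (assumes an NPU build).
import paddle
import paddle.fluid.core as core

if core.is_compiled_with_npu():
    with paddle.fluid.dygraph.guard(core.NPUPlace(0)):
        print(paddle.fluid.framework._current_expected_place())  # NPUPlace(0)
```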
```diff
@@ -1647,6 +1672,19 @@ void BindImperative(py::module *m_ptr) {
                    std::move(attrs), place, trace_backward);
              }
            })
+      .def("trace",
+           [](imperative::Tracer &self, const std::string &type,
+              const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs,
+              framework::AttributeMap attrs, const platform::NPUPlace &place,
+              bool trace_backward) {
+             auto ins_map = ConvertToNameVarBaseMap(ins);
+             auto outs_map = ConvertToNameVarBaseMap(outs);
+             {
+               py::gil_scoped_release release;
+               self.TraceOp(type, std::move(ins_map), std::move(outs_map),
+                            std::move(attrs), place, trace_backward);
+             }
+           })
       .def("trace",
            [](imperative::Tracer &self, const std::string &type,
               const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs,
```
```diff
@@ -1704,6 +1742,7 @@ void BindImperative(py::module *m_ptr) {
   m.def("varbase_copy", &VarBaseCopy<platform::CUDAPlace>);
   m.def("varbase_copy", &VarBaseCopy<platform::XPUPlace>);
   m.def("varbase_copy", &VarBaseCopy<platform::CUDAPinnedPlace>);
+  m.def("varbase_copy", &VarBaseCopy<platform::NPUPlace>);
   m.def(
       "dygraph_partial_grad",
```
```diff
@@ -1804,6 +1843,12 @@ void BindImperative(py::module *m_ptr) {
               const py::args args, const py::kwargs kwargs) {
             return imperative::PyLayerApply(place, cls, args, kwargs);
           });
+  m.def("pylayer_apply",
+        [](const platform::NPUPlace &place, const py::object &cls,
+           const py::args args, const py::kwargs kwargs) {
+          return imperative::PyLayerApply(place, cls, args, kwargs);
+        });
 }
 }  // namespace pybind
```
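The extra `pylayer_apply` overload lets user-defined PyLayers run when the current place is an NPU place. A hedged sketch, assuming the `paddle.autograd.PyLayer` API of this release line:

```python
# Hedged sketch: a custom PyLayer under an NPU place (assumes an NPU build
# and the paddle.autograd.PyLayer API).
import paddle
import paddle.fluid.core as core
from paddle.autograd import PyLayer

class Double(PyLayer):
    @staticmethod
    def forward(ctx, x):
        return 2 * x

    @staticmethod
    def backward(ctx, grad):
        return 2 * grad

if core.is_compiled_with_npu():
    with paddle.fluid.dygraph.guard(core.NPUPlace(0)):
        x = paddle.to_tensor([1.0, 2.0], stop_gradient=False)
        y = Double.apply(x)
        y.sum().backward()
        print(x.grad)   # [2.0, 2.0]
```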
```diff
@@ -246,6 +246,9 @@ class TestVarBase(unittest.TestCase):
             _test_place("gpu_pinned")
             _test_place(core.CUDAPlace(0))
             _test_place("gpu:0")
+        if core.is_compiled_with_npu():
+            _test_place(core.NPUPlace(0))
+            _test_place("npu:0")
 
     def test_to_tensor_not_change_input_stop_gradient(self):
         with paddle.fluid.dygraph.guard(core.CPUPlace()):
```
```diff
@@ -102,11 +102,10 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True):
     place = _get_paddle_place(place)
     if place is None:
         place = _current_expected_place()
-    elif not isinstance(
-            place,
-            (core.Place, core.CPUPlace, core.CUDAPinnedPlace, core.CUDAPlace)):
+    elif not isinstance(place, (core.Place, core.CPUPlace, core.CUDAPinnedPlace,
+                                core.CUDAPlace, core.NPUPlace)):
         raise ValueError(
-            "'place' must be any of paddle.Place, paddle.CPUPlace, paddle.CUDAPinnedPlace, paddle.CUDAPlace"
+            "'place' must be any of paddle.Place, paddle.CPUPlace, paddle.CUDAPinnedPlace, paddle.CUDAPlace, paddle.NPUPlace"
         )
 
     #Todo(zhouwei): Support allocate tensor on any other specified card
```
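Grounded in the unit test above, both the place object and the "npu:0" string (parsed by `_get_paddle_place`) are now valid `place` arguments; anything else still raises the ValueError shown in the diff:

```python
# Both spellings of the NPU place are accepted by to_tensor in an NPU build;
# other objects still trip the isinstance check above.
import paddle
import paddle.fluid.core as core

if core.is_compiled_with_npu():
    a = paddle.to_tensor([1.0, 2.0], place=core.NPUPlace(0))
    b = paddle.to_tensor([1.0, 2.0], place="npu:0")   # via _get_paddle_place
    print(a.place, b.place)
```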