From 6aea6be20738cd9fb4a644f9f804d5cd795cdc9e Mon Sep 17 00:00:00 2001
From: houj04 <35131887+houj04@users.noreply.github.com>
Date: Thu, 24 Jun 2021 17:21:43 +0800
Subject: [PATCH] [NPU] support dygraph execution on npu place (#33579)

* in NPU environment, use CPUPlace for missing operators.
* fix TensorCopy bug and add unit test.
* fix code style.
* add more unit tests.
---
 paddle/fluid/framework/tensor_util.cc        |  2 +-
 paddle/fluid/imperative/prepared_operator.cc |  7 +++
 paddle/fluid/imperative/tracer.cc            | 11 ++++
 paddle/fluid/pybind/imperative.cc            | 51 +++++++++++++++++--
 .../fluid/tests/unittests/test_var_base.py   |  3 ++
 python/paddle/tensor/creation.py             |  7 ++-
 6 files changed, 73 insertions(+), 8 deletions(-)

diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc
index d8f6df3e0b..7cd62e3e2a 100644
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -278,7 +278,7 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
                 Tensor* dst) {
   platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
   const platform::DeviceContext* dev_ctx;
-  if (platform::is_gpu_place(dst_place)) {
+  if (platform::is_gpu_place(dst_place) || platform::is_npu_place(dst_place)) {
     dev_ctx = pool.Get(dst_place);
   } else {
     dev_ctx = pool.Get(src.place());
diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc
index 6bdb042ebd..4ee3ed6e52 100644
--- a/paddle/fluid/imperative/prepared_operator.cc
+++ b/paddle/fluid/imperative/prepared_operator.cc
@@ -131,6 +131,13 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
     expected_kernel_key.place_ = platform::CPUPlace();
     kernel_iter = kernels.find(expected_kernel_key);
   }
+#endif
+#ifdef PADDLE_WITH_ASCEND_CL
+  if (kernel_iter == kernels.end() &&
+      is_npu_place(expected_kernel_key.place_)) {
+    expected_kernel_key.place_ = platform::CPUPlace();
+    kernel_iter = kernels.find(expected_kernel_key);
+  }
 #endif
   // TODO(jiabin): Add operator.cc's line 1000 part back when we need that case
   PADDLE_ENFORCE_NE(kernel_iter, kernels.end(),
diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc
index 367f948ef6..a8ca788d3b 100644
--- a/paddle/fluid/imperative/tracer.cc
+++ b/paddle/fluid/imperative/tracer.cc
@@ -120,6 +120,17 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists(
     gc.reset(new framework::CPUGarbageCollector(
         BOOST_GET_CONST(platform::CPUPlace, place), 0));
     VLOG(10) << "Created GarbageCollector at " << place;
+  } else if (platform::is_npu_place(place)) {
+#if defined(PADDLE_WITH_ASCEND_CL)
+    // TODO(zhiqiu): fix bugs and enable NPUDefaultStreamGarbageCollector.
+    gc.reset(new framework::NPUUnsafeFastGarbageCollector(
+        BOOST_GET_CONST(platform::NPUPlace, place), 0));
+    VLOG(10) << "Created GarbageCollector at " << place;
+#else
+    PADDLE_THROW(platform::errors::PermissionDenied(
+        "Paddle can't use NPU device since it's not compiled with NPU. "
+        "Please recompile or reinstall Paddle with NPU support."));
+#endif
   } else {
     PADDLE_THROW(platform::errors::PreconditionNotMet(
         "Unsupported place for garbage collection"));
diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc
index 816281ce8a..af7f03dc19 100644
--- a/paddle/fluid/pybind/imperative.cc
+++ b/paddle/fluid/pybind/imperative.cc
@@ -135,12 +135,14 @@ static const platform::Place PyObjectToPlace(const py::object &place_obj) {
     return place_obj.cast<platform::CUDAPlace>();
   } else if (py::isinstance<platform::CUDAPinnedPlace>(place_obj)) {
     return place_obj.cast<platform::CUDAPinnedPlace>();
+  } else if (py::isinstance<platform::NPUPlace>(place_obj)) {
+    return place_obj.cast<platform::NPUPlace>();
   } else if (py::isinstance<platform::Place>(place_obj)) {
     return place_obj.cast<platform::Place>();
   } else {
     PADDLE_THROW(platform::errors::InvalidArgument(
         "Place should be one of "
-        "Place/CPUPlace/XPUPlace/CUDAPlace/CUDAPinnedPlace"));
+        "Place/CPUPlace/XPUPlace/CUDAPlace/CUDAPinnedPlace/NPUPlace"));
   }
 }
 
@@ -172,9 +174,13 @@ static void InitTensorForVarBase(imperative::VarBase *self,
     SetTensorFromPyArray<platform::CUDAPinnedPlace>(
         tensor, array, BOOST_GET_CONST(platform::CUDAPinnedPlace, place),
         zero_copy);
+  } else if (platform::is_npu_place(place)) {
+    SetTensorFromPyArray<platform::NPUPlace>(
+        tensor, array, BOOST_GET_CONST(platform::NPUPlace, place), zero_copy);
   } else {
     PADDLE_THROW(platform::errors::InvalidArgument(
-        "Place should be one of CPUPlace/XPUPlace/CUDAPlace/CUDAPinnedPlace"));
+        "Place should be one of "
+        "CPUPlace/XPUPlace/CUDAPlace/CUDAPinnedPlace/NPUPlace"));
   }
   if (stop_gradient != -1) {
     self->SetOverridedStopGradient(stop_gradient);
@@ -718,6 +724,10 @@ void BindImperative(py::module *m_ptr) {
            py::arg("value"), py::arg("place"), py::arg("persistable") = false,
            py::arg("zero_copy") = false, py::arg("name") = "",
            py::arg("stop_gradient") = -1)
+      .def("__init__", &InitVarBaseFromNumpyWithArg<platform::NPUPlace>,
+           py::arg("value"), py::arg("place"), py::arg("persistable") = false,
+           py::arg("zero_copy") = false, py::arg("name") = "",
+           py::arg("stop_gradient") = -1)
       .def("__init__", &InitVarBaseFromNumpyWithArgDefault, py::arg("value"))
       .def("__init__", &InitVarBaseFromTensorWithArgDefault, py::arg("tensor"))
       .def("__init__", &InitVarBaseFromNumpyWithKwargs)
@@ -1452,6 +1462,16 @@ void BindImperative(py::module *m_ptr) {
              return new_var;
            },
            py::return_value_policy::copy)
+      .def("_copy_to",
+           [](const std::shared_ptr<imperative::VarBase> &self,
+              const platform::NPUPlace &place, bool blocking) {
+             auto new_var = self->NewVarBase(place, blocking);
+             if (!blocking) {
+               IncreaseVarbaseReferenceCountUntilCopyComplete(self, place);
+             }
+             return new_var;
+           },
+           py::return_value_policy::copy)
       .def("_copy_to",
           [](const std::shared_ptr<imperative::VarBase> &self,
              const platform::Place &place, bool blocking) {
@@ -1578,6 +1598,11 @@ void BindImperative(py::module *m_ptr) {
             self.SetExpectedPlace(*p);
             VLOG(4) << "Tracer(" << &self << ")"
                     << " set expected place " << *p;
+          } else if (py::isinstance<platform::NPUPlace>(obj)) {
+            auto p = obj.cast<platform::NPUPlace *>();
+            self.SetExpectedPlace(*p);
+            VLOG(4) << "Tracer(" << &self << ")"
+                    << " set expected place " << *p;
           } else if (py::isinstance<platform::CUDAPinnedPlace>(obj)) {
             auto p = obj.cast<platform::CUDAPinnedPlace *>();
             self.SetExpectedPlace(*p);
@@ -1586,7 +1611,7 @@ void BindImperative(py::module *m_ptr) {
           } else {
             PADDLE_THROW(platform::errors::InvalidArgument(
                 "Incompatible Place Type: supports XPUPlace, CUDAPlace, "
-                "CPUPlace, "
+                "CPUPlace, NPUPlace, "
                 "and CUDAPinnedPlace, "
                 "but got Unknown Type!"));
           }
@@ -1647,6 +1672,19 @@ void BindImperative(py::module *m_ptr) {
                          std::move(attrs), place, trace_backward);
            }
          })
+      .def("trace",
+           [](imperative::Tracer &self, const std::string &type,
+              const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs,
+              framework::AttributeMap attrs, const platform::NPUPlace &place,
+              bool trace_backward) {
+             auto ins_map = ConvertToNameVarBaseMap(ins);
+             auto outs_map = ConvertToNameVarBaseMap(outs);
+             {
+               py::gil_scoped_release release;
+               self.TraceOp(type, std::move(ins_map), std::move(outs_map),
+                            std::move(attrs), place, trace_backward);
+             }
+           })
       .def("trace",
           [](imperative::Tracer &self, const std::string &type,
              const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs,
@@ -1704,6 +1742,7 @@ void BindImperative(py::module *m_ptr) {
   m.def("varbase_copy", &VarBaseCopy<platform::XPUPlace>);
   m.def("varbase_copy", &VarBaseCopy<platform::CUDAPlace>);
   m.def("varbase_copy", &VarBaseCopy<platform::CUDAPinnedPlace>);
+  m.def("varbase_copy", &VarBaseCopy<platform::NPUPlace>);
 
   m.def(
       "dygraph_partial_grad",
@@ -1804,6 +1843,12 @@ void BindImperative(py::module *m_ptr) {
            const py::args args, const py::kwargs kwargs) {
          return imperative::PyLayerApply(place, cls, args, kwargs);
        });
+
+  m.def("pylayer_apply",
+        [](const platform::NPUPlace &place, const py::object &cls,
+           const py::args args, const py::kwargs kwargs) {
+          return imperative::PyLayerApply(place, cls, args, kwargs);
+        });
 }
 
 }  // namespace pybind
diff --git a/python/paddle/fluid/tests/unittests/test_var_base.py b/python/paddle/fluid/tests/unittests/test_var_base.py
index 98bc79fc7c..644e46f108 100644
--- a/python/paddle/fluid/tests/unittests/test_var_base.py
+++ b/python/paddle/fluid/tests/unittests/test_var_base.py
@@ -246,6 +246,9 @@ class TestVarBase(unittest.TestCase):
             _test_place("gpu_pinned")
             _test_place(core.CUDAPlace(0))
             _test_place("gpu:0")
+        if core.is_compiled_with_npu():
+            _test_place(core.NPUPlace(0))
+            _test_place("npu:0")
 
     def test_to_tensor_not_change_input_stop_gradient(self):
         with paddle.fluid.dygraph.guard(core.CPUPlace()):
diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py
index b7c55ea424..734159422f 100644
--- a/python/paddle/tensor/creation.py
+++ b/python/paddle/tensor/creation.py
@@ -102,11 +102,10 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True):
     place = _get_paddle_place(place)
     if place is None:
         place = _current_expected_place()
-    elif not isinstance(
-            place,
-            (core.Place, core.CPUPlace, core.CUDAPinnedPlace, core.CUDAPlace)):
+    elif not isinstance(place, (core.Place, core.CPUPlace, core.CUDAPinnedPlace,
+                                core.CUDAPlace, core.NPUPlace)):
         raise ValueError(
-            "'place' must be any of paddle.Place, paddle.CPUPlace, paddle.CUDAPinnedPlace, paddle.CUDAPlace"
+            "'place' must be any of paddle.Place, paddle.CPUPlace, paddle.CUDAPinnedPlace, paddle.CUDAPlace, paddle.NPUPlace"
         )
 
     #Todo(zhouwei): Support allocate tensor on any other specified card
--
GitLab
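
What the patch enables, as a minimal dygraph usage sketch. It mirrors the new
_test_place cases in test_var_base.py and is not part of the patch itself; it
assumes a Paddle build with PADDLE_WITH_ASCEND_CL (so core.is_compiled_with_npu()
returns True) and an NPU device at index 0. The elementwise ops are illustrative:
when an op has an NPU kernel it runs on the device, and when it does not, the
PrepareImpl change above retries the kernel lookup with CPUPlace.

    import paddle
    import paddle.fluid.core as core

    # Same guard as the new unit test: only meaningful on an NPU build.
    if core.is_compiled_with_npu():
        # to_tensor now accepts core.NPUPlace and "npu:0" place strings
        # (the creation.py change above).
        x = paddle.to_tensor([1.0, 2.0, 3.0], place=core.NPUPlace(0))
        y = paddle.to_tensor([4.0, 5.0, 6.0], place="npu:0")

        # Traced ops execute on the NPU when a kernel exists; otherwise
        # they fall back to the CPU kernel via the PrepareImpl change.
        z = x + y
        print(z.place)    # NPUPlace(0)
        print(z.numpy())  # [5. 7. 9.]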
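
The new NPUPlace overload of VarBase._copy_to can also be exercised directly.
The sketch below is under the same build assumptions and uses the private
_copy_to(place, blocking) binding added in pybind/imperative.cc; it is an
illustration, not part of the patch. With blocking=False the call returns
before the device copy finishes, which is why the binding pins the source
variable via IncreaseVarbaseReferenceCountUntilCopyComplete until the
asynchronous copy completes.

    import paddle
    import paddle.fluid.core as core

    if core.is_compiled_with_npu():
        x = paddle.to_tensor([1.0, 2.0, 3.0])  # on the current expected place

        # blocking=True waits for the CPU -> NPU copy (TensorCopy now picks
        # the NPU device context for an NPU destination, see tensor_util.cc).
        x_npu = x._copy_to(core.NPUPlace(0), True)
        print(x_npu.place)  # NPUPlace(0)

        # blocking=False returns immediately; the binding keeps `x` alive
        # until the asynchronous copy completes.
        x_npu_async = x._copy_to(core.NPUPlace(0), False)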