From 74259bac1eb49c76e8fd638a755a960e1cfb9159 Mon Sep 17 00:00:00 2001
From: ronnywang
Date: Fri, 24 Jun 2022 02:09:17 +0800
Subject: [PATCH] fix npu plugin hang in backward in eager mode (#43614)

---
 paddle/fluid/eager/grad_tensor_holder.cc | 26 ++++++++++++++++--------
 paddle/fluid/pybind/eager_method.cc      | 26 ++++++++++++++++++++++++
 paddle/phi/api/include/tensor.h          |  8 ++++++++
 paddle/phi/api/lib/tensor.cc             |  4 ++++
 4 files changed, 55 insertions(+), 9 deletions(-)

diff --git a/paddle/fluid/eager/grad_tensor_holder.cc b/paddle/fluid/eager/grad_tensor_holder.cc
index ee5dd622412..542567b35c7 100644
--- a/paddle/fluid/eager/grad_tensor_holder.cc
+++ b/paddle/fluid/eager/grad_tensor_holder.cc
@@ -29,7 +29,9 @@ void GradTensorHolder::SetBufferSlotRankZeros(size_t slot_id, size_t rank) {
 }
 
 void GradTensorHolder::CopyValueFromTensor(
-    size_t slot_id, size_t rank, const paddle::experimental::Tensor& t,
+    size_t slot_id,
+    size_t rank,
+    const paddle::experimental::Tensor& t,
     bool fill_one) {
   // TODO(jiabin): We need to deal with empty input_buffer with slot size not
   // empty;
@@ -49,7 +51,9 @@ void GradTensorHolder::CopyValueFromTensor(
       paddle::platform::errors::Fatal(
           "Invalid rank for GradTensorHolder::add() which exceeds size "
           "of buffer slot %d, got slot size is: %d rank is: %d",
-          slot_id, buffer_[slot_id].size(), rank));
+          slot_id,
+          buffer_[slot_id].size(),
+          rank));
   if (!fill_one) {
     paddle::experimental::Tensor& buffer_tensor = buffer_[slot_id][rank];
     if ((!buffer_tensor.defined() || !buffer_tensor.initialized())) {
@@ -83,7 +87,8 @@ void GradTensorHolder::CopyValueFromTensor(
   }
 }
 
-void GradTensorHolder::add(size_t slot_id, size_t rank,
+void GradTensorHolder::add(size_t slot_id,
+                           size_t rank,
                            const paddle::experimental::Tensor& t,
                            bool create_graph) {
   PADDLE_ENFORCE(slot_id < buffer_.size(),
@@ -102,7 +107,9 @@ void GradTensorHolder::add(size_t slot_id, size_t rank,
       paddle::platform::errors::Fatal(
           "Invalid rank for GradTensorHolder::add() which exceeds size "
           "of buffer slot %d, got slot size is: %d rank is: %d",
-          slot_id, buffer_[slot_id].size(), rank));
+          slot_id,
+          buffer_[slot_id].size(),
+          rank));
 
   paddle::experimental::Tensor& buffer_tensor = buffer_[slot_id][rank];
   // TODO(jiabin): Code bellow is ugly to divide which inner var we used,
@@ -115,7 +122,8 @@ void GradTensorHolder::add(size_t slot_id, size_t rank,
     buffer_tensor = t;
   } else {
     // Accumulation
-    PADDLE_ENFORCE_EQ(t.initialized(), true,
+    PADDLE_ENFORCE_EQ(t.initialized(),
+                      true,
                       paddle::platform::errors::Fatal(
                           "We can only accumulate initialized tensor, but we "
                           "got tensor: %s is empty please check you network "
@@ -124,7 +132,7 @@ void GradTensorHolder::add(size_t slot_id, size_t rank,
 
     if (t.is_dense_tensor()) {
       if (buffer_tensor.is_dense_tensor()) {
-        if (create_graph) {
+        if (create_graph || t.is_custom_device()) {
           buffer_tensor = add_final_state_dygraph_function(t, buffer_tensor);
         } else {
           paddle::imperative::TensorAdd<paddle::experimental::Tensor>(
@@ -136,8 +144,8 @@ void GradTensorHolder::add(size_t slot_id, size_t rank,
         // add_dygraph_function once it's supported
         paddle::experimental::Tensor new_buffer(
             std::make_shared<phi::DenseTensor>(), "tmp_accumulator");
-        paddle::imperative::SelectedRowsAddTensor(buffer_tensor, t,
-                                                  &new_buffer);
+        paddle::imperative::SelectedRowsAddTensor(
+            buffer_tensor, t, &new_buffer);
         buffer_tensor.set_impl(new_buffer.impl());
       }
     } else if (t.is_sparse_coo_tensor()) {
@@ -151,7 +159,7 @@ void GradTensorHolder::add(size_t slot_id, size_t rank,
       paddle::experimental::Tensor buffer_values(
           std::make_shared<phi::DenseTensor>(
               buffer_sparse->non_zero_elements()));
-      if (create_graph) {
+      if (create_graph || t.is_custom_device()) {
         buffer_values =
             add_final_state_dygraph_function(t_values, buffer_values);
       } else {
diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc
index 9c7ad28dac8..77e19629114 100644
--- a/paddle/fluid/pybind/eager_method.cc
+++ b/paddle/fluid/pybind/eager_method.cc
@@ -292,6 +292,32 @@ static PyObject* tensor_method_numpy(TensorObject* self,
               dense_tensor->numel(),
           kind);
     }
+#endif
+#ifdef PADDLE_WITH_CUSTOM_DEVICE
+  } else if (self->tensor.is_custom_device()) {
+    if (self->tensor.is_selected_rows()) {
+      VLOG(6) << "Getting SelectedRows's numpy value";
+      auto* selected_rows =
+          static_cast<phi::SelectedRows*>(self->tensor.impl().get());
+      auto* dense_tensor = static_cast<phi::DenseTensor*>(
+          selected_rows->mutable_value());
+      phi::DeviceManager::GetDeviceWithPlace(self->tensor.place())
+          ->MemoryCopyD2H(
+              pybind11::detail::array_proxy(array)->data,
+              dense_tensor->data(),
+              paddle::framework::DataTypeSize(dense_tensor->dtype()) *
+                  dense_tensor->numel());
+    } else {
+      VLOG(6) << "Getting DenseTensor's numpy value";
+      auto dense_tensor =
+          std::dynamic_pointer_cast<phi::DenseTensor>(self->tensor.impl());
+      phi::DeviceManager::GetDeviceWithPlace(self->tensor.place())
+          ->MemoryCopyD2H(
+              pybind11::detail::array_proxy(array)->data,
+              dense_tensor->data(),
+              paddle::framework::DataTypeSize(dense_tensor->dtype()) *
+                  dense_tensor->numel());
+    }
 #endif
   } else {
     PADDLE_THROW(platform::errors::InvalidArgument(
diff --git a/paddle/phi/api/include/tensor.h b/paddle/phi/api/include/tensor.h
index 2b0aea9e1ec..667ef281b99 100644
--- a/paddle/phi/api/include/tensor.h
+++ b/paddle/phi/api/include/tensor.h
@@ -286,6 +286,14 @@ class PADDLE_API Tensor final {
    */
   bool is_gpu_pinned() const;
 
+  /**
+   * @brief Determine whether the tensor device is CustomDevice
+   *
+   * @return true
+   * @return false
+   */
+  bool is_custom_device() const;
+
   /* Part 4: Data Access methods */
 
   /**
diff --git a/paddle/phi/api/lib/tensor.cc b/paddle/phi/api/lib/tensor.cc
index 74364d5ab03..cce90ea1e8e 100644
--- a/paddle/phi/api/lib/tensor.cc
+++ b/paddle/phi/api/lib/tensor.cc
@@ -177,6 +177,10 @@ bool Tensor::is_gpu_pinned() const {
   return paddle::platform::is_cuda_pinned_place(place());
 }
 
+bool Tensor::is_custom_device() const {
+  return paddle::platform::is_custom_place(place());
+}
+
 /* Part 4: Data Access methods */
 
 template <typename T>
--
GitLab
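
Notes (placed after the mail signature, so the patch above still applies cleanly with git am):

The heart of the fix is the accumulation dispatch in GradTensorHolder::add. paddle::imperative::TensorAdd accumulates into the buffer tensor in place, and per the patch title that path hung during eager-mode backward under an NPU plugin (a CustomDevice backend); the patch therefore routes custom-device tensors through add_final_state_dygraph_function, the out-of-place path already taken when create_graph is true. Below is a minimal standalone model of that dispatch, not Paddle code: Tensor, dygraph_add, and tensor_add_inplace are hypothetical stand-ins, and only the control flow mirrors the patch.

#include <iostream>

// Hypothetical stand-in for paddle::experimental::Tensor.
struct Tensor {
  bool custom_device = false;
  double value = 0.0;
  bool is_custom_device() const { return custom_device; }
};

// Stand-in for add_final_state_dygraph_function: out-of-place add that
// returns a fresh result tensor (in Paddle it also records the graph).
Tensor dygraph_add(const Tensor& a, const Tensor& b) {
  return Tensor{a.custom_device, a.value + b.value};
}

// Stand-in for paddle::imperative::TensorAdd: in-place accumulation.
void tensor_add_inplace(const Tensor& t, Tensor* buffer) {
  buffer->value += t.value;
}

void accumulate(const Tensor& t, Tensor* buffer, bool create_graph) {
  // After the patch, custom-device tensors also take the out-of-place
  // dygraph path, avoiding the in-place kernel that hung on the plugin.
  if (create_graph || t.is_custom_device()) {
    *buffer = dygraph_add(t, *buffer);
  } else {
    tensor_add_inplace(t, buffer);
  }
}

int main() {
  Tensor grad{/*custom_device=*/true, 1.5};
  Tensor buffer{/*custom_device=*/true, 2.0};
  accumulate(grad, &buffer, /*create_graph=*/false);
  std::cout << buffer.value << "\n";  // prints 3.5, via the dygraph path
  return 0;
}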
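
The eager_method.cc hunk gives tensor.numpy() a host-copy path for custom devices: it looks up the device behind the tensor's place via phi::DeviceManager::GetDeviceWithPlace and issues a MemoryCopyD2H of DataTypeSize(dtype) * numel bytes straight into the numpy buffer, for both the SelectedRows and the DenseTensor case. Below is a standalone model of that device-to-host copy pattern; the Device struct is a hypothetical stand-in for the phi::DeviceManager handle, with host memory playing the role of device memory.

#include <cstddef>
#include <cstdio>
#include <cstring>
#include <vector>

// Hypothetical stand-in for the handle returned by
// phi::DeviceManager::GetDeviceWithPlace(place).
struct Device {
  // Copy `size` bytes from device memory to host memory; a real plugin
  // implements this against its runtime (here: plain memcpy).
  void MemoryCopyD2H(void* dst, const void* src, std::size_t size) const {
    std::memcpy(dst, src, size);
  }
};

int main() {
  std::vector<float> device_buf{1.0f, 2.0f, 3.0f};  // pretend device tensor
  std::vector<float> host_buf(device_buf.size());   // numpy-array storage
  Device dev;
  // Mirrors the patch: bytes = DataTypeSize(dtype) * numel.
  dev.MemoryCopyD2H(host_buf.data(),
                    device_buf.data(),
                    sizeof(float) * device_buf.size());
  std::printf("%g %g %g\n", host_buf[0], host_buf[1], host_buf[2]);
  return 0;
}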
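
Both call sites rest on the small predicate the tensor.h/tensor.cc hunks add to the phi tensor API: Tensor::is_custom_device() simply reports whether the tensor's place is a plugin (custom) place, mirroring is_gpu_pinned() directly above it. A hedged usage sketch follows; it compiles only inside a Paddle build with custom-device support, and takes_dygraph_add_path is a hypothetical helper, not a Paddle API.

#include "paddle/phi/api/include/tensor.h"

// Hypothetical helper: true for tensors on plugin backends (for example
// an NPU registered as a CustomDevice), which after this patch take the
// out-of-place dygraph add path during eager backward.
bool takes_dygraph_add_path(const paddle::experimental::Tensor& t,
                            bool create_graph) {
  return create_graph || t.is_custom_device();
}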