From 32143f44a2abe9ebd1b7370a9df9c686cd109c9b Mon Sep 17 00:00:00 2001
From: Qi Li
Date: Mon, 28 Nov 2022 13:26:54 +0800
Subject: [PATCH] [NPU] apply npu_identity to conv bn and copy2cpu,
 test=develop (#48039)

* [NPU] apply npu_identity to conv bn and copy2cpu, test=develop

* update npu identity to share data with x, test=develop

* address review comments, test=develop
---
 paddle/fluid/pybind/eager_method.cc           |  9 +++++
 paddle/fluid/pybind/tensor_py.h               | 14 +++++++
 paddle/phi/api/ext/tensor_compat.h            |  1 +
 paddle/phi/kernels/npu_identity_kernel.cc     |  6 +--
 .../fluid/dygraph/varbase_patch_methods.py    |  7 +++-
 .../tests/unittests/test_npu_identity_op.py   |  8 ++--
 python/paddle/incubate/tensor/manipulation.py |  2 +-
 python/paddle/nn/functional/conv.py           | 37 ++++++++++++++++---
 python/paddle/nn/layer/norm.py                | 21 +++++++++++
 9 files changed, 89 insertions(+), 16 deletions(-)

diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc
index 6c91b32786..17d210cc2f 100644
--- a/paddle/fluid/pybind/eager_method.cc
+++ b/paddle/fluid/pybind/eager_method.cc
@@ -261,6 +261,15 @@ static PyObject* tensor_method_numpy(TensorObject* self,
     VLOG(6) << "Getting DenseTensor's numpy value";
     auto dense_tensor =
         std::dynamic_pointer_cast<phi::DenseTensor>(self->tensor.impl());
+    // TODO(qili93): temporary for ascend npu performance, to be removed
+    // along with npu_identity op
+    paddle::experimental::Tensor temp_tensor(
+        std::make_shared<phi::DenseTensor>());
+    if (dense_tensor->storage_properties_initialized()) {
+      temp_tensor = npu_identity_ad_func(self->tensor, -1);
+      dense_tensor =
+          std::dynamic_pointer_cast<phi::DenseTensor>(temp_tensor.impl());
+    }
     phi::DeviceManager::GetDeviceWithPlace(self->tensor.place())
         ->MemoryCopyD2H(
             pybind11::detail::array_proxy(array)->data,
diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h
index 811ff2de64..f0c038226f 100644
--- a/paddle/fluid/pybind/tensor_py.h
+++ b/paddle/fluid/pybind/tensor_py.h
@@ -34,6 +34,7 @@ limitations under the License. */
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #endif
+#include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h"
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/platform/device_context.h"
@@ -1168,6 +1169,19 @@ inline py::array TensorToPyArray(const phi::DenseTensor &tensor,
           "PyArray does not own data, in which case memory leak "
           "or double free would occur"));
 
+  // TODO(qili93): temporary for ascend npu performance, to be removed along
+  // with npu_identity op
+  paddle::experimental::Tensor tensor_out(
+      std::make_shared<phi::DenseTensor>());
+  if (tensor.storage_properties_initialized()) {
+    paddle::experimental::Tensor tensor_in(
+        std::make_shared<phi::DenseTensor>(tensor));
+    tensor_out = npu_identity_ad_func(tensor_in, -1);
+    auto dense_tensor =
+        std::dynamic_pointer_cast<phi::DenseTensor>(tensor_out.impl());
+    tensor_buf_ptr = dense_tensor->data();
+  }
+
   size_t copy_bytes = sizeof_dtype * numel;
   platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
   auto &ctx = *pool.Get(tensor.place());
diff --git a/paddle/phi/api/ext/tensor_compat.h b/paddle/phi/api/ext/tensor_compat.h
index 2833629f0f..7233744c65 100644
--- a/paddle/phi/api/ext/tensor_compat.h
+++ b/paddle/phi/api/ext/tensor_compat.h
@@ -100,6 +100,7 @@ using experimental::multinomial;
 using experimental::multiply;
 using experimental::mv;
 using experimental::nll_loss;
+using experimental::npu_identity;
 using experimental::one_hot;
 using experimental::ones;
 using experimental::pixel_shuffle;
diff --git a/paddle/phi/kernels/npu_identity_kernel.cc b/paddle/phi/kernels/npu_identity_kernel.cc
index 0c1af9bb40..c9fb7a9728 100644
--- a/paddle/phi/kernels/npu_identity_kernel.cc
+++ b/paddle/phi/kernels/npu_identity_kernel.cc
@@ -24,10 +24,8 @@ void NPUIdentityKernel(const Context& dev_ctx,
                        const DenseTensor& x,
                        const int format,
                        DenseTensor* out) {
-  VLOG(4) << "npu_identity op is only for NPU, CPU or GPU kernel just empty "
-             "tensor with shape: "
-          << out->dims() << ", please avoid using this kernel!";
-  *out = phi::EmptyLike<T, Context>(dev_ctx, *out);
+  VLOG(4) << "npu_identity op is only for NPU, please avoid using this kernel!";
+  out->ShareDataWith(x);
 }
 
 }  // namespace phi
diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py
index 6fa46692c7..ee57dc8cc2 100644
--- a/python/paddle/fluid/dygraph/varbase_patch_methods.py
+++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py
@@ -40,6 +40,7 @@ import paddle.utils.deprecated as deprecated
 import paddle.profiler as profiler
 from paddle.profiler.utils import in_profiler_mode
 from paddle import _C_ops, _legacy_C_ops
+from paddle.device import get_all_custom_device_type
 
 _grad_scalar = None
 
@@ -376,7 +377,11 @@ def monkey_patch_varbase():
             if self._grad_ivar() is None:
                 return None
 
-            new_ivar = self._grad_ivar()._copy_to(core.CPUPlace(), True)
+            new_ivar = self._grad_ivar()
+            # TODO(qili93): temporary for ascend npu performance, to be removed along with npu_identity op
+            if 'npu' in get_all_custom_device_type():
+                new_ivar = paddle.incubate._npu_identity(x=new_ivar, format=-1)
+            new_ivar = new_ivar._copy_to(core.CPUPlace(), True)
             if self._grad_ivar().type == core.VarDesc.VarType.SELECTED_ROWS:
                 return (
                     np.array(new_ivar.value().get_selected_rows().get_tensor()),
diff --git a/python/paddle/fluid/tests/unittests/test_npu_identity_op.py b/python/paddle/fluid/tests/unittests/test_npu_identity_op.py
index 183a86a8ce..b79811dabe 100644
--- a/python/paddle/fluid/tests/unittests/test_npu_identity_op.py
+++ b/python/paddle/fluid/tests/unittests/test_npu_identity_op.py
@@ -41,15 +41,15 @@ class TestNPUIdentityOp(unittest.TestCase):
             main_program, feed={x_data.name: self.x}, fetch_list=[output]
         )
 
-        np.testing.assert_allclose(result[0].shape, self.shape, rtol=1e-08)
+        np.testing.assert_allclose(result[0], self.x, rtol=1e-08)
 
     def test_api_dygraph(self):
         paddle.disable_static(self.place)
-        x_tensor = paddle.to_tensor(self.x)
-        out = paddle.incubate._npu_identity(x_tensor, self.format)
+        x = paddle.to_tensor(self.x)
+        out = paddle.incubate._npu_identity(x, self.format)
 
-        np.testing.assert_allclose(out.shape, self.shape, rtol=1e-08)
+        np.testing.assert_allclose(out.numpy(), self.x, rtol=1e-08)
 
         paddle.enable_static()
diff --git a/python/paddle/incubate/tensor/manipulation.py b/python/paddle/incubate/tensor/manipulation.py
index 0722c94aa1..b5f1681b9e 100644
--- a/python/paddle/incubate/tensor/manipulation.py
+++ b/python/paddle/incubate/tensor/manipulation.py
@@ -52,7 +52,7 @@ def _npu_identity(x, format=-1):
         return _C_ops.npu_identity(x, format)
 
     if _in_legacy_dygraph():
-        return _legacy_C_ops.npu_identity(x, format)
+        return _legacy_C_ops.npu_identity(x, 'format', format)
 
     check_variable_and_dtype(
         x,
diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py
index 1d5d5df458..58f0254f09 100644
--- a/python/paddle/nn/functional/conv.py
+++ b/python/paddle/nn/functional/conv.py
@@ -24,11 +24,13 @@ from ...fluid.data_feeder import check_variable_and_dtype, check_dtype
 from ...fluid.layer_helper import LayerHelper
 from ...tensor.manipulation import unsqueeze, squeeze
 from ...fluid.layers import nn
+from ...framework import no_grad
 from paddle import _C_ops, _legacy_C_ops
 from paddle import get_flags
 from paddle import in_dynamic_mode
 from paddle.device import is_compiled_with_cuda
 from paddle.device import is_compiled_with_npu
+from paddle.device import get_all_custom_device_type
 from paddle import in_dynamic_mode
 from paddle import get_flags
 from paddle.device import is_compiled_with_rocm
@@ -150,15 +152,20 @@ def _conv_nd(
         if isinstance(bias, tuple):
             bias = bias[0]
         if len(bias.shape) < len(x.shape):
-            tmp_bias = _C_ops.reshape(
+            bias = _C_ops.reshape(
                 bias,
                 [1 for i in range(channel_dim)]
                 + bias.shape
                 + [1 for i in range(len(x.shape) - channel_dim - 1)],
             )
-            return _C_ops.add(pre_bias, tmp_bias)
-        else:
-            return _C_ops.add(pre_bias, bias)
+        # TODO(qili93): temporary for ascend npu performance, to be removed along with npu_identity op
+        if 'npu' in get_all_custom_device_type():
+            with no_grad():
+                bias_storage = _C_ops.npu_identity(
+                    bias, 3
+                )  # ACL_FORMAT_NC1HWC0 = 3
+                bias_storage._share_underline_tensor_to(bias)
+        return _C_ops.add(pre_bias, bias)
     else:
         return pre_bias
@@ -747,8 +754,26 @@ def conv2d(
             data_format,
         )
         if bias is not None:
-            out = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
-            return out
+            channel_dim = (
+                channel_dim + len(x.shape)
+                if channel_dim < 0
+                else channel_dim
+            )
+            if len(bias.shape) < len(x.shape):
+                bias = _C_ops.reshape(
+                    bias,
+                    [1 for i in range(channel_dim)]
+                    + bias.shape
+                    + [1 for i in range(len(x.shape) - channel_dim - 1)],
+                )
+            # TODO(qili93): temporary for ascend npu performance, to be removed along with npu_identity op
+            if 'npu' in get_all_custom_device_type():
+                with no_grad():
+                    bias_storage = _C_ops.npu_identity(
+                        bias, 3
+                    )  # ACL_FORMAT_NC1HWC0 = 3
+                    bias_storage._share_underline_tensor_to(bias)
+            return _C_ops.add(pre_bias, bias)
         else:
             return pre_bias
diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py
index cd28479b5d..2395f2ed54 100644
--- a/python/paddle/nn/layer/norm.py
+++ b/python/paddle/nn/layer/norm.py
@@ -46,6 +46,7 @@ from .. import functional as F
 from paddle import _C_ops, _legacy_C_ops
 from .. import Layer
 from paddle import in_dynamic_mode
+from paddle.device import get_all_custom_device_type
 from paddle.fluid.framework import in_dygraph_mode, _in_legacy_dygraph
 
 __all__ = []
@@ -683,6 +684,26 @@ class _BatchNormBase(Layer):
         )
         self._variance.stop_gradient = True
 
+        # TODO(qili93): temporary for ascend npu performance, to be removed along with npu_identity op
+        if 'npu' in get_all_custom_device_type():
+            with no_grad():
+                weight_trans = _C_ops.npu_identity(
+                    self.weight, 3
+                )  # ACL_FORMAT_NC1HWC0 = 3
+                bias_trans = _C_ops.npu_identity(
+                    self.bias, 3
+                )  # ACL_FORMAT_NC1HWC0 = 3
+                mean_trans = _C_ops.npu_identity(
+                    self._mean, 3
+                )  # ACL_FORMAT_NC1HWC0 = 3
+                var_trans = _C_ops.npu_identity(
+                    self._variance, 3
+                )  # ACL_FORMAT_NC1HWC0 = 3
+                weight_trans._share_underline_tensor_to(self.weight)
+                bias_trans._share_underline_tensor_to(self.bias)
+                mean_trans._share_underline_tensor_to(self._mean)
+                var_trans._share_underline_tensor_to(self._variance)
+
         self._data_format = data_format
         self._in_place = False
         self._momentum = momentum
-- 
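A minimal usage sketch of the pattern this patch applies (not part of the
patch itself): convert a persistent tensor's storage to the private ACL
format once with npu_identity, then share the converted storage back into
the original tensor so later ops reuse it. Shapes and variable names below
are illustrative only, and the sketch assumes a PaddlePaddle build with the
'npu' custom device plugin registered (otherwise the guarded branch is a
no-op).

    import paddle
    from paddle.device import get_all_custom_device_type

    x = paddle.randn([4, 8, 32, 32])   # NCHW input, illustrative shape
    bias = paddle.zeros([1, 8, 1, 1])  # bias already broadcast-shaped

    if 'npu' in get_all_custom_device_type():
        # One-off conversion of the bias storage to ACL_FORMAT_NC1HWC0 (= 3),
        # shared back into `bias` so the add below does not trigger a
        # per-step format transform on the NPU.
        with paddle.no_grad():
            bias_storage = paddle.incubate._npu_identity(x=bias, format=3)
            bias_storage._share_underline_tensor_to(bias)

    out = paddle.add(x, bias)

    # Device-to-host copies first route through npu_identity with format=-1
    # (back to the origin format); the eager_method.cc and tensor_py.h hunks
    # above implement the same step on the C++ side.
    host = out.numpy()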