Unverified commit 32143f44, authored by Qi Li, committed by GitHub

[NPU] apply npu_identity to conv bn and copy2cpu, test=develop (#48039)

* [NPU] apply npu_identity to conv bn and copy2cpu, test=develop

* update npu identity to share data with x, test=develop

* address review comments, test=develop
Parent 74d411e7
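For context: npu_identity takes a tensor and a target ACL format id and returns a tensor whose storage uses that format. This change uses it in two ways: with format -1 (keep/restore the default layout) right before device-to-host copies, and with format 3 (ACL_FORMAT_NC1HWC0, per the in-code comments) to pre-convert conv bias and batch-norm parameters on Ascend NPU devices. A minimal call through the public wrapper, as an illustration only (the format-3 path is only meaningful on an Ascend custom device):

    import paddle

    x = paddle.rand([2, 3, 4, 4])
    # -1 asks for the default/original layout; 3 would request NC1HWC0 on NPU.
    y = paddle.incubate._npu_identity(x, -1)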
@@ -261,6 +261,15 @@ static PyObject* tensor_method_numpy(TensorObject* self,
     VLOG(6) << "Getting DenseTensor's numpy value";
     auto dense_tensor =
         std::dynamic_pointer_cast<phi::DenseTensor>(self->tensor.impl());
+    // TODO(qili93): temporary for ascend npu performance to be removed along
+    // with npu_identity op
+    paddle::experimental::Tensor temp_tensor(
+        std::make_shared<phi::DenseTensor>());
+    if (dense_tensor->storage_properties_initialized()) {
+      temp_tensor = npu_identity_ad_func(self->tensor, -1);
+      dense_tensor =
+          std::dynamic_pointer_cast<phi::DenseTensor>(temp_tensor.impl());
+    }
     phi::DeviceManager::GetDeviceWithPlace(self->tensor.place())
         ->MemoryCopyD2H(
             pybind11::detail::array_proxy(array)->data,
......
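The hunk above only fires when the DenseTensor reports initialized storage properties, i.e. it sits in a private Ascend layout; the tensor is then routed through npu_identity with format -1 so the following MemoryCopyD2H reads a plain-layout buffer. A rough Python-level mirror of that logic, for illustration only (the real check is the C++ storage_properties_initialized(), not the device query used here):

    import paddle
    from paddle.device import get_all_custom_device_type

    def to_numpy_with_ascend_layout(t):
        # Stand-in for storage_properties_initialized(): on an Ascend custom
        # device the tensor may be stored in a private format and has to be
        # converted back before the device-to-host copy.
        if 'npu' in get_all_custom_device_type():
            t = paddle.incubate._npu_identity(x=t, format=-1)
        return t.numpy()  # D2H copy of a default-layout buffer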
@@ -34,6 +34,7 @@ limitations under the License. */
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #endif
+#include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h"
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/platform/device_context.h"
@@ -1168,6 +1169,19 @@ inline py::array TensorToPyArray(const phi::DenseTensor &tensor,
                         "PyArray does not own data, in which case memory leak "
                         "or double free would occur"));

+  // TODO(qili93): temporary for ascend npu performance to be removed along
+  // with npu_identity op
+  paddle::experimental::Tensor tensor_out(
+      std::make_shared<phi::DenseTensor>());
+  if (tensor.storage_properties_initialized()) {
+    paddle::experimental::Tensor tensor_in(
+        std::make_shared<phi::DenseTensor>(tensor));
+    tensor_out = npu_identity_ad_func(tensor_in, -1);
+    auto dense_tensor =
+        std::dynamic_pointer_cast<phi::DenseTensor>(tensor_out.impl());
+    tensor_buf_ptr = dense_tensor->data();
+  }
+
   size_t copy_bytes = sizeof_dtype * numel;
   platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
   auto &ctx = *pool.Get(tensor.place());
......
@@ -100,6 +100,7 @@ using experimental::multinomial;
 using experimental::multiply;
 using experimental::mv;
 using experimental::nll_loss;
+using experimental::npu_identity;
 using experimental::one_hot;
 using experimental::ones;
 using experimental::pixel_shuffle;
......
@@ -24,10 +24,8 @@ void NPUIdentityKernel(const Context& dev_ctx,
                        const DenseTensor& x,
                        const int format,
                        DenseTensor* out) {
-  VLOG(4) << "npu_identity op is only for NPU, CPU or GPU kernel just empty "
-             "tensor with shape: "
-          << out->dims() << ", please avoid using this kernel!";
-  *out = phi::EmptyLike<T, Context>(dev_ctx, *out);
+  VLOG(4) << "npu_identity op is only for NPU, please avoid using this kernel!";
+  out->ShareDataWith(x);
 }

 }  // namespace phi
......
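The CPU/GPU fallback used to log a warning and return an uninitialized tensor via EmptyLike, so only the output shape was meaningful; it now shares storage with x and preserves values, which is what the relaxed test assertions further below rely on. A quick sketch of the new semantics on a non-NPU device:

    import numpy as np
    import paddle

    x = paddle.to_tensor(np.arange(12, dtype='float32').reshape(3, 4))
    out = paddle.incubate._npu_identity(x, -1)
    # With ShareDataWith, the values (not just the shape) round-trip unchanged.
    np.testing.assert_allclose(out.numpy(), x.numpy(), rtol=1e-08)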
@@ -40,6 +40,7 @@ import paddle.utils.deprecated as deprecated
 import paddle.profiler as profiler
 from paddle.profiler.utils import in_profiler_mode
 from paddle import _C_ops, _legacy_C_ops
+from paddle.device import get_all_custom_device_type

 _grad_scalar = None
@@ -376,7 +377,11 @@ def monkey_patch_varbase():
         if self._grad_ivar() is None:
             return None

-        new_ivar = self._grad_ivar()._copy_to(core.CPUPlace(), True)
+        new_ivar = self._grad_ivar()
+        # TODO(qili93): temporary for ascend npu performance to be removed along with npu_identity op
+        if 'npu' in get_all_custom_device_type():
+            new_ivar = paddle.incubate._npu_identity(x=new_ivar, format=-1)
+        new_ivar = new_ivar._copy_to(core.CPUPlace(), True)
         if self._grad_ivar().type == core.VarDesc.VarType.SELECTED_ROWS:
             return (
                 np.array(new_ivar.value().get_selected_rows().get_tensor()),
......
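With this patch, reading a gradient still ends in a copy to CPU, but on an Ascend custom device the grad tensor is first converted back to a default layout via _npu_identity(format=-1). Nothing changes from user code; a hedged sketch of the usual flow (runs on CPU as well, where the extra branch is skipped):

    import paddle

    linear = paddle.nn.Linear(4, 4)
    out = linear(paddle.rand([2, 4]))
    out.sum().backward()
    # gradient() returns the grad as a numpy array; on Ascend devices the
    # patched path above converts the layout before the copy to CPU.
    grad_np = linear.weight.gradient()
    print(grad_np.shape)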
@@ -41,15 +41,15 @@ class TestNPUIdentityOp(unittest.TestCase):
             main_program, feed={x_data.name: self.x}, fetch_list=[output]
         )

-        np.testing.assert_allclose(result[0].shape, self.shape, rtol=1e-08)
+        np.testing.assert_allclose(result[0], self.x, rtol=1e-08)

     def test_api_dygraph(self):
         paddle.disable_static(self.place)

-        x_tensor = paddle.to_tensor(self.x)
-        out = paddle.incubate._npu_identity(x_tensor, self.format)
+        x = paddle.to_tensor(self.x)
+        out = paddle.incubate._npu_identity(x, self.format)

-        np.testing.assert_allclose(out.shape, self.shape, rtol=1e-08)
+        np.testing.assert_allclose(out.numpy(), self.x, rtol=1e-08)

         paddle.enable_static()
......
@@ -52,7 +52,7 @@ def _npu_identity(x, format=-1):
         return _C_ops.npu_identity(x, format)
     if _in_legacy_dygraph():
-        return _legacy_C_ops.npu_identity(x, format)
+        return _legacy_C_ops.npu_identity(x, 'format', format)

     check_variable_and_dtype(
         x,
......
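The wrapper dispatches to the eager op, the legacy dygraph op (whose attributes are passed as trailing ('format', value) pairs, hence the one-line fix), or the static-graph path. The static path is what the updated unit test exercises; a self-contained sketch of it, mirroring test_api_static (values now round-trip because of the kernel change above):

    import numpy as np
    import paddle

    paddle.enable_static()
    main_program = paddle.static.Program()
    with paddle.static.program_guard(main_program):
        x = paddle.static.data(name='x', shape=[2, 3], dtype='float32')
        out = paddle.incubate._npu_identity(x, -1)

    exe = paddle.static.Executor(paddle.CPUPlace())
    x_np = np.random.rand(2, 3).astype('float32')
    (result,) = exe.run(main_program, feed={'x': x_np}, fetch_list=[out])
    np.testing.assert_allclose(result, x_np, rtol=1e-08)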
@@ -24,11 +24,13 @@ from ...fluid.data_feeder import check_variable_and_dtype, check_dtype
 from ...fluid.layer_helper import LayerHelper
 from ...tensor.manipulation import unsqueeze, squeeze
 from ...fluid.layers import nn
+from ...framework import no_grad
 from paddle import _C_ops, _legacy_C_ops
 from paddle import get_flags
 from paddle import in_dynamic_mode
 from paddle.device import is_compiled_with_cuda
 from paddle.device import is_compiled_with_npu
+from paddle.device import get_all_custom_device_type
 from paddle import in_dynamic_mode
 from paddle import get_flags
 from paddle.device import is_compiled_with_rocm
@@ -150,15 +152,20 @@ def _conv_nd(
         if isinstance(bias, tuple):
             bias = bias[0]
         if len(bias.shape) < len(x.shape):
-            tmp_bias = _C_ops.reshape(
+            bias = _C_ops.reshape(
                 bias,
                 [1 for i in range(channel_dim)]
                 + bias.shape
                 + [1 for i in range(len(x.shape) - channel_dim - 1)],
             )
-            return _C_ops.add(pre_bias, tmp_bias)
-        else:
-            return _C_ops.add(pre_bias, bias)
+        # TODO(qili93): temporary for ascend npu performance to be removed along with npu_identity op
+        if 'npu' in get_all_custom_device_type():
+            with no_grad():
+                bias_storage = _C_ops.npu_identity(
+                    bias, 3
+                )  # ACL_FORMAT_NC1HWC0 = 3
+                bias_storage._share_underline_tensor_to(bias)
+        return _C_ops.add(pre_bias, bias)
     else:
         return pre_bias
@@ -747,8 +754,26 @@ def conv2d(
             data_format,
         )
         if bias is not None:
-            out = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
-            return out
+            channel_dim = (
+                channel_dim + len(x.shape)
+                if channel_dim < 0
+                else channel_dim
+            )
+            if len(bias.shape) < len(x.shape):
+                bias = _C_ops.reshape(
+                    bias,
+                    [1 for i in range(channel_dim)]
+                    + bias.shape
+                    + [1 for i in range(len(x.shape) - channel_dim - 1)],
+                )
+            # TODO(qili93): temporary for ascend npu performance to be removed along with npu_identity op
+            if 'npu' in get_all_custom_device_type():
+                with no_grad():
+                    bias_storage = _C_ops.npu_identity(
+                        bias, 3
+                    )  # ACL_FORMAT_NC1HWC0 = 3
+                    bias_storage._share_underline_tensor_to(bias)
+            return _C_ops.add(pre_bias, bias)
         else:
             return pre_bias
......
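Both _conv_nd and the conv2d fallback now reshape the bias to a broadcastable N-D shape and, on Ascend devices, convert its storage once to ACL_FORMAT_NC1HWC0 and share it back into the parameter, so the add and later conv kernels read the NPU-preferred layout without per-step conversions. The conversion pattern in isolation (a sketch; with the CPU/GPU kernel change above this degenerates to a harmless no-op):

    import paddle
    from paddle import _C_ops

    bias = paddle.zeros([1, 8, 1, 1])  # bias already reshaped for broadcasting
    with paddle.no_grad():
        bias_storage = _C_ops.npu_identity(bias, 3)  # ACL_FORMAT_NC1HWC0 = 3
        # Share the converted storage back so the parameter itself now holds it.
        bias_storage._share_underline_tensor_to(bias)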
@@ -46,6 +46,7 @@ from .. import functional as F
 from paddle import _C_ops, _legacy_C_ops
 from .. import Layer
 from paddle import in_dynamic_mode
+from paddle.device import get_all_custom_device_type
 from paddle.fluid.framework import in_dygraph_mode, _in_legacy_dygraph

 __all__ = []
@@ -683,6 +684,26 @@ class _BatchNormBase(Layer):
             )
             self._variance.stop_gradient = True

+        # TODO(qili93): temporary for ascend npu performance to be removed along with npu_identity op
+        if 'npu' in get_all_custom_device_type():
+            with no_grad():
+                weight_trans = _C_ops.npu_identity(
+                    self.weight, 3
+                )  # ACL_FORMAT_NC1HWC0 = 3
+                bias_trans = _C_ops.npu_identity(
+                    self.bias, 3
+                )  # ACL_FORMAT_NC1HWC0 = 3
+                mean_trans = _C_ops.npu_identity(
+                    self._mean, 3
+                )  # ACL_FORMAT_NC1HWC0 = 3
+                var_trans = _C_ops.npu_identity(
+                    self._variance, 3
+                )  # ACL_FORMAT_NC1HWC0 = 3
+                weight_trans._share_underline_tensor_to(self.weight)
+                bias_trans._share_underline_tensor_to(self.bias)
+                mean_trans._share_underline_tensor_to(self._mean)
+                var_trans._share_underline_tensor_to(self._variance)
+
         self._data_format = data_format
         self._in_place = False
         self._momentum = momentum
......
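The same trick is applied once at layer construction time: on an Ascend custom device the batch-norm weight, bias, running mean, and running variance are converted to ACL_FORMAT_NC1HWC0 and shared back into the parameters, so every forward pass consumes them in the NPU-preferred layout. On CPU/GPU the branch is skipped and behaviour is unchanged; a plain usage example for reference:

    import paddle

    # On 'npu' custom devices the four parameters/buffers are pre-converted in
    # __init__ by the block above; elsewhere this constructs a normal layer.
    bn = paddle.nn.BatchNorm2D(8)
    y = bn(paddle.rand([2, 8, 4, 4]))
    print(y.shape)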