From 32143f44a2abe9ebd1b7370a9df9c686cd109c9b Mon Sep 17 00:00:00 2001
From: Qi Li
Date: Mon, 28 Nov 2022 13:26:54 +0800
Subject: [PATCH] [NPU] apply npu_identity to conv bn and copy2cpu,
 test=develop (#48039)

* [NPU] apply npu_identity to conv bn and copy2cpu, test=develop

* update npu identity to share data with x, test=develop

* address review comments, test=develop
---
 paddle/fluid/pybind/eager_method.cc           |  9 +++++
 paddle/fluid/pybind/tensor_py.h               | 14 +++++++
 paddle/phi/api/ext/tensor_compat.h            |  1 +
 paddle/phi/kernels/npu_identity_kernel.cc     |  6 +--
 .../fluid/dygraph/varbase_patch_methods.py    |  7 +++-
 .../tests/unittests/test_npu_identity_op.py   |  8 ++--
 python/paddle/incubate/tensor/manipulation.py |  2 +-
 python/paddle/nn/functional/conv.py           | 37 ++++++++++++++++---
 python/paddle/nn/layer/norm.py                | 21 +++++++++++
 9 files changed, 89 insertions(+), 16 deletions(-)

diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc
index 6c91b32786..17d210cc2f 100644
--- a/paddle/fluid/pybind/eager_method.cc
+++ b/paddle/fluid/pybind/eager_method.cc
@@ -261,6 +261,15 @@ static PyObject* tensor_method_numpy(TensorObject* self,
     VLOG(6) << "Getting DenseTensor's numpy value";
     auto dense_tensor =
         std::dynamic_pointer_cast<phi::DenseTensor>(self->tensor.impl());
+    // TODO(qili93): temporary for ascend npu performance, to be removed
+    // along with npu_identity op
+    paddle::experimental::Tensor temp_tensor(
+        std::make_shared<phi::DenseTensor>());
+    if (dense_tensor->storage_properties_initialized()) {
+      temp_tensor = npu_identity_ad_func(self->tensor, -1);
+      dense_tensor =
+          std::dynamic_pointer_cast<phi::DenseTensor>(temp_tensor.impl());
+    }
     phi::DeviceManager::GetDeviceWithPlace(self->tensor.place())
         ->MemoryCopyD2H(
             pybind11::detail::array_proxy(array)->data,
diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h
index 811ff2de64..f0c038226f 100644
--- a/paddle/fluid/pybind/tensor_py.h
+++ b/paddle/fluid/pybind/tensor_py.h
@@ -34,6 +34,7 @@ limitations under the License. */
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #endif
+#include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h"
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/platform/device_context.h"
@@ -1168,6 +1169,19 @@ inline py::array TensorToPyArray(const phi::DenseTensor &tensor,
           "PyArray does not own data, in which case memory leak "
           "or double free would occur"));
 
+  // TODO(qili93): temporary for ascend npu performance, to be removed along
+  // with npu_identity op
+  paddle::experimental::Tensor tensor_out(
+      std::make_shared<phi::DenseTensor>());
+  if (tensor.storage_properties_initialized()) {
+    paddle::experimental::Tensor tensor_in(
+        std::make_shared<phi::DenseTensor>(tensor));
+    tensor_out = npu_identity_ad_func(tensor_in, -1);
+    auto dense_tensor =
+        std::dynamic_pointer_cast<phi::DenseTensor>(tensor_out.impl());
+    tensor_buf_ptr = dense_tensor->data();
+  }
+
   size_t copy_bytes = sizeof_dtype * numel;
   platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
   auto &ctx = *pool.Get(tensor.place());
diff --git a/paddle/phi/api/ext/tensor_compat.h b/paddle/phi/api/ext/tensor_compat.h
index 2833629f0f..7233744c65 100644
--- a/paddle/phi/api/ext/tensor_compat.h
+++ b/paddle/phi/api/ext/tensor_compat.h
@@ -100,6 +100,7 @@ using experimental::multinomial;
 using experimental::multiply;
 using experimental::mv;
 using experimental::nll_loss;
+using experimental::npu_identity;
 using experimental::one_hot;
 using experimental::ones;
 using experimental::pixel_shuffle;
diff --git a/paddle/phi/kernels/npu_identity_kernel.cc b/paddle/phi/kernels/npu_identity_kernel.cc
index 0c1af9bb40..c9fb7a9728 100644
--- a/paddle/phi/kernels/npu_identity_kernel.cc
+++ b/paddle/phi/kernels/npu_identity_kernel.cc
@@ -24,10 +24,8 @@ void NPUIdentityKernel(const Context& dev_ctx,
                        const DenseTensor& x,
                        const int format,
                        DenseTensor* out) {
-  VLOG(4) << "npu_identity op is only for NPU, CPU or GPU kernel just empty "
-             "tensor with shape: "
-          << out->dims() << ", please avoid using this kernel!";
-  *out = phi::EmptyLike<T, Context>(dev_ctx, *out);
+  VLOG(4) << "npu_identity op is only for NPU, please avoid using this kernel!";
+  out->ShareDataWith(x);
 }
 
 }  // namespace phi
diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py
index 6fa46692c7..ee57dc8cc2 100644
--- a/python/paddle/fluid/dygraph/varbase_patch_methods.py
+++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py
@@ -40,6 +40,7 @@ import paddle.utils.deprecated as deprecated
 import paddle.profiler as profiler
 from paddle.profiler.utils import in_profiler_mode
 from paddle import _C_ops, _legacy_C_ops
+from paddle.device import get_all_custom_device_type
 
 _grad_scalar = None
 
@@ -376,7 +377,11 @@ def monkey_patch_varbase():
             if self._grad_ivar() is None:
                 return None
 
-            new_ivar = self._grad_ivar()._copy_to(core.CPUPlace(), True)
+            new_ivar = self._grad_ivar()
+            # TODO(qili93): temporary for ascend npu performance, to be removed along with npu_identity op
+            if 'npu' in get_all_custom_device_type():
+                new_ivar = paddle.incubate._npu_identity(x=new_ivar, format=-1)
+            new_ivar = new_ivar._copy_to(core.CPUPlace(), True)
             if self._grad_ivar().type == core.VarDesc.VarType.SELECTED_ROWS:
                 return (
                     np.array(new_ivar.value().get_selected_rows().get_tensor()),
diff --git a/python/paddle/fluid/tests/unittests/test_npu_identity_op.py b/python/paddle/fluid/tests/unittests/test_npu_identity_op.py
index 183a86a8ce..b79811dabe 100644
--- a/python/paddle/fluid/tests/unittests/test_npu_identity_op.py
+++ b/python/paddle/fluid/tests/unittests/test_npu_identity_op.py
@@ -41,15 +41,15 @@ class TestNPUIdentityOp(unittest.TestCase):
             main_program, feed={x_data.name: self.x}, fetch_list=[output]
         )
 
-        np.testing.assert_allclose(result[0].shape, self.shape, rtol=1e-08)
+        np.testing.assert_allclose(result[0], self.x, rtol=1e-08)
 
     def test_api_dygraph(self):
         paddle.disable_static(self.place)
-        x_tensor = paddle.to_tensor(self.x)
-        out = paddle.incubate._npu_identity(x_tensor, self.format)
+        x = paddle.to_tensor(self.x)
+        out = paddle.incubate._npu_identity(x, self.format)
 
-        np.testing.assert_allclose(out.shape, self.shape, rtol=1e-08)
+        np.testing.assert_allclose(out.numpy(), self.x, rtol=1e-08)
 
         paddle.enable_static()
diff --git a/python/paddle/incubate/tensor/manipulation.py b/python/paddle/incubate/tensor/manipulation.py
index 0722c94aa1..b5f1681b9e 100644
--- a/python/paddle/incubate/tensor/manipulation.py
+++ b/python/paddle/incubate/tensor/manipulation.py
@@ -52,7 +52,7 @@ def _npu_identity(x, format=-1):
         return _C_ops.npu_identity(x, format)
 
     if _in_legacy_dygraph():
-        return _legacy_C_ops.npu_identity(x, format)
+        return _legacy_C_ops.npu_identity(x, 'format', format)
 
     check_variable_and_dtype(
         x,
diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py
index 1d5d5df458..58f0254f09 100644
--- a/python/paddle/nn/functional/conv.py
+++ b/python/paddle/nn/functional/conv.py
@@ -24,11 +24,13 @@ from ...fluid.data_feeder import check_variable_and_dtype, check_dtype
 from ...fluid.layer_helper import LayerHelper
 from ...tensor.manipulation import unsqueeze, squeeze
 from ...fluid.layers import nn
+from ...framework import no_grad
 from paddle import _C_ops, _legacy_C_ops
 from paddle import get_flags
 from paddle import in_dynamic_mode
 from paddle.device import is_compiled_with_cuda
 from paddle.device import is_compiled_with_npu
+from paddle.device import get_all_custom_device_type
 from paddle import in_dynamic_mode
 from paddle import get_flags
 from paddle.device import is_compiled_with_rocm
@@ -150,15 +152,20 @@ def _conv_nd(
         if isinstance(bias, tuple):
             bias = bias[0]
         if len(bias.shape) < len(x.shape):
-            tmp_bias = _C_ops.reshape(
+            bias = _C_ops.reshape(
                 bias,
                 [1 for i in range(channel_dim)]
                 + bias.shape
                 + [1 for i in range(len(x.shape) - channel_dim - 1)],
             )
-            return _C_ops.add(pre_bias, tmp_bias)
-        else:
-            return _C_ops.add(pre_bias, bias)
+        # TODO(qili93): temporary for ascend npu performance, to be removed along with npu_identity op
+        if 'npu' in get_all_custom_device_type():
+            with no_grad():
+                bias_storage = _C_ops.npu_identity(
+                    bias, 3
+                )  # ACL_FORMAT_NC1HWC0 = 3
+                bias_storage._share_underline_tensor_to(bias)
+        return _C_ops.add(pre_bias, bias)
     else:
         return pre_bias
@@ -747,8 +754,26 @@ def conv2d(
             data_format,
         )
         if bias is not None:
-            out = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
-            return out
+            channel_dim = (
+                channel_dim + len(x.shape)
+                if channel_dim < 0
+                else channel_dim
+            )
+            if len(bias.shape) < len(x.shape):
+                bias = _C_ops.reshape(
+                    bias,
+                    [1 for i in range(channel_dim)]
+                    + bias.shape
+                    + [1 for i in range(len(x.shape) - channel_dim - 1)],
+                )
+            # TODO(qili93): temporary for ascend npu performance, to be removed along with npu_identity op
+            if 'npu' in get_all_custom_device_type():
+                with no_grad():
+                    bias_storage = _C_ops.npu_identity(
+                        bias, 3
+                    )  # ACL_FORMAT_NC1HWC0 = 3
+                    bias_storage._share_underline_tensor_to(bias)
+            return _C_ops.add(pre_bias, bias)
         else:
             return pre_bias
diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py
index cd28479b5d..2395f2ed54 100644
--- a/python/paddle/nn/layer/norm.py
+++ b/python/paddle/nn/layer/norm.py
@@ -46,6 +46,7 @@ from .. import functional as F
 from paddle import _C_ops, _legacy_C_ops
 from .. import Layer
 from paddle import in_dynamic_mode
+from paddle.device import get_all_custom_device_type
 from paddle.fluid.framework import in_dygraph_mode, _in_legacy_dygraph
 
 __all__ = []
@@ -683,6 +684,26 @@ class _BatchNormBase(Layer):
         )
         self._variance.stop_gradient = True
 
+        # TODO(qili93): temporary for ascend npu performance, to be removed along with npu_identity op
+        if 'npu' in get_all_custom_device_type():
+            with no_grad():
+                weight_trans = _C_ops.npu_identity(
+                    self.weight, 3
+                )  # ACL_FORMAT_NC1HWC0 = 3
+                bias_trans = _C_ops.npu_identity(
+                    self.bias, 3
+                )  # ACL_FORMAT_NC1HWC0 = 3
+                mean_trans = _C_ops.npu_identity(
+                    self._mean, 3
+                )  # ACL_FORMAT_NC1HWC0 = 3
+                var_trans = _C_ops.npu_identity(
+                    self._variance, 3
+                )  # ACL_FORMAT_NC1HWC0 = 3
+                weight_trans._share_underline_tensor_to(self.weight)
+                bias_trans._share_underline_tensor_to(self.bias)
+                mean_trans._share_underline_tensor_to(self._mean)
+                var_trans._share_underline_tensor_to(self._variance)
+
         self._data_format = data_format
         self._in_place = False
         self._momentum = momentum
-- 
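A minimal usage sketch of the pattern this patch applies (not part of the
patch itself): convert a persistent tensor's storage to the private ACL
format once with npu_identity, then share the converted storage back into
the original tensor so later ops reuse it. Shapes and variable names below
are illustrative only, and the sketch assumes a PaddlePaddle build with the
'npu' custom device plugin registered (otherwise the guarded branch is a
no-op).

    import paddle
    from paddle.device import get_all_custom_device_type

    x = paddle.randn([4, 8, 32, 32])   # NCHW input, illustrative shape
    bias = paddle.zeros([1, 8, 1, 1])  # bias already broadcast-shaped

    if 'npu' in get_all_custom_device_type():
        # One-off conversion of the bias storage to ACL_FORMAT_NC1HWC0 (= 3),
        # shared back into `bias` so the add below does not trigger a
        # per-step format transform on the NPU.
        with paddle.no_grad():
            bias_storage = paddle.incubate._npu_identity(x=bias, format=3)
            bias_storage._share_underline_tensor_to(bias)

    out = paddle.add(x, bias)

    # Device-to-host copies first route through npu_identity with format=-1
    # (back to the origin format); the eager_method.cc and tensor_py.h hunks
    # above implement the same step on the C++ side.
    host = out.numpy()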