Unverified commit 32143f44 authored by Qi Li, committed by GitHub

[NPU] apply npu_identity to conv bn and copy2cpu, test=develop (#48039)

* [NPU] apply npu_identity to conv bn and copy2cpu, test=develop

* update npu identity to share data with x, test=develop

* address review comments, test=develop
Parent 74d411e7
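
The pattern this commit applies inside the conv and batch-norm layers, distilled into a minimal sketch (not part of the diff; it assumes an Ascend device registered as custom device type 'npu' and takes ACL_FORMAT_NC1HWC0 = 3 from the in-diff comments):

import paddle
from paddle import _C_ops
from paddle.device import get_all_custom_device_type


def convert_param_to_npu_storage(param, acl_format=3):
    # Convert a parameter into the NPU private layout (ACL_FORMAT_NC1HWC0 = 3)
    # and share the converted storage back into the original parameter, so the
    # conv/bn kernels can reuse it without redoing the layout transform on
    # every forward pass.
    if 'npu' in get_all_custom_device_type():
        with paddle.no_grad():
            trans = _C_ops.npu_identity(param, acl_format)
            trans._share_underline_tensor_to(param)
    return param
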
......@@ -261,6 +261,15 @@ static PyObject* tensor_method_numpy(TensorObject* self,
VLOG(6) << "Getting DenseTensor's numpy value";
auto dense_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(self->tensor.impl());
// TODO(qili93): temporary for ascend npu performance to be removed along
// with npu_identity op
paddle::experimental::Tensor temp_tensor(
std::make_shared<phi::DenseTensor>());
if (dense_tensor->storage_properties_initialized()) {
temp_tensor = npu_identity_ad_func(self->tensor, -1);
dense_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(temp_tensor.impl());
}
phi::DeviceManager::GetDeviceWithPlace(self->tensor.place())
->MemoryCopyD2H(
pybind11::detail::array_proxy(array)->data,
......
......@@ -34,6 +34,7 @@ limitations under the License. */
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
#include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h"
#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/platform/device_context.h"
......@@ -1168,6 +1169,19 @@ inline py::array TensorToPyArray(const phi::DenseTensor &tensor,
"PyArray does not own data, in which case memory leak "
"or double free would occur"));
// TODO(qili93): temporary for ascend npu performance to be removed along
// with npu_identity op
paddle::experimental::Tensor tensor_out(
std::make_shared<phi::DenseTensor>());
if (tensor.storage_properties_initialized()) {
paddle::experimental::Tensor tensor_in(
std::make_shared<phi::DenseTensor>(tensor));
tensor_out = npu_identity_ad_func(tensor_in, -1);
auto dense_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(tensor_out.impl());
tensor_buf_ptr = dense_tensor->data();
}
size_t copy_bytes = sizeof_dtype * numel;
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &ctx = *pool.Get(tensor.place());
......
......@@ -100,6 +100,7 @@ using experimental::multinomial;
using experimental::multiply;
using experimental::mv;
using experimental::nll_loss;
using experimental::npu_identity;
using experimental::one_hot;
using experimental::ones;
using experimental::pixel_shuffle;
......
......@@ -24,10 +24,8 @@ void NPUIdentityKernel(const Context& dev_ctx,
const DenseTensor& x,
const int format,
DenseTensor* out) {
VLOG(4) << "npu_identity op is only for NPU, CPU or GPU kernel just empty "
"tensor with shape: "
<< out->dims() << ", please avoid using this kernel!";
*out = phi::EmptyLike<T, Context>(dev_ctx, *out);
VLOG(4) << "npu_identity op is only for NPU, please avoid using this kernel!";
out->ShareDataWith(x);
}
} // namespace phi
......
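
With this hunk the CPU/GPU kernel becomes a zero-copy pass-through: it shares the input's buffer instead of filling `out` with an empty tensor, so on a non-NPU device the op returns a tensor equal to its input. A minimal sanity check, assuming a CPU build of this branch:

import numpy as np
import paddle

paddle.disable_static(paddle.CPUPlace())
x = paddle.to_tensor(np.random.rand(2, 3, 4, 5).astype('float32'))
out = paddle.incubate._npu_identity(x, format=-1)  # -1 keeps the original format
np.testing.assert_allclose(out.numpy(), x.numpy(), rtol=1e-08)
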
......@@ -40,6 +40,7 @@ import paddle.utils.deprecated as deprecated
import paddle.profiler as profiler
from paddle.profiler.utils import in_profiler_mode
from paddle import _C_ops, _legacy_C_ops
from paddle.device import get_all_custom_device_type
_grad_scalar = None
......@@ -376,7 +377,11 @@ def monkey_patch_varbase():
if self._grad_ivar() is None:
return None
new_ivar = self._grad_ivar()._copy_to(core.CPUPlace(), True)
new_ivar = self._grad_ivar()
# TODO(qili93): temporary for ascend npu performance to be removed along with npu_identity op
if 'npu' in get_all_custom_device_type():
new_ivar = paddle.incubate._npu_identity(x=new_ivar, format=-1)
new_ivar = new_ivar._copy_to(core.CPUPlace(), True)
if self._grad_ivar().type == core.VarDesc.VarType.SELECTED_ROWS:
return (
np.array(new_ivar.value().get_selected_rows().get_tensor()),
......
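
The grad getter above follows a "convert back to the default layout, then copy to host" pattern; the same idea as a standalone sketch under the branch's assumptions (hypothetical helper name, Ascend registered as custom device 'npu'):

import paddle
from paddle.device import get_all_custom_device_type


def tensor_to_cpu_numpy(t):
    # On Ascend NPU the tensor may be stored in a private layout such as
    # NC1HWC0; npu_identity with format=-1 converts it back to the default
    # layout so the host copy matches the logical shape.
    if 'npu' in get_all_custom_device_type():
        t = paddle.incubate._npu_identity(x=t, format=-1)
    return t._copy_to(paddle.CPUPlace(), True).numpy()
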
......@@ -41,15 +41,15 @@ class TestNPUIdentityOp(unittest.TestCase):
main_program, feed={x_data.name: self.x}, fetch_list=[output]
)
np.testing.assert_allclose(result[0].shape, self.shape, rtol=1e-08)
np.testing.assert_allclose(result[0], self.x, rtol=1e-08)
def test_api_dygraph(self):
paddle.disable_static(self.place)
x_tensor = paddle.to_tensor(self.x)
out = paddle.incubate._npu_identity(x_tensor, self.format)
x = paddle.to_tensor(self.x)
out = paddle.incubate._npu_identity(x, self.format)
np.testing.assert_allclose(out.shape, self.shape, rtol=1e-08)
np.testing.assert_allclose(out.numpy(), self.x, rtol=1e-08)
paddle.enable_static()
......
......@@ -52,7 +52,7 @@ def _npu_identity(x, format=-1):
return _C_ops.npu_identity(x, format)
if _in_legacy_dygraph():
return _legacy_C_ops.npu_identity(x, format)
return _legacy_C_ops.npu_identity(x, 'format', format)
check_variable_and_dtype(
x,
......
......@@ -24,11 +24,13 @@ from ...fluid.data_feeder import check_variable_and_dtype, check_dtype
from ...fluid.layer_helper import LayerHelper
from ...tensor.manipulation import unsqueeze, squeeze
from ...fluid.layers import nn
from ...framework import no_grad
from paddle import _C_ops, _legacy_C_ops
from paddle import get_flags
from paddle import in_dynamic_mode
from paddle.device import is_compiled_with_cuda
from paddle.device import is_compiled_with_npu
from paddle.device import get_all_custom_device_type
from paddle import in_dynamic_mode
from paddle import get_flags
from paddle.device import is_compiled_with_rocm
......@@ -150,15 +152,20 @@ def _conv_nd(
if isinstance(bias, tuple):
bias = bias[0]
if len(bias.shape) < len(x.shape):
tmp_bias = _C_ops.reshape(
bias = _C_ops.reshape(
bias,
[1 for i in range(channel_dim)]
+ bias.shape
+ [1 for i in range(len(x.shape) - channel_dim - 1)],
)
return _C_ops.add(pre_bias, tmp_bias)
else:
return _C_ops.add(pre_bias, bias)
# TODO(qili93): temporary for ascend npu performance to be removed along with npu_identity op
if 'npu' in get_all_custom_device_type():
with no_grad():
bias_storage = _C_ops.npu_identity(
bias, 3
) # ACL_FORMAT_NC1HWC0 = 3
bias_storage._share_underline_tensor_to(bias)
return _C_ops.add(pre_bias, bias)
else:
return pre_bias
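
For reference, the bias branch above reshapes the 1-D bias so it broadcasts against pre_bias along channel_dim before the add; a small pure-Python sketch of that shape computation (hypothetical helper name):

def _bias_broadcast_shape(x_ndim, channel_dim, bias_shape):
    # e.g. x_ndim=4 (NCHW), channel_dim=1, bias_shape=[C] -> [1, C, 1, 1]
    return [1] * channel_dim + list(bias_shape) + [1] * (x_ndim - channel_dim - 1)


assert _bias_broadcast_shape(4, 1, [8]) == [1, 8, 1, 1]  # NCHW layout
assert _bias_broadcast_shape(4, 3, [8]) == [1, 1, 1, 8]  # NHWC layout
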
......@@ -747,8 +754,26 @@ def conv2d(
data_format,
)
if bias is not None:
out = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
return out
channel_dim = (
channel_dim + len(x.shape)
if channel_dim < 0
else channel_dim
)
if len(bias.shape) < len(x.shape):
bias = _C_ops.reshape(
bias,
[1 for i in range(channel_dim)]
+ bias.shape
+ [1 for i in range(len(x.shape) - channel_dim - 1)],
)
# TODO(qili93): temporary for ascend npu performance to be removed along with npu_identity op
if 'npu' in get_all_custom_device_type():
with no_grad():
bias_storage = _C_ops.npu_identity(
bias, 3
) # ACL_FORMAT_NC1HWC0 = 3
bias_storage._share_underline_tensor_to(bias)
return _C_ops.add(pre_bias, bias)
else:
return pre_bias
......
......@@ -46,6 +46,7 @@ from .. import functional as F
from paddle import _C_ops, _legacy_C_ops
from .. import Layer
from paddle import in_dynamic_mode
from paddle.device import get_all_custom_device_type
from paddle.fluid.framework import in_dygraph_mode, _in_legacy_dygraph
__all__ = []
......@@ -683,6 +684,26 @@ class _BatchNormBase(Layer):
)
self._variance.stop_gradient = True
# TODO(qili93): temporary for ascend npu performance to be removed along with npu_identity op
if 'npu' in get_all_custom_device_type():
with no_grad():
weight_trans = _C_ops.npu_identity(
self.weight, 3
) # ACL_FORMAT_NC1HWC0 = 3
bias_trans = _C_ops.npu_identity(
self.bias, 3
) # ACL_FORMAT_NC1HWC0 = 3
mean_trans = _C_ops.npu_identity(
self._mean, 3
) # ACL_FORMAT_NC1HWC0 = 3
var_trans = _C_ops.npu_identity(
self._variance, 3
) # ACL_FORMAT_NC1HWC0 = 3
weight_trans._share_underline_tensor_to(self.weight)
bias_trans._share_underline_tensor_to(self.bias)
mean_trans._share_underline_tensor_to(self._mean)
var_trans._share_underline_tensor_to(self._variance)
self._data_format = data_format
self._in_place = False
self._momentum = momentum
......
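
With the batch-norm hunk above, constructing a batch-norm layer on an Ascend build converts its weight, bias, running mean and running variance to the NPU private format once at init time. A hedged usage sketch (assumes a custom device registered as 'npu' and that paddle.set_device accepts that device type):

import paddle
from paddle.device import get_all_custom_device_type

if 'npu' in get_all_custom_device_type():
    paddle.set_device('npu')
    bn = paddle.nn.BatchNorm2D(8)    # parameters/buffers converted to NC1HWC0 in __init__
    x = paddle.rand([4, 8, 16, 16])
    y = bn(x)                        # no extra transdata needed on each forward
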