Unverified commit 32143f44, authored by Qi Li, committed by GitHub

[NPU] apply npu_identity to conv bn and copy2cpu, test=develop (#48039)

* [NPU] apply npu_identity to conv bn and copy2cpu, test=develop

* update npu identity to share data with x, test=develop

* address review comments, test=develop
Parent 74d411e7
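For context: npu_identity takes a tensor and a target ACL format id and returns a tensor whose storage uses that format. This change uses it in two ways: with format -1 (keep/restore the default layout) right before device-to-host copies, and with format 3 (ACL_FORMAT_NC1HWC0, per the in-code comments) to pre-convert conv bias and batch-norm parameters on Ascend NPU devices. A minimal call through the public wrapper, as an illustration only (the format-3 path is only meaningful on an Ascend custom device):

    import paddle

    x = paddle.rand([2, 3, 4, 4])
    # -1 asks for the default/original layout; 3 would request NC1HWC0 on NPU.
    y = paddle.incubate._npu_identity(x, -1)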
@@ -261,6 +261,15 @@ static PyObject* tensor_method_numpy(TensorObject* self,
     VLOG(6) << "Getting DenseTensor's numpy value";
     auto dense_tensor =
         std::dynamic_pointer_cast<phi::DenseTensor>(self->tensor.impl());
+    // TODO(qili93): temporary for ascend npu performance to be removed along
+    // with npu_identity op
+    paddle::experimental::Tensor temp_tensor(
+        std::make_shared<phi::DenseTensor>());
+    if (dense_tensor->storage_properties_initialized()) {
+      temp_tensor = npu_identity_ad_func(self->tensor, -1);
+      dense_tensor =
+          std::dynamic_pointer_cast<phi::DenseTensor>(temp_tensor.impl());
+    }
     phi::DeviceManager::GetDeviceWithPlace(self->tensor.place())
         ->MemoryCopyD2H(
             pybind11::detail::array_proxy(array)->data,
......
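The hunk above only fires when the DenseTensor reports initialized storage properties, i.e. it sits in a private Ascend layout; the tensor is then routed through npu_identity with format -1 so the following MemoryCopyD2H reads a plain-layout buffer. A rough Python-level mirror of that logic, for illustration only (the real check is the C++ storage_properties_initialized(), not the device query used here):

    import paddle
    from paddle.device import get_all_custom_device_type

    def to_numpy_with_ascend_layout(t):
        # Stand-in for storage_properties_initialized(): on an Ascend custom
        # device the tensor may be stored in a private format and has to be
        # converted back before the device-to-host copy.
        if 'npu' in get_all_custom_device_type():
            t = paddle.incubate._npu_identity(x=t, format=-1)
        return t.numpy()  # D2H copy of a default-layout buffer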
@@ -34,6 +34,7 @@ limitations under the License. */
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #endif
+#include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h"
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/platform/device_context.h"
@@ -1168,6 +1169,19 @@ inline py::array TensorToPyArray(const phi::DenseTensor &tensor,
                         "PyArray does not own data, in which case memory leak "
                         "or double free would occur"));

+  // TODO(qili93): temporary for ascend npu performance to be removed along
+  // with npu_identity op
+  paddle::experimental::Tensor tensor_out(
+      std::make_shared<phi::DenseTensor>());
+  if (tensor.storage_properties_initialized()) {
+    paddle::experimental::Tensor tensor_in(
+        std::make_shared<phi::DenseTensor>(tensor));
+    tensor_out = npu_identity_ad_func(tensor_in, -1);
+    auto dense_tensor =
+        std::dynamic_pointer_cast<phi::DenseTensor>(tensor_out.impl());
+    tensor_buf_ptr = dense_tensor->data();
+  }
+
   size_t copy_bytes = sizeof_dtype * numel;
   platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
   auto &ctx = *pool.Get(tensor.place());
......
@@ -100,6 +100,7 @@ using experimental::multinomial;
 using experimental::multiply;
 using experimental::mv;
 using experimental::nll_loss;
+using experimental::npu_identity;
 using experimental::one_hot;
 using experimental::ones;
 using experimental::pixel_shuffle;
......
@@ -24,10 +24,8 @@ void NPUIdentityKernel(const Context& dev_ctx,
                        const DenseTensor& x,
                        const int format,
                        DenseTensor* out) {
-  VLOG(4) << "npu_identity op is only for NPU, CPU or GPU kernel just empty "
-             "tensor with shape: "
-          << out->dims() << ", please avoid using this kernel!";
-  *out = phi::EmptyLike<T, Context>(dev_ctx, *out);
+  VLOG(4) << "npu_identity op is only for NPU, please avoid using this kernel!";
+  out->ShareDataWith(x);
 }

 }  // namespace phi
......
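The CPU/GPU fallback used to log a warning and return an uninitialized tensor via EmptyLike, so only the output shape was meaningful; it now shares storage with x and preserves values, which is what the relaxed test assertions further below rely on. A quick sketch of the new semantics on a non-NPU device:

    import numpy as np
    import paddle

    x = paddle.to_tensor(np.arange(12, dtype='float32').reshape(3, 4))
    out = paddle.incubate._npu_identity(x, -1)
    # With ShareDataWith, the values (not just the shape) round-trip unchanged.
    np.testing.assert_allclose(out.numpy(), x.numpy(), rtol=1e-08)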
@@ -40,6 +40,7 @@ import paddle.utils.deprecated as deprecated
 import paddle.profiler as profiler
 from paddle.profiler.utils import in_profiler_mode
 from paddle import _C_ops, _legacy_C_ops
+from paddle.device import get_all_custom_device_type

 _grad_scalar = None
@@ -376,7 +377,11 @@ def monkey_patch_varbase():
         if self._grad_ivar() is None:
             return None

-        new_ivar = self._grad_ivar()._copy_to(core.CPUPlace(), True)
+        new_ivar = self._grad_ivar()
+        # TODO(qili93): temporary for ascend npu performance to be removed along with npu_identity op
+        if 'npu' in get_all_custom_device_type():
+            new_ivar = paddle.incubate._npu_identity(x=new_ivar, format=-1)
+        new_ivar = new_ivar._copy_to(core.CPUPlace(), True)
         if self._grad_ivar().type == core.VarDesc.VarType.SELECTED_ROWS:
             return (
                 np.array(new_ivar.value().get_selected_rows().get_tensor()),
......
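With this patch, reading a gradient still ends in a copy to CPU, but on an Ascend custom device the grad tensor is first converted back to a default layout via _npu_identity(format=-1). Nothing changes from user code; a hedged sketch of the usual flow (runs on CPU as well, where the extra branch is skipped):

    import paddle

    linear = paddle.nn.Linear(4, 4)
    out = linear(paddle.rand([2, 4]))
    out.sum().backward()
    # gradient() returns the grad as a numpy array; on Ascend devices the
    # patched path above converts the layout before the copy to CPU.
    grad_np = linear.weight.gradient()
    print(grad_np.shape)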
@@ -41,15 +41,15 @@ class TestNPUIdentityOp(unittest.TestCase):
             main_program, feed={x_data.name: self.x}, fetch_list=[output]
         )

-        np.testing.assert_allclose(result[0].shape, self.shape, rtol=1e-08)
+        np.testing.assert_allclose(result[0], self.x, rtol=1e-08)

     def test_api_dygraph(self):
         paddle.disable_static(self.place)

-        x_tensor = paddle.to_tensor(self.x)
-        out = paddle.incubate._npu_identity(x_tensor, self.format)
+        x = paddle.to_tensor(self.x)
+        out = paddle.incubate._npu_identity(x, self.format)

-        np.testing.assert_allclose(out.shape, self.shape, rtol=1e-08)
+        np.testing.assert_allclose(out.numpy(), self.x, rtol=1e-08)

         paddle.enable_static()
......
@@ -52,7 +52,7 @@ def _npu_identity(x, format=-1):
         return _C_ops.npu_identity(x, format)
     if _in_legacy_dygraph():
-        return _legacy_C_ops.npu_identity(x, format)
+        return _legacy_C_ops.npu_identity(x, 'format', format)

     check_variable_and_dtype(
         x,
......
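The wrapper dispatches to the eager op, the legacy dygraph op (whose attributes are passed as trailing ('format', value) pairs, hence the one-line fix), or the static-graph path. The static path is what the updated unit test exercises; a self-contained sketch of it, mirroring test_api_static (values now round-trip because of the kernel change above):

    import numpy as np
    import paddle

    paddle.enable_static()
    main_program = paddle.static.Program()
    with paddle.static.program_guard(main_program):
        x = paddle.static.data(name='x', shape=[2, 3], dtype='float32')
        out = paddle.incubate._npu_identity(x, -1)

    exe = paddle.static.Executor(paddle.CPUPlace())
    x_np = np.random.rand(2, 3).astype('float32')
    (result,) = exe.run(main_program, feed={'x': x_np}, fetch_list=[out])
    np.testing.assert_allclose(result, x_np, rtol=1e-08)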
@@ -24,11 +24,13 @@ from ...fluid.data_feeder import check_variable_and_dtype, check_dtype
 from ...fluid.layer_helper import LayerHelper
 from ...tensor.manipulation import unsqueeze, squeeze
 from ...fluid.layers import nn
+from ...framework import no_grad
 from paddle import _C_ops, _legacy_C_ops
 from paddle import get_flags
 from paddle import in_dynamic_mode
 from paddle.device import is_compiled_with_cuda
 from paddle.device import is_compiled_with_npu
+from paddle.device import get_all_custom_device_type
 from paddle import in_dynamic_mode
 from paddle import get_flags
 from paddle.device import is_compiled_with_rocm
@@ -150,15 +152,20 @@ def _conv_nd(
         if isinstance(bias, tuple):
             bias = bias[0]
         if len(bias.shape) < len(x.shape):
-            tmp_bias = _C_ops.reshape(
+            bias = _C_ops.reshape(
                 bias,
                 [1 for i in range(channel_dim)]
                 + bias.shape
                 + [1 for i in range(len(x.shape) - channel_dim - 1)],
             )
-            return _C_ops.add(pre_bias, tmp_bias)
-        else:
-            return _C_ops.add(pre_bias, bias)
+        # TODO(qili93): temporary for ascend npu performance to be removed along with npu_identity op
+        if 'npu' in get_all_custom_device_type():
+            with no_grad():
+                bias_storage = _C_ops.npu_identity(
+                    bias, 3
+                )  # ACL_FORMAT_NC1HWC0 = 3
+                bias_storage._share_underline_tensor_to(bias)
+        return _C_ops.add(pre_bias, bias)
     else:
         return pre_bias
@@ -747,8 +754,26 @@ def conv2d(
             data_format,
         )
         if bias is not None:
-            out = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
-            return out
+            channel_dim = (
+                channel_dim + len(x.shape)
+                if channel_dim < 0
+                else channel_dim
+            )
+            if len(bias.shape) < len(x.shape):
+                bias = _C_ops.reshape(
+                    bias,
+                    [1 for i in range(channel_dim)]
+                    + bias.shape
+                    + [1 for i in range(len(x.shape) - channel_dim - 1)],
+                )
+            # TODO(qili93): temporary for ascend npu performance to be removed along with npu_identity op
+            if 'npu' in get_all_custom_device_type():
+                with no_grad():
+                    bias_storage = _C_ops.npu_identity(
+                        bias, 3
+                    )  # ACL_FORMAT_NC1HWC0 = 3
+                    bias_storage._share_underline_tensor_to(bias)
+            return _C_ops.add(pre_bias, bias)
         else:
             return pre_bias
......
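Both _conv_nd and the conv2d fallback now reshape the bias to a broadcastable N-D shape and, on Ascend devices, convert its storage once to ACL_FORMAT_NC1HWC0 and share it back into the parameter, so the add and later conv kernels read the NPU-preferred layout without per-step conversions. The conversion pattern in isolation (a sketch; with the CPU/GPU kernel change above this degenerates to a harmless no-op):

    import paddle
    from paddle import _C_ops

    bias = paddle.zeros([1, 8, 1, 1])  # bias already reshaped for broadcasting
    with paddle.no_grad():
        bias_storage = _C_ops.npu_identity(bias, 3)  # ACL_FORMAT_NC1HWC0 = 3
        # Share the converted storage back so the parameter itself now holds it.
        bias_storage._share_underline_tensor_to(bias)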
@@ -46,6 +46,7 @@ from .. import functional as F
 from paddle import _C_ops, _legacy_C_ops
 from .. import Layer
 from paddle import in_dynamic_mode
+from paddle.device import get_all_custom_device_type
 from paddle.fluid.framework import in_dygraph_mode, _in_legacy_dygraph

 __all__ = []
@@ -683,6 +684,26 @@ class _BatchNormBase(Layer):
             )
             self._variance.stop_gradient = True

+        # TODO(qili93): temporary for ascend npu performance to be removed along with npu_identity op
+        if 'npu' in get_all_custom_device_type():
+            with no_grad():
+                weight_trans = _C_ops.npu_identity(
+                    self.weight, 3
+                )  # ACL_FORMAT_NC1HWC0 = 3
+                bias_trans = _C_ops.npu_identity(
+                    self.bias, 3
+                )  # ACL_FORMAT_NC1HWC0 = 3
+                mean_trans = _C_ops.npu_identity(
+                    self._mean, 3
+                )  # ACL_FORMAT_NC1HWC0 = 3
+                var_trans = _C_ops.npu_identity(
+                    self._variance, 3
+                )  # ACL_FORMAT_NC1HWC0 = 3
+                weight_trans._share_underline_tensor_to(self.weight)
+                bias_trans._share_underline_tensor_to(self.bias)
+                mean_trans._share_underline_tensor_to(self._mean)
+                var_trans._share_underline_tensor_to(self._variance)
+
         self._data_format = data_format
         self._in_place = False
         self._momentum = momentum
......
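The same trick is applied once at layer construction time: on an Ascend custom device the batch-norm weight, bias, running mean, and running variance are converted to ACL_FORMAT_NC1HWC0 and shared back into the parameters, so every forward pass consumes them in the NPU-preferred layout. On CPU/GPU the branch is skipped and behaviour is unchanged; a plain usage example for reference:

    import paddle

    # On 'npu' custom devices the four parameters/buffers are pre-converted in
    # __init__ by the block above; elsewhere this constructs a normal layer.
    bn = paddle.nn.BatchNorm2D(8)
    y = bn(paddle.rand([2, 8, 4, 4]))
    print(y.shape)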