Unverified · Commit 526d963e · Authored by kangguangli · Committed by GitHub

[NPU] add gpu kernel for transfer layout (#46307)

* add gpu kernel for transfer layout

* comment error throw

* fix: flag setting in testcase; add condition check for raising error

* fix typo

* fix: add error type for PADDLE_THROW

* remove kernel fallback in data_transfer.cc

* remove useless variable definition
Parent 1ecc39b4
...
@@ -142,11 +142,22 @@ void DataTranferHelper::RunAndConstructOpFuncNode(
   if (phi::KernelFactory::Instance().HasCompatiblePhiKernel(
           op_with_kernel->Type())) {
     auto phi_kernel_key = op_with_kernel->ChoosePhiKernel(exec_ctx);
+    auto phi_kernel_name = op_with_kernel->PhiKernelSignature()->name;
     VLOG(6) << "phi_kernel_key " << phi_kernel_key << "\n";
+    VLOG(6) << "phi_kernel_name " << phi_kernel_name << "\n";
     if (op_with_kernel->PhiKernel()->IsValid()) {
       run_phi_kernel = true;
     }
+    // Data transfer ops must not fall back to CPU. Although they are
+    // device-independent operations, their implementations are
+    // device-related. For example, to change the layout of a GPU tensor
+    // when no GPU kernel of the transfer_layout op exists, the CPU kernel
+    // could only be used by additionally inserting memcpy_d2h/memcpy_h2d
+    // ops, which should not be done here. This may be supported in the
+    // future.
   }
   // 3. Execute transfer op and construct OpFuncNode
...
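The comment in this hunk is the heart of the change: without a GPU transfer_layout kernel, the interpreter could only transpose a GPU tensor by bouncing it through host memory. Below is a minimal sketch of that rejected fallback path in plain NumPy rather than Paddle internals; the function name is hypothetical, and the array copies merely stand in for the memcpy_d2h/memcpy_h2d ops the comment mentions.

import numpy as np

def transfer_layout_via_cpu(dev_tensor):
    # Hypothetical CPU fallback for an NCHW -> NHWC layout transfer.
    host = np.asarray(dev_tensor).copy()     # stands in for a memcpy_d2h op
    nhwc = np.transpose(host, (0, 2, 3, 1))  # the CPU transfer_layout kernel
    return nhwc.copy()                       # stands in for a memcpy_h2d op

x = np.zeros((2, 3, 4, 5), np.float32)                   # NCHW input
assert transfer_layout_via_cpu(x).shape == (2, 4, 5, 3)  # NHWC output

Avoiding this device-to-host round trip is why the commit both adds a GPU kernel and makes the executor refuse the CPU fallback.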
...
@@ -150,6 +150,13 @@ phi::KernelKey FallBackToCpu(const OpKernelType& expected_kernel_key,
       phi::Backend::CPU, kernel_key.layout(), kernel_key.dtype());
 }
 #endif
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  if (platform::is_gpu_place(expected_kernel_key.place_)) {
+    PADDLE_THROW(platform::errors::Unavailable(
+        "GPU kernels must not fall back to CPU kernels."));
+  }
+#endif
   return phi::KernelKey();
 }
...
...
@@ -58,18 +58,20 @@ template struct SetConstant<paddle::platform::CUDAPinnedDeviceContext,
 template struct SetConstant<paddle::platform::CUDAPinnedDeviceContext,
                             phi::dtype::complex<double>>;
 #define DEFINE_GPU_TRANS(RANK)                                          \
   template struct Transpose<phi::GPUContext, bool, RANK>;               \
+  template struct Transpose<phi::GPUContext, unsigned char, RANK>;      \
   template struct Transpose<phi::GPUContext, float, RANK>;              \
   template struct Transpose<phi::GPUContext, double, RANK>;             \
   template struct Transpose<phi::GPUContext, float16, RANK>;            \
   template struct Transpose<phi::GPUContext, bfloat16, RANK>;           \
   template struct Transpose<phi::GPUContext, int8_t, RANK>;             \
+  template struct Transpose<phi::GPUContext, int16_t, RANK>;            \
   template struct Transpose<phi::GPUContext, int32_t, RANK>;            \
   template struct Transpose<phi::GPUContext, int64_t, RANK>;            \
   template struct Transpose<phi::GPUContext,                            \
                             phi::dtype::complex<float>,                 \
                             RANK>;                                      \
   template struct Transpose<phi::GPUContext, phi::dtype::complex<double>, RANK>;
 DEFINE_GPU_TRANS(1);
...
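These Transpose instantiations back the new GPU transfer_layout kernel, so the two added entries (unsigned char and int16_t) extend on-device layout transfer to uint8 and int16 tensors. A quick NumPy sketch of the rank-4 permutation those instantiations compute, run across stand-ins for the instantiated element types (bfloat16 has no NumPy equivalent and is omitted):

import numpy as np

# NumPy stand-ins for the element types instantiated above.
dtypes = [np.bool_, np.uint8, np.float32, np.float64, np.float16,
          np.int8, np.int16, np.int32, np.int64,
          np.complex64, np.complex128]
for dt in dtypes:
    x = np.ones((2, 3, 4, 5), dtype=dt)  # NCHW
    y = x.transpose(0, 2, 3, 1)          # axis permutation for NCHW -> NHWC
    assert y.shape == (2, 4, 5, 3) and y.dtype == dt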
...
@@ -176,3 +176,10 @@ PD_REGISTER_GENERAL_KERNEL(transfer_layout,
                            ALL_LAYOUT,
                            phi::TransferLayoutKernel<phi::CPUContext>,
                            ALL_DTYPE) {}
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PD_REGISTER_GENERAL_KERNEL(transfer_layout,
+                           GPU,
+                           ALL_LAYOUT,
+                           phi::TransferLayoutKernel<phi::GPUContext>,
+                           ALL_DTYPE) {}
+#endif
...
@@ -18,6 +18,8 @@ import numpy as np
 import paddle
 import paddle.fluid.core as core
 import paddle.fluid as fluid
+from paddle.fluid.framework import Program, program_guard
+from paddle.fluid.layer_helper import LayerHelper
 from op_test import OpTest
...
@@ -38,6 +40,52 @@ class TestTransferLayoutOpkNCHWTokNHWC(OpTest):
         self.check_output()
+
+
+# Append a raw softmax op with an explicit data_format attribute; feeding it
+# an input whose layout differs forces the executor to insert transfer_layout.
+def softmax_with_data_format(x, data_format, axis=-1, dtype=None, name=None):
+    helper = LayerHelper("softmax", **locals())
+    outs_cast = x
+    outs_softmax = helper.create_variable_for_type_inference(outs_cast.dtype)
+    helper.append_op(type='softmax',
+                     inputs={'X': outs_cast},
+                     outputs={'Out': outs_softmax},
+                     attrs={
+                         'axis': axis,
+                         'use_cudnn': True,
+                         'data_format': data_format
+                     })
+    return outs_softmax
+
+
+class TestTransferLayoutOpGpu(unittest.TestCase):
+
+    def test_layout_transfer(self):
+        if not core.is_compiled_with_cuda():
+            return
+
+        paddle.enable_static()
+        main_program = Program()
+        startup_program = Program()
+        n, c, h, w = 2, 3, 4, 5
+        with program_guard(main_program, startup_program):
+            x = paddle.static.data(shape=[n, c, h, w],
+                                   dtype='float32',
+                                   name='x')
+            y = softmax_with_data_format(x, data_format='NCHW')
+            z = softmax_with_data_format(x, data_format='NHWC')
+
+        place = fluid.CUDAPlace(
+            0) if core.is_compiled_with_cuda() else fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        exe.run(startup_program)
+        ret = exe.run(main_program,
+                      feed={'x': np.full((n, c, h, w), 1, np.float32)},
+                      fetch_list=[z.name])
+        assert len(ret) == 1
+        assert ret[0].shape == (n, h, w, c)  # NHWC: transfer_layout ran on GPU
+
 if __name__ == '__main__':
     paddle.enable_static()
     unittest.main()