diff --git a/paddle/fluid/framework/new_executor/data_transfer.cc b/paddle/fluid/framework/new_executor/data_transfer.cc index 06962f7b5e77313663af8bda640f164e7959fec3..00ede9fd4150a9a6110d1059296a8977dcac2734 100644 --- a/paddle/fluid/framework/new_executor/data_transfer.cc +++ b/paddle/fluid/framework/new_executor/data_transfer.cc @@ -142,11 +142,22 @@ void DataTranferHelper::RunAndConstructOpFuncNode( if (phi::KernelFactory::Instance().HasCompatiblePhiKernel( op_with_kernel->Type())) { auto phi_kernel_key = op_with_kernel->ChoosePhiKernel(exec_ctx); + auto phi_kernel_name = op_with_kernel->PhiKernelSignature()->name; VLOG(6) << "phi_kernel_key " << phi_kernel_key << "\n"; + VLOG(6) << "phi_kernel_name " << phi_kernel_name << "\n"; if (op_with_kernel->PhiKernel()->IsValid()) { run_phi_kernel = true; } + + // Data transfer ops should not fall back to cpu. + // Though they're device-independent operations, + // their implementations are device-related. + // For example, consider changing the layout of a gpu tensor + // while the gpu kernel of transfer_layout op does not exist. + // To use the cpu kernel, you must insert memcpy_d2h/memcpy_h2d op + // in addition. But such operation should not be done here. + // Maybe in future we will support this. } // 3. 
Execute transfer op and construct OpFuncNode diff --git a/paddle/fluid/framework/phi_utils.cc b/paddle/fluid/framework/phi_utils.cc index 2e56fea28e0b53e31b67506f5c3fdee440477635..165a84307591215d669e3145e821550b149d4006 100644 --- a/paddle/fluid/framework/phi_utils.cc +++ b/paddle/fluid/framework/phi_utils.cc @@ -150,6 +150,13 @@ phi::KernelKey FallBackToCpu(const OpKernelType& expected_kernel_key, phi::Backend::CPU, kernel_key.layout(), kernel_key.dtype()); } #endif +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (platform::is_gpu_place(expected_kernel_key.place_)) { + PADDLE_THROW(platform::errors::Unavailable( + "For GPU kernel, they must not fallback into CPU kernel.")); + } +#endif + return phi::KernelKey(); } diff --git a/paddle/phi/kernels/funcs/math_function.cu b/paddle/phi/kernels/funcs/math_function.cu index c829adbc41373513e451af7d42c3e2055b22d539..74cadb63eb9feb5f771f389bb182c6329de6ade2 100644 --- a/paddle/phi/kernels/funcs/math_function.cu +++ b/paddle/phi/kernels/funcs/math_function.cu @@ -58,18 +58,20 @@ template struct SetConstant>; -#define DEFINE_GPU_TRANS(RANK) \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose, \ - RANK>; \ +#define DEFINE_GPU_TRANS(RANK) \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose, \ + RANK>; \ template struct Transpose, RANK>; DEFINE_GPU_TRANS(1); diff --git a/paddle/phi/kernels/transfer_layout_kernel.cc b/paddle/phi/kernels/transfer_layout_kernel.cc index 
be232b7c671e9baa6244215426ecacad69833b09..0c44b3c5a71d8cd9c21cfcb9a04b2f806e501f86 100644 --- a/paddle/phi/kernels/transfer_layout_kernel.cc +++ b/paddle/phi/kernels/transfer_layout_kernel.cc @@ -176,3 +176,10 @@ PD_REGISTER_GENERAL_KERNEL(transfer_layout, ALL_LAYOUT, phi::TransferLayoutKernel, ALL_DTYPE) {} +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_REGISTER_GENERAL_KERNEL(transfer_layout, + GPU, + ALL_LAYOUT, + phi::TransferLayoutKernel, + ALL_DTYPE) {} +#endif diff --git a/python/paddle/fluid/tests/unittests/test_transfer_layout_op.py b/python/paddle/fluid/tests/unittests/test_transfer_layout_op.py index 120c78a07eff2f2659da33eafd378539e66df8d5..c66ab4803f1c54168b97bc6de84ae0df3c2f9d4b 100644 --- a/python/paddle/fluid/tests/unittests/test_transfer_layout_op.py +++ b/python/paddle/fluid/tests/unittests/test_transfer_layout_op.py @@ -18,6 +18,8 @@ import numpy as np import paddle import paddle.fluid.core as core import paddle.fluid as fluid +from paddle.fluid.framework import Program, program_guard +from paddle.fluid.layer_helper import LayerHelper from op_test import OpTest @@ -38,6 +40,52 @@ class TestTransferLayoutOpkNCHWTokNHWC(OpTest): self.check_output() +def softmax_with_data_format(x, data_format, axis=-1, dtype=None, name=None): + helper = LayerHelper("softmax", **locals()) + outs_cast = x + + outs_softmax = helper.create_variable_for_type_inference(outs_cast.dtype) + helper.append_op(type='softmax', + inputs={'X': outs_cast}, + outputs={'Out': outs_softmax}, + attrs={ + 'axis': axis, + 'use_cudnn': True, + 'data_format': data_format + }) + + return outs_softmax + + +class TestTransferLayoutOpGpu(unittest.TestCase): + + def test_layout_transfer(self): + if not core.is_compiled_with_cuda(): + return + + paddle.enable_static() + + main_program = Program() + startup_program = Program() + n, c, h, w = 2, 3, 4, 5 + with program_guard(main_program, startup_program): + x = paddle.static.data(shape=[n, c, h, w], + dtype='float32', + name='x') 
+ y = softmax_with_data_format(x, data_format='NCHW') + z = softmax_with_data_format(x, data_format='NHWC') + + place = fluid.CUDAPlace( + 0) if core.is_compiled_with_cuda() else fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(startup_program) + ret = exe.run(main_program, + feed={'x': np.full((n, c, h, w), 1, np.float32)}, + fetch_list=[z.name]) + assert len(ret) == 1 + assert ret[0].shape == (n, h, w, c) + + if __name__ == '__main__': paddle.enable_static() unittest.main()