From 526d963ebb462e586f5a1d6cadf0539b3f2e559b Mon Sep 17 00:00:00 2001
From: kangguangli
Date: Wed, 28 Sep 2022 10:42:17 +0800
Subject: [PATCH] [NPU] add gpu kernel for transfer layout (#46307)

* add gpu kernel for transfer layout
* comment error throw
* fix: flag setting in testcase; add condition check for raising error
* fix typo
* fix: add error type for PADDLE_THROW
* remove kernel fallback in data_transfer.cc
* remove useless variable definition
---
 .../framework/new_executor/data_transfer.cc  | 11 +++++
 paddle/fluid/framework/phi_utils.cc          |  7 +++
 paddle/phi/kernels/funcs/math_function.cu    | 26 +++++++-----
 paddle/phi/kernels/transfer_layout_kernel.cc |  7 +++
 .../unittests/test_transfer_layout_op.py     | 48 +++++++++++++++++++
 5 files changed, 87 insertions(+), 12 deletions(-)

diff --git a/paddle/fluid/framework/new_executor/data_transfer.cc b/paddle/fluid/framework/new_executor/data_transfer.cc
index 06962f7b5e..00ede9fd41 100644
--- a/paddle/fluid/framework/new_executor/data_transfer.cc
+++ b/paddle/fluid/framework/new_executor/data_transfer.cc
@@ -142,11 +142,22 @@ void DataTranferHelper::RunAndConstructOpFuncNode(
   if (phi::KernelFactory::Instance().HasCompatiblePhiKernel(
           op_with_kernel->Type())) {
     auto phi_kernel_key = op_with_kernel->ChoosePhiKernel(exec_ctx);
+    auto phi_kernel_name = op_with_kernel->PhiKernelSignature()->name;
     VLOG(6) << "phi_kernel_key " << phi_kernel_key << "\n";
+    VLOG(6) << "phi_kernel_name " << phi_kernel_name << "\n";
 
     if (op_with_kernel->PhiKernel()->IsValid()) {
       run_phi_kernel = true;
     }
+
+    // Data transfer ops should not fall back to CPU.
+    // Although they are device-independent operations,
+    // their implementations are device-related.
+    // For example, consider changing the layout of a GPU tensor
+    // when the GPU kernel of the transfer_layout op does not exist.
+    // To use the CPU kernel instead, extra memcpy_d2h/memcpy_h2d ops
+    // would have to be inserted, which should not be done here.
+    // This may be supported in the future.
   }
 
   // 3. Execute transfer op and construct OpFuncNode
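Note (illustration, not part of the patch): the new comment in data_transfer.cc refers to the memcpy_d2h/memcpy_h2d roundtrip that a CPU fallback of transfer_layout would force for GPU tensors. A minimal sketch of that roundtrip in Paddle's public dynamic-graph API, assuming a CUDA build; all variable names here are made up for illustration:

    import numpy as np
    import paddle

    # A GPU tensor in NCHW layout.
    x_gpu = paddle.to_tensor(np.ones((2, 3, 4, 5), np.float32),
                             place=paddle.CUDAPlace(0))
    # Without a GPU transfer_layout kernel, the layout change would need:
    x_cpu = x_gpu.cpu()                                 # memcpy_d2h
    y_cpu = paddle.transpose(x_cpu, perm=[0, 2, 3, 1])  # layout change on CPU
    y_gpu = y_cpu.cuda()                                # memcpy_h2d
    assert y_gpu.shape == [2, 4, 5, 3]

With the GPU kernel registered below, the executor can run transfer_layout directly on the device and skip both copies.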
diff --git a/paddle/fluid/framework/phi_utils.cc b/paddle/fluid/framework/phi_utils.cc
index 2e56fea28e..165a843075 100644
--- a/paddle/fluid/framework/phi_utils.cc
+++ b/paddle/fluid/framework/phi_utils.cc
@@ -150,6 +150,13 @@ phi::KernelKey FallBackToCpu(const OpKernelType& expected_kernel_key,
         phi::Backend::CPU, kernel_key.layout(), kernel_key.dtype());
   }
 #endif
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  if (platform::is_gpu_place(expected_kernel_key.place_)) {
+    PADDLE_THROW(platform::errors::Unavailable(
+        "GPU kernels must not fall back to CPU kernels."));
+  }
+#endif
+
   return phi::KernelKey();
 }
 
diff --git a/paddle/phi/kernels/funcs/math_function.cu b/paddle/phi/kernels/funcs/math_function.cu
index c829adbc41..74cadb63eb 100644
--- a/paddle/phi/kernels/funcs/math_function.cu
+++ b/paddle/phi/kernels/funcs/math_function.cu
@@ -58,18 +58,20 @@ template struct SetConstant<phi::GPUContext, phi::dtype::complex<double>>;
-#define DEFINE_GPU_TRANS(RANK)                                      \
-  template struct Transpose<phi::GPUContext, bool, RANK>;           \
-  template struct Transpose<phi::GPUContext, float, RANK>;          \
-  template struct Transpose<phi::GPUContext, double, RANK>;         \
-  template struct Transpose<phi::GPUContext, float16, RANK>;        \
-  template struct Transpose<phi::GPUContext, bfloat16, RANK>;       \
-  template struct Transpose<phi::GPUContext, int8_t, RANK>;         \
-  template struct Transpose<phi::GPUContext, int32_t, RANK>;        \
-  template struct Transpose<phi::GPUContext, int64_t, RANK>;        \
-  template struct Transpose<phi::GPUContext,                        \
-                            phi::dtype::complex<float>,             \
-                            RANK>;                                  \
+#define DEFINE_GPU_TRANS(RANK)                                      \
+  template struct Transpose<phi::GPUContext, bool, RANK>;           \
+  template struct Transpose<phi::GPUContext, unsigned char, RANK>;  \
+  template struct Transpose<phi::GPUContext, float, RANK>;          \
+  template struct Transpose<phi::GPUContext, double, RANK>;         \
+  template struct Transpose<phi::GPUContext, float16, RANK>;        \
+  template struct Transpose<phi::GPUContext, bfloat16, RANK>;       \
+  template struct Transpose<phi::GPUContext, int8_t, RANK>;         \
+  template struct Transpose<phi::GPUContext, int16_t, RANK>;        \
+  template struct Transpose<phi::GPUContext, int32_t, RANK>;        \
+  template struct Transpose<phi::GPUContext, int64_t, RANK>;        \
+  template struct Transpose<phi::GPUContext,                        \
+                            phi::dtype::complex<float>,             \
+                            RANK>;                                  \
   template struct Transpose<phi::GPUContext, phi::dtype::complex<double>, RANK>;
 
 DEFINE_GPU_TRANS(1);

diff --git a/paddle/phi/kernels/transfer_layout_kernel.cc b/paddle/phi/kernels/transfer_layout_kernel.cc
index be232b7c67..0c44b3c5a7 100644
--- a/paddle/phi/kernels/transfer_layout_kernel.cc
+++ b/paddle/phi/kernels/transfer_layout_kernel.cc
@@ -176,3 +176,10 @@ PD_REGISTER_GENERAL_KERNEL(transfer_layout,
                            ALL_LAYOUT,
                            phi::TransferLayoutKernel,
                            ALL_DTYPE) {}
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PD_REGISTER_GENERAL_KERNEL(transfer_layout,
+                           GPU,
+                           ALL_LAYOUT,
+                           phi::TransferLayoutKernel,
+                           ALL_DTYPE) {}
+#endif
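Note (illustration, not part of the patch): the registration above reuses the existing phi::TransferLayoutKernel for GPU places, backed by the extra Transpose instantiations added in math_function.cu. For reference, an NCHW -> NHWC transfer corresponds to the axis permutation (0, 2, 3, 1), which is also what the new unit test below asserts on the fetched result; a NumPy sketch of the same permutation:

    import numpy as np

    x = np.arange(2 * 3 * 4 * 5, dtype=np.float32).reshape(2, 3, 4, 5)  # NCHW
    y = x.transpose(0, 2, 3, 1)                                         # NHWC
    assert y.shape == (2, 4, 5, 3)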
diff --git a/python/paddle/fluid/tests/unittests/test_transfer_layout_op.py b/python/paddle/fluid/tests/unittests/test_transfer_layout_op.py
index 120c78a07e..c66ab4803f 100644
--- a/python/paddle/fluid/tests/unittests/test_transfer_layout_op.py
+++ b/python/paddle/fluid/tests/unittests/test_transfer_layout_op.py
@@ -18,6 +18,8 @@ import numpy as np
 import paddle
 import paddle.fluid.core as core
 import paddle.fluid as fluid
+from paddle.fluid.framework import Program, program_guard
+from paddle.fluid.layer_helper import LayerHelper
 from op_test import OpTest
 
 
@@ -38,6 +40,52 @@ class TestTransferLayoutOpkNCHWTokNHWC(OpTest):
         self.check_output()
 
 
+def softmax_with_data_format(x, data_format, axis=-1, dtype=None, name=None):
+    helper = LayerHelper("softmax", **locals())
+    outs_cast = x
+
+    outs_softmax = helper.create_variable_for_type_inference(outs_cast.dtype)
+    helper.append_op(type='softmax',
+                     inputs={'X': outs_cast},
+                     outputs={'Out': outs_softmax},
+                     attrs={
+                         'axis': axis,
+                         'use_cudnn': True,
+                         'data_format': data_format
+                     })
+
+    return outs_softmax
+
+
+class TestTransferLayoutOpGpu(unittest.TestCase):
+
+    def test_layout_transfer(self):
+        if not core.is_compiled_with_cuda():
+            return
+
+        paddle.enable_static()
+
+        main_program = Program()
+        startup_program = Program()
+        n, c, h, w = 2, 3, 4, 5
+        with program_guard(main_program, startup_program):
+            x = paddle.static.data(shape=[n, c, h, w],
+                                   dtype='float32',
+                                   name='x')
+            y = softmax_with_data_format(x, data_format='NCHW')
+            z = softmax_with_data_format(x, data_format='NHWC')
+
+            place = fluid.CUDAPlace(
+                0) if core.is_compiled_with_cuda() else fluid.CPUPlace()
+            exe = fluid.Executor(place)
+            exe.run(startup_program)
+            ret = exe.run(main_program,
+                          feed={'x': np.full((n, c, h, w), 1, np.float32)},
+                          fetch_list=[z.name])
+            assert len(ret) == 1
+            assert ret[0].shape == (n, h, w, c)
+
+
 if __name__ == '__main__':
     paddle.enable_static()
     unittest.main()
-- 
GitLab
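Note (usage sketch, not part of the patch): TestTransferLayoutOpGpu returns early on non-CUDA builds, so it only exercises the new kernel path on a GPU machine. Assuming a CUDA build of Paddle and that the unittests directory is on PYTHONPATH, the new case can be run standalone with the stock unittest runner:

    import unittest

    from test_transfer_layout_op import TestTransferLayoutOpGpu

    # Collect and run only the new GPU test case.
    suite = unittest.TestLoader().loadTestsFromTestCase(TestTransferLayoutOpGpu)
    unittest.TextTestRunner(verbosity=2).run(suite)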