[XPU] add fp16 support for top_k_v2, squeeze2 and argsort. (#50614)

689de12c · houj04 · GitHub · 1c8e15c9 · 689de12c · 689de12c
5 changed file
--- a/paddle/phi/backends/xpu/xpu2_op_list.cc
+++ b/paddle/phi/backends/xpu/xpu2_op_list.cc
@@ -40,6 +40,7 @@ XPUOpMap& get_kl2_ops() {
      {"argsort",
       XPUKernelSet({phi::DataType::INT32,
                     phi::DataType::INT64,
+                     phi::DataType::FLOAT16,
                     phi::DataType::FLOAT32})},
      {"assign",
       XPUKernelSet({phi::DataType::FLOAT32,
@@ -598,6 +599,7 @@ XPUOpMap& get_kl2_ops() {
                     phi::DataType::BOOL,
                     phi::DataType::INT8,
                     phi::DataType::UINT8,
+                     phi::DataType::FLOAT16,
                     phi::DataType::FLOAT32})},
      {"squeeze",
       XPUKernelSet({phi::DataType::FLOAT64,
@@ -665,7 +667,8 @@ XPUOpMap& get_kl2_ops() {
       XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
      {"truncated_gaussian_random", XPUKernelSet({phi::DataType::FLOAT32})},
      {"top_k", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
-      {"top_k_v2", XPUKernelSet({phi::DataType::FLOAT32})},
+      {"top_k_v2",
+       XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
      {"update_loss_scaling",
       XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
      {"unbind", XPUKernelSet({phi::DataType::FLOAT32})},

--- a/paddle/phi/kernels/xpu/argsort_kernel.cc
+++ b/paddle/phi/kernels/xpu/argsort_kernel.cc
@@ -207,34 +207,45 @@ void ArgsortKernel(const Context& dev_ctx,
    }
  }

+  using XPUType = typename XPUTypeTrait<T>::Type;
+
  if (int64_need_cast) {
-    XPUArgsort<T, true, true>()(dev_ctx.x_context(),
-                                input_data,
-                                output_data,
-                                indices_data,
-                                data_shape,
-                                permute_vec,
-                                descending);
+    XPUArgsort<XPUType, true, true>()(
+        dev_ctx.x_context(),
+        reinterpret_cast<const XPUType*>(input_data),
+        reinterpret_cast<XPUType*>(output_data),
+        indices_data,
+        data_shape,
+        permute_vec,
+        descending);
  } else if (index_need_cast) {
-    XPUArgsort<T, false, true>()(dev_ctx.x_context(),
-                                 input_data,
-                                 output_data,
-                                 indices_data,
-                                 data_shape,
-                                 permute_vec,
-                                 descending);
+    XPUArgsort<XPUType, false, true>()(
+        dev_ctx.x_context(),
+        reinterpret_cast<const XPUType*>(input_data),
+        reinterpret_cast<XPUType*>(output_data),
+        indices_data,
+        data_shape,
+        permute_vec,
+        descending);
  } else {
-    XPUArgsort<T, false, false>()(dev_ctx.x_context(),
-                                  input_data,
-                                  output_data,
-                                  indices_data,
-                                  data_shape,
-                                  permute_vec,
-                                  descending);
+    XPUArgsort<XPUType, false, false>()(
+        dev_ctx.x_context(),
+        reinterpret_cast<const XPUType*>(input_data),
+        reinterpret_cast<XPUType*>(output_data),
+        indices_data,
+        data_shape,
+        permute_vec,
+        descending);
  }
 }

 }  // namespace phi

-PD_REGISTER_KERNEL(
-    argsort, XPU, ALL_LAYOUT, phi::ArgsortKernel, float, int, int64_t) {}
+PD_REGISTER_KERNEL(argsort,
+                   XPU,
+                   ALL_LAYOUT,
+                   phi::ArgsortKernel,
+                   float,
+                   int,
+                   int64_t,
+                   phi::dtype::float16) {}
--- a/paddle/phi/kernels/xpu/top_k_kernel.cc
+++ b/paddle/phi/kernels/xpu/top_k_kernel.cc
@@ -28,6 +28,8 @@ void TopkKernel(const Context& dev_ctx,
                bool sorted,
                DenseTensor* out,
                DenseTensor* indices) {
+  using XPUType = typename XPUTypeTrait<T>::Type;
+
  const auto& in_dims = x.dims();
  const T* in_data = x.data<T>();
  int64_t* indices_data = dev_ctx.template Alloc<int64_t>(indices);
@@ -59,13 +61,13 @@ void TopkKernel(const Context& dev_ctx,
    const size_t row =
        phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
    const size_t col = in_dims[in_dims.size() - 1];
-    int r = xpu::sorted_topk<T>(dev_ctx.x_context(),
-                                in_data,
-                                output_data,
-                                indices_int_data,
-                                row,
-                                col,
-                                k);
+    int r = xpu::sorted_topk<XPUType>(dev_ctx.x_context(),
+                                      reinterpret_cast<const XPUType*>(in_data),
+                                      reinterpret_cast<XPUType*>(output_data),
+                                      indices_int_data,
+                                      row,
+                                      col,
+                                      k);
    PADDLE_ENFORCE_XDNN_SUCCESS(r, "sorted_topk");

    r = xpu::cast<int32_t, int64_t>(dev_ctx.x_context(),
@@ -97,11 +99,14 @@ void TopkKernel(const Context& dev_ctx,
    }

    xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
-    T* trans_in_data = RAII_GUARD.alloc_l3_or_gm<T>(x.numel());
+    XPUType* trans_in_data = RAII_GUARD.alloc_l3_or_gm<XPUType>(x.numel());

    // Transpose and save interval output to trans_in
-    int r = xpu::transpose<T>(
-        dev_ctx.x_context(), in_data, trans_in_data, x_shape_host, trans_axes);
+    int r = xpu::transpose<XPUType>(dev_ctx.x_context(),
+                                    reinterpret_cast<const XPUType*>(in_data),
+                                    trans_in_data,
+                                    x_shape_host,
+                                    trans_axes);
    PADDLE_ENFORCE_EQ(r,
                      xpu::Error_t::SUCCESS,
                      errors::External("XPU API 1st Transpose kernel"
@@ -109,7 +114,7 @@ void TopkKernel(const Context& dev_ctx,
                                       r,
                                       XPUAPIErrorMsg[r]));

-    T* trans_out_data = RAII_GUARD.alloc_l3_or_gm<T>(out->numel());
+    XPUType* trans_out_data = RAII_GUARD.alloc_l3_or_gm<XPUType>(out->numel());
    int64_t* trans_idx_data = RAII_GUARD.alloc_l3_or_gm<int64_t>(out->numel());
    int32_t* trans_idx_int32_data =
        RAII_GUARD.alloc_l3_or_gm<int32_t>(out->numel());
@@ -118,13 +123,14 @@ void TopkKernel(const Context& dev_ctx,
    const size_t col = trans_dims[trans_dims.size() - 1];

    // Do top k on transposed input
-    r = xpu::sorted_topk<T>(dev_ctx.x_context(),
-                            trans_in_data,
-                            trans_out_data,
-                            trans_idx_int32_data,
-                            row,
-                            col,
-                            k);
+    r = xpu::sorted_topk<XPUType>(
+        dev_ctx.x_context(),
+        reinterpret_cast<const XPUType*>(trans_in_data),
+        reinterpret_cast<XPUType*>(trans_out_data),
+        trans_idx_int32_data,
+        row,
+        col,
+        k);
    PADDLE_ENFORCE_XDNN_SUCCESS(r, "sorted_topk");

    r = xpu::cast<int32_t, int64_t>(dev_ctx.x_context(),
@@ -146,11 +152,12 @@ void TopkKernel(const Context& dev_ctx,
    for (size_t i = 0; i < trans_back_axes.size(); ++i) {
      trans_out_shape_host[i] = trans_out_dims[i];
    }
-    r = xpu::transpose<T>(dev_ctx.x_context(),
-                          trans_out_data,
-                          output_data,
-                          trans_out_shape_host,
-                          trans_back_axes);
+    r = xpu::transpose<XPUType>(
+        dev_ctx.x_context(),
+        reinterpret_cast<const XPUType*>(trans_out_data),
+        reinterpret_cast<XPUType*>(output_data),
+        trans_out_shape_host,
+        trans_back_axes);
    PADDLE_ENFORCE_EQ(r,
                      xpu::Error_t::SUCCESS,
                      errors::External("XPU API 2nd Transpose kernel"
@@ -173,4 +180,5 @@ void TopkKernel(const Context& dev_ctx,

 }  // namespace phi

-PD_REGISTER_KERNEL(topk, XPU, ALL_LAYOUT, phi::TopkKernel, float) {}
+PD_REGISTER_KERNEL(
+    topk, XPU, ALL_LAYOUT, phi::TopkKernel, float, phi::dtype::float16) {}
--- a/python/paddle/fluid/tests/unittests/xpu/test_argsort_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_argsort_op_xpu.py
@@ -185,7 +185,9 @@ class XPUTestArgsortOp_LargeN(XPUOpTestWrapper):
 support_types = get_xpu_op_support_types('argsort')
 for stype in support_types:
    create_test_class(globals(), XPUTestArgsortOp, stype)
-    create_test_class(globals(), XPUTestArgsortOp_LargeN, stype)
+    if stype != "float16":
+        # skip fp16 test on LARGE input because unstable sort on low-precision fp16 will lead to test failure
+        create_test_class(globals(), XPUTestArgsortOp_LargeN, stype)

 if __name__ == '__main__':
    unittest.main()
--- a/python/paddle/fluid/tests/unittests/xpu/test_top_k_v2_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_top_k_v2_op_xpu.py
@@ -30,6 +30,20 @@ import paddle
 paddle.enable_static()


+def random_unique_float(shape, dtype):
+    # create a random float array with 10x length
+    numel = np.prod(shape)
+    arr = np.random.uniform(-10.0, 10.0, numel * 10).astype(dtype)
+    arr = np.unique(arr)
+    assert (
+        arr.shape[0] >= numel
+    ), "failed to create enough unique values: %d vs %d" % (arr.shape[0], numel)
+    arr = arr[:numel]
+    np.random.shuffle(arr)
+    arr = arr.reshape(shape)
+    return arr
+
+
 def numpy_topk(x, k=1, axis=-1, largest=True):
    if axis < 0:
        axis = len(x.shape) + axis
@@ -52,16 +66,14 @@ class XPUTestTopKV2Op(XPUOpTestWrapper):
        self.use_dynamic_create_class = False

    class TestTopkOp(XPUOpTest):
-        def init_args(self):
-            self.k = 3
-            self.axis = 1
-            self.largest = True
-            self.input_data = np.random.rand(10, 20).astype(self.dtype)
-
        def setUp(self):
+            self.place = paddle.XPUPlace(0)
            self.op_type = "top_k_v2"
-            self.init_args()
            self.dtype = self.in_type
+            self.init_args()
+            self.input_data = random_unique_float(
+                self.input_data_shape, self.dtype
+            )
            self.inputs = {'X': self.input_data}
            self.attrs = {
                'k': self.k,
@@ -74,98 +86,112 @@ class XPUTestTopKV2Op(XPUOpTestWrapper):
            self.outputs = {'Out': output, 'Indices': indices}

        def test_check_output(self):
-            if paddle.is_compiled_with_xpu():
-                place = paddle.XPUPlace(0)
-                self.check_output_with_place(place)
+            self.check_output_with_place(self.place)

        def test_check_grad(self):
-            if paddle.is_compiled_with_xpu():
-                place = paddle.XPUPlace(0)
-                self.check_grad(set(['X']), 'Out')
+            self.check_grad_with_place(self.place, ['X'], 'Out')
+
+        def init_args(self):
+            self.k = 3
+            self.axis = 1
+            self.largest = True
+            self.input_data_shape = (10, 20)

    class TestTopkOp1(TestTopkOp):
        def init_args(self):
            self.k = 3
            self.axis = 1
            self.largest = True
-            self.input_data = np.random.rand(100, 155).astype(self.dtype)
+            # too many values for fp16 will lead to failure in random_unique_float function
+            if self.dtype == np.float16:
+                self.input_data_shape = (100, 55)
+            else:
+                self.input_data_shape = (100, 155)

    class TestTopkOp2(TestTopkOp):
        def init_args(self):
            self.k = 3
            self.axis = 1
            self.largest = True
-            self.input_data = np.random.rand(10, 10, 5).astype(self.dtype)
+            self.input_data_shape = (10, 10, 5)

    class TestTopkOp3(TestTopkOp):
        def init_args(self):
            self.k = 5
            self.axis = 1
            self.largest = True
-            self.input_data = np.random.rand(10, 10, 5).astype(self.dtype)
+            self.input_data_shape = (10, 10, 5)

    class TestTopkOp4(TestTopkOp):
        def init_args(self):
            self.k = 1
            self.axis = 1
            self.largest = True
-            self.input_data = np.random.rand(10, 10, 5).astype(self.dtype)
+            self.input_data_shape = (10, 10, 5)

    class TestTopkOp5(TestTopkOp):
        def init_args(self):
            self.k = 3
            self.axis = 2
            self.largest = True
-            self.input_data = np.random.rand(10, 10, 5).astype(self.dtype)
+            self.input_data_shape = (10, 10, 5)

    class TestTopkOp6(TestTopkOp):
        def init_args(self):
            self.k = 5
            self.axis = 1
            self.largest = True
-            self.input_data = np.random.rand(8, 32, 64).astype(self.dtype)
+            # too many values for fp16 will lead to failure in random_unique_float function
+            if self.dtype == np.float16:
+                self.input_data_shape = (8, 32, 32)
+            else:
+                self.input_data_shape = (8, 32, 64)

    class TestTopkOp7(TestTopkOp):
        def init_args(self):
            self.k = 10
            self.axis = 2
            self.largest = True
-            self.input_data = np.random.rand(8, 5, 10, 16).astype(self.dtype)
+            self.input_data_shape = (8, 5, 10, 16)

    class TestTopkOp8(TestTopkOp):
        def init_args(self):
            self.k = 1
            self.axis = 1
            self.largest = True
-            self.input_data = np.random.rand(8, 32, 64).astype(self.dtype)
+            # too many values for fp16 will lead to failure in random_unique_float function
+            if self.dtype == np.float16:
+                self.input_data_shape = (8, 32, 32)
+            else:
+                self.input_data_shape = (8, 32, 64)

    class TestTopkOp9(TestTopkOp):
        def init_args(self):
            self.k = 3
            self.axis = 1
            self.largest = True
-            self.input_data = np.random.rand(10, 10, 5).astype(self.dtype)
+            self.input_data_shape = (10, 10, 5)

    class TestTopkOp10(TestTopkOp):
        def init_args(self):
            self.k = 3
            self.axis = 1
            self.largest = True
-            self.input_data = np.random.rand(10, 10, 5).astype(self.dtype)
+            self.input_data_shape = (10, 10, 5)

    class TestTopkOp11(TestTopkOp):
        def init_args(self):
            self.k = 5
            self.axis = 1
            self.largest = True
-            self.input_data = np.random.rand(10, 10, 5).astype(self.dtype)
+            self.input_data_shape = (10, 10, 5)

    class TestTopkOp12(TestTopkOp):
        def init_args(self):
            self.k = 1
            self.axis = 1
            self.largest = True
-            self.input_data = np.random.rand(10, 10, 5).astype(self.dtype)
+            self.input_data_shape = (10, 10, 5)


 support_types = get_xpu_op_support_types('top_k_v2')