Unverified commit 5b09dd56, authored by Thomas Young, committed by GitHub

[AMP OP&Test] add bf16 fp16 type support for expand_v2_op and top_k_v2_op (#51263)

Parent a6ae1e35
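For context, a hedged sketch of what this change enables at the Python API level: paddle.expand and paddle.topk can run directly on float16 tensors (and on bfloat16 where the GPU supports it) instead of requiring a cast to float32 first. The snippet below is illustrative only and is not part of the commit.

```python
import paddle

# Sketch only: exercises the fp16 path added by this PR; bf16 follows the same
# route via x.astype("bfloat16") on GPUs with bfloat16 support.
if paddle.is_compiled_with_cuda():
    paddle.set_device("gpu")
    x = paddle.rand([10, 20], dtype="float32").astype("float16")
    expanded = paddle.expand(x, shape=[2, 10, 20])         # fp16 in, fp16 out
    values, indices = paddle.topk(x, k=5, axis=-1, largest=True)
    print(expanded.dtype, values.dtype, indices.dtype)     # paddle.float16, paddle.float16, paddle.int64
```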
@@ -26,6 +26,7 @@ limitations under the License. */
#include "paddle/phi/backends/gpu/gpu_device_function.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/float16.h" #include "paddle/phi/common/float16.h"
#include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" #include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
...@@ -49,6 +50,10 @@ namespace detail { ...@@ -49,6 +50,10 @@ namespace detail {
template <> template <>
struct radix_key_codec_base<phi::dtype::float16> struct radix_key_codec_base<phi::dtype::float16>
: radix_key_codec_integral<phi::dtype::float16, uint16_t> {}; : radix_key_codec_integral<phi::dtype::float16, uint16_t> {};
template <>
struct radix_key_codec_base<phi::dtype::bfloat16>
: radix_key_codec_integral<phi::dtype::bfloat16, uint16_t> {};
}  // namespace detail
}  // namespace rocprim
namespace cub = hipcub;
@@ -58,6 +63,12 @@ namespace cub {
template <>
struct NumericTraits<phi::dtype::float16>
    : BaseTraits<FLOATING_POINT, true, false, uint16_t, phi::dtype::float16> {};
template <>
struct NumericTraits<phi::dtype::bfloat16>
    : BaseTraits<FLOATING_POINT, true, false, uint16_t, phi::dtype::bfloat16> {
};
}  // namespace cub
#endif
@@ -586,6 +597,24 @@ struct RadixTypeConfig<phi::dtype::float16> {
  }
};
template <>
struct RadixTypeConfig<phi::dtype::bfloat16> {
  typedef uint32_t RadixType;

  static inline __device__ RadixType Convert(phi::dtype::bfloat16 v) {
    RadixType x = v.x;
    RadixType mask = (x & 0x00008000) ? 0x0000ffff : 0x00008000;
    return (v == v) ? (x ^ mask) : 0xffff;
  }

  static inline __device__ phi::dtype::bfloat16 Deconvert(RadixType v) {
    RadixType mask = (v & 0x00008000) ? 0x00008000 : 0x0000ffff;

    phi::dtype::bfloat16 r;
    r.x = (v ^ mask);
    return r;
  }
};
/*---------------------------Helper Functions------------------*/
__device__ __forceinline__ int GetLaneId() {
  int lane_id;
......
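The RadixTypeConfig<phi::dtype::bfloat16> specialization above uses the standard order-preserving trick for radix-selecting floating-point keys: flip every bit of negative values, flip only the sign bit of non-negative values, and pin NaN to the largest key, so that unsigned comparison of the converted keys matches floating-point ordering. Below is a small Python sketch (illustrative only, not Paddle code) of the same Convert/Deconvert round trip.

```python
import numpy as np

def bf16_bits(f):
    # Take the upper 16 bits of the float32 pattern as the bf16 bits
    # (these sample values are exactly representable, so rounding is moot).
    return int(np.float32(f).view(np.uint32) >> 16)

def convert(bits, is_nan):
    # Negative: flip all 16 bits; non-negative: flip the sign bit; NaN: max key.
    if is_nan:
        return 0xFFFF
    mask = 0xFFFF if (bits & 0x8000) else 0x8000
    return bits ^ mask

def deconvert(key):
    # Inverse mapping: bit 15 set means the original value was non-negative.
    mask = 0x8000 if (key & 0x8000) else 0xFFFF
    return key ^ mask

vals = [-3.0, -0.5, 0.0, 0.5, 3.0]
keys = [convert(bf16_bits(v), False) for v in vals]
assert keys == sorted(keys)                                   # key order == value order
assert all(deconvert(k) == bf16_bits(v) for k, v in zip(keys, vals))
```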
@@ -13,8 +13,8 @@
// limitations under the License.
#include "paddle/phi/kernels/top_k_grad_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/top_k_function_cuda.h" #include "paddle/phi/kernels/funcs/top_k_function_cuda.h"
...@@ -89,4 +89,5 @@ PD_REGISTER_KERNEL(topk_grad, ...@@ -89,4 +89,5 @@ PD_REGISTER_KERNEL(topk_grad,
double, double,
int, int,
int64_t, int64_t,
phi::dtype::float16) {} phi::dtype::float16,
phi::dtype::bfloat16) {}
@@ -15,11 +15,13 @@
#include "paddle/phi/kernels/top_k_kernel.h"

#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/gather.cu.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/top_k_function_cuda.h"

namespace phi {
#define FIXED_BLOCK_DIM_BASE(dim, ...) \
@@ -348,6 +350,7 @@ PD_REGISTER_KERNEL(topk,
                   double,
                   int,
                   int64_t,
                   phi::dtype::float16,
                   phi::dtype::bfloat16) {
  kernel->OutputAt(1).SetDataType(phi::DataType::INT64);
}
@@ -17,7 +17,7 @@ import unittest
import gradient_checker
import numpy as np
from decorator_helper import prog_scope
from eager_op_test import OpTest, convert_float_to_uint16

import paddle
from paddle import fluid
@@ -202,6 +202,56 @@ class TestExpandV2OpInt64_t(OpTest):
        self.check_output()
# Situation 7: input x is Float16
class TestExpandV2FP16Op(OpTest):
    def setUp(self):
        self.op_type = "expand_v2"
        self.prim_op_type = "prim"
        self.dtype = np.float16
        self.python_api = paddle.expand
        self.public_python_api = paddle.expand
        self.inputs = {
            'X': np.random.randint(10, size=(8, 8, 5)).astype(self.dtype)
        }
        self.attrs = {'shape': [8, 8, 5]}
        output = np.tile(self.inputs['X'], (1, 1, 1))
        self.outputs = {'Out': output}

    def test_check_output(self):
        self.check_output()

    def test_check_grad(self):
        self.check_grad(['X'], 'Out', check_prim=True)


# Situation 8: input x is BF16
@unittest.skipIf(
    not core.is_compiled_with_cuda()
    or not core.is_bfloat16_supported(core.CUDAPlace(0)),
    "core is not compiled with CUDA or not support the bfloat16",
)
class TestExpandV2BF16Op(OpTest):
    def setUp(self):
        self.op_type = "expand_v2"
        self.prim_op_type = "prim"
        self.dtype = np.uint16
        self.python_api = paddle.expand
        self.public_python_api = paddle.expand
        x = np.random.randint(10, size=(8, 8, 5)).astype(np.float32)
        self.inputs = {'X': convert_float_to_uint16(x)}
        self.attrs = {'shape': [8, 8, 5]}
        output = np.tile(x, (1, 1, 1)).astype(np.float32)
        self.outputs = {'Out': convert_float_to_uint16(output)}

    def test_check_output(self):
        place = core.CUDAPlace(0)
        self.check_output_with_place(place)

    def test_check_grad(self):
        place = core.CUDAPlace(0)
        self.check_grad_with_place(place, ['X'], 'Out', check_prim=True)
class TestExpandV2Error(unittest.TestCase):
    def test_errors(self):
        with program_guard(Program(), Program()):
@@ -338,7 +388,7 @@ class TestExpandTripleGradCheck(unittest.TestCase):
            self.func(p)


# Situation 9: comp case, shape is a list(without tensor)
class TestExpandV2CompOpRank1(OpTest):
    def setUp(self):
        self.op_type = "expand_v2"
@@ -392,7 +442,7 @@ class TestExpandV2CompOpRank4(TestExpandV2CompOpRank1):
        self.expand_times = (1, 1, 1, 1)


# Situation 10: comp case, input x is Integer
class TestExpandV2CompOpInteger(OpTest):
    def setUp(self):
        self.op_type = "expand_v2"
@@ -410,7 +460,7 @@ class TestExpandV2CompOpInteger(OpTest):
        self.check_output(check_prim=True)


# Situation 11: comp case, input x is Bool
class TestExpandV2CompOpBoolean(OpTest):
    def setUp(self):
        self.op_type = "expand_v2"
@@ -426,7 +476,7 @@ class TestExpandV2CompOpBoolean(OpTest):
        self.check_output(check_prim=True)


# Situation 12: comp case, input x is Integer
class TestExpandV2CompOpInt64_t(OpTest):
    def setUp(self):
        self.op_type = "expand_v2"
......
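A note on the BF16 test above: OpTest has no native bfloat16 dtype, so TestExpandV2BF16Op carries its data as np.uint16 arrays produced by convert_float_to_uint16. The sketch below shows only the general idea of that representation (an assumption about the helper: it may round to nearest even rather than truncate; the point is that each uint16 holds the upper 16 bits of a float32 bit pattern).

```python
import numpy as np

def float32_to_bf16_bits(x):
    # Keep the upper 16 bits of each float32 bit pattern; the real
    # convert_float_to_uint16 helper may additionally round to nearest even.
    return (np.asarray(x, dtype=np.float32).view(np.uint32) >> 16).astype(np.uint16)

x = np.random.randint(10, size=(8, 8, 5)).astype(np.float32)
x_bf16 = float32_to_bf16_bits(x)    # analogous to what the test feeds as inputs['X']
print(x_bf16.dtype, x_bf16.shape)   # uint16 (8, 8, 5)
```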
@@ -15,7 +15,7 @@
import unittest

import numpy as np
from eager_op_test import OpTest, convert_float_to_uint16

import paddle
from paddle.fluid import core
@@ -189,6 +189,56 @@ class TestTopkOp7(TestTopkOp):
        self.outputs = {'Out': output, 'Indices': indices}
class TestTopkFP16Op(TestTopkOp):
    def setUp(self):
        self.op_type = "top_k_v2"
        self.python_api = paddle.topk
        self.public_python_api = paddle.topk
        self.dtype = np.float16
        self.prim_op_type = "prim"
        self.input_data = np.random.rand(10, 20).astype(self.dtype)
        self.init_args()
        self.inputs = {'X': self.input_data}
        self.attrs = {'k': self.k, 'axis': self.axis, 'largest': self.largest}
        output, indices = numpy_topk(
            self.input_data, axis=self.axis, k=self.k, largest=self.largest
        )
        self.outputs = {'Out': output, 'Indices': indices}


@unittest.skipIf(
    not core.is_compiled_with_cuda()
    or not core.is_bfloat16_supported(core.CUDAPlace(0)),
    "core is not compiled with CUDA or not support the bfloat16",
)
class TestTopkBF16Op(TestTopkOp):
    def setUp(self):
        self.op_type = "top_k_v2"
        self.python_api = paddle.topk
        self.public_python_api = paddle.topk
        self.dtype = np.uint16
        self.prim_op_type = "prim"
        self.input_data = np.random.rand(10, 20).astype(np.float32)
        self.init_args()
        self.inputs = {'X': convert_float_to_uint16(self.input_data)}
        self.attrs = {'k': self.k, 'axis': self.axis, 'largest': self.largest}
        output, indices = numpy_topk(
            self.input_data, axis=self.axis, k=self.k, largest=self.largest
        )
        self.outputs = {
            'Out': convert_float_to_uint16(output),
            'Indices': indices,
        }

    def test_check_output(self):
        place = core.CUDAPlace(0)
        self.check_output_with_place(place, check_eager=True)

    def test_check_grad(self):
        place = core.CUDAPlace(0)
        self.check_grad_with_place(place, {'X'}, 'Out', check_eager=True)


class TestTopKAPI(unittest.TestCase):
    def setUp(self):
        np.random.seed(123)
......
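The FP16/BF16 tests above build their expected outputs with the test file's numpy_topk helper, which is defined earlier in the file and lies outside this diff. For readability only, here is a hedged reference sketch of how such a helper typically behaves; the real implementation may differ in details such as tie-breaking.

```python
import numpy as np

def numpy_topk(x, k=1, axis=-1, largest=True):
    # Sort indices along `axis` (descending when largest=True), keep the first k,
    # then gather the corresponding values.
    order = np.argsort(-x if largest else x, axis=axis, kind="stable")
    indices = np.take(order, np.arange(k), axis=axis)
    values = np.take_along_axis(x, indices, axis=axis)
    return values, indices

vals, idx = numpy_topk(np.random.rand(10, 20).astype(np.float32), k=5, axis=-1)
print(vals.shape, idx.shape)  # (10, 5) (10, 5)
```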
@@ -3418,7 +3418,15 @@ def expand(x, shape, name=None):
        check_variable_and_dtype(
            x,
            'x',
            [
                'bool',
                'float16',
                'float32',
                'float64',
                'int32',
                'int64',
                'uint16',
            ],
            'expand',
        )
        check_type(shape, 'shape', (list, tuple, Variable), 'expand')
......
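On why 'uint16' is added to the expand dtype whitelist above: in Paddle's static-graph dtype checks, bfloat16 variables report their dtype string as 'uint16', so without that entry check_variable_and_dtype would reject bf16 inputs to paddle.expand. The sketch below mimics only the effect of the whitelist; the function name and message are hypothetical, not Paddle internals.

```python
# Illustrative only: emulates the whitelist behaviour extended in the diff above.
ALLOWED_EXPAND_DTYPES = [
    'bool', 'float16', 'float32', 'float64', 'int32', 'int64', 'uint16'
]

def check_expand_dtype(dtype_str):
    # Raise the same kind of error a rejected dtype would trigger in the API.
    if dtype_str not in ALLOWED_EXPAND_DTYPES:
        raise TypeError(
            f"The data type of 'x' in expand must be one of "
            f"{ALLOWED_EXPAND_DTYPES}, but received {dtype_str}."
        )

check_expand_dtype('uint16')   # bfloat16 inputs pass only with the new entry
```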