Unverified commit 72f34450, authored by Leo Chen, committed by GitHub

[AMP OP&Test] register fp16 and bf16 kernel for uniform_random (#50993)

* register fp16 and bf16 kernel for uniform_random

* fix compile

* support selected_rows

* add ut

* revert cpu

* fp16 test skip cpu
Parent a4689c90
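
A minimal usage sketch for the change described in the commit message, assuming a CUDA build of Paddle that includes this commit (shapes and bounds are illustrative):

    import paddle

    paddle.seed(10)
    # fp16/bf16 uniform sampling now dispatches to the GPU kernels registered here;
    # CPU fp16 is covered by the new UniformRealDistribution<float16> specialization.
    x_fp16 = paddle.uniform([1000, 784], dtype='float16', min=-5.0, max=10.0)
    x_bf16 = paddle.uniform([1000, 784], dtype='bfloat16', min=-5.0, max=10.0)
    print(x_fp16.dtype, x_bf16.dtype)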
@@ -905,14 +905,16 @@ template <typename Tx,
           template <typename>
           class ReduceOp,
           typename TransformOp>
-static typename std::enable_if<!std::is_same<Tx, phi::dtype::float16>::value,
-                               void>::type
-CubTensorReduceImpl(const Tx* x_data,
-                    Ty* y_data,
-                    const TransformOp& transform,
-                    int reduce_num,
-                    const KPDevice& dev_ctx,
-                    KPStream stream) {
+static
+    typename std::enable_if<!std::is_same<Tx, phi::dtype::float16>::value &&
+                                !std::is_same<Tx, phi::dtype::bfloat16>::value,
+                            void>::type
+    CubTensorReduceImpl(const Tx* x_data,
+                        Ty* y_data,
+                        const TransformOp& transform,
+                        int reduce_num,
+                        const KPDevice& dev_ctx,
+                        KPStream stream) {
   auto reducer = ReduceOp<Ty>();
   cub::TransformInputIterator<Ty, TransformOp, const Tx*> trans_x(x_data,
                                                                   transform);
@@ -956,6 +958,23 @@ CubTensorReduceImpl(const Tx* x_data,
   PADDLE_THROW(phi::errors::InvalidArgument(
       "Tx should not be float16 when using cub::DeviceReduce::Reduce()."));
 }
+
+template <typename Tx,
+          typename Ty,
+          template <typename>
+          class ReduceOp,
+          typename TransformOp>
+static typename std::enable_if<std::is_same<Tx, phi::dtype::bfloat16>::value,
+                               void>::type
+CubTensorReduceImpl(const Tx* x_data,
+                    Ty* y_data,
+                    const TransformOp& transform,
+                    int reduce_num,
+                    const KPDevice& dev_ctx,
+                    KPStream stream) {
+  PADDLE_THROW(phi::errors::InvalidArgument(
+      "Tx should not be bfloat16 when using cub::DeviceReduce::Reduce()."));
+}
 #endif  // PADDLE_WITH_XPU_KP

 template <typename Tx,
@@ -1008,7 +1027,8 @@ void ReduceKernel(const KPDevice& dev_ctx,
   config.SetOutputData(y_data, dev_ctx, &tmp);

   constexpr bool kIsTxFP16 = std::is_same<Tx, phi::dtype::float16>::value;
-  bool use_cub_reduce = config.reduce_num == numel && !kIsTxFP16;
+  constexpr bool kIsTxBF16 = std::is_same<Tx, phi::dtype::bfloat16>::value;
+  bool use_cub_reduce = config.reduce_num == numel && !kIsTxFP16 && !kIsTxBF16;
 #ifndef PADDLE_WITH_XPU_KP
   if (use_cub_reduce) {
     if (is_mean) {
...
@@ -46,4 +46,16 @@ inline void UniformRealDistribution(phi::dtype::bfloat16 *data,
   }
 }

+template <>
+inline void UniformRealDistribution(phi::dtype::float16 *data,
+                                    const int64_t &size,
+                                    const float &min,
+                                    const float &max,
+                                    std::shared_ptr<std::mt19937_64> engine) {
+  std::uniform_real_distribution<float> dist(min, max);
+  for (int64_t i = 0; i < size; ++i) {
+    data[i] = static_cast<phi::dtype::float16>(dist(*engine));
+  }
+}
+
 }  // namespace phi
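
The new float16 overload mirrors the existing bfloat16 one: sample in float, then narrow each value. A rough numpy equivalent of that CPU path, for illustration only (not the kernel itself):

    import numpy as np

    rng = np.random.default_rng(10)                  # stands in for std::mt19937_64
    samples = rng.uniform(-5.0, 10.0, size=1000).astype(np.float32)
    fp16_out = samples.astype(np.float16)            # narrow after sampling, as the kernel does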
@@ -92,4 +92,5 @@ PD_REGISTER_KERNEL(uniform_raw,
                    phi::UniformRawKernel,
                    float,
                    double,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
@@ -52,6 +52,12 @@ class MPTypeTrait<phi::dtype::float16> {
   using Type = float;
 };

+template <>
+class MPTypeTrait<phi::dtype::bfloat16> {
+ public:
+  using Type = float;
+};
+
 /**
  * @brief Will be used in BlockYReduce, get the index of reduce_num in shared
  * memory.
...
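
MPTypeTrait maps a low-precision element type to the type used for intermediate accumulation, so this specialization makes bfloat16 reductions accumulate in float. A small standalone illustration of why that matters, using float16 since numpy has no native bfloat16 (this is not Paddle code):

    import numpy as np

    x = np.full(10000, 0.1, dtype=np.float16)        # each element is ~0.09998 in fp16

    acc_lp = np.float16(0.0)
    for v in x:                                      # sequential accumulation in fp16
        acc_lp = np.float16(acc_lp + v)

    acc_mp = np.float32(0.0)
    for v in x:                                      # same loop, accumulating in float
        acc_mp += np.float32(v)

    print(acc_lp)   # stalls far below the true sum once the accumulator grows
    print(acc_mp)   # ~999.76, close to the exact value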
@@ -78,12 +78,23 @@ PD_REGISTER_KERNEL(uniform_sr,
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-PD_REGISTER_KERNEL(
-    uniform_raw_sr, GPU, ALL_LAYOUT, phi::sr::UniformRawKernel, float, double) {
-}
+PD_REGISTER_KERNEL(uniform_raw_sr,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::sr::UniformRawKernel,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}

-PD_REGISTER_KERNEL(
-    uniform_sr, GPU, ALL_LAYOUT, phi::sr::UniformKernel, float, double) {}
+PD_REGISTER_KERNEL(uniform_sr,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::sr::UniformKernel,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
 #endif

 #if defined(PADDLE_WITH_XPU)
...
@@ -56,7 +56,8 @@ PD_REGISTER_KERNEL(uniform,
                    phi::UniformKernel,
                    float,
                    double,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
 #endif

 #ifdef PADDLE_WITH_XPU
...
@@ -16,18 +16,21 @@ import os
 import unittest

 import numpy as np
-from op_test import OpTest
+from op_test import OpTest, convert_uint16_to_float
 from test_attribute_var import UnittestBase

 import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 from paddle.fluid import Program, program_guard
+from paddle.fluid.framework import convert_np_dtype_to_dtype_
 from paddle.fluid.op import Operator
 from paddle.tensor import random


 def output_hist(out):
+    if out.dtype == np.uint16:
+        out = convert_uint16_to_float(out)
     hist, _ = np.histogram(out, range=(-5, 10))
     hist = hist.astype("float32")
     hist /= float(out.size)
@@ -151,15 +154,19 @@ class TestUniformRandomOp(OpTest):
         self.op_type = "uniform_random"
         self.python_api = paddle.uniform
         self.inputs = {}
+        self.init_dtype()
         self.init_attrs()
         self.outputs = {"Out": np.zeros((1000, 784)).astype("float32")}

+    def init_dtype(self):
+        self.dtype = np.float32
+
     def init_attrs(self):
         self.attrs = {
             "shape": [1000, 784],
             "min": -5.0,
             "max": 10.0,
-            "seed": 10,
+            "dtype": convert_np_dtype_to_dtype_(self.dtype),
         }
         self.output_hist = output_hist
@@ -176,13 +183,25 @@ class TestUniformRandomOp(OpTest):
         with fluid.dygraph.base.guard(place=place):
             out = self.python_api(
                 self.attrs['shape'],
-                'float32',
+                self.dtype,
                 self.attrs['min'],
                 self.attrs['max'],
-                self.attrs['seed'],
             )


+@unittest.skipIf(
+    not core.is_compiled_with_cuda(), "core is not compiled with CUDA"
+)
+class TestUniformRandomFP16Op(TestUniformRandomOp):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+
+class TestUniformRandomBF16Op(TestUniformRandomOp):
+    def init_dtype(self):
+        self.dtype = np.uint16
+
+
 class TestUniformRandomOpError(unittest.TestCase):
     def test_errors(self):
         main_prog = Program()
...
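
For bfloat16, OpTest hands back outputs as raw np.uint16 bit patterns, which is why output_hist now routes them through convert_uint16_to_float before building the histogram. A minimal sketch of the equivalent conversion, assuming the standard bfloat16 layout (the high 16 bits of an IEEE-754 float32); the actual helper in op_test may differ in details:

    import numpy as np

    def bf16_bits_to_float(arr):
        # shift the 16-bit payload into the high half of a uint32 and reinterpret
        return (arr.astype(np.uint32) << 16).view(np.float32)

    raw = np.array([16256, 49152], dtype=np.uint16)  # bit patterns of 1.0 and -2.0
    print(bf16_bits_to_float(raw))                   # [ 1. -2.]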