Unverified commit 72f34450, authored by Leo Chen, committed by GitHub

[AMP OP&Test] register fp16 and bf16 kernel for uniform_random (#50993)

* register fp16 and bf16 kernel for uniform_random

* fix compile errors

* support selected_rows

* add unit tests

* revert the CPU changes

* skip the fp16 test on CPU
Parent a4689c90
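In user terms, the change lets paddle.uniform produce float16 and bfloat16 tensors directly on GPU. A minimal usage sketch, assuming a CUDA build that includes this commit and that the 'float16'/'bfloat16' dtype strings are accepted here (the argument order mirrors the unit test further down):

import paddle

paddle.set_device('gpu')  # the new kernels are GPU-only; the CPU changes were reverted

# uniform samples in [-5.0, 10.0), the same range the unit test checks
x_fp16 = paddle.uniform([1000, 784], 'float16', min=-5.0, max=10.0, seed=10)
x_bf16 = paddle.uniform([1000, 784], 'bfloat16', min=-5.0, max=10.0, seed=10)

print(x_fp16.dtype, x_bf16.dtype)  # expected: paddle.float16 paddle.bfloat16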
@@ -905,14 +905,16 @@ template <typename Tx,
template <typename>
class ReduceOp,
typename TransformOp>
static typename std::enable_if<!std::is_same<Tx, phi::dtype::float16>::value,
void>::type
CubTensorReduceImpl(const Tx* x_data,
Ty* y_data,
const TransformOp& transform,
int reduce_num,
const KPDevice& dev_ctx,
KPStream stream) {
static
typename std::enable_if<!std::is_same<Tx, phi::dtype::float16>::value &&
!std::is_same<Tx, phi::dtype::bfloat16>::value,
void>::type
CubTensorReduceImpl(const Tx* x_data,
Ty* y_data,
const TransformOp& transform,
int reduce_num,
const KPDevice& dev_ctx,
KPStream stream) {
auto reducer = ReduceOp<Ty>();
cub::TransformInputIterator<Ty, TransformOp, const Tx*> trans_x(x_data,
transform);
@@ -956,6 +958,23 @@ CubTensorReduceImpl(const Tx* x_data,
PADDLE_THROW(phi::errors::InvalidArgument(
"Tx should not be float16 when using cub::DeviceReduce::Reduce()."));
}
template <typename Tx,
typename Ty,
template <typename>
class ReduceOp,
typename TransformOp>
static typename std::enable_if<std::is_same<Tx, phi::dtype::bfloat16>::value,
void>::type
CubTensorReduceImpl(const Tx* x_data,
Ty* y_data,
const TransformOp& transform,
int reduce_num,
const KPDevice& dev_ctx,
KPStream stream) {
PADDLE_THROW(phi::errors::InvalidArgument(
"Tx should not be bfloat16 when using cub::DeviceReduce::Reduce()."));
}
#endif // PADDLE_WITH_XPU_KP
template <typename Tx,
@@ -1008,7 +1027,8 @@ void ReduceKernel(const KPDevice& dev_ctx,
config.SetOutputData(y_data, dev_ctx, &tmp);
constexpr bool kIsTxFP16 = std::is_same<Tx, phi::dtype::float16>::value;
bool use_cub_reduce = config.reduce_num == numel && !kIsTxFP16;
constexpr bool kIsTxBF16 = std::is_same<Tx, phi::dtype::bfloat16>::value;
bool use_cub_reduce = config.reduce_num == numel && !kIsTxFP16 && !kIsTxBF16;
#ifndef PADDLE_WITH_XPU_KP
if (use_cub_reduce) {
if (is_mean) {
......
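Context for the hunks above: CubTensorReduceImpl is SFINAE-disabled for float16 and, with this commit, bfloat16, so full reductions over those dtypes bypass cub::DeviceReduce and take the hand-written kernel, which accumulates in a wider type via MPTypeTrait (extended below). A small sketch of the behavior this preserves, assuming a CUDA build in which reductions such as paddle.mean have a bf16 GPU kernel registered:

import paddle

paddle.set_device('gpu')
x = paddle.uniform([1 << 20], 'bfloat16', min=0.0, max=1.0, seed=1)

# bf16/fp16 inputs skip the cub fast path; the fallback reduce kernel
# accumulates in float32 (MPTypeTrait<bfloat16>::Type), so the mean stays
# close to 0.5 despite the low precision of the storage type
print(paddle.mean(x))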
@@ -46,4 +46,16 @@ inline void UniformRealDistribution(phi::dtype::bfloat16 *data,
}
}
template <>
inline void UniformRealDistribution(phi::dtype::float16 *data,
const int64_t &size,
const float &min,
const float &max,
std::shared_ptr<std::mt19937_64> engine) {
std::uniform_real_distribution<float> dist(min, max);
for (int64_t i = 0; i < size; ++i) {
data[i] = static_cast<phi::dtype::float16>(dist(*engine));
}
}
} // namespace phi
@@ -92,4 +92,5 @@ PD_REGISTER_KERNEL(uniform_raw,
phi::UniformRawKernel,
float,
double,
phi::dtype::float16) {}
phi::dtype::float16,
phi::dtype::bfloat16) {}
@@ -52,6 +52,12 @@ class MPTypeTrait<phi::dtype::float16> {
using Type = float;
};
template <>
class MPTypeTrait<phi::dtype::bfloat16> {
public:
using Type = float;
};
/**
* @brief Will be used in BlockYReduce, get the index of reduce_num in shared
* memory.
......
@@ -78,12 +78,23 @@ PD_REGISTER_KERNEL(uniform_sr,
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PD_REGISTER_KERNEL(
uniform_raw_sr, GPU, ALL_LAYOUT, phi::sr::UniformRawKernel, float, double) {
}
PD_REGISTER_KERNEL(uniform_raw_sr,
GPU,
ALL_LAYOUT,
phi::sr::UniformRawKernel,
float,
double,
phi::dtype::float16,
phi::dtype::bfloat16) {}
PD_REGISTER_KERNEL(
uniform_sr, GPU, ALL_LAYOUT, phi::sr::UniformKernel, float, double) {}
PD_REGISTER_KERNEL(uniform_sr,
GPU,
ALL_LAYOUT,
phi::sr::UniformKernel,
float,
double,
phi::dtype::float16,
phi::dtype::bfloat16) {}
#endif
#if defined(PADDLE_WITH_XPU)
......
@@ -56,7 +56,8 @@ PD_REGISTER_KERNEL(uniform,
phi::UniformKernel,
float,
double,
phi::dtype::float16) {}
phi::dtype::float16,
phi::dtype::bfloat16) {}
#endif
#ifdef PADDLE_WITH_XPU
......
@@ -16,18 +16,21 @@ import os
import unittest
import numpy as np
from op_test import OpTest
from op_test import OpTest, convert_uint16_to_float
from test_attribute_var import UnittestBase
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid import Program, program_guard
from paddle.fluid.framework import convert_np_dtype_to_dtype_
from paddle.fluid.op import Operator
from paddle.tensor import random
def output_hist(out):
if out.dtype == np.uint16:
out = convert_uint16_to_float(out)
hist, _ = np.histogram(out, range=(-5, 10))
hist = hist.astype("float32")
hist /= float(out.size)
@@ -151,15 +154,19 @@ class TestUniformRandomOp(OpTest):
self.op_type = "uniform_random"
self.python_api = paddle.uniform
self.inputs = {}
self.init_dtype()
self.init_attrs()
self.outputs = {"Out": np.zeros((1000, 784)).astype("float32")}
def init_dtype(self):
self.dtype = np.float32
def init_attrs(self):
self.attrs = {
"shape": [1000, 784],
"min": -5.0,
"max": 10.0,
"seed": 10,
"dtype": convert_np_dtype_to_dtype_(self.dtype),
}
self.output_hist = output_hist
@@ -176,13 +183,25 @@
with fluid.dygraph.base.guard(place=place):
out = self.python_api(
self.attrs['shape'],
'float32',
self.dtype,
self.attrs['min'],
self.attrs['max'],
self.attrs['seed'],
)
@unittest.skipIf(
not core.is_compiled_with_cuda(), "core is not compiled with CUDA"
)
class TestUniformRandomFP16Op(TestUniformRandomOp):
def init_dtype(self):
self.dtype = np.float16
class TestUniformRandomBF16Op(TestUniformRandomOp):
def init_dtype(self):
self.dtype = np.uint16
class TestUniformRandomOpError(unittest.TestCase):
def test_errors(self):
main_prog = Program()
......
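One note on the bfloat16 test path: bf16 outputs surface as uint16 storage, which is why the test imports convert_uint16_to_float before building the histogram. A standalone sketch of that check, with a hand-rolled widening in place of the test-suite helper (the GPU device, the np.uint16 dtype handling, and the 0.01 tolerance are assumptions carried over from the test for illustration):

import numpy as np
import paddle


def bf16_to_float32(u16):
    # bfloat16 keeps the upper 16 bits of an IEEE-754 float32, so widening is
    # a left shift by 16 followed by reinterpreting the bit pattern
    return (u16.astype(np.uint32) << 16).view(np.float32)


paddle.set_device('gpu')
out = paddle.uniform([1000, 784], np.uint16, min=-5.0, max=10.0, seed=10)

vals = bf16_to_float32(out.numpy())           # bf16 tensors come back as uint16 ndarrays
hist, _ = np.histogram(vals, range=(-5, 10))  # 10 equal-width bins over the sampling range
hist = hist.astype('float32') / vals.size

# a roughly uniform draw puts about 10% of the samples in each bin
print(np.allclose(hist, 0.1, atol=0.01))

This mirrors the output_hist helper above, which applies the same uint16-to-float widening before binning.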