Unverified commit 72f34450, authored by Leo Chen, committed by GitHub

[AMP OP&Test] register fp16 and bf16 kernel for uniform_random (#50993)

* register fp16 and bf16 kernel for uniform_random

* fix compile

* support selected_rows

* add ut

* revert cpu

* fp16 test skip cpu
Parent a4689c90
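
A minimal usage sketch for the change described in the commit message, assuming a CUDA build of Paddle that includes this commit (shapes and bounds are illustrative):

    import paddle

    paddle.seed(10)
    # fp16/bf16 uniform sampling now dispatches to the GPU kernels registered here;
    # CPU fp16 is covered by the new UniformRealDistribution<float16> specialization.
    x_fp16 = paddle.uniform([1000, 784], dtype='float16', min=-5.0, max=10.0)
    x_bf16 = paddle.uniform([1000, 784], dtype='bfloat16', min=-5.0, max=10.0)
    print(x_fp16.dtype, x_bf16.dtype)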
@@ -905,14 +905,16 @@ template <typename Tx,
           template <typename>
           class ReduceOp,
           typename TransformOp>
-static typename std::enable_if<!std::is_same<Tx, phi::dtype::float16>::value,
-                               void>::type
-CubTensorReduceImpl(const Tx* x_data,
-                    Ty* y_data,
-                    const TransformOp& transform,
-                    int reduce_num,
-                    const KPDevice& dev_ctx,
-                    KPStream stream) {
+static
+    typename std::enable_if<!std::is_same<Tx, phi::dtype::float16>::value &&
+                                !std::is_same<Tx, phi::dtype::bfloat16>::value,
+                            void>::type
+    CubTensorReduceImpl(const Tx* x_data,
+                        Ty* y_data,
+                        const TransformOp& transform,
+                        int reduce_num,
+                        const KPDevice& dev_ctx,
+                        KPStream stream) {
   auto reducer = ReduceOp<Ty>();
   cub::TransformInputIterator<Ty, TransformOp, const Tx*> trans_x(x_data,
                                                                   transform);
@@ -956,6 +958,23 @@ CubTensorReduceImpl(const Tx* x_data,
   PADDLE_THROW(phi::errors::InvalidArgument(
       "Tx should not be float16 when using cub::DeviceReduce::Reduce()."));
 }
+
+template <typename Tx,
+          typename Ty,
+          template <typename>
+          class ReduceOp,
+          typename TransformOp>
+static typename std::enable_if<std::is_same<Tx, phi::dtype::bfloat16>::value,
+                               void>::type
+CubTensorReduceImpl(const Tx* x_data,
+                    Ty* y_data,
+                    const TransformOp& transform,
+                    int reduce_num,
+                    const KPDevice& dev_ctx,
+                    KPStream stream) {
+  PADDLE_THROW(phi::errors::InvalidArgument(
+      "Tx should not be bfloat16 when using cub::DeviceReduce::Reduce()."));
+}
 #endif  // PADDLE_WITH_XPU_KP

 template <typename Tx,
@@ -1008,7 +1027,8 @@ void ReduceKernel(const KPDevice& dev_ctx,
   config.SetOutputData(y_data, dev_ctx, &tmp);

   constexpr bool kIsTxFP16 = std::is_same<Tx, phi::dtype::float16>::value;
-  bool use_cub_reduce = config.reduce_num == numel && !kIsTxFP16;
+  constexpr bool kIsTxBF16 = std::is_same<Tx, phi::dtype::bfloat16>::value;
+  bool use_cub_reduce = config.reduce_num == numel && !kIsTxFP16 && !kIsTxBF16;
 #ifndef PADDLE_WITH_XPU_KP
   if (use_cub_reduce) {
     if (is_mean) {
...
@@ -46,4 +46,16 @@ inline void UniformRealDistribution(phi::dtype::bfloat16 *data,
   }
 }

+template <>
+inline void UniformRealDistribution(phi::dtype::float16 *data,
+                                    const int64_t &size,
+                                    const float &min,
+                                    const float &max,
+                                    std::shared_ptr<std::mt19937_64> engine) {
+  std::uniform_real_distribution<float> dist(min, max);
+  for (int64_t i = 0; i < size; ++i) {
+    data[i] = static_cast<phi::dtype::float16>(dist(*engine));
+  }
+}
+
 }  // namespace phi
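
The new float16 overload mirrors the existing bfloat16 one: sample in float, then narrow each value. A rough numpy equivalent of that CPU path, for illustration only (not the kernel itself):

    import numpy as np

    rng = np.random.default_rng(10)                  # stands in for std::mt19937_64
    samples = rng.uniform(-5.0, 10.0, size=1000).astype(np.float32)
    fp16_out = samples.astype(np.float16)            # narrow after sampling, as the kernel does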
@@ -92,4 +92,5 @@ PD_REGISTER_KERNEL(uniform_raw,
                    phi::UniformRawKernel,
                    float,
                    double,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
@@ -52,6 +52,12 @@ class MPTypeTrait<phi::dtype::float16> {
   using Type = float;
 };

+template <>
+class MPTypeTrait<phi::dtype::bfloat16> {
+ public:
+  using Type = float;
+};
+
 /**
  * @brief Will be used in BlockYReduce, get the index of reduce_num in shared
  * memory.
...
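
MPTypeTrait maps a low-precision element type to the type used for intermediate accumulation, so this specialization makes bfloat16 reductions accumulate in float. A small standalone illustration of why that matters, using float16 since numpy has no native bfloat16 (this is not Paddle code):

    import numpy as np

    x = np.full(10000, 0.1, dtype=np.float16)        # each element is ~0.09998 in fp16

    acc_lp = np.float16(0.0)
    for v in x:                                      # sequential accumulation in fp16
        acc_lp = np.float16(acc_lp + v)

    acc_mp = np.float32(0.0)
    for v in x:                                      # same loop, accumulating in float
        acc_mp += np.float32(v)

    print(acc_lp)   # stalls far below the true sum once the accumulator grows
    print(acc_mp)   # ~999.76, close to the exact value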
@@ -78,12 +78,23 @@ PD_REGISTER_KERNEL(uniform_sr,
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-PD_REGISTER_KERNEL(
-    uniform_raw_sr, GPU, ALL_LAYOUT, phi::sr::UniformRawKernel, float, double) {
-}
+PD_REGISTER_KERNEL(uniform_raw_sr,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::sr::UniformRawKernel,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}

-PD_REGISTER_KERNEL(
-    uniform_sr, GPU, ALL_LAYOUT, phi::sr::UniformKernel, float, double) {}
+PD_REGISTER_KERNEL(uniform_sr,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::sr::UniformKernel,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
 #endif

 #if defined(PADDLE_WITH_XPU)
...
@@ -56,7 +56,8 @@ PD_REGISTER_KERNEL(uniform,
                    phi::UniformKernel,
                    float,
                    double,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
 #endif

 #ifdef PADDLE_WITH_XPU
...
@@ -16,18 +16,21 @@ import os
 import unittest

 import numpy as np
-from op_test import OpTest
+from op_test import OpTest, convert_uint16_to_float
 from test_attribute_var import UnittestBase

 import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 from paddle.fluid import Program, program_guard
+from paddle.fluid.framework import convert_np_dtype_to_dtype_
 from paddle.fluid.op import Operator
 from paddle.tensor import random


 def output_hist(out):
+    if out.dtype == np.uint16:
+        out = convert_uint16_to_float(out)
     hist, _ = np.histogram(out, range=(-5, 10))
     hist = hist.astype("float32")
     hist /= float(out.size)
@@ -151,15 +154,19 @@ class TestUniformRandomOp(OpTest):
         self.op_type = "uniform_random"
         self.python_api = paddle.uniform
         self.inputs = {}
+        self.init_dtype()
         self.init_attrs()
         self.outputs = {"Out": np.zeros((1000, 784)).astype("float32")}

+    def init_dtype(self):
+        self.dtype = np.float32
+
     def init_attrs(self):
         self.attrs = {
             "shape": [1000, 784],
             "min": -5.0,
             "max": 10.0,
-            "seed": 10,
+            "dtype": convert_np_dtype_to_dtype_(self.dtype),
         }
         self.output_hist = output_hist
@@ -176,13 +183,25 @@ class TestUniformRandomOp(OpTest):
         with fluid.dygraph.base.guard(place=place):
             out = self.python_api(
                 self.attrs['shape'],
-                'float32',
+                self.dtype,
                 self.attrs['min'],
                 self.attrs['max'],
-                self.attrs['seed'],
             )


+@unittest.skipIf(
+    not core.is_compiled_with_cuda(), "core is not compiled with CUDA"
+)
+class TestUniformRandomFP16Op(TestUniformRandomOp):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+
+class TestUniformRandomBF16Op(TestUniformRandomOp):
+    def init_dtype(self):
+        self.dtype = np.uint16
+
+
 class TestUniformRandomOpError(unittest.TestCase):
     def test_errors(self):
         main_prog = Program()
...
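
For bfloat16, OpTest hands back outputs as raw np.uint16 bit patterns, which is why output_hist now routes them through convert_uint16_to_float before building the histogram. A minimal sketch of the equivalent conversion, assuming the standard bfloat16 layout (the high 16 bits of an IEEE-754 float32); the actual helper in op_test may differ in details:

    import numpy as np

    def bf16_bits_to_float(arr):
        # shift the 16-bit payload into the high half of a uint32 and reinterpret
        return (arr.astype(np.uint32) << 16).view(np.float32)

    raw = np.array([16256, 49152], dtype=np.uint16)  # bit patterns of 1.0 and -2.0
    print(bf16_bits_to_float(raw))                   # [ 1. -2.]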