未验证 提交 e0e044c0 编写于 作者: Z Zhang Zheng 提交者: GitHub

[AMP OP&Test] Support fp16&bf16 in reduce_max (#52862)

* [AMP OP&Test] Support fp16&bf16 in reduce_max
上级 dc8d6a1a
...@@ -16,7 +16,63 @@ ...@@ -16,7 +16,63 @@
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/reduce_max_grad_kernel_impl.h" #include "paddle/phi/kernels/funcs/broadcast_function.h"
#include "paddle/phi/kernels/funcs/compare_functors.h"
#include "paddle/phi/kernels/funcs/elementwise_functor.h"
#include "paddle/phi/kernels/funcs/reduce_function.h"
namespace phi {
// Gradient kernel for the max reduction.
//
// The gradient is routed to the positions of `x` whose value equals the
// reduced maximum:
//   x_grad = broadcast(out_grad) * Equal(broadcast(out), x)
//
// Parameters:
//   dev_ctx    - device context used for allocation and kernel launches.
//   x          - forward input.
//   out        - forward output (the reduced maximum).
//   out_grad   - gradient with respect to `out`.
//   dims       - axes that were reduced in the forward pass.
//   keep_dim   - part of the kernel signature; not read in this body because
//                `out` / `out_grad` are reshaped to the keep-dim layout below.
//   reduce_all - whether all axes were reduced; recomputed from `x` and `dims`.
//   x_grad     - output, allocated here with the dtype of `x`.
template <typename T, typename Context>
void ReduceMaxGradKernel(const Context& dev_ctx,
                         const DenseTensor& x,
                         const DenseTensor& out,
                         const DenseTensor& out_grad,
                         const IntArray& dims,
                         bool keep_dim,
                         bool reduce_all,
                         DenseTensor* x_grad) {
  dev_ctx.Alloc(x_grad, x.dtype());
  reduce_all = recompute_reduce_all(x, dims, reduce_all);

  // Shape of `x` with every reduced axis collapsed to 1, so that `out` and
  // `out_grad` can be broadcast against `x`.
  int dim_size = x.dims().size();
  auto reduce_dims =
      funcs::details::GetReduceDim(dims.GetData(), dim_size, reduce_all);
  auto update_dims = vectorize(x.dims());
  for (auto i : reduce_dims) {
    update_dims[i] = 1;
  }

  // Views of `out` and `out_grad` that share the original data but carry the
  // broadcast-compatible shape computed above.
  phi::DenseTensor new_out(out.type());
  new_out.ShareDataWith(out);
  new_out.Resize(phi::make_ddim(update_dims));
  phi::DenseTensor new_out_grad(out_grad.type());
  new_out_grad.ShareDataWith(out_grad);
  new_out_grad.Resize(phi::make_ddim(update_dims));

  // Temporary mask tensor with the shape of `x`. It has automatic storage so
  // that it is released on every exit path, including when Alloc or
  // BroadcastKernel throws (the previous raw new/delete leaked in that case).
  phi::DenseTensor equal_out;
  equal_out.Resize(x.dims());
  dev_ctx.template Alloc<T>(&equal_out);

  // 1. equal_out = Equal(new_out, x), written with element type T.
  std::vector<const phi::DenseTensor*> equal_inputs = {&new_out, &x};
  std::vector<phi::DenseTensor*> equal_outputs = {&equal_out};
  funcs::BroadcastKernel<phi::ElementwiseType::kBinary, T, T>(
      dev_ctx, equal_inputs, &equal_outputs, 0, funcs::EqualFunctor<T>());

  // 2. x_grad = new_out_grad * equal_out
  std::vector<const phi::DenseTensor*> mul_inputs = {&new_out_grad,
                                                     &equal_out};
  std::vector<phi::DenseTensor*> mul_outputs = {x_grad};
  funcs::BroadcastKernel<phi::ElementwiseType::kBinary, T, T>(
      dev_ctx, mul_inputs, &mul_outputs, 0, funcs::MultiplyFunctor<T>());
}
} // namespace phi
PD_REGISTER_KERNEL(max_grad, PD_REGISTER_KERNEL(max_grad,
GPU, GPU,
...@@ -25,4 +81,6 @@ PD_REGISTER_KERNEL(max_grad, ...@@ -25,4 +81,6 @@ PD_REGISTER_KERNEL(max_grad,
float, float,
double, double,
int, int,
int64_t) {} int64_t,
phi::dtype::float16,
phi::dtype::bfloat16) {}
...@@ -36,6 +36,14 @@ void MaxRawKernel(const Context& dev_ctx, ...@@ -36,6 +36,14 @@ void MaxRawKernel(const Context& dev_ctx,
#ifdef PADDLE_WITH_XPU_KP #ifdef PADDLE_WITH_XPU_KP
PD_REGISTER_KERNEL(max_raw, KPS, ALL_LAYOUT, phi::MaxRawKernel, float) {} PD_REGISTER_KERNEL(max_raw, KPS, ALL_LAYOUT, phi::MaxRawKernel, float) {}
#else #else
PD_REGISTER_KERNEL( PD_REGISTER_KERNEL(max_raw,
max_raw, KPS, ALL_LAYOUT, phi::MaxRawKernel, float, double, int, int64_t) {} KPS,
ALL_LAYOUT,
phi::MaxRawKernel,
float,
double,
int,
int64_t,
phi::dtype::float16,
phi::dtype::bfloat16) {}
#endif #endif
...@@ -34,7 +34,20 @@ void MaxKernel(const Context& dev_ctx, ...@@ -34,7 +34,20 @@ void MaxKernel(const Context& dev_ctx,
PD_REGISTER_KERNEL( PD_REGISTER_KERNEL(
max, CPU, ALL_LAYOUT, phi::MaxKernel, float, double, int, int64_t) {} max, CPU, ALL_LAYOUT, phi::MaxKernel, float, double, int, int64_t) {}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA)
PD_REGISTER_KERNEL(max,
GPU,
ALL_LAYOUT,
phi::MaxKernel,
float,
double,
int,
int64_t,
phi::dtype::float16,
phi::dtype::bfloat16) {}
#endif
#if defined(PADDLE_WITH_HIP)
PD_REGISTER_KERNEL( PD_REGISTER_KERNEL(
max, GPU, ALL_LAYOUT, phi::MaxKernel, float, double, int, int64_t) {} max, GPU, ALL_LAYOUT, phi::MaxKernel, float, double, int, int64_t) {}
#endif #endif
......
...@@ -251,18 +251,6 @@ class TestMaxOp(OpTest): ...@@ -251,18 +251,6 @@ class TestMaxOp(OpTest):
only_check_prim=True, only_check_prim=True,
) )
def test_raise_error(self):
if core.is_compiled_with_cuda():
self.inputs = {'X': np.random.random((5, 6, 10)).astype("float16")}
place = core.CUDAPlace(0)
with self.assertRaises(RuntimeError) as cm:
self.check_output_with_place(place)
error_msg = str(cm.exception).split("\n")[-2].strip().split(".")[0]
self.assertEqual(
error_msg,
"NotFoundError: The kernel (reduce_max) with key (GPU, Undefined(AnyLayout), float16) is not found and GPU kernel cannot fallback to CPU one",
)
class TestMaxOp_ZeroDim(OpTest): class TestMaxOp_ZeroDim(OpTest):
"""Remove Max with subgradient from gradient check to confirm the success of CI.""" """Remove Max with subgradient from gradient check to confirm the success of CI."""
...@@ -292,7 +280,7 @@ class TestMaxOp_ZeroDim(OpTest): ...@@ -292,7 +280,7 @@ class TestMaxOp_ZeroDim(OpTest):
) )
class TestMaxOp_FP32(OpTest): class TestMaxFP32Op(OpTest):
"""Remove Max with subgradient from gradient check to confirm the success of CI.""" """Remove Max with subgradient from gradient check to confirm the success of CI."""
def setUp(self): def setUp(self):
...@@ -300,13 +288,19 @@ class TestMaxOp_FP32(OpTest): ...@@ -300,13 +288,19 @@ class TestMaxOp_FP32(OpTest):
self.prim_op_type = "prim" self.prim_op_type = "prim"
self.python_api = paddle.max self.python_api = paddle.max
self.public_python_api = paddle.max self.public_python_api = paddle.max
self.inputs = {'X': np.random.random((5, 6, 10)).astype("float32")} self.init_dtype()
if self.dtype == np.uint16:
x = np.random.random((5, 6, 10)).astype(np.float32)
self.inputs = {'X': convert_float_to_uint16(x)}
else:
x = np.random.random((5, 6, 10)).astype(self.dtype)
self.inputs = {'X': x}
self.attrs = {'dim': [-1], 'keep_dim': True} self.attrs = {'dim': [-1], 'keep_dim': True}
self.outputs = { out = x.max(axis=tuple(self.attrs['dim']), keepdims=True)
'Out': self.inputs['X'].max( if self.dtype == np.uint16:
axis=tuple(self.attrs['dim']), keepdims=True self.outputs = {'Out': convert_float_to_uint16(out)}
) else:
} self.outputs = {'Out': out}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
...@@ -320,6 +314,37 @@ class TestMaxOp_FP32(OpTest): ...@@ -320,6 +314,37 @@ class TestMaxOp_FP32(OpTest):
only_check_prim=True, only_check_prim=True,
) )
def init_dtype(self):
self.dtype = np.float32
class TestMaxFP16Op(TestMaxFP32Op):
def init_dtype(self):
self.dtype = np.float16
@unittest.skipIf(
not core.is_compiled_with_cuda()
or not core.is_bfloat16_supported(core.CUDAPlace(0)),
"core is not compiled with CUDA or not support the bfloat16",
)
class TestMaxBF16Op(TestMaxFP32Op):
def init_dtype(self):
self.dtype = np.uint16
def test_check_output(self):
self.check_output_with_place(core.CUDAPlace(0))
def test_check_grad(self):
# only composite op support gradient check of reduce_max
self.check_grad_with_place(
core.CUDAPlace(0),
['X'],
'Out',
check_prim=True,
only_check_prim=True,
)
@skip_check_grad_ci( @skip_check_grad_ci(
reason="reduce_min is discontinuous non-derivable function," reason="reduce_min is discontinuous non-derivable function,"
......
...@@ -2348,7 +2348,10 @@ def max(x, axis=None, keepdim=False, name=None): ...@@ -2348,7 +2348,10 @@ def max(x, axis=None, keepdim=False, name=None):
reduce_all, axis = _get_reduce_axis_with_tensor(axis, x) reduce_all, axis = _get_reduce_axis_with_tensor(axis, x)
helper = LayerHelper('max', **locals()) helper = LayerHelper('max', **locals())
check_variable_and_dtype( check_variable_and_dtype(
x, 'x', ['float32', 'float64', 'int32', 'int64'], 'max' x,
'x',
['float16', 'uint16', 'float32', 'float64', 'int32', 'int64'],
'max',
) )
if not isinstance(axis, Variable) and paddle.utils._contain_var(axis): if not isinstance(axis, Variable) and paddle.utils._contain_var(axis):
axis = paddle.utils._convert_to_tensor_list(axis) axis = paddle.utils._convert_to_tensor_list(axis)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册