From e0e044c0137814f130b9945498b85a7490083d46 Mon Sep 17 00:00:00 2001 From: Zhang Zheng <32410583+ZzSean@users.noreply.github.com> Date: Thu, 13 Apr 2023 19:28:48 +0800 Subject: [PATCH] [AMP OP&Test] Support fp16&bf16 in reduce_max (#52862) * [AMP OP&Test] Support fp16&bf16 in reduce_max --- .../phi/kernels/gpu/reduce_max_grad_kernel.cu | 62 +++++++++++++++++- paddle/phi/kernels/kps/reduce_max_kernel.cu | 12 +++- paddle/phi/kernels/reduce_max_kernel.cc | 15 ++++- .../fluid/tests/unittests/test_reduce_op.py | 63 +++++++++++++------ python/paddle/tensor/math.py | 5 +- 5 files changed, 132 insertions(+), 25 deletions(-) diff --git a/paddle/phi/kernels/gpu/reduce_max_grad_kernel.cu b/paddle/phi/kernels/gpu/reduce_max_grad_kernel.cu index b4ff277b502..7b4472c5223 100644 --- a/paddle/phi/kernels/gpu/reduce_max_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/reduce_max_grad_kernel.cu @@ -16,7 +16,63 @@ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/impl/reduce_max_grad_kernel_impl.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" +#include "paddle/phi/kernels/funcs/compare_functors.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" +#include "paddle/phi/kernels/funcs/reduce_function.h" + +namespace phi { + +template +void ReduceMaxGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& out_grad, + const IntArray& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* x_grad) { + dev_ctx.Alloc(x_grad, x.dtype()); + reduce_all = recompute_reduce_all(x, dims, reduce_all); + + // get reduce_dim + int dim_size = x.dims().size(); + auto reduce_dims = + funcs::details::GetReduceDim(dims.GetData(), dim_size, reduce_all); + auto update_dims = vectorize(x.dims()); + for (auto i : reduce_dims) { + update_dims[i] = 1; + } + + // make new tensor of out and out_grad + phi::DenseTensor new_out(out.type()); + new_out.ShareDataWith(out); + new_out.Resize(phi::make_ddim(update_dims)); + + phi::DenseTensor new_out_grad(out_grad.type()); + new_out_grad.ShareDataWith(out_grad); + new_out_grad.Resize(phi::make_ddim(update_dims)); + + // make equal_out + phi::DenseTensor* equal_out = new phi::DenseTensor(); + equal_out->Resize(x.dims()); + dev_ctx.template Alloc(equal_out); + + // compute + // 1. equal_out = Equal(x, y) + std::vector equal_inputs = {&new_out, &x}; + std::vector equal_outputs = {equal_out}; + funcs::BroadcastKernel( + dev_ctx, equal_inputs, &equal_outputs, 0, funcs::EqualFunctor()); + + // 2. dx = dout * 1 + std::vector mul_inputs = {&new_out_grad, equal_out}; + std::vector mul_outputs = {x_grad}; + funcs::BroadcastKernel( + dev_ctx, mul_inputs, &mul_outputs, 0, funcs::MultiplyFunctor()); + delete equal_out; +} +} // namespace phi PD_REGISTER_KERNEL(max_grad, GPU, @@ -25,4 +81,6 @@ PD_REGISTER_KERNEL(max_grad, float, double, int, - int64_t) {} + int64_t, + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/kps/reduce_max_kernel.cu b/paddle/phi/kernels/kps/reduce_max_kernel.cu index 9c0fdb52c42..a03035dcf19 100644 --- a/paddle/phi/kernels/kps/reduce_max_kernel.cu +++ b/paddle/phi/kernels/kps/reduce_max_kernel.cu @@ -36,6 +36,14 @@ void MaxRawKernel(const Context& dev_ctx, #ifdef PADDLE_WITH_XPU_KP PD_REGISTER_KERNEL(max_raw, KPS, ALL_LAYOUT, phi::MaxRawKernel, float) {} #else -PD_REGISTER_KERNEL( - max_raw, KPS, ALL_LAYOUT, phi::MaxRawKernel, float, double, int, int64_t) {} +PD_REGISTER_KERNEL(max_raw, + KPS, + ALL_LAYOUT, + phi::MaxRawKernel, + float, + double, + int, + int64_t, + phi::dtype::float16, + phi::dtype::bfloat16) {} #endif diff --git a/paddle/phi/kernels/reduce_max_kernel.cc b/paddle/phi/kernels/reduce_max_kernel.cc index 23da5bd4cd5..7892fc879c7 100644 --- a/paddle/phi/kernels/reduce_max_kernel.cc +++ b/paddle/phi/kernels/reduce_max_kernel.cc @@ -34,7 +34,20 @@ void MaxKernel(const Context& dev_ctx, PD_REGISTER_KERNEL( max, CPU, ALL_LAYOUT, phi::MaxKernel, float, double, int, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) +PD_REGISTER_KERNEL(max, + GPU, + ALL_LAYOUT, + phi::MaxKernel, + float, + double, + int, + int64_t, + phi::dtype::float16, + phi::dtype::bfloat16) {} +#endif + +#if defined(PADDLE_WITH_HIP) PD_REGISTER_KERNEL( max, GPU, ALL_LAYOUT, phi::MaxKernel, float, double, int, int64_t) {} #endif diff --git a/python/paddle/fluid/tests/unittests/test_reduce_op.py b/python/paddle/fluid/tests/unittests/test_reduce_op.py index 01b25b54311..05087936924 100644 --- a/python/paddle/fluid/tests/unittests/test_reduce_op.py +++ b/python/paddle/fluid/tests/unittests/test_reduce_op.py @@ -251,18 +251,6 @@ class TestMaxOp(OpTest): only_check_prim=True, ) - def test_raise_error(self): - if core.is_compiled_with_cuda(): - self.inputs = {'X': np.random.random((5, 6, 10)).astype("float16")} - place = core.CUDAPlace(0) - with self.assertRaises(RuntimeError) as cm: - self.check_output_with_place(place) - error_msg = str(cm.exception).split("\n")[-2].strip().split(".")[0] - self.assertEqual( - error_msg, - "NotFoundError: The kernel (reduce_max) with key (GPU, Undefined(AnyLayout), float16) is not found and GPU kernel cannot fallback to CPU one", - ) - class TestMaxOp_ZeroDim(OpTest): """Remove Max with subgradient from gradient check to confirm the success of CI.""" @@ -292,7 +280,7 @@ class TestMaxOp_ZeroDim(OpTest): ) -class TestMaxOp_FP32(OpTest): +class TestMaxFP32Op(OpTest): """Remove Max with subgradient from gradient check to confirm the success of CI.""" def setUp(self): @@ -300,13 +288,19 @@ class TestMaxOp_FP32(OpTest): self.prim_op_type = "prim" self.python_api = paddle.max self.public_python_api = paddle.max - self.inputs = {'X': np.random.random((5, 6, 10)).astype("float32")} + self.init_dtype() + if self.dtype == np.uint16: + x = np.random.random((5, 6, 10)).astype(np.float32) + self.inputs = {'X': convert_float_to_uint16(x)} + else: + x = np.random.random((5, 6, 10)).astype(self.dtype) + self.inputs = {'X': x} self.attrs = {'dim': [-1], 'keep_dim': True} - self.outputs = { - 'Out': self.inputs['X'].max( - axis=tuple(self.attrs['dim']), keepdims=True - ) - } + out = x.max(axis=tuple(self.attrs['dim']), keepdims=True) + if self.dtype == np.uint16: + self.outputs = {'Out': convert_float_to_uint16(out)} + else: + self.outputs = {'Out': out} def test_check_output(self): self.check_output() @@ -320,6 +314,37 @@ class TestMaxOp_FP32(OpTest): only_check_prim=True, ) + def init_dtype(self): + self.dtype = np.float32 + + +class TestMaxFP16Op(TestMaxFP32Op): + def init_dtype(self): + self.dtype = np.float16 + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA or not support the bfloat16", +) +class TestMaxBF16Op(TestMaxFP32Op): + def init_dtype(self): + self.dtype = np.uint16 + + def test_check_output(self): + self.check_output_with_place(core.CUDAPlace(0)) + + def test_check_grad(self): + # only composite op support gradient check of reduce_max + self.check_grad_with_place( + core.CUDAPlace(0), + ['X'], + 'Out', + check_prim=True, + only_check_prim=True, + ) + @skip_check_grad_ci( reason="reduce_min is discontinuous non-derivable function," diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index fe412003787..0e6b55142bf 100644 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -2348,7 +2348,10 @@ def max(x, axis=None, keepdim=False, name=None): reduce_all, axis = _get_reduce_axis_with_tensor(axis, x) helper = LayerHelper('max', **locals()) check_variable_and_dtype( - x, 'x', ['float32', 'float64', 'int32', 'int64'], 'max' + x, + 'x', + ['float16', 'uint16', 'float32', 'float64', 'int32', 'int64'], + 'max', ) if not isinstance(axis, Variable) and paddle.utils._contain_var(axis): axis = paddle.utils._convert_to_tensor_list(axis) -- GitLab