From ca4df333d3ea3be71d82273865ee39e7c5c74910 Mon Sep 17 00:00:00 2001
From: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com>
Date: Wed, 23 Feb 2022 10:02:09 +0800
Subject: [PATCH] [bf16] add bf16 kernel: elementwise_div (#39602)

* add elementwise_div
* refine rocm
* refine code
* refine op register
* solve conflict
* refine unittest
* refine unittest precision
* add rocm
---
 .../elementwise/elementwise_div_op.cu       |  6 +++
 .../device/gpu/cuda/cuda_device_function.h  | 12 ++++++
 .../device/gpu/rocm/rocm_device_function.h  |  7 ++++
 paddle/phi/kernels/gpu/math_kernel.cu       |  2 +
 .../unittests/test_elementwise_div_op.py    | 38 ++++++++++++++++++-
 5 files changed, 64 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cu b/paddle/fluid/operators/elementwise/elementwise_div_op.cu
index 06f9107db2..9eb4b0352e 100644
--- a/paddle/fluid/operators/elementwise/elementwise_div_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_div_op.cu
@@ -53,6 +53,8 @@ REGISTER_OP_CUDA_KERNEL(
     ops::ElementwiseDivKernel,
     ops::ElementwiseDivKernel,
+    ops::ElementwiseDivKernel,
     ops::ElementwiseDivKernel,
     ops::ElementwiseDivKernel,
     ops::ElementwiseDivKernel,
@@ -65,6 +67,8 @@
     ops::ElementwiseDivGradKernel,
     ops::ElementwiseDivGradKernel,
+    ops::ElementwiseDivGradKernel,
     ops::ElementwiseDivGradKernel,
     ops::ElementwiseDivGradKernel,
     ops::ElementwiseDivGradKernel,
@@ -78,6 +82,8 @@
                                          float>,
     ops::ElementwiseDivDoubleGradKernel,
+    ops::ElementwiseDivDoubleGradKernel,
     ops::ElementwiseDivDoubleGradKernel,
     ops::ElementwiseDivDoubleGradKernel
diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h b/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h
--- a/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h
+++ b/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h
+template <>
+__forceinline__ __device__ bfloat16 CudaShuffleXorSync(unsigned mask,
+                                                       bfloat16 val,
+                                                       int width) {
+#if defined(PADDLE_CUDA_BF16)
+  return bfloat16(__shfl_xor_sync(mask, static_cast(val), width));
+#else
+  PADDLE_ENFORCE(
+      false, "__shfl_xor_sync with bfloat16 is not supported on cuda <= 11.");
+#endif
+}
+
 template <>
 __forceinline__ __device__ paddle::platform::complex CudaShuffleXorSync(
     unsigned mask, paddle::platform::complex val, int width) {
diff --git a/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h b/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h
index 63897bd671..61bf1905fd 100644
--- a/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h
+++ b/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h
@@ -91,6 +91,13 @@ __forceinline__ __device__ float16 CudaShuffleXorSync(unsigned mask,
   return float16(__shfl_xor(static_cast(val), width));
 }
 
+template <>
+__forceinline__ __device__ bfloat16 CudaShuffleXorSync(unsigned mask,
+                                                       bfloat16 val,
+                                                       int width) {
+  return bfloat16(__shfl_xor(static_cast(val), width));
+}
+
 template <>
 __forceinline__ __device__ paddle::platform::complex CudaShuffleXorSync(
     unsigned mask, paddle::platform::complex val, int width) {
diff --git a/paddle/phi/kernels/gpu/math_kernel.cu b/paddle/phi/kernels/gpu/math_kernel.cu
index f7b1205cb5..02e3f00bd3 100644
--- a/paddle/phi/kernels/gpu/math_kernel.cu
+++ b/paddle/phi/kernels/gpu/math_kernel.cu
@@ -92,6 +92,7 @@ DEFINE_CUDA_ELEMENTWISE_OP(Divide)
 }  // namespace phi
 
 using float16 = phi::dtype::float16;
+using bfloat16 = phi::dtype::bfloat16;
 using complex64 = ::phi::dtype::complex;
 using complex128 = ::phi::dtype::complex;
 
@@ -128,6 +129,7 @@ PD_REGISTER_KERNEL(divide_raw,
                    int,
                    int64_t,
                    float16,
+                   bfloat16,
                    complex64,
                    complex128) {}
 PD_REGISTER_KERNEL(multiply_raw,
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py
index 32860a6694..a43e56b081 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py
@@ -18,7 +18,7 @@ import numpy as np
 import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
-from op_test import OpTest, skip_check_grad_ci
+from op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16
 
 
 class ElementwiseDivOp(OpTest):
@@ -55,6 +55,42 @@ class ElementwiseDivOp(OpTest):
         pass
 
 
+@unittest.skipIf(
+    not core.is_compiled_with_cuda() or core.cudnn_version() < 8100,
+    "core is not compiled with CUDA and cudnn version need larger than 8.1.0")
+class TestElementwiseDivOpBF16(OpTest):
+    def setUp(self):
+        self.op_type = "elementwise_div"
+        self.dtype = np.uint16
+
+        x = np.random.uniform(0.1, 1, [12, 13]).astype(np.float32)
+        y = np.random.uniform(0.1, 1, [12, 13]).astype(np.float32)
+
+        out = np.divide(x, y)
+
+        self.inputs = {
+            'X': convert_float_to_uint16(x),
+            'Y': convert_float_to_uint16(y)
+        }
+        self.outputs = {'Out': convert_float_to_uint16(out)}
+
+    def test_check_output(self):
+        place = core.CUDAPlace(0)
+        self.check_output_with_place(place)
+
+    def test_check_grad_normal(self):
+        place = core.CUDAPlace(0)
+        self.check_grad_with_place(place, ['X', 'Y'], 'Out')
+
+    def test_check_grad_ingore_x(self):
+        place = core.CUDAPlace(0)
+        self.check_grad_with_place(place, ['Y'], 'Out', no_grad_set=set("X"))
+
+    def test_check_grad_ingore_y(self):
+        place = core.CUDAPlace(0)
+        self.check_grad_with_place(place, ['X'], 'Out', no_grad_set=set('Y'))
+
+
 @skip_check_grad_ci(
     reason="[skip shape check] Use y_shape(1) to test broadcast.")
 class TestElementwiseDivOp_scalar(ElementwiseDivOp):
--
GitLab
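Editor's note (not part of the patch): TestElementwiseDivOpBF16 feeds and reads its
bf16 data as uint16 arrays via convert_float_to_uint16. The numpy-only sketch below
illustrates the underlying convention, assuming that helper truncates each float32
value to the upper 16 bits of its IEEE-754 encoding; the helper names used here are
hypothetical, not Paddle APIs. It also shows the precision loss that presumably
motivated the "refine unittest precision" step in the commit message.

    import numpy as np

    def float_to_bf16_bits(x):
        # bfloat16 keeps the upper 16 bits of the float32 bit pattern.
        x = np.asarray(x, dtype=np.float32)
        return (x.view(np.uint32) >> 16).astype(np.uint16)

    def bf16_bits_to_float(bits):
        # Zero-fill the low 16 bits to expand a bf16 payload back to float32.
        bits = np.asarray(bits, dtype=np.uint16)
        return (bits.astype(np.uint32) << 16).astype(np.uint32).view(np.float32)

    x = np.random.uniform(0.1, 1, [12, 13]).astype(np.float32)
    y = np.random.uniform(0.1, 1, [12, 13]).astype(np.float32)

    # Dividing the bf16-truncated inputs matches the float32 reference only to
    # roughly 2-3 significant decimal digits, hence looser bf16 test tolerances.
    out_bf16 = bf16_bits_to_float(float_to_bf16_bits(x)) / bf16_bits_to_float(
        float_to_bf16_bits(y))
    print(np.max(np.abs(out_bf16 - x / y)))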