From 2fedd39bcf8fc0d74d693e299d1c11019300fbe7 Mon Sep 17 00:00:00 2001 From: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com> Date: Fri, 25 Feb 2022 11:50:18 +0800 Subject: [PATCH] [bf16] add bf16 kernel: elementwise_add elementwise_mul elementwise_sub (#39716) * add ele_add * add ele_mul * add ele_sub * sovle conflict * fix npu * refine ele_add * add ele_mul unittest * refine ele_sub * refine ci * refine unittest --- .../elementwise/elementwise_add_op.cu | 1 + .../elementwise/elementwise_mul_op.cc | 8 ++++ .../elementwise/elementwise_mul_op.cu | 6 +++ .../elementwise/elementwise_sub_op.cc | 6 +++ .../elementwise/elementwise_sub_op.cu | 6 +++ .../kernels/cpu/elementwise_grad_kernel.cc | 2 + paddle/phi/kernels/cpu/math_kernel.cc | 6 ++- paddle/phi/kernels/funcs/blas/blas_impl.h | 30 +++++++++++++ .../kernels/gpu/elementwise_grad_kernel.cu | 5 +++ paddle/phi/kernels/gpu/math_kernel.cu | 5 ++- paddle/phi/kernels/math_kernel.cc | 7 +++- .../unittests/test_elementwise_add_op.py | 42 ++++++++++++++++++- .../unittests/test_elementwise_mul_op.py | 35 +++++++++++++++- .../unittests/test_elementwise_sub_op.py | 30 ++++++++++++- .../test_imperative_auto_mixed_precision.py | 2 +- 15 files changed, 182 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cu b/paddle/fluid/operators/elementwise/elementwise_add_op.cu index 2b55d9fbaf6..52bf9b0e03f 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cu @@ -24,5 +24,6 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseAddKernel, ops::ElementwiseAddKernel, ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel, ops::ElementwiseAddKernel>, ops::ElementwiseAddKernel>); diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cc b/paddle/fluid/operators/elementwise/elementwise_mul_op.cc index 5ff0f29ab43..e172279145e 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cc @@ -167,6 +167,8 @@ REGISTER_OP_CPU_KERNEL( ops::ElementwiseMulKernel, ops::ElementwiseMulKernel, ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel, ops::ElementwiseMulKernel>, ops::ElementwiseMulKernel, ops::ElementwiseMulGradKernel, ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel, ops::ElementwiseMulGradKernel>, ops::ElementwiseMulGradKernel, ops::ElementwiseMulDoubleGradKernel, + ops::ElementwiseMulDoubleGradKernel, ops::ElementwiseMulDoubleGradKernel>, ops::ElementwiseMulDoubleGradKernel, ops::ElementwiseMulTripleGradKernel, + ops::ElementwiseMulTripleGradKernel, ops::ElementwiseMulTripleGradKernel>, ops::ElementwiseMulTripleGradKernel, ops::ElementwiseMulKernel, ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel, ops::ElementwiseMulKernel>, ops::ElementwiseMulKernel>); REGISTER_OP_CUDA_KERNEL( @@ -110,6 +111,7 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseMulGradKernel, ops::ElementwiseMulGradKernel, ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel, ops::ElementwiseMulGradKernel>, ops::ElementwiseMulGradKernel, ops::ElementwiseMulDoubleGradKernel, ops::ElementwiseMulDoubleGradKernel, + ops::ElementwiseMulDoubleGradKernel, ops::ElementwiseMulDoubleGradKernel>, ops::ElementwiseMulDoubleGradKernel, ops::ElementwiseMulTripleGradKernel, ops::ElementwiseMulTripleGradKernel, + ops::ElementwiseMulTripleGradKernel, ops::ElementwiseMulTripleGradKernel>, ops::ElementwiseMulTripleGradKernel, ops::ElementwiseSubKernel, ops::ElementwiseSubKernel, + 
ops::ElementwiseSubKernel, ops::ElementwiseSubKernel>, ops::ElementwiseSubKernel, ops::ElementwiseSubGradKernel, ops::ElementwiseSubGradKernel, + ops::ElementwiseSubGradKernel, ops::ElementwiseSubGradKernel>, ops::ElementwiseSubGradKernel, ops::ElementwiseSubDoubleGradKernel, + ops::ElementwiseSubDoubleGradKernel, ops::ElementwiseSubDoubleGradKernel>, ops::ElementwiseSubDoubleGradKernel, ops::ElementwiseSubKernel, + ops::ElementwiseSubKernel, ops::ElementwiseSubKernel, ops::ElementwiseSubKernel, ops::ElementwiseSubKernel, @@ -34,6 +36,8 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseSubGradKernel, ops::ElementwiseSubGradKernel, + ops::ElementwiseSubGradKernel, ops::ElementwiseSubGradKernel, ops::ElementwiseSubGradKernel, ops::ElementwiseSubGradKernel, @@ -51,6 +55,8 @@ REGISTER_OP_CUDA_KERNEL( int>, ops::ElementwiseSubDoubleGradKernel, + ops::ElementwiseSubDoubleGradKernel, ops::ElementwiseSubDoubleGradKernel>, ops::ElementwiseSubDoubleGradKernel, phi::dtype::complex) {} @@ -182,5 +183,6 @@ PD_REGISTER_KERNEL(subtract_double_grad, int16_t, int, int64_t, + phi::dtype::bfloat16, phi::dtype::complex, phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/math_kernel.cc b/paddle/phi/kernels/cpu/math_kernel.cc index 581c5f90f35..5cfcfe62c78 100644 --- a/paddle/phi/kernels/cpu/math_kernel.cc +++ b/paddle/phi/kernels/cpu/math_kernel.cc @@ -139,7 +139,8 @@ PD_REGISTER_KERNEL(subtract_raw, int, int64_t, complex64, - complex128) {} + complex128, + phi::dtype::bfloat16) {} PD_REGISTER_KERNEL(divide_raw, CPU, ALL_LAYOUT, @@ -160,7 +161,8 @@ PD_REGISTER_KERNEL(multiply_raw, int64_t, bool, complex64, - complex128) {} + complex128, + phi::dtype::bfloat16) {} PD_REGISTER_KERNEL(sum_raw, CPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.h b/paddle/phi/kernels/funcs/blas/blas_impl.h index 4d7700a89d2..2868aa5acb7 100644 --- a/paddle/phi/kernels/funcs/blas/blas_impl.h +++ b/paddle/phi/kernels/funcs/blas/blas_impl.h @@ -76,6 +76,36 @@ struct CBlas { "Blas VCOPY do not supported on CPU with bfloat16," " please check your code")); } + + template + static void VADD(int n, + const phi::dtype::bfloat16 *x, + const phi::dtype::bfloat16 *y, + phi::dtype::bfloat16 *z) { + for (int i = 0; i < n; ++i) { + z[i] = x[i] + y[i]; + } + } + + template + static void VMUL(int n, + const phi::dtype::bfloat16 *x, + const phi::dtype::bfloat16 *y, + phi::dtype::bfloat16 *z) { + for (int i = 0; i < n; ++i) { + z[i] = x[i] * y[i]; + } + } + + template + static void VSUB(int n, + const phi::dtype::bfloat16 *x, + const phi::dtype::bfloat16 *y, + phi::dtype::bfloat16 *z) { + for (int i = 0; i < n; ++i) { + z[i] = x[i] - y[i]; + } + } }; #ifdef PADDLE_WITH_MKLML diff --git a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu index 02dbb506c4e..3c4c01b1dc8 100644 --- a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu @@ -128,6 +128,7 @@ PD_REGISTER_KERNEL(add_grad, int, int64_t, phi::dtype::float16, + phi::dtype::bfloat16, phi::dtype::complex, phi::dtype::complex) {} @@ -140,6 +141,7 @@ PD_REGISTER_KERNEL(add_double_grad, int, int64_t, phi::dtype::float16, + phi::dtype::bfloat16, phi::dtype::complex, phi::dtype::complex) {} @@ -152,6 +154,7 @@ PD_REGISTER_KERNEL(add_triple_grad, int, int64_t, phi::dtype::float16, + phi::dtype::bfloat16, phi::dtype::complex, phi::dtype::complex) {} @@ -164,6 +167,7 @@ PD_REGISTER_KERNEL(subtract_grad, int, int64_t, phi::dtype::float16, + phi::dtype::bfloat16, 
phi::dtype::complex, phi::dtype::complex) {} @@ -176,5 +180,6 @@ PD_REGISTER_KERNEL(subtract_double_grad, int, int64_t, phi::dtype::float16, + phi::dtype::bfloat16, phi::dtype::complex, phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/math_kernel.cu b/paddle/phi/kernels/gpu/math_kernel.cu index 02e3f00bd34..56e8b16ccbe 100644 --- a/paddle/phi/kernels/gpu/math_kernel.cu +++ b/paddle/phi/kernels/gpu/math_kernel.cu @@ -106,6 +106,7 @@ PD_REGISTER_KERNEL(add_raw, int, int64_t, float16, + bfloat16, complex64, complex128) {} PD_REGISTER_KERNEL(subtract_raw, @@ -118,6 +119,7 @@ PD_REGISTER_KERNEL(subtract_raw, int, int64_t, float16, + bfloat16, complex64, complex128) {} PD_REGISTER_KERNEL(divide_raw, @@ -143,7 +145,8 @@ PD_REGISTER_KERNEL(multiply_raw, bool, float16, complex64, - complex128) {} + complex128, + bfloat16) {} PD_REGISTER_KERNEL(sum_raw, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/math_kernel.cc b/paddle/phi/kernels/math_kernel.cc index db6c5e1ac35..3cb7b66ddf7 100644 --- a/paddle/phi/kernels/math_kernel.cc +++ b/paddle/phi/kernels/math_kernel.cc @@ -121,7 +121,8 @@ PD_REGISTER_KERNEL(subtract, int, int64_t, complex64, - complex128) {} + complex128, + phi::dtype::bfloat16) {} PD_REGISTER_KERNEL(divide, CPU, ALL_LAYOUT, @@ -142,7 +143,8 @@ PD_REGISTER_KERNEL(multiply, int64_t, bool, complex64, - complex128) {} + complex128, + phi::dtype::bfloat16) {} #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_KERNEL(mean, @@ -180,6 +182,7 @@ PD_REGISTER_KERNEL(add, int, int64_t, phi::dtype::float16, + phi::dtype::bfloat16, complex64, complex128) {} PD_REGISTER_KERNEL(subtract, diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py index d067a2bd577..d1d391a3949 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py @@ -17,7 +17,7 @@ import unittest import numpy as np import paddle import paddle.fluid.core as core -from op_test import OpTest, skip_check_grad_ci +from op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16 import paddle.fluid as fluid from paddle.fluid import compiler, Program, program_guard @@ -98,6 +98,46 @@ class TestFP16ElementwiseAddOp(TestElementwiseAddOp): place, atol=1e-3, check_dygraph=(self.use_mkldnn == False)) +@unittest.skipIf( + not core.is_compiled_with_cuda() or core.cudnn_version() < 8100, + "core is not compiled with CUDA and cudnn version need larger than 8.1.0") +class TestBF16ElementwiseAddOp(OpTest): + def setUp(self): + self.op_type = "elementwise_add" + self.dtype = np.uint16 + + self.x = np.random.uniform(0.1, 1, [13, 17]).astype(np.float32) + self.y = np.random.uniform(0.1, 1, [13, 17]).astype(np.float32) + self.out = np.add(self.x, self.y) + + self.axis = -1 + + self.inputs = { + 'X': + OpTest.np_dtype_to_fluid_dtype(convert_float_to_uint16(self.x)), + 'Y': + OpTest.np_dtype_to_fluid_dtype(convert_float_to_uint16(self.y)) + } + self.attrs = {'axis': self.axis, 'use_mkldnn': False} + self.outputs = {'Out': convert_float_to_uint16(self.out)} + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place) + + def test_check_grad_normal(self): + place = core.CUDAPlace(0) + self.check_grad_with_place(place, ['X', 'Y'], 'Out') + + def test_check_grad_ingore_x(self): + place = core.CUDAPlace(0) + self.check_grad_with_place(place, ['Y'], 'Out', no_grad_set=set("X")) + + def test_check_grad_ingore_y(self): + 
place = core.CUDAPlace(0) + self.check_grad_with_place(place, ['X'], 'Out', no_grad_set=set('Y')) + + @skip_check_grad_ci( reason="[skip shape check] Use y_shape(1) to test broadcast.") class TestElementwiseAddOp_scalar(TestElementwiseAddOp): diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py index 7bace9bc535..00967cb503f 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py @@ -23,7 +23,7 @@ import paddle.fluid.core as core from paddle.fluid import Program, compiler, program_guard from paddle.fluid.op import Operator -from op_test import OpTest, skip_check_grad_ci +from op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16 class ElementwiseMulOp(OpTest): @@ -83,6 +83,39 @@ class ElementwiseMulOp(OpTest): pass +class TestBF16ElementwiseMulOp(OpTest): + def setUp(self): + self.op_type = "elementwise_mul" + self.dtype = np.uint16 + + self.x = np.random.uniform(0.1, 1, [13, 17]).astype(np.float32) + self.y = np.random.uniform(0.1, 1, [13, 17]).astype(np.float32) + self.out = np.multiply(self.x, self.y) + + self.axis = -1 + + self.inputs = { + 'X': + OpTest.np_dtype_to_fluid_dtype(convert_float_to_uint16(self.x)), + 'Y': + OpTest.np_dtype_to_fluid_dtype(convert_float_to_uint16(self.y)) + } + self.outputs = {'Out': convert_float_to_uint16(self.out)} + self.attrs = {'axis': self.axis, 'use_mkldnn': False} + + def test_check_output(self): + self.check_output() + + def test_check_grad_normal(self): + self.check_grad(['X', 'Y'], 'Out') + + def test_check_grad_ingore_x(self): + self.check_grad(['Y'], 'Out', no_grad_set=set("X")) + + def test_check_grad_ingore_y(self): + self.check_grad(['X'], 'Out', no_grad_set=set('Y')) + + @skip_check_grad_ci( reason="[skip shape check] Use y_shape(1) to test broadcast.") class TestElementwiseMulOp_scalar(ElementwiseMulOp): diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py index 2594c96eebd..6801a4bc5f3 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py @@ -17,7 +17,8 @@ import unittest import numpy as np import paddle import paddle.fluid as fluid -from op_test import OpTest, skip_check_grad_ci +import paddle.fluid.core as core +from op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16 class TestElementwiseOp(OpTest): @@ -44,6 +45,33 @@ class TestElementwiseOp(OpTest): ['X'], 'Out', max_relative_error=0.005, no_grad_set=set('Y')) +class TestBF16ElementwiseOp(OpTest): + def setUp(self): + self.op_type = "elementwise_sub" + self.dtype = np.uint16 + x = np.random.uniform(0.1, 1, [13, 17]).astype(np.float32) + y = np.random.uniform(0.1, 1, [13, 17]).astype(np.float32) + out = x - y + + self.inputs = { + 'X': convert_float_to_uint16(x), + 'Y': convert_float_to_uint16(y) + } + self.outputs = {'Out': convert_float_to_uint16(out)} + + def test_check_output(self): + self.check_output() + + def test_check_grad_normal(self): + self.check_grad(['X', 'Y'], 'Out') + + def test_check_grad_ingore_x(self): + self.check_grad(['Y'], 'Out', no_grad_set=set("X")) + + def test_check_grad_ingore_y(self): + self.check_grad(['X'], 'Out', no_grad_set=set('Y')) + + @skip_check_grad_ci( reason="[skip shape check] Use y_shape(1) to test broadcast.") class TestElementwiseSubOp_scalar(TestElementwiseOp): 
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py
index 306c6b4707e..0043a7f78b4 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py
@@ -1143,7 +1143,7 @@ class TestBf16(unittest.TestCase):
     def test_bf16(self):
         out_fp32 = self.train(enable_amp=False)
         out_bf16 = self.train(enable_amp=True)
-        self.assertTrue(np.allclose(out_fp32, out_bf16, rtol=1.e-3, atol=1.e-2))
+        self.assertTrue(np.allclose(out_fp32, out_bf16, rtol=1.e-3, atol=1.e-1))


 if __name__ == '__main__':
--
GitLab
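Background on the new unit tests (a sketch, not part of the patch itself): the BF16 tests keep their reference data in float32 and store the op inputs/outputs as the raw bfloat16 bit pattern in np.uint16, produced by `convert_float_to_uint16` from op_test. The numpy-only sketch below, under the assumption that the conversion truncates the low 16 mantissa bits (the real helper may round to nearest instead), shows that round trip and the precision loss that motivates the relaxed `atol=1.e-1` in `TestBf16`. The helpers `float32_to_bf16_bits` and `bf16_bits_to_float32` are hypothetical names for illustration, not Paddle APIs.

```python
# Sketch only, not part of the patch: emulate the float32 <-> bfloat16 round trip
# that the BF16 unit tests rely on. Assumes a truncating conversion; Paddle's
# convert_float_to_uint16 may round to nearest instead.
import numpy as np


def float32_to_bf16_bits(x):
    # Keep the upper 16 bits of the IEEE-754 float32 pattern: sign bit,
    # 8 exponent bits and the top 7 mantissa bits, stored as np.uint16.
    return (np.asarray(x, dtype=np.float32).view(np.uint32) >> 16).astype(np.uint16)


def bf16_bits_to_float32(bits):
    # Re-expand the 16-bit pattern to float32 by zero-filling the low mantissa bits.
    return (bits.astype(np.uint32) << 16).view(np.float32)


x = np.random.uniform(0.1, 1, [13, 17]).astype(np.float32)
y = np.random.uniform(0.1, 1, [13, 17]).astype(np.float32)

# Emulate elementwise_add at bfloat16 precision, element by element, in the
# spirit of the CPU fallback VADD loop added to blas_impl.h.
x_bf16 = bf16_bits_to_float32(float32_to_bf16_bits(x))
y_bf16 = bf16_bits_to_float32(float32_to_bf16_bits(y))
out_bf16 = bf16_bits_to_float32(float32_to_bf16_bits(x_bf16 + y_bf16))

# bfloat16 keeps only 8 significand bits, so comparisons against float32 need
# loose tolerances; the same reason TestBf16 relaxes atol from 1e-2 to 1e-1.
assert np.allclose(out_bf16, x + y, rtol=1.e-3, atol=1.e-1)
print("max abs diff vs float32:", np.abs(out_bf16 - (x + y)).max())
```

The sketch also makes the tolerance choice concrete: for inputs in [0.1, 1) the bf16 result can differ from the float32 reference by roughly one percent, which is well inside atol=1e-1 but not reliably inside the original atol=1e-2.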