diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cu b/paddle/fluid/operators/elementwise/elementwise_add_op.cu
index 2b55d9fbaf6cba83f722e29f6d5359a1a8884c84..52bf9b0e03f025e41a59a677aa44a6ac92910ba6 100644
--- a/paddle/fluid/operators/elementwise/elementwise_add_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cu
@@ -24,5 +24,6 @@ REGISTER_OP_CUDA_KERNEL(
     ops::ElementwiseAddKernel,
     ops::ElementwiseAddKernel,
     ops::ElementwiseAddKernel,
+    ops::ElementwiseAddKernel,
     ops::ElementwiseAddKernel>,
     ops::ElementwiseAddKernel>);
diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cc b/paddle/fluid/operators/elementwise/elementwise_mul_op.cc
index 5ff0f29ab43a059fefa165dae5c6388231cc8182..e172279145e28c0731ed0d8d91769d0b293662fe 100644
--- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cc
@@ -167,6 +167,8 @@ REGISTER_OP_CPU_KERNEL(
     ops::ElementwiseMulKernel,
     ops::ElementwiseMulKernel,
     ops::ElementwiseMulKernel,
+    ops::ElementwiseMulKernel,
     ops::ElementwiseMulKernel>,
     ops::ElementwiseMulKernel,
     ops::ElementwiseMulGradKernel,
     ops::ElementwiseMulGradKernel,
+    ops::ElementwiseMulGradKernel,
     ops::ElementwiseMulGradKernel>,
     ops::ElementwiseMulGradKernel,
     ops::ElementwiseMulDoubleGradKernel,
+    ops::ElementwiseMulDoubleGradKernel,
     ops::ElementwiseMulDoubleGradKernel>,
     ops::ElementwiseMulDoubleGradKernel,
     ops::ElementwiseMulTripleGradKernel,
+    ops::ElementwiseMulTripleGradKernel,
     ops::ElementwiseMulTripleGradKernel>,
     ops::ElementwiseMulTripleGradKernel,
     ops::ElementwiseMulKernel,
     ops::ElementwiseMulKernel,
+    ops::ElementwiseMulKernel,
     ops::ElementwiseMulKernel>,
     ops::ElementwiseMulKernel>);
 REGISTER_OP_CUDA_KERNEL(
@@ -110,6 +111,7 @@ REGISTER_OP_CUDA_KERNEL(
     ops::ElementwiseMulGradKernel,
     ops::ElementwiseMulGradKernel,
     ops::ElementwiseMulGradKernel,
+    ops::ElementwiseMulGradKernel,
     ops::ElementwiseMulGradKernel>,
     ops::ElementwiseMulGradKernel,
     ops::ElementwiseMulDoubleGradKernel,
     ops::ElementwiseMulDoubleGradKernel,
+    ops::ElementwiseMulDoubleGradKernel,
     ops::ElementwiseMulDoubleGradKernel>,
     ops::ElementwiseMulDoubleGradKernel,
     ops::ElementwiseMulTripleGradKernel,
     ops::ElementwiseMulTripleGradKernel,
+    ops::ElementwiseMulTripleGradKernel,
     ops::ElementwiseMulTripleGradKernel>,
     ops::ElementwiseMulTripleGradKernel,
     ops::ElementwiseSubKernel,
     ops::ElementwiseSubKernel,
+    ops::ElementwiseSubKernel,
     ops::ElementwiseSubKernel>,
     ops::ElementwiseSubKernel,
     ops::ElementwiseSubGradKernel,
     ops::ElementwiseSubGradKernel,
+    ops::ElementwiseSubGradKernel,
     ops::ElementwiseSubGradKernel>,
     ops::ElementwiseSubGradKernel,
     ops::ElementwiseSubDoubleGradKernel,
+    ops::ElementwiseSubDoubleGradKernel,
     ops::ElementwiseSubDoubleGradKernel>,
     ops::ElementwiseSubDoubleGradKernel,
     ops::ElementwiseSubKernel,
+    ops::ElementwiseSubKernel,
     ops::ElementwiseSubKernel,
     ops::ElementwiseSubKernel,
     ops::ElementwiseSubKernel,
@@ -34,6 +36,8 @@ REGISTER_OP_CUDA_KERNEL(
     ops::ElementwiseSubGradKernel,
     ops::ElementwiseSubGradKernel,
+    ops::ElementwiseSubGradKernel,
     ops::ElementwiseSubGradKernel,
     ops::ElementwiseSubGradKernel,
     ops::ElementwiseSubGradKernel,
@@ -51,6 +55,8 @@ REGISTER_OP_CUDA_KERNEL(
     int>,
     ops::ElementwiseSubDoubleGradKernel,
+    ops::ElementwiseSubDoubleGradKernel,
     ops::ElementwiseSubDoubleGradKernel>,
     ops::ElementwiseSubDoubleGradKernel,
     phi::dtype::complex) {}
@@ -182,5 +183,6 @@ PD_REGISTER_KERNEL(subtract_double_grad,
     int16_t,
     int,
     int64_t,
+    phi::dtype::bfloat16,
     phi::dtype::complex,
     phi::dtype::complex) {}
diff --git a/paddle/phi/kernels/cpu/math_kernel.cc b/paddle/phi/kernels/cpu/math_kernel.cc
index 581c5f90f35e5cadb239291d143ce54d499c017e..5cfcfe62c7816c84a4f2876942b4d9b30dfad167 100644
--- a/paddle/phi/kernels/cpu/math_kernel.cc
+++ b/paddle/phi/kernels/cpu/math_kernel.cc
@@ -139,7 +139,8 @@ PD_REGISTER_KERNEL(subtract_raw,
     int,
     int64_t,
     complex64,
-    complex128) {}
+    complex128,
+    phi::dtype::bfloat16) {}
 PD_REGISTER_KERNEL(divide_raw,
     CPU,
     ALL_LAYOUT,
@@ -160,7 +161,8 @@ PD_REGISTER_KERNEL(multiply_raw,
     int64_t,
     bool,
     complex64,
-    complex128) {}
+    complex128,
+    phi::dtype::bfloat16) {}
 PD_REGISTER_KERNEL(sum_raw,
     CPU,
     ALL_LAYOUT,
diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.h b/paddle/phi/kernels/funcs/blas/blas_impl.h
index 4d7700a89d27bb66e741b1e38207d5bd3a797658..2868aa5acb75e37110f02cf30e761625a3cc8ff7 100644
--- a/paddle/phi/kernels/funcs/blas/blas_impl.h
+++ b/paddle/phi/kernels/funcs/blas/blas_impl.h
@@ -76,6 +76,36 @@ struct CBlas {
         "Blas VCOPY do not supported on CPU with bfloat16,"
         " please check your code"));
   }
+
+  template <typename... ARGS>
+  static void VADD(int n,
+                   const phi::dtype::bfloat16 *x,
+                   const phi::dtype::bfloat16 *y,
+                   phi::dtype::bfloat16 *z) {
+    for (int i = 0; i < n; ++i) {
+      z[i] = x[i] + y[i];
+    }
+  }
+
+  template <typename... ARGS>
+  static void VMUL(int n,
+                   const phi::dtype::bfloat16 *x,
+                   const phi::dtype::bfloat16 *y,
+                   phi::dtype::bfloat16 *z) {
+    for (int i = 0; i < n; ++i) {
+      z[i] = x[i] * y[i];
+    }
+  }
+
+  template <typename... ARGS>
+  static void VSUB(int n,
+                   const phi::dtype::bfloat16 *x,
+                   const phi::dtype::bfloat16 *y,
+                   phi::dtype::bfloat16 *z) {
+    for (int i = 0; i < n; ++i) {
+      z[i] = x[i] - y[i];
+    }
+  }
 };

 #ifdef PADDLE_WITH_MKLML
diff --git a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu
index 02dbb506c4eb579fbb2b82513421aaf1dd3ef163..3c4c01b1dc8ff739ac87ca2e9fe7a6659ab4eac3 100644
--- a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu
@@ -128,6 +128,7 @@ PD_REGISTER_KERNEL(add_grad,
     int,
     int64_t,
     phi::dtype::float16,
+    phi::dtype::bfloat16,
     phi::dtype::complex,
     phi::dtype::complex) {}
@@ -140,6 +141,7 @@ PD_REGISTER_KERNEL(add_double_grad,
     int,
     int64_t,
     phi::dtype::float16,
+    phi::dtype::bfloat16,
     phi::dtype::complex,
     phi::dtype::complex) {}
@@ -152,6 +154,7 @@ PD_REGISTER_KERNEL(add_triple_grad,
     int,
     int64_t,
     phi::dtype::float16,
+    phi::dtype::bfloat16,
     phi::dtype::complex,
     phi::dtype::complex) {}
@@ -164,6 +167,7 @@ PD_REGISTER_KERNEL(subtract_grad,
     int,
     int64_t,
     phi::dtype::float16,
+    phi::dtype::bfloat16,
     phi::dtype::complex,
     phi::dtype::complex) {}
@@ -176,5 +180,6 @@ PD_REGISTER_KERNEL(subtract_double_grad,
     int,
     int64_t,
     phi::dtype::float16,
+    phi::dtype::bfloat16,
     phi::dtype::complex,
     phi::dtype::complex) {}
diff --git a/paddle/phi/kernels/gpu/math_kernel.cu b/paddle/phi/kernels/gpu/math_kernel.cu
index 02e3f00bd3425b6dd6f3fe02a4eabf59aaca99ea..56e8b16ccbe0df16fdc96470a8167e6dc6abfb3c 100644
--- a/paddle/phi/kernels/gpu/math_kernel.cu
+++ b/paddle/phi/kernels/gpu/math_kernel.cu
@@ -106,6 +106,7 @@ PD_REGISTER_KERNEL(add_raw,
     int,
     int64_t,
     float16,
+    bfloat16,
     complex64,
     complex128) {}
 PD_REGISTER_KERNEL(subtract_raw,
@@ -118,6 +119,7 @@ PD_REGISTER_KERNEL(subtract_raw,
     int,
     int64_t,
     float16,
+    bfloat16,
     complex64,
     complex128) {}
 PD_REGISTER_KERNEL(divide_raw,
@@ -143,7 +145,8 @@ PD_REGISTER_KERNEL(multiply_raw,
     bool,
     float16,
     complex64,
-    complex128) {}
+    complex128,
+    bfloat16) {}
 PD_REGISTER_KERNEL(sum_raw,
     GPU,
     ALL_LAYOUT,
diff --git a/paddle/phi/kernels/math_kernel.cc b/paddle/phi/kernels/math_kernel.cc
index db6c5e1ac35919c153c8021c82e747cc3ca9fe37..3cb7b66ddf73e5fa3c5502a4acaad2c277a22ac6 100644
--- a/paddle/phi/kernels/math_kernel.cc
+++ b/paddle/phi/kernels/math_kernel.cc
@@ -121,7 +121,8 @@ PD_REGISTER_KERNEL(subtract,
     int,
     int64_t,
     complex64,
-    complex128) {}
+    complex128,
+    phi::dtype::bfloat16) {}
 PD_REGISTER_KERNEL(divide,
     CPU,
     ALL_LAYOUT,
@@ -142,7 +143,8 @@ PD_REGISTER_KERNEL(multiply,
     int64_t,
     bool,
     complex64,
-    complex128) {}
+    complex128,
+    phi::dtype::bfloat16) {}

 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 PD_REGISTER_KERNEL(mean,
@@ -180,6 +182,7 @@ PD_REGISTER_KERNEL(add,
     int,
     int64_t,
     phi::dtype::float16,
+    phi::dtype::bfloat16,
     complex64,
     complex128) {}
 PD_REGISTER_KERNEL(subtract,
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
index d067a2bd577880a58e757a422c52058661b4eedb..d1d391a3949ead28697c0756803e873c41914079 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
@@ -17,7 +17,7 @@ import unittest
 import numpy as np
 import paddle
 import paddle.fluid.core as core
-from op_test import OpTest, skip_check_grad_ci
+from op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16
 import paddle.fluid as fluid
 from paddle.fluid import compiler, Program, program_guard

@@ -98,6 +98,46 @@ class TestFP16ElementwiseAddOp(TestElementwiseAddOp):
             place, atol=1e-3, check_dygraph=(self.use_mkldnn == False))


+@unittest.skipIf(
+    not core.is_compiled_with_cuda() or core.cudnn_version() < 8100,
+    "core is not compiled with CUDA and cudnn version need larger than 8.1.0")
+class TestBF16ElementwiseAddOp(OpTest):
+    def setUp(self):
+        self.op_type = "elementwise_add"
+        self.dtype = np.uint16
+
+        self.x = np.random.uniform(0.1, 1, [13, 17]).astype(np.float32)
+        self.y = np.random.uniform(0.1, 1, [13, 17]).astype(np.float32)
+        self.out = np.add(self.x, self.y)
+
+        self.axis = -1
+
+        self.inputs = {
+            'X':
+            OpTest.np_dtype_to_fluid_dtype(convert_float_to_uint16(self.x)),
+            'Y':
+            OpTest.np_dtype_to_fluid_dtype(convert_float_to_uint16(self.y))
+        }
+        self.attrs = {'axis': self.axis, 'use_mkldnn': False}
+        self.outputs = {'Out': convert_float_to_uint16(self.out)}
+
+    def test_check_output(self):
+        place = core.CUDAPlace(0)
+        self.check_output_with_place(place)
+
+    def test_check_grad_normal(self):
+        place = core.CUDAPlace(0)
+        self.check_grad_with_place(place, ['X', 'Y'], 'Out')
+
+    def test_check_grad_ingore_x(self):
+        place = core.CUDAPlace(0)
+        self.check_grad_with_place(place, ['Y'], 'Out', no_grad_set=set("X"))
+
+    def test_check_grad_ingore_y(self):
+        place = core.CUDAPlace(0)
+        self.check_grad_with_place(place, ['X'], 'Out', no_grad_set=set('Y'))
+
+
 @skip_check_grad_ci(
     reason="[skip shape check] Use y_shape(1) to test broadcast.")
 class TestElementwiseAddOp_scalar(TestElementwiseAddOp):
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py
index 7bace9bc535243194e2ed9ca82db49e6d1b4f2f4..00967cb503fe5fd677839a869798964bb5fb0b71 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py
@@ -23,7 +23,7 @@ import paddle.fluid.core as core
 from paddle.fluid import Program, compiler, program_guard
 from paddle.fluid.op import Operator

-from op_test import OpTest, skip_check_grad_ci
+from op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16


 class ElementwiseMulOp(OpTest):
@@ -83,6 +83,39 @@ class ElementwiseMulOp(OpTest):
         pass


+class TestBF16ElementwiseMulOp(OpTest):
+    def setUp(self):
+        self.op_type = "elementwise_mul"
+        self.dtype = np.uint16
+
+        self.x = np.random.uniform(0.1, 1, [13, 17]).astype(np.float32)
+        self.y = np.random.uniform(0.1, 1, [13, 17]).astype(np.float32)
+        self.out = np.multiply(self.x, self.y)
+
+        self.axis = -1
+
+        self.inputs = {
+            'X':
+            OpTest.np_dtype_to_fluid_dtype(convert_float_to_uint16(self.x)),
+            'Y':
+            OpTest.np_dtype_to_fluid_dtype(convert_float_to_uint16(self.y))
+        }
+        self.outputs = {'Out': convert_float_to_uint16(self.out)}
+        self.attrs = {'axis': self.axis, 'use_mkldnn': False}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad_normal(self):
+        self.check_grad(['X', 'Y'], 'Out')
+
+    def test_check_grad_ingore_x(self):
+        self.check_grad(['Y'], 'Out', no_grad_set=set("X"))
+
+    def test_check_grad_ingore_y(self):
+        self.check_grad(['X'], 'Out', no_grad_set=set('Y'))
+
+
 @skip_check_grad_ci(
     reason="[skip shape check] Use y_shape(1) to test broadcast.")
 class TestElementwiseMulOp_scalar(ElementwiseMulOp):
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py
index 2594c96eebd69fcdd88d48e793e48d854b79535a..6801a4bc5f30b4829e8e9ceae201ab050b30758e 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py
@@ -17,7 +17,8 @@ import unittest
 import numpy as np
 import paddle
 import paddle.fluid as fluid
-from op_test import OpTest, skip_check_grad_ci
+import paddle.fluid.core as core
+from op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16


 class TestElementwiseOp(OpTest):
@@ -44,6 +45,33 @@ class TestElementwiseOp(OpTest):
             ['X'], 'Out', max_relative_error=0.005, no_grad_set=set('Y'))


+class TestBF16ElementwiseOp(OpTest):
+    def setUp(self):
+        self.op_type = "elementwise_sub"
+        self.dtype = np.uint16
+        x = np.random.uniform(0.1, 1, [13, 17]).astype(np.float32)
+        y = np.random.uniform(0.1, 1, [13, 17]).astype(np.float32)
+        out = x - y
+
+        self.inputs = {
+            'X': convert_float_to_uint16(x),
+            'Y': convert_float_to_uint16(y)
+        }
+        self.outputs = {'Out': convert_float_to_uint16(out)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad_normal(self):
+        self.check_grad(['X', 'Y'], 'Out')
+
+    def test_check_grad_ingore_x(self):
+        self.check_grad(['Y'], 'Out', no_grad_set=set("X"))
+
+    def test_check_grad_ingore_y(self):
+        self.check_grad(['X'], 'Out', no_grad_set=set('Y'))
+
+
 @skip_check_grad_ci(
     reason="[skip shape check] Use y_shape(1) to test broadcast.")
 class TestElementwiseSubOp_scalar(TestElementwiseOp):
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py
index 306c6b4707e8a3d7386bd8af3e32e55d09d563c4..0043a7f78b4b37550b95e9af86c882dad3e7843a 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py
@@ -1143,7 +1143,7 @@ class TestBf16(unittest.TestCase):
     def test_bf16(self):
         out_fp32 = self.train(enable_amp=False)
         out_bf16 = self.train(enable_amp=True)
-        self.assertTrue(np.allclose(out_fp32, out_bf16, rtol=1.e-3, atol=1.e-2))
+        self.assertTrue(np.allclose(out_fp32, out_bf16, rtol=1.e-3, atol=1.e-1))


 if __name__ == '__main__':
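
For context (this note and the sketch below are not part of the patch): the new BF16 tests store bfloat16 data as `np.uint16` and build it with `convert_float_to_uint16` from `op_test`. A bfloat16 value is just the upper 16 bits of the corresponding IEEE float32, so a minimal numpy sketch of that representation looks like the following. The helper names are illustrative only, and the real `convert_float_to_uint16` may round or truncate slightly differently.

```python
import numpy as np

def float32_to_bf16_bits(x):
    # Keep the high 16 bits of each float32 (round to nearest even);
    # this uint16 bit pattern is what the BF16 OpTests feed to the kernels.
    bits = np.ascontiguousarray(x, dtype=np.float32).view(np.uint32)
    rounding_bias = ((bits >> 16) & 1) + np.uint32(0x7FFF)
    return ((bits + rounding_bias) >> 16).astype(np.uint16)

def bf16_bits_to_float32(b):
    # Expand the uint16 bit pattern back to float32 for reference results.
    return (b.astype(np.uint32) << 16).view(np.float32)

x = np.random.uniform(0.1, 1, [13, 17]).astype(np.float32)
y = np.random.uniform(0.1, 1, [13, 17]).astype(np.float32)
# Reference output in the same way the tests build 'Out':
out_ref = bf16_bits_to_float32(float32_to_bf16_bits(x + y))
```

Because bfloat16 keeps only 8 mantissa bits, a single rounded value can already be off by roughly 2^-8 in relative terms, which is also why the AMP comparison at the end of the patch relaxes `atol` from 1e-2 to 1e-1.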