diff --git a/paddle/phi/kernels/gpu/matmul_grad_kernel.cu b/paddle/phi/kernels/gpu/matmul_grad_kernel.cu
index 4b48e2eb1732266250fe9e647f7bba73f9aaf393..e27d5251a914473a17f3dbba56ed5ce5500dd6da 100644
--- a/paddle/phi/kernels/gpu/matmul_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/matmul_grad_kernel.cu
@@ -64,4 +64,5 @@ PD_REGISTER_KERNEL(matmul_with_flatten_double_grad,
                    phi::MatmulWithFlattenDoubleGradKernel,
                    float,
                    double,
+                   phi::dtype::bfloat16,
                    phi::dtype::float16) {}
diff --git a/python/paddle/fluid/tests/unittests/test_mul_op.py b/python/paddle/fluid/tests/unittests/test_mul_op.py
index 1e2b909d90dab4ca1f234e8414c516df806a4d78..9ce95d354d7af6af676c73f294209750cfc6733f 100644
--- a/python/paddle/fluid/tests/unittests/test_mul_op.py
+++ b/python/paddle/fluid/tests/unittests/test_mul_op.py
@@ -20,7 +20,7 @@ import numpy as np
 from paddle.fluid import core
 
 sys.path.append("..")
-from eager_op_test import OpTest
+from eager_op_test import OpTest, convert_float_to_uint16
 
 
 class TestMulOp(OpTest):
@@ -114,14 +114,14 @@ class TestMulOp2(OpTest):
 @unittest.skipIf(
     not core.is_compiled_with_cuda(), "core is not compiled with CUDA"
 )
-class TestFP16MulOp1(TestMulOp):
+class TestMulFP16Op1(TestMulOp):
     def init_dtype_type(self):
         self.dtype = np.float16
 
     def test_check_output(self):
         place = core.CUDAPlace(0)
         if core.is_float16_supported(place):
-            self.check_output_with_place(place, atol=1e-1, check_dygraph=False)
+            self.check_output_with_place(place, check_dygraph=False)
 
     def test_check_grad_normal(self):
         place = core.CUDAPlace(0)
@@ -130,7 +130,6 @@ class TestFP16MulOp1(TestMulOp):
                 place,
                 ['X', 'Y'],
                 'Out',
-                max_relative_error=0.5,
                 check_dygraph=False,
             )
 
@@ -141,7 +140,6 @@ class TestFP16MulOp1(TestMulOp):
                 place,
                 ['Y'],
                 'Out',
-                max_relative_error=0.5,
                 no_grad_set=set("X"),
                 check_dygraph=False,
             )
@@ -153,7 +151,6 @@ class TestFP16MulOp1(TestMulOp):
                 place,
                 ['X'],
                 'Out',
-                max_relative_error=0.5,
                 no_grad_set=set('Y'),
                 check_dygraph=False,
             )
@@ -162,14 +159,14 @@
 @unittest.skipIf(
     not core.is_compiled_with_cuda(), "core is not compiled with CUDA"
 )
-class TestFP16MulOp2(TestMulOp2):
+class TestMulFP16Op2(TestMulOp2):
     def init_dtype_type(self):
         self.dtype = np.float16
 
     def test_check_output(self):
         place = core.CUDAPlace(0)
         if core.is_float16_supported(place):
-            self.check_output_with_place(place, atol=2e-1, check_dygraph=False)
+            self.check_output_with_place(place, check_dygraph=False)
 
     def test_check_grad_normal(self):
         place = core.CUDAPlace(0)
@@ -178,7 +175,6 @@ class TestFP16MulOp2(TestMulOp2):
                 place,
                 ['X', 'Y'],
                 'Out',
-                max_relative_error=0.9,
                 check_dygraph=False,
             )
 
@@ -189,7 +185,6 @@ class TestFP16MulOp2(TestMulOp2):
                 place,
                 ['Y'],
                 'Out',
-                max_relative_error=0.5,
                 no_grad_set=set("X"),
                 check_dygraph=False,
             )
@@ -201,11 +196,120 @@ class TestFP16MulOp2(TestMulOp2):
                 place,
                 ['X'],
                 'Out',
-                max_relative_error=0.9,
                 no_grad_set=set('Y'),
                 check_dygraph=False,
             )
 
 
+@unittest.skipIf(
+    not core.is_compiled_with_cuda()
+    or not core.is_bfloat16_supported(core.CUDAPlace(0)),
+    "core is not compiled with CUDA or not support bfloat16",
+)
+class TestMulBF16Op1(OpTest):
+    def setUp(self):
+        self.op_type = "mul"
+        self.init_dtype_type()
+        self.inputs = {
+            'X': np.random.random((20, 5)).astype(self.np_dtype),
+            'Y': np.random.random((5, 21)).astype(self.np_dtype),
+        }
+        self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])}
+
+        self.inputs['X'] = convert_float_to_uint16(self.inputs['X'])
+        self.inputs['Y'] = convert_float_to_uint16(self.inputs['Y'])
+        self.outputs['Out'] = convert_float_to_uint16(self.outputs['Out'])
+        self.place = core.CUDAPlace(0)
+
+    def init_dtype_type(self):
+        self.dtype = np.uint16
+        self.np_dtype = np.float32
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place, check_dygraph=False)
+
+    def test_check_grad_normal(self):
+        self.check_grad_with_place(
+            self.place, ['X', 'Y'], 'Out', check_dygraph=False
+        )
+
+    def test_check_grad_ingore_x(self):
+        self.check_grad_with_place(
+            self.place,
+            ['Y'],
+            'Out',
+            no_grad_set=set("X"),
+            check_dygraph=False,
+        )
+
+    def test_check_grad_ingore_y(self):
+        self.check_grad_with_place(
+            self.place,
+            ['X'],
+            'Out',
+            no_grad_set=set('Y'),
+            check_dygraph=False,
+        )
+
+
+@unittest.skipIf(
+    not core.is_compiled_with_cuda()
+    or not core.is_bfloat16_supported(core.CUDAPlace(0)),
+    "core is not compiled with CUDA or not support bfloat16",
+)
+class TestMulBF16Op2(TestMulBF16Op1):
+    def setUp(self):
+        self.op_type = "mul"
+        self.init_dtype_type()
+        self.inputs = {
+            'X': np.random.random((3, 4, 2, 9)).astype(self.np_dtype),
+            'Y': np.random.random((3, 6, 1, 2, 3)).astype(self.np_dtype),
+        }
+        self.attrs = {
+            'x_num_col_dims': 2,
+            'y_num_col_dims': 2,
+        }
+        result = np.dot(
+            self.inputs['X'].reshape(3 * 4, 2 * 9),
+            self.inputs['Y'].reshape(3 * 6, 1 * 2 * 3),
+        )
+        result = result.reshape(3, 4, 1, 2, 3)
+        self.outputs = {'Out': result}
+
+        self.inputs['X'] = convert_float_to_uint16(self.inputs['X'])
+        self.inputs['Y'] = convert_float_to_uint16(self.inputs['Y'])
+        self.outputs['Out'] = convert_float_to_uint16(self.outputs['Out'])
+        self.place = core.CUDAPlace(0)
+
+    def test_check_grad_normal(self):
+        self.check_grad_with_place(
+            self.place,
+            ['X', 'Y'],
+            'Out',
+            numeric_grad_delta=0.02,
+            check_dygraph=False,
+        )
+
+    def test_check_grad_ingore_x(self):
+        self.check_grad_with_place(
+            self.place,
+            ['Y'],
+            'Out',
+            numeric_grad_delta=0.02,
+            no_grad_set=set("X"),
+            check_dygraph=False,
+        )
+
+    def test_check_grad_ingore_y(self):
+        self.check_grad_with_place(
+            self.place,
+            ['X'],
+            'Out',
+            numeric_grad_delta=0.02,
+            no_grad_set=set('Y'),
+            check_dygraph=False,
+        )
+
+
 if __name__ == "__main__":
     unittest.main()
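
Note on the BF16 test setup above: the tests store bfloat16 tensors as raw uint16 bit patterns, which is why `init_dtype_type` sets `self.dtype = np.uint16` while the data is generated in float32 and then passed through `convert_float_to_uint16`. Below is a minimal, illustrative sketch of that conversion, assuming plain truncation of the float32 mantissa; the helper name `float32_to_bfloat16_bits` is hypothetical and not Paddle's API:

import numpy as np

def float32_to_bfloat16_bits(x):
    """Illustrative only: map float32 values to bfloat16 bit patterns.

    bfloat16 keeps float32's sign bit and all 8 exponent bits but only
    the top 7 mantissa bits, so its bit pattern is simply the high 16
    bits of the float32 word, stored here in a uint16 array (no rounding).
    """
    x = np.ascontiguousarray(x, dtype=np.float32)
    return (x.view(np.uint32) >> 16).astype(np.uint16)

# Round-tripping shows the precision loss (~3 significant decimal digits),
# which is why the BF16 gradient checks above loosen numeric_grad_delta.
a = np.random.random((20, 5)).astype(np.float32)
bits = float32_to_bfloat16_bits(a)
restored = (bits.astype(np.uint32) << 16).view(np.float32)
print(np.abs(a - restored).max())  # roughly 4e-3 for values in [0, 1)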