# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2022 NVIDIA Corporation. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

import numpy as np
from eager_op_test import OpTest, skip_check_grad_ci, skip_check_inplace_ci

import paddle
from paddle.fluid import core


def is_fused_gemm_epilogue_supported():
    if paddle.is_compiled_with_cuda() and not paddle.is_compiled_with_rocm():
        return hasattr(core.eager.ops, 'fused_gemm_epilogue')
    else:
        return False


def gelu(x):
    y_ref = (
        0.5
        * x
        * (1.0 + np.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * np.power(x, 3))))
    )
    return y_ref.astype(x.dtype)


def relu(x):
    mask = x > 0
    return x * mask


def get_output(X, Y, bias, act):
    out = np.dot(X, Y) + bias
    if act == 'relu':
        return relu(out)
    elif act == 'gelu':
        return gelu(out)
    else:
        return out
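
# Illustrative sketch only (not referenced by the tests below): the fused op
# is expected to agree with an unfused matmul + bias-add + activation pipeline
# built from public Paddle dygraph APIs. The helper name `_unfused_reference`
# is an assumption added for documentation; the tests themselves compare
# against the NumPy reference in get_output().
def _unfused_reference(x, y, bias, act, trans_x=False, trans_y=False):
    out = paddle.matmul(x, y, transpose_x=trans_x, transpose_y=trans_y) + bias
    if act == 'relu':
        return paddle.nn.functional.relu(out)
    elif act == 'gelu':
        # approximate=True matches the tanh-based gelu() reference above.
        return paddle.nn.functional.gelu(out, approximate=True)
    return out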

@skip_check_inplace_ci(reason="no inplace op")
class TestFuseGemmBase(OpTest):
    pass


@skip_check_grad_ci(reason="no grad op")
@unittest.skipIf(
    not core.is_compiled_with_cuda(), "core is not compiled with CUDA"
)
class TestFuseGemmEpilogueOpReluMMFP16(TestFuseGemmBase):
    def setUp(self):
        self.op_type = "fused_gemm_epilogue"
        self.place = core.CUDAPlace(0)
        self.init_dtype_type()
        self.inputs = {
            'X': np.random.random((8, 4)).astype(self.dtype) - 0.5,
            'Y': np.random.random((4, 128)).astype(self.dtype) - 0.5,
            'Bias': np.random.random((128,)).astype(self.dtype) - 0.5,
        }
        self.outputs = {
            'Out': get_output(
                self.inputs['X'], self.inputs['Y'], self.inputs['Bias'], 'relu'
            )
        }
        self.attrs = {"activation": 'relu'}

    def init_dtype_type(self):
        self.dtype = np.float16
        self.atol = 1e-3

    def test_check_output(self):
        if self.dtype == np.float16 and not core.is_float16_supported(
            self.place
        ):
            return
        self.check_output_with_place(
            self.place, atol=self.atol, check_dygraph=False
        )


@skip_check_grad_ci(reason="no grad op")
@unittest.skipIf(
    not core.is_compiled_with_cuda(), "core is not compiled with CUDA"
)
class TestFuseGemmEpilogueOpReluMMFP32(TestFuseGemmEpilogueOpReluMMFP16):
    def init_dtype_type(self):
        self.dtype = np.single
        self.atol = 1e-6


@skip_check_grad_ci(reason="no grad op")
@unittest.skipIf(
    not core.is_compiled_with_cuda(), "core is not compiled with CUDA"
)
class TestFuseGemmEpilogueOpReluMMFP64(TestFuseGemmEpilogueOpReluMMFP16):
    def init_dtype_type(self):
        self.dtype = np.double
        self.atol = 1e-6


@skip_check_grad_ci(reason="no grad op")
@unittest.skipIf(
    not core.is_compiled_with_cuda(), "core is not compiled with CUDA"
)
class TestFuseGemmEpilogueOpReluMTMFP16(TestFuseGemmBase):
    def setUp(self):
        self.op_type = "fused_gemm_epilogue"
        self.place = core.CUDAPlace(0)
        self.init_dtype_type()
        self.inputs = {
            'X': np.random.random((4, 8)).astype(self.dtype) - 0.5,
            'Y': np.random.random((4, 128)).astype(self.dtype) - 0.5,
            'Bias': np.random.random((128,)).astype(self.dtype) - 0.5,
        }
        self.outputs = {
            'Out': get_output(
                self.inputs['X'].T,
                self.inputs['Y'],
                self.inputs['Bias'],
                'relu',
            )
        }
        self.attrs = {'trans_x': True, "activation": 'relu'}

    def init_dtype_type(self):
        self.dtype = np.float16
        self.atol = 1e-3

    def test_check_output(self):
        if self.dtype == np.float16 and not core.is_float16_supported(
            self.place
        ):
            return
        self.check_output_with_place(
            self.place, atol=self.atol, check_dygraph=False
        )


@skip_check_grad_ci(reason="no grad op")
@unittest.skipIf(
    not core.is_compiled_with_cuda(), "core is not compiled with CUDA"
)
class TestFuseGemmEpilogueOpReluMTMFP32(TestFuseGemmEpilogueOpReluMTMFP16):
    def init_dtype_type(self):
        self.dtype = np.single
        self.atol = 1e-6


@skip_check_grad_ci(reason="no grad op")
@unittest.skipIf(
    not core.is_compiled_with_cuda(), "core is not compiled with CUDA"
)
class TestFuseGemmEpilogueOpReluMTMFP64(TestFuseGemmEpilogueOpReluMTMFP16):
    def init_dtype_type(self):
        self.dtype = np.double
        self.atol = 1e-6


@skip_check_grad_ci(reason="no grad op")
@unittest.skipIf(
    not core.is_compiled_with_cuda(), "core is not compiled with CUDA"
)
class TestFuseGemmEpilogueOpReluMMTFP16(TestFuseGemmBase):
    def setUp(self):
        self.op_type = "fused_gemm_epilogue"
        self.place = core.CUDAPlace(0)
        self.init_dtype_type()
        self.inputs = {
            'X': np.random.random((8, 4)).astype(self.dtype) - 0.5,
            'Y': np.random.random((128, 4)).astype(self.dtype) - 0.5,
            'Bias': np.random.random((128,)).astype(self.dtype) - 0.5,
        }
        self.outputs = {
            'Out': get_output(
                self.inputs['X'],
                self.inputs['Y'].T,
                self.inputs['Bias'],
                'relu',
            )
        }
        self.attrs = {'trans_y': True, "activation": 'relu'}

    def init_dtype_type(self):
        self.dtype = np.float16
        self.atol = 1e-3

    def test_check_output(self):
        if self.dtype == np.float16 and not core.is_float16_supported(
            self.place
        ):
            return
        self.check_output_with_place(
            self.place, atol=self.atol, check_dygraph=False
        )


@skip_check_grad_ci(reason="no grad op")
@unittest.skipIf(
    not core.is_compiled_with_cuda(), "core is not compiled with CUDA"
)
class TestFuseGemmEpilogueOpReluMMTFP32(TestFuseGemmEpilogueOpReluMMTFP16):
    def init_dtype_type(self):
        self.dtype = np.single
        self.atol = 1e-6


@skip_check_grad_ci(reason="no grad op")
@unittest.skipIf(
    not core.is_compiled_with_cuda(), "core is not compiled with CUDA"
)
class TestFuseGemmEpilogueOpReluMMTFP64(TestFuseGemmEpilogueOpReluMMTFP16):
    def init_dtype_type(self):
        self.dtype = np.double
        self.atol = 1e-6


@skip_check_grad_ci(reason="no grad op")
@unittest.skipIf(
    not core.is_compiled_with_cuda(), "core is not compiled with CUDA"
)
class TestFuseGemmEpilogueOpReluMTMTFP16(TestFuseGemmBase):
    def setUp(self):
        self.op_type = "fused_gemm_epilogue"
        self.place = core.CUDAPlace(0)
        self.init_dtype_type()
        self.inputs = {
            'X': np.random.random((4, 8)).astype(self.dtype) - 0.5,
            'Y': np.random.random((128, 4)).astype(self.dtype) - 0.5,
            'Bias': np.random.random((128,)).astype(self.dtype) - 0.5,
        }
        self.outputs = {
            'Out': get_output(
                self.inputs['X'].T,
                self.inputs['Y'].T,
                self.inputs['Bias'],
                'relu',
            )
        }
        self.attrs = {'trans_x': True, 'trans_y': True, "activation": 'relu'}

    def init_dtype_type(self):
        self.dtype = np.float16
        self.atol = 1e-3

    def test_check_output(self):
        if self.dtype == np.float16 and not core.is_float16_supported(
            self.place
        ):
            return
        self.check_output_with_place(
            self.place, atol=self.atol, check_dygraph=False
        )


@skip_check_grad_ci(reason="no grad op")
@unittest.skipIf(
    not core.is_compiled_with_cuda(), "core is not compiled with CUDA"
)
class TestFuseGemmEpilogueOpReluMTMTFP32(TestFuseGemmEpilogueOpReluMTMTFP16):
    def init_dtype_type(self):
        self.dtype = np.single
        self.atol = 1e-6


@skip_check_grad_ci(reason="no grad op")
@unittest.skipIf(
    not core.is_compiled_with_cuda(), "core is not compiled with CUDA"
)
class TestFuseGemmEpilogueOpReluMTMTFP64(TestFuseGemmEpilogueOpReluMTMTFP16):
    def init_dtype_type(self):
        self.dtype = np.double
        self.atol = 1e-6

@skip_check_grad_ci(reason="no grad op")
@unittest.skipIf(
    not core.is_compiled_with_cuda(), "core is not compiled with CUDA"
)
class TestFuseGemmEpilogueOpReluMMFP16MultiDimX(TestFuseGemmBase):
    def setUp(self):
        self.op_type = "fused_gemm_epilogue"
        self.place = core.CUDAPlace(0)
        self.init_dtype_type()
        self.inputs = {
            'X': np.random.random((2, 2, 8, 4)).astype(self.dtype) - 0.5,
            'Y': np.random.random((4, 128)).astype(self.dtype) - 0.5,
            'Bias': np.random.random((128,)).astype(self.dtype) - 0.5,
        }
        self.outputs = {
            'Out': get_output(
                self.inputs['X'].reshape((-1, 4)),
                self.inputs['Y'],
                self.inputs['Bias'],
                'relu',
            ).reshape((2, 2, 8, 128))
        }
        self.attrs = {"activation": 'relu'}

    def init_dtype_type(self):
        self.dtype = np.float16
        self.atol = 1e-3

    def test_check_output(self):
        if self.dtype == np.float16 and not core.is_float16_supported(
            self.place
        ):
            return
        self.check_output_with_place(
            self.place, atol=self.atol, check_dygraph=False
        )


@skip_check_grad_ci(reason="no grad op")
@unittest.skipIf(
    not core.is_compiled_with_cuda(), "core is not compiled with CUDA"
)
class TestFuseGemmEpilogueOpReluMMFP32MultiDimX(
    TestFuseGemmEpilogueOpReluMMFP16MultiDimX
):
    def init_dtype_type(self):
        self.dtype = np.single
        self.atol = 1e-6


@skip_check_grad_ci(reason="no grad op")
@unittest.skipIf(
    not core.is_compiled_with_cuda(), "core is not compiled with CUDA"
)
class TestFuseGemmEpilogueOpReluMMFP64MultiDimX(
    TestFuseGemmEpilogueOpReluMMFP16MultiDimX
):
    def init_dtype_type(self):
        self.dtype = np.double
        self.atol = 1e-6


@skip_check_grad_ci(reason="no grad op")
@unittest.skipIf(
    not core.is_compiled_with_cuda(), "core is not compiled with CUDA"
)
class TestFuseGemmEpilogueOpReluMTMFP16MultiDimX(TestFuseGemmBase):
    def setUp(self):
        self.op_type = "fused_gemm_epilogue"
        self.place = core.CUDAPlace(0)
        self.init_dtype_type()
        self.inputs = {
            'X': np.random.random((4, 2, 2, 8)).astype(self.dtype) - 0.5,
            'Y': np.random.random((4, 128)).astype(self.dtype) - 0.5,
            'Bias': np.random.random((128,)).astype(self.dtype) - 0.5,
        }
        self.outputs = {
            'Out': get_output(
                self.inputs['X'].reshape((4, -1)).T,
                self.inputs['Y'],
                self.inputs['Bias'],
                'relu',
            ).reshape((2, 2, 8, 128))
        }
        self.attrs = {'trans_x': True, "activation": 'relu'}

    def init_dtype_type(self):
        self.dtype = np.float16
        self.atol = 1e-3

    def test_check_output(self):
        if self.dtype == np.float16 and not core.is_float16_supported(
            self.place
        ):
            return
        self.check_output_with_place(
            self.place, atol=self.atol, check_dygraph=False
        )


@skip_check_grad_ci(reason="no grad op")
@unittest.skipIf(
    not core.is_compiled_with_cuda(), "core is not compiled with CUDA"
)
class TestFuseGemmEpilogueOpReluMTMFP32MultiDimX(
    TestFuseGemmEpilogueOpReluMTMFP16MultiDimX
):
    def init_dtype_type(self):
        self.dtype = np.single
        self.atol = 1e-6


@skip_check_grad_ci(reason="no grad op")
@unittest.skipIf(
    not core.is_compiled_with_cuda(), "core is not compiled with CUDA"
)
class TestFuseGemmEpilogueOpReluMTMFP64MultiDimX(
    TestFuseGemmEpilogueOpReluMTMFP16MultiDimX
):
    def init_dtype_type(self):
        self.dtype = np.double
        self.atol = 1e-6


@skip_check_grad_ci(reason="no grad op")
@unittest.skipIf(
    not core.is_compiled_with_cuda(), "core is not compiled with CUDA"
)
class TestFuseGemmEpilogueOpGeluMMFP16(TestFuseGemmBase):
    def setUp(self):
        self.op_type = "fused_gemm_epilogue"
        self.place = core.CUDAPlace(0)
        self.init_dtype_type()
        self.inputs = {
            'X': np.random.random((8, 4)).astype(self.dtype) - 0.5,
            'Y': np.random.random((4, 128)).astype(self.dtype) - 0.5,
            'Bias': np.random.random((128,)).astype(self.dtype) - 0.5,
        }
        self.attrs = {"activation": 'gelu'}
        self.outputs = {
            'Out': get_output(
                self.inputs['X'], self.inputs['Y'], self.inputs['Bias'], 'gelu'
            )
        }

    def init_dtype_type(self):
        self.dtype = np.float16
        self.atol = 1e-3

    def test_check_output(self):
        if self.dtype == np.float16 and not core.is_float16_supported(
            self.place
        ):
            return
        self.check_output_with_place(
            self.place, atol=self.atol, check_dygraph=False
        )


@skip_check_grad_ci(reason="no grad op")
@unittest.skipIf(
    not core.is_compiled_with_cuda(), "core is not compiled with CUDA"
)
class TestFuseGemmEpilogueOpGeluMMFP32(TestFuseGemmEpilogueOpGeluMMFP16):
    def init_dtype_type(self):
        self.dtype = np.single
        self.atol = 1e-6


@skip_check_grad_ci(reason="no grad op")
@unittest.skipIf(
    not core.is_compiled_with_cuda(), "core is not compiled with CUDA"
)
class TestFuseGemmEpilogueOpGeluMMFP64(TestFuseGemmEpilogueOpGeluMMFP16):
    def init_dtype_type(self):
        self.dtype = np.double
        self.atol = 1e-6


@skip_check_grad_ci(reason="no grad op")
@unittest.skipIf(
    not core.is_compiled_with_cuda(), "core is not compiled with CUDA"
)
class TestFuseGemmEpilogueOpNoneMMFP16(TestFuseGemmBase):
    def setUp(self):
        self.op_type = "fused_gemm_epilogue"
        self.place = core.CUDAPlace(0)
        self.init_dtype_type()
        self.inputs = {
            'X': np.random.random((8, 4)).astype(self.dtype) - 0.5,
            'Y': np.random.random((4, 128)).astype(self.dtype) - 0.5,
            'Bias': np.random.random((128,)).astype(self.dtype) - 0.5,
        }
        self.attrs = {"activation": 'none'}
        self.outputs = {
            'Out': get_output(
                self.inputs['X'], self.inputs['Y'], self.inputs['Bias'], 'none'
            )
        }

    def init_dtype_type(self):
        self.dtype = np.float16
        self.atol = 1e-3

    def test_check_output(self):
        if self.dtype == np.float16 and not core.is_float16_supported(
            self.place
        ):
            return
        self.check_output_with_place(
            self.place, atol=self.atol, check_dygraph=False
        )


@skip_check_grad_ci(reason="no grad op")
@unittest.skipIf(
    not core.is_compiled_with_cuda(), "core is not compiled with CUDA"
)
class TestFuseGemmEpilogueOpNoneMMFP32(TestFuseGemmEpilogueOpNoneMMFP16):
    def init_dtype_type(self):
        self.dtype = np.single
        self.atol = 1e-6


@skip_check_grad_ci(reason="no grad op")
@unittest.skipIf(
    not core.is_compiled_with_cuda(), "core is not compiled with CUDA"
)
class TestFuseGemmEpilogueOpNoneMMFP64(TestFuseGemmEpilogueOpNoneMMFP16):
    def init_dtype_type(self):
        self.dtype = np.double
        self.atol = 1e-6

def matmul(x, y, bias, trans_x, trans_y):
    x = np.array(x)
    if trans_x:
        x = np.ascontiguousarray(np.transpose(x))
    if trans_y:
        y = np.ascontiguousarray(np.transpose(y))
    z = np.matmul(x, y)
    if bias is None:
        return z
    else:
        return z + bias


def matmul_grad(x, y, bias, dz, trans_x, trans_y):
    # NumPy reference for the backward pass of z = op(x) @ op(y) + bias,
    # where op applies the optional transpose. dz is the upstream gradient.
    if trans_x:
        if trans_y:
            dx = matmul(y, dz, None, True, True)
            dy = matmul(dz, x, None, True, True)
        else:
            dx = matmul(y, dz, None, False, True)
            dy = matmul(x, dz, None, False, False)
    else:
        if trans_y:
            dx = matmul(dz, y, None, False, False)
            dy = matmul(dz, x, None, True, False)
        else:
            dx = matmul(dz, y, None, False, True)
            dy = matmul(x, dz, None, True, False)
    if bias is None:
        dbias = None
    else:
        dbias = np.sum(dz, axis=0, keepdims=False)
    return dx, dy, dbias
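
# Illustrative sketch only (not invoked by the test suite): matmul_grad()
# above can be sanity-checked against central finite differences of matmul().
# The helper name `_finite_difference_dx` and the eps value are assumptions
# added for documentation purposes.
def _finite_difference_dx(x, y, bias, dz, trans_x, trans_y, eps=1e-6):
    # Numerically estimate d(sum(z * dz)) / dx, element by element.
    dx = np.zeros_like(x)
    for idx in np.ndindex(x.shape):
        x_pos, x_neg = x.copy(), x.copy()
        x_pos[idx] += eps
        x_neg[idx] -= eps
        z_pos = matmul(x_pos, y, bias, trans_x, trans_y)
        z_neg = matmul(x_neg, y, bias, trans_x, trans_y)
        dx[idx] = np.sum((z_pos - z_neg) * dz) / (2.0 * eps)
    return dx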

@unittest.skipIf(
    not is_fused_gemm_epilogue_supported(),
    "fused_gemm_epilogue is only supported when CUDA version >= 11.6",
)
class TestEagerFusedGemmEpilogue(unittest.TestCase):
    def setUp(self):
        paddle.set_device('gpu')

    def test_case_act(self):
        paddle.disable_static()
        x_np = np.random.random((8, 4)).astype(np.float64) - 0.5
        y_np = np.random.random((4, 128)).astype(np.float64) - 0.5
        bias_np = np.random.random((128,)).astype(np.float64) - 0.5
        x = paddle.to_tensor(x_np)
        y = paddle.to_tensor(y_np)
        bias = paddle.to_tensor(bias_np)
        x.stop_gradient = False
        y.stop_gradient = False

        out1 = core.eager.ops.fused_gemm_epilogue(
            x, y, bias, 'trans_x', False, 'trans_y', False, 'activation', 'none'
        )
        out2 = core.eager.ops.fused_gemm_epilogue(
            x, y, bias, 'trans_x', False, 'trans_y', False, 'activation', 'relu'
        )
        out3 = core.eager.ops.fused_gemm_epilogue(
            x, y, bias, 'trans_x', False, 'trans_y', False, 'activation', 'gelu'
        )
        out_np1 = get_output(x_np, y_np, bias_np, 'none')
        out_np2 = get_output(x_np, y_np, bias_np, 'relu')
        out_np3 = get_output(x_np, y_np, bias_np, 'gelu')

        np.testing.assert_allclose(out1, out_np1, rtol=1e-05)
        np.testing.assert_allclose(out2, out_np2, rtol=1e-05)
        np.testing.assert_allclose(out3, out_np3, rtol=1e-05)

        out_grad_np1 = np.random.randint(
            low=-20, high=20, size=out_np1.shape
        ).astype(np.float64)
        paddle.autograd.backward(
            out1, grad_tensors=[paddle.to_tensor(out_grad_np1)]
        )

        x_grad_np, y_grad_np, bias_grad_np = matmul_grad(
            x_np, y_np, bias_np, out_grad_np1, False, False
        )
        np.testing.assert_allclose(x.grad.numpy(), x_grad_np, rtol=1e-05)
        self.assertEqual(y_grad_np.shape, y_np.shape)
        np.testing.assert_allclose(y.grad.numpy(), y_grad_np, rtol=1e-05)

        paddle.enable_static()


if __name__ == "__main__":
    paddle.enable_static()
    np.random.seed(0)
    unittest.main()