# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2022 NVIDIA Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Test cases for the fuse_gemm_epilogue pass."""

from __future__ import print_function

import paddle
import os
import unittest
import numpy as np
import paddle.fluid.core as core


def compare(ref, res, atol, rtol):
    """Return True if every element of `res` is within `atol + rtol * |ref|` of `ref`."""
    ref = np.array(ref).flatten()
    res = np.array(res).flatten()
    tmp_ref = ref.astype(np.float64)
    tol = atol + rtol * abs(tmp_ref)

    diff = abs(res - ref)
    indices = np.transpose(np.where(diff > tol))
    return len(indices) == 0


def verify_node_count(graph, node_name, target_count):
    """Return True if `graph` contains exactly `target_count` nodes named `node_name`."""
    count = 0
    for node in graph.nodes():
        if node.name() == node_name:
            count += 1
    return count == target_count


class MultiFCLayer(paddle.nn.Layer):
    """Three Linear layers plus activations; their matmul + bias-add (+ activation)
    patterns are the epilogues the fusion pass is expected to rewrite."""

    def __init__(self, hidden, Activation):
        super(MultiFCLayer, self).__init__()
        self.linear1 = paddle.nn.Linear(hidden, 4 * hidden)
        self.linear2 = paddle.nn.Linear(4 * hidden, hidden)
        self.linear3 = paddle.nn.Linear(hidden, hidden)

        self.relu1 = Activation()
        self.relu2 = Activation()
        self.relu3 = Activation()

    def forward(self, x, matmul_y, ele_y):
        output = self.linear1(x)
        output = self.relu1(output)
        output = self.linear2(output)

        output1 = paddle.matmul(output, matmul_y)
        output = self.linear3(output)
        output = self.relu2(output)

        output = paddle.matmul(output, matmul_y)
        output = paddle.add(output, ele_y)
        output = self.relu3(output)
        output = paddle.add(output, output1)
        return output


@unittest.skipIf(not core.is_compiled_with_cuda(),
                 "core is not compiled with CUDA")
class TestFuseGemmEpilogueFWDBase(unittest.TestCase):
    def setUp(self):
        self.batch = 64
        self.seqlen = 128
        self.hidden = 768

        paddle.enable_static()

        self.main_prog = paddle.static.Program()
        self.startup_prog = paddle.static.Program()

        with paddle.static.program_guard(self.main_prog, self.startup_prog):
            data = paddle.static.data(
                name="_data",
                shape=[-1, self.seqlen, self.hidden],
                dtype='float32')
            matmul_y = paddle.static.data(
                name="_matmul_y",
                shape=[1, self.hidden, self.hidden],
                dtype='float32')
            ele_y = paddle.static.data(
                name="_ele_y", shape=[self.hidden, ], dtype='float32')

            multi_layer = MultiFCLayer(self.hidden, self._get_act_type()[0])
            with paddle.static.amp.fp16_guard():
                out = multi_layer(data, matmul_y, ele_y)
                self.loss = paddle.mean(out)

        self.data_arr = np.random.random(
            (self.batch, self.seqlen, self.hidden)).astype("float32") - 0.5
        self.matmul_y_arr = np.random.random(
            (1, self.hidden, self.hidden)).astype("float32") - 0.5
        self.ele_y_arr = np.random.random(
            (self.hidden, )).astype("float32") - 0.5

        self.place = paddle.CUDAPlace(0)
        self.exe = paddle.static.Executor(self.place)
        self.exe.run(self.startup_prog)

        self._pre_test_hooks()

        self.feed = {
            "_data": self.data_arr,
            "_matmul_y": self.matmul_y_arr,
            "_ele_y": self.ele_y_arr
        }
        # Reference result from the uncompiled (unfused) program.
        self.reference = self.exe.run(self.main_prog,
                                      feed=self.feed,
                                      fetch_list=[self.loss.name])

    @unittest.skipIf(not core.is_compiled_with_cuda(),
                     "core is not compiled with CUDA")
    def _test_output(self):
        build_strategy = paddle.static.BuildStrategy()
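        # Enable the GEMM epilogue fusion pass; the compiled graph is expected
        # to contain fused_gemm_epilogue nodes (verified below) in place of the
        # separate matmul and bias-add ops.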
        build_strategy.fuse_gemm_epilogue = True
        program = paddle.static.CompiledProgram(self.main_prog)
        program = program.with_data_parallel(
            loss_name=self.loss.name,
            build_strategy=build_strategy,
            places=paddle.static.cuda_places())

        result = self.exe.run(program,
                              feed=self.feed,
                              fetch_list=[self.loss.name])
        self.assertTrue(
            compare(self.reference, result, self.atol, self.rtol),
            "[{}] outputs are mismatched.".format(type(self).__name__))
        self.assertTrue(
            verify_node_count(program._graph, "fused_gemm_epilogue", 3),
            "[{}] The number of fused_gemm_epilogue is mismatched in the computing graph.".
            format(type(self).__name__))
        act_fwd_name = self._get_act_type()[1]
        self.assertTrue(
            verify_node_count(program._graph, act_fwd_name, 1),
            "[{}] The number of {} is mismatched in the computing graph.".
            format(type(self).__name__, act_fwd_name))

    def _pre_test_hooks(self):
        self.atol = 1e-4
        self.rtol = 1e-3

    def _get_act_type(self):
        return paddle.nn.ReLU, "relu"


@unittest.skipIf(not core.is_compiled_with_cuda(),
                 "core is not compiled with CUDA")
class TestFuseGemmEpilogueReluFWDFP32(TestFuseGemmEpilogueFWDBase):
    def _pre_test_hooks(self):
        self.atol = 1e-3
        self.rtol = 1e-2

    def _get_act_type(self):
        return paddle.nn.ReLU, "relu"

    def test_output(self):
        self._test_output()


@unittest.skipIf(not core.is_compiled_with_cuda(),
                 "core is not compiled with CUDA")
class TestFuseGemmEpilogueReluFWDFP16(TestFuseGemmEpilogueReluFWDFP32):
    def _pre_test_hooks(self):
        self.atol = 1e-3
        self.rtol = 1e-2

        # Cast the program and its parameters to FP16 so the fused kernels are
        # exercised in half precision.
        fp16_var_list = paddle.static.amp.cast_model_to_fp16(self.main_prog)
        paddle.static.amp.cast_parameters_to_fp16(
            self.place, self.main_prog, to_fp16_var_names=fp16_var_list)

        self.data_arr = self.data_arr.astype("float16")
        self.matmul_y_arr = self.matmul_y_arr.astype("float16")
        self.ele_y_arr = self.ele_y_arr.astype("float16")


@unittest.skipIf(not core.is_compiled_with_cuda(),
                 "core is not compiled with CUDA")
class TestFuseGemmEpilogueGeluFWDFP32(TestFuseGemmEpilogueFWDBase):
    def _pre_test_hooks(self):
        self.atol = 1e-4
        self.rtol = 1e-3

    def _get_act_type(self):
        return paddle.nn.GELU, "gelu"

    def test_output(self):
        self._test_output()


@unittest.skipIf(not core.is_compiled_with_cuda(),
                 "core is not compiled with CUDA")
class TestFuseGemmEpilogueGeluFWDFP16(TestFuseGemmEpilogueGeluFWDFP32):
    def _pre_test_hooks(self):
        self.atol = 1e-3
        self.rtol = 1e-2

        fp16_var_list = paddle.static.amp.cast_model_to_fp16(self.main_prog)
        paddle.static.amp.cast_parameters_to_fp16(
            self.place, self.main_prog, to_fp16_var_names=fp16_var_list)

        self.data_arr = self.data_arr.astype("float16")
        self.matmul_y_arr = self.matmul_y_arr.astype("float16")
        self.ele_y_arr = self.ele_y_arr.astype("float16")


@unittest.skipIf(not core.is_compiled_with_cuda(),
                 "core is not compiled with CUDA")
class TestFuseGemmEpilogueBWDBase(unittest.TestCase):
    def setUp(self):
        self.batch = 64
        self.seqlen = 128
        self.hidden = 768

        paddle.enable_static()

        self.main_prog = paddle.static.Program()
        self.startup_prog = paddle.static.Program()

        with paddle.static.program_guard(self.main_prog, self.startup_prog):
            data = paddle.static.data(
                name="_data",
                shape=[-1, self.seqlen, self.hidden],
                dtype='float32')
            matmul_y = paddle.static.data(
                name="_matmul_y",
                shape=[1, self.hidden, self.hidden],
                dtype='float32')
            ele_y = paddle.static.data(
                name="_ele_y", shape=[self.hidden, ], dtype='float32')

            multi_layer = MultiFCLayer(self.hidden, self._get_act_type()[0])
            with paddle.static.amp.fp16_guard():
                out = multi_layer(data, matmul_y, ele_y)
                self.loss = paddle.mean(out)
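            # Append gradient ops so the backward fused_gemm_epilogue_grad
            # kernels can be checked as well.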
            paddle.static.append_backward(loss=self.loss)

        self.data_arr = np.random.random(
            (self.batch, self.seqlen, self.hidden)).astype("float32") - 0.5
        self.matmul_y_arr = np.random.random(
            (1, self.hidden, self.hidden)).astype("float32") - 0.5
        self.ele_y_arr = np.random.random(
            (self.hidden, )).astype("float32") - 0.5

        self.place = paddle.CUDAPlace(0)
        self.exe = paddle.static.Executor(self.place)
        self.exe.run(self.startup_prog)

        self._pre_test_hooks()

        self.feed = {
            "_data": self.data_arr,
            "_matmul_y": self.matmul_y_arr,
            "_ele_y": self.ele_y_arr
        }
        # Fetch the loss and the weight/bias gradients of every Linear layer.
        self.fetch = [
            self.loss.name,
            '{}.w_0@GRAD'.format(multi_layer.linear1.full_name()),
            '{}.b_0@GRAD'.format(multi_layer.linear1.full_name()),
            '{}.w_0@GRAD'.format(multi_layer.linear2.full_name()),
            '{}.b_0@GRAD'.format(multi_layer.linear2.full_name()),
            '{}.w_0@GRAD'.format(multi_layer.linear3.full_name()),
            '{}.b_0@GRAD'.format(multi_layer.linear3.full_name())
        ]
        # Reference results from the uncompiled (unfused) program.
        self.outs_ref = self.exe.run(self.main_prog,
                                     feed=self.feed,
                                     fetch_list=self.fetch)

    @unittest.skipIf(not core.is_compiled_with_cuda(),
                     "core is not compiled with CUDA")
    def _test_output(self):
        build_strategy = paddle.static.BuildStrategy()
        build_strategy.fuse_gemm_epilogue = True
        program = paddle.static.CompiledProgram(self.main_prog)
        program = program.with_data_parallel(
            loss_name=self.loss.name,
            build_strategy=build_strategy,
            places=paddle.static.cuda_places())

        outs_res = self.exe.run(program, feed=self.feed, fetch_list=self.fetch)

        for ref, res in zip(self.outs_ref, outs_res):
            self.assertTrue(
                compare(ref, res, self.atol, self.rtol),
                "[{}] output is mismatched.".format(type(self).__name__))

        self.assertTrue(
            verify_node_count(program._graph, "fused_gemm_epilogue", 3),
            "[{}] The number of fused_gemm_epilogue is mismatched in the computing graph.".
            format(type(self).__name__))
        self.assertTrue(
            verify_node_count(program._graph, "fused_gemm_epilogue_grad", 3),
            "[{}] The number of fused_gemm_epilogue_grad is mismatched in the computing graph.".
            format(type(self).__name__))

        _, act_fwd_name, act_bwd_name = self._get_act_type()
        self.assertTrue(
            verify_node_count(program._graph, act_fwd_name, 1),
            "[{}] The number of {} is mismatched in the computing graph.".
            format(type(self).__name__, act_fwd_name))
        self.assertTrue(
            verify_node_count(program._graph, act_bwd_name, 2),
            "[{}] The number of {} is mismatched in the computing graph.".
            format(type(self).__name__, act_bwd_name))
    def _pre_test_hooks(self):
        self.atol = 1e-4
        self.rtol = 1e-3

    def _get_act_type(self):
        return paddle.nn.ReLU, "relu", "relu_grad"


@unittest.skipIf(not core.is_compiled_with_cuda(),
                 "core is not compiled with CUDA")
class TestFuseGemmEpilogueReLUBWDFP32(TestFuseGemmEpilogueBWDBase):
    def _pre_test_hooks(self):
        self.atol = 1e-4
        self.rtol = 1e-3

    def _get_act_type(self):
        return paddle.nn.ReLU, "relu", "relu_grad"

    def test_output(self):
        self._test_output()


@unittest.skipIf(not core.is_compiled_with_cuda(),
                 "core is not compiled with CUDA")
class TestFuseGemmEpilogueReLUBWDFP16(TestFuseGemmEpilogueReLUBWDFP32):
    def _pre_test_hooks(self):
        self.atol = 1e-3
        self.rtol = 1e-2

        fp16_var_list = paddle.static.amp.cast_model_to_fp16(self.main_prog)
        paddle.static.amp.cast_parameters_to_fp16(
            self.place, self.main_prog, to_fp16_var_names=fp16_var_list)

        self.data_arr = self.data_arr.astype("float16")
        self.matmul_y_arr = self.matmul_y_arr.astype("float16")
        self.ele_y_arr = self.ele_y_arr.astype("float16")


@unittest.skipIf(not core.is_compiled_with_cuda(),
                 "core is not compiled with CUDA")
class TestFuseGemmEpilogueGeLUBWDFP32(TestFuseGemmEpilogueBWDBase):
    def _pre_test_hooks(self):
        self.atol = 5e-4
        self.rtol = 1e-3

    def _get_act_type(self):
        return paddle.nn.GELU, "gelu", "gelu_grad"

    def test_output(self):
        self._test_output()


@unittest.skipIf(not core.is_compiled_with_cuda(),
                 "core is not compiled with CUDA")
class TestFuseGemmEpilogueGeLUBWDFP16(TestFuseGemmEpilogueGeLUBWDFP32):
    def _pre_test_hooks(self):
        self.atol = 1e-3
        self.rtol = 1e-2

        fp16_var_list = paddle.static.amp.cast_model_to_fp16(self.main_prog)
        paddle.static.amp.cast_parameters_to_fp16(
            self.place, self.main_prog, to_fp16_var_names=fp16_var_list)

        self.data_arr = self.data_arr.astype("float16")
        self.matmul_y_arr = self.matmul_y_arr.astype("float16")
        self.ele_y_arr = self.ele_y_arr.astype("float16")


if __name__ == "__main__":
    np.random.seed(0)
    unittest.main()