From 7085cb971b3709caf33354bd1c0d1248a4b46ab1 Mon Sep 17 00:00:00 2001 From: taixiurong Date: Thu, 8 Sep 2022 16:11:20 +0800 Subject: [PATCH] =?UTF-8?q?xpu-paddlepaddle-40=20[=E4=BB=BB=E5=8A=A1]=20fu?= =?UTF-8?q?sed=5Fgemm=5Fepilogue=20=E6=94=AF=E6=8C=81xpu=20=20(#45706)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add gemm_epilogue * xpu-paddlepaddle-40 [任务] fused_gemm_epilogue 支持 test=kunlun --- paddle/fluid/operators/fused/CMakeLists.txt | 1 + .../operators/fused/fused_gemm_epilogue_op.cc | 7 +- .../operators/fused/fused_gemm_epilogue_op.h | 2 + .../fused/fused_gemm_epilogue_op_xpu.cc | 241 ++++++++++++++ .../fluid/platform/device/xpu/xpu2_op_list.h | 9 +- .../test_fused_gemm_epilogue_grad_op_xpu.py | 125 ++++++++ .../xpu/test_fused_gemm_epilogue_op_xpu.py | 293 ++++++++++++++++++ 7 files changed, 673 insertions(+), 5 deletions(-) create mode 100644 paddle/fluid/operators/fused/fused_gemm_epilogue_op_xpu.cc create mode 100644 python/paddle/fluid/tests/unittests/xpu/test_fused_gemm_epilogue_grad_op_xpu.py create mode 100644 python/paddle/fluid/tests/unittests/xpu/test_fused_gemm_epilogue_op_xpu.py diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt index 02a3f4d7a0..f2a4b68765 100755 --- a/paddle/fluid/operators/fused/CMakeLists.txt +++ b/paddle/fluid/operators/fused/CMakeLists.txt @@ -36,6 +36,7 @@ op_library(fusion_lstm_op) if(WITH_XPU) op_library(resnet_basic_block_op) op_library(resnet_unit_op) + op_library(fused_gemm_epilogue_op) endif() if(WITH_GPU OR WITH_ROCM) diff --git a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc index f1eeae8651..f9366bace3 100644 --- a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc +++ b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc @@ -392,14 +392,13 @@ class FusedGemmEpilogueOpGradMaker : public framework::SingleGradOpMaker { protected: void Apply(GradOpPtr op) const override { const auto& act_type = this->template Attr("activation"); - PADDLE_ENFORCE_EQ( - act_type, - "none", - phi::errors::InvalidArgument("The activation should be none.")); op->SetType(this->ForwardOpType() + "_grad"); op->SetInput("X", this->Input("X")); op->SetInput("Y", this->Input("Y")); + if (act_type != "none") { + op->SetInput("ReserveSpace", this->Input("ReserveSpace")); + } op->SetInput("DOut", this->OutputGrad("Out")); op->SetOutput("DX", this->InputGrad("X")); diff --git a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.h b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.h index 160bda9081..62701fe43f 100644 --- a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.h +++ b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once +#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060 #include #include @@ -321,3 +322,4 @@ class GemmEpilogueAlgoCache { } // namespace operators } // namespace paddle +#endif diff --git a/paddle/fluid/operators/fused/fused_gemm_epilogue_op_xpu.cc b/paddle/fluid/operators/fused/fused_gemm_epilogue_op_xpu.cc new file mode 100644 index 0000000000..2b4b03e32c --- /dev/null +++ b/paddle/fluid/operators/fused/fused_gemm_epilogue_op_xpu.cc @@ -0,0 +1,241 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Copyright (c) 2022 NVIDIA Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/framework/scope_guard.h" +#include "paddle/fluid/platform/float16.h" +#include "paddle/phi/kernels/xpu/xpu_api_wrapper.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class FusedGemmEpilogueXPUKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = ctx.template device_context(); + + const Tensor* x = ctx.Input("X"); + const Tensor* y = ctx.Input("Y"); + const Tensor* bias = ctx.Input("Bias"); + + Tensor* out = ctx.Output("Out"); + Tensor* reserve_space = ctx.Output("ReserveSpace"); + + bool trans_x = ctx.Attr("trans_x"); + bool trans_y = ctx.Attr("trans_y"); + + std::string activation = ctx.Attr("activation"); + VLOG(5) << "trans_x = " << trans_x << " , trans_y = " << trans_y + << " , activation = " << activation; + + auto x_mat_dims = + phi::flatten_to_2d(x->dims(), trans_x ? 1 : x->dims().size() - 1); + + // (M * K) * (K * N) for new api use + // int64_t M = trans_x ? x_mat_dims[1] : x_mat_dims[0]; + // int64_t K = trans_y ? y->dims()[1] : y->dims()[0]; + // int64_t N = trans_y ? y->dims()[0] : y->dims()[1]; + + // 调用新接口,这里先分开调用,等待qingpen的新接口 + int r = 0; + xpu::Activation_t act = xpu::Activation_t::LINEAR; + if (activation == "relu") { + act = xpu::Activation_t::RELU; + } else if (activation == "gelu") { + act = xpu::Activation_t::GELU; + } + // fc + bias + act + // 1. fc + phi::XpuFcInfo fc_info; + + phi::GetFCInfo(x_mat_dims, y->dims(), trans_x, trans_y, &fc_info); + VLOG(0) << "FusedGemmEpilogueXPUKernel 000"; + xpu::Context* xpu_ctx = dev_ctx.x_context(); + + const XPUType* x_ptr = reinterpret_cast(x->data()); + const XPUType* y_ptr = reinterpret_cast(y->data()); + XPUType* out_ptr = + reinterpret_cast(out->mutable_data(ctx.GetPlace())); + xpu::ctx_guard RAII_GUARD(xpu_ctx); + XPUType* fc_out_ptr = RAII_GUARD.alloc_l3_or_gm(out->numel()); + phi::MatMulXPUFunction( + xpu_ctx, x_ptr, y_ptr, fc_out_ptr, fc_info, 1.0f); + XPUType* bias_out_ptr = out_ptr; + if (activation != "none" && reserve_space) { + bias_out_ptr = reinterpret_cast( + reserve_space->mutable_data(ctx.GetPlace())); + } + // 2 bias + const XPUType* bias_ptr = reinterpret_cast(bias->data()); + r = xpu::broadcast_add(xpu_ctx, + fc_out_ptr, + bias_ptr, + bias_out_ptr, + {fc_info.m, fc_info.n}, + {fc_info.n}); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_add"); + // 3 act + if (activation == "relu") { + r = xpu::relu(xpu_ctx, bias_out_ptr, out_ptr, out->numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "relu"); + } else if (activation == "gelu") { + r = xpu::gelu(xpu_ctx, bias_out_ptr, out_ptr, out->numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "gelu"); + } + } +}; + +template +class FusedGemmEpilogueXPUGradKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + + public: + void Compute(const framework::ExecutionContext& ctx) const override { + bool trans_x = ctx.Attr("trans_x"); + bool trans_y = ctx.Attr("trans_y"); + auto& dev_ctx = ctx.template device_context(); + const Tensor* dout = ctx.Input("DOut"); + const Tensor* x = ctx.Input("X"); + const Tensor* y = ctx.Input("Y"); + + const Tensor* reserve_space = ctx.Input("ReserveSpace"); + + Tensor* dx = ctx.Output("DX"); + Tensor* dy = ctx.Output("DY"); + Tensor* dbias = ctx.Output("DBias"); + + std::string activation = "none"; + if (ctx.HasAttr("activation")) { + activation = ctx.Attr("activation"); + } else if (ctx.HasAttr("activation_grad")) { + activation = ctx.Attr("activation_grad"); + } + + auto* xpu_ctx = dev_ctx.x_context(); + xpu::ctx_guard RAII_GUARD(xpu_ctx); + const XPUType* dout_ptr = reinterpret_cast(dout->data()); + + const XPUType* dout_fc_ptr = dout_ptr; + const XPUType* x_ptr = reinterpret_cast(x->data()); + const XPUType* y_ptr = reinterpret_cast(y->data()); + + // const XPUType* + const XPUType* reserve_space_ptr = + (reserve_space == NULL) + ? (reinterpret_cast(NULL)) + : (reinterpret_cast(reserve_space->data())); + XPUType* d_act_input_ptr; + if (activation != "none") { + d_act_input_ptr = RAII_GUARD.alloc_l3_or_gm(dout->numel()); + dout_fc_ptr = d_act_input_ptr; + } + + // 1. act_grad 2. fc_grad 3. dbias + int r = 0; + if (activation == "relu") { + r = xpu::relu_grad(xpu_ctx, + reserve_space_ptr, + reserve_space_ptr, + dout_ptr, + d_act_input_ptr, + dout->numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "relu_grad"); + } else if (activation == "gelu") { + r = xpu::gelu_grad(xpu_ctx, + reserve_space_ptr, + reserve_space_ptr, + dout_ptr, + d_act_input_ptr, + dout->numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "gelu_grad"); + } + + auto x_mat_dims = + phi::flatten_to_2d(x->dims(), trans_x ? 1 : x->dims().size() - 1); + phi::XpuFcInfo info_forward; + phi::GetFCInfo(x_mat_dims, y->dims(), trans_x, trans_y, &info_forward); + + // 2. fc_grad + const XPUType* a_1 = reinterpret_cast(NULL); + const XPUType* b_1 = reinterpret_cast(NULL); + const XPUType* a_2 = reinterpret_cast(NULL); + const XPUType* b_2 = reinterpret_cast(NULL); + XPUType* c_1 = + (dx == NULL) + ? reinterpret_cast(NULL) + : reinterpret_cast(dx->mutable_data(ctx.GetPlace())); + XPUType* c_2 = + (dy == NULL) + ? reinterpret_cast(NULL) + : reinterpret_cast(dy->mutable_data(ctx.GetPlace())); + phi::XpuFcInfo info_dx; + phi::XpuFcInfo info_dy; + std::tuple + fc_info = phi::MatmulGradFcInfo(xpu_ctx, + &RAII_GUARD, + info_forward, + trans_x, + trans_y, + x_ptr, + y_ptr, + dout_fc_ptr); + std::tie(info_dx, info_dy, a_1, b_1, a_2, b_2) = fc_info; + if (dx) { + phi::MatMulXPUFunction(xpu_ctx, a_1, b_1, c_1, info_dx, 1.0f); + } + if (dy) { + phi::MatMulXPUFunction(xpu_ctx, a_2, b_2, c_2, info_dy, 1.0f); + } + // 3. dbias + if (dbias) { + XPUType* dbias_ptr = + reinterpret_cast(dbias->mutable_data(ctx.GetPlace())); + r = xpu::reduce_sum(xpu_ctx, + dout_fc_ptr, + dbias_ptr, + {info_forward.m, info_forward.n}, + {0}); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "reduce_sum"); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_XPU_KERNEL( + fused_gemm_epilogue, + ops::FusedGemmEpilogueXPUKernel, + ops::FusedGemmEpilogueXPUKernel); + +REGISTER_OP_XPU_KERNEL( + fused_gemm_epilogue_grad, + ops::FusedGemmEpilogueXPUGradKernel, + ops::FusedGemmEpilogueXPUGradKernel); diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h index 0aa0e20491..ba03a3b53d 100644 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h @@ -654,7 +654,14 @@ XPUOpMap& get_kl2_ops() { {"resnet_basic_block_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"resnet_basic_block", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}}; + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"fused_gemm_epilogue", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"fused_gemm_epilogue_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + }; return s_xpu2_kernels; } diff --git a/python/paddle/fluid/tests/unittests/xpu/test_fused_gemm_epilogue_grad_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_fused_gemm_epilogue_grad_op_xpu.py new file mode 100644 index 0000000000..5d56f37d89 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_fused_gemm_epilogue_grad_op_xpu.py @@ -0,0 +1,125 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 NVIDIA Corporation. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import sys + +sys.path.append("..") + +import unittest +import numpy as np +import paddle +import paddle.fluid.core as core +from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper + + +def get_outputs(DOut, X, Y): + DX = np.dot(DOut, Y.T) + DY = np.dot(X.T, DOut) + DBias = np.sum(DOut, axis=0) + + return DX, DY, DBias + + +class XPUTestFuseGemmGradOp(XPUOpTestWrapper): + + def __init__(self): + self.op_name = 'fused_gemm_epilogue_grad' + self.use_dynamic_create_class = False + + class TestFuseGemmEpilogueGradOpDXYBias1(XPUOpTest): + + def setUp(self): + paddle.enable_static() + self.op_type = "fused_gemm_epilogue_grad" + self.__class__.no_need_check_grad = True + + self.dtype = self.in_type + self.init_data() + + def init_data(self): + self.inputs = { + 'DOut': np.random.random((8, 128)).astype(self.dtype) - 0.5, + 'X': np.random.random((8, 4)).astype(self.dtype) - 0.5, + 'Y': np.random.random((4, 128)).astype(self.dtype) - 0.5 + } + + self.attrs = {"activation": 'none'} + + DX, DY, DBias = get_outputs(self.inputs['DOut'], self.inputs['X'], + self.inputs['Y']) + self.outputs = {'DX': DX, 'DY': DY, 'DBias': DBias} + + def test_check_output(self): + self.atol = 1e-4 + if self.dtype == np.float16: + self.atol = 1e-3 + self.check_output_with_place(core.XPUPlace(0), atol=self.atol) + + class TestFuseGemmEpilogueGradOpDXYBias2(XPUOpTest): + + def init_data(self): + self.inputs = { + 'DOut': np.random.random((8, 128)).astype(self.dtype) - 0.5, + 'X': np.random.random((8, 4)).astype(self.dtype) - 0.5, + 'Y': np.random.random((4, 128)).astype(self.dtype) - 0.5 + } + + self.attrs = {"activation": 'none'} + + _, DY, DBias = get_outputs(self.inputs['DOut'], self.inputs['X'], + self.inputs['Y']) + self.outputs = {'DY': DY, 'DBias': DBias} + + class TestFuseGemmEpilogueGradOpDXYBias3(XPUOpTest): + + def init_data(self): + self.inputs = { + 'DOut': np.random.random((8, 128)).astype(self.dtype) - 0.5, + 'X': np.random.random((8, 4)).astype(self.dtype) - 0.5, + 'Y': np.random.random((4, 128)).astype(self.dtype) - 0.5 + } + + self.attrs = {"activation": 'none'} + + _, DY, _ = get_outputs(self.inputs['DOut'], self.inputs['X'], + self.inputs['Y']) + self.outputs = {'DY': DY} + + class TestFuseGemmEpilogueGradOpDXYBias4(XPUOpTest): + + def init_data(self): + self.inputs = { + 'DOut': np.random.random((8, 128)).astype(self.dtype) - 0.5, + 'X': np.random.random((8, 4)).astype(self.dtype) - 0.5, + 'Y': np.random.random((4, 128)).astype(self.dtype) - 0.5 + } + + self.attrs = {"activation": 'none'} + + DX, DY, _ = get_outputs(self.inputs['DOut'], self.inputs['X'], + self.inputs['Y']) + self.outputs = {'DX': DX, 'DY': DY} + + +support_types = get_xpu_op_support_types('fused_gemm_epilogue_grad') +for stype in support_types: + create_test_class(globals(), XPUTestFuseGemmGradOp, stype) + +if __name__ == "__main__": + paddle.enable_static() + np.random.seed(0) + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_fused_gemm_epilogue_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_fused_gemm_epilogue_op_xpu.py new file mode 100644 index 0000000000..2e1d5848e6 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_fused_gemm_epilogue_op_xpu.py @@ -0,0 +1,293 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 NVIDIA Corporation. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import sys + +sys.path.append("..") +import unittest +import numpy as np +import paddle +import paddle.fluid.core as core +from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper + + +def gelu(x): + y_ref = 0.5 * x * ( + 1.0 + np.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * np.power(x, 3)))) + return y_ref.astype(x.dtype) + + +def relu(x): + mask = x > 0 + return x * mask + + +def get_output(X, Y, bias, act): + out = np.dot(X, Y) + bias + if act == 'relu': + return relu(out) + elif act == 'gelu': + return gelu(out) + else: + return out + + +def matmul(x, y, bias, trans_x, trans_y): + x = np.array(x) + if trans_x: + x = np.ascontiguousarray(np.transpose(x)) + if trans_y: + y = np.ascontiguousarray(np.transpose(y)) + z = np.matmul(x, y) + if bias is None: + return z + else: + return z + bias + + +def matmul_grad(x, y, bias, dz, trans_x, trans_y): + if trans_x: + if trans_y: + dx = matmul(y, dz, None, True, True) + dy = matmul(dz, x, None, True, True) + else: + dx = matmul(y, dz, None, False, True) + dy = matmul(x, dz, None, False, False) + else: + if trans_y: + dx = matmul(dz, y, None, False, False) + dy = matmul(dz, x, None, True, False) + else: + dx = matmul(dz, y, None, False, True) + dy = matmul(x, dz, None, True, False) + if bias is None: + dbias = None + else: + dbias = np.sum(dz, axis=0, keepdims=False) + return dx, dy, dbias + + +class XPUTestFuseGemmOp(XPUOpTestWrapper): + + def __init__(self): + self.op_name = 'fused_gemm_epilogue' + self.use_dynamic_create_class = False + + class TestFuseGemmBase(XPUOpTest): + + def setUp(self): + self.__class__.no_need_check_grad = True + self.op_type = "fused_gemm_epilogue" + self.init_dtype_type() + self.init_datas_shape_and_attrs() + self.inputs = { + 'X': np.random.random(self.x_shape).astype(self.dtype) - 0.5, + 'Y': np.random.random(self.y_shape).astype(self.dtype) - 0.5, + 'Bias': + np.random.random(self.bias_shape).astype(self.dtype) - 0.5 + } + + if self.trans_x == True: + numpy_input_x = self.inputs['X'].reshape( + (self.x_shape[0], -1)).T + else: + numpy_input_x = self.inputs['X'].reshape((-1, self.x_shape[-1])) + + if self.trans_y == True: + numpy_input_y = self.inputs['Y'].T + else: + numpy_input_y = self.inputs['Y'] + + self.outputs = { + 'Out': + get_output(numpy_input_x, numpy_input_y, self.inputs['Bias'], + self.activation).reshape(self.out_shape) + } + self.attrs = { + "activation": self.activation, + "trans_y": self.trans_y, + "trans_x": self.trans_x + } + + def init_dtype_type(self): + self.dtype = self.in_type + self.atol = 1e-4 + if self.dtype == np.float16: + self.atol = 1e-3 + + def init_datas_shape_and_attrs(self): + self.x_shape = [8, 4] + self.y_shape = [4, 128] + self.bias_shape = [ + 128, + ] + self.out_shape = [8, 128] + self.activation = "relu" + self.trans_y = False + self.trans_x = False + + def test_check_output(self): + self.check_output_with_place(core.XPUPlace(0), atol=self.atol) + + class TestFuseGemmEpilogueOp1(TestFuseGemmBase): + + def init_datas_shape_and_attrs(self): + self.x_shape = [4, 8] + self.y_shape = [4, 128] + self.bias_shape = [ + 128, + ] + self.out_shape = [8, 128] + self.activation = "relu" + self.trans_y = False + self.trans_x = True + + class TestFuseGemmEpilogueOp2(TestFuseGemmBase): + + def init_datas_shape_and_attrs(self): + self.x_shape = [8, 4] + self.y_shape = [128, 4] + self.bias_shape = [ + 128, + ] + self.out_shape = [8, 128] + self.activation = "relu" + self.trans_y = True + self.trans_x = False + + class TestFuseGemmEpilogueOp3(TestFuseGemmBase): + + def init_datas_shape_and_attrs(self): + self.x_shape = [4, 8] + self.y_shape = [128, 4] + self.bias_shape = [ + 128, + ] + self.out_shape = [8, 128] + self.activation = "relu" + self.trans_y = True + self.trans_x = True + + class TestFuseGemmEpilogueOp4(TestFuseGemmBase): + + def init_datas_shape_and_attrs(self): + self.x_shape = [2, 2, 8, 4] + self.y_shape = [4, 128] + self.bias_shape = [ + 128, + ] + self.out_shape = [2, 2, 8, 128] + self.activation = "relu" + self.trans_y = False + self.trans_x = False + + class TestFuseGemmEpilogueOp5(TestFuseGemmBase): + + def init_datas_shape_and_attrs(self): + self.x_shape = [4, 2, 2, 8] + self.y_shape = [4, 128] + self.bias_shape = [ + 128, + ] + self.out_shape = [2, 2, 8, 128] + self.activation = "relu" + self.trans_y = False + self.trans_x = True + + class TestFuseGemmEpilogueOp6(TestFuseGemmBase): + + def init_datas_shape_and_attrs(self): + self.x_shape = [8, 4] + self.y_shape = [4, 128] + self.bias_shape = [ + 128, + ] + self.out_shape = [8, 128] + self.activation = "gelu" + self.trans_y = False + self.trans_x = False + + class TestFuseGemmEpilogueOp7(TestFuseGemmBase): + + def init_datas_shape_and_attrs(self): + self.x_shape = [8, 4] + self.y_shape = [4, 128] + self.bias_shape = [ + 128, + ] + self.out_shape = [8, 128] + self.activation = "none" + self.trans_y = False + self.trans_x = False + + +class TestEagerFusedGemmEpilogue(unittest.TestCase): + + def setUp(self): + paddle.set_device('xpu') + + def test_case_act(self): + paddle.disable_static() + x_np = np.random.random((8, 4)).astype(np.float32) - 0.5 + y_np = np.random.random((4, 128)).astype(np.float32) - 0.5 + bias_np = np.random.random((128, )).astype(np.float32) - 0.5 + x = paddle.to_tensor(x_np) + y = paddle.to_tensor(y_np) + bias = paddle.to_tensor(bias_np) + x.stop_gradient = False + y.stop_gradient = False + + out1 = core.ops.fused_gemm_epilogue(x, y, bias, 'trans_x', False, + 'trans_y', False, 'activation', + 'none') + out2 = core.ops.fused_gemm_epilogue(x, y, bias, 'trans_x', False, + 'trans_y', False, 'activation', + 'relu') + out3 = core.ops.fused_gemm_epilogue(x, y, bias, 'trans_x', False, + 'trans_y', False, 'activation', + 'gelu') + + out_np1 = get_output(x_np, y_np, bias_np, 'none') + out_np2 = get_output(x_np, y_np, bias_np, 'relu') + out_np3 = get_output(x_np, y_np, bias_np, 'gelu') + + np.testing.assert_allclose(out1, out_np1, atol=1e-04) + np.testing.assert_allclose(out2, out_np2, atol=1e-04) + np.testing.assert_allclose(out3, out_np3, atol=1e-03) + + out_grad_np1 = np.random.randint(low=-20, high=20, + size=out_np1.shape).astype(np.float32) + paddle.autograd.backward(out1, + grad_tensors=[paddle.to_tensor(out_grad_np1)]) + + x_grad_np, y_grad_np, bias_grad_np = matmul_grad( + x_np, y_np, bias_np, out_grad_np1, False, False) + np.testing.assert_allclose(x.grad.numpy(), x_grad_np, atol=1e-02) + self.assertEqual(y_grad_np.shape, y_np.shape) + np.testing.assert_allclose(y.grad.numpy(), y_grad_np, atol=1e-03) + + paddle.enable_static() + + +support_types = get_xpu_op_support_types('fused_gemm_epilogue') +for stype in support_types: + create_test_class(globals(), XPUTestFuseGemmOp, stype) + +if __name__ == "__main__": + paddle.enable_static() + np.random.seed(0) + unittest.main() -- GitLab