Unverified commit 57f6a469, authored by Yiqun Liu, committed by GitHub

Enable matmul + bias fusion in fused_gat_attention. (#50755)

* Enable matmul + bias fusion in fused_gat_attention.

* Add a variable to control whether using fused matmul + bias.
Parent 7c73910e
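For readers new to the change, the following minimal reference spells out the computation that a fused matmul + bias kernel performs in one pass, instead of a GEMM followed by a separate broadcast bias add. It is an illustration only (none of these names appear in the diff), assuming row-major x of shape [M, K], w of shape [K, N], and a length-N bias.

#include <vector>

// Reference semantics of out = x * w + b computed in a single pass; the fused
// gemm-epilogue path added in this PR produces the same result without a
// separate bias-add kernel launch.
void MatmulBiasReference(const std::vector<float>& x,  // [M, K]
                         const std::vector<float>& w,  // [K, N]
                         const std::vector<float>& b,  // [N]
                         std::vector<float>* out,      // [M, N]
                         int M, int N, int K) {
  out->assign(static_cast<size_t>(M) * N, 0.0f);
  for (int m = 0; m < M; ++m) {
    for (int n = 0; n < N; ++n) {
      float acc = b[n];  // the bias seeds the accumulator
      for (int k = 0; k < K; ++k) {
        acc += x[m * K + k] * w[k * N + n];
      }
      (*out)[m * N + n] = acc;
    }
  }
}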
@@ -14,12 +14,13 @@ limitations under the License. */
#pragma once
#include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h"
#include "paddle/fluid/operators/fused/fused_gemm_epilogue_op.h"
#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/broadcast_function.h"
#include "paddle/phi/kernels/funcs/elementwise_functor.h"
#include "paddle/phi/kernels/primitive/kernel_primitives.h"
namespace paddle {
namespace operators {
@@ -44,13 +45,43 @@ class AttnMatMul {
input_size_(input_size),
compute_bias_(compute_bias) {}
~AttnMatMul() {}
void ComputeForward(const phi::DenseTensor* weight,
const phi::DenseTensor* input,
const phi::DenseTensor* bias,
phi::DenseTensor* output,
phi::DenseTensor* bias_out) {
phi::DenseTensor* bias_out,
bool fused = false) {
VLOG(6) << "input.shape={" << input->dims() << "}, weight.shape={"
<< weight->dims() << "}, output.shape={" << output->dims()
<< "}, batch_size=" << bsz_seq_ << ", output_size=" << output_size_
<< ", input_size=" << input_size_ << ", transA=" << transA_
<< ", transB=" << transB_ << ", compute_bias=" << compute_bias_
<< ", fused=" << fused;
#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060
if (compute_bias_ && fused) {
PADDLE_ENFORCE_EQ(
!output || output == bias_out,
true,
phi::errors::InvalidArgument(
"The output (= input * weight) is expected to be nullptr or the "
"same as bias_out when fused is true."));
ComputeFusedGemmEpilogueForward<T>(dev_ctx_,
input,
weight,
bias,
bsz_seq_, // M
output_size_, // N
input_size_, // K
transA_,
transB_,
"none",
bias_out,
nullptr);
return;
}
#endif
// Note: for blas.GEMM API in Paddle, it treats all inputs as row-major.
// here: (transa, transb): nt, input * weight.
CBLAS_TRANSPOSE transA = transA_ ? CblasTrans : CblasNoTrans;
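For context, the snippet below sketches the calling convention that the new fused flag implies, mirroring the gate_linear / out_linear call sites later in this diff; the tensors and the m, n, k sizes are assumed to be prepared as at those call sites. When fused is true, the gemm epilogue writes matmul + bias directly into bias_out, which is why output must be nullptr or the same tensor as bias_out.

// Hedged caller sketch (not part of the diff).
AttnMatMul<T> linear(dev_ctx, /*transA=*/false, /*transB=*/false,
                     /*bsz_seq=*/m, /*output_size=*/n, /*input_size=*/k,
                     /*compute_bias=*/true);
linear.ComputeForward(weight, input, bias,
                      /*output=*/bias_out, /*bias_out=*/bias_out,
                      /*fused=*/true);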
@@ -85,7 +116,29 @@ class AttnMatMul {
phi::DenseTensor* d_input,
phi::DenseTensor* d_weight,
phi::DenseTensor* d_bias,
bool use_addto = false) {
bool use_addto = false,
bool fused = false) {
#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060
if (compute_bias_ && fused) {
ComputeFusedGemmEpilogueBackward<T>(dev_ctx_,
d_output,
input,
weight,
nullptr,
bsz_seq_, // M
output_size_, // N
input_size_, // K
transA_,
transB_,
"none",
d_input,
d_weight,
d_bias,
use_addto);
return;
}
#endif
T alpha = static_cast<T>(1.0);
T beta_dA = use_addto ? static_cast<T>(1.0) : static_cast<T>(0.0);
T beta_dB = static_cast<T>(0.0);
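For reference, with out = input * weight + bias (no transposes), the backward pass needs d_input = d_output * weight^T, d_weight = input^T * d_output, and d_bias equal to the column-wise sum of d_output. The helper below is a sketch of the d_bias reduction only, assuming a row-major d_output of shape [M, N]; it is illustrative and not part of the diff.

// Illustrative reference: d_bias[n] = sum over the M rows of d_output[m][n].
void BiasGradReference(const float* d_output, float* d_bias, int M, int N) {
  for (int n = 0; n < N; ++n) {
    float acc = 0.0f;
    for (int m = 0; m < M; ++m) {
      acc += d_output[m * N + n];
    }
    d_bias[n] = acc;
  }
}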
......
@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once
#include "paddle/fluid/operators/fused/fused_softmax_mask.cu.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/broadcast_function.h"
#include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
#include "paddle/phi/kernels/funcs/dropout_impl.cu.h"
......
@@ -209,7 +209,8 @@ void ComputeGatingLinearForward(const framework::ExecutionContext &ctx,
const GateAttentionConfig<T> &config,
const phi::DenseTensor *query,
const phi::DenseTensor *fmha_out,
phi::DenseTensor *gate_out) {
phi::DenseTensor *gate_bias_out,
bool use_fused_matmul_bias) {
auto *gate_weight = ctx.Input<phi::DenseTensor>("GateWeight");
auto *gate_bias = ctx.Input<phi::DenseTensor>("GateBias");
@@ -220,14 +221,18 @@ void ComputeGatingLinearForward(const framework::ExecutionContext &ctx,
int m = config.batch_size * config.seq_len_m * config.seq_len_r;
int n = config.num_heads * config.head_dim;
int k = config.q_dim;
auto gate_attn_compute =
auto gate_linear =
AttnMatMul<T>(ctx.cuda_device_context(), false, false, m, n, k, true);
gate_attn_compute.ComputeForward(
gate_weight, query, gate_bias, gate_out, gate_out);
gate_linear.ComputeForward(gate_weight,
query,
gate_bias,
gate_bias_out,
gate_bias_out,
use_fused_matmul_bias);
// gate_out = sigmoid(gate_out) * fmha_out
std::vector<const phi::DenseTensor *> ins = {gate_out, fmha_out};
std::vector<phi::DenseTensor *> outs = {gate_out};
std::vector<const phi::DenseTensor *> ins = {gate_bias_out, fmha_out};
std::vector<phi::DenseTensor *> outs = {gate_bias_out};
phi::funcs::ElementwiseKernel<T>(
ctx.cuda_device_context(), ins, &outs, SigmoidMultiplyFunctor<T>());
}
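The definition of SigmoidMultiplyFunctor is not shown in this hunk; based on the comment above, its per-element behavior is equivalent to the sketch below (illustrative name, plain float assumed).

#include <cmath>

// Element-wise reference for gate_bias_out = sigmoid(gate_bias_out) * fmha_out.
struct SigmoidMultiplyReference {
  float operator()(float gate, float fmha) const {
    float s = 1.0f / (1.0f + std::exp(-gate));
    return s * fmha;
  }
};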
@@ -239,10 +244,12 @@ void ComputeGatingLinearBackward(const framework::ExecutionContext &ctx,
const phi::DenseTensor *fmha_out,
const phi::DenseTensor *gate_out_grad,
phi::DenseTensor *query_grad,
phi::DenseTensor *fmha_out_grad) {
phi::DenseTensor *fmha_out_grad,
bool use_fused_matmul_bias) {
const auto *gate_weight = ctx.Input<phi::DenseTensor>("GateWeight");
const auto *gate_bias = ctx.Input<phi::DenseTensor>("GateBias");
auto &dev_ctx = ctx.template device_context<phi::GPUContext>();
// Re-compute gate_bias_out
phi::DenseTensor gate_bias_out;
gate_bias_out.Resize(config.gate_out_dims);
@@ -251,10 +258,14 @@ void ComputeGatingLinearBackward(const framework::ExecutionContext &ctx,
int m = config.batch_size * config.seq_len_m * config.seq_len_r;
int n = config.num_heads * config.head_dim;
int k = config.q_dim;
auto gate_attn_compute =
auto gate_linear =
AttnMatMul<T>(ctx.cuda_device_context(), false, false, m, n, k, true);
gate_attn_compute.ComputeForward(
gate_weight, query, gate_bias, &gate_bias_out, &gate_bias_out);
gate_linear.ComputeForward(gate_weight,
query,
gate_bias,
&gate_bias_out,
&gate_bias_out,
use_fused_matmul_bias);
// Gradient of sigmoid(gate_bias_out) * fmha_out
// Compute inplace and save gate_bias_out_grad to gate_bias_out.
@@ -272,19 +283,22 @@ void ComputeGatingLinearBackward(const framework::ExecutionContext &ctx,
dev_ctx.Alloc<T>(gate_weight_grad, gate_weight_grad->numel() * sizeof(T));
dev_ctx.Alloc<T>(gate_bias_grad, gate_bias_grad->numel() * sizeof(T));
gate_attn_compute.ComputeBackward(query,
gate_weight,
&gate_bias_out,
query_grad,
gate_weight_grad,
gate_bias_grad);
gate_linear.ComputeBackward(query,
gate_weight,
&gate_bias_out,
query_grad,
gate_weight_grad,
gate_bias_grad,
false,
use_fused_matmul_bias);
}
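For the in-place gradient step referenced above, the chain rule for y = sigmoid(x) * v yields the two gradients that must be formed before calling the linear backward. The reference below is a sketch only, not the functor this operator uses.

#include <cmath>

// Given dy = dL/dy, x (the linear output incl. bias) and v (fmha_out):
//   s  = sigmoid(x)
//   dx = dy * v * s * (1 - s)   // becomes the d_output fed to gate_linear
//   dv = dy * s                 // becomes fmha_out_grad
struct SigmoidMultiplyGradReference {
  void operator()(float dy, float x, float v, float* dx, float* dv) const {
    float s = 1.0f / (1.0f + std::exp(-x));
    *dx = dy * v * s * (1.0f - s);
    *dv = dy * s;
  }
};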
template <typename T>
void ComputeOutputLinearForward(const framework::ExecutionContext &ctx,
const GateAttentionConfig<T> &config,
const phi::DenseTensor *fmha_or_gate_out,
phi::DenseTensor *out) {
phi::DenseTensor *out,
bool use_fused_matmul_bias) {
const auto *out_linear_weight =
ctx.Input<phi::DenseTensor>("OutLinearWeight");
const auto *out_linear_bias = ctx.Input<phi::DenseTensor>("OutLinearBias");
@@ -293,17 +307,22 @@ void ComputeOutputLinearForward(const framework::ExecutionContext &ctx,
int m = config.batch_size * config.seq_len_m * config.seq_len_r;
int n = config.q_dim;
int k = config.num_heads * config.head_dim;
auto out_linear_compute =
auto out_linear =
AttnMatMul<T>(ctx.cuda_device_context(), false, false, m, n, k, true);
out_linear_compute.ComputeForward(
out_linear_weight, fmha_or_gate_out, out_linear_bias, out, out);
out_linear.ComputeForward(out_linear_weight,
fmha_or_gate_out,
out_linear_bias,
out,
out,
use_fused_matmul_bias);
}
template <typename T>
void ComputeOutputLinearBackward(const framework::ExecutionContext &ctx,
const GateAttentionGradConfig<T> &config,
const phi::DenseTensor *input,
phi::DenseTensor *input_grad) {
phi::DenseTensor *input_grad,
bool use_fused_matmul_bias) {
auto &dev_ctx = ctx.template device_context<phi::GPUContext>();
const auto *out_grad =
ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
@@ -323,14 +342,16 @@ void ComputeOutputLinearBackward(const framework::ExecutionContext &ctx,
int m = config.batch_size * config.seq_len_m * config.seq_len_r;
int n = config.q_dim;
int k = config.num_heads * config.head_dim;
auto out_linear_compute =
auto out_linear =
AttnMatMul<T>(ctx.cuda_device_context(), false, false, m, n, k, true);
out_linear_compute.ComputeBackward(input,
out_linear_weight,
out_grad,
input_grad,
out_linear_weight_grad,
out_linear_bias_grad);
out_linear.ComputeBackward(input,
out_linear_weight,
out_grad,
input_grad,
out_linear_weight_grad,
out_linear_bias_grad,
false,
use_fused_matmul_bias);
}
template <typename T>
@@ -358,6 +379,7 @@ class FusedGateAttentionOpKernel : public framework::OpKernel<T> {
const bool merge_qkv = ctx.Attr<bool>("merge_qkv");
const bool has_gating = ctx.Attr<bool>("has_gating");
bool use_fused_matmul_bias = true;
auto &dev_ctx = ctx.template device_context<phi::GPUContext>();
AllocWithDebugInfo<T>(dev_ctx, "softmax_out", softmax_out);
AllocWithDebugInfo<T>(dev_ctx, "fmha_out", fmha_out);
@@ -413,12 +435,14 @@ class FusedGateAttentionOpKernel : public framework::OpKernel<T> {
// 3. Gating Linear
if (has_gating) {
ComputeGatingLinearForward<T>(ctx, config, query, fmha_out, gate_out);
ComputeGatingLinearForward<T>(
ctx, config, query, fmha_out, gate_out, use_fused_matmul_bias);
}
// 4. Output Linear
phi::DenseTensor *fmha_or_gate_out = has_gating ? gate_out : fmha_out;
ComputeOutputLinearForward<T>(ctx, config, fmha_or_gate_out, out);
ComputeOutputLinearForward<T>(
ctx, config, fmha_or_gate_out, out, use_fused_matmul_bias);
}
};
@@ -454,6 +478,7 @@ class FusedGateAttentionGradKernel : public framework::OpKernel<T> {
bool has_gating = ctx.Attr<bool>("has_gating");
bool merge_qkv = ctx.Attr<bool>("merge_qkv");
bool use_fused_matmul_bias = true;
auto &dev_ctx = ctx.template device_context<phi::GPUContext>();
AllocWithDebugInfo<T>(dev_ctx, "query_grad", query_grad);
@@ -468,7 +493,8 @@ class FusedGateAttentionGradKernel : public framework::OpKernel<T> {
phi::DenseTensor gate_out_grad;
gate_out_grad.Resize(config.gate_out_dims);
AllocWithDebugInfo<T>(dev_ctx, "gate_out_grad", &gate_out_grad);
ComputeOutputLinearBackward<T>(ctx, config, gate_out, &gate_out_grad);
ComputeOutputLinearBackward<T>(
ctx, config, gate_out, &gate_out_grad, use_fused_matmul_bias);
// 2. Gradient of Gating Linear
// Forward: gate_out = Sigmoid(Linear(fmha_out)) * fmha_out
@@ -478,10 +504,12 @@ class FusedGateAttentionGradKernel : public framework::OpKernel<T> {
fmha_out,
&gate_out_grad,
query_grad,
&fmha_out_grad);
&fmha_out_grad,
use_fused_matmul_bias);
} else {
// 1. Gradient of Output Linear: out = Linear(fmha_grad)
ComputeOutputLinearBackward<T>(ctx, config, fmha_out, &fmha_out_grad);
ComputeOutputLinearBackward<T>(
ctx, config, fmha_out, &fmha_out_grad, use_fused_matmul_bias);
}
// 3. Gradient of FMHA
......
@@ -14,7 +14,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/fused/fused_gemm_epilogue_op.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_version_registry.h"
......
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
......
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
* Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
// This file has been adapted from FasterTransformer file:
// https://github.com/NVIDIA/FasterTransformer/blob/v4.0/fastertransformer/cuda/masked_multihead_attention.cu
// We add License in the head.
......
@@ -14,6 +14,12 @@
#pragma once
#ifdef PADDLE_WITH_CUDA
#include <cuda_runtime.h> // NOLINT
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/float16.h"
namespace phi {
namespace backends {
namespace gpu {
@@ -69,6 +75,22 @@ namespace gpu {
for (index_type i = __index__; __index__ < (num); \
__index__ += __stride__, i = __index__)
template <typename T>
cudaDataType_t ToCudaDataType() {
if (std::is_same<T, float>::value) {
return CUDA_R_32F;
} else if (std::is_same<T, double>::value) {
return CUDA_R_64F;
} else if (std::is_same<T, phi::dtype::float16>::value) {
return CUDA_R_16F;
#if CUDA_VERSION >= 11000
} else if (std::is_same<T, phi::dtype::bfloat16>::value) {
return CUDA_R_16BF;
#endif
}
}
} // namespace gpu
} // namespace backends
} // namespace phi
#endif
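A hedged usage sketch of the new ToCudaDataType helper (the wrapper name is illustrative): it maps the element type of a GEMM operand to the matching cuBLAS enum. Only float, double, float16 and, on CUDA 11+, bfloat16 are handled, so callers must instantiate it with one of those types.

// Illustrative wrapper around the helper added above.
template <typename T>
cudaDataType_t SelectCudaDataType() {
  return phi::backends::gpu::ToCudaDataType<T>();
}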