Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
fdcdbec5
P
Paddle
项目概览
PaddlePaddle
/
Paddle
大约 2 年 前同步成功
通知
2325
Star
20933
Fork
5424
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
You need to sign in or sign up before continuing.
未验证
提交
fdcdbec5
编写于
5月 30, 2022
作者:
C
crystal
提交者:
GitHub
5月 30, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Implement fused_gate_attention operator for AlphaFold. (#42018)
上级
17b8446d
变更
11
隐藏空白更改
内联
并排
Showing
11 changed file
with
1821 addition
and
109 deletion
+1821
-109
paddle/fluid/operators/fused/CMakeLists.txt
paddle/fluid/operators/fused/CMakeLists.txt
+3
-1
paddle/fluid/operators/fused/attn_gemm.h
paddle/fluid/operators/fused/attn_gemm.h
+80
-84
paddle/fluid/operators/fused/fmha_ref.h
paddle/fluid/operators/fused/fmha_ref.h
+3
-1
paddle/fluid/operators/fused/fused_gate_attention.h
paddle/fluid/operators/fused/fused_gate_attention.h
+647
-0
paddle/fluid/operators/fused/fused_gate_attention_op.cc
paddle/fluid/operators/fused/fused_gate_attention_op.cc
+317
-0
paddle/fluid/operators/fused/fused_gate_attention_op.cu
paddle/fluid/operators/fused/fused_gate_attention_op.cu
+488
-0
paddle/fluid/platform/device/gpu/gpu_info.cc
paddle/fluid/platform/device/gpu/gpu_info.cc
+2
-2
paddle/fluid/pybind/op_function_generator.h
paddle/fluid/pybind/op_function_generator.h
+7
-0
paddle/phi/kernels/gpudnn/softmax_gpudnn.h
paddle/phi/kernels/gpudnn/softmax_gpudnn.h
+21
-21
python/paddle/fluid/tests/unittests/CMakeLists.txt
python/paddle/fluid/tests/unittests/CMakeLists.txt
+1
-0
python/paddle/fluid/tests/unittests/test_fused_gate_attention_op.py
...dle/fluid/tests/unittests/test_fused_gate_attention_op.py
+252
-0
未找到文件。
paddle/fluid/operators/fused/CMakeLists.txt
浏览文件 @
fdcdbec5
...
...
@@ -23,7 +23,8 @@ register_operators(EXCLUDES
fused_feedforward_op
fused_multi_transformer_op
resnet_unit_op
fused_gemm_epilogue_op
)
fused_gemm_epilogue_op
fused_gate_attention_op
)
# fusion_gru_op does not have CUDA kernel
op_library
(
fusion_gru_op
)
...
...
@@ -58,6 +59,7 @@ if (WITH_GPU OR WITH_ROCM)
op_library
(
yolo_box_head_op
)
op_library
(
yolo_box_post_op
)
op_library
(
fused_embedding_eltwise_layernorm_op
)
op_library
(
fused_gate_attention_op
)
# fusion_group
if
(
NOT APPLE AND NOT WIN32
)
op_library
(
fusion_group_op DEPS device_code
)
...
...
paddle/fluid/operators/fused/attn_gemm.h
浏览文件 @
fdcdbec5
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...
...
@@ -13,6 +16,7 @@ limitations under the License. */
#include "paddle/fluid/platform/float16.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/elementwise_functor.h"
#include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h"
#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h"
...
...
@@ -21,6 +25,8 @@ limitations under the License. */
namespace
paddle
{
namespace
operators
{
using
Tensor
=
framework
::
Tensor
;
// support gemm-nt and gemm-nn, which is used in fused_attention_op.
template
<
typename
T
>
class
AttnMatMul
{
...
...
@@ -45,31 +51,21 @@ class AttnMatMul {
framework
::
Tensor
*
bias_out
)
{
// Note: for blas.GEMM API in Paddle, it treats all inputs as row-major.
// here: (transa, transb): nt, input * weight.
CBLAS_TRANSPOSE
transA
=
CblasNoTrans
;
CBLAS_TRANSPOSE
transB
=
CblasNoTrans
;
if
(
transA_
)
{
transA
=
CblasTrans
;
}
if
(
transB_
)
{
transB
=
CblasTrans
;
}
CBLAS_TRANSPOSE
transA
=
transA_
?
CblasTrans
:
CblasNoTrans
;
CBLAS_TRANSPOSE
transB
=
transB_
?
CblasTrans
:
CblasNoTrans
;
T
alpha
=
static_cast
<
T
>
(
1.0
);
T
beta
=
static_cast
<
T
>
(
0.0
);
//
here:
(m, n, k) = bsz_seq, output_size, input_size, (input, weight, out)
// (m, n, k) = bsz_seq, output_size, input_size, (input, weight, out)
auto
blas
=
phi
::
funcs
::
GetBlas
<
platform
::
CUDADeviceContext
,
T
>
(
dev_ctx_
);
blas
.
GEMM
(
transA
,
transB
,
bsz_seq_
,
output_size_
,
input_size_
,
alpha
,
input
->
data
<
T
>
(),
weight
->
data
<
T
>
(),
beta
,
output
->
data
<
T
>
());
if
(
compute_bias_
)
{
// compute output + bias
std
::
vector
<
const
Tensor
*>
ins
;
std
::
vector
<
Tensor
*>
outs
;
ins
.
emplace_back
(
output
);
ins
.
emplace_back
(
bias
);
outs
.
emplace_back
(
bias_out
);
int
elewise_add_axis
=
-
1
;
// bias_out = output + bias
std
::
vector
<
const
Tensor
*>
ins
=
{
output
,
bias
};
std
::
vector
<
Tensor
*>
outs
=
{
bias_out
};
phi
::
funcs
::
BroadcastKernel
<
phi
::
ElementwiseType
::
kBinary
,
T
,
T
>
(
dev_ctx_
,
ins
,
&
outs
,
elewise_add_axis
,
phi
::
funcs
::
AddFunctor
<
T
>
());
dev_ctx_
,
ins
,
&
outs
,
-
1
,
phi
::
funcs
::
AddFunctor
<
T
>
());
}
}
...
...
@@ -77,82 +73,71 @@ class AttnMatMul {
const
framework
::
Tensor
*
weight
,
const
framework
::
Tensor
*
d_output
,
framework
::
Tensor
*
d_input
,
framework
::
Tensor
*
d_weight
,
framework
::
Tensor
*
d_bias
)
{
framework
::
Tensor
*
d_bias
,
bool
use_addto
=
false
)
{
T
alpha
=
static_cast
<
T
>
(
1.0
);
T
beta
=
static_cast
<
T
>
(
0.0
);
auto
blas
=
phi
::
funcs
::
GetBlas
<
platform
::
CUDADeviceContext
,
T
>
(
dev_ctx_
);
CBLAS_TRANSPOSE
dB_transA
=
CblasNoTrans
;
CBLAS_TRANSPOSE
dB_transB
=
CblasNoTrans
;
CBLAS_TRANSPOSE
dA_transA
=
CblasNoTrans
;
CBLAS_TRANSPOSE
dA_transB
=
CblasNoTrans
;
int
dB_m
=
1
;
int
dB_n
=
1
;
int
dB_k
=
1
;
int
dA_m
=
1
;
int
dA_n
=
1
;
int
dA_k
=
1
;
T
*
dB_input_1_ptr
=
nullptr
;
T
*
dB_input_2_ptr
=
nullptr
;
T
*
dB_output_ptr
=
d_weight
->
data
<
T
>
();
T
*
dA_input_1_ptr
=
nullptr
;
T
*
dA_input_2_ptr
=
nullptr
;
T
*
dA_output_ptr
=
d_input
->
data
<
T
>
();
T
beta_dA
=
use_addto
?
static_cast
<
T
>
(
1.0
)
:
static_cast
<
T
>
(
0.0
);
T
beta_dB
=
static_cast
<
T
>
(
0.0
);
auto
blas
=
phi
::
funcs
::
GetBlas
<
platform
::
CUDADeviceContext
,
T
>
(
dev_ctx_
);
if
(
!
transA_
)
{
// f
w
: gemm-nt
// f
orward
: gemm-nt
if
(
transB_
)
{
// bw: gemm-tn, dB = (dC)^t * A
dB_transA
=
CblasTrans
;
dB_transB
=
CblasNoTrans
;
dB_m
=
output_size_
;
dB_n
=
input_size_
;
dB_k
=
bsz_seq_
;
// bw: gemm-nn, dA = dC * B
dA_transA
=
CblasNoTrans
;
dA_transB
=
CblasNoTrans
;
dA_m
=
bsz_seq_
;
dA_n
=
input_size_
;
dA_k
=
output_size_
;
blas
.
GEMM
(
dB_transA
,
dB_transB
,
dB_m
,
dB_n
,
dB_k
,
alpha
,
d_output
->
data
<
T
>
(),
input
->
data
<
T
>
(),
beta
,
dB_output_ptr
);
blas
.
GEMM
(
dA_transA
,
dA_transB
,
dA_m
,
dA_n
,
dA_k
,
alpha
,
d_output
->
data
<
T
>
(),
weight
->
data
<
T
>
(),
beta
,
dA_output_ptr
);
// backward: gemm-tn, dB = (dC)^T * A
if
(
d_weight
)
{
int
dB_m
=
output_size_
;
int
dB_n
=
input_size_
;
int
dB_k
=
bsz_seq_
;
T
*
dB_output_ptr
=
d_weight
->
data
<
T
>
();
blas
.
GEMM
(
CblasTrans
,
CblasNoTrans
,
dB_m
,
dB_n
,
dB_k
,
alpha
,
d_output
->
data
<
T
>
(),
input
->
data
<
T
>
(),
beta_dB
,
dB_output_ptr
);
}
// backward: gemm-nn, dA = dC * B
if
(
d_input
)
{
int
dA_m
=
bsz_seq_
;
int
dA_n
=
input_size_
;
int
dA_k
=
output_size_
;
T
*
dA_output_ptr
=
d_input
->
data
<
T
>
();
blas
.
GEMM
(
CblasNoTrans
,
CblasNoTrans
,
dA_m
,
dA_n
,
dA_k
,
alpha
,
d_output
->
data
<
T
>
(),
weight
->
data
<
T
>
(),
beta_dA
,
dA_output_ptr
);
}
}
else
{
// fw: gemm-nn
// bw: gemm-tn, dB = A^t * dC
dB_transA
=
CblasTrans
;
dB_transB
=
CblasNoTrans
;
dB_m
=
input_size_
;
dB_n
=
output_size_
;
dB_k
=
bsz_seq_
;
// bw: gemm-nt, dA = dC * B^t
dA_transA
=
CblasNoTrans
;
dA_transB
=
CblasTrans
;
dA_m
=
bsz_seq_
;
dA_n
=
input_size_
;
dA_k
=
output_size_
;
blas
.
GEMM
(
dB_transA
,
dB_transB
,
dB_m
,
dB_n
,
dB_k
,
alpha
,
input
->
data
<
T
>
(),
d_output
->
data
<
T
>
(),
beta
,
dB_output_ptr
);
blas
.
GEMM
(
dA_transA
,
dA_transB
,
dA_m
,
dA_n
,
dA_k
,
alpha
,
d_output
->
data
<
T
>
(),
weight
->
data
<
T
>
(),
beta
,
dA_output_ptr
);
// backward: gemm-tn, dB = A^T * dC
if
(
d_weight
)
{
int
dB_m
=
input_size_
;
int
dB_n
=
output_size_
;
int
dB_k
=
bsz_seq_
;
T
*
dB_output_ptr
=
d_weight
->
data
<
T
>
();
blas
.
GEMM
(
CblasTrans
,
CblasNoTrans
,
dB_m
,
dB_n
,
dB_k
,
alpha
,
input
->
data
<
T
>
(),
d_output
->
data
<
T
>
(),
beta_dB
,
dB_output_ptr
);
}
// backward: gemm-nt, dA = dC * B^T
if
(
d_input
)
{
int
dA_m
=
bsz_seq_
;
int
dA_n
=
input_size_
;
int
dA_k
=
output_size_
;
T
*
dA_output_ptr
=
d_input
->
data
<
T
>
();
blas
.
GEMM
(
CblasNoTrans
,
CblasTrans
,
dA_m
,
dA_n
,
dA_k
,
alpha
,
d_output
->
data
<
T
>
(),
weight
->
data
<
T
>
(),
beta_dA
,
dA_output_ptr
);
}
}
}
else
if
(
transB_
)
{
PADDLE_THROW
(
platform
::
errors
::
InvalidArgument
(
"AttnMatMul wrapper do not support (transA=T, transB=T)"
"parameters."
));
}
else
{
PADDLE_THROW
(
platform
::
errors
::
InvalidArgument
(
"AttnMatMul wrapper do not support (transA=T, transB=N)"
"AttnMatMul wrapper do not support (transA=T, transB=
T/
N)"
"parameters."
));
}
if
(
compute_bias_
)
{
// reduce: {0, 1, 2, 3, 4} -> {2, 3, 4} or {0, 1, 2} -> {2}
if
(
compute_bias_
&&
d_bias
)
{
// reduce: {0, 1, 2, 3, 4} -> {2, 3, 4} or {0, 1, 2} -> {2} or {0,1,2,3}
// -> {3} or {0,1,2,3,4} -> {3,4}
const
auto
input_dims
=
d_output
->
dims
();
const
auto
output_dims
=
d_bias
->
dims
();
bool
support_case_1
=
...
...
@@ -163,11 +148,22 @@ class AttnMatMul {
bool
support_case_2
=
(
input_dims
.
size
()
==
3
&&
output_dims
.
size
()
==
1
&&
(
input_dims
[
2
]
==
output_dims
[
0
]));
bool
support_case_3
=
(
input_dims
.
size
()
==
4
&&
output_dims
.
size
()
==
1
&&
input_dims
[
3
]
==
output_dims
[
0
]);
bool
support_case_4
=
(
input_dims
.
size
()
==
5
&&
output_dims
.
size
()
==
2
&&
input_dims
[
3
]
==
output_dims
[
0
]
&&
input_dims
[
4
]
==
output_dims
[
1
]);
gpuStream_t
stream
=
dev_ctx_
.
stream
();
if
(
support_case_1
||
support_case_2
)
{
gpuStream_t
stream
=
dev_ctx_
.
stream
();
TensorReduceImpl
<
T
,
T
,
kps
::
AddFunctor
,
kps
::
IdentityFunctor
<
T
>>
(
dev_ctx_
,
*
d_output
,
d_bias
,
kps
::
IdentityFunctor
<
T
>
(),
{
0
,
1
},
stream
);
}
else
if
(
support_case_3
||
support_case_4
)
{
TensorReduceImpl
<
T
,
T
,
kps
::
AddFunctor
,
kps
::
IdentityFunctor
<
T
>>
(
dev_ctx_
,
*
d_output
,
d_bias
,
kps
::
IdentityFunctor
<
T
>
(),
{
0
,
1
,
2
},
stream
);
}
else
{
PADDLE_THROW
(
platform
::
errors
::
InvalidArgument
(
"Only support reduce when the input dims are [0,1,2,3,4] and "
...
...
paddle/fluid/operators/fused/fmha_ref.h
浏览文件 @
fdcdbec5
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...
...
@@ -297,7 +300,6 @@ class FMHARef {
phi
::
SoftmaxBackwardCUDAKernelDriver
<
T
>
(
dev_ctx_
,
softmax_out_tensor
,
*
softmax_out_grad_tensor
,
softmax_axis
,
src_mask_out_grad_tensor
);
// recall LaunchElementwiseCudaKernel fw: src_mask_out = qk_out +
// src_mask
// Special case when dy is not needed and dx doesn't reduce
...
...
paddle/fluid/operators/fused/fused_gate_attention.h
0 → 100644
浏览文件 @
fdcdbec5
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h"
#include "paddle/fluid/operators/transpose_op.cu.h"
#include "paddle/phi/kernels/funcs/broadcast_function.h"
#include "paddle/phi/kernels/funcs/elementwise_base.h"
#include "paddle/phi/kernels/funcs/elementwise_functor.h"
#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h"
namespace
paddle
{
namespace
operators
{
using
Tensor
=
framework
::
Tensor
;
// Formats a one-line debug description of a tensor's memory footprint.
//
// The string contains the tensor's dims, its memory_size() converted to MB,
// its data pointer, and the amount of GPU memory currently in use
// (total - available, as reported by platform::GpuMemoryUsage), also in MB.
inline std::string MemoryDebugString(const Tensor& t) {
  // Divide in float so fractional megabytes are kept.
  const float tensor_mb = static_cast<float>(t.memory_size()) / (1 << 20);

  std::stringstream ss;
  ss << "shape=[" << t.dims() << "], size=" << tensor_mb
     << " MB, ptr=" << t.data();

  // GpuMemoryUsage fills both counters; "allocated" is their difference.
  size_t total = 0;
  size_t available = 0;
  platform::GpuMemoryUsage(&available, &total);
  const float used_mb = static_cast<float>(total - available) / (1 << 20);

  ss << "; memory allocated=" << used_mb << " MB";
  return ss.str();
}
// Element-wise functor that returns the sum of its three operands.
// The operands are added left to right: (a + b) + c.
template <typename T>
struct TernaryAddFunctor {
  inline HOSTDEVICE T operator()(T a, T b, T c) const {
    const auto partial_sum = a + b;
    return partial_sum + c;
  }
};
// Shape bookkeeping and lazily allocated scratch tensors for the forward pass
// of the fused gate attention operator.
//
// The constructor reads the problem sizes from the dims of the input tensors
// and precomputes the dims of every intermediate result. Each Get*Out()
// accessor resizes and allocates its tensor on dev_ctx's place the first time
// it is called and returns the same tensor afterwards. ClearQKVOut() and
// ClearQKOut() call clear() on the corresponding scratch tensor.
template <typename T>
struct GateAttentionConfig {
 public:
  int64_t batch_size;
  int64_t seq_len_m;
  int64_t seq_len_r;
  int64_t q_dim;
  int64_t kv_dim;
  int64_t key_dim;
  int64_t m_size;
  int64_t num_heads;

  // Only set when merge_qkv is true.
  phi::DDim qkv_out_dims;
  phi::DDim qkv_transpose_out_dims;

  // Only set when merge_qkv is false.
  phi::DDim q_out_dims;
  phi::DDim kv_out_dims;
  phi::DDim q_transpose_out_dims;
  phi::DDim kv_transpose_out_dims;

  // Always set.
  phi::DDim qk_out_dims;
  phi::DDim softmax_out_dims;
  phi::DDim qktv_out_dims;
  phi::DDim gate_out_dims;

  // Derives all sizes and intermediate dims from the inputs.
  //
  // When merge_qkv is true, qkv_weight must be non-null; otherwise key and
  // query_weight must be non-null. A violated requirement raises a NotFound
  // error through PADDLE_ENFORCE_NOT_NULL.
  GateAttentionConfig(const Tensor* query, const Tensor* key,
                      const Tensor* query_weight, const Tensor* qkv_weight,
                      bool merge_qkv) {
    // query: shape=[batch_size, seq_len_m, seq_len_r, q_dim]
    batch_size = query->dims()[0];
    seq_len_m = query->dims()[1];
    seq_len_r = query->dims()[2];
    q_dim = query->dims()[3];

    if (merge_qkv) {
      PADDLE_ENFORCE_NOT_NULL(
          qkv_weight,
          platform::errors::NotFound("The input qkv_weight can not be nullptr "
                                     "when merge_qkv is true."));

      // When q_dim == kv_dim, QKV matmul can be computed merged.
      // qkv_weight: shape=[3, num_heads, key_dim, q_dim]
      num_heads = qkv_weight->dims()[1];
      key_dim = qkv_weight->dims()[2];
      m_size = seq_len_r;
      kv_dim = q_dim;

      qkv_out_dims = {batch_size, seq_len_m, seq_len_r, 3, num_heads, key_dim};
      qkv_transpose_out_dims = {3,         batch_size, seq_len_m,
                                num_heads, seq_len_r,  key_dim};
    } else {
      PADDLE_ENFORCE_NOT_NULL(
          key,
          platform::errors::NotFound(
              "The input key can not be nullptr when merge_qkv is false."));
      PADDLE_ENFORCE_NOT_NULL(
          query_weight,
          platform::errors::NotFound("The input query_weight can not be "
                                     "nullptr when merge_qkv is false."));

      // When q_dim != kv_dim, QKV matmul must be computed separately.
      // key: shape=[batch_size, seq_len_m, m_size, kv_dim]
      // query_w: shape=[q_dim, num_heads, key_dim]
      num_heads = query_weight->dims()[1];
      key_dim = query_weight->dims()[2];
      m_size = key->dims()[2];
      kv_dim = key->dims()[3];

      q_out_dims = {batch_size, seq_len_m, seq_len_r, num_heads, key_dim};
      kv_out_dims = {batch_size, seq_len_m, m_size, num_heads, key_dim};
      q_transpose_out_dims = {batch_size, seq_len_m, num_heads, seq_len_r,
                              key_dim};
      kv_transpose_out_dims = {batch_size, seq_len_m, num_heads, m_size,
                               key_dim};
    }

    qk_out_dims = {batch_size, seq_len_m, num_heads, seq_len_r, m_size};
    softmax_out_dims = {batch_size, seq_len_m, num_heads, seq_len_r, m_size};
    qktv_out_dims = {batch_size, seq_len_m, num_heads, seq_len_r, key_dim};
    gate_out_dims = {batch_size, seq_len_m, seq_len_r, num_heads, key_dim};
  }

  // Number of elements in the projected query:
  // batch_size * seq_len_m * seq_len_r * num_heads * key_dim.
  int64_t GetQuerySize() const {
    return batch_size * seq_len_m * seq_len_r * num_heads * key_dim;
  }

  // Returns the merged QKV projection buffer, allocating it on first use.
  Tensor* GetQKVOut(const platform::CUDADeviceContext& dev_ctx) {
    if (!qkv_out.IsInitialized()) {
      qkv_out.Resize(qkv_out_dims);
      qkv_out.mutable_data<T>(dev_ctx.GetPlace());
      VLOG(4) << "qkv_out: " << MemoryDebugString(qkv_out);
    }
    return &qkv_out;
  }

  // Returns the query projection buffer, allocating it on first use.
  Tensor* GetQueryOut(const platform::CUDADeviceContext& dev_ctx) {
    if (!query_out.IsInitialized()) {
      query_out.Resize(q_out_dims);
      query_out.mutable_data<T>(dev_ctx.GetPlace());
      VLOG(4) << "query_out: " << MemoryDebugString(query_out);
    }
    return &query_out;
  }

  // Returns the key projection buffer, allocating it on first use.
  Tensor* GetKeyOut(const platform::CUDADeviceContext& dev_ctx) {
    if (!key_out.IsInitialized()) {
      key_out.Resize(kv_out_dims);
      key_out.mutable_data<T>(dev_ctx.GetPlace());
      VLOG(4) << "key_out: " << MemoryDebugString(key_out);
    }
    return &key_out;
  }

  // Returns the value projection buffer, allocating it on first use.
  Tensor* GetValueOut(const platform::CUDADeviceContext& dev_ctx) {
    if (!value_out.IsInitialized()) {
      value_out.Resize(kv_out_dims);
      value_out.mutable_data<T>(dev_ctx.GetPlace());
      VLOG(4) << "value_out: " << MemoryDebugString(value_out);
    }
    return &value_out;
  }

  // Returns the tensor that should receive qk_out = BatchedGEMM(Q, K^T).
  //
  // If softmax_out is null, or phi::UseCudnnSoftmax reports that the cuDNN
  // softmax will be used, a dedicated scratch tensor is returned (allocated
  // on first use). Otherwise softmax_out itself is returned so the softmax
  // can run in place.
  Tensor* GetQKOut(const platform::CUDADeviceContext& dev_ctx,
                   Tensor* softmax_out) {
    // softmax_dim = qk_out_dim[-1] = qk_out_dim[rank - 1]
    int softmax_dim = m_size;
    if (!softmax_out || phi::UseCudnnSoftmax<T>(dev_ctx, softmax_dim, true)) {
      // Not sure whether cudnn softmax can execute inplace.
      // The guard must test qk_out, the tensor being allocated here. Testing
      // qkv_out (as before) would return an unallocated qk_out whenever
      // qkv_out happened to still be live.
      if (!qk_out.IsInitialized()) {
        qk_out.Resize(qk_out_dims);
        qk_out.mutable_data<T>(dev_ctx.GetPlace());
        VLOG(4) << "qk_out: " << MemoryDebugString(qk_out);
      }
      return &qk_out;
    } else {
      return softmax_out;
    }
  }

  // Clears the merged QKV scratch tensor if it has been allocated.
  void ClearQKVOut() {
    if (qkv_out.IsInitialized()) {
      qkv_out.clear();
    }
  }

  // Clears the qk_out scratch tensor if it has been allocated.
  void ClearQKOut() {
    if (qk_out.IsInitialized()) {
      qk_out.clear();
    }
  }

 protected:
  // QKV is merged
  Tensor qkv_out;
  // QKV is not merged
  Tensor query_out;
  Tensor key_out;
  Tensor value_out;
  // qk_out = BatchedGEMM(Q, K^T)
  // qk_out: shape=[batch_size, seq_len_m, num_heads, seq_len_r, m_size]
  // softmax_out = softmax(qk_out + nonbatched_bias + src_mask)
  // The shape of qk_out, softmax_out is the same, thus can be called inplace.
  Tensor qk_out;
};
// Backward-pass extension of GateAttentionConfig<T>.
//
// Reuses the dims computed by the base class and adds lazily allocated
// gradient scratch tensors. Every Get*OutGrad() accessor allocates its tensor
// on dev_ctx's place the first time it is called and returns the same tensor
// afterwards.
template <typename T>
struct GateAttentionGradConfig : public GateAttentionConfig<T> {
 public:
  // Forwards all arguments unchanged to GateAttentionConfig<T>.
  GateAttentionGradConfig(const Tensor* query, const Tensor* key,
                          const Tensor* query_weight, const Tensor* qkv_weight,
                          bool merge_qkv)
      : GateAttentionConfig<T>(query, key, query_weight, qkv_weight,
                               merge_qkv) {}

  // Gradient buffer with dims qkv_out_dims.
  Tensor* GetQKVOutGrad(const platform::CUDADeviceContext& dev_ctx) {
    if (qkv_out_grad.IsInitialized()) {
      return &qkv_out_grad;
    }
    qkv_out_grad.Resize(this->qkv_out_dims);
    qkv_out_grad.mutable_data<T>(dev_ctx.GetPlace());
    VLOG(4) << "qkv_out_grad: " << MemoryDebugString(qkv_out_grad);
    return &qkv_out_grad;
  }

  // Gradient buffer with dims q_out_dims.
  Tensor* GetQueryOutGrad(const platform::CUDADeviceContext& dev_ctx) {
    if (query_out_grad.IsInitialized()) {
      return &query_out_grad;
    }
    query_out_grad.Resize(this->q_out_dims);
    query_out_grad.mutable_data<T>(dev_ctx.GetPlace());
    VLOG(4) << "query_out_grad: " << MemoryDebugString(query_out_grad);
    return &query_out_grad;
  }

  // Gradient buffer with dims kv_out_dims (key).
  Tensor* GetKeyOutGrad(const platform::CUDADeviceContext& dev_ctx) {
    if (key_out_grad.IsInitialized()) {
      return &key_out_grad;
    }
    key_out_grad.Resize(this->kv_out_dims);
    key_out_grad.mutable_data<T>(dev_ctx.GetPlace());
    VLOG(4) << "key_out_grad: " << MemoryDebugString(key_out_grad);
    return &key_out_grad;
  }

  // Gradient buffer with dims kv_out_dims (value).
  Tensor* GetValueOutGrad(const platform::CUDADeviceContext& dev_ctx) {
    if (value_out_grad.IsInitialized()) {
      return &value_out_grad;
    }
    value_out_grad.Resize(this->kv_out_dims);
    value_out_grad.mutable_data<T>(dev_ctx.GetPlace());
    VLOG(4) << "value_out_grad: " << MemoryDebugString(value_out_grad);
    return &value_out_grad;
  }

  // Returns the tensor that should receive the gradient of qk_out.
  //
  // If softmax_out_grad is null, or phi::UseCudnnSoftmax reports that the
  // cuDNN softmax will be used, a dedicated scratch tensor is returned
  // (allocated on first use). Otherwise softmax_out_grad itself is returned.
  Tensor* GetQKOutGrad(const platform::CUDADeviceContext& dev_ctx,
                       Tensor* softmax_out_grad) {
    // The softmax runs over the last axis of qk_out, whose extent is m_size.
    int softmax_dim = this->m_size;
    const bool needs_own_buffer =
        !softmax_out_grad ||
        phi::UseCudnnSoftmax<T>(dev_ctx, softmax_dim, true);
    if (!needs_own_buffer) {
      return softmax_out_grad;
    }

    if (!qk_out_grad.IsInitialized()) {
      qk_out_grad.Resize(this->qk_out_dims);
      qk_out_grad.mutable_data<T>(dev_ctx.GetPlace());
      VLOG(4) << "qk_out_grad: " << MemoryDebugString(qk_out_grad);
    }
    return &qk_out_grad;
  }

 protected:
  // Gradient of the merged QKV projection.
  Tensor qkv_out_grad;
  // Gradients of the separate Q/K/V projections.
  Tensor query_out_grad;
  Tensor key_out_grad;
  Tensor value_out_grad;
  // Gradient of qk_out.
  Tensor qk_out_grad;
};
template
<
typename
T
>
class
FMHAGateRef
{
public:
// Stores the device context reference and the merge_qkv flag; does no other
// work. merge_qkv selects between the merged-QKV code path and the separate
// Q/K/V code path in ComputeForward / ComputeBackward.
// dev_ctx is held by reference (see the dev_ctx_ member), so it must outlive
// this object.
FMHAGateRef
(
const
platform
::
CUDADeviceContext
&
dev_ctx
,
bool
merge_qkv
)
:
dev_ctx_
(
dev_ctx
),
merge_qkv_
(
merge_qkv
)
{}
void
ComputeForward
(
const
Tensor
*
nonbatched_bias
,
const
Tensor
*
src_mask
,
Tensor
*
q_transpose_out
,
Tensor
*
k_transpose_out
,
Tensor
*
v_transpose_out
,
Tensor
*
qkv_transpose_out
,
Tensor
*
softmax_out
,
Tensor
*
fmha_out
,
GateAttentionConfig
<
T
>*
config
)
{
T
*
q_ptr
=
nullptr
;
T
*
k_ptr
=
nullptr
;
T
*
v_ptr
=
nullptr
;
if
(
merge_qkv_
)
{
// qkv_transpose_out = transpose(qkv_out)
PADDLE_ENFORCE_NOT_NULL
(
qkv_transpose_out
,
platform
::
errors
::
NotFound
(
"The input qkv_transpose_out can not be "
"nullptr when merge_qkv is true."
));
Tensor
*
qkv_out
=
config
->
GetQKVOut
(
dev_ctx_
);
ComputeQKVTransposeForward
(
*
qkv_out
,
qkv_transpose_out
);
config
->
ClearQKVOut
();
// q_size == k_size
int64_t
q_size
=
config
->
GetQuerySize
();
q_ptr
=
qkv_transpose_out
->
data
<
T
>
();
k_ptr
=
q_ptr
+
q_size
;
v_ptr
=
k_ptr
+
q_size
;
}
else
{
PADDLE_ENFORCE_NOT_NULL
(
q_transpose_out
,
platform
::
errors
::
NotFound
(
"The input q_transpose_out can not be "
"nullptr when merge_qkv is false."
));
PADDLE_ENFORCE_NOT_NULL
(
k_transpose_out
,
platform
::
errors
::
NotFound
(
"The input k_transpose_out can not be "
"nullptr when merge_qkv is false."
));
PADDLE_ENFORCE_NOT_NULL
(
v_transpose_out
,
platform
::
errors
::
NotFound
(
"The input v_transpose_out can not be "
"nullptr when merge_qkv is false."
));
Tensor
*
query_out
=
config
->
GetQueryOut
(
dev_ctx_
);
Tensor
*
key_out
=
config
->
GetKeyOut
(
dev_ctx_
);
Tensor
*
value_out
=
config
->
GetValueOut
(
dev_ctx_
);
ComputeQKVTransposeForward
(
*
query_out
,
*
key_out
,
*
value_out
,
q_transpose_out
,
k_transpose_out
,
v_transpose_out
);
// q_size != k_size
q_ptr
=
q_transpose_out
->
data
<
T
>
();
k_ptr
=
k_transpose_out
->
data
<
T
>
();
v_ptr
=
v_transpose_out
->
data
<
T
>
();
}
// qk_out = BatchedGEMM(Q, K^T)
// [batch_size, seq_len_m, num_heads, seq_len_r, key_dim] *
// [batch_size, seq_len_m, num_heads, m_size, key_dim]
// -> [batch_size, seq_len_m, num_heads, seq_len_r, m_size]
Tensor
*
qk_out
=
config
->
GetQKOut
(
dev_ctx_
,
softmax_out
);
T
*
qk_out_ptr
=
qk_out
->
data
<
T
>
();
int64_t
gemm_batch_size
=
config
->
batch_size
*
config
->
seq_len_m
*
config
->
num_heads
;
int64_t
gemm_m
=
config
->
seq_len_r
;
int64_t
gemm_n
=
config
->
m_size
;
int64_t
gemm_k
=
config
->
key_dim
;
T
alpha
=
static_cast
<
T
>
(
1.0
/
sqrt
(
config
->
key_dim
));
ComputeBatchedGEMM
(
q_ptr
,
k_ptr
,
qk_out_ptr
,
false
,
true
,
gemm_m
,
gemm_n
,
gemm_k
,
gemm_batch_size
,
alpha
);
// softmax_out = softmax(qk_out + nonbatched_bias + src_mask)
ComputeBiasMaskSoftmaxForward
(
nonbatched_bias
,
src_mask
,
qk_out
,
softmax_out
);
config
->
ClearQKOut
();
// qktv_out = BatchedGEMM(softmax_out, V)
// [batch_size, seq_len_m, num_heads, seq_len_r, m_size] *
// [batch_size, seq_len_m, num_heads, m_size, key_dim]
// -> [batch_size, seq_len_m, num_heads, seq_len_r, key_dim]
Tensor
qktv_out
;
qktv_out
.
Resize
(
config
->
qktv_out_dims
);
T
*
qktv_out_ptr
=
qktv_out
.
mutable_data
<
T
>
(
dev_ctx_
.
GetPlace
());
gemm_m
=
config
->
seq_len_r
;
gemm_n
=
config
->
key_dim
;
gemm_k
=
config
->
m_size
;
T
*
softmax_out_ptr
=
softmax_out
->
data
<
T
>
();
ComputeBatchedGEMM
(
softmax_out_ptr
,
v_ptr
,
qktv_out_ptr
,
false
,
false
,
gemm_m
,
gemm_n
,
gemm_k
,
gemm_batch_size
);
// fmha_out = transpose(qktv_out)
ComputeQKTVTransposeForward
(
qktv_out
,
fmha_out
);
}
void
ComputeBackward
(
const
Tensor
*
q_transpose_out
,
const
Tensor
*
k_transpose_out
,
const
Tensor
*
v_transpose_out
,
const
Tensor
*
qkv_transpose_out
,
const
Tensor
*
softmax_out
,
const
Tensor
*
fmha_out_grad
,
Tensor
*
src_mask_grad
,
Tensor
*
nonbatched_bias_grad
,
GateAttentionGradConfig
<
T
>*
config
)
{
const
T
*
q_ptr
=
nullptr
;
const
T
*
k_ptr
=
nullptr
;
const
T
*
v_ptr
=
nullptr
;
T
*
q_grad_ptr
=
nullptr
;
T
*
k_grad_ptr
=
nullptr
;
T
*
v_grad_ptr
=
nullptr
;
Tensor
q_transpose_out_grad
;
Tensor
k_transpose_out_grad
;
Tensor
v_transpose_out_grad
;
Tensor
qkv_transpose_out_grad
;
if
(
merge_qkv_
)
{
PADDLE_ENFORCE_NOT_NULL
(
qkv_transpose_out
,
platform
::
errors
::
NotFound
(
"The input qkv_transpose_out can not be "
"nullptr when merge_qkv is true."
));
int64_t
q_size
=
config
->
GetQuerySize
();
q_ptr
=
qkv_transpose_out
->
data
<
T
>
();
k_ptr
=
q_ptr
+
q_size
;
v_ptr
=
k_ptr
+
q_size
;
qkv_transpose_out_grad
.
Resize
(
config
->
qkv_transpose_out_dims
);
q_grad_ptr
=
qkv_transpose_out_grad
.
mutable_data
<
T
>
(
dev_ctx_
.
GetPlace
());
k_grad_ptr
=
q_grad_ptr
+
q_size
;
v_grad_ptr
=
k_grad_ptr
+
q_size
;
}
else
{
PADDLE_ENFORCE_NOT_NULL
(
q_transpose_out
,
platform
::
errors
::
NotFound
(
"The input q_transpose_out can not be "
"nullptr when merge_qkv is false."
));
PADDLE_ENFORCE_NOT_NULL
(
k_transpose_out
,
platform
::
errors
::
NotFound
(
"The input k_transpose_out can not be "
"nullptr when merge_qkv is false."
));
PADDLE_ENFORCE_NOT_NULL
(
v_transpose_out
,
platform
::
errors
::
NotFound
(
"The input v_transpose_out can not be "
"nullptr when merge_qkv is false."
));
q_ptr
=
q_transpose_out
->
data
<
T
>
();
k_ptr
=
k_transpose_out
->
data
<
T
>
();
v_ptr
=
v_transpose_out
->
data
<
T
>
();
q_transpose_out_grad
.
Resize
(
config
->
q_transpose_out_dims
);
k_transpose_out_grad
.
Resize
(
config
->
kv_transpose_out_dims
);
v_transpose_out_grad
.
Resize
(
config
->
kv_transpose_out_dims
);
q_grad_ptr
=
q_transpose_out_grad
.
mutable_data
<
T
>
(
dev_ctx_
.
GetPlace
());
k_grad_ptr
=
k_transpose_out_grad
.
mutable_data
<
T
>
(
dev_ctx_
.
GetPlace
());
v_grad_ptr
=
v_transpose_out_grad
.
mutable_data
<
T
>
(
dev_ctx_
.
GetPlace
());
}
Tensor
softmax_out_grad
;
softmax_out_grad
.
Resize
(
config
->
softmax_out_dims
);
softmax_out_grad
.
mutable_data
<
T
>
(
dev_ctx_
.
GetPlace
());
int64_t
gemm_batch_size
=
config
->
batch_size
*
config
->
seq_len_m
*
config
->
num_heads
;
{
// Forward: fmha_out = transpose(qktv_out)
Tensor
qktv_out_grad
;
qktv_out_grad
.
Resize
(
config
->
qktv_out_dims
);
T
*
qktv_out_grad_ptr
=
qktv_out_grad
.
mutable_data
<
T
>
(
dev_ctx_
.
GetPlace
());
ComputeQKTVTransposeBackward
(
*
fmha_out_grad
,
&
qktv_out_grad
);
// Forward: qktv_out = BatchedGEMM(softmax_out, V)
// Backward:
// V_grad = BatchedGEMM(softmax_out^T, qktv_out_grad) (dy = x^T * dout)
int64_t
gemm_m
=
config
->
m_size
;
int64_t
gemm_n
=
config
->
key_dim
;
int64_t
gemm_k
=
config
->
seq_len_r
;
const
T
*
softmax_out_ptr
=
softmax_out
->
data
<
T
>
();
ComputeBatchedGEMM
(
softmax_out_ptr
,
qktv_out_grad_ptr
,
v_grad_ptr
,
true
,
false
,
gemm_m
,
gemm_n
,
gemm_k
,
gemm_batch_size
);
// Backward: softmax_out_grad = qktv_out_grad * V^T (dx = dout * y^T)
gemm_m
=
config
->
seq_len_r
;
gemm_n
=
config
->
m_size
;
gemm_k
=
config
->
key_dim
;
T
*
softmax_out_grad_ptr
=
softmax_out_grad
.
data
<
T
>
();
ComputeBatchedGEMM
(
qktv_out_grad_ptr
,
v_ptr
,
softmax_out_grad_ptr
,
false
,
true
,
gemm_m
,
gemm_n
,
gemm_k
,
gemm_batch_size
);
}
Tensor
*
qk_out_grad
=
config
->
GetQKOutGrad
(
dev_ctx_
,
&
softmax_out_grad
);
ComputeBiasMaskSoftmaxBackward
(
&
softmax_out_grad
,
softmax_out
,
src_mask_grad
,
qk_out_grad
,
nonbatched_bias_grad
);
// Forward: qk_out = BatchedGEMM(Q, K^T)
// Backward: k_grad = BatchedGEMM(qk_out_grad^T, Q) (dy = dout^t * x)
int64_t
gemm_m
=
config
->
m_size
;
int64_t
gemm_n
=
config
->
key_dim
;
int64_t
gemm_k
=
config
->
seq_len_r
;
T
alpha
=
static_cast
<
T
>
(
1.0
/
sqrt
(
config
->
key_dim
));
T
*
qk_out_grad_ptr
=
qk_out_grad
->
data
<
T
>
();
ComputeBatchedGEMM
(
qk_out_grad_ptr
,
q_ptr
,
k_grad_ptr
,
true
,
false
,
gemm_m
,
gemm_n
,
gemm_k
,
gemm_batch_size
,
alpha
);
// Backward: q_grad = BatchedGEMM(qk_out_grad, K) (dx = dout * y)
gemm_m
=
config
->
seq_len_r
;
gemm_n
=
config
->
key_dim
;
gemm_k
=
config
->
m_size
;
ComputeBatchedGEMM
(
qk_out_grad_ptr
,
k_ptr
,
q_grad_ptr
,
false
,
false
,
gemm_m
,
gemm_n
,
gemm_k
,
gemm_batch_size
,
alpha
);
if
(
merge_qkv_
)
{
Tensor
*
qkv_out_grad
=
config
->
GetQKVOutGrad
(
dev_ctx_
);
ComputeQKVTransposeBackward
(
qkv_transpose_out_grad
,
qkv_out_grad
);
}
else
{
Tensor
*
q_out_grad
=
config
->
GetQueryOutGrad
(
dev_ctx_
);
Tensor
*
k_out_grad
=
config
->
GetKeyOutGrad
(
dev_ctx_
);
Tensor
*
v_out_grad
=
config
->
GetValueOutGrad
(
dev_ctx_
);
ComputeQKVTransposeBackward
(
q_transpose_out_grad
,
k_transpose_out_grad
,
v_transpose_out_grad
,
q_out_grad
,
k_out_grad
,
v_out_grad
);
}
}
void
ComputeQKVTransposeForward
(
const
Tensor
&
q_out
,
const
Tensor
&
k_out
,
const
Tensor
&
v_out
,
Tensor
*
q_transpose_out
,
Tensor
*
k_transpose_out
,
Tensor
*
v_transpose_out
)
{
int
ndims
=
5
;
std
::
vector
<
int
>
perm
=
{
0
,
1
,
3
,
2
,
4
};
TransposeGPUKernelDriver
<
T
>
(
dev_ctx_
,
ndims
,
q_out
,
perm
,
q_transpose_out
);
TransposeGPUKernelDriver
<
T
>
(
dev_ctx_
,
ndims
,
k_out
,
perm
,
k_transpose_out
);
TransposeGPUKernelDriver
<
T
>
(
dev_ctx_
,
ndims
,
v_out
,
perm
,
v_transpose_out
);
}
void
ComputeQKVTransposeBackward
(
const
Tensor
&
q_transpose_out_grad
,
const
Tensor
&
k_transpose_out_grad
,
const
Tensor
&
v_transpose_out_grad
,
Tensor
*
q_out_grad
,
Tensor
*
k_out_grad
,
Tensor
*
v_out_grad
)
{
int
ndims
=
5
;
std
::
vector
<
int
>
perm
=
{
0
,
1
,
3
,
2
,
4
};
TransposeGPUKernelDriver
<
T
>
(
dev_ctx_
,
ndims
,
q_transpose_out_grad
,
perm
,
q_out_grad
);
TransposeGPUKernelDriver
<
T
>
(
dev_ctx_
,
ndims
,
k_transpose_out_grad
,
perm
,
k_out_grad
);
TransposeGPUKernelDriver
<
T
>
(
dev_ctx_
,
ndims
,
v_transpose_out_grad
,
perm
,
v_out_grad
);
}
// Transposes the merged QKV projection:
//   [batch_size, seq_len_m, seq_len_r, 3, num_heads, key_dim] ->
//   [3, batch_size, seq_len_m, num_heads, seq_len_r, key_dim]
void ComputeQKVTransposeForward(const Tensor& qkv_out,
                                Tensor* qkv_transpose_out) {
  std::vector<int> perm = {3, 0, 1, 4, 2, 5};
  // The rank equals the number of permuted axes (6).
  int ndims = static_cast<int>(perm.size());
  TransposeGPUKernelDriver<T>(dev_ctx_, ndims, qkv_out, perm,
                              qkv_transpose_out);
}
// Inverse of the merged-QKV forward transpose, applied to the gradient:
//   [3, batch_size, seq_len_m, num_heads, seq_len_r, key_dim] ->
//   [batch_size, seq_len_m, seq_len_r, 3, num_heads, key_dim]
void ComputeQKVTransposeBackward(const Tensor& qkv_transpose_out_grad,
                                 Tensor* qkv_out_grad) {
  std::vector<int> perm = {1, 2, 4, 0, 3, 5};
  // The rank equals the number of permuted axes (6).
  int ndims = static_cast<int>(perm.size());
  TransposeGPUKernelDriver<T>(dev_ctx_, ndims, qkv_transpose_out_grad, perm,
                              qkv_out_grad);
}
// Transposes the attention output by swapping axes 2 and 3:
//   [batch_size, seq_len_m, num_head, seq_len_r, c] ->
//   [batch_size, seq_len_m, seq_len_r, num_head, c]
void ComputeQKTVTransposeForward(const Tensor& qktv_out, Tensor* fmha_out) {
  std::vector<int> perm = {0, 1, 3, 2, 4};
  // The rank equals the number of permuted axes (5).
  int ndims = static_cast<int>(perm.size());
  TransposeGPUKernelDriver<T>(dev_ctx_, ndims, qktv_out, perm, fmha_out);
}
// Transposes fmha_out_grad back to the layout of qktv_out by swapping axes 2
// and 3 (the same permutation as the forward pass, which is its own inverse).
void ComputeQKTVTransposeBackward(const Tensor& fmha_out_grad,
                                  Tensor* qktv_out_grad) {
  std::vector<int> perm = {0, 1, 3, 2, 4};
  // The rank equals the number of permuted axes (5).
  int ndims = static_cast<int>(perm.size());
  TransposeGPUKernelDriver<T>(dev_ctx_, ndims, fmha_out_grad, perm,
                              qktv_out_grad);
}
// Adds the bias/mask terms to qk_out in place, then applies softmax over the
// last axis:
//   qk_out      = qk_out + nonbatched_bias + src_mask   (bias present)
//   qk_out      = qk_out + src_mask                     (bias is null)
//   softmax_out = softmax(qk_out)
// The additions go through BroadcastKernel with axis -1. src_mask is always
// passed to the kernel and is not checked for null here.
void ComputeBiasMaskSoftmaxForward(const Tensor* nonbatched_bias,
                                   const Tensor* src_mask, Tensor* qk_out,
                                   Tensor* softmax_out) {
  // The sum is written back into qk_out in both cases.
  std::vector<Tensor*> outs = {qk_out};

  if (!nonbatched_bias) {
    std::vector<const Tensor*> ins = {qk_out, src_mask};
    phi::funcs::BroadcastKernel<ElementwiseType::kBinary, T, T>(
        dev_ctx_, ins, &outs, -1, phi::funcs::AddFunctor<T>());
  } else {
    std::vector<const Tensor*> ins = {qk_out, nonbatched_bias, src_mask};
    phi::funcs::BroadcastKernel<ElementwiseType::kTernary, T, T>(
        dev_ctx_, ins, &outs, -1, TernaryAddFunctor<T>());
  }

  phi::SoftmaxForwardCUDAKernelDriver<T>(dev_ctx_, *qk_out, -1, softmax_out);
}
// Backward of the bias + mask + softmax stage, whose forward is:
//   src_mask_out = qk_out + nonbatched_bias + src_mask
//   softmax_out  = softmax(src_mask_out)
//
// softmax_out_grad:     gradient w.r.t. softmax_out; must not be null.
// softmax_out:          softmax result saved by the forward pass; must not
//                       be null.
// src_mask_grad:        must be nullptr; no gradient is produced for the mask.
// qk_out_grad:          receives the gradient w.r.t. the pre-softmax logits;
//                       must already have the same dims as softmax_out.
// nonbatched_bias_grad: optional; when non-null it receives qk_out_grad
//                       summed over the axis the bias was broadcast along.
void ComputeBiasMaskSoftmaxBackward(const Tensor* softmax_out_grad,
                                    const Tensor* softmax_out,
                                    Tensor* src_mask_grad,
                                    Tensor* qk_out_grad,
                                    Tensor* nonbatched_bias_grad) {
  PADDLE_ENFORCE_NOT_NULL(
      qk_out_grad,
      platform::errors::NotFound("The qk_out_grad can not be nullptr."));
  // Both tensors are dereferenced below, so reject null pointers with a
  // readable error instead of crashing.
  PADDLE_ENFORCE_NOT_NULL(
      softmax_out,
      platform::errors::NotFound("The softmax_out can not be nullptr."));
  PADDLE_ENFORCE_NOT_NULL(
      softmax_out_grad,
      platform::errors::NotFound("The softmax_out_grad can not be nullptr."));

  PADDLE_ENFORCE_EQ(qk_out_grad->dims(), softmax_out->dims(),
                    platform::errors::InvalidArgument(
                        "The shape of qk_out_grad and softmax_out is "
                        "expected to be the same. But received qk_out_grad's "
                        "shape = %s, softmax_out's shape = %s.",
                        qk_out_grad->dims(), softmax_out->dims()));

  PADDLE_ENFORCE_EQ(src_mask_grad, nullptr,
                    platform::errors::InvalidArgument(
                        "src_mask_grad is expected to be nullptr."));

  phi::SoftmaxBackwardCUDAKernelDriver<T>(dev_ctx_, *softmax_out,
                                          *softmax_out_grad, -1, qk_out_grad);

  if (nonbatched_bias_grad) {
    // qk_out_grad has the SoftmaxOut layout
    //   [batch_size, seq_len_m, num_heads, seq_len_r, m_size].
    // The op's reference formula is
    //   logits += paddle.unsqueeze(nonbatched_bias, axis=1)
    // i.e. the bias is broadcast only along seq_len_m, so only axis 1 is
    // summed. Reducing axis 0 as well would add up the gradients of different
    // batch samples and only happens to be correct when batch_size == 1.
    const std::vector<int> reduce_axes = {1};
    gpuStream_t stream = dev_ctx_.stream();
    TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
        dev_ctx_, *qk_out_grad, nonbatched_bias_grad,
        kps::IdentityFunctor<T>(), reduce_axes, stream);
  }
}
private:
void
ComputeBatchedGEMM
(
const
T
*
a_ptr
,
const
T
*
b_ptr
,
T
*
c_ptr
,
bool
trans_a
,
bool
trans_b
,
int64_t
m
,
int64_t
n
,
int64_t
k
,
int64_t
batch_size
,
T
alpha
=
static_cast
<
T
>
(
1.0
),
T
beta
=
static_cast
<
T
>
(
0.0
))
{
CBLAS_TRANSPOSE
cblas_trans_a
=
trans_a
?
CblasTrans
:
CblasNoTrans
;
CBLAS_TRANSPOSE
cblas_trans_b
=
trans_b
?
CblasTrans
:
CblasNoTrans
;
int64_t
stride_a
=
m
*
k
;
int64_t
stride_b
=
k
*
n
;
auto
blas
=
phi
::
funcs
::
GetBlas
<
platform
::
CUDADeviceContext
,
T
>
(
dev_ctx_
);
blas
.
BatchedGEMM
(
cblas_trans_a
,
cblas_trans_b
,
m
,
n
,
k
,
alpha
,
a_ptr
,
b_ptr
,
beta
,
c_ptr
,
batch_size
,
stride_a
,
stride_b
);
}
const
platform
::
CUDADeviceContext
&
dev_ctx_
;
bool
merge_qkv_
;
};
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/fused/fused_gate_attention_op.cc
0 → 100644
浏览文件 @
fdcdbec5
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/framework/op_registry.h"
namespace
paddle
{
namespace
operators
{
using
Tensor
=
framework
::
Tensor
;
using
DDim
=
framework
::
DDim
;
// Forward operator definition of fused_gate_attention. InferShape validates
// that the inputs/outputs required by the selected attributes are present
// and derives the static shapes of all outputs from Query and the weights.
class FusedGateAttentionOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override {
    OP_INOUT_CHECK(ctx->HasInput("Query"), "Input", "Query",
                   "fused_gate_attention");
    OP_INOUT_CHECK(ctx->HasInput("OutLinearWeight"), "Input",
                   "OutLinearWeight", "fused_gate_attention");
    OP_INOUT_CHECK(ctx->HasInput("OutLinearBias"), "Input", "OutLinearBias",
                   "fused_gate_attention");

    OP_INOUT_CHECK(ctx->HasOutput("SoftmaxOut"), "Output", "SoftmaxOut",
                   "fused_gate_attention");
    OP_INOUT_CHECK(ctx->HasOutput("FMHAOut"), "Output", "FMHAOut",
                   "fused_gate_attention");
    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out",
                   "fused_gate_attention");

    // Query: [batch_size, seq_len_m, seq_len_r, q_dim]
    auto input_q_dims = ctx->GetInputDim("Query");
    int batch_size = input_q_dims[0];
    int seq_len_m = input_q_dims[1];
    int seq_len_r = input_q_dims[2];

    int num_head, m_size, key_dim;
    if (ctx->Attrs().Get<bool>("merge_qkv")) {
      // QKV's input: [batch_size, seq_len_m, seq_len_r, qkv_dim]
      // QKV's weight: [3, num_head, key_dim, qkv_dim]
      OP_INOUT_CHECK(ctx->HasInput("QKVWeight"), "Input", "QKVWeight",
                     "fused_gate_attention");
      OP_INOUT_CHECK(ctx->HasOutput("QKVTransposeOut"), "Output",
                     "QKVTransposeOut", "fused_gate_attention");

      auto qkv_w_dims = ctx->GetInputDim("QKVWeight");

      num_head = qkv_w_dims[1];
      key_dim = qkv_w_dims[2];
      // Merged QKV is self-attention: keys come from Query itself.
      m_size = seq_len_r;

      ctx->SetOutputDim(
          "QKVTransposeOut",
          {3, batch_size, seq_len_m, num_head, seq_len_r, key_dim});
    } else {
      // "Key" and the three transpose outputs are declared dispensable, but
      // the separated-QKV path reads/writes all of them, so report a clear
      // error here instead of failing inside GetInputDim/SetOutputDim.
      OP_INOUT_CHECK(ctx->HasInput("Key"), "Input", "Key",
                     "fused_gate_attention");
      OP_INOUT_CHECK(ctx->HasInput("QueryWeight"), "Input", "QueryWeight",
                     "fused_gate_attention");
      OP_INOUT_CHECK(ctx->HasInput("KeyWeight"), "Input", "KeyWeight",
                     "fused_gate_attention");
      OP_INOUT_CHECK(ctx->HasInput("ValueWeight"), "Input", "ValueWeight",
                     "fused_gate_attention");
      OP_INOUT_CHECK(ctx->HasOutput("QueryTransposeOut"), "Output",
                     "QueryTransposeOut", "fused_gate_attention");
      OP_INOUT_CHECK(ctx->HasOutput("KeyTransposeOut"), "Output",
                     "KeyTransposeOut", "fused_gate_attention");
      OP_INOUT_CHECK(ctx->HasOutput("ValueTransposeOut"), "Output",
                     "ValueTransposeOut", "fused_gate_attention");

      auto input_k_dims = ctx->GetInputDim("Key");
      auto q_w_dims = ctx->GetInputDim("QueryWeight");

      num_head = q_w_dims[1];
      key_dim = q_w_dims[2];
      m_size = input_k_dims[2];

      ctx->SetOutputDim("QueryTransposeOut",
                        {batch_size, seq_len_m, num_head, seq_len_r, key_dim});
      ctx->SetOutputDim("KeyTransposeOut",
                        {batch_size, seq_len_m, num_head, m_size, key_dim});
      ctx->SetOutputDim("ValueTransposeOut",
                        {batch_size, seq_len_m, num_head, m_size, key_dim});
    }

    ctx->SetOutputDim("SoftmaxOut",
                      {batch_size, seq_len_m, num_head, seq_len_r, m_size});
    ctx->SetOutputDim("FMHAOut",
                      {batch_size, seq_len_m, seq_len_r, num_head, key_dim});

    if (ctx->Attrs().Get<bool>("has_gating")) {
      OP_INOUT_CHECK(ctx->HasInput("GateWeight"), "Input", "GateWeight",
                     "fused_gate_attention");
      OP_INOUT_CHECK(ctx->HasInput("GateBias"), "Input", "GateBias",
                     "fused_gate_attention");
      // GateOut is dispensable in the op proto but mandatory with gating.
      OP_INOUT_CHECK(ctx->HasOutput("GateOut"), "Output", "GateOut",
                     "fused_gate_attention");
      ctx->SetOutputDim("GateOut",
                        {batch_size, seq_len_m, seq_len_r, num_head, key_dim});
    }

    // The final projection maps back to the shape of Query.
    ctx->SetOutputDim("Out", ctx->GetInputDim("Query"));
  }
};
// Declares the inputs, outputs, attributes and documentation (op proto) of
// the fused_gate_attention operator.
class FusedGateAttentionOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("Query", "The query tensor.");
    AddInput("Key", "The key tensor.").AsDispensable();
    AddInput("QueryWeight", "(optional) The query weight tensor.")
        .AsDispensable();
    AddInput("KeyWeight", "(optional) The key weight tensor.").AsDispensable();
    AddInput("ValueWeight", "(optional) The value weight tensor.")
        .AsDispensable();
    AddInput("QKVWeight", "(optional) The qkv weight tensor.").AsDispensable();
    AddInput("NonbatchedBias", "(optional) The nonbatchedBias tensor.")
        .AsDispensable();
    AddInput("SrcMask", "The attention mask tensor in fmha.");
    AddInput("GateWeight", "(optional) The gate weight tensor.")
        .AsDispensable();
    AddInput("GateBias", "(optional) The gate bias tensor.").AsDispensable();
    AddInput("OutLinearWeight", "The out_linear weight tensor.");
    AddInput("OutLinearBias", "The out_linear bias tensor.");

    // Intermediate results that the forward kernel saves for the backward
    // pass. Which of them are produced depends on merge_qkv / has_gating.
    AddOutput("QueryTransposeOut", "The transposed result of query matmul.")
        .AsIntermediate()
        .AsDispensable();
    AddOutput("KeyTransposeOut", "The transposed result of key matmul.")
        .AsIntermediate()
        .AsDispensable();
    AddOutput("ValueTransposeOut", "The transposed result of value matmul.")
        .AsIntermediate()
        .AsDispensable();
    AddOutput("QKVTransposeOut", "The transposed result of merged QKV matmul.")
        .AsIntermediate()
        .AsDispensable();
    AddOutput("SoftmaxOut", "Result in fmha.").AsIntermediate();
    AddOutput("FMHAOut", "Result in fmha.").AsIntermediate();
    AddOutput("GateOut", "Result of the gating module.")
        .AsIntermediate()
        .AsDispensable();
    AddOutput("Out", "Result after attention.");

    AddAttr<bool>("has_gating",
                  "if true, the attention op uses gate architecture, "
                  "[default true].")
        .SetDefault(true);
    AddAttr<bool>("merge_qkv",
                  "if true, calculation with merged qkv, "
                  "[default true].")
        .SetDefault(true);

    // The pseudo-code lists the steps in execution order: the non-batched
    // bias is added to the logits BEFORE softmax, matching the kernel.
    AddComment(R"DOC(
  Add fused attention op whose logic is as follows:
  {
    q = paddle.einsum('nbqa,ahc->nbqhc', q_data, self.query_w)
    k = paddle.einsum('nbka,ahc->nbkhc', m_data, self.key_w)
    v = paddle.einsum('nbka,ahc->nbkhc', m_data, self.value_w)

    logits = paddle.einsum('nbqhc,nbkhc->nbhqk', q * c , k) + bias
    if nonbatched_bias is not None:
      logits += paddle.unsqueeze(nonbatched_bias, axis=1)
    weights = nn.functional.softmax(logits)
    weighted_avg = paddle.einsum('nbhqk,nbkhc->nbqhc', weights, v)

    if self.gating:
      gate_values = paddle.einsum('nbqc,chv->nbqhv', q_data,
                                  self.gating_w) + self.gating_b
      gate_values_1 = nn.functional.sigmoid(gate_values)
      weighted_avg *= gate_values_1

    output = paddle.einsum('nbqhc,hco->nbqo', weighted_avg,
                           self.output_w) + self.output_b
  }
    )DOC");
  }
};
// Backward operator definition of fused_gate_attention. Every gradient
// output gets the shape of the forward variable it is the gradient of.
class FusedGateAttentionGradOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override {
    OP_INOUT_CHECK(ctx->HasInput("Query"), "Input", "Query",
                   "fused_gate_attention_grad");
    // Query@GRAD and Key@GRAD are only produced when requested.
    if (ctx->HasOutput(framework::GradVarName("Query"))) {
      ctx->SetOutputDim(framework::GradVarName("Query"),
                        ctx->GetInputDim("Query"));
    }
    if (ctx->HasOutput(framework::GradVarName("Key"))) {
      ctx->SetOutputDim(framework::GradVarName("Key"),
                        ctx->GetInputDim("Key"));
    }

    // The op name reported on failure is spelled "fused_gate_attention_grad"
    // everywhere so the message matches the registered operator.
    if (ctx->Attrs().Get<bool>("merge_qkv")) {
      OP_INOUT_CHECK(ctx->HasInput("QKVWeight"), "Input", "QKVWeight",
                     "fused_gate_attention_grad");
      ctx->SetOutputDim(framework::GradVarName("QKVWeight"),
                        ctx->GetInputDim("QKVWeight"));
    } else {
      OP_INOUT_CHECK(ctx->HasInput("QueryWeight"), "Input", "QueryWeight",
                     "fused_gate_attention_grad");
      OP_INOUT_CHECK(ctx->HasInput("KeyWeight"), "Input", "KeyWeight",
                     "fused_gate_attention_grad");
      OP_INOUT_CHECK(ctx->HasInput("ValueWeight"), "Input", "ValueWeight",
                     "fused_gate_attention_grad");

      for (auto& name : {"QueryWeight", "KeyWeight", "ValueWeight"}) {
        ctx->SetOutputDim(framework::GradVarName(name),
                          ctx->GetInputDim(name));
      }
    }

    OP_INOUT_CHECK(ctx->HasInput("OutLinearWeight"), "Input",
                   "OutLinearWeight", "fused_gate_attention_grad");

    if (ctx->Attrs().Get<bool>("has_gating")) {
      for (auto& name : {"GateWeight", "GateBias", "GateOut"}) {
        ctx->SetOutputDim(framework::GradVarName(name),
                          ctx->GetInputDim(name));
      }
    }

    if (ctx->HasOutput(framework::GradVarName("NonbatchedBias"))) {
      ctx->SetOutputDim(framework::GradVarName("NonbatchedBias"),
                        ctx->GetInputDim("NonbatchedBias"));
    }

    ctx->SetOutputDim(framework::GradVarName("FMHAOut"),
                      ctx->GetInputDim("FMHAOut"));

    ctx->SetOutputDim(framework::GradVarName("OutLinearWeight"),
                      ctx->GetInputDim("OutLinearWeight"));
    ctx->SetOutputDim(framework::GradVarName("OutLinearBias"),
                      ctx->GetInputDim("OutLinearBias"));
  }
};
// Builds the fused_gate_attention_grad op description from the forward op:
// forwards the inputs and saved intermediates the backward kernel reads and
// declares the gradient variables it writes. Which variables are wired
// depends on the merge_qkv / has_gating attributes and on whether
// NonbatchedBias was supplied.
template <typename T>
class FusedGateAttentionGradOpMaker : public framework::SingleGradOpMaker<T> {
 public:
  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;

 protected:
  void Apply(GradOpPtr<T> op) const override {
    op->SetType("fused_gate_attention_grad");
    op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));

    op->SetInput("Query", this->Input("Query"));
    op->SetOutput(framework::GradVarName("Query"), this->InputGrad("Query"));

    // The grad op shares the forward op's attributes.
    op->SetAttrMap(this->Attrs());

    const bool uses_merged_qkv =
        BOOST_GET_CONST(bool, op->GetAttr("merge_qkv"));
    if (!uses_merged_qkv) {
      // Separated projections: Key and the three per-projection weights.
      op->SetInput("Key", this->Input("Key"));
      op->SetOutput(framework::GradVarName("Key"), this->InputGrad("Key"));

      for (const char* weight_name :
           {"QueryWeight", "KeyWeight", "ValueWeight"}) {
        op->SetInput(weight_name, this->Input(weight_name));
        op->SetOutput(framework::GradVarName(weight_name),
                      this->InputGrad(weight_name));
      }

      for (const char* saved_name :
           {"QueryTransposeOut", "KeyTransposeOut", "ValueTransposeOut"}) {
        op->SetInput(saved_name, this->Output(saved_name));
      }
    } else {
      // Merged projection: one stacked weight and one saved transpose.
      op->SetInput("QKVWeight", this->Input("QKVWeight"));
      op->SetOutput(framework::GradVarName("QKVWeight"),
                    this->InputGrad("QKVWeight"));
      op->SetInput("QKVTransposeOut", this->Output("QKVTransposeOut"));
    }

    // FMHAOut@GRAD is an output of the grad op: the backward kernel fills it
    // as an intermediate gradient.
    op->SetInput("FMHAOut", this->Output("FMHAOut"));
    op->SetOutput(framework::GradVarName("FMHAOut"),
                  this->OutputGrad("FMHAOut"));

    if (this->HasInput("NonbatchedBias")) {
      op->SetInput("NonbatchedBias", this->Input("NonbatchedBias"));
      op->SetOutput(framework::GradVarName("NonbatchedBias"),
                    this->InputGrad("NonbatchedBias"));
    }

    op->SetInput("SoftmaxOut", this->Output("SoftmaxOut"));

    const bool uses_gating = BOOST_GET_CONST(bool, op->GetAttr("has_gating"));
    if (uses_gating) {
      op->SetInput("GateWeight", this->Input("GateWeight"));
      op->SetOutput(framework::GradVarName("GateWeight"),
                    this->InputGrad("GateWeight"));

      op->SetInput("GateBias", this->Input("GateBias"));
      op->SetOutput(framework::GradVarName("GateBias"),
                    this->InputGrad("GateBias"));

      // Like FMHAOut@GRAD, GateOut@GRAD is written by the grad kernel.
      op->SetInput("GateOut", this->Output("GateOut"));
      op->SetOutput(framework::GradVarName("GateOut"),
                    this->OutputGrad("GateOut"));
    }

    op->SetInput("OutLinearWeight", this->Input("OutLinearWeight"));
    op->SetOutput(framework::GradVarName("OutLinearWeight"),
                  this->InputGrad("OutLinearWeight"));

    op->SetInput("OutLinearBias", this->Input("OutLinearBias"));
    op->SetOutput(framework::GradVarName("OutLinearBias"),
                  this->InputGrad("OutLinearBias"));
  }
};
}
// namespace operators
}
// namespace paddle
namespace ops = paddle::operators;
// Register the forward operator together with its proto maker and the
// gradient-op makers for both the static-graph (OpDesc) and the imperative
// (OpBase) execution modes.
REGISTER_OPERATOR(
    fused_gate_attention, ops::FusedGateAttentionOp,
    ops::FusedGateAttentionOpMaker,
    ops::FusedGateAttentionGradOpMaker<paddle::framework::OpDesc>,
    ops::FusedGateAttentionGradOpMaker<paddle::imperative::OpBase>);
// Register the backward operator; it needs no proto maker of its own because
// it is described by FusedGateAttentionGradOpMaker.
REGISTER_OPERATOR(fused_gate_attention_grad, ops::FusedGateAttentionGradOp);
paddle/fluid/operators/fused/fused_gate_attention_op.cu
0 → 100644
浏览文件 @
fdcdbec5
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/fused/attn_gemm.h"
#include "paddle/fluid/operators/fused/fused_gate_attention.h"
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace
paddle
{
namespace
operators
{
using
Tensor
=
framework
::
Tensor
;
// Element-wise functor computing sigmoid(x) * y.
// The sigmoid is evaluated in MPType (the type MPTypeTrait maps T to) and
// cast back to T before the multiplication.
template <typename T>
struct SigmoidMultiplyFunctor {
  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
  MPType one = static_cast<MPType>(1.0f);

  // sigmoid(x) = 1 / (1 + exp(-x))
  // out = sigmoid(x) * y
  inline HOSTDEVICE T operator()(T x, T y) const {
    MPType wide_x = static_cast<MPType>(x);
    T gate = static_cast<T>(one / (one + exp(-wide_x)));
    return gate * y;
  }
};
// Element-wise gradient functor of out = sigmoid(x) * y.
// Given dout it returns a two-element array:
//   [0] dx = dout * y * sigmoid(x) * (1 - sigmoid(x))
//   [1] dy = dout * sigmoid(x)
template <typename T>
struct SigmoidMultiplyGradFunctor {
  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
  MPType one = static_cast<MPType>(1.0f);

  inline HOSTDEVICE phi::Array<T, 2> operator()(const T dout, const T x,
                                                T y) const {
    // Recompute sigmoid(x) in MPType, then continue in T.
    MPType wide_x = static_cast<MPType>(x);
    T sig = static_cast<T>(one / (one + exp(-wide_x)));

    // Gradient flowing into the sigmoid output (multiply rule: dout * y).
    T d_sig = dout * y;

    phi::Array<T, 2> grads;
    // Sigmoid rule: d_in = d_out * out * (1 - out).
    grads[0] = d_sig * sig * (static_cast<T>(1.0f) - sig);  // dx
    grads[1] = dout * sig;                                  // dy
    return grads;
  }
};
// Projects Query onto the stacked Q/K/V weight with a single GEMM:
//   qkv_out = GEMM(query, qkv_weight^T)
// Shapes as documented by the original author:
//   query:      [batch_size, seq_len_m, seq_len_r, qkv_dim]
//   qkv_weight: [3, num_heads, key_dim, qkv_dim]
//   qkv_out:    [batch_size, seq_len_m, seq_len_r, 3, num_heads, key_dim]
template <typename T>
void ComputeMergedQKVMatmulForward(const framework::ExecutionContext &ctx,
                                   const GateAttentionConfig<T> &config,
                                   const Tensor *query, Tensor *qkv_out) {
  const auto *stacked_weight = ctx.Input<Tensor>("QKVWeight");

  // Flatten all leading axes of query into the GEMM row count.
  int rows = config.batch_size * config.seq_len_m * config.seq_len_r;
  int cols = 3 * config.num_heads * config.key_dim;
  int inner = config.q_dim;

  // The weight is used transposed (second flag); no bias is added.
  AttnMatMul<T> qkv_gemm(ctx.cuda_device_context(), false, true, rows, cols,
                         inner, false);
  qkv_gemm.ComputeForward(stacked_weight, query, nullptr, qkv_out, nullptr);
}
// Backward of ComputeMergedQKVMatmulForward: computes QKVWeight@GRAD and the
// gradient w.r.t. query from qkv_out_grad.
//
// use_addto is forwarded to AttnMatMul::ComputeBackward; the grad kernel sets
// it when query_grad already holds a contribution (the gating branch) --
// presumably it makes the GEMM accumulate instead of overwrite; confirm in
// attn_gemm.h.
// Returns query_grad.
template <typename T>
Tensor *ComputeMergedQKVMatmulBackward(const framework::ExecutionContext &ctx,
                                       const GateAttentionGradConfig<T> &config,
                                       const Tensor *query,
                                       const Tensor *qkv_out_grad,
                                       Tensor *query_grad, bool use_addto) {
  const auto *stacked_weight = ctx.Input<Tensor>("QKVWeight");
  auto *stacked_weight_grad =
      ctx.Output<Tensor>(framework::GradVarName("QKVWeight"));
  stacked_weight_grad->mutable_data<T>(ctx.GetPlace());

  // Same GEMM geometry as the forward pass.
  int rows = config.batch_size * config.seq_len_m * config.seq_len_r;
  int cols = 3 * config.num_heads * config.key_dim;
  int inner = config.q_dim;

  AttnMatMul<T> qkv_gemm(ctx.cuda_device_context(), false, true, rows, cols,
                         inner, false);
  qkv_gemm.ComputeBackward(query, stacked_weight, qkv_out_grad, query_grad,
                           stacked_weight_grad, nullptr, use_addto);
  return query_grad;
}
// Computes the query, key and value projections with three separate GEMMs.
// Both the key and the value projection read the `key` tensor; only the
// weight differs.
// Shapes as documented by the original author:
//   query:        [batch_size, seq_len_m, seq_len_r, q_dim]
//   query_weight: [q_dim, num_heads, key_dim]
//   query_out:    [batch_size, seq_len_m, seq_len_r, num_heads, key_dim]
//   key:          [batch_size, seq_len_m, m_size, kv_dim]
//   key_weight:   [kv_dim, num_heads, key_dim]
//   key_out:      [batch_size, seq_len_m, m_size, num_heads, key_dim]
template <typename T>
void ComputeSeparatedQKVMatmulForward(const framework::ExecutionContext &ctx,
                                      const GateAttentionConfig<T> &config,
                                      const Tensor *query, const Tensor *key,
                                      Tensor *query_out, Tensor *key_out,
                                      Tensor *value_out) {
  const auto *w_query = ctx.Input<Tensor>("QueryWeight");
  const auto *w_key = ctx.Input<Tensor>("KeyWeight");
  const auto *w_value = ctx.Input<Tensor>("ValueWeight");

  // query_out = GEMM(query, query_weight)
  int q_rows = config.batch_size * config.seq_len_m * config.seq_len_r;
  int q_cols = config.num_heads * config.key_dim;
  int q_inner = config.q_dim;
  AttnMatMul<T> query_gemm(ctx.cuda_device_context(), false, false, q_rows,
                           q_cols, q_inner, false);
  query_gemm.ComputeForward(w_query, query, nullptr, query_out, nullptr);

  // key_out = GEMM(key, key_weight)
  int kv_rows = config.batch_size * config.seq_len_m * config.m_size;
  int kv_cols = config.num_heads * config.key_dim;
  int kv_inner = config.kv_dim;
  AttnMatMul<T> key_value_gemm(ctx.cuda_device_context(), false, false,
                               kv_rows, kv_cols, kv_inner, false);
  key_value_gemm.ComputeForward(w_key, key, nullptr, key_out, nullptr);

  // value_out = GEMM(key, value_weight): same geometry, so the GEMM helper
  // is reused.
  key_value_gemm.ComputeForward(w_value, key, nullptr, value_out, nullptr);
}
// Backward of ComputeSeparatedQKVMatmulForward. Produces the three weight
// gradients plus the gradients w.r.t. key and query.
//
// key_grad receives two contributions (key and value projection both read
// `key`): the first call is issued with use_addto=false and the second with
// use_addto=true, so the call order below must be kept.
// use_addto (parameter) is forwarded for the query gradient only.
// Returns query_grad.
template <typename T>
Tensor *ComputeSeparatedQKVMatmulBackward(
    const framework::ExecutionContext &ctx,
    const GateAttentionGradConfig<T> &config, const Tensor *query,
    const Tensor *key, const Tensor *query_out_grad,
    const Tensor *key_out_grad, const Tensor *value_out_grad,
    Tensor *query_grad, Tensor *key_grad, bool use_addto) {
  // ---- Gradient of GEMM(key, key_weight) ----
  const auto *w_key = ctx.Input<Tensor>("KeyWeight");
  auto *w_key_grad = ctx.Output<Tensor>(framework::GradVarName("KeyWeight"));
  w_key_grad->mutable_data<T>(ctx.GetPlace());

  int kv_rows = config.batch_size * config.seq_len_m * config.m_size;
  int kv_cols = config.num_heads * config.key_dim;
  int kv_inner = config.kv_dim;
  AttnMatMul<T> key_value_gemm(ctx.cuda_device_context(), false, false,
                               kv_rows, kv_cols, kv_inner, false);
  key_value_gemm.ComputeBackward(key, w_key, key_out_grad, key_grad,
                                 w_key_grad, nullptr, false);

  // ---- Gradient of GEMM(key, value_weight) ----
  auto *w_value = ctx.Input<Tensor>("ValueWeight");
  auto *w_value_grad =
      ctx.Output<Tensor>(framework::GradVarName("ValueWeight"));
  w_value_grad->mutable_data<T>(ctx.GetPlace());
  // use_addto=true: second contribution to key_grad.
  key_value_gemm.ComputeBackward(key, w_value, value_out_grad, key_grad,
                                 w_value_grad, nullptr, true);

  // ---- Gradient of GEMM(query, query_weight) ----
  const auto *w_query = ctx.Input<Tensor>("QueryWeight");
  auto *w_query_grad =
      ctx.Output<Tensor>(framework::GradVarName("QueryWeight"));
  w_query_grad->mutable_data<T>(ctx.GetPlace());

  int q_rows = config.batch_size * config.seq_len_m * config.seq_len_r;
  int q_cols = config.num_heads * config.key_dim;
  int q_inner = config.q_dim;
  AttnMatMul<T> query_gemm(ctx.cuda_device_context(), false, false, q_rows,
                           q_cols, q_inner, false);
  query_gemm.ComputeBackward(query, w_query, query_out_grad, query_grad,
                             w_query_grad, nullptr, use_addto);
  return query_grad;
}
// Gating branch of the forward pass:
//   gate_out = sigmoid(GEMM(query, gate_weight) + gate_bias) * fmha_out
// The result is stored in the "GateOut" output, which is also returned.
template <typename T>
Tensor *ComputeGatingLinearForward(const framework::ExecutionContext &ctx,
                                   const GateAttentionConfig<T> &config,
                                   const Tensor *query,
                                   const Tensor *fmha_out) {
  auto *w_gate = ctx.Input<Tensor>("GateWeight");
  auto *b_gate = ctx.Input<Tensor>("GateBias");

  auto *gate_out = ctx.Output<Tensor>("GateOut");
  gate_out->mutable_data<T>(ctx.GetPlace());
  VLOG(4) << "[ComputeGatingLinearForward] gate_out: "
          << MemoryDebugString(*gate_out);

  // gate_out = GEMM(query, gate_weight) + gate_bias
  // gate_out is passed twice: once as the GEMM output and once as the
  // destination of the bias add, so the linear result is built in place.
  int rows = config.batch_size * config.seq_len_m * config.seq_len_r;
  int cols = config.num_heads * config.key_dim;
  int inner = config.q_dim;
  AttnMatMul<T> gate_gemm(ctx.cuda_device_context(), false, false, rows, cols,
                          inner, true);
  gate_gemm.ComputeForward(w_gate, query, b_gate, gate_out, gate_out);

  // gate_out = sigmoid(gate_out) * fmha_out, again in place.
  std::vector<const Tensor *> operands = {gate_out, fmha_out};
  std::vector<Tensor *> results = {gate_out};
  phi::funcs::ElementwiseKernel<T>(ctx.cuda_device_context(), operands,
                                   &results, SigmoidMultiplyFunctor<T>());
  return gate_out;
}
// Backward of the gating branch
//   gate_out = sigmoid(GEMM(query, gate_weight) + gate_bias) * fmha_out
// Writes GateWeight@GRAD, GateBias@GRAD, the gating contribution to
// query_grad, and fmha_out_grad. Returns fmha_out_grad.
template <typename T>
Tensor *ComputeGatingLinearBackward(const framework::ExecutionContext &ctx,
                                    const GateAttentionGradConfig<T> &config,
                                    const Tensor *fmha_out,
                                    const Tensor *gate_out_grad,
                                    Tensor *query_grad, Tensor *fmha_out_grad) {
  const auto *query = ctx.Input<Tensor>("Query");
  const auto *w_gate = ctx.Input<Tensor>("GateWeight");
  const auto *b_gate = ctx.Input<Tensor>("GateBias");

  // The pre-sigmoid linear result is not read from the forward outputs here;
  // it is recomputed into a temporary tensor.
  Tensor linear_out;
  linear_out.Resize(config.gate_out_dims);
  linear_out.mutable_data<T>(ctx.GetPlace());

  int rows = config.batch_size * config.seq_len_m * config.seq_len_r;
  int cols = config.num_heads * config.key_dim;
  int inner = config.q_dim;
  AttnMatMul<T> gate_gemm(ctx.cuda_device_context(), false, false, rows, cols,
                          inner, true);
  gate_gemm.ComputeForward(w_gate, query, b_gate, &linear_out, &linear_out);

  // Gradient of sigmoid(linear_out) * fmha_out.
  // Computed in place: the gradient w.r.t. linear_out overwrites linear_out,
  // the gradient w.r.t. fmha_out goes to fmha_out_grad.
  std::vector<const Tensor *> operands = {gate_out_grad, &linear_out,
                                          fmha_out};
  std::vector<Tensor *> results = {&linear_out, fmha_out_grad};
  phi::funcs::ElementwiseKernel<T, SigmoidMultiplyGradFunctor<T>, 2>(
      ctx.cuda_device_context(), operands, &results,
      SigmoidMultiplyGradFunctor<T>());

  // Gradient of GEMM(query, gate_weight) + gate_bias; linear_out now holds
  // the upstream gradient of the linear layer.
  auto *w_gate_grad =
      ctx.Output<Tensor>(framework::GradVarName("GateWeight"));
  auto *b_gate_grad = ctx.Output<Tensor>(framework::GradVarName("GateBias"));
  w_gate_grad->mutable_data<T>(ctx.GetPlace());
  b_gate_grad->mutable_data<T>(ctx.GetPlace());

  gate_gemm.ComputeBackward(query, w_gate, &linear_out, query_grad,
                            w_gate_grad, b_gate_grad);
  return fmha_out_grad;
}
// Final projection of the forward pass:
//   out = GEMM(fmha_or_gate_out, out_linear_weight) + out_linear_bias
// The result is stored in the "Out" output, which is also returned.
template <typename T>
Tensor *ComputeOutputLinearForward(const framework::ExecutionContext &ctx,
                                   const GateAttentionConfig<T> &config,
                                   const Tensor *fmha_or_gate_out) {
  const auto *w_out = ctx.Input<Tensor>("OutLinearWeight");
  const auto *b_out = ctx.Input<Tensor>("OutLinearBias");

  auto *out = ctx.Output<Tensor>("Out");
  out->mutable_data<T>(ctx.GetPlace());
  VLOG(4) << "[ComputeOutputLinearForward] out: " << MemoryDebugString(*out);

  // The projection maps num_heads * key_dim features back to q_dim.
  int rows = config.batch_size * config.seq_len_m * config.seq_len_r;
  int cols = config.q_dim;
  int inner = config.num_heads * config.key_dim;
  AttnMatMul<T> out_gemm(ctx.cuda_device_context(), false, false, rows, cols,
                         inner, true);
  // `out` is both the GEMM output and the destination of the bias add.
  out_gemm.ComputeForward(w_out, fmha_or_gate_out, b_out, out, out);
  return out;
}
// Backward of the final projection. Reads Out@GRAD and writes
// OutLinearWeight@GRAD, OutLinearBias@GRAD and the gradient of the tensor
// that fed the projection: GateOut when has_gating is true, FMHAOut
// otherwise. Returns that input gradient.
template <typename T>
Tensor *ComputeOutputLinearBackward(const framework::ExecutionContext &ctx,
                                    const GateAttentionGradConfig<T> &config,
                                    bool has_gating) {
  // Name of the forward tensor that was multiplied by OutLinearWeight.
  std::string source_name = has_gating ? "GateOut" : "FMHAOut";

  const auto *out_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
  const auto *w_out = ctx.Input<Tensor>("OutLinearWeight");
  const auto *source = ctx.Input<Tensor>(source_name);

  auto *w_out_grad =
      ctx.Output<Tensor>(framework::GradVarName("OutLinearWeight"));
  auto *b_out_grad =
      ctx.Output<Tensor>(framework::GradVarName("OutLinearBias"));
  auto *source_grad =
      ctx.Output<Tensor>(framework::GradVarName(source_name));

  w_out_grad->mutable_data<T>(ctx.GetPlace());
  b_out_grad->mutable_data<T>(ctx.GetPlace());
  source_grad->mutable_data<T>(ctx.GetPlace());

  // Same GEMM geometry as ComputeOutputLinearForward.
  int rows = config.batch_size * config.seq_len_m * config.seq_len_r;
  int cols = config.q_dim;
  int inner = config.num_heads * config.key_dim;
  AttnMatMul<T> out_gemm(ctx.cuda_device_context(), false, false, rows, cols,
                         inner, true);
  out_gemm.ComputeBackward(source, w_out, out_grad, source_grad, w_out_grad,
                           b_out_grad);
  return source_grad;
}
// CUDA forward kernel of fused_gate_attention. Pipeline:
//   1. Q/K/V projection (one merged GEMM or three separate GEMMs),
//   2. multi-head attention via FMHAGateRef,
//   3. optional sigmoid gating,
//   4. output linear projection.
template <typename T>
class FusedGateAttentionOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
    // Inputs. Dispensable ones may be null depending on the attributes.
    const auto *query = ctx.Input<Tensor>("Query");
    const auto *key = ctx.Input<Tensor>("Key");
    const auto *query_weight = ctx.Input<Tensor>("QueryWeight");
    const auto *qkv_weight = ctx.Input<Tensor>("QKVWeight");

    const auto *src_mask = ctx.Input<Tensor>("SrcMask");
    const auto *nonbatched_bias = ctx.Input<Tensor>("NonbatchedBias");

    // Intermediate outputs kept for the backward pass.
    auto *q_transpose_out = ctx.Output<Tensor>("QueryTransposeOut");
    auto *k_transpose_out = ctx.Output<Tensor>("KeyTransposeOut");
    auto *v_transpose_out = ctx.Output<Tensor>("ValueTransposeOut");
    auto *qkv_transpose_out = ctx.Output<Tensor>("QKVTransposeOut");

    auto *softmax_out = ctx.Output<Tensor>("SoftmaxOut");
    auto *fmha_out = ctx.Output<Tensor>("FMHAOut");

    const bool merge_qkv = ctx.Attr<bool>("merge_qkv");
    const bool has_gating = ctx.Attr<bool>("has_gating");

    // When seq_len_r = m_size, q_dim = kv_dim, QKV matmul can be merged.
    auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
    GateAttentionConfig<T> config(query, key, query_weight, qkv_weight,
                                  merge_qkv);

    if (!merge_qkv) {
      // 1. Separated QKV Matmul; the projection buffers are owned by config.
      Tensor *query_out = config.GetQueryOut(dev_ctx);
      Tensor *key_out = config.GetKeyOut(dev_ctx);
      Tensor *value_out = config.GetValueOut(dev_ctx);
      ComputeSeparatedQKVMatmulForward<T>(ctx, config, query, key, query_out,
                                          key_out, value_out);

      q_transpose_out->mutable_data<T>(ctx.GetPlace());
      k_transpose_out->mutable_data<T>(ctx.GetPlace());
      v_transpose_out->mutable_data<T>(ctx.GetPlace());
      VLOG(4) << "q_transpose_out: " << MemoryDebugString(*q_transpose_out);
      VLOG(4) << "k_transpose_out: " << MemoryDebugString(*k_transpose_out);
      VLOG(4) << "v_transpose_out: " << MemoryDebugString(*v_transpose_out);
    } else {
      // 1. Merged QKV Matmul: a single GEMM of Query with the stacked weight.
      Tensor *qkv_out = config.GetQKVOut(dev_ctx);
      ComputeMergedQKVMatmulForward<T>(ctx, config, query, qkv_out);

      qkv_transpose_out->mutable_data<T>(ctx.GetPlace());
      VLOG(4) << "qkv_transpose_out:" << MemoryDebugString(*qkv_transpose_out);
    }

    softmax_out->mutable_data<T>(ctx.GetPlace());
    fmha_out->mutable_data<T>(ctx.GetPlace());
    VLOG(4) << "softmax_out: " << MemoryDebugString(*softmax_out);
    VLOG(4) << "fmha_out: " << MemoryDebugString(*fmha_out);

    // 2. FMHA
    FMHAGateRef<T> attention(dev_ctx, merge_qkv);
    attention.ComputeForward(nonbatched_bias, src_mask, q_transpose_out,
                             k_transpose_out, v_transpose_out,
                             qkv_transpose_out, softmax_out, fmha_out,
                             &config);

    // 3. Gating Linear (skipped when has_gating is false)
    Tensor *fmha_or_gate_out = fmha_out;
    if (has_gating) {
      fmha_or_gate_out =
          ComputeGatingLinearForward<T>(ctx, config, query, fmha_out);
    }

    // 4. Output Linear
    ComputeOutputLinearForward<T>(ctx, config, fmha_or_gate_out);
  }
};
// CUDA backward kernel of fused_gate_attention. Walks the forward pipeline
// in reverse: output linear -> optional gating -> attention -> Q/K/V
// projection.
template <typename T>
class FusedGateAttentionGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
    const auto has_gating = ctx.Attr<bool>("has_gating");
    const auto merge_qkv = ctx.Attr<bool>("merge_qkv");

    // forward input
    const auto *query = ctx.Input<Tensor>("Query");
    const auto *key = ctx.Input<Tensor>("Key");
    const auto *query_weight = ctx.Input<Tensor>("QueryWeight");
    const auto *qkv_weight = ctx.Input<Tensor>("QKVWeight");

    // forward output, backward input
    const auto *q_transpose_out = ctx.Input<Tensor>("QueryTransposeOut");
    const auto *k_transpose_out = ctx.Input<Tensor>("KeyTransposeOut");
    const auto *v_transpose_out = ctx.Input<Tensor>("ValueTransposeOut");
    const auto *qkv_transpose_out = ctx.Input<Tensor>("QKVTransposeOut");
    const auto *softmax_out = ctx.Input<Tensor>("SoftmaxOut");
    const auto *fmha_out = ctx.Input<Tensor>("FMHAOut");

    // backward output
    auto *query_grad = ctx.Output<Tensor>(framework::GradVarName("Query"));
    query_grad->mutable_data<T>(ctx.GetPlace());
    auto *nonbatched_bias_grad =
        ctx.Output<Tensor>(framework::GradVarName("NonbatchedBias"));
    auto *fmha_out_grad = ctx.Output<Tensor>(framework::GradVarName("FMHAOut"));

    auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
    GateAttentionGradConfig<T> config(query, key, query_weight, qkv_weight,
                                      merge_qkv);

    // 1. Gradient of Output Linear
    Tensor *fmha_or_gate_out_grad =
        ComputeOutputLinearBackward<T>(ctx, config, has_gating);

    // 2. Gradient of Gating Linear
    if (has_gating) {
      // fmha_or_gate_out_grad is actually gate_out_grad here, so
      // fmha_out_grad still has to be allocated and filled.
      fmha_out_grad->mutable_data<T>(ctx.GetPlace());
      ComputeGatingLinearBackward<T>(ctx, config, fmha_out,
                                     fmha_or_gate_out_grad, query_grad,
                                     fmha_out_grad);
    }

    // 3. Gradient of FMHA
    if (nonbatched_bias_grad) {
      nonbatched_bias_grad->mutable_data<T>(ctx.GetPlace());
    }

    FMHAGateRef<T> attention(dev_ctx, merge_qkv);
    // The nullptr argument is the source-mask gradient, which is never
    // produced.
    attention.ComputeBackward(q_transpose_out, k_transpose_out,
                              v_transpose_out, qkv_transpose_out, softmax_out,
                              fmha_out_grad, nullptr, nonbatched_bias_grad,
                              &config);

    // With gating, query_grad already holds the gating contribution, so the
    // projection gradient is requested with use_addto set.
    bool use_addto = has_gating;
    if (!merge_qkv) {
      // 4. Gradient of Separated QKV Matmul
      auto *key_grad = ctx.Output<Tensor>(framework::GradVarName("Key"));
      if (key_grad) {
        key_grad->mutable_data<T>(ctx.GetPlace());
      }
      Tensor *query_out_grad = config.GetQueryOutGrad(dev_ctx);
      Tensor *key_out_grad = config.GetKeyOutGrad(dev_ctx);
      Tensor *value_out_grad = config.GetValueOutGrad(dev_ctx);
      ComputeSeparatedQKVMatmulBackward<T>(
          ctx, config, query, key, query_out_grad, key_out_grad,
          value_out_grad, query_grad, key_grad, use_addto);
    } else {
      // 4. Gradient of Merged QKV Matmul
      Tensor *qkv_out_grad = config.GetQKVOutGrad(dev_ctx);
      ComputeMergedQKVMatmulBackward<T>(ctx, config, query, qkv_out_grad,
                                        query_grad, use_addto);
    }
  }
};
}
// namespace operators
}
// namespace paddle
// Short aliases used by the kernel registrations below.
namespace ops = paddle::operators;
namespace plat = paddle::platform;

// Register the forward and backward GPU kernels of fused_gate_attention.
// The non-HIP build registers float, double, float16 and bfloat16; the HIP
// build registers the same set without double.
#ifndef PADDLE_WITH_HIP
REGISTER_OP_CUDA_KERNEL(fused_gate_attention,
                        ops::FusedGateAttentionOpKernel<float>,
                        ops::FusedGateAttentionOpKernel<double>,
                        ops::FusedGateAttentionOpKernel<plat::float16>,
                        ops::FusedGateAttentionOpKernel<plat::bfloat16>);
REGISTER_OP_CUDA_KERNEL(fused_gate_attention_grad,
                        ops::FusedGateAttentionGradKernel<float>,
                        ops::FusedGateAttentionGradKernel<double>,
                        ops::FusedGateAttentionGradKernel<plat::float16>,
                        ops::FusedGateAttentionGradKernel<plat::bfloat16>);
#else
REGISTER_OP_CUDA_KERNEL(fused_gate_attention,
                        ops::FusedGateAttentionOpKernel<float>,
                        ops::FusedGateAttentionOpKernel<plat::float16>,
                        ops::FusedGateAttentionOpKernel<plat::bfloat16>);
REGISTER_OP_CUDA_KERNEL(fused_gate_attention_grad,
                        ops::FusedGateAttentionGradKernel<float>,
                        ops::FusedGateAttentionGradKernel<plat::float16>,
                        ops::FusedGateAttentionGradKernel<plat::bfloat16>);
#endif  // PADDLE_WITH_HIP
paddle/fluid/platform/device/gpu/gpu_info.cc
浏览文件 @
fdcdbec5
...
...
@@ -225,9 +225,9 @@ class RecordedGpuMallocHelper {
if
(
UNLIKELY
(
malloc_managed_memory
))
{
result
=
cudaMallocManaged
(
ptr
,
size
);
}
else
{
VLOG
(
10
)
<<
"[cudaMalloc] size="
<<
static_cast
<
double
>
(
size
)
/
(
1
<<
20
)
<<
" MB"
;
result
=
cudaMalloc
(
ptr
,
size
);
VLOG
(
10
)
<<
"[cudaMalloc] size="
<<
static_cast
<
double
>
(
size
)
/
(
1
<<
20
)
<<
" MB, result="
<<
result
;
}
#endif
if
(
result
==
gpuSuccess
)
{
...
...
paddle/fluid/pybind/op_function_generator.h
浏览文件 @
fdcdbec5
...
...
@@ -32,6 +32,10 @@ std::map<std::string, std::set<std::string>> op_ins_map = {
{
"fused_attention"
,
{
"X"
,
"LnScale"
,
"LnBias"
,
"QKVW"
,
"QKVBias"
,
"CacheKV"
,
"SrcMask"
,
"OutLinearW"
,
"OutLinearBias"
,
"Ln2Scale"
,
"Ln2Bias"
}},
{
"fused_gate_attention"
,
{
"Query"
,
"Key"
,
"QueryWeight"
,
"KeyWeight"
,
"ValueWeight"
,
"QKVWeight"
,
"NonbatchedBias"
,
"SrcMask"
,
"GateWeight"
,
"GateBias"
,
"OutLinearWeight"
,
"OutLinearBias"
}},
{
"fused_multi_transformer"
,
{
"X"
,
"LnScale"
,
"LnBias"
,
"QKVW"
,
"QKVBias"
,
"CacheKV"
,
"TimeStep"
,
"SrcMask"
,
"OutLinearW"
,
"OutLinearBias"
,
"FFNLnScale"
,
"FFNLnBias"
,
...
...
@@ -148,6 +152,9 @@ std::map<std::string, std::set<std::string>> op_outs_map = {
"DropoutMaskOut"
,
"Ln2Mean"
,
"Ln2Variance"
,
"BiasDropoutResidualOut"
,
"CacheKVOut"
,
"Y"
}},
{
"fused_gate_attention"
,
{
"QueryTransposeOut"
,
"KeyTransposeOut"
,
"ValueTransposeOut"
,
"QKVTransposeOut"
,
"SoftmaxOut"
,
"FMHAOut"
,
"GateOut"
,
"Out"
}},
{
"sync_batch_norm"
,
{
"Y"
,
"MeanOut"
,
"VarianceOut"
,
"SavedMean"
,
"SavedVariance"
,
"ReserveSpace"
}},
...
...
paddle/phi/kernels/gpudnn/softmax_gpudnn.h
浏览文件 @
fdcdbec5
...
...
@@ -888,19 +888,6 @@ void SoftmaxBackwardCudnnKernel(const GPUContext& dev_ctx,
#endif
}
// Reports whether the cuDNN softmax kernels can be used for element type T.
//
// Returns false when the context has no cuDNN handle, or when T is bfloat16
// and the code is built with CUDNN_VERSION < 8100; returns true otherwise.
template <typename T>
static bool CanUseCudnnSoftmax(const GPUContext& dev_ctx) {
  if (dev_ctx.cudnn_handle() == nullptr) {
    return false;
  }
#if CUDNN_VERSION < 8100
  if (std::is_same<T, phi::dtype::bfloat16>::value) {
    return false;
  }
#endif
  return true;
}
#if CUDNN_VERSION < 8100
template
<
>
inline
void
SoftmaxForwardCudnnKernel
<
phi
::
dtype
::
bfloat16
>
(
...
...
@@ -927,6 +914,25 @@ inline void SoftmaxBackwardCudnnKernel<phi::dtype::bfloat16>(
}
#endif
// Decides whether softmax should be dispatched to the cuDNN implementation.
//
// @param ctx          GPU context; cuDNN is usable only if it has a handle.
// @param softmax_dim  Length of the dimension softmax is computed over.
// @param last_dim     Whether softmax runs over the innermost dimension.
// @return true only when cuDNN is available for T, `last_dim` is set, and the
//         row does not qualify for the non-cuDNN path (softmax_dim <= 512
//         together with sizeof(T) <= 4).
//
// Fix: the bfloat16 / cuDNN < 8.1 downgrade used to be nested under
// `if (!ctx.cudnn_handle())`, where `cudnn_available` was already false, so
// it never took effect and bfloat16 was reported as cuDNN-capable on old
// cuDNN builds. The check is now applied independently of the handle.
template <typename T>
bool UseCudnnSoftmax(const GPUContext& ctx, int softmax_dim, bool last_dim) {
  bool cudnn_available = ctx.cudnn_handle() != nullptr;
#if CUDNN_VERSION < 8100
  // Builds against cuDNN older than 8.1 must not route bfloat16 to cuDNN.
  if (std::is_same<T, phi::dtype::bfloat16>::value) {
    cudnn_available = false;
  }
#endif

  constexpr int max_dim = 512;
  const bool short_row = softmax_dim <= max_dim && sizeof(T) <= 4;
  return cudnn_available && last_dim && !short_row;
}
template
<
typename
T
,
bool
LogMode
=
false
>
void
SoftmaxForwardCUDAKernelDriver
(
const
GPUContext
&
dev_ctx
,
const
DenseTensor
&
x
,
...
...
@@ -941,10 +947,7 @@ void SoftmaxForwardCUDAKernelDriver(const GPUContext& dev_ctx,
int
dim
=
tensor_dims
[
1
];
int
D
=
tensor_dims
[
2
];
constexpr
int
max_dim
=
512
;
if
(
D
==
1
&&
(
!
CanUseCudnnSoftmax
<
T
>
(
dev_ctx
)
||
(
dim
<=
max_dim
&&
sizeof
(
T
)
<=
4
)))
{
if
(
D
==
1
&&
!
UseCudnnSoftmax
<
T
>
(
dev_ctx
,
dim
,
true
))
{
int
dim_log2
=
static_cast
<
int
>
(
Log2Ceil
(
dim
));
int
dim_ceil
=
1
<<
dim_log2
;
int
warp_size
=
(
dim_ceil
<
32
)
?
dim_ceil
:
32
;
...
...
@@ -1016,10 +1019,7 @@ void SoftmaxBackwardCUDAKernelDriver(const GPUContext& dev_ctx,
int
dim
=
tensor_dims
[
1
];
int
D
=
tensor_dims
[
2
];
constexpr
int
max_dim
=
512
;
if
(
D
==
1
&&
(
!
CanUseCudnnSoftmax
<
T
>
(
dev_ctx
)
||
(
dim
<=
max_dim
&&
sizeof
(
T
)
<=
4
)))
{
if
(
D
==
1
&&
!
UseCudnnSoftmax
<
T
>
(
dev_ctx
,
dim
,
true
))
{
int
dim_log2
=
Log2Ceil
(
dim
);
int
dim_ceil
=
1
<<
dim_log2
;
int
warp_size
=
(
dim_ceil
<
32
)
?
dim_ceil
:
32
;
...
...
python/paddle/fluid/tests/unittests/CMakeLists.txt
浏览文件 @
fdcdbec5
...
...
@@ -327,6 +327,7 @@ if ((NOT WITH_NCCL) AND (NOT WITH_RCCL))
endif
()
if
(((
NOT WITH_ROCM
)
AND
(
NOT WITH_GPU
))
OR WIN32
)
LIST
(
REMOVE_ITEM TEST_OPS test_fused_gate_attention_op
)
LIST
(
REMOVE_ITEM TEST_OPS test_boxps
)
endif
()
list
(
REMOVE_ITEM TEST_OPS test_seq_concat_op
)
# FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290
...
...
python/paddle/fluid/tests/unittests/test_fused_gate_attention_op.py
0 → 100644
浏览文件 @
fdcdbec5
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
numpy
as
np
import
paddle
import
paddle.nn
as
nn
from
paddle
import
tensor
import
unittest
from
op_test
import
OpTest
,
convert_float_to_uint16
from
test_sparse_attention_op
import
get_cuda_version
from
paddle
import
_C_ops
from
paddle.fluid.framework
import
default_main_program
from
paddle.fluid
import
core
@unittest.skipIf(not core.is_compiled_with_cuda(),
                 "Paddle is not compiled with CUDA")
class TestFusedGateAttentionOp(OpTest):
    """Checks the fused_gate_attention op against an einsum-based reference.

    Both implementations run in dynamic-graph mode on CUDAPlace(0) with the
    same randomly generated inputs; the output and the gradients w.r.t. the
    query (and the key, when it is a separate input) are compared.
    Subclasses override ``config`` to exercise other settings.
    """

    def setUp(self):
        test_cls = self.__class__
        test_cls.op_type = "fused_gate_attention"
        # Gradients are checked through autograd inside this test.
        test_cls.no_need_check_grad = True
        self.config()
        # The merged-QKV path is selected exactly when q_dim equals kv_dim.
        self.merge_qkv = self.q_dim == self.kv_dim
        self.generate_input_data()

    def config(self):
        """Default case: float32, gating on, nonbatched bias on, q_dim == kv_dim."""
        self.dtype = "float32"
        self.has_gating = True
        self.bias_attr = True

        self.batch_size = 1
        self.msa_len = 3
        self.res_len = 5
        self.num_heads = 2
        self.key_dim = 4

        self.q_dim = 6
        self.kv_dim = self.q_dim
        self.out_dim = self.q_dim
        self.m_size = self.res_len

    def generate_input_data(self):
        """Creates all numpy inputs from a fixed seed.

        The order of the ``_random`` calls determines the generated values,
        so it must not be changed.
        """

        def _random(shape):
            samples = np.random.random(shape)
            if self.dtype != "bfloat16":
                return samples.astype(self.dtype)
            # bfloat16 data is carried as uint16 produced from float32 values.
            return convert_float_to_uint16(samples.astype("float32"))

        np.random.seed(123)

        io_shape = (self.batch_size, self.msa_len, self.res_len, self.q_dim)
        head_shape = (self.num_heads, self.key_dim)

        self.query = _random(io_shape)
        self.q_weight = _random((self.q_dim, ) + head_shape)
        self.k_weight = _random((self.kv_dim, ) + head_shape)
        self.v_weight = _random((self.kv_dim, ) + head_shape)

        if self.merge_qkv:
            self.key = None
            # Stacked weight of shape (3, num_heads, key_dim, q_dim).
            self.qkv_weight = np.stack([
                weight.transpose(1, 2, 0)
                for weight in (self.q_weight, self.k_weight, self.v_weight)
            ])
        else:
            self.key = _random(
                (self.batch_size, self.msa_len, self.m_size, self.kv_dim))
            self.qkv_weight = None

        self.attn_mask = _random(
            (self.batch_size, self.msa_len, 1, 1, self.m_size))

        if self.bias_attr:
            self.nonbatched_bias = _random((self.batch_size, 1, self.num_heads,
                                            self.res_len, self.m_size))

        if self.has_gating:
            self.gating_w = _random((self.q_dim, ) + head_shape)
            self.gating_b = _random(head_shape)

        self.output_w = _random(head_shape + (self.out_dim, ))
        self.output_b = _random((self.out_dim, ))

        # Upstream gradient fed into backward(); same shape as the query.
        self.dout = _random(io_shape)

    def get_reference_out(self):
        """Runs the unfused einsum reference.

        Returns ``(out, query.grad, key.grad)``; the last item is ``None``
        when ``merge_qkv`` is set, because the query then doubles as the key.
        """
        paddle.disable_static(place=paddle.CUDAPlace(0))

        def trainable(array):
            return paddle.to_tensor(array, stop_gradient=False)

        query = trainable(self.query)
        key = query if self.merge_qkv else trainable(self.key)
        q_weight = trainable(self.q_weight)
        k_weight = trainable(self.k_weight)
        v_weight = trainable(self.v_weight)
        src_mask = paddle.to_tensor(self.attn_mask, stop_gradient=True)

        scale = self.key_dim**(-0.5)
        # q: (batch_size, msa_len, res_len, num_heads, key_dim), pre-scaled.
        q = paddle.einsum('nbqa,ahc->nbqhc', query, q_weight) * scale
        # k, v: (batch_size, msa_len, m_size, num_heads, key_dim).
        k = paddle.einsum('nbka,ahc->nbkhc', key, k_weight)
        v = paddle.einsum('nbka,ahc->nbkhc', key, v_weight)

        # logits: (batch_size, msa_len, num_heads, res_len, m_size).
        logits = paddle.einsum('nbqhc,nbkhc->nbhqk', q, k)
        logits = logits + src_mask
        if self.bias_attr:
            nonbatched_bias = trainable(self.nonbatched_bias)
            logits = logits + nonbatched_bias

        attn_weights = nn.functional.softmax(logits)
        weighted_avg = paddle.einsum('nbhqk,nbkhc->nbqhc', attn_weights, v)

        if self.has_gating:
            gating_w = trainable(self.gating_w)
            gating_b = trainable(self.gating_b)
            gate_logits = paddle.einsum('nbqc,chv->nbqhv', query,
                                        gating_w) + gating_b
            gate_values = nn.functional.sigmoid(gate_logits)
            weighted_avg = weighted_avg * gate_values

        output_b = trainable(self.output_b)
        output_w = trainable(self.output_w)
        out = paddle.einsum('nbqhc,hco->nbqo', weighted_avg,
                            output_w) + output_b

        paddle.autograd.backward(
            [out], [paddle.to_tensor(self.dout)], retain_graph=True)

        key_grad = None if self.merge_qkv else key.grad
        return out, query.grad, key_grad

    def get_fused_gate_attention_out(self):
        """Runs the fused op through ``_C_ops.fused_gate_attention``.

        Returns ``(out, query.grad, key.grad)``; the last item is ``None``
        when no separate key tensor is passed (merged-QKV case).
        """
        paddle.disable_static(place=paddle.CUDAPlace(0))

        def trainable(array):
            return paddle.to_tensor(array, stop_gradient=False)

        query = trainable(self.query)
        # Only one weight layout is passed: the stacked QKV weight or the
        # three separate weights together with the key.
        key = q_weight = k_weight = v_weight = qkv_weight = None
        if self.merge_qkv:
            qkv_weight = trainable(self.qkv_weight)
        else:
            key = trainable(self.key)
            q_weight = trainable(self.q_weight)
            k_weight = trainable(self.k_weight)
            v_weight = trainable(self.v_weight)

        src_mask = paddle.to_tensor(self.attn_mask, stop_gradient=True)

        nonbatched_bias = None
        if self.bias_attr:
            nonbatched_bias = trainable(self.nonbatched_bias)

        gating_w = gating_b = None
        if self.has_gating:
            gating_w = trainable(self.gating_w)
            gating_b = trainable(self.gating_b)

        output_w = trainable(self.output_w)
        output_b = trainable(self.output_b)

        # The op returns eight tensors; only the last one is needed here.
        _, _, _, _, _, _, _, out = _C_ops.fused_gate_attention(
            query, key, q_weight, k_weight, v_weight, qkv_weight,
            nonbatched_bias, src_mask, gating_w, gating_b, output_w, output_b,
            'has_gating', self.has_gating, 'merge_qkv', self.merge_qkv)

        paddle.autograd.backward(
            [out], [paddle.to_tensor(self.dout)], retain_graph=True)

        key_grad = key.grad if key is not None else None
        return out, query.grad, key_grad

    def check_output_and_grad(self, atol, rtol):
        """Asserts that fused and reference results agree within tolerance."""
        out_ref, query_grad_ref, key_grad_ref = self.get_reference_out()
        out, query_grad, key_grad = self.get_fused_gate_attention_out()

        tolerance = dict(atol=atol, rtol=rtol)
        np.testing.assert_allclose(out_ref, out.numpy(), **tolerance)
        np.testing.assert_allclose(query_grad_ref, query_grad.numpy(),
                                   **tolerance)
        # The key gradient is compared only when both paths produced one.
        if key_grad_ref is None or key_grad is None:
            return
        np.testing.assert_allclose(key_grad_ref, key_grad.numpy(), **tolerance)

    def test_output_and_grad(self):
        self.check_output_and_grad(atol=1e-5, rtol=1e-5)
class TestSeparatedQKVCase(TestFusedGateAttentionOp):
    """Separate Q/K/V weights (kv_dim != q_dim), no gating, no nonbatched bias."""

    def config(self):
        # Start from the base configuration and change only what differs;
        # every other attribute keeps the value the base class assigns.
        super().config()
        self.has_gating = False
        self.bias_attr = False
        self.m_size = 4
        self.kv_dim = 2
class TestMergeQKVNoBiasGatingCase(TestFusedGateAttentionOp):
    """Base configuration with both the nonbatched bias and gating turned off."""

    def config(self):
        super().config()
        self.bias_attr = False
        self.has_gating = False
class TestMergeQKVFp16Case(TestFusedGateAttentionOp):
    """Base configuration run in float16, compared with a looser atol."""

    def config(self):
        super().config()
        self.dtype = "float16"

    def test_output_and_grad(self):
        tolerance = {"atol": 1e-1, "rtol": 1e-5}
        self.check_output_and_grad(**tolerance)
# The skip message previously said "11.3" and joined the two conditions with
# "and", while the guard skips when either holds and compares against 11000.
# NOTE(review): 11000 is presumably how get_cuda_version() encodes CUDA 11.0
# -- confirm against its definition in test_sparse_attention_op.
@unittest.skipIf(
    not core.is_compiled_with_cuda() or get_cuda_version() < 11000,
    "core is not compiled with CUDA or the CUDA version is lower than 11.0")
class TestMergeQKVBF16Case(TestFusedGateAttentionOp):
    """Base configuration run in bfloat16, compared with looser tolerances."""

    def config(self):
        super().config()
        self.dtype = "bfloat16"

    def test_output_and_grad(self):
        self.check_output_and_grad(atol=1e-1, rtol=1e-3)
# Run every test case in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录