Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
071708fa
P
Paddle
项目概览
PaddlePaddle
/
Paddle
1 年多 前同步成功
通知
2302
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
071708fa
编写于
11月 17, 2022
作者:
T
taixiurong
提交者:
GitHub
11月 17, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
xpu-paddlepaddle-41 [任务] ffn and attention test=kunlun (#46658)
上级
b4460eee
变更
8
隐藏空白更改
内联
并排
Showing
8 changed files
with
2723 additions
and
1 deletion
+2723
-1
paddle/fluid/operators/fused/CMakeLists.txt
paddle/fluid/operators/fused/CMakeLists.txt
+2
-0
paddle/fluid/operators/fused/fused_attention_op_xpu.cc
paddle/fluid/operators/fused/fused_attention_op_xpu.cc
+939
-0
paddle/fluid/operators/fused/fused_feedforward_op_xpu.cc
paddle/fluid/operators/fused/fused_feedforward_op_xpu.cc
+828
-0
paddle/fluid/operators/fused/xpu_fused_common_function.h
paddle/fluid/operators/fused/xpu_fused_common_function.h
+224
-0
paddle/fluid/platform/device/xpu/xpu2_op_list.h
paddle/fluid/platform/device/xpu/xpu2_op_list.h
+12
-0
paddle/phi/kernels/xpu/xpu_api_wrapper.h
paddle/phi/kernels/xpu/xpu_api_wrapper.h
+8
-1
python/paddle/fluid/tests/unittests/xpu/test_fused_attention_op_xpu.py
.../fluid/tests/unittests/xpu/test_fused_attention_op_xpu.py
+331
-0
python/paddle/fluid/tests/unittests/xpu/test_fused_feedforward_op_xpu.py
...luid/tests/unittests/xpu/test_fused_feedforward_op_xpu.py
+379
-0
未找到文件。
paddle/fluid/operators/fused/CMakeLists.txt
浏览文件 @
071708fa
...
...
@@ -38,6 +38,8 @@ if(WITH_XPU)
op_library
(
resnet_basic_block_op
)
op_library
(
resnet_unit_op
)
op_library
(
fused_gemm_epilogue_op
)
op_library
(
fused_attention_op
)
op_library
(
fused_feedforward_op
)
endif
()
if
(
WITH_GPU OR WITH_ROCM
)
...
...
paddle/fluid/operators/fused/fused_attention_op_xpu.cc
0 → 100644
浏览文件 @
071708fa
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/fused/xpu_fused_common_function.h"
#include "paddle/fluid/operators/matmul_v2_op.h"
#include "paddle/fluid/operators/xpu_api_wrapper.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
namespace
paddle
{
namespace
operators
{
using
Tensor
=
phi
::
DenseTensor
;
template
<
typename
DeviceContext
,
typename
T
>
class
FusedAttentionOpKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
using
XPUTypeT
=
typename
XPUTypeTrait
<
T
>::
Type
;
// inputs tensor
auto
*
input_x
=
ctx
.
Input
<
Tensor
>
(
"X"
);
const
auto
pre_layer_norm
=
ctx
.
Attr
<
bool
>
(
"pre_layer_norm"
);
// shape [3, num_head, dim_head, dim_embed]
auto
*
qkv_weight
=
ctx
.
Input
<
Tensor
>
(
"QKVW"
);
// shape [3 , num_head, dim_head]
auto
*
qkv_bias
=
ctx
.
Input
<
Tensor
>
(
"QKVBias"
);
// shape [batch_size, 1, 1, seq_len]
auto
*
src_mask
=
ctx
.
Input
<
Tensor
>
(
"SrcMask"
);
// shape [dim_embed, dim_embed]
auto
*
out_linear_weight
=
ctx
.
Input
<
Tensor
>
(
"OutLinearW"
);
// shape [dim_embed]
auto
*
out_linear_bias
=
ctx
.
Input
<
Tensor
>
(
"OutLinearBias"
);
const
Tensor
*
ln_scale
=
nullptr
;
const
Tensor
*
ln_bias
=
nullptr
;
float
epsilon
=
0.0
f
;
if
(
pre_layer_norm
)
{
ln_scale
=
ctx
.
Input
<
Tensor
>
(
"LnScale"
);
ln_bias
=
ctx
.
Input
<
Tensor
>
(
"LnBias"
);
epsilon
=
ctx
.
Attr
<
float
>
(
"epsilon"
);
}
else
{
ln_scale
=
ctx
.
Input
<
Tensor
>
(
"Ln2Scale"
);
ln_bias
=
ctx
.
Input
<
Tensor
>
(
"Ln2Bias"
);
epsilon
=
ctx
.
Attr
<
float
>
(
"ln_epsilon"
);
}
// outputs tensor
// qkv 的值,并已经做了transpos后的值
// shape [3, batch_size, num_head, seq_len, dim_head]
auto
*
TransposeOut2
=
ctx
.
Output
<
Tensor
>
(
"TransposeOut2"
);
// shape [batch_size, num_head, seq_len, seq_len]
auto
*
softmax_out
=
ctx
.
Output
<
Tensor
>
(
"SoftmaxOut"
);
// shape [batch_size, num_head, seq_len, seq_len]
auto
*
attn_dropout_mask_out
=
ctx
.
Output
<
Tensor
>
(
"AttnDropoutMaskOut"
);
// shape [batch_size, num_head, seq_len, seq_len]
auto
*
attn_dropout_out
=
ctx
.
Output
<
Tensor
>
(
"AttnDropoutOut"
);
// shape [[batch_size, seq_len, num_head, dim_head]]
auto
*
fmha_out
=
ctx
.
Output
<
Tensor
>
(
"FMHAOut"
);
// shape [batch_size, seq_len, dim_embed]
auto
*
dropout_mask_out
=
ctx
.
Output
<
Tensor
>
(
"DropoutMaskOut"
);
// final output
// shape [batch_size, seq_len, dim_embed]
auto
*
out
=
ctx
.
Output
<
Tensor
>
(
"Y"
);
// 下面这个tensor是不需要返回, 但是新的动态图需要
auto
*
QKOut
=
ctx
.
Output
<
Tensor
>
(
"QKOut"
);
QKOut
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
auto
*
QKTVOut
=
ctx
.
Output
<
Tensor
>
(
"QKTVOut"
);
QKTVOut
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
auto
*
OutLinearOut
=
ctx
.
Output
<
Tensor
>
(
"OutLinearOut"
);
OutLinearOut
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
auto
*
QKVBiasOut
=
ctx
.
Output
<
Tensor
>
(
"QKVBiasOut"
);
QKVBiasOut
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
auto
*
SrcMaskOut
=
ctx
.
Output
<
Tensor
>
(
"SrcMaskOut"
);
SrcMaskOut
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
auto
*
qkv_out
=
ctx
.
Output
<
Tensor
>
(
"QKVOut"
);
qkv_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
Tensor
*
bias_dropout_residual_out
=
nullptr
;
Tensor
*
ln_mean
=
nullptr
;
Tensor
*
ln_var
=
nullptr
;
Tensor
*
ln_out
=
nullptr
;
if
(
pre_layer_norm
)
{
ln_mean
=
ctx
.
Output
<
Tensor
>
(
"LnMean"
);
ln_var
=
ctx
.
Output
<
Tensor
>
(
"LnVariance"
);
ln_out
=
ctx
.
Output
<
Tensor
>
(
"LnOut"
);
}
else
{
ln_mean
=
ctx
.
Output
<
Tensor
>
(
"Ln2Mean"
);
ln_var
=
ctx
.
Output
<
Tensor
>
(
"Ln2Variance"
);
bias_dropout_residual_out
=
ctx
.
Output
<
Tensor
>
(
"BiasDropoutResidualOut"
);
}
// dropout info
float
attn_dropout_rate
=
ctx
.
Attr
<
float
>
(
"attn_dropout_rate"
);
bool
is_test_1
=
ctx
.
Attr
<
bool
>
(
"is_test"
);
auto
&
dropout_implementation_1
=
ctx
.
Attr
<
std
::
string
>
(
"attn_dropout_implementation"
);
bool
is_upscale_in_train_1
=
(
dropout_implementation_1
==
"upscale_in_train"
);
auto
*
seed_1
=
ctx
.
HasInput
(
"Seed1"
)
?
ctx
.
Input
<
Tensor
>
(
"Seed1"
)
:
nullptr
;
bool
is_fix_seed_1
=
ctx
.
Attr
<
bool
>
(
"attn_dropout_fix_seed"
);
int
seed_val_1
=
ctx
.
Attr
<
int
>
(
"attn_dropout_seed"
);
XPUDropoutParam
attn_dropout_param
;
attn_dropout_param
.
initXPUDropoutParam
(
attn_dropout_rate
,
is_upscale_in_train_1
,
is_test_1
,
is_fix_seed_1
,
seed_1
,
seed_val_1
);
XPUDropoutParam
dropout_param
(
ctx
,
0
);
// 先计算纬度
const
auto
input_x_dims
=
input_x
->
dims
();
const
auto
qkv_w_dims
=
qkv_weight
->
dims
();
int
batch_size
=
input_x_dims
[
0
];
int
seq_len
=
input_x_dims
[
1
];
int
embed_dims
=
input_x_dims
[
2
];
int
num_heads
=
qkv_w_dims
[
1
];
int
head_dims
=
qkv_w_dims
[
2
];
// 输入指针
const
XPUTypeT
*
input_x_ptr
=
reinterpret_cast
<
const
XPUTypeT
*>
(
input_x
->
data
<
T
>
());
const
XPUTypeT
*
qkv_weight_ptr
=
reinterpret_cast
<
const
XPUTypeT
*>
(
qkv_weight
->
data
<
T
>
());
const
XPUTypeT
*
qkv_bias_ptr
=
reinterpret_cast
<
const
XPUTypeT
*>
(
qkv_bias
->
data
<
T
>
());
const
XPUTypeT
*
src_mask_ptr
=
(
src_mask
==
nullptr
)
?
(
nullptr
)
:
(
reinterpret_cast
<
const
XPUTypeT
*>
(
src_mask
->
data
<
T
>
()));
const
XPUTypeT
*
out_linear_weight_ptr
=
reinterpret_cast
<
const
XPUTypeT
*>
(
out_linear_weight
->
data
<
T
>
());
const
XPUTypeT
*
out_linear_bias_ptr
=
reinterpret_cast
<
const
XPUTypeT
*>
(
out_linear_bias
->
data
<
T
>
());
const
float
*
ln_scale_ptr
=
(
ln_scale
==
nullptr
)
?
(
nullptr
)
:
ln_scale
->
data
<
float
>
();
const
float
*
ln_bias_ptr
=
(
ln_bias
==
nullptr
)
?
(
nullptr
)
:
ln_bias
->
data
<
float
>
();
// 输出指针
XPUTypeT
*
qkv_transpose_out_ptr
=
reinterpret_cast
<
XPUTypeT
*>
(
TransposeOut2
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()));
XPUTypeT
*
softmax_out_ptr
=
reinterpret_cast
<
XPUTypeT
*>
(
softmax_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()));
XPUTypeT
*
attn_dropout_mask_out_ptr
=
reinterpret_cast
<
XPUTypeT
*>
(
attn_dropout_mask_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()));
XPUTypeT
*
attn_dropout_out_ptr
=
reinterpret_cast
<
XPUTypeT
*>
(
attn_dropout_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()));
XPUTypeT
*
fmha_out_ptr
=
reinterpret_cast
<
XPUTypeT
*>
(
fmha_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()));
XPUTypeT
*
dropout_mask_out_ptr
=
reinterpret_cast
<
XPUTypeT
*>
(
dropout_mask_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()));
XPUTypeT
*
out_ptr
=
reinterpret_cast
<
XPUTypeT
*>
(
out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()));
XPUTypeT
*
bias_dropout_residual_out_ptr
=
(
bias_dropout_residual_out
==
nullptr
)
?
(
nullptr
)
:
(
reinterpret_cast
<
XPUTypeT
*>
(
bias_dropout_residual_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
())));
float
*
ln_mean_ptr
=
(
ln_mean
==
nullptr
)
?
(
nullptr
)
:
ln_mean
->
mutable_data
<
float
>
(
ctx
.
GetPlace
());
float
*
ln_var_ptr
=
(
ln_var
==
nullptr
)
?
(
nullptr
)
:
ln_var
->
mutable_data
<
float
>
(
ctx
.
GetPlace
());
XPUTypeT
*
ln_out_ptr
=
(
ln_out
==
nullptr
)
?
(
nullptr
)
:
(
reinterpret_cast
<
XPUTypeT
*>
(
ln_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
())));
auto
&
dev_ctx
=
ctx
.
template
device_context
<
DeviceContext
>();
xpu
::
Context
*
xpu_ctx
=
dev_ctx
.
x_context
();
xpu
::
ctx_guard
RAII_GUARD
(
xpu_ctx
);
int
l3_total_size
=
xpu_ctx
->
_l3_mgr
.
get_size
();
XPUTypeT
*
qkv_before_transpos_ptr
=
NULL
;
// x2[batch_size, seq_len, 3, num_heads,head_dims]
XPUTypeT
*
qk_ptr
=
NULL
;
// qk [batch_size, num_heads, seq_len, seq_len]
XPUTypeT
*
qkv_ptr
=
NULL
;
// qkv[batch_size, num_heads, seq_len, head_dims]
XPUTypeT
*
linear_out_ptr
=
NULL
;
// x4, x5 [batch_size, seq_len, embed_dims]
int
temp_size_1
=
batch_size
*
seq_len
*
3
*
num_heads
*
head_dims
;
int
temp_size_2
=
batch_size
*
num_heads
*
seq_len
*
seq_len
;
int
temp_size_3
=
batch_size
*
num_heads
*
seq_len
*
head_dims
;
int
temp_size_4
=
batch_size
*
seq_len
*
embed_dims
;
std
::
vector
<
int
>
temp_vec
=
{
temp_size_1
,
temp_size_2
,
temp_size_3
,
temp_size_4
};
std
::
sort
(
temp_vec
.
begin
(),
temp_vec
.
end
(),
std
::
greater
<
int
>
());
XPUTypeT
*
max_gm_ptr
=
RAII_GUARD
.
alloc
<
XPUTypeT
>
(
temp_vec
[
0
]);
PADDLE_ENFORCE_XDNN_NOT_NULL
(
max_gm_ptr
);
qkv_before_transpos_ptr
=
max_gm_ptr
;
qk_ptr
=
max_gm_ptr
;
qkv_ptr
=
max_gm_ptr
;
linear_out_ptr
=
max_gm_ptr
;
int
sizeof_t
=
sizeof
(
XPUTypeT
);
for
(
size_t
i
=
0
;
i
<
temp_vec
.
size
();
++
i
)
{
if
(
l3_total_size
>=
temp_vec
[
i
]
*
sizeof_t
)
{
XPUTypeT
*
l3_ptr
=
RAII_GUARD
.
alloc_l3
<
XPUTypeT
>
(
temp_vec
[
i
]);
qkv_before_transpos_ptr
=
(
temp_size_1
<=
temp_vec
[
i
])
?
l3_ptr
:
max_gm_ptr
;
qk_ptr
=
(
temp_size_2
<=
temp_vec
[
i
])
?
l3_ptr
:
max_gm_ptr
;
qkv_ptr
=
(
temp_size_3
<=
temp_vec
[
i
])
?
l3_ptr
:
max_gm_ptr
;
linear_out_ptr
=
(
temp_size_4
<=
temp_vec
[
i
])
?
l3_ptr
:
max_gm_ptr
;
break
;
}
}
int
r
=
0
;
const
XPUTypeT
*
x_cacl_ptr
=
input_x_ptr
;
if
(
pre_layer_norm
)
{
r
=
xpu
::
layer_norm
(
xpu_ctx
,
input_x_ptr
,
ln_out_ptr
,
batch_size
*
seq_len
,
embed_dims
,
epsilon
,
ln_scale_ptr
,
ln_bias_ptr
,
ln_mean_ptr
,
ln_var_ptr
);
PADDLE_ENFORCE_XDNN_SUCCESS
(
r
,
"layer_norm"
);
x_cacl_ptr
=
ln_out_ptr
;
}
// fc
phi
::
XpuFcInfo
qkv_fc_info
;
qkv_fc_info
.
InitFcInfo
(
0
,
batch_size
*
seq_len
,
3
*
num_heads
*
head_dims
,
embed_dims
,
false
,
true
,
nullptr
,
nullptr
,
nullptr
);
phi
::
MatMulXPUFunction
<
XPUTypeT
>
(
xpu_ctx
,
x_cacl_ptr
,
qkv_weight_ptr
,
qkv_before_transpos_ptr
,
qkv_fc_info
,
1.0
f
);
// bias
r
=
xpu
::
broadcast_add
(
xpu_ctx
,
qkv_before_transpos_ptr
,
qkv_bias_ptr
,
qkv_before_transpos_ptr
,
{
batch_size
*
seq_len
,
3
*
num_heads
*
head_dims
},
{
3
*
num_heads
*
head_dims
});
PADDLE_ENFORCE_XDNN_SUCCESS
(
r
,
"broadcast_add"
);
// transpose
r
=
xpu
::
transpose
(
xpu_ctx
,
qkv_before_transpos_ptr
,
qkv_transpose_out_ptr
,
{
batch_size
,
seq_len
,
3
,
num_heads
,
head_dims
},
{
2
,
0
,
3
,
1
,
4
});
PADDLE_ENFORCE_XDNN_SUCCESS
(
r
,
"transpose"
);
int
qkv_every_size
=
batch_size
*
seq_len
*
num_heads
*
head_dims
;
{
float
alpha
=
1.0
/
sqrt
(
head_dims
);
r
=
scale
(
xpu_ctx
,
qkv_transpose_out_ptr
,
qkv_transpose_out_ptr
,
qkv_every_size
,
false
,
alpha
,
0.0
f
);
PADDLE_ENFORCE_XDNN_SUCCESS
(
r
,
"scale"
);
}
// begin fhma
// 1. qk 2. qk + mask 3. softmax 4.dropout 5. qkv 6. transpos
{
const
XPUTypeT
*
q_ptr
=
qkv_transpose_out_ptr
;
const
XPUTypeT
*
k_ptr
=
q_ptr
+
qkv_every_size
;
const
XPUTypeT
*
v_ptr
=
k_ptr
+
qkv_every_size
;
phi
::
XpuFcInfo
qk_fc_info
;
qk_fc_info
.
InitFcInfo
(
batch_size
*
num_heads
,
seq_len
,
seq_len
,
head_dims
,
false
,
true
,
nullptr
,
nullptr
,
nullptr
);
phi
::
MatMulXPUFunction
<
XPUTypeT
>
(
xpu_ctx
,
q_ptr
,
k_ptr
,
qk_ptr
,
qk_fc_info
,
1.0
f
);
if
(
src_mask_ptr
)
{
r
=
xpu
::
broadcast_add
(
xpu_ctx
,
qk_ptr
,
src_mask_ptr
,
qk_ptr
,
{
batch_size
,
num_heads
,
seq_len
,
seq_len
},
{
batch_size
,
1
,
1
,
seq_len
});
PADDLE_ENFORCE_XDNN_SUCCESS
(
r
,
"broadcast_add"
);
}
// do softmax
r
=
xpu
::
softmax
(
xpu_ctx
,
qk_ptr
,
softmax_out_ptr
,
{
batch_size
,
num_heads
,
seq_len
,
seq_len
},
3
);
PADDLE_ENFORCE_XDNN_SUCCESS
(
r
,
"softmax"
);
// do dropout
Dropout
<
XPUTypeT
>
(
xpu_ctx
,
softmax_out_ptr
,
attn_dropout_mask_out_ptr
,
attn_dropout_out_ptr
,
attn_dropout_param
,
batch_size
*
num_heads
*
seq_len
*
seq_len
);
phi
::
XpuFcInfo
qktv_fc_info
;
qktv_fc_info
.
InitFcInfo
(
batch_size
*
num_heads
,
seq_len
,
head_dims
,
seq_len
,
false
,
false
,
nullptr
,
nullptr
,
nullptr
);
phi
::
MatMulXPUFunction
<
XPUTypeT
>
(
xpu_ctx
,
attn_dropout_out_ptr
,
v_ptr
,
qkv_ptr
,
qktv_fc_info
,
1.0
f
);
r
=
xpu
::
transpose
(
xpu_ctx
,
qkv_ptr
,
fmha_out_ptr
,
{
batch_size
,
num_heads
,
seq_len
,
head_dims
},
{
0
,
2
,
1
,
3
});
PADDLE_ENFORCE_XDNN_SUCCESS
(
r
,
"transpose"
);
}
// linear_out
phi
::
XpuFcInfo
linear_fc_info
;
linear_fc_info
.
InitFcInfo
(
0
,
batch_size
*
seq_len
,
embed_dims
,
embed_dims
,
false
,
false
,
nullptr
,
nullptr
,
nullptr
);
phi
::
MatMulXPUFunction
<
XPUTypeT
>
(
xpu_ctx
,
fmha_out_ptr
,
out_linear_weight_ptr
,
linear_out_ptr
,
linear_fc_info
,
1.0
f
);
// out_linear_bias_ptr
r
=
xpu
::
broadcast_add
(
xpu_ctx
,
linear_out_ptr
,
out_linear_bias_ptr
,
linear_out_ptr
,
{
batch_size
*
seq_len
,
embed_dims
},
{
embed_dims
});
PADDLE_ENFORCE_XDNN_SUCCESS
(
r
,
"broadcast_add"
);
Dropout
(
xpu_ctx
,
linear_out_ptr
,
dropout_mask_out_ptr
,
linear_out_ptr
,
dropout_param
,
batch_size
*
seq_len
*
embed_dims
);
XPUTypeT
*
real_out_ptr
=
out_ptr
;
if
(
pre_layer_norm
==
false
)
{
real_out_ptr
=
bias_dropout_residual_out_ptr
;
}
r
=
xpu
::
add
(
xpu_ctx
,
linear_out_ptr
,
input_x_ptr
,
real_out_ptr
,
batch_size
*
seq_len
*
embed_dims
);
PADDLE_ENFORCE_XDNN_SUCCESS
(
r
,
"add"
);
if
(
pre_layer_norm
==
false
)
{
r
=
xpu
::
layer_norm
(
xpu_ctx
,
real_out_ptr
,
out_ptr
,
batch_size
*
seq_len
,
embed_dims
,
epsilon
,
ln_scale_ptr
,
ln_bias_ptr
,
ln_mean_ptr
,
ln_var_ptr
);
PADDLE_ENFORCE_XDNN_SUCCESS
(
r
,
"layer_norm"
);
}
}
};
// template <typename T>
template
<
typename
DeviceContext
,
typename
T
>
class
FusedAttentionGradXPUKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
using
XPUTypeT
=
typename
XPUTypeTrait
<
T
>::
Type
;
const
auto
pre_layer_norm
=
ctx
.
Attr
<
bool
>
(
"pre_layer_norm"
);
// dropout info
float
attn_dropout_prob
=
ctx
.
Attr
<
float
>
(
"attn_dropout_rate"
);
bool
is_test_1
=
ctx
.
Attr
<
bool
>
(
"is_test"
);
auto
&
dropout_implementation_1
=
ctx
.
Attr
<
std
::
string
>
(
"attn_dropout_implementation"
);
bool
is_upscale_in_train_1
=
(
dropout_implementation_1
==
"upscale_in_train"
);
auto
*
seed_1
=
ctx
.
HasInput
(
"Seed1"
)
?
ctx
.
Input
<
Tensor
>
(
"Seed1"
)
:
nullptr
;
bool
is_fix_seed_1
=
ctx
.
Attr
<
bool
>
(
"attn_dropout_fix_seed"
);
int
seed_val_1
=
ctx
.
Attr
<
int
>
(
"attn_dropout_seed"
);
XPUDropoutParam
attn_dropout_param
;
attn_dropout_param
.
initXPUDropoutParam
(
attn_dropout_prob
,
is_upscale_in_train_1
,
is_test_1
,
is_fix_seed_1
,
seed_1
,
seed_val_1
);
XPUDropoutParam
dropout_param
(
ctx
,
0
);
// get inputs.
auto
*
d_y
=
ctx
.
Input
<
Tensor
>
(
framework
::
GradVarName
(
"Y"
));
const
XPUTypeT
*
d_y_ptr
=
reinterpret_cast
<
const
XPUTypeT
*>
(
d_y
->
data
<
T
>
());
// 前向必要参数
auto
*
input_x
=
ctx
.
Input
<
Tensor
>
(
"X"
);
const
XPUTypeT
*
input_x_ptr
=
reinterpret_cast
<
const
XPUTypeT
*>
(
input_x
->
data
<
T
>
());
auto
*
qkv_transpose_out
=
ctx
.
Input
<
Tensor
>
(
"TransposeOut2"
);
const
XPUTypeT
*
qkv_transpose_out_ptr
=
reinterpret_cast
<
const
XPUTypeT
*>
(
qkv_transpose_out
->
data
<
T
>
());
auto
*
qkv_weight
=
ctx
.
Input
<
Tensor
>
(
"QKVW"
);
const
XPUTypeT
*
qkv_weight_ptr
=
reinterpret_cast
<
const
XPUTypeT
*>
(
qkv_weight
->
data
<
T
>
());
auto
*
softmax_out
=
ctx
.
Input
<
Tensor
>
(
"SoftmaxOut"
);
const
XPUTypeT
*
softmax_out_ptr
=
reinterpret_cast
<
const
XPUTypeT
*>
(
softmax_out
->
data
<
T
>
());
auto
*
attn_dropout_out
=
ctx
.
Input
<
Tensor
>
(
"AttnDropoutOut"
);
const
XPUTypeT
*
attn_dropout_out_ptr
=
reinterpret_cast
<
const
XPUTypeT
*>
(
attn_dropout_out
->
data
<
T
>
());
auto
*
attn_dropout_mask
=
ctx
.
Input
<
Tensor
>
(
"AttnDropoutMaskOut"
);
const
XPUTypeT
*
attn_dropout_mask_ptr
=
reinterpret_cast
<
const
XPUTypeT
*>
(
attn_dropout_mask
->
data
<
T
>
());
auto
*
fmha_out
=
ctx
.
Input
<
Tensor
>
(
"FMHAOut"
);
const
XPUTypeT
*
fmha_out_ptr
=
reinterpret_cast
<
const
XPUTypeT
*>
(
fmha_out
->
data
<
T
>
());
auto
*
out_linear_weight
=
ctx
.
Input
<
Tensor
>
(
"OutLinearW"
);
const
XPUTypeT
*
out_linear_weight_ptr
=
reinterpret_cast
<
const
XPUTypeT
*>
(
out_linear_weight
->
data
<
T
>
());
auto
*
dropout_mask_out
=
ctx
.
Input
<
Tensor
>
(
"DropoutMaskOut"
);
const
XPUTypeT
*
dropout_mask_out_ptr
=
reinterpret_cast
<
const
XPUTypeT
*>
(
dropout_mask_out
->
data
<
T
>
());
// 需要计算的梯度
auto
*
d_qkv_weight
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"QKVW"
));
XPUTypeT
*
d_qkv_weight_ptr
=
reinterpret_cast
<
XPUTypeT
*>
(
d_qkv_weight
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()));
auto
*
d_qkv_bias
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"QKVBias"
));
XPUTypeT
*
d_qkv_bias_ptr
=
reinterpret_cast
<
XPUTypeT
*>
(
d_qkv_bias
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()));
auto
*
d_out_linear_weight
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"OutLinearW"
));
XPUTypeT
*
d_out_linear_weight_ptr
=
reinterpret_cast
<
XPUTypeT
*>
(
d_out_linear_weight
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()));
auto
*
d_out_linear_bias
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"OutLinearBias"
));
XPUTypeT
*
d_out_linear_bias_ptr
=
reinterpret_cast
<
XPUTypeT
*>
(
d_out_linear_bias
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()));
// 有可能需要
auto
*
d_src_mask_out
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"SrcMaskOut"
));
XPUTypeT
*
d_src_mask_out_ptr
=
(
d_src_mask_out
==
nullptr
)
?
(
nullptr
)
:
(
reinterpret_cast
<
XPUTypeT
*>
(
d_src_mask_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
())));
// 输出 dx
auto
*
d_x
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"X"
));
XPUTypeT
*
d_x_ptr
=
reinterpret_cast
<
XPUTypeT
*>
(
d_x
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()));
const
Tensor
*
ln_out
=
nullptr
;
const
Tensor
*
bias_dropout_residual_out
=
nullptr
;
const
Tensor
*
ln_scale
=
nullptr
;
const
Tensor
*
ln_mean
=
nullptr
;
const
Tensor
*
ln_var
=
nullptr
;
Tensor
*
d_ln_scale
=
nullptr
;
Tensor
*
d_ln_bias
=
nullptr
;
const
XPUTypeT
*
ln_out_ptr
=
NULL
;
const
float
*
ln_scale_ptr
=
NULL
;
const
float
*
ln_mean_ptr
=
NULL
;
const
float
*
ln_var_ptr
=
NULL
;
const
XPUTypeT
*
bias_dropout_residual_out_ptr
=
NULL
;
float
*
d_ln_scale_ptr
=
nullptr
;
float
*
d_ln_bias_ptr
=
nullptr
;
float
epsilon
=
0.0
f
;
if
(
pre_layer_norm
)
{
ln_out
=
ctx
.
Input
<
Tensor
>
(
"LnOut"
);
ln_out_ptr
=
reinterpret_cast
<
const
XPUTypeT
*>
(
ln_out
->
data
<
T
>
());
ln_scale
=
ctx
.
Input
<
Tensor
>
(
"LnScale"
);
ln_mean
=
ctx
.
Input
<
Tensor
>
(
"LnMean"
);
ln_var
=
ctx
.
Input
<
Tensor
>
(
"LnVariance"
);
epsilon
=
ctx
.
Attr
<
float
>
(
"epsilon"
);
d_ln_scale
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"LnScale"
));
d_ln_bias
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"LnBias"
));
}
else
{
ln_scale
=
ctx
.
Input
<
Tensor
>
(
"Ln2Scale"
);
ln_mean
=
ctx
.
Input
<
Tensor
>
(
"Ln2Mean"
);
ln_var
=
ctx
.
Input
<
Tensor
>
(
"Ln2Variance"
);
epsilon
=
ctx
.
Attr
<
float
>
(
"ln_epsilon"
);
d_ln_scale
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"Ln2Scale"
));
d_ln_bias
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"Ln2Bias"
));
bias_dropout_residual_out
=
ctx
.
Input
<
Tensor
>
(
"BiasDropoutResidualOut"
);
bias_dropout_residual_out_ptr
=
reinterpret_cast
<
const
XPUTypeT
*>
(
bias_dropout_residual_out
->
data
<
T
>
());
}
ln_scale_ptr
=
ln_scale
->
data
<
float
>
();
ln_mean_ptr
=
ln_mean
->
data
<
float
>
();
ln_var_ptr
=
ln_var
->
data
<
float
>
();
d_ln_scale_ptr
=
d_ln_scale
->
mutable_data
<
float
>
(
ctx
.
GetPlace
());
d_ln_bias_ptr
=
d_ln_bias
->
mutable_data
<
float
>
(
ctx
.
GetPlace
());
const
auto
input_x_dims
=
input_x
->
dims
();
const
auto
qkv_w_dims
=
qkv_weight
->
dims
();
int
batch_size
=
input_x_dims
[
0
];
int
seq_len
=
input_x_dims
[
1
];
int
embed_dims
=
input_x_dims
[
2
];
int
num_heads
=
qkv_w_dims
[
1
];
int
head_dims
=
qkv_w_dims
[
2
];
auto
&
dev_ctx
=
ctx
.
template
device_context
<
DeviceContext
>();
xpu
::
Context
*
xpu_ctx
=
dev_ctx
.
x_context
();
xpu
::
ctx_guard
RAII_GUARD
(
xpu_ctx
);
int
r
=
0
;
// int l3_total_size = xpu_ctx->_l3_mgr.get_size();
XPUTypeT
*
d_ln_grad_ptr
=
NULL
;
// dx5 [batch_size, seq_len, hidden]
XPUTypeT
*
d_dropout_grad_ptr
=
NULL
;
// dx5 [batch_size, seq_len, hidden]
XPUTypeT
*
d_fmha_out_ptr
=
NULL
;
// d_fmha_out [batch_size, seq_len, num_heads, head_dims]
XPUTypeT
*
d_fmha_out_transpos_tmp_ptr
=
NULL
;
// d_fmha_out_transpos [batch_size, seq_len, num_heads,
// head_dims]
XPUTypeT
*
d_qk_ptr
=
NULL
;
// d_qk_ptr[batch_size, num_heads, seq_len, seq_len]
XPUTypeT
*
d_combination_qkv_ptr
=
NULL
;
// d_combination_qkv_ptr[3, batch_size, num_heads, seq_len,
// head_dims]
XPUTypeT
*
d_transpos_qkv_ptr
=
NULL
;
// dx2 [batch_size, seq_len, 3, num_heads, head_dims]
XPUTypeT
*
d_last_layernorm_grad_ptr
=
NULL
;
// d_layer_out [batch_size, seq_len, embed_dims]
const
XPUTypeT
*
dy_input_ptr
=
d_y_ptr
;
d_ln_grad_ptr
=
RAII_GUARD
.
alloc
<
XPUTypeT
>
(
batch_size
*
seq_len
*
embed_dims
);
d_dropout_grad_ptr
=
RAII_GUARD
.
alloc_l3_or_gm
<
XPUTypeT
>
(
batch_size
*
seq_len
*
embed_dims
);
d_fmha_out_ptr
=
RAII_GUARD
.
alloc_l3_or_gm
<
XPUTypeT
>
(
batch_size
*
seq_len
*
num_heads
*
head_dims
);
d_combination_qkv_ptr
=
RAII_GUARD
.
alloc
<
XPUTypeT
>
(
batch_size
*
seq_len
*
embed_dims
*
3
);
d_transpos_qkv_ptr
=
RAII_GUARD
.
alloc_l3_or_gm
<
XPUTypeT
>
(
batch_size
*
seq_len
*
embed_dims
*
3
);
d_fmha_out_transpos_tmp_ptr
=
RAII_GUARD
.
alloc_l3_or_gm
<
XPUTypeT
>
(
batch_size
*
seq_len
*
embed_dims
);
d_qk_ptr
=
RAII_GUARD
.
alloc_l3_or_gm
<
XPUTypeT
>
(
batch_size
*
seq_len
*
seq_len
*
num_heads
);
d_last_layernorm_grad_ptr
=
RAII_GUARD
.
alloc_l3_or_gm
<
XPUTypeT
>
(
batch_size
*
seq_len
*
embed_dims
);
if
(
pre_layer_norm
==
false
)
{
r
=
xpu
::
layer_norm_grad
(
xpu_ctx
,
bias_dropout_residual_out_ptr
,
d_y_ptr
,
d_ln_grad_ptr
,
batch_size
*
seq_len
,
embed_dims
,
epsilon
,
ln_scale_ptr
,
ln_mean_ptr
,
ln_var_ptr
,
d_ln_scale_ptr
,
d_ln_bias_ptr
);
PADDLE_ENFORCE_XDNN_SUCCESS
(
r
,
"layer_norm_grad"
);
dy_input_ptr
=
d_ln_grad_ptr
;
}
// dropout_grad
DropoutGrad
<
XPUTypeT
>
(
xpu_ctx
,
dy_input_ptr
,
dropout_mask_out_ptr
,
d_dropout_grad_ptr
,
dropout_param
,
batch_size
*
num_heads
*
seq_len
*
head_dims
);
// linear_out
phi
::
XpuFcInfo
linear_fc_info
;
linear_fc_info
.
InitFcInfo
(
0
,
batch_size
*
seq_len
,
embed_dims
,
embed_dims
,
false
,
false
,
nullptr
,
nullptr
,
nullptr
);
const
XPUTypeT
*
a_1
=
reinterpret_cast
<
const
XPUTypeT
*>
(
NULL
);
const
XPUTypeT
*
b_1
=
reinterpret_cast
<
const
XPUTypeT
*>
(
NULL
);
const
XPUTypeT
*
a_2
=
reinterpret_cast
<
const
XPUTypeT
*>
(
NULL
);
const
XPUTypeT
*
b_2
=
reinterpret_cast
<
const
XPUTypeT
*>
(
NULL
);
XPUTypeT
*
c_1
=
d_fmha_out_ptr
;
XPUTypeT
*
c_2
=
d_out_linear_weight_ptr
;
phi
::
XpuFcInfo
info_dfmha
;
phi
::
XpuFcInfo
info_dlinear_w
;
std
::
tuple
<
phi
::
XpuFcInfo
,
phi
::
XpuFcInfo
,
const
XPUTypeT
*
,
const
XPUTypeT
*
,
const
XPUTypeT
*
,
const
XPUTypeT
*>
fc_info
=
phi
::
MatmulGradFcInfo
(
xpu_ctx
,
&
RAII_GUARD
,
linear_fc_info
,
false
,
false
,
fmha_out_ptr
,
out_linear_weight_ptr
,
d_dropout_grad_ptr
);
std
::
tie
(
info_dfmha
,
info_dlinear_w
,
a_1
,
b_1
,
a_2
,
b_2
)
=
fc_info
;
phi
::
MatMulXPUFunction
<
XPUTypeT
>
(
xpu_ctx
,
a_2
,
b_2
,
c_2
,
info_dlinear_w
,
1.0
f
,
true
);
phi
::
MatMulXPUFunction
<
XPUTypeT
>
(
xpu_ctx
,
a_1
,
b_1
,
c_1
,
info_dfmha
,
1.0
f
,
true
);
// dlinear_bias
r
=
xpu
::
reduce_sum
(
xpu_ctx
,
d_dropout_grad_ptr
,
d_out_linear_bias_ptr
,
{
batch_size
*
seq_len
,
embed_dims
},
{
0
});
PADDLE_ENFORCE_XDNN_SUCCESS
(
r
,
"reduce_sum"
);
{
int
qkv_size
=
batch_size
*
seq_len
*
num_heads
*
head_dims
;
const
XPUTypeT
*
q_out_ptr
=
qkv_transpose_out_ptr
;
const
XPUTypeT
*
k_out_ptr
=
q_out_ptr
+
qkv_size
;
const
XPUTypeT
*
v_out_ptr
=
k_out_ptr
+
qkv_size
;
XPUTypeT
*
d_q_out_ptr
=
d_combination_qkv_ptr
;
XPUTypeT
*
d_k_out_ptr
=
d_q_out_ptr
+
qkv_size
;
XPUTypeT
*
d_v_out_ptr
=
d_k_out_ptr
+
qkv_size
;
r
=
xpu
::
transpose
<
XPUTypeT
>
(
xpu_ctx
,
d_fmha_out_ptr
,
d_fmha_out_transpos_tmp_ptr
,
{
batch_size
,
seq_len
,
num_heads
,
head_dims
},
{
0
,
2
,
1
,
3
});
PADDLE_ENFORCE_XDNN_SUCCESS
(
r
,
"transpose"
);
phi
::
XpuFcInfo
qktv_fc_info
;
qktv_fc_info
.
InitFcInfo
(
batch_size
*
num_heads
,
seq_len
,
head_dims
,
seq_len
,
false
,
false
,
nullptr
,
nullptr
,
nullptr
);
const
XPUTypeT
*
a_1
=
reinterpret_cast
<
const
XPUTypeT
*>
(
NULL
);
const
XPUTypeT
*
b_1
=
reinterpret_cast
<
const
XPUTypeT
*>
(
NULL
);
const
XPUTypeT
*
a_2
=
reinterpret_cast
<
const
XPUTypeT
*>
(
NULL
);
const
XPUTypeT
*
b_2
=
reinterpret_cast
<
const
XPUTypeT
*>
(
NULL
);
XPUTypeT
*
c_1
=
d_qk_ptr
;
XPUTypeT
*
c_2
=
d_v_out_ptr
;
phi
::
XpuFcInfo
info_d_qk
;
phi
::
XpuFcInfo
info_d_v
;
std
::
tuple
<
phi
::
XpuFcInfo
,
phi
::
XpuFcInfo
,
const
XPUTypeT
*
,
const
XPUTypeT
*
,
const
XPUTypeT
*
,
const
XPUTypeT
*>
fc_info
=
phi
::
MatmulGradFcInfo
(
xpu_ctx
,
&
RAII_GUARD
,
qktv_fc_info
,
false
,
false
,
attn_dropout_out_ptr
,
v_out_ptr
,
d_fmha_out_transpos_tmp_ptr
);
std
::
tie
(
info_d_qk
,
info_d_v
,
a_1
,
b_1
,
a_2
,
b_2
)
=
fc_info
;
phi
::
MatMulXPUFunction
<
XPUTypeT
>
(
xpu_ctx
,
a_1
,
b_1
,
c_1
,
info_d_qk
,
1.0
f
,
true
);
phi
::
MatMulXPUFunction
<
XPUTypeT
>
(
xpu_ctx
,
a_2
,
b_2
,
c_2
,
info_d_v
,
1.0
f
,
true
);
DropoutGrad
<
XPUTypeT
>
(
xpu_ctx
,
d_qk_ptr
,
attn_dropout_mask_ptr
,
d_qk_ptr
,
attn_dropout_param
,
batch_size
*
seq_len
*
seq_len
*
num_heads
);
r
=
xpu
::
softmax_grad
<
XPUTypeT
>
(
xpu_ctx
,
softmax_out_ptr
,
d_qk_ptr
,
d_qk_ptr
,
{
batch_size
,
num_heads
,
seq_len
,
seq_len
},
3
);
PADDLE_ENFORCE_XDNN_SUCCESS
(
r
,
"softmax_grad"
);
if
(
d_src_mask_out_ptr
)
{
r
=
xpu
::
copy
<
XPUTypeT
>
(
xpu_ctx
,
d_qk_ptr
,
d_src_mask_out_ptr
,
batch_size
*
seq_len
*
seq_len
*
num_heads
);
PADDLE_ENFORCE_XDNN_SUCCESS
(
r
,
"copy"
);
}
phi
::
XpuFcInfo
qk_fc_info
;
qk_fc_info
.
InitFcInfo
(
batch_size
*
num_heads
,
seq_len
,
seq_len
,
head_dims
,
false
,
true
,
nullptr
,
nullptr
,
nullptr
);
a_1
=
reinterpret_cast
<
const
XPUTypeT
*>
(
NULL
);
b_1
=
reinterpret_cast
<
const
XPUTypeT
*>
(
NULL
);
a_2
=
reinterpret_cast
<
const
XPUTypeT
*>
(
NULL
);
b_2
=
reinterpret_cast
<
const
XPUTypeT
*>
(
NULL
);
c_1
=
d_q_out_ptr
;
c_2
=
d_k_out_ptr
;
phi
::
XpuFcInfo
info_d_q
;
phi
::
XpuFcInfo
info_d_k
;
fc_info
=
phi
::
MatmulGradFcInfo
(
xpu_ctx
,
&
RAII_GUARD
,
qk_fc_info
,
false
,
true
,
q_out_ptr
,
k_out_ptr
,
d_qk_ptr
);
std
::
tie
(
info_d_q
,
info_d_k
,
a_1
,
b_1
,
a_2
,
b_2
)
=
fc_info
;
phi
::
MatMulXPUFunction
<
XPUTypeT
>
(
xpu_ctx
,
a_1
,
b_1
,
c_1
,
info_d_q
,
1.0
f
/
sqrt
(
head_dims
),
true
);
phi
::
MatMulXPUFunction
<
XPUTypeT
>
(
xpu_ctx
,
a_2
,
b_2
,
c_2
,
info_d_k
,
1.0
f
,
true
);
}
//
r
=
xpu
::
transpose
<
XPUTypeT
>
(
xpu_ctx
,
d_combination_qkv_ptr
,
d_transpos_qkv_ptr
,
{
3
,
batch_size
,
num_heads
,
seq_len
,
head_dims
},
{
1
,
3
,
0
,
2
,
4
});
PADDLE_ENFORCE_XDNN_SUCCESS
(
r
,
"transpose"
);
// dx and d_qkv_w
phi
::
XpuFcInfo
qkv_fc_info
;
qkv_fc_info
.
InitFcInfo
(
0
,
batch_size
*
seq_len
,
3
*
num_heads
*
head_dims
,
embed_dims
,
false
,
true
,
nullptr
,
nullptr
,
nullptr
);
a_1
=
reinterpret_cast
<
const
XPUTypeT
*>
(
NULL
);
b_1
=
reinterpret_cast
<
const
XPUTypeT
*>
(
NULL
);
a_2
=
reinterpret_cast
<
const
XPUTypeT
*>
(
NULL
);
b_2
=
reinterpret_cast
<
const
XPUTypeT
*>
(
NULL
);
c_1
=
(
pre_layer_norm
==
true
)
?
d_last_layernorm_grad_ptr
:
d_x_ptr
;
c_2
=
d_qkv_weight_ptr
;
phi
::
XpuFcInfo
info_d_x
;
phi
::
XpuFcInfo
info_d_qkv_w
;
const
XPUTypeT
*
use_calc_input_x_ptr
=
(
pre_layer_norm
==
true
)
?
ln_out_ptr
:
input_x_ptr
;
fc_info
=
phi
::
MatmulGradFcInfo
(
xpu_ctx
,
&
RAII_GUARD
,
qkv_fc_info
,
false
,
true
,
use_calc_input_x_ptr
,
qkv_weight_ptr
,
d_transpos_qkv_ptr
);
std
::
tie
(
info_d_x
,
info_d_qkv_w
,
a_1
,
b_1
,
a_2
,
b_2
)
=
fc_info
;
phi
::
MatMulXPUFunction
<
XPUTypeT
>
(
xpu_ctx
,
a_1
,
b_1
,
c_1
,
info_d_x
,
1.0
f
,
true
);
phi
::
MatMulXPUFunction
<
XPUTypeT
>
(
xpu_ctx
,
a_2
,
b_2
,
c_2
,
info_d_qkv_w
,
1.0
f
,
true
);
// d_qkv_bias
r
=
xpu
::
reduce_sum
(
xpu_ctx
,
d_transpos_qkv_ptr
,
d_qkv_bias_ptr
,
{
batch_size
*
seq_len
,
3
*
embed_dims
},
{
0
});
PADDLE_ENFORCE_XDNN_SUCCESS
(
r
,
"reduce_sum"
);
if
(
pre_layer_norm
)
{
r
=
xpu
::
layer_norm_grad
(
xpu_ctx
,
input_x_ptr
,
c_1
,
d_x_ptr
,
batch_size
*
seq_len
,
embed_dims
,
epsilon
,
ln_scale_ptr
,
ln_mean_ptr
,
ln_var_ptr
,
d_ln_scale_ptr
,
d_ln_bias_ptr
);
PADDLE_ENFORCE_XDNN_SUCCESS
(
r
,
"layer_norm_grad"
);
}
// add rediaus dy
r
=
xpu
::
add
(
xpu_ctx
,
dy_input_ptr
,
d_x_ptr
,
d_x_ptr
,
batch_size
*
seq_len
*
embed_dims
);
PADDLE_ENFORCE_XDNN_SUCCESS
(
r
,
"add"
);
}
};
}
// namespace operators
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
namespace
plat
=
paddle
::
platform
;
REGISTER_OP_XPU_KERNEL
(
fused_attention
,
ops
::
FusedAttentionOpKernel
<
phi
::
XPUContext
,
float
>
,
ops
::
FusedAttentionOpKernel
<
phi
::
XPUContext
,
paddle
::
platform
::
float16
>
);
REGISTER_OP_XPU_KERNEL
(
fused_attention_grad
,
ops
::
FusedAttentionGradXPUKernel
<
phi
::
XPUContext
,
float
>
,
ops
::
FusedAttentionGradXPUKernel
<
phi
::
XPUContext
,
paddle
::
platform
::
float16
>
);
#endif
paddle/fluid/operators/fused/fused_feedforward_op_xpu.cc
0 → 100644
浏览文件 @
071708fa
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/operators/matmul_v2_op.h"
#include "paddle/fluid/operators/xpu_api_wrapper.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/fluid/operators/fused/xpu_fused_common_function.h"
namespace
paddle
{
namespace
operators
{
using
Tensor
=
phi
::
DenseTensor
;
// XPU forward kernel for the fused feed-forward (FFN) transformer block:
//   [pre-LN] -> linear1 -> act(gelu|relu) -> dropout1 -> linear2
//            -> bias -> dropout2 -> residual add -> [post-LN]
// `pre_layer_norm` selects whether layer_norm runs before linear1 (pre-LN)
// or after the residual add (post-LN).
template <typename DeviceContext, typename T>
class FusedFeedForwardXPUKernel : public framework::OpKernel<T> {
  using XPUTypeT = typename XPUTypeTrait<T>::Type;

 public:
  // Runs the fused FFN on a (bsz_seq x d_model) input. All Tensor* outputs
  // must already have their device buffers allocated by the caller
  // (Compute() below). Intermediate activations (linear1_out, dropout*_out,
  // masks, ln stats) are saved for the backward pass.
  // NOTE(review): `ring_id` is accepted but never used in this function —
  // presumably reserved for tensor-parallel all-reduce; confirm.
  void FFN(const phi::XPUContext& dev_ctx,
           const Tensor* x,
           const Tensor* linear1_weight,
           const Tensor* linear1_bias,
           const Tensor* linear2_weight,
           const Tensor* linear2_bias,
           const Tensor* ln_scale,
           const Tensor* ln_bias,
           Tensor* out,
           Tensor* dropout1_mask,
           Tensor* dropout2_mask,
           Tensor* ln_mean,
           Tensor* ln_variance,
           Tensor* linear1_out,
           Tensor* ln1_out,
           Tensor* dropout1_out,
           Tensor* dropout2_out,
           const int bsz_seq,
           const int d_model,
           const int dim_feedforward,
           const std::string& act_method,
           const bool pre_layer_norm,
           const float epsilon1,
           const float epsilon2,
           const XPUDropoutParam& dropout_param1,
           const XPUDropoutParam& dropout_param2,
           int ring_id) const {
    xpu::Context* xpu_ctx = dev_ctx.x_context();
    xpu::ctx_guard RAII_GUARD(xpu_ctx);
    int r = xpu::SUCCESS;

    const XPUTypeT* x_ptr = reinterpret_cast<const XPUTypeT*>(x->data<T>());
    // The raw input is kept as the residual branch even when pre-LN later
    // redirects x_ptr to the normalized activations.
    const XPUTypeT* residual_ptr = x_ptr;
    const XPUTypeT* linear1_weight_ptr =
        reinterpret_cast<const XPUTypeT*>(linear1_weight->data<T>());
    const XPUTypeT* linear1_bias_ptr =
        reinterpret_cast<const XPUTypeT*>(linear1_bias->data<T>());
    const XPUTypeT* linear2_weight_ptr =
        reinterpret_cast<const XPUTypeT*>(linear2_weight->data<T>());
    const XPUTypeT* linear2_bias_ptr =
        reinterpret_cast<const XPUTypeT*>(linear2_bias->data<T>());
    const float* ln_scale_ptr = ln_scale->data<float>();
    const float* ln_bias_ptr = ln_bias->data<float>();

    // out
    XPUTypeT* out_ptr = reinterpret_cast<XPUTypeT*>(out->data<T>());
    XPUTypeT* linear1_out_ptr =
        reinterpret_cast<XPUTypeT*>(linear1_out->data<T>());
    XPUTypeT* dropout1_mask_ptr =
        reinterpret_cast<XPUTypeT*>(dropout1_mask->data<T>());
    XPUTypeT* dropout2_mask_ptr =
        reinterpret_cast<XPUTypeT*>(dropout2_mask->data<T>());
    float* ln_mean_ptr = ln_mean->data<float>();
    float* ln_variance_ptr = ln_variance->data<float>();
    XPUTypeT* dropout1_out_ptr =
        reinterpret_cast<XPUTypeT*>(dropout1_out->data<T>());
    XPUTypeT* dropout2_out_ptr =
        reinterpret_cast<XPUTypeT*>(dropout2_out->data<T>());

    // Scratch-buffer selection: prefer on-chip L3 memory for the largest
    // intermediate (dim_feedforward * bsz_seq); fall back to global memory
    // (RAII_GUARD.alloc) when L3 is too small.
    size_t l3_total_size = xpu_ctx->_l3_mgr.get_size();
    XPUTypeT* linear2_before_tmp_ptr = NULL;  // dim_feedforward * bsz_seq
    // NOTE(review): linear2_after_tmp_ptr is assigned in every branch but
    // never read afterwards — dead allocation in the middle branch; confirm
    // whether it was intended for the linear2 output staging.
    XPUTypeT* linear2_after_tmp_ptr = NULL;  // d_model * bsz_seq
    if (l3_total_size >= dim_feedforward * bsz_seq * sizeof(T)) {
      XPUTypeT* l3_ptr =
          RAII_GUARD.alloc_l3<XPUTypeT>(dim_feedforward * bsz_seq);
      PADDLE_ENFORCE_XDNN_NOT_NULL(l3_ptr);
      linear2_before_tmp_ptr = linear2_after_tmp_ptr = l3_ptr;
    } else if ((l3_total_size < dim_feedforward * bsz_seq * sizeof(T)) &&
               (l3_total_size >= d_model * bsz_seq * sizeof(T))) {
      XPUTypeT* l3_ptr = RAII_GUARD.alloc_l3<XPUTypeT>(d_model * bsz_seq);
      PADDLE_ENFORCE_XDNN_NOT_NULL(l3_ptr);
      linear2_after_tmp_ptr = l3_ptr;
      linear2_before_tmp_ptr =
          RAII_GUARD.alloc<XPUTypeT>(dim_feedforward * bsz_seq);
      PADDLE_ENFORCE_XDNN_NOT_NULL(linear2_before_tmp_ptr);
    } else {
      XPUTypeT* gm_ptr = RAII_GUARD.alloc<XPUTypeT>(dim_feedforward * bsz_seq);
      PADDLE_ENFORCE_XDNN_NOT_NULL(gm_ptr);
      linear2_before_tmp_ptr = linear2_after_tmp_ptr = gm_ptr;
    }

    // Pre-LN: normalize the input first and feed the normalized activations
    // into linear1; the un-normalized input stays in residual_ptr.
    if (pre_layer_norm) {
      XPUTypeT* ln1_out_ptr = reinterpret_cast<XPUTypeT*>(ln1_out->data<T>());
      r = xpu::layer_norm(xpu_ctx,
                          x_ptr,
                          ln1_out_ptr,
                          bsz_seq,
                          d_model,
                          epsilon1,
                          ln_scale_ptr,
                          ln_bias_ptr,
                          ln_mean_ptr,
                          ln_variance_ptr);
      PADDLE_ENFORCE_XDNN_SUCCESS(r, "layer_norm ");
      x_ptr = ln1_out_ptr;
    }

    // linear1: (bsz_seq x d_model) x (d_model x dim_feedforward)
    phi::XpuFcInfo linear1_fc_info;
    linear1_fc_info.InitFcInfo(0,
                               bsz_seq,
                               dim_feedforward,
                               d_model,
                               false,
                               false,
                               nullptr,
                               nullptr,
                               nullptr);
    phi::MatMulXPUFunction<XPUTypeT>(xpu_ctx,
                                     x_ptr,
                                     linear1_weight_ptr,
                                     linear2_before_tmp_ptr,
                                     linear1_fc_info,
                                     1.0f);
    // bias1: result (pre-activation) is saved to linear1_out for backward.
    r = xpu::broadcast_add(xpu_ctx,
                           linear2_before_tmp_ptr,
                           linear1_bias_ptr,
                           linear1_out_ptr,
                           {bsz_seq, dim_feedforward},
                           {dim_feedforward});
    PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_add");

    // Activation: writes back into the scratch buffer.
    if (act_method == "gelu") {
      r = xpu::gelu(xpu_ctx,
                    linear1_out_ptr,
                    linear2_before_tmp_ptr,
                    linear1_out->numel());
      PADDLE_ENFORCE_XDNN_SUCCESS(r, "gelu");
    } else if (act_method == "relu") {
      r = xpu::relu(xpu_ctx,
                    linear1_out_ptr,
                    linear2_before_tmp_ptr,
                    linear1_out->numel());
      PADDLE_ENFORCE_XDNN_SUCCESS(r, "relu");
    } else {
      PADDLE_THROW(platform::errors::Unimplemented(
          "Currently only supports gelu or relu activation functions!"));
    }

    // dropout1
    Dropout<XPUTypeT>(xpu_ctx,
                      linear2_before_tmp_ptr,
                      dropout1_mask_ptr,
                      dropout1_out_ptr,
                      dropout_param1,
                      dropout1_out->numel());

    // linear2: (bsz_seq x dim_feedforward) x (dim_feedforward x d_model)
    phi::XpuFcInfo linear2_fc_info;
    linear2_fc_info.InitFcInfo(0,
                               bsz_seq,
                               d_model,
                               dim_feedforward,
                               false,
                               false,
                               nullptr,
                               nullptr,
                               nullptr);
    phi::MatMulXPUFunction<XPUTypeT>(xpu_ctx,
                                     dropout1_out_ptr,
                                     linear2_weight_ptr,
                                     dropout2_out_ptr,
                                     linear2_fc_info,
                                     1.0f);
    // bias2 (in place on dropout2_out).
    r = xpu::broadcast_add(xpu_ctx,
                           dropout2_out_ptr,
                           linear2_bias_ptr,
                           dropout2_out_ptr,
                           {bsz_seq, d_model},
                           {d_model});
    PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_add");

    // dropout2 (in place).
    Dropout<XPUTypeT>(xpu_ctx,
                      dropout2_out_ptr,
                      dropout2_mask_ptr,
                      dropout2_out_ptr,
                      dropout_param2,
                      dropout2_out->numel());

    // Residual add. In post-LN mode the sum is written back into
    // dropout2_out (in place) so layer_norm below can produce `out`;
    // in pre-LN mode the sum itself is the final output.
    XPUTypeT* residual_add_out_ptr = out_ptr;
    if (pre_layer_norm == false) {
      residual_add_out_ptr = dropout2_out_ptr;
    }
    r = xpu::broadcast_add(xpu_ctx,
                           residual_ptr,
                           dropout2_out_ptr,
                           residual_add_out_ptr,
                           {bsz_seq, d_model},
                           {bsz_seq, d_model});
    PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_add");

    // Post-LN: normalize after the residual connection.
    if (pre_layer_norm == false) {
      r = xpu::layer_norm(xpu_ctx,
                          residual_add_out_ptr,
                          out_ptr,
                          bsz_seq,
                          d_model,
                          epsilon2,
                          ln_scale_ptr,
                          ln_bias_ptr,
                          ln_mean_ptr,
                          ln_variance_ptr);
      PADDLE_ENFORCE_XDNN_SUCCESS(r, "layer_norm");
    }
  }

  // Op entry point: fetches inputs/outputs/attributes from the execution
  // context, allocates output buffers, derives the problem sizes from the
  // input and weight shapes, and dispatches to FFN().
  void Compute(const framework::ExecutionContext& context) const override {
    auto place = context.GetPlace();
    auto* x = context.Input<Tensor>("X");
    auto* linear1_weight = context.Input<Tensor>("Linear1Weight");
    auto* linear1_bias = context.Input<Tensor>("Linear1Bias");
    auto* linear2_weight = context.Input<Tensor>("Linear2Weight");
    auto* linear2_bias = context.Input<Tensor>("Linear2Bias");
    const bool pre_layer_norm = context.Attr<bool>("pre_layer_norm");

    // Pre-LN and post-LN modes expose the layer-norm tensors under
    // different variable names ("Ln1*" vs "Ln2*").
    const Tensor* ln_scale = nullptr;
    const Tensor* ln_bias = nullptr;
    Tensor* ln_mean = nullptr;
    Tensor* ln_variance = nullptr;
    Tensor* ln1_out = nullptr;
    if (pre_layer_norm) {
      ln_scale = context.Input<Tensor>("Ln1Scale");
      ln_bias = context.Input<Tensor>("Ln1Bias");
      ln_mean = context.Output<Tensor>("Ln1Mean");
      ln_variance = context.Output<Tensor>("Ln1Variance");
      ln1_out = context.Output<Tensor>("Ln1Out");
      ln1_out->mutable_data<T>(place);
    } else {
      ln_scale = context.Input<Tensor>("Ln2Scale");
      ln_bias = context.Input<Tensor>("Ln2Bias");
      ln_mean = context.Output<Tensor>("Ln2Mean");
      ln_variance = context.Output<Tensor>("Ln2Variance");
    }

    auto* out = context.Output<Tensor>("Out");
    auto* dropout1_mask = context.Output<Tensor>("Dropout1Mask");
    auto* dropout2_mask = context.Output<Tensor>("Dropout2Mask");
    auto* linear1_out = context.Output<Tensor>("Linear1Out");
    auto* dropout1_out = context.Output<Tensor>("Dropout1Out");
    auto* dropout2_out = context.Output<Tensor>("Dropout2Out");

    const std::string act_method = context.Attr<std::string>("act_method");
    const int ring_id = context.Attr<int>("ring_id");
    const float epsilon1 = context.Attr<float>("ln1_epsilon");
    const float epsilon2 = context.Attr<float>("ln2_epsilon");

    // Dropout attributes are numbered per instance: dropout1_* / dropout2_*.
    XPUDropoutParam dropout_param1;
    dropout_param1.initXPUDropoutParam(context, 1);
    XPUDropoutParam dropout_param2;
    dropout_param2.initXPUDropoutParam(context, 2);

    ln_mean->mutable_data<float>(place);
    ln_variance->mutable_data<float>(place);
    out->mutable_data<T>(place);
    dropout1_mask->mutable_data<T>(place);
    dropout2_mask->mutable_data<T>(place);
    dropout1_out->mutable_data<T>(place);
    dropout2_out->mutable_data<T>(place);
    linear1_out->mutable_data<T>(place);

    // Flatten leading dims of X into bsz_seq; linear1's weight shape gives
    // d_model (rows) and dim_feedforward (last dim).
    auto x_dim = x->dims();
    auto mat_dim_x = phi::funcs::CreateMatrixDescriptor(
        RowMatrixFromVector(x_dim), 0, false);

    auto dim = linear1_weight->dims();
    int d_model = dim[0];
    int dim_feedforward = dim[dim.size() - 1];
    int bsz_seq = mat_dim_x.batch_size_ * mat_dim_x.height_;

    auto& dev_ctx = context.template device_context<phi::XPUContext>();
    FFN(dev_ctx,
        x,
        linear1_weight,
        linear1_bias,
        linear2_weight,
        linear2_bias,
        ln_scale,
        ln_bias,
        out,
        dropout1_mask,
        dropout2_mask,
        ln_mean,
        ln_variance,
        linear1_out,
        ln1_out,
        dropout1_out,
        dropout2_out,
        bsz_seq,
        d_model,
        dim_feedforward,
        act_method,
        pre_layer_norm,
        epsilon1,
        epsilon2,
        dropout_param1,
        dropout_param2,
        ring_id);
  }
};
// XPU backward kernel for the fused feed-forward block. Walks the forward
// graph in reverse:
//   [post-LN grad] -> dropout2 grad -> linear2 grad (dW2, db2, d_dropout1)
//   -> dropout1 grad -> act grad -> linear1 grad (dW1, db1, dx)
//   -> [pre-LN grad] -> residual-add grad.
template <typename DeviceContext, typename T>
class FusedFeedForwardGradXPUKernel : public framework::OpKernel<T> {
  using XPUTypeT = typename XPUTypeTrait<T>::Type;

 public:
  // Computes all gradients of the fused FFN. All Tensor* gradient outputs
  // must already be allocated by the caller (Compute() below).
  // NOTE(review): `ring_id` is accepted but never used in this function.
  void FFNGrad(const phi::XPUContext& dev_ctx,
               const Tensor* d_out,
               const Tensor* x,
               const Tensor* dropout1_mask,
               const Tensor* dropout2_mask,
               const Tensor* linear1_out,
               const Tensor* ln1_out,
               const Tensor* dropout1_out,
               const Tensor* dropout2_out,
               const Tensor* linear1_weight,
               const Tensor* linear2_weight,
               const Tensor* ln_scale,
               const Tensor* ln_mean,
               const Tensor* ln_variance,
               Tensor* d_x,
               Tensor* d_linear1_weight,
               Tensor* d_linear1_bias,
               Tensor* d_linear2_weight,
               Tensor* d_linear2_bias,
               Tensor* d_ln_scale,
               Tensor* d_ln_bias,
               const int bsz_seq,
               const int d_model,
               const int dim_feedforward,
               const XPUDropoutParam& dropout_param1,
               const XPUDropoutParam& dropout_param2,
               const std::string& act_method,
               const bool pre_layer_norm,
               const float epsilon,
               const int ring_id) const {
    xpu::Context* xpu_ctx = dev_ctx.x_context();
    xpu::ctx_guard RAII_GUARD(xpu_ctx);
    int r = xpu::SUCCESS;

    // inputs ptr
    const XPUTypeT* d_out_ptr =
        reinterpret_cast<const XPUTypeT*>(d_out->data<T>());
    const XPUTypeT* x_ptr = reinterpret_cast<const XPUTypeT*>(x->data<T>());
    const XPUTypeT* dropout1_mask_ptr =
        reinterpret_cast<const XPUTypeT*>(dropout1_mask->data<T>());
    const XPUTypeT* dropout2_mask_ptr =
        reinterpret_cast<const XPUTypeT*>(dropout2_mask->data<T>());
    const XPUTypeT* linear1_out_ptr =
        reinterpret_cast<const XPUTypeT*>(linear1_out->data<T>());
    const XPUTypeT* dropout1_out_ptr =
        reinterpret_cast<const XPUTypeT*>(dropout1_out->data<T>());
    const XPUTypeT* linear1_weight_ptr =
        reinterpret_cast<const XPUTypeT*>(linear1_weight->data<T>());
    const XPUTypeT* linear2_weight_ptr =
        reinterpret_cast<const XPUTypeT*>(linear2_weight->data<T>());
    const float* ln_scale_ptr = ln_scale->data<float>();
    const float* ln_mean_ptr = ln_mean->data<float>();
    const float* ln_variance_ptr = ln_variance->data<float>();

    // outputs ptr
    XPUTypeT* d_x_ptr = reinterpret_cast<XPUTypeT*>(d_x->data<T>());
    XPUTypeT* d_linear1_weight_ptr =
        reinterpret_cast<XPUTypeT*>(d_linear1_weight->data<T>());
    XPUTypeT* d_linear1_bias_ptr =
        reinterpret_cast<XPUTypeT*>(d_linear1_bias->data<T>());
    XPUTypeT* d_linear2_weight_ptr =
        reinterpret_cast<XPUTypeT*>(d_linear2_weight->data<T>());
    XPUTypeT* d_linear2_bias_ptr =
        reinterpret_cast<XPUTypeT*>(d_linear2_bias->data<T>());
    float* d_ln_scale_ptr = d_ln_scale->data<float>();
    float* d_ln_bias_ptr = d_ln_bias->data<float>();

    // Scratch-buffer plan: intermediates are aliased onto at most two
    // buffers — a "big" one (dim_feedforward * bsz_seq) and a "small" one
    // (d_model * bsz_seq) — placed in L3 when it is large enough, otherwise
    // in global memory. The dx* labels match the backward-graph node names.
    size_t l3_total_size = xpu_ctx->_l3_mgr.get_size();
    XPUTypeT* big_tmp_l3_ptr = NULL;    // dim_feedforward * bsz_seq
    XPUTypeT* small_tmp_l3_ptr = NULL;  // d_model * bsz_seq
    XPUTypeT* big_tmp_gm_ptr = NULL;    // dim_feedforward * bsz_seq
    XPUTypeT* small_tmp_gm_ptr = NULL;  // d_model * bsz_seq

    XPUTypeT* d_layernorm_out_ptr = NULL;  // dx9
    XPUTypeT* d_dropout2_out_ptr = NULL;   // dx7
    XPUTypeT* d_linear2_out_ptr = NULL;    // dx5
    XPUTypeT* d_dropout1_out_ptr = NULL;   // dx4
    XPUTypeT* d_act_out_ptr = NULL;        // dx3
    XPUTypeT* d_linear1_out_ptr = NULL;    // dx1

    // Gradient flowing through the residual branch; redirected to the
    // layer-norm gradient in post-LN mode below.
    const XPUTypeT* d_residual_ptr = d_out_ptr;

    if (l3_total_size >= (dim_feedforward * bsz_seq * sizeof(T) +
                          d_model * bsz_seq * sizeof(T))) {
      // Both buffers fit in L3.
      big_tmp_l3_ptr = RAII_GUARD.alloc_l3<XPUTypeT>(dim_feedforward * bsz_seq);
      PADDLE_ENFORCE_XDNN_NOT_NULL(big_tmp_l3_ptr);
      small_tmp_l3_ptr = RAII_GUARD.alloc_l3<XPUTypeT>(d_model * bsz_seq);
      PADDLE_ENFORCE_XDNN_NOT_NULL(small_tmp_l3_ptr);
      d_layernorm_out_ptr = small_tmp_l3_ptr;
      d_dropout2_out_ptr = small_tmp_l3_ptr;
      d_linear2_out_ptr = big_tmp_l3_ptr;
      d_dropout1_out_ptr = big_tmp_l3_ptr;
      d_act_out_ptr = big_tmp_l3_ptr;
      d_linear1_out_ptr = small_tmp_l3_ptr;
    } else if (l3_total_size >= dim_feedforward * bsz_seq * sizeof(T)) {
      // Only the big buffer fits in L3; the small L3 buffer aliases it.
      big_tmp_l3_ptr = RAII_GUARD.alloc_l3<XPUTypeT>(dim_feedforward * bsz_seq);
      PADDLE_ENFORCE_XDNN_NOT_NULL(big_tmp_l3_ptr);
      small_tmp_l3_ptr = big_tmp_l3_ptr;
      big_tmp_gm_ptr = RAII_GUARD.alloc<XPUTypeT>(dim_feedforward * bsz_seq);
      PADDLE_ENFORCE_XDNN_NOT_NULL(big_tmp_gm_ptr);
      small_tmp_gm_ptr = RAII_GUARD.alloc<XPUTypeT>(d_model * bsz_seq);
      PADDLE_ENFORCE_XDNN_NOT_NULL(small_tmp_gm_ptr);

      d_layernorm_out_ptr = small_tmp_l3_ptr;
      d_dropout2_out_ptr = small_tmp_gm_ptr;
      d_linear2_out_ptr = big_tmp_l3_ptr;
      d_dropout1_out_ptr = big_tmp_l3_ptr;
      d_act_out_ptr = big_tmp_gm_ptr;
      d_linear1_out_ptr = small_tmp_l3_ptr;
    } else if (l3_total_size >= d_model * bsz_seq * sizeof(T)) {
      // Only the small buffer fits in L3.
      big_tmp_gm_ptr = RAII_GUARD.alloc<XPUTypeT>(dim_feedforward * bsz_seq);
      PADDLE_ENFORCE_XDNN_NOT_NULL(big_tmp_gm_ptr);
      small_tmp_l3_ptr = RAII_GUARD.alloc_l3<XPUTypeT>(d_model * bsz_seq);
      PADDLE_ENFORCE_XDNN_NOT_NULL(small_tmp_l3_ptr);
      d_layernorm_out_ptr = small_tmp_l3_ptr;
      d_dropout2_out_ptr = small_tmp_l3_ptr;
      d_linear2_out_ptr = big_tmp_gm_ptr;
      d_dropout1_out_ptr = big_tmp_gm_ptr;
      d_act_out_ptr = big_tmp_gm_ptr;
      d_linear1_out_ptr = small_tmp_l3_ptr;
    } else {
      // Everything in global memory.
      big_tmp_gm_ptr = RAII_GUARD.alloc<XPUTypeT>(dim_feedforward * bsz_seq);
      PADDLE_ENFORCE_XDNN_NOT_NULL(big_tmp_gm_ptr);
      small_tmp_gm_ptr = RAII_GUARD.alloc<XPUTypeT>(d_model * bsz_seq);
      PADDLE_ENFORCE_XDNN_NOT_NULL(small_tmp_gm_ptr);
      d_layernorm_out_ptr = small_tmp_gm_ptr;
      d_dropout2_out_ptr = small_tmp_gm_ptr;
      d_linear2_out_ptr = big_tmp_gm_ptr;
      d_dropout1_out_ptr = big_tmp_gm_ptr;
      d_act_out_ptr = big_tmp_gm_ptr;
      d_linear1_out_ptr = small_tmp_gm_ptr;
    }

    // Post-LN: d_out first passes through the final layer_norm's gradient;
    // the result is also the gradient carried by the residual branch.
    if (pre_layer_norm == false) {
      const XPUTypeT* dropout2_out_ptr =
          reinterpret_cast<const XPUTypeT*>(dropout2_out->data<T>());
      r = xpu::layer_norm_grad(xpu_ctx,
                               dropout2_out_ptr,
                               d_out_ptr,
                               d_layernorm_out_ptr,
                               bsz_seq,
                               d_model,
                               epsilon,
                               ln_scale_ptr,
                               ln_mean_ptr,
                               ln_variance_ptr,
                               d_ln_scale_ptr,
                               d_ln_bias_ptr);
      PADDLE_ENFORCE_XDNN_SUCCESS(r, "layer_norm_grad");
      d_residual_ptr = d_layernorm_out_ptr;
    }
    DropoutGrad(xpu_ctx,
                d_residual_ptr,
                dropout2_mask_ptr,
                d_dropout2_out_ptr,
                dropout_param2,
                bsz_seq * d_model);
    // linear2 grad: bias grad is the column sum of the incoming gradient.
    r = xpu::reduce_sum(xpu_ctx,
                        d_dropout2_out_ptr,
                        d_linear2_bias_ptr,
                        {bsz_seq, d_model},
                        {0});
    PADDLE_ENFORCE_XDNN_SUCCESS(r, "reduce_sum");

    phi::XpuFcInfo linear2_fc_info;
    linear2_fc_info.InitFcInfo(0,
                               bsz_seq,
                               d_model,
                               dim_feedforward,
                               false,
                               false,
                               nullptr,
                               nullptr,
                               nullptr);
    const XPUTypeT* a_1 = reinterpret_cast<const XPUTypeT*>(NULL);
    const XPUTypeT* b_1 = reinterpret_cast<const XPUTypeT*>(NULL);
    const XPUTypeT* a_2 = reinterpret_cast<const XPUTypeT*>(NULL);
    const XPUTypeT* b_2 = reinterpret_cast<const XPUTypeT*>(NULL);
    XPUTypeT* c_1 = d_linear2_out_ptr;
    XPUTypeT* c_2 = d_linear2_weight_ptr;
    phi::XpuFcInfo info_d_dropout1;
    phi::XpuFcInfo info_dw2;
    // MatmulGradFcInfo derives the two backward matmuls (d_input, d_weight)
    // — operand pointers and layouts — from the forward matmul description.
    std::tuple<phi::XpuFcInfo,
               phi::XpuFcInfo,
               const XPUTypeT*,
               const XPUTypeT*,
               const XPUTypeT*,
               const XPUTypeT*>
        fc_info = phi::MatmulGradFcInfo(xpu_ctx,
                                        &RAII_GUARD,
                                        linear2_fc_info,
                                        false,
                                        false,
                                        dropout1_out_ptr,
                                        linear2_weight_ptr,
                                        d_dropout2_out_ptr);
    std::tie(info_d_dropout1, info_dw2, a_1, b_1, a_2, b_2) = fc_info;
    // If L3 can hold the big buffer, materialize the transpose of
    // dropout1_out once instead of letting the matmul run with trans_x set.
    if (l3_total_size >= dim_feedforward * bsz_seq * sizeof(T) &&
        info_dw2.trans_x) {
      r = xpu::transpose<XPUTypeT>(xpu_ctx,
                                   dropout1_out_ptr,
                                   big_tmp_l3_ptr,
                                   {bsz_seq, dim_feedforward},
                                   {1, 0});
      PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose");
      a_2 = big_tmp_l3_ptr;
      info_dw2.trans_x = !info_dw2.trans_x;
      info_dw2.stride_x = info_dw2.k;
    }
    phi::MatMulXPUFunction<XPUTypeT>(
        xpu_ctx, a_1, b_1, c_1, info_d_dropout1, 1.0f, true);
    phi::MatMulXPUFunction<XPUTypeT>(
        xpu_ctx, a_2, b_2, c_2, info_dw2, 1.0f, true);

    // dropout1 grad
    DropoutGrad(xpu_ctx,
                d_linear2_out_ptr,
                dropout1_mask_ptr,
                d_dropout1_out_ptr,
                dropout_param1,
                bsz_seq * dim_feedforward);

    // activation grad (forward saved the pre-activation in linear1_out).
    if (act_method == "gelu") {
      r = xpu::gelu_grad(xpu_ctx,
                         linear1_out_ptr,
                         linear1_out_ptr,
                         d_dropout1_out_ptr,
                         d_act_out_ptr,
                         bsz_seq * dim_feedforward);
      PADDLE_ENFORCE_XDNN_SUCCESS(r, "gelu_grad");
    } else if (act_method == "relu") {
      r = xpu::relu_grad(xpu_ctx,
                         linear1_out_ptr,
                         linear1_out_ptr,
                         d_dropout1_out_ptr,
                         d_act_out_ptr,
                         bsz_seq * dim_feedforward);
      PADDLE_ENFORCE_XDNN_SUCCESS(r, "relu_grad");
    } else {
      PADDLE_THROW(platform::errors::Unimplemented(
          "Currently only supports gelu or relu activation functions!"));
    }

    // linear1 grad: bias grad first.
    r = xpu::reduce_sum(xpu_ctx,
                        d_act_out_ptr,
                        d_linear1_bias_ptr,
                        {bsz_seq, dim_feedforward},
                        {0});
    PADDLE_ENFORCE_XDNN_SUCCESS(r, "reduce_sum");

    phi::XpuFcInfo linear1_fc_info;
    linear1_fc_info.InitFcInfo(0,
                               bsz_seq,
                               dim_feedforward,
                               d_model,
                               false,
                               false,
                               nullptr,
                               nullptr,
                               nullptr);
    a_1 = reinterpret_cast<const XPUTypeT*>(NULL);
    b_1 = reinterpret_cast<const XPUTypeT*>(NULL);
    a_2 = reinterpret_cast<const XPUTypeT*>(NULL);
    b_2 = reinterpret_cast<const XPUTypeT*>(NULL);
    // In pre-LN mode dx first lands in scratch (it is the gradient w.r.t.
    // ln1_out and still needs layer_norm_grad); in post-LN mode it is
    // written straight into d_x.
    c_1 = (pre_layer_norm == true ? d_linear1_out_ptr : d_x_ptr);
    c_2 = d_linear1_weight_ptr;
    phi::XpuFcInfo info_dx;
    phi::XpuFcInfo info_dw1;
    const XPUTypeT* linear1_x_ptr =
        (pre_layer_norm == true
             ? reinterpret_cast<const XPUTypeT*>(ln1_out->data<T>())
             : x_ptr);
    // NOTE(review): this condition reads info_dw1.trans_x BEFORE info_dw1 is
    // assigned by std::tie below (contrast with the linear2 path above, where
    // the same optimization runs after MatmulGradFcInfo). As written the
    // branch tests a default-constructed flag, and even when taken its
    // changes to a_2/info_dw1 are overwritten by std::tie — the pre-transpose
    // optimization is effectively dead code. Confirm intent before
    // reordering: small_tmp_l3_ptr may alias c_1 in some L3 configurations.
    if (l3_total_size >= d_model * bsz_seq * sizeof(T) && info_dw1.trans_x) {
      r = xpu::transpose<XPUTypeT>(xpu_ctx,
                                   linear1_x_ptr,
                                   small_tmp_l3_ptr,
                                   {bsz_seq, d_model},
                                   {1, 0});
      PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose");
      a_2 = small_tmp_l3_ptr;
      info_dw1.trans_x = !info_dw1.trans_x;
      info_dw1.stride_x = info_dw1.k;
    }

    fc_info = phi::MatmulGradFcInfo(xpu_ctx,
                                    &RAII_GUARD,
                                    linear1_fc_info,
                                    false,
                                    false,
                                    linear1_x_ptr,
                                    linear1_weight_ptr,
                                    d_act_out_ptr);
    std::tie(info_dx, info_dw1, a_1, b_1, a_2, b_2) = fc_info;
    phi::MatMulXPUFunction<XPUTypeT>(
        xpu_ctx, a_1, b_1, c_1, info_dx, 1.0f, true);
    phi::MatMulXPUFunction<XPUTypeT>(
        xpu_ctx, a_2, b_2, c_2, info_dw1, 1.0f, true);

    // Pre-LN: propagate the gradient through the input layer_norm, in place
    // on c_1.
    if (pre_layer_norm) {
      r = xpu::layer_norm_grad(xpu_ctx,
                               x_ptr,
                               c_1,
                               c_1,
                               bsz_seq,
                               d_model,
                               epsilon,
                               ln_scale_ptr,
                               ln_mean_ptr,
                               ln_variance_ptr,
                               d_ln_scale_ptr,
                               d_ln_bias_ptr);
      PADDLE_ENFORCE_XDNN_SUCCESS(r, "layer_norm_grad");
    }
    // Add back the residual-branch gradient to produce the final d_x.
    r = xpu::add(xpu_ctx, c_1, d_residual_ptr, d_x_ptr, d_model * bsz_seq);
    PADDLE_ENFORCE_XDNN_SUCCESS(r, "add");
  }

  // Op entry point: fetches the saved forward intermediates and attributes,
  // allocates all gradient outputs, derives problem sizes, and dispatches to
  // FFNGrad().
  void Compute(const framework::ExecutionContext& context) const override {
    auto place = context.GetPlace();
    const bool pre_layer_norm = context.Attr<bool>("pre_layer_norm");
    // inputs
    auto* d_out = context.Input<Tensor>(framework::GradVarName("Out"));
    auto* x = context.Input<Tensor>("X");
    auto* dropout1_mask = context.Input<Tensor>("Dropout1Mask");
    auto* dropout2_mask = context.Input<Tensor>("Dropout2Mask");
    auto* linear1_out = context.Input<Tensor>("Linear1Out");
    // Ln1Out only exists in pre-LN mode.
    auto* ln1_out = pre_layer_norm ? context.Input<Tensor>("Ln1Out") : nullptr;
    auto* dropout1_out = context.Input<Tensor>("Dropout1Out");
    auto* dropout2_out = context.Input<Tensor>("Dropout2Out");
    auto* linear1_weight = context.Input<Tensor>("Linear1Weight");
    auto* linear2_weight = context.Input<Tensor>("Linear2Weight");
    const Tensor* ln_mean = nullptr;
    const Tensor* ln_variance = nullptr;
    const Tensor* ln_scale = nullptr;
    if (pre_layer_norm) {
      ln_mean = context.Input<Tensor>("Ln1Mean");
      ln_variance = context.Input<Tensor>("Ln1Variance");
      ln_scale = context.Input<Tensor>("Ln1Scale");
    } else {
      ln_mean = context.Input<Tensor>("Ln2Mean");
      ln_variance = context.Input<Tensor>("Ln2Variance");
      ln_scale = context.Input<Tensor>("Ln2Scale");
    }
    // output
    auto* d_x = context.Output<Tensor>(framework::GradVarName("X"));
    Tensor* d_ln_scale = nullptr;
    Tensor* d_ln_bias = nullptr;
    if (pre_layer_norm) {
      d_ln_scale = context.Output<Tensor>(framework::GradVarName("Ln1Scale"));
      d_ln_bias = context.Output<Tensor>(framework::GradVarName("Ln1Bias"));
    } else {
      d_ln_scale = context.Output<Tensor>(framework::GradVarName("Ln2Scale"));
      d_ln_bias = context.Output<Tensor>(framework::GradVarName("Ln2Bias"));
    }
    auto* d_linear1_weight =
        context.Output<Tensor>(framework::GradVarName("Linear1Weight"));
    auto* d_linear1_bias =
        context.Output<Tensor>(framework::GradVarName("Linear1Bias"));
    auto* d_linear2_weight =
        context.Output<Tensor>(framework::GradVarName("Linear2Weight"));
    auto* d_linear2_bias =
        context.Output<Tensor>(framework::GradVarName("Linear2Bias"));

    // Only the epsilon of the layer_norm that actually ran forward matters.
    float epsilon = 0.0f;
    if (pre_layer_norm) {
      epsilon = context.Attr<float>("ln1_epsilon");
    } else {
      epsilon = context.Attr<float>("ln2_epsilon");
    }

    const std::string act_method = context.Attr<std::string>("act_method");
    XPUDropoutParam dropout_param1(context, 1);
    XPUDropoutParam dropout_param2(context, 2);
    const int ring_id = context.Attr<int>("ring_id");

    d_x->mutable_data<T>(place);
    d_ln_scale->mutable_data<float>(place);
    d_ln_bias->mutable_data<float>(place);
    d_linear1_bias->mutable_data<T>(place);
    d_linear2_bias->mutable_data<T>(place);
    d_linear1_weight->mutable_data<T>(place);
    d_linear2_weight->mutable_data<T>(place);

    // Flatten leading dims of X into bsz_seq; linear1's weight shape gives
    // d_model (rows) and dim_feedforward (last dim).
    auto x_dim = x->dims();
    auto mat_dim_x = phi::funcs::CreateMatrixDescriptor(
        RowMatrixFromVector(x_dim), 0, false);

    auto linear1_weight_dim = linear1_weight->dims();
    int d_model = linear1_weight_dim[0];
    int dim_feedforward = linear1_weight_dim[linear1_weight_dim.size() - 1];
    int bsz_seq = mat_dim_x.batch_size_ * mat_dim_x.height_;

    auto& dev_ctx = context.template device_context<phi::XPUContext>();
    FFNGrad(dev_ctx,
            d_out,
            x,
            dropout1_mask,
            dropout2_mask,
            linear1_out,
            ln1_out,
            dropout1_out,
            dropout2_out,
            linear1_weight,
            linear2_weight,
            ln_scale,
            ln_mean,
            ln_variance,
            d_x,
            d_linear1_weight,
            d_linear1_bias,
            d_linear2_weight,
            d_linear2_bias,
            d_ln_scale,
            d_ln_bias,
            bsz_seq,
            d_model,
            dim_feedforward,
            dropout_param1,
            dropout_param2,
            act_method,
            pre_layer_norm,
            epsilon,
            ring_id);
  }
};
}
// namespace operators
}
// namespace paddle
namespace ops = paddle::operators;

// Register the XPU forward and backward kernels for fused_feedforward in
// FP32 and FP16. The matching entries live in
// paddle/fluid/platform/device/xpu/xpu2_op_list.h.
REGISTER_OP_XPU_KERNEL(
    fused_feedforward,
    ops::FusedFeedForwardXPUKernel<phi::XPUContext, float>,
    ops::FusedFeedForwardXPUKernel<phi::XPUContext,
                                   paddle::platform::float16>);
REGISTER_OP_XPU_KERNEL(
    fused_feedforward_grad,
    ops::FusedFeedForwardGradXPUKernel<phi::XPUContext, float>,
    ops::FusedFeedForwardGradXPUKernel<phi::XPUContext,
                                       paddle::platform::float16>);
#endif
paddle/fluid/operators/fused/xpu_fused_common_function.h
0 → 100644
浏览文件 @
071708fa
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/device/device_wrapper.h"
namespace
paddle
{
namespace
operators
{
using
Tensor
=
phi
::
DenseTensor
;
struct
XPUDropoutParam
{
float
dropout_prob
;
bool
is_upscale_in_train
;
bool
is_test
;
bool
fix_seed
;
const
Tensor
*
tensor_seed
;
int
seed_val
;
XPUDropoutParam
()
{
fix_seed
=
false
;
is_test
=
false
;
is_upscale_in_train
=
false
;
dropout_prob
=
0.5
;
tensor_seed
=
nullptr
;
seed_val
=
0
;
}
XPUDropoutParam
(
const
framework
::
ExecutionContext
&
context
,
const
int
dropout_index
)
{
std
::
string
pre_fix
=
"dropout"
;
std
::
string
str_index
=
std
::
to_string
(
dropout_index
);
if
(
dropout_index
>
0
)
{
pre_fix
=
pre_fix
+
str_index
+
"_"
;
}
else
{
pre_fix
=
pre_fix
+
"_"
;
}
dropout_prob
=
context
.
Attr
<
float
>
(
pre_fix
+
"rate"
);
auto
&
dropout_implementation
=
context
.
Attr
<
std
::
string
>
(
pre_fix
+
"implementation"
);
is_upscale_in_train
=
(
dropout_implementation
==
"upscale_in_train"
);
is_test
=
context
.
Attr
<
bool
>
(
"is_test"
);
fix_seed
=
context
.
Attr
<
bool
>
(
pre_fix
+
"fix_seed"
);
std
::
string
str_seed
=
"Dropout"
;
if
(
dropout_index
>
0
)
{
str_seed
=
str_seed
+
str_index
+
"Seed"
;
}
else
{
str_seed
=
str_seed
+
"Seed"
;
}
tensor_seed
=
context
.
HasInput
(
str_seed
)
?
context
.
Input
<
Tensor
>
(
str_seed
)
:
nullptr
;
if
(
tensor_seed
)
{
seed_val
=
*
(
tensor_seed
->
data
<
int
>
());
}
else
{
seed_val
=
fix_seed
?
context
.
Attr
<
int
>
(
pre_fix
+
"seed"
)
:
0
;
}
}
void
initXPUDropoutParam
(
float
dropout_prob_
,
bool
is_upscale_in_train_
,
bool
is_test_
,
bool
fix_seed_
,
const
Tensor
*
tensor_seed
,
int
seed_val_
)
{
dropout_prob
=
dropout_prob_
;
is_upscale_in_train
=
is_upscale_in_train_
;
is_test
=
is_test_
;
fix_seed
=
fix_seed_
;
if
(
tensor_seed
)
{
seed_val
=
*
(
tensor_seed
->
data
<
int
>
());
}
else
{
seed_val
=
fix_seed
?
seed_val_
:
0
;
}
}
void
initXPUDropoutParam
(
const
framework
::
ExecutionContext
&
context
,
int
dropout_index
)
{
std
::
string
pre_fix
=
"dropout"
;
std
::
string
str_index
=
std
::
to_string
(
dropout_index
);
if
(
dropout_index
>
0
)
{
pre_fix
=
pre_fix
+
str_index
+
"_"
;
}
else
{
pre_fix
=
pre_fix
+
"_"
;
}
dropout_prob
=
context
.
Attr
<
float
>
(
pre_fix
+
"rate"
);
auto
&
dropout_implementation
=
context
.
Attr
<
std
::
string
>
(
pre_fix
+
"implementation"
);
is_upscale_in_train
=
(
dropout_implementation
==
"upscale_in_train"
);
is_test
=
context
.
Attr
<
bool
>
(
"is_test"
);
fix_seed
=
context
.
Attr
<
bool
>
(
pre_fix
+
"fix_seed"
);
std
::
string
str_seed
=
"Dropout"
;
if
(
dropout_index
>
0
)
{
str_seed
=
str_seed
+
str_index
+
"Seed"
;
}
else
{
str_seed
=
str_seed
+
"Seed"
;
}
tensor_seed
=
context
.
HasInput
(
str_seed
)
?
context
.
Input
<
Tensor
>
(
str_seed
)
:
nullptr
;
if
(
tensor_seed
)
{
seed_val
=
*
(
tensor_seed
->
data
<
int
>
());
}
else
{
seed_val
=
fix_seed
?
context
.
Attr
<
int
>
(
pre_fix
+
"seed"
)
:
0
;
}
}
};
/******************
 * check is l3
 *******************/
// Returns true when `addr` lies in the low 4 GiB of the address space,
// i.e. its high 32 bits are zero. The kernels use this to distinguish
// L3-scratch pointers from global-memory pointers — presumably L3 buffers
// are mapped below 4 GiB on this platform (TODO confirm).
// Uses reinterpret_cast<uintptr_t> with an unsigned shift instead of the
// previous C-style cast to a signed int64_t, whose right shift would be
// sign-extended for addresses with the top bit set.
static bool is_in_l3(const void* addr) {
  return (reinterpret_cast<uintptr_t>(addr) >> 32) == 0;
}
/*************************
 * dropout
 *************************/
// Forward dropout helper shared by the fused XPU kernels.
// Writes the dropped-out result to `y` and the element mask to `mask`
// (the mask is only produced on the training path with 0 < prob < 1, or
// zero-filled when prob == 1; it is left untouched when prob == 0 or in
// test mode).
template <typename T>
void Dropout(xpu::Context* xpu_ctx,
             const T* x,
             T* mask,
             T* y,
             const XPUDropoutParam& param,
             int len) {
  using XPUType = typename XPUTypeTrait<T>::Type;
  const XPUType* x_data = reinterpret_cast<const XPUType*>(x);
  XPUType* y_data = reinterpret_cast<XPUType*>(y);
  XPUType* mask_data = reinterpret_cast<XPUType*>(mask);

  // Probability 0: pure pass-through in every mode.
  if (param.dropout_prob == 0.0f) {
    int ret = xpu::copy(xpu_ctx, x_data, y_data, len);
    PADDLE_ENFORCE_XDNN_SUCCESS(ret, "copy");
    return;
  }

  // Inference path: no masking, only the optional down-scaling used by the
  // "downgrade_in_infer" implementation.
  if (param.is_test) {
    float scale_factor =
        (param.is_upscale_in_train)
            ? (1.0)
            : (static_cast<float>(1.0f - param.dropout_prob));
    int ret =
        xpu::scale(xpu_ctx, x_data, y_data, len, false, scale_factor, 0.0f);
    PADDLE_ENFORCE_XDNN_SUCCESS(ret, "scale");
    return;
  }

  // Training with probability 1: every element is dropped, so both the
  // output and the mask become all zeros.
  if (param.dropout_prob == 1.0f) {
    int ret = xpu::constant(xpu_ctx, y_data, len, XPUType(0));
    PADDLE_ENFORCE_XDNN_SUCCESS(ret, "constant");
    ret = xpu::constant(xpu_ctx, mask_data, len, XPUType(0));
    PADDLE_ENFORCE_XDNN_SUCCESS(ret, "constant");
    return;
  }

  // General training path: random masking (and up-scaling when requested).
  int ret = xpu::dropout(xpu_ctx,
                         x_data,
                         y_data,
                         mask_data,
                         param.seed_val,
                         len,
                         param.is_upscale_in_train,
                         param.dropout_prob);
  PADDLE_ENFORCE_XDNN_SUCCESS(ret, "dropout");
}
// Backward dropout helper shared by the fused XPU kernels: maps the upstream
// gradient `dy` through the mask recorded in the forward pass, writing `dx`.
template <typename T>
void DropoutGrad(xpu::Context* xpu_ctx,
                 const T* dy,
                 const T* mask,
                 T* dx,
                 const XPUDropoutParam& param,
                 int len) {
  using XPUType = typename XPUTypeTrait<T>::Type;
  const XPUType* dy_data = reinterpret_cast<const XPUType*>(dy);
  const XPUType* mask_data = reinterpret_cast<const XPUType*>(mask);
  XPUType* dx_data = reinterpret_cast<XPUType*>(dx);

  // Probability 0: forward was a pass-through, so the gradient is too.
  if (param.dropout_prob == 0.0f) {
    int ret = xpu::copy(xpu_ctx, dy_data, dx_data, len);
    PADDLE_ENFORCE_XDNN_SUCCESS(ret, "copy");
    return;
  }

  if (param.is_upscale_in_train) {
    // dropout_grad applies the mask together with the 1/(1-p) rescaling
    // matching the upscale-in-train forward pass.
    int ret = xpu::dropout_grad(
        xpu_ctx, mask_data, dy_data, dx_data, param.dropout_prob, len);
    PADDLE_ENFORCE_XDNN_SUCCESS(ret, "dropout_grad");
  } else {
    // Otherwise the gradient is simply the upstream gradient times the
    // saved mask.
    int ret = xpu::mul(xpu_ctx, dy_data, mask_data, dx_data, len);
    PADDLE_ENFORCE_XDNN_SUCCESS(ret, "mul");
  }
}
}
// namespace operators
}
// namespace paddle
#endif
paddle/fluid/platform/device/xpu/xpu2_op_list.h
浏览文件 @
071708fa
...
...
@@ -704,6 +704,18 @@ XPUOpMap& get_kl2_ops() {
{
"fused_gemm_epilogue_grad"
,
XPUKernelSet
({
pOpKernelType
(
vartype
::
FP32
,
XPUPlace
()),
pOpKernelType
(
vartype
::
FP16
,
XPUPlace
())})},
{
"fused_attention"
,
XPUKernelSet
({
pOpKernelType
(
vartype
::
FP32
,
XPUPlace
()),
pOpKernelType
(
vartype
::
FP16
,
XPUPlace
())})},
{
"fused_attention_grad"
,
XPUKernelSet
({
pOpKernelType
(
vartype
::
FP32
,
XPUPlace
()),
pOpKernelType
(
vartype
::
FP16
,
XPUPlace
())})},
{
"fused_feedforward"
,
XPUKernelSet
({
pOpKernelType
(
vartype
::
FP32
,
XPUPlace
()),
pOpKernelType
(
vartype
::
FP16
,
XPUPlace
())})},
{
"fused_feedforward_grad"
,
XPUKernelSet
({
pOpKernelType
(
vartype
::
FP32
,
XPUPlace
()),
pOpKernelType
(
vartype
::
FP16
,
XPUPlace
())})},
};
return
s_xpu2_kernels
;
...
...
paddle/phi/kernels/xpu/xpu_api_wrapper.h
浏览文件 @
071708fa
...
...
@@ -382,7 +382,8 @@ static void MatMulXPUFunction(xpu::Context* xpu_ctx,
const
T
*
y
,
T
*
out
,
const
XpuFcInfo
&
fcinfo
,
float
alpha
)
{
float
alpha
,
bool
is_grad
=
false
)
{
using
XPUType
=
typename
XPUTypeTrait
<
T
>::
Type
;
int
fccal_type
=
FCCalcType
<
XPUType
>
();
...
...
@@ -398,6 +399,12 @@ static void MatMulXPUFunction(xpu::Context* xpu_ctx,
};
auto
fc_api
=
fc_api_list
[
fccal_type
];
if
(
std
::
getenv
(
"XPU_PADDLE_FC_GRAD_LOCAL"
)
!=
nullptr
)
{
if
(
is_grad
)
{
fc_api
=
fc_api_list
[
2
];
}
}
auto
fc_batch_api
=
fc_batch_api_list
[
fccal_type
];
int
m
=
fcinfo
.
m
;
...
...
python/paddle/fluid/tests/unittests/xpu/test_fused_attention_op_xpu.py
0 → 100644
浏览文件 @
071708fa
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
numpy
as
np
import
sys
sys
.
path
.
append
(
".."
)
import
paddle
import
paddle.nn.functional
as
F
import
paddle.incubate.nn.functional
as
incubate_f
from
paddle.nn.layer.norm
import
LayerNorm
from
paddle.nn.layer.common
import
Linear
,
Dropout
from
paddle.nn.layer.transformer
import
_convert_attention_mask
from
paddle
import
tensor
from
paddle.fluid
import
layers
import
unittest
from
op_test_xpu
import
XPUOpTest
from
paddle.fluid.framework
import
default_main_program
from
xpu.get_test_cover_info
import
(
create_test_class
,
get_xpu_op_support_types
,
XPUOpTestWrapper
,
)
default_main_program
().
random_seed
=
42
class XPUTestFusedAttentionOp(XPUOpTestWrapper):
    """Wrapper that registers the fused_attention XPU test classes.

    The XPU test framework reads ``op_name`` to look up supported dtypes
    and, with ``use_dynamic_create_class = False``, collects the nested
    ``TestFusedAttentionOp*`` classes defined below as the test cases.
    """

    def __init__(self):
        # Operator under test; used by create_test_class/get_xpu_op_support_types.
        self.op_name = 'fused_attention'
        # Use the statically declared nested test classes rather than
        # generating them dynamically.
        self.use_dynamic_create_class = False
    class TestFusedAttentionOp(XPUOpTest):
        """Checks fused_multi_head_attention against an unfused reference.

        Both forward output and the gradient w.r.t. the input are compared:
        GetBaselineOut() composes the attention block from separate layers,
        GetFusedAttentionOut() runs the fused op with repacked weights, and
        test_fused_attention_op() asserts they match within (rtol, atol).
        """

        def setUp(self):
            # Build configuration and random inputs first; tolerances depend
            # on the dtype chosen by config().
            self.config()
            self.generate_input_data()
            self.rtol = 1e-5
            # FIXME(limin29): Because there is a problem with the test precision
            #  on A100, atol is temporarily set to 1e-2, and it will be
            #  changed back after the precision problem is solved.
            self.atol = 1e-3
            # fp16 runs with a much looser absolute tolerance.
            if self.x_type == np.float16 or str(self.x_type) == "float16":
                self.atol = 1e-1

            paddle.set_default_dtype(self.x_type)
            self.__class__.op_type = "fused_attention"
            # use autograd to check grad in this unittest.
            self.__class__.no_need_check_grad = True

            # Separate Q/K/V/output projections used by the unfused baseline.
            self.q_proj = Linear(
                self.embed_dim,
                self.embed_dim,
                self.weight_attr,
                bias_attr=self.bias_attr,
            )
            self.k_proj = Linear(
                self.kdim,
                self.embed_dim,
                self.weight_attr,
                bias_attr=self.bias_attr,
            )
            self.v_proj = Linear(
                self.vdim,
                self.embed_dim,
                self.weight_attr,
                bias_attr=self.bias_attr,
            )
            self.out_proj = Linear(
                self.embed_dim,
                self.embed_dim,
                self.weight_attr,
                bias_attr=self.bias_attr,
            )
            # LayerNorm parameters are always created in fp32, even when the
            # projections run in fp16.
            paddle.set_default_dtype(np.float32)
            self.norm1 = LayerNorm(self.embed_dim)
            self.norm2 = LayerNorm(self.embed_dim)
            paddle.set_default_dtype(self.x_type)
            self.dropout = Dropout(self.dropout_prob, mode="upscale_in_train")

        def config(self):
            """Set shapes, dtypes and switches; subclasses override this."""
            self.x_type = self.in_type
            self.attn_mask_type = np.float32
            self.pre_layer_norm = True
            self.has_attn_mask = False
            self.training = True

            self.batch_size = 8
            self.query_length = 128
            self.cache_length = 128
            self.head_dim = 64
            self.num_heads = 16
            self.embed_dim = self.head_dim * self.num_heads

            # Dropout disabled so baseline and fused paths are deterministic.
            self.dropout_prob = 0.0
            self.attn_dropout_prob = 0.0
            self.weight_attr = None
            self.bias_attr = None
            # Self-attention: key/value dims and lengths mirror the query.
            self.kdim, self.vdim = self.embed_dim, self.embed_dim
            self.key_length, self.value_length = (
                self.query_length,
                self.query_length,
            )

        def generate_input_data(self):
            """Create the random query tensor, attention mask and out-grad."""
            self.query = np.random.rand(
                self.batch_size, self.query_length, self.embed_dim
            ).astype(self.x_type)
            out_seq_len = self.key_length
            if self.has_attn_mask:
                # [B, n_head, seq_len, out_seq_len]
                self.attn_mask = np.ones(
                    (
                        self.batch_size,
                        self.num_heads,
                        self.query_length,
                        out_seq_len,
                    ),
                    dtype=self.attn_mask_type,
                )
            else:
                self.attn_mask = None
            self.key, self.value = self.query, self.query

            # Upstream gradient fed to autograd.backward in both paths.
            self.dout = np.random.random(
                (self.batch_size, self.query_length, self.embed_dim)
            ).astype(self.x_type)

        def GetBaselineOut(self):
            """Unfused reference: pre/post-LN multi-head attention + residual.

            Returns (final_out, d(final_out)/d(query)).
            """
            paddle.disable_static()
            tensor_query = paddle.to_tensor(self.query, stop_gradient=False)

            if self.has_attn_mask:
                attn_mask = paddle.to_tensor(
                    self.attn_mask, stop_gradient=False
                )
            else:
                attn_mask = None
            residual = tensor_query

            ln1_out = tensor_query
            if self.pre_layer_norm:
                ln1_out = self.norm1(tensor_query)

            # Project then split heads: [B, S, E] -> [B, n_head, S, head_dim].
            q = self.q_proj(ln1_out)
            q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim])
            q_out = tensor.transpose(x=q, perm=[0, 2, 1, 3])
            k = self.k_proj(ln1_out)
            v = self.v_proj(ln1_out)
            k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim])
            k_out = tensor.transpose(x=k, perm=[0, 2, 1, 3])
            v = tensor.reshape(x=v, shape=[0, 0, self.num_heads, self.head_dim])
            v_out = tensor.transpose(x=v, perm=[0, 2, 1, 3])

            # [B, n_head, seq_len, head_dim] * [B, n_head, out_seq_len, head_dim]
            # --> [B, n_head, seq_len, out_seq_len]
            # Scaled dot-product scores; scaling folded into q.
            qk_out = layers.matmul(
                x=q_out * self.head_dim**-0.5, y=k_out, transpose_y=True
            )

            if attn_mask is not None:
                attn_mask = _convert_attention_mask(attn_mask, qk_out.dtype)
                attn_mask_out = qk_out + attn_mask
                softmax_out = F.softmax(attn_mask_out)
            else:
                softmax_out = F.softmax(qk_out)

            if self.dropout_prob:
                dropout_out = F.dropout(
                    softmax_out,
                    self.dropout_prob,
                    training=self.training,
                    mode="upscale_in_train",
                )
                # [B, n_head, seq_len, out_seq_len] * [B, n_head, out_seq_len, head_dim]
                # --> [B, n_head, seq_len, head_dim]
                qktv_out = tensor.matmul(dropout_out, v_out)
            else:
                qktv_out = tensor.matmul(softmax_out, v_out)

            # Merge heads back: [B, n_head, S, head_dim] -> [B, S, E].
            fmha_out = tensor.transpose(qktv_out, perm=[0, 2, 1, 3])
            out_linear_in = tensor.reshape(
                x=fmha_out,
                shape=[0, 0, fmha_out.shape[2] * fmha_out.shape[3]],
            )
            out = self.out_proj(out_linear_in)

            residual_out = residual + self.dropout(out)
            if not self.pre_layer_norm:
                final_out = self.norm1(residual_out)
            else:
                final_out = residual_out

            paddle.autograd.backward(
                [final_out], [paddle.to_tensor(self.dout)], retain_graph=True
            )
            return final_out, tensor_query.grad

        def GetFusedAttentionOut(self):
            """Fused path: repack the baseline weights for the fused op.

            QKV weights are transposed and concatenated into the
            (3, n_head, head_dim, embed_dim) layout the fused kernel expects;
            QKV biases into (3, n_head, head_dim).
            Returns (final_out, d(final_out)/d(x)).
            """
            paddle.disable_static()
            q_proj_weight = paddle.to_tensor(
                self.q_proj.weight, stop_gradient=False
            )
            k_proj_weight = paddle.to_tensor(
                self.k_proj.weight, stop_gradient=False
            )
            v_proj_weight = paddle.to_tensor(
                self.v_proj.weight, stop_gradient=False
            )
            out_linear_weight = paddle.to_tensor(
                self.out_proj.weight, stop_gradient=False
            )

            if self.bias_attr is False:
                qkv_bias_tensor = None
                out_linear_bias = None
            else:
                q_proj_bias = paddle.to_tensor(
                    self.q_proj.bias, stop_gradient=False
                )
                k_proj_bias = paddle.to_tensor(
                    self.k_proj.bias, stop_gradient=False
                )
                v_proj_bias = paddle.to_tensor(
                    self.v_proj.bias, stop_gradient=False
                )
                # Stack Q/K/V biases into the fused layout (3, n_head, head_dim).
                qkv_bias = np.concatenate(
                    (
                        q_proj_bias.numpy(),
                        k_proj_bias.numpy(),
                        v_proj_bias.numpy(),
                    )
                )
                qkv_bias = qkv_bias.reshape((3, self.num_heads, self.head_dim))
                qkv_bias_tensor = paddle.to_tensor(
                    qkv_bias, stop_gradient=False
                )
                out_linear_bias = paddle.to_tensor(
                    self.out_proj.bias, stop_gradient=False
                )

            ln1_scale = paddle.to_tensor(self.norm1.weight, stop_gradient=False)
            ln1_bias = paddle.to_tensor(self.norm1.bias, stop_gradient=False)
            ln2_scale = paddle.to_tensor(self.norm2.weight, stop_gradient=False)
            ln2_bias = paddle.to_tensor(self.norm2.bias, stop_gradient=False)

            # Linear stores weights as (in, out); the fused op wants (out, in),
            # hence the transpose before concatenation.
            q_proj_weight = q_proj_weight.numpy().transpose((1, 0))
            k_proj_weight = k_proj_weight.numpy().transpose((1, 0))
            v_proj_weight = v_proj_weight.numpy().transpose((1, 0))
            qkv_weight = np.concatenate(
                (q_proj_weight, k_proj_weight, v_proj_weight)
            )
            qkv_weight = qkv_weight.reshape(
                (3, self.num_heads, self.head_dim, self.embed_dim)
            )

            x = paddle.to_tensor(self.query, stop_gradient=False)
            cache_kv = None
            if self.has_attn_mask:
                attn_mask = paddle.to_tensor(
                    self.attn_mask, stop_gradient=False
                )
            else:
                attn_mask = None
            qkv_weight_tensor = paddle.to_tensor(
                qkv_weight, stop_gradient=False
            )
            epsilon = 1e-05
            ln2_epsilon = 1e-05

            if attn_mask is not None:
                attn_mask = _convert_attention_mask(attn_mask, x.dtype)
            final_out = incubate_f.fused_multi_head_attention(
                x,
                qkv_weight_tensor,
                out_linear_weight,
                self.pre_layer_norm,
                ln1_scale,
                ln1_bias,
                ln2_scale,
                ln2_bias,
                epsilon,
                qkv_bias_tensor,
                out_linear_bias,
                cache_kv,
                attn_mask,
                self.dropout_prob,
                self.attn_dropout_prob,
                ln2_epsilon,
            )
            paddle.autograd.backward(
                [final_out], [paddle.to_tensor(self.dout)], retain_graph=True
            )
            return final_out, x.grad

        def test_fused_attention_op(self):
            """Forward output and input gradient must agree across both paths."""
            final_out_ref, x_grad_ref = self.GetBaselineOut()
            final_out, x_grad = self.GetFusedAttentionOut()
            np.testing.assert_allclose(
                final_out_ref, final_out.numpy(), rtol=self.rtol, atol=self.atol
            )
            np.testing.assert_allclose(
                x_grad_ref, x_grad.numpy(), rtol=self.rtol, atol=self.atol
            )
    class TestFusedAttentionOpPreLn(TestFusedAttentionOp):
        """Variant that exercises the pre-layer-norm configuration."""

        def config(self):
            super().config()
            self.pre_layer_norm = True
    class TestFusedAttentionOpNoneAttnMask(TestFusedAttentionOp):
        """Variant with pre-layer-norm and no attention mask supplied."""

        def config(self):
            super().config()
            self.pre_layer_norm = True
            self.has_attn_mask = False
# Register one concrete test class per dtype the XPU kernel supports.
support_types = get_xpu_op_support_types('fused_attention')
for stype in support_types:
    create_test_class(globals(), XPUTestFusedAttentionOp, stype)

if __name__ == "__main__":
    unittest.main()
python/paddle/fluid/tests/unittests/xpu/test_fused_feedforward_op_xpu.py
0 → 100644
浏览文件 @
071708fa
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
numpy
as
np
import
sys
sys
.
path
.
append
(
".."
)
import
paddle
from
paddle.nn.layer
import
transformer
import
paddle.nn.functional
as
F
import
paddle.incubate.nn.functional
as
incubate_f
from
paddle.nn.layer.norm
import
LayerNorm
from
paddle.nn.layer.common
import
Linear
,
Dropout
import
unittest
from
op_test_xpu
import
XPUOpTest
from
paddle.fluid.framework
import
default_main_program
from
xpu.get_test_cover_info
import
(
create_test_class
,
XPUOpTestWrapper
,
)
class XPUTestFusedFFNOp(XPUOpTestWrapper):
    """Wrapper that registers the fused_feedforward XPU test classes.

    ``op_name`` identifies the operator under test; with
    ``use_dynamic_create_class = False`` the nested ``TestFusedFFNOp*``
    classes below are collected as the test cases.
    """

    def __init__(self):
        self.op_name = 'fused_feedforward'
        self.use_dynamic_create_class = False
    class TestFusedFFNOp(XPUOpTest):
        """Checks fused_feedforward against an unfused FFN reference.

        Base() composes linear1 -> activation -> dropout -> linear2 ->
        dropout + residual (with pre- or post-layer-norm), FusedFFN() runs
        the fused op on the same weights, and test_out_and_grad() compares
        forward outputs and input gradients.
        """

        def getDtype(self):
            # Compute dtype comes from the parameterized test type;
            # LayerNorm always runs in fp32.
            self.dtype = self.in_type
            self.layer_norm_dtype = "float32"

        def getShape(self):
            # Randomized shapes within fixed bounds.
            self.batch_size = np.random.randint(1, 32)
            self.query_length = np.random.randint(32, 128)
            self.d_model = np.random.randint(32, 512)
            self.dim_feedforward = np.random.randint(32, 512)

        def getDiff(self):
            self.rtol = 1e-2
            self.atol = 1e-3
            # fp16 comparisons use a looser absolute tolerance.
            if self.dtype == np.float16 or self.dtype == "float16":
                self.atol = 1e-1

        def getActivation(self):
            self.act_method = "gelu"

        def getNormalizeBefore(self):
            # False means post-layer-norm (norm applied after the residual).
            self.pre_layer_norm = False

        def setUp(self):
            paddle.disable_static()
            self.__class__.op_type = "fused_feedforward"
            # check grad in test_out_and_grad()
            self.__class__.no_need_check_grad = True
            self.getDtype()
            self.getShape()
            self.getDiff()
            self.getActivation()
            self.getNormalizeBefore()
            paddle.set_default_dtype(self.dtype)
            self.weight_attr = None
            self.bias_attr = None

            self.weight_attrs = transformer._convert_param_attr_to_list(
                self.weight_attr, 2
            )
            self.bias_attrs = transformer._convert_param_attr_to_list(
                self.bias_attr, 2
            )
            # Two-layer FFN used by the unfused baseline.
            self.linear1 = Linear(
                self.d_model,
                self.dim_feedforward,
                self.weight_attrs[1],
                bias_attr=self.bias_attrs[1],
            )
            self.linear2 = Linear(
                self.dim_feedforward,
                self.d_model,
                self.weight_attrs[1],
                bias_attr=self.bias_attrs[1],
            )
            # LayerNorm parameters are created in fp32 regardless of dtype.
            paddle.set_default_dtype(self.layer_norm_dtype)
            self.norm1 = LayerNorm(self.d_model)
            self.norm2 = LayerNorm(self.d_model)
            paddle.set_default_dtype(self.dtype)
            # Dropout rate 0 keeps both paths deterministic.
            self.dropout1 = Dropout(0.0, mode="upscale_in_train")
            self.dropout2 = Dropout(0.0, mode="upscale_in_train")
            self.activation = getattr(F, self.act_method)

            self.src = np.random.random(
                (self.batch_size, self.query_length, self.d_model)
            ).astype(self.dtype)
            self.dout = np.random.random(
                (self.batch_size, self.query_length, self.d_model)
            ).astype(self.dtype)

        def Base(self):
            """Unfused reference FFN. Returns (output, d(output)/d(src))."""
            paddle.disable_static()
            tensor_src = paddle.to_tensor(self.src, stop_gradient=False)
            residual = tensor_src
            if self.pre_layer_norm:
                # Pre-LN: normalize before the FFN, no norm after residual.
                ln1_out = self.norm1(tensor_src)
                linear2_out = self.linear2(
                    self.dropout1(self.activation(self.linear1(ln1_out)))
                )
                dropout2_out = residual + self.dropout2(linear2_out)
                paddle.autograd.backward(
                    [dropout2_out], [paddle.to_tensor(self.dout)], True
                )
                return dropout2_out, tensor_src.grad
            else:
                # Post-LN: FFN on raw input, normalize after the residual.
                linear2_out = self.linear2(
                    self.dropout1(self.activation(self.linear1(tensor_src)))
                )
                dropout2_out = residual + self.dropout2(linear2_out)
                dropout2_out = self.norm2(dropout2_out)
                paddle.autograd.backward(
                    [dropout2_out], [paddle.to_tensor(self.dout)], True
                )
                return dropout2_out, tensor_src.grad

        def FusedFFN(self):
            """Fused path using the weights created in setUp().

            Returns (output, d(output)/d(x)).
            """
            paddle.disable_static()
            linear1_weight = paddle.to_tensor(
                self.linear1.weight, stop_gradient=False
            )
            linear1_bias = paddle.to_tensor(
                self.linear1.bias, stop_gradient=False
            )
            linear2_weight = paddle.to_tensor(
                self.linear2.weight, stop_gradient=False
            )
            linear2_bias = paddle.to_tensor(
                self.linear2.bias, stop_gradient=False
            )
            ln1_scale = paddle.to_tensor(self.norm1.weight, stop_gradient=False)
            ln1_bias = paddle.to_tensor(self.norm1.bias, stop_gradient=False)
            ln2_scale = paddle.to_tensor(self.norm2.weight, stop_gradient=False)
            ln2_bias = paddle.to_tensor(self.norm2.bias, stop_gradient=False)
            x = paddle.to_tensor(self.src, stop_gradient=False)
            # Dropout rates of 0.0 match the baseline's dropout1/dropout2.
            out = incubate_f.fused_feedforward(
                x,
                linear1_weight,
                linear2_weight,
                linear1_bias,
                linear2_bias,
                ln1_scale,
                ln1_bias,
                ln2_scale,
                ln2_bias,
                0.0,
                0.0,
                activation=self.act_method,
                pre_layer_norm=self.pre_layer_norm,
            )
            paddle.autograd.backward([out], [paddle.to_tensor(self.dout)])
            return out, x.grad

        def test_out_and_grad(self):
            """Forward output and input gradient must agree across both paths."""
            default_main_program().random_seed = 42
            base_out, base_grad = self.Base()
            fused_out, fused_grad = self.FusedFFN()
            np.testing.assert_allclose(
                base_out.numpy(),
                fused_out.numpy(),
                rtol=self.rtol,
                atol=self.atol,
            )
            np.testing.assert_allclose(
                base_grad.numpy(),
                fused_grad.numpy(),
                rtol=self.rtol,
                atol=self.atol,
            )
    class TestFusedFFNOpActivation(TestFusedFFNOp):
        """Variant using ReLU instead of the default GELU activation."""

        def getActivation(self):
            self.act_method = "relu"
    class TestFusedFFNOpNormalizeBefore(TestFusedFFNOp):
        """Variant exercising pre-layer-norm with small fixed shapes."""

        def getNormalizeBefore(self):
            self.pre_layer_norm = True

        def getShape(self):
            # Fixed minimal shapes instead of the randomized base shapes.
            self.batch_size = 1
            self.query_length = 1
            self.d_model = 8
            self.dim_feedforward = 8
class APITestStaticFusedFFN(unittest.TestCase):
    """Static-graph check: fused_feedforward vs. an unfused composition.

    Builds both graphs in the same program, runs each on an XPU executor
    with identical feeds, and asserts the two fetched outputs match.
    """

    def test_static(self):
        paddle.enable_static()
        default_main_program().random_seed = 42
        dtype = "float32"
        layer_norm_dtype = "float32"
        batch_size = 1
        d_model = 8
        dim_feedforward = 8

        # Graph inputs. Each placeholder needs a unique name so the feed
        # dict below can address it.
        x = paddle.static.data(
            name='x', shape=[batch_size, d_model, dim_feedforward], dtype=dtype
        )
        linear1_weight = paddle.static.data(
            name='linear1_weight', shape=[d_model, dim_feedforward], dtype=dtype
        )
        linear1_bias = paddle.static.data(
            name='linear1_bias', shape=[dim_feedforward], dtype=dtype
        )
        linear2_weight = paddle.static.data(
            name='linear2_weight', shape=[dim_feedforward, d_model], dtype=dtype
        )
        linear2_bias = paddle.static.data(name='linear2_bias', shape=[d_model])
        ln1_scale = paddle.static.data(name='ln1_scale', shape=[d_model])
        # BUGFIX: these two placeholders previously reused the names
        # 'ln1_scale'/'ln2_scale', so the bias variables aliased the scale
        # variables and the 'ln1_bias'/'ln2_bias' feed keys matched nothing.
        ln1_bias = paddle.static.data(name='ln1_bias', shape=[d_model])
        ln2_scale = paddle.static.data(name='ln2_scale', shape=[d_model])
        ln2_bias = paddle.static.data(name='ln2_bias', shape=[d_model])

        # Fused graph (post-layer-norm, dropout rates 0).
        fused_out = incubate_f.fused_feedforward(
            x,
            linear1_weight,
            linear2_weight,
            linear1_bias,
            linear2_bias,
            ln1_scale,
            ln1_bias,
            ln2_scale,
            ln2_bias,
            0.0,
            0.0,
            activation="relu",
            pre_layer_norm=False,
        )

        # Unfused reference graph: linear -> relu -> dropout -> linear ->
        # dropout + residual -> layer_norm.
        linear1_out = F.linear(x, linear1_weight, linear1_bias)
        act_out = F.relu(linear1_out)
        dropout1_out = F.dropout(x=act_out, p=0.0, training=False)
        linear2_out = F.linear(dropout1_out, linear2_weight, linear2_bias)
        dropout2_out = x + F.dropout(x=linear2_out, p=0.0, training=False)
        ln_out = F.layer_norm(
            dropout2_out,
            normalized_shape=list([d_model]),
            weight=ln2_scale,
            bias=ln2_bias,
        )

        exe = paddle.static.Executor(paddle.XPUPlace(0))

        # Concrete feed values; biases zero and scales one so the two
        # graphs are numerically comparable.
        x_data = np.random.random(
            (batch_size, d_model, dim_feedforward)
        ).astype(dtype)
        linear1_weight_data = np.random.random(
            (d_model, dim_feedforward)
        ).astype(dtype)
        linear1_bias_data = np.zeros((dim_feedforward)).astype(dtype)
        linear2_weight_data = np.random.random(
            (dim_feedforward, d_model)
        ).astype(dtype)
        linear2_bias_data = np.zeros((d_model)).astype(dtype)
        ln1_scale_data = np.ones((d_model)).astype(layer_norm_dtype)
        ln1_bias_data = np.zeros((d_model)).astype(layer_norm_dtype)
        ln2_scale_data = np.ones((d_model)).astype(layer_norm_dtype)
        ln2_bias_data = np.zeros((d_model)).astype(layer_norm_dtype)

        res_list = [fused_out, ln_out]
        real_res = []

        for res in res_list:
            fetch = exe.run(
                feed={
                    'x': x_data,
                    'linear1_weight': linear1_weight_data,
                    'linear1_bias': linear1_bias_data,
                    'linear2_weight': linear2_weight_data,
                    'linear2_bias': linear2_bias_data,
                    'ln1_scale': ln1_scale_data,
                    'ln1_bias': ln1_bias_data,
                    'ln2_scale': ln2_scale_data,
                    'ln2_bias': ln2_bias_data,
                },
                fetch_list=[res],
            )
            real_res.append(fetch)
        np.testing.assert_allclose(
            real_res[0], real_res[1], rtol=1e-05, atol=0.001
        )
class TestFusedFFNOpError(unittest.TestCase):
    """Verifies fused_feedforward rejects invalid arguments in static mode."""

    def test_errors(self):
        paddle.enable_static()
        with paddle.static.program_guard(
            paddle.static.Program(), paddle.static.Program()
        ):

            def test_dtype():
                # int32 input dtype is not supported.
                x = paddle.static.data(
                    name='x', shape=[1, 10, 10], dtype="int32"
                )
                linear1_weight = paddle.static.data(
                    name='linear1_weight', shape=[1, 10, 10], dtype="float32"
                )
                linear2_weight = paddle.static.data(
                    name='linear2_weight', shape=[1, 10, 10], dtype="float32"
                )
                incubate_f.fused_feedforward(
                    x, linear1_weight, linear2_weight
                )

            self.assertRaises(TypeError, test_dtype)

            def test_dropout_rate_type():
                # dropout1_rate must be numeric, not a string.
                x = paddle.static.data(
                    name='x1', shape=[1, 10, 10], dtype="float32"
                )
                linear1_weight = paddle.static.data(
                    name='linear1_weight1', shape=[10, 10], dtype="float32"
                )
                linear2_weight = paddle.static.data(
                    name='linear2_weight1', shape=[10, 10], dtype="float32"
                )
                incubate_f.fused_feedforward(
                    x, linear1_weight, linear2_weight, dropout1_rate="a"
                )

            self.assertRaises(TypeError, test_dropout_rate_type)

            def test_dropout_rate_value():
                # dropout2_rate must be within [0, 1].
                x = paddle.static.data(
                    name='x2', shape=[1, 10, 10], dtype="float32"
                )
                linear1_weight = paddle.static.data(
                    name='linear1_weight2', shape=[10, 10], dtype="float32"
                )
                linear2_weight = paddle.static.data(
                    name='linear2_weight2', shape=[10, 10], dtype="float32"
                )
                incubate_f.fused_feedforward(
                    x, linear1_weight, linear2_weight, dropout2_rate=-1
                )

            self.assertRaises(ValueError, test_dropout_rate_value)

            def test_dropout_mode():
                # mode must be a recognized dropout mode string.
                x = paddle.static.data(
                    name='x3', shape=[1, 10, 10], dtype="float32"
                )
                linear1_weight = paddle.static.data(
                    name='linear1_weight3', shape=[10, 10], dtype="float32"
                )
                linear2_weight = paddle.static.data(
                    name='linear2_weight3', shape=[10, 10], dtype="float32"
                )
                incubate_f.fused_feedforward(
                    x, linear1_weight, linear2_weight, mode='test'
                )

            self.assertRaises(ValueError, test_dropout_mode)
# Register one concrete test class per supported dtype. Currently pinned to
# float32 instead of querying the framework.
support_types = {"float32"}  # get_xpu_op_support_types('fused_feedforward')
for stype in support_types:
    create_test_class(globals(), XPUTestFusedFFNOp, stype)

if __name__ == "__main__":
    unittest.main()
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录