PaddlePaddle / Paddle
Commit 4bbbed9a (unverified)
Authored Sep 07, 2022 by Wilber · committed via GitHub on Sep 07, 2022

Fix fused cuda op's mutable data [2] (#45562)

Parent: 26d161ef
Showing 16 changed files with 376 additions and 246 deletions (+376 −246)
paddle/fluid/operators/fused/attn_bias_add.cu.h                              +2    -1
paddle/fluid/operators/fused/conv_fusion_op.cu                               +1    -1
paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h                    +12   -7
paddle/fluid/operators/fused/cudnn_norm_conv.cu.h                            +9    -7
paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h                  +12   -8
paddle/fluid/operators/fused/fused_attention_op.cu                           +102  -61
paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cu    +24   -15
paddle/fluid/operators/fused/fused_bn_activation_op.cu                       +50   -30
paddle/fluid/operators/fused/fused_bn_add_activation_op.cu                   +35   -21
paddle/fluid/operators/fused/fused_feedforward_op.cu                         +40   -32
paddle/fluid/operators/fused/fused_gate_attention.h                          +7    -4
paddle/fluid/operators/fused/fused_gate_attention_op.cu                      +15   -10
paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu                       +5    -6
paddle/fluid/operators/fused/fused_multi_transformer_op.cu                   +46   -30
paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu                         +9    -9
paddle/fluid/operators/fused/fusion_conv_inception_op.cu                     +7    -4
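Every hunk below applies the same substitution: output and workspace tensors that were previously allocated with Tensor::mutable_data<T>(place) are now allocated through the device context as dev_ctx.Alloc<T>(tensor, tensor->numel() * sizeof(T)) (spelled dev_ctx.template Alloc<T>(...) where the context type is dependent). A minimal sketch of the pattern, assuming Paddle's phi::GPUContext and framework::Tensor headers are in scope; the helper name AllocOutput is illustrative and not part of this diff:

    // Before: the tensor allocates its own memory, sized implicitly from its dims.
    //   T *out_ptr = out->mutable_data<T>(ctx.GetPlace());
    // After: the device context allocates, with the byte count spelled out.
    template <typename T>
    T *AllocOutput(const phi::GPUContext &dev_ctx, framework::Tensor *out) {
      // numel() * sizeof(T) reproduces the size mutable_data computed internally.
      return dev_ctx.template Alloc<T>(out, out->numel() * sizeof(T));
    }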
paddle/fluid/operators/fused/attn_bias_add.cu.h

@@ -326,7 +326,8 @@ void Launch2DColumnReduce(const phi::GPUContext& dev_ctx,
   } else {
     framework::Tensor tmp_sum;
     tmp_sum.Resize({grid.y, left_num});
-    tmp_sum.mutable_data<ReduceParamType<T>>(dev_ctx.GetPlace());
+    dev_ctx.template Alloc<ReduceParamType<T>>(
+        &tmp_sum, tmp_sum.numel() * sizeof(ReduceParamType<T>));
     BiasAddBw2DReduceKernel<T><<<grid, block, 0, stream>>>(d_out,
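Note the dev_ctx.template Alloc<ReduceParamType<T>>(...) spelling in the header above: inside a template, dev_ctx has a dependent type, so the template keyword is required for the member-template call to parse. A standalone sketch of just that C++ rule (the Ctx type and AllocHelper name are made up for illustration):

    #include <cstddef>

    struct Ctx {
      template <typename T>
      T *Alloc(void *buf, std::size_t /*bytes*/) {  // stand-in for the real API
        return static_cast<T *>(buf);
      }
    };

    template <typename DevCtx, typename T>
    T *AllocHelper(DevCtx &dev_ctx, void *buf, std::size_t bytes) {
      // Without 'template', 'dev_ctx.Alloc<T>(...)' would be parsed as comparisons.
      return dev_ctx.template Alloc<T>(buf, bytes);
    }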
paddle/fluid/operators/fused/conv_fusion_op.cu

@@ -49,7 +49,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
     auto* bias = ctx.Input<Tensor>("Bias");
     auto* residual = ctx.Input<Tensor>("ResidualData");
     auto* output = ctx.Output<Tensor>("Output");
-    output->mutable_data<T>(ctx.GetPlace());
+    dev_ctx.template Alloc<T>(output, output->numel() * sizeof(T));
     std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h

@@ -84,7 +84,6 @@ class CudnnBNStatsFinalize {
                float momentum,
                int64_t ele_count,
                bool is_train) {
-    auto place = ctx.GetPlace();
     if (is_train) {
       TrainInit(ctx);
     } else {

@@ -98,12 +97,18 @@ class CudnnBNStatsFinalize {
         const_cast<float *>(sum_of_squares.data<float>());
     float *scale_ptr = const_cast<float *>(scale.data<float>());
     float *bias_ptr = const_cast<float *>(bias.data<float>());
-    float *saved_mean_ptr = saved_mean->mutable_data<float>(place);
-    float *saved_invstd_ptr = saved_invstd->mutable_data<float>(place);
-    float *running_mean_ptr = running_mean->mutable_data<float>(place);
-    float *running_var_ptr = running_var->mutable_data<float>(place);
-    T *equiv_scale_ptr = equiv_scale->mutable_data<T>(place);
-    T *equiv_bias_ptr = equiv_bias->mutable_data<T>(place);
+    float *saved_mean_ptr = ctx.template Alloc<float>(
+        saved_mean, saved_mean->numel() * sizeof(float));
+    float *saved_invstd_ptr = ctx.template Alloc<float>(
+        saved_invstd, saved_invstd->numel() * sizeof(float));
+    float *running_mean_ptr = ctx.template Alloc<float>(
+        running_mean, running_mean->numel() * sizeof(float));
+    float *running_var_ptr = ctx.template Alloc<float>(
+        running_var, running_var->numel() * sizeof(float));
+    T *equiv_scale_ptr =
+        ctx.template Alloc<T>(equiv_scale, equiv_scale->numel() * sizeof(T));
+    T *equiv_bias_ptr =
+        ctx.template Alloc<T>(equiv_bias, equiv_bias->numel() * sizeof(T));
     op.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_SCALE, scale_ptr);
     op.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_BIAS, bias_ptr);
     op.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_RUNNING_MEAN, running_mean_ptr);
paddle/fluid/operators/fused/cudnn_norm_conv.cu.h

@@ -193,7 +193,6 @@ class CudnnNormConvolution {
                Tensor *sum,
                Tensor *sum_of_squares) {
     auto cudnn_handle = ctx.cudnn_handle();
-    auto place = ctx.GetPlace();
     CudnnFusionOp *fwd_op = GetForwardOp(ctx);
     size_t workspace_size = RoundUp(

@@ -210,9 +209,11 @@
         CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES, &workspace_size);
     // output ptr
-    T *output_ptr = output->mutable_data<T>(place);
-    float *sum_ptr = sum->mutable_data<float>(place);
-    float *sum_of_squares_ptr = sum_of_squares->mutable_data<float>(place);
+    T *output_ptr = ctx.template Alloc<T>(output, output->numel() * sizeof(T));
+    float *sum_ptr =
+        ctx.template Alloc<float>(sum, sum->numel() * sizeof(float));
+    float *sum_of_squares_ptr = ctx.template Alloc<float>(
+        sum_of_squares, sum_of_squares->numel() * sizeof(float));
     fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_YDATA, output_ptr);
     fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_YSUM, sum_ptr);
     fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_YSQSUM, sum_of_squares_ptr);

@@ -311,17 +312,18 @@ class CudnnNormConvolutionGrad {
                 Tensor *input_grad,
                 Tensor *filter_grad,
                 bool use_addto = false) {
-    auto place = ctx.GetPlace();
     T *input_ptr = const_cast<T *>(input.data<T>());
     T *filter_ptr = const_cast<T *>(filter.data<T>());
     T *output_grad_ptr = const_cast<T *>(output_grad.data<T>());
     if (filter_grad) {
-      T *filter_grad_ptr = filter_grad->mutable_data<T>(place);
+      T *filter_grad_ptr = ctx.template Alloc<T>(
+          filter_grad, filter_grad->numel() * sizeof(T));
       BackwardFilter(ctx, output_grad_ptr, input_ptr, filter_grad_ptr);
     }
     if (input_grad) {
-      T *input_grad_ptr = input_grad->mutable_data<T>(place);
+      T *input_grad_ptr = ctx.template Alloc<T>(
+          input_grad, input_grad->numel() * sizeof(T));
       BackwardData(ctx, output_grad_ptr, filter_ptr, input_grad_ptr, use_addto);
     }
   }
paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h

@@ -127,7 +127,6 @@ class CudnnScaleBiasAddRelu {
                Tensor *bitmask) {
     ForwardInit(ctx);
     auto handle = ctx.cudnn_handle();
-    auto place = ctx.GetPlace();
     auto workspace_handle = ctx.cudnn_workspace_handle();
     fwd_workspace_byte_ = fwd_op_.GetWorkspaceSizeInBytes(handle);
     // Set variant_param

@@ -156,8 +155,9 @@
         CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES, &fwd_workspace_byte_);
     // output ptr
-    T *out_ptr = out->mutable_data<T>(place);
-    int32_t *bitmask_ptr = bitmask->mutable_data<int32_t>(place);
+    T *out_ptr = ctx.template Alloc<T>(out, out->numel() * sizeof(T));
+    int32_t *bitmask_ptr = ctx.template Alloc<int32_t>(
+        bitmask, bitmask->numel() * sizeof(int32_t));
     fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_YDATA, out_ptr);
     fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_ACTIVATION_BITMASK, bitmask_ptr);

@@ -186,7 +186,6 @@ class CudnnScaleBiasAddRelu {
                double eps) {
     BackwardInit(ctx);
     auto handle = ctx.cudnn_handle();
-    auto place = ctx.GetPlace();
     auto workspace_handle = ctx.cudnn_workspace_handle();
     bwd_workspace_byte_ = bwd_op_.GetWorkspaceSizeInBytes(handle);
     // Set variant_param

@@ -199,10 +198,15 @@ class CudnnScaleBiasAddRelu {
     float *saved_invstd_ptr = const_cast<float *>(saved_invstd.data<float>());
     int32_t *bitmask_ptr =
         bitmask ? const_cast<int32_t *>(bitmask->data<int32_t>()) : nullptr;
-    T *dx_ptr = dx->mutable_data<T>(place);
-    T *dz_ptr = dz ? dz->mutable_data<T>(place) : nullptr;
-    float *dscale_ptr = dscale ? dscale->mutable_data<float>(place) : nullptr;
-    float *dbias_ptr = dbias ? dbias->mutable_data<float>(place) : nullptr;
+    T *dx_ptr = ctx.template Alloc<T>(dx, dx->numel() * sizeof(T));
+    T *dz_ptr =
+        dz ? ctx.template Alloc<T>(dz, dz->numel() * sizeof(T)) : nullptr;
+    float *dscale_ptr =
+        dscale ? ctx.template Alloc<float>(dscale, dscale->numel() * sizeof(float))
+               : nullptr;
+    float *dbias_ptr =
+        dbias ? ctx.template Alloc<float>(dbias, dbias->numel() * sizeof(float))
+              : nullptr;
     bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_XDATA, x_ptr);
     bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_DYDATA, dy_ptr);
paddle/fluid/operators/fused/fused_attention_op.cu

@@ -64,7 +64,7 @@ static void AllReduce(framework::Tensor &tensor,  // NOLINT
     int64_t numel = tensor.numel();
     const void *sendbuff = tensor.data<T>();
     auto place = ctx.GetPlace();
-    void *recvbuff = tensor.mutable_data<T>(place);
+    void *recvbuff = ctx.template Alloc<T>(&tensor, tensor.numel() * sizeof(T));
     auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place);
     auto stream = ctx.stream();
     PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(

@@ -83,7 +83,7 @@ class FusedAttentionOpKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext &ctx) const override {
     using U = LayerNormParamType<T>;
     auto *input_x = ctx.Input<Tensor>("X");
+    auto &dev_ctx = ctx.template device_context<phi::GPUContext>();
     const auto pre_layer_norm = ctx.Attr<bool>("pre_layer_norm");
     const float epsilon = ctx.Attr<float>("epsilon");
     auto *ln_scale = ctx.Input<Tensor>("LnScale");

@@ -145,40 +145,53 @@ class FusedAttentionOpKernel : public framework::OpKernel<T> {
     auto *x_data = input_x->data<T>();
     auto *qkv_weight_data = qkv_weight->data<T>();
     auto *qkv_bias_data = (qkv_bias == nullptr) ? nullptr : qkv_bias->data<T>();
-    auto *qkv_out_data = qkv_out->mutable_data<T>(ctx.GetPlace());
+    auto *qkv_out_data =
+        dev_ctx.template Alloc<T>(qkv_out, qkv_out->numel() * sizeof(T));
     auto *qkv_bias_out_data =
         (qkv_bias == nullptr)
             ? nullptr
-            : qkv_bias_out->mutable_data<T>(ctx.GetPlace());
+            : dev_ctx.template Alloc<T>(qkv_bias_out,
+                                        qkv_bias_out->numel() * sizeof(T));
     // get data ptr for FMHA.
-    auto *transpose_out_2_data =
-        transpose_out_2->mutable_data<T>(ctx.GetPlace());
+    auto *transpose_out_2_data = dev_ctx.template Alloc<T>(
+        transpose_out_2, transpose_out_2->numel() * sizeof(T));
     auto *cache_kv_out_data =
         (cache_kv_out == nullptr)
             ? nullptr
-            : cache_kv_out->mutable_data<T>(ctx.GetPlace());
-    auto *qk_out_data = qk_out->mutable_data<T>(ctx.GetPlace());
-    auto *qktv_out_data = qktv_out->mutable_data<T>(ctx.GetPlace());
+            : dev_ctx.template Alloc<T>(cache_kv_out,
+                                        cache_kv_out->numel() * sizeof(T));
+    auto *qk_out_data =
+        dev_ctx.template Alloc<T>(qk_out, qk_out->numel() * sizeof(T));
+    auto *qktv_out_data =
+        dev_ctx.template Alloc<T>(qktv_out, qktv_out->numel() * sizeof(T));
     auto *src_mask_out_data =
-        (src_mask == nullptr) ? nullptr
-                              : src_mask_out->mutable_data<T>(ctx.GetPlace());
-    auto *softmax_out_data = softmax_out->mutable_data<T>(ctx.GetPlace());
-    auto *attn_dropout_mask_out_data =
-        attn_dropout_mask_out->mutable_data<uint8_t>(ctx.GetPlace());
-    auto *attn_dropout_out_data =
-        attn_dropout_out->mutable_data<T>(ctx.GetPlace());
-    auto *fmha_out_data = fmha_out->mutable_data<T>(ctx.GetPlace());
+        (src_mask == nullptr)
+            ? nullptr
+            : dev_ctx.template Alloc<T>(src_mask_out,
+                                        src_mask_out->numel() * sizeof(T));
+    auto *softmax_out_data =
+        dev_ctx.template Alloc<T>(softmax_out, softmax_out->numel() * sizeof(T));
+    auto *attn_dropout_mask_out_data = dev_ctx.template Alloc<uint8_t>(
+        attn_dropout_mask_out, attn_dropout_mask_out->numel() * sizeof(uint8_t));
+    auto *attn_dropout_out_data = dev_ctx.template Alloc<T>(
+        attn_dropout_out, attn_dropout_out->numel() * sizeof(T));
+    auto *fmha_out_data =
+        dev_ctx.template Alloc<T>(fmha_out, fmha_out->numel() * sizeof(T));
     // get data ptr for out_linear.
     auto *out_linear_weight_data = out_linear_weight->data<T>();
     auto *out_linear_bias_data =
         (out_linear_bias == nullptr) ? nullptr : out_linear_bias->data<T>();
-    auto *out_linear_out_data = out_linear_out->mutable_data<T>(ctx.GetPlace());
+    auto *out_linear_out_data = dev_ctx.template Alloc<T>(
+        out_linear_out, out_linear_out->numel() * sizeof(T));
     // get data ptr for bias+dropout+residual+layernorm
-    auto *dropout_mask_out_data =
-        dropout_mask_out->mutable_data<uint8_t>(ctx.GetPlace());
-    auto *final_out_data = out->mutable_data<T>(ctx.GetPlace());
+    auto *dropout_mask_out_data = dev_ctx.template Alloc<uint8_t>(
+        dropout_mask_out, dropout_mask_out->numel() * sizeof(uint8_t));
+    auto *final_out_data =
+        dev_ctx.template Alloc<T>(out, out->numel() * sizeof(T));
     int batch_size = input_x_dims[0];
     int max_seq_len = input_x_dims[1];

@@ -248,9 +261,12 @@ class FusedAttentionOpKernel : public framework::OpKernel<T> {
       auto *ln_scale_data = (ln_scale == nullptr ? nullptr : ln_scale->data<U>());
      auto *ln_bias_data = (ln_bias == nullptr ? nullptr : ln_bias->data<U>());
-      auto *ln_mean_data = ln_mean->mutable_data<U>(ctx.GetPlace());
-      auto *ln_var_data = ln_var->mutable_data<U>(ctx.GetPlace());
-      auto *ln_out_data = ln_out->mutable_data<T>(ctx.GetPlace());
+      auto *ln_mean_data =
+          dev_ctx.template Alloc<U>(ln_mean, ln_mean->numel() * sizeof(U));
+      auto *ln_var_data =
+          dev_ctx.template Alloc<U>(ln_var, ln_var->numel() * sizeof(U));
+      auto *ln_out_data =
+          dev_ctx.template Alloc<T>(ln_out, ln_out->numel() * sizeof(T));
       layer_norm_compute.ComputeForward(x_data,
                                         ln_scale_data,

@@ -321,10 +337,13 @@ class FusedAttentionOpKernel : public framework::OpKernel<T> {
       const U *ln_scale_2_ptr = ln_scale_2 ? ln_scale_2->data<U>() : nullptr;
       const U *ln_bias_2_ptr = ln_bias_2 ? ln_bias_2->data<U>() : nullptr;
-      T *bias_dropout_residual_out_ptr =
-          bias_dropout_residual_out->mutable_data<T>(ctx.GetPlace());
-      U *ln_mean_2_ptr = ln_mean_2->mutable_data<U>(ctx.GetPlace());
-      U *ln_var_2_ptr = ln_var_2->mutable_data<U>(ctx.GetPlace());
+      T *bias_dropout_residual_out_ptr = dev_ctx.template Alloc<T>(
+          bias_dropout_residual_out,
+          bias_dropout_residual_out->numel() * sizeof(T));
+      U *ln_mean_2_ptr =
+          dev_ctx.template Alloc<U>(ln_mean_2, ln_mean_2->numel() * sizeof(U));
+      U *ln_var_2_ptr =
+          dev_ctx.template Alloc<U>(ln_var_2, ln_var_2->numel() * sizeof(U));
       // output = layernorm(residual + dropout(input + bias))
       fused_dropout_layernorm_helper.LayernormResidualDropoutBias(
           ctx.cuda_device_context(),

@@ -352,6 +371,7 @@ class FusedAttentionGradKernel : public framework::OpKernel<T> {
     const float ln2epsilon = ctx.Attr<float>("ln_epsilon");
     float attn_dropout_prob = ctx.Attr<float>("attn_dropout_rate");
+    auto &dev_ctx = ctx.template device_context<phi::GPUContext>();
     bool is_test_1 = ctx.Attr<bool>("is_test");
     auto &dropout_implementation_1 =
         ctx.Attr<std::string>("attn_dropout_implementation");

@@ -432,29 +452,37 @@ class FusedAttentionGradKernel : public framework::OpKernel<T> {
         ctx.Output<Tensor>(framework::GradVarName("OutLinearOut"));
     auto *d_bias_dropout_residual_out =
         ctx.Output<Tensor>(framework::GradVarName("BiasDropoutResidualOut"));
-    auto *d_x_data = d_x->mutable_data<T>(ctx.GetPlace());
+    auto *d_x_data = dev_ctx.template Alloc<T>(d_x, d_x->numel() * sizeof(T));
     // when qkv_bias is not nullptr, d_qkv_out is equals to d_qkv_bias_out, the
     // space can be reused.
     auto *d_qkv_out_data = (d_qkv_bias_out != nullptr)
                                ? nullptr
-                               : d_qkv_out->mutable_data<T>(ctx.GetPlace());
+                               : dev_ctx.template Alloc<T>(
+                                     d_qkv_out, d_qkv_out->numel() * sizeof(T));
     auto *d_qkv_bias_out_data =
         (d_qkv_bias_out == nullptr)
             ? nullptr
-            : d_qkv_bias_out->mutable_data<T>(ctx.GetPlace());
-    auto *d_qktv_out_data = d_qktv_out->mutable_data<T>(ctx.GetPlace());
-    auto *d_transpose_out_2_data =
-        d_transpose_out_2->mutable_data<T>(ctx.GetPlace());
-    auto *d_qk_out_data = d_qk_out->mutable_data<T>(ctx.GetPlace());
-    auto *d_softmax_out_data = d_softmax_out->mutable_data<T>(ctx.GetPlace());
-    auto *d_attn_dropout_out_data =
-        d_attn_dropout_out->mutable_data<T>(ctx.GetPlace());
+            : dev_ctx.template Alloc<T>(d_qkv_bias_out,
+                                        d_qkv_bias_out->numel() * sizeof(T));
+    auto *d_qktv_out_data =
+        dev_ctx.template Alloc<T>(d_qktv_out, d_qktv_out->numel() * sizeof(T));
+    auto *d_transpose_out_2_data = dev_ctx.template Alloc<T>(
+        d_transpose_out_2, d_transpose_out_2->numel() * sizeof(T));
+    auto *d_qk_out_data =
+        dev_ctx.template Alloc<T>(d_qk_out, d_qk_out->numel() * sizeof(T));
+    auto *d_softmax_out_data = dev_ctx.template Alloc<T>(
+        d_softmax_out, d_softmax_out->numel() * sizeof(T));
+    auto *d_attn_dropout_out_data = dev_ctx.template Alloc<T>(
+        d_attn_dropout_out, d_attn_dropout_out->numel() * sizeof(T));
     auto *d_src_mask_out_data =
-        (src_mask == nullptr) ? nullptr
-                              : d_src_mask_out->mutable_data<T>(ctx.GetPlace());
-    auto *d_fmha_out_data = d_fmha_out->mutable_data<T>(ctx.GetPlace());
-    auto *d_out_linear_out_data =
-        d_out_linear_out->mutable_data<T>(ctx.GetPlace());
+        (src_mask == nullptr)
+            ? nullptr
+            : dev_ctx.template Alloc<T>(d_src_mask_out,
+                                        d_src_mask_out->numel() * sizeof(T));
+    auto *d_fmha_out_data =
+        dev_ctx.template Alloc<T>(d_fmha_out, d_fmha_out->numel() * sizeof(T));
+    auto *d_out_linear_out_data = dev_ctx.template Alloc<T>(
+        d_out_linear_out, d_out_linear_out->numel() * sizeof(T));
     // parameter grad
     auto *d_qkv_weight = ctx.Output<Tensor>(framework::GradVarName("QKVW"));

@@ -466,16 +494,20 @@ class FusedAttentionGradKernel : public framework::OpKernel<T> {
     auto *d_ln_2_scale = ctx.Output<Tensor>(framework::GradVarName("Ln2Scale"));
     auto *d_ln_2_bias = ctx.Output<Tensor>(framework::GradVarName("Ln2Bias"));
-    auto *d_qkv_weight_data = d_qkv_weight->mutable_data<T>(ctx.GetPlace());
+    auto *d_qkv_weight_data = dev_ctx.template Alloc<T>(
+        d_qkv_weight, d_qkv_weight->numel() * sizeof(T));
     auto *d_qkv_bias_data =
         (d_qkv_bias == nullptr)
             ? nullptr
-            : d_qkv_bias->mutable_data<T>(ctx.GetPlace());
-    auto *d_out_linear_weight_data =
-        d_out_linear_weight->mutable_data<T>(ctx.GetPlace());
+            : dev_ctx.template Alloc<T>(d_qkv_bias,
+                                        d_qkv_bias->numel() * sizeof(T));
+    auto *d_out_linear_weight_data = dev_ctx.template Alloc<T>(
+        d_out_linear_weight, d_out_linear_weight->numel() * sizeof(T));
     auto *d_out_linear_bias_data =
         (d_out_linear_bias == nullptr)
             ? nullptr
-            : d_out_linear_bias->mutable_data<T>(ctx.GetPlace());
+            : dev_ctx.template Alloc<T>(d_out_linear_bias,
+                                        d_out_linear_bias->numel() * sizeof(T));
     const auto input_x_dims = input_x->dims();
     const auto qkv_w_dims = qkv_weight->dims();

@@ -496,7 +528,8 @@ class FusedAttentionGradKernel : public framework::OpKernel<T> {
     T *d_residual_data = nullptr;
     if (add_residual) {
       d_residual.Resize(input_x_dims);
-      d_residual_data = d_residual.mutable_data<T>(ctx.GetPlace());
+      d_residual_data = dev_ctx.template Alloc<T>(
+          &d_residual, d_residual.numel() * sizeof(T));
     }
     bool transA = false;

@@ -560,13 +593,16 @@ class FusedAttentionGradKernel : public framework::OpKernel<T> {
       auto *d_ln_2_scale_data =
           (d_ln_2_scale == nullptr
                ? nullptr
-               : d_ln_2_scale->mutable_data<U>(ctx.GetPlace()));
+               : dev_ctx.template Alloc<U>(d_ln_2_scale,
+                                           d_ln_2_scale->numel() * sizeof(U)));
       auto *d_ln_2_bias_data =
           (d_ln_2_bias == nullptr
                ? nullptr
-               : d_ln_2_bias->mutable_data<U>(ctx.GetPlace()));
-      auto *d_bias_dropout_residual_out_data =
-          d_bias_dropout_residual_out->mutable_data<T>(ctx.GetPlace());
+               : dev_ctx.template Alloc<U>(d_ln_2_bias,
+                                           d_ln_2_bias->numel() * sizeof(U)));
+      auto *d_bias_dropout_residual_out_data = dev_ctx.template Alloc<T>(
+          d_bias_dropout_residual_out,
+          d_bias_dropout_residual_out->numel() * sizeof(T));
       fused_dropout_layernorm_helper.LayernormResidualDropoutBiasGrad(
           ctx.cuda_device_context(),

@@ -638,13 +674,18 @@ class FusedAttentionGradKernel : public framework::OpKernel<T> {
       auto *d_ln_out = ctx.Output<Tensor>(framework::GradVarName("LnOut"));
       auto *d_ln_scale = ctx.Output<Tensor>(framework::GradVarName("LnScale"));
       auto *d_ln_bias = ctx.Output<Tensor>(framework::GradVarName("LnBias"));
-      auto *d_ln_out_data = d_ln_out->mutable_data<T>(ctx.GetPlace());
+      auto *d_ln_out_data =
+          dev_ctx.template Alloc<T>(d_ln_out, d_ln_out->numel() * sizeof(T));
       auto *d_ln_scale_data =
-          (d_ln_scale == nullptr ? nullptr
-                                 : d_ln_scale->mutable_data<U>(ctx.GetPlace()));
+          (d_ln_scale == nullptr
+               ? nullptr
+               : dev_ctx.template Alloc<U>(d_ln_scale,
+                                           d_ln_scale->numel() * sizeof(U)));
       auto *d_ln_bias_data =
-          (d_ln_bias == nullptr ? nullptr
-                                : d_ln_bias->mutable_data<U>(ctx.GetPlace()));
+          (d_ln_bias == nullptr
+               ? nullptr
+               : dev_ctx.template Alloc<U>(d_ln_bias,
+                                           d_ln_bias->numel() * sizeof(U)));
       if (qkv_bias != nullptr) {
         qkv_compute.ComputeBackward(ln_out,
                                     qkv_weight,
paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cu

@@ -31,6 +31,7 @@ template <typename T>
 class FusedBiasDropoutResidualLnOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
+    auto &dev_ctx = ctx.template device_context<phi::GPUContext>();
     using U = LayerNormParamType<T>;
     auto *input_x = ctx.Input<Tensor>("X");
     auto *bias = ctx.Input<Tensor>("Bias");

@@ -50,12 +51,14 @@ class FusedBiasDropoutResidualLnOpKernel : public framework::OpKernel<T> {
     auto *ln_scale_data = (ln_scale == nullptr ? nullptr : ln_scale->data<U>());
     auto *ln_bias_data = (ln_bias == nullptr ? nullptr : ln_bias->data<U>());
     auto *bias_dropout_residual_out_data =
-        bias_dropout_residual_out->mutable_data<T>(ctx.GetPlace());
-    auto *ln_mean_data = ln_mean->mutable_data<U>(ctx.GetPlace());
-    auto *ln_var_data = ln_var->mutable_data<U>(ctx.GetPlace());
-    auto *dropout_mask_out_data =
-        dropout_mask_out->mutable_data<uint8_t>(ctx.GetPlace());
-    auto *y_data = y->mutable_data<T>(ctx.GetPlace());
+        dev_ctx.Alloc<T>(bias_dropout_residual_out,
+                         bias_dropout_residual_out->numel() * sizeof(T));
+    auto *ln_mean_data = dev_ctx.Alloc<U>(ln_mean, ln_mean->numel() * sizeof(U));
+    auto *ln_var_data = dev_ctx.Alloc<U>(ln_var, ln_var->numel() * sizeof(U));
+    auto *dropout_mask_out_data = dev_ctx.Alloc<uint8_t>(
+        dropout_mask_out, dropout_mask_out->numel() * sizeof(uint8_t));
+    auto *y_data = dev_ctx.Alloc<T>(y, y->numel() * sizeof(T));
     const auto input_x_dims = input_x->dims();
     int bsz_seq = 1;

@@ -92,7 +95,7 @@ class FusedBiasDropoutResidualLnGradKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext &ctx) const override {
     using U = LayerNormParamType<T>;
     const float ln_epsilon = ctx.Attr<float>("ln_epsilon");
+    auto &dev_ctx = ctx.template device_context<phi::GPUContext>();
     auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
     auto *ln_scale = ctx.Input<Tensor>("LnScale");
     auto *dropout_mask_out = ctx.Input<Tensor>("DropoutMaskOut");

@@ -114,18 +117,24 @@ class FusedBiasDropoutResidualLnGradKernel : public framework::OpKernel<T> {
         ctx.Output<Tensor>(framework::GradVarName("BiasDropoutResidualOut"));
     auto *d_ln_scale = ctx.Output<Tensor>(framework::GradVarName("LnScale"));
     auto *d_ln_bias = ctx.Output<Tensor>(framework::GradVarName("LnBias"));
-    auto *d_x_data = d_x->mutable_data<T>(ctx.GetPlace());
-    auto *d_residual_data = d_residual->mutable_data<T>(ctx.GetPlace());
+    auto *d_x_data = dev_ctx.Alloc<T>(d_x, d_x->numel() * sizeof(T));
+    auto *d_residual_data =
+        dev_ctx.Alloc<T>(d_residual, d_residual->numel() * sizeof(T));
     auto *d_bias_dropout_residual_out_data =
-        d_bias_dropout_residual_out->mutable_data<T>(ctx.GetPlace());
+        dev_ctx.Alloc<T>(d_bias_dropout_residual_out,
+                         d_bias_dropout_residual_out->numel() * sizeof(T));
     auto *d_bias_data =
-        (d_bias == nullptr ? nullptr : d_bias->mutable_data<T>(ctx.GetPlace()));
+        (d_bias == nullptr
+             ? nullptr
+             : dev_ctx.Alloc<T>(d_bias, d_bias->numel() * sizeof(T)));
     auto *d_ln_scale_data =
-        (d_ln_scale == nullptr ? nullptr
-                               : d_ln_scale->mutable_data<U>(ctx.GetPlace()));
+        (d_ln_scale == nullptr
+             ? nullptr
+             : dev_ctx.Alloc<U>(d_ln_scale, d_ln_scale->numel() * sizeof(U)));
    auto *d_ln_bias_data =
-        (d_ln_bias == nullptr ? nullptr
-                              : d_ln_bias->mutable_data<U>(ctx.GetPlace()));
+        (d_ln_bias == nullptr
+             ? nullptr
+             : dev_ctx.Alloc<U>(d_ln_bias, d_ln_bias->numel() * sizeof(U)));
     const auto input_x_dims = d_y->dims();
     int bsz_seq = 1;
paddle/fluid/operators/fused/fused_bn_activation_op.cu

@@ -45,6 +45,7 @@ class FusedBatchNormActKernel<phi::GPUContext, T>
         platform::is_gpu_place(ctx.GetPlace()),
         true,
         platform::errors::PreconditionNotMet("It must use CUDAPlace."));
+    auto &dev_ctx = ctx.template device_context<phi::GPUContext>();
     double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
     float momentum = ctx.Attr<float>("momentum");
     std::string act_type = ctx.Attr<std::string>("act_type");

@@ -73,22 +74,26 @@ class FusedBatchNormActKernel<phi::GPUContext, T>
     // initialize them.
     auto *mean_out = ctx.Output<Tensor>("MeanOut");
     auto *variance_out = ctx.Output<Tensor>("VarianceOut");
-    mean_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
-    variance_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
+    dev_ctx.Alloc<BatchNormParamType<T>>(
+        mean_out, mean_out->numel() * sizeof(BatchNormParamType<T>));
+    dev_ctx.Alloc<BatchNormParamType<T>>(
+        variance_out, variance_out->numel() * sizeof(BatchNormParamType<T>));
     auto *saved_mean = ctx.Output<Tensor>("SavedMean");
     auto *saved_variance = ctx.Output<Tensor>("SavedVariance");
-    saved_mean->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
-    saved_variance->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
+    dev_ctx.Alloc<BatchNormParamType<T>>(
+        saved_mean, saved_mean->numel() * sizeof(BatchNormParamType<T>));
+    dev_ctx.Alloc<BatchNormParamType<T>>(
+        saved_variance, saved_variance->numel() * sizeof(BatchNormParamType<T>));
     auto *y = ctx.Output<Tensor>("Y");
-    y->mutable_data<T>(ctx.GetPlace());
+    dev_ctx.Alloc<T>(y, y->numel() * sizeof(T));
     int N, C, H, W, D;
     const DataLayout data_layout = DataLayout::kNHWC;
     ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D);
-    auto &dev_ctx = ctx.template device_context<phi::GPUContext>();
     if ((N * H * W * D) == 1) {
       // Only 1 element in normalization dimension,
       // skip the batch norm calculation, let y = act(x).

@@ -172,10 +177,17 @@ class FusedBatchNormActKernel<phi::GPUContext, T>
                 /*xDesc=*/data_desc_,
                 /*sizeInBytes=*/&reserve_space_size));
-        reserve_space_ptr = reserve_space->mutable_data(
-            ctx.GetPlace(), x->dtype(), reserve_space_size);
-        workspace_ptr = workspace_tensor.mutable_data(
-            ctx.GetPlace(), x->dtype(), workspace_size);
+        reserve_space->Resize({static_cast<int64_t>(
+            (reserve_space_size + experimental::SizeOf(x->dtype()) - 1) /
+            experimental::SizeOf(x->dtype()))});
+        reserve_space_ptr = dev_ctx.Alloc<T>(
+            reserve_space, reserve_space->numel() * sizeof(T));
+        workspace_tensor.Resize({static_cast<int64_t>(
+            (workspace_size + experimental::SizeOf(x->dtype()) - 1) /
+            experimental::SizeOf(x->dtype()))});
+        workspace_ptr = dev_ctx.Alloc<T>(
+            &workspace_tensor, workspace_tensor.numel() * sizeof(T));
         PADDLE_ENFORCE_GPU_SUCCESS(
             platform::dynload::cudnnBatchNormalizationForwardTrainingEx(
                 handle,

@@ -193,15 +205,18 @@ class FusedBatchNormActKernel<phi::GPUContext, T>
                 scale->template data<BatchNormParamType<T>>(),
                 bias->template data<BatchNormParamType<T>>(),
                 this_factor,
-                mean_out->template mutable_data<BatchNormParamType<T>>(
-                    ctx.GetPlace()),
-                variance_out->template mutable_data<BatchNormParamType<T>>(
-                    ctx.GetPlace()),
+                dev_ctx.template Alloc<BatchNormParamType<T>>(
+                    mean_out,
+                    mean_out->numel() * sizeof(BatchNormParamType<T>)),
+                dev_ctx.template Alloc<BatchNormParamType<T>>(
+                    variance_out,
+                    variance_out->numel() * sizeof(BatchNormParamType<T>)),
                 epsilon,
-                saved_mean->template mutable_data<BatchNormParamType<T>>(
-                    ctx.GetPlace()),
-                saved_variance->template mutable_data<BatchNormParamType<T>>(
-                    ctx.GetPlace()),
+                dev_ctx.template Alloc<BatchNormParamType<T>>(
+                    saved_mean,
+                    saved_mean->numel() * sizeof(BatchNormParamType<T>)),
+                dev_ctx.template Alloc<BatchNormParamType<T>>(
+                    saved_variance,
+                    saved_variance->numel() * sizeof(BatchNormParamType<T>)),
                 activation_desc_,
                 workspace_ptr,
                 workspace_size,

@@ -227,7 +242,7 @@ class FusedBatchNormActGradKernel<phi::GPUContext, T>
         platform::errors::PreconditionNotMet("It must use CUDAPlace."));
     double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
     std::string act_type = ctx.Attr<std::string>("act_type");
+    auto &dev_ctx = ctx.template device_context<phi::GPUContext>();
     const auto *x = ctx.Input<Tensor>("X");
     const auto *y = ctx.Input<Tensor>("Y");
     const auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));

@@ -250,14 +265,16 @@ class FusedBatchNormActGradKernel<phi::GPUContext, T>
     auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
     auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
-    d_x->mutable_data<T>(ctx.GetPlace());
+    dev_ctx.Alloc<T>(d_x, d_x->numel() * sizeof(T));
     PADDLE_ENFORCE_EQ(
         d_scale && d_bias,
         true,
         platform::errors::PreconditionNotMet(
             "Both the scale grad and the bias grad must not be null."));
-    d_scale->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
-    d_bias->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
+    dev_ctx.Alloc<BatchNormParamType<T>>(
+        d_scale, d_scale->numel() * sizeof(BatchNormParamType<T>));
+    dev_ctx.Alloc<BatchNormParamType<T>>(
+        d_bias, d_bias->numel() * sizeof(BatchNormParamType<T>));
     PADDLE_ENFORCE_EQ(scale->dims().size(),
                       1UL,
                       platform::errors::PreconditionNotMet(

@@ -268,7 +285,6 @@ class FusedBatchNormActGradKernel<phi::GPUContext, T>
                       platform::errors::PreconditionNotMet(
                           "The size of scale is equal to the channel of Input(X)."));
-    auto &dev_ctx = ctx.template device_context<phi::GPUContext>();
     if ((N * H * W * D) == 1) {
       if (act_type == "relu") {
         auto x_v = framework::EigenVector<T>::Flatten(*x);

@@ -344,8 +360,11 @@ class FusedBatchNormActGradKernel<phi::GPUContext, T>
             /*activationDesc=*/activation_desc_,
             /*sizeInBytes=*/&workspace_size));
-    workspace_ptr = workspace_tensor.mutable_data(
-        ctx.GetPlace(), x->type(), workspace_size);
+    workspace_tensor.Resize({static_cast<int64_t>(
+        (workspace_size + experimental::SizeOf(x->dtype()) - 1) /
+        experimental::SizeOf(x->dtype()))});
+    workspace_ptr = dev_ctx.Alloc<T>(
+        &workspace_tensor, workspace_tensor.numel() * sizeof(T));
     PADDLE_ENFORCE_GPU_SUCCESS(
         platform::dynload::cudnnBatchNormalizationBackwardEx(

@@ -365,16 +384,17 @@ class FusedBatchNormActGradKernel<phi::GPUContext, T>
             /*dzDesc=*/nullptr,
             /*dzData=*/nullptr,
             /*dxDesc=*/data_desc_,
-            /*dxData=*/d_x->template mutable_data<T>(ctx.GetPlace()),
+            /*dxData=*/
+            dev_ctx.template Alloc<T>(d_x, d_x->numel() * sizeof(T)),
             /*dBnScaleBiasDesc=*/bn_param_desc_,
             /*bnScaleData=*/scale->template data<BatchNormParamType<T>>(),
             /*bnBiasData=*/bias->template data<BatchNormParamType<T>>(),
             /*dBnScaleData=*/
-            d_scale->template mutable_data<BatchNormParamType<T>>(
-                ctx.GetPlace()),
+            dev_ctx.template Alloc<BatchNormParamType<T>>(
+                d_scale, d_scale->numel() * sizeof(BatchNormParamType<T>)),
             /*dBnBiasData=*/
-            d_bias->template mutable_data<BatchNormParamType<T>>(
-                ctx.GetPlace()),
+            dev_ctx.template Alloc<BatchNormParamType<T>>(
+                d_bias, d_bias->numel() * sizeof(BatchNormParamType<T>)),
             /*epsilon=*/epsilon,
             /*savedMean=*/saved_mean_data,
             /*savedInvVariance=*/saved_var_data,
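cuDNN reports the reserve-space and workspace requirements in bytes, while Alloc<T> above sizes a tensor as numel() * sizeof(element), so the new code first resizes the tensor to ceil(bytes / SizeOf(dtype)) elements. A small worked example of that rounding, with purely illustrative numbers:

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    int main() {
      // Suppose cuDNN asks for 1031 bytes and the tensor dtype is float16
      // (2 bytes per element); both values are hypothetical.
      std::size_t workspace_bytes = 1031;
      std::size_t elem_size = 2;
      // Round up so the element count covers every requested byte.
      std::int64_t num_elems = static_cast<std::int64_t>(
          (workspace_bytes + elem_size - 1) / elem_size);
      std::printf("%lld elements -> %zu bytes allocated\n",
                  static_cast<long long>(num_elems),
                  static_cast<std::size_t>(num_elems) * elem_size);
      // Prints: 516 elements -> 1032 bytes allocated
      return 0;
    }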
paddle/fluid/operators/fused/fused_bn_add_activation_op.cu

@@ -23,6 +23,7 @@
 #include "paddle/fluid/operators/norm_utils.h"
 #include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
 #include "paddle/fluid/platform/float16.h"
+#include "paddle/phi/common/data_type.h"
 #include "paddle/phi/kernels/funcs/math_function.h"

 DECLARE_bool(cudnn_batchnorm_spatial_persistent);

@@ -44,6 +45,7 @@ class FusedBatchNormAddActKernel<phi::GPUContext, T>
         platform::is_gpu_place(ctx.GetPlace()),
         true,
         platform::errors::PreconditionNotMet("It must use CUDAPlace."));
+    auto &dev_ctx = ctx.template device_context<phi::GPUContext>();
     double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
     float momentum = ctx.Attr<float>("momentum");
     std::string act_type = ctx.Attr<std::string>("act_type");

@@ -66,23 +68,26 @@ class FusedBatchNormAddActKernel<phi::GPUContext, T>
     auto *mean_out = ctx.Output<Tensor>("MeanOut");
     auto *variance_out = ctx.Output<Tensor>("VarianceOut");
-    mean_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
-    variance_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
+    dev_ctx.Alloc<BatchNormParamType<T>>(
+        mean_out, mean_out->numel() * sizeof(BatchNormParamType<T>));
+    dev_ctx.Alloc<BatchNormParamType<T>>(
+        variance_out, variance_out->numel() * sizeof(BatchNormParamType<T>));
     auto *saved_mean = ctx.Output<Tensor>("SavedMean");
     auto *saved_variance = ctx.Output<Tensor>("SavedVariance");
-    saved_mean->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
-    saved_variance->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
+    dev_ctx.Alloc<BatchNormParamType<T>>(
+        saved_mean, saved_mean->numel() * sizeof(BatchNormParamType<T>));
+    dev_ctx.Alloc<BatchNormParamType<T>>(
+        saved_variance, saved_variance->numel() * sizeof(BatchNormParamType<T>));
     auto *y = ctx.Output<Tensor>("Y");
-    y->mutable_data<T>(ctx.GetPlace());
+    dev_ctx.Alloc<T>(y, y->numel() * sizeof(T));
     int N, C, H, W, D;
     const DataLayout data_layout = DataLayout::kNHWC;
     ExtractNCWHD(in_dims, data_layout, &N, &C, &H, &W, &D);
-    auto &dev_ctx = ctx.template device_context<phi::GPUContext>();
     // ------------------- cudnn descriptors ---------------------
     auto handle = dev_ctx.cudnn_handle();
     cudnnTensorDescriptor_t data_desc_;

@@ -149,10 +154,17 @@ class FusedBatchNormAddActKernel<phi::GPUContext, T>
             /*xDesc=*/data_desc_,
             /*sizeInBytes=*/&reserve_space_size));
-    reserve_space_ptr = reserve_space->mutable_data(
-        ctx.GetPlace(), x->dtype(), reserve_space_size);
-    workspace_ptr = workspace_tensor.mutable_data(
-        ctx.GetPlace(), x->dtype(), workspace_size);
+    reserve_space->Resize({static_cast<int64_t>(
+        (reserve_space_size + experimental::SizeOf(x->dtype()) - 1) /
+        experimental::SizeOf(x->dtype()))});
+    reserve_space_ptr =
+        dev_ctx.Alloc<T>(reserve_space, reserve_space->numel() * sizeof(T));
+    workspace_tensor.Resize({static_cast<int64_t>(
+        (workspace_size + experimental::SizeOf(x->dtype()) - 1) /
+        experimental::SizeOf(x->dtype()))});
+    workspace_ptr = dev_ctx.Alloc<T>(
+        &workspace_tensor, workspace_tensor.numel() * sizeof(T));
     PADDLE_ENFORCE_GPU_SUCCESS(
         platform::dynload::cudnnBatchNormalizationForwardTrainingEx(
             handle,

@@ -170,15 +182,18 @@ class FusedBatchNormAddActKernel<phi::GPUContext, T>
             scale->template data<BatchNormParamType<T>>(),
             bias->template data<BatchNormParamType<T>>(),
             this_factor,
-            mean_out->template mutable_data<BatchNormParamType<T>>(
-                ctx.GetPlace()),
-            variance_out->template mutable_data<BatchNormParamType<T>>(
-                ctx.GetPlace()),
+            dev_ctx.template Alloc<BatchNormParamType<T>>(
+                mean_out, mean_out->numel() * sizeof(BatchNormParamType<T>)),
+            dev_ctx.template Alloc<BatchNormParamType<T>>(
+                variance_out,
+                variance_out->numel() * sizeof(BatchNormParamType<T>)),
             epsilon,
-            saved_mean->template mutable_data<BatchNormParamType<T>>(
-                ctx.GetPlace()),
-            saved_variance->template mutable_data<BatchNormParamType<T>>(
-                ctx.GetPlace()),
+            dev_ctx.template Alloc<BatchNormParamType<T>>(
+                saved_mean,
+                saved_mean->numel() * sizeof(BatchNormParamType<T>)),
+            dev_ctx.template Alloc<BatchNormParamType<T>>(
+                saved_variance,
+                saved_variance->numel() * sizeof(BatchNormParamType<T>)),
             activation_desc_,
             workspace_ptr,
             workspace_size,

@@ -212,6 +227,7 @@ class FusedBatchNormAddActGradKernel<phi::GPUContext, T>
     const auto *bias = ctx.Input<Tensor>("Bias");
     const auto *reserve_space = ctx.Input<Tensor>("ReserveSpace");
+    auto &dev_ctx = ctx.template device_context<phi::GPUContext>();
     const auto &in_dims = x->dims();
     int N, C, H, W, D;

@@ -243,8 +259,6 @@ class FusedBatchNormAddActGradKernel<phi::GPUContext, T>
                       platform::errors::PreconditionNotMet(
                           "The size of scale is equal to the channel of Input(X)."));
-    auto &dev_ctx = ctx.template device_context<phi::GPUContext>();
-
     std::vector<int> dims = {N, C, H, W, D};
     std::vector<int> strides = {H * W * C * D, 1, W * D * C, D * C, C};
     // ------------------- cudnn descriptors ---------------------
paddle/fluid/operators/fused/fused_feedforward_op.cu

@@ -57,7 +57,7 @@ static void AllReduce(framework::Tensor& tensor,  // NOLINT
     int64_t numel = tensor.numel();
     const void *sendbuff = tensor.data<T>();
     auto place = ctx.GetPlace();
-    void *recvbuff = tensor.mutable_data<T>(place);
+    void *recvbuff = ctx.Alloc<T>(&tensor, tensor.numel() * sizeof(T));
     auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place);
     auto stream = ctx.stream();
     PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(

@@ -125,7 +125,6 @@ class FusedFeedForwardKernel : public framework::OpKernel<T> {
     FusedDropoutLayerNormHelper<T, uint8_t> fused_dropout_layernorm_helper(
         ctx, bsz_seq, d_model, dropout_param2, epsilon2);
-    auto place = ctx.GetPlace();
     using U = LayerNormParamType<T>;
     const framework::Tensor *in = &x;

@@ -158,7 +157,8 @@ class FusedFeedForwardKernel : public framework::OpKernel<T> {
         dropout1_out->data<T>(),
         dropout1_mask->data<uint8_t>());
     framework::Tensor linear2_out;
-    linear2_out.mutable_data<T>({bsz_seq, d_model}, place);
+    linear2_out.Resize({bsz_seq, d_model});
+    ctx.Alloc<T>(&linear2_out, linear2_out.numel() * sizeof(T));
     MatMul(ctx, *dropout1_out, linear2_weight, &linear2_out);
     // tensor model parallel

@@ -203,6 +203,7 @@ class FusedFeedForwardKernel : public framework::OpKernel<T> {
     auto *linear2_weight = context.Input<framework::Tensor>("Linear2Weight");
     auto *linear2_bias = context.Input<framework::Tensor>("Linear2Bias");
     const bool pre_layer_norm = context.Attr<bool>("pre_layer_norm");
+    auto &dev_ctx = context.template device_context<phi::GPUContext>();
     auto *ln1_scale =
         pre_layer_norm ? context.Input<framework::Tensor>("Ln1Scale") : nullptr;

@@ -245,22 +246,23 @@ class FusedFeedForwardKernel : public framework::OpKernel<T> {
     DropoutParam dropout_param2(context, 2);
     using U = LayerNormParamType<T>;
     auto place = context.GetPlace();
-    out->mutable_data<T>(place);
-    dropout1_mask->mutable_data<uint8_t>(place);
-    dropout2_mask->mutable_data<uint8_t>(place);
+    dev_ctx.Alloc<T>(out, out->numel() * sizeof(T));
+    dev_ctx.Alloc<uint8_t>(dropout1_mask,
+                           dropout1_mask->numel() * sizeof(uint8_t));
+    dev_ctx.Alloc<uint8_t>(dropout2_mask,
+                           dropout2_mask->numel() * sizeof(uint8_t));
     if (pre_layer_norm) {
-      ln1_mean->mutable_data<U>(place);
-      ln1_variance->mutable_data<U>(place);
-      ln1_out->mutable_data<T>(place);
+      dev_ctx.Alloc<U>(ln1_mean, ln1_mean->numel() * sizeof(U));
+      dev_ctx.Alloc<U>(ln1_variance, ln1_variance->numel() * sizeof(U));
+      dev_ctx.Alloc<T>(ln1_out, ln1_out->numel() * sizeof(T));
     } else {
-      ln2_mean->mutable_data<U>(place);
-      ln2_variance->mutable_data<U>(place);
+      dev_ctx.Alloc<U>(ln2_mean, ln2_mean->numel() * sizeof(U));
+      dev_ctx.Alloc<U>(ln2_variance, ln2_variance->numel() * sizeof(U));
     }
-    linear1_out->mutable_data<T>(place);
-    dropout1_out->mutable_data<T>(place);
-    dropout2_out->mutable_data<T>(place);
+    dev_ctx.Alloc<T>(linear1_out, linear1_out->numel() * sizeof(T));
+    dev_ctx.Alloc<T>(dropout1_out, dropout1_out->numel() * sizeof(T));
+    dev_ctx.Alloc<T>(dropout2_out, dropout2_out->numel() * sizeof(T));
     auto x_dim = x->dims();
     auto mat_dim_x = phi::funcs::CreateMatrixDescriptor(

@@ -374,7 +376,6 @@ class FusedFeedForwardGradKernel : public framework::OpKernel<T> {
     FusedDropoutLayerNormHelper<T, uint8_t> fused_dropout_layernorm_helper(
         ctx, bsz_seq, d_model, dropout_param2, epsilon2);
-    auto place = ctx.GetPlace();
     using U = LayerNormParamType<T>;
     const U *ln1_gamma_ptr =
         ln1_gamma == nullptr ? nullptr : ln1_gamma->data<U>();

@@ -396,12 +397,16 @@ class FusedFeedForwardGradKernel : public framework::OpKernel<T> {
     U *d_ln2_beta_ptr = d_ln2_beta == nullptr ? nullptr : d_ln2_beta->data<U>();
     framework::Tensor d_linear2_out, d_dropout2_out, d_residual;
-    d_linear2_out.mutable_data<T>({bsz_seq, d_model}, place);
-    d_dropout2_out.mutable_data<T>({bsz_seq, d_model}, place);
+    d_linear2_out.Resize({bsz_seq, d_model});
+    ctx.Alloc<T>(&d_linear2_out, d_linear2_out.numel() * sizeof(T));
+    d_dropout2_out.Resize({bsz_seq, d_model});
+    ctx.Alloc<T>(&d_dropout2_out, d_dropout2_out.numel() * sizeof(T));
     T *d_residual_ptr = nullptr;
     if (add_residual) {
-      d_residual_ptr = d_residual.mutable_data<T>(d_x->dims(), place);
+      d_residual.Resize(d_x->dims());
+      d_residual_ptr =
+          ctx.Alloc<T>(&d_residual, d_residual.numel() * sizeof(T));
     }
     if (pre_layer_norm) {
       fused_dropout_layernorm_helper.ResidualDropoutBiasGrad(

@@ -429,7 +434,8 @@ class FusedFeedForwardGradKernel : public framework::OpKernel<T> {
     }
     framework::Tensor d_dropout1_out;
-    d_dropout1_out.mutable_data<T>({bsz_seq, dim_feedforward}, place);
+    d_dropout1_out.Resize({bsz_seq, dim_feedforward});
+    ctx.Alloc<T>(&d_dropout1_out, d_dropout1_out.numel() * sizeof(T));
     MatMulGrad(ctx,
                d_linear2_out,
                dropout1_out,

@@ -438,7 +444,8 @@ class FusedFeedForwardGradKernel : public framework::OpKernel<T> {
                d_linear2_weight);
     framework::Tensor d_linear1_out;
-    d_linear1_out.mutable_data<T>({bsz_seq, dim_feedforward}, place);
+    d_linear1_out.Resize({bsz_seq, dim_feedforward});
+    ctx.Alloc<T>(&d_linear1_out, d_linear1_out.numel() * sizeof(T));
     fused_act_dropout_helper.DropoutActBiasGrad(ctx,
                                                 d_dropout1_out.data<T>(),
                                                 linear1_out.data<T>(),

@@ -450,7 +457,8 @@ class FusedFeedForwardGradKernel : public framework::OpKernel<T> {
     if (pre_layer_norm) {
       framework::Tensor d_ln1_out;
-      d_ln1_out.mutable_data<T>({bsz_seq, d_model}, place);
+      d_ln1_out.Resize({bsz_seq, d_model});
+      ctx.Alloc<T>(&d_ln1_out, d_ln1_out.numel() * sizeof(T));
       MatMulGrad(ctx,
                  d_linear1_out,
                  *ln1_out,

@@ -485,6 +493,7 @@ class FusedFeedForwardGradKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext &context) const override {
     using U = LayerNormParamType<T>;
+    auto &dev_ctx = context.template device_context<phi::GPUContext>();
     auto d_out =
         *context.Input<framework::Tensor>(framework::GradVarName("Out"));
     auto x = *context.Input<framework::Tensor>("X");

@@ -550,28 +559,27 @@ class FusedFeedForwardGradKernel : public framework::OpKernel<T> {
     DropoutParam dropout_param1(context, 1);
     DropoutParam dropout_param2(context, 2);
-    auto place = context.GetPlace();
-    d_x->mutable_data<T>(place);
+    dev_ctx.Alloc<T>(d_x, d_x->numel() * sizeof(T));
     if (d_ln1_scale) {
-      d_ln1_scale->mutable_data<U>(place);
+      dev_ctx.Alloc<U>(d_ln1_scale, d_ln1_scale->numel() * sizeof(U));
     }
     if (d_ln1_bias) {
-      d_ln1_bias->mutable_data<U>(place);
+      dev_ctx.Alloc<U>(d_ln1_bias, d_ln1_bias->numel() * sizeof(U));
     }
     if (d_ln2_scale) {
-      d_ln2_scale->mutable_data<U>(place);
+      dev_ctx.Alloc<U>(d_ln2_scale, d_ln2_scale->numel() * sizeof(U));
     }
     if (d_ln2_bias) {
-      d_ln2_bias->mutable_data<U>(place);
+      dev_ctx.Alloc<U>(d_ln2_bias, d_ln2_bias->numel() * sizeof(U));
     }
     if (d_linear1_bias) {
-      d_linear1_bias->mutable_data<T>(place);
+      dev_ctx.Alloc<T>(d_linear1_bias, d_linear1_bias->numel() * sizeof(T));
     }
     if (d_linear2_bias) {
-      d_linear2_bias->mutable_data<T>(place);
+      dev_ctx.Alloc<T>(d_linear2_bias, d_linear2_bias->numel() * sizeof(T));
     }
-    d_linear1_weight->mutable_data<T>(place);
-    d_linear2_weight->mutable_data<T>(place);
+    dev_ctx.Alloc<T>(d_linear1_weight, d_linear1_weight->numel() * sizeof(T));
+    dev_ctx.Alloc<T>(d_linear2_weight, d_linear2_weight->numel() * sizeof(T));
     auto x_dim = x.dims();
     auto mat_dim_x = phi::funcs::CreateMatrixDescriptor(
paddle/fluid/operators/fused/fused_gate_attention.h

@@ -47,7 +47,7 @@ template <typename T>
 void AllocWithDebugInfo(const phi::GPUContext &dev_ctx,
                         const std::string &info,
                         Tensor *t) {
-  t->mutable_data<T>(dev_ctx.GetPlace());
+  dev_ctx.Alloc<T>(t, t->numel() * sizeof(T));
   VLOG(4) << info << ": " << MemoryDebugString(*t);
 }

@@ -505,9 +505,12 @@ class FMHAGateRef {
       k_transpose_out_grad.Resize(config->kv_transpose_out_dims);
       v_transpose_out_grad.Resize(config->kv_transpose_out_dims);
-      q_grad_ptr = q_transpose_out_grad.mutable_data<T>(dev_ctx_.GetPlace());
-      k_grad_ptr = k_transpose_out_grad.mutable_data<T>(dev_ctx_.GetPlace());
-      v_grad_ptr = v_transpose_out_grad.mutable_data<T>(dev_ctx_.GetPlace());
+      q_grad_ptr = dev_ctx_.Alloc<T>(&q_transpose_out_grad,
+                                     q_transpose_out_grad.numel() * sizeof(T));
+      k_grad_ptr = dev_ctx_.Alloc<T>(&k_transpose_out_grad,
+                                     k_transpose_out_grad.numel() * sizeof(T));
+      v_grad_ptr = dev_ctx_.Alloc<T>(&v_transpose_out_grad,
+                                     v_transpose_out_grad.numel() * sizeof(T));
     }

     Tensor softmax_out_grad;
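AllocWithDebugInfo<T> above is a thin wrapper that performs the same Alloc call and then logs the buffer through MemoryDebugString at VLOG level 4. A hedged usage sketch, with a made-up tensor name and shape (the real kernels pass their own intermediates):

    // Assumes fused_gate_attention.h and a phi::GPUContext dev_ctx are in scope.
    int batch_size = 2, num_heads = 8, seq_len = 128;  // illustrative sizes only
    Tensor softmax_out;
    softmax_out.Resize({batch_size, num_heads, seq_len, seq_len});
    // Allocates numel() * sizeof(float) bytes through dev_ctx and logs the buffer.
    AllocWithDebugInfo<float>(dev_ctx, "softmax_out", &softmax_out);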
paddle/fluid/operators/fused/fused_gate_attention_op.cu

@@ -90,7 +90,8 @@ void ComputeMergedQKVMatmulBackward(const framework::ExecutionContext &ctx,
   auto *qkv_weight = ctx.Input<Tensor>("QKVWeight");
   auto *qkv_weight_grad =
       ctx.Output<Tensor>(framework::GradVarName("QKVWeight"));
-  qkv_weight_grad->mutable_data<T>(ctx.GetPlace());
+  auto &dev_ctx = ctx.template device_context<phi::GPUContext>();
+  dev_ctx.Alloc<T>(qkv_weight_grad, qkv_weight_grad->numel() * sizeof(T));
   // Gradient of GEMM(query, qkv_weight)
   int m = config.batch_size * config.seq_len_m * config.seq_len_r;

@@ -160,7 +161,8 @@ void ComputeSeparatedQKVMatmulBackward(const framework::ExecutionContext &ctx,
   const auto *key_weight = ctx.Input<Tensor>("KeyWeight");
   auto *key_weight_grad =
       ctx.Output<Tensor>(framework::GradVarName("KeyWeight"));
-  key_weight_grad->mutable_data<T>(ctx.GetPlace());
+  auto &dev_ctx = ctx.template device_context<phi::GPUContext>();
+  dev_ctx.Alloc<T>(key_weight_grad, key_weight_grad->numel() * sizeof(T));
   int kv_m = config.batch_size * config.seq_len_m * config.m_size;
   int kv_n = config.num_heads * config.head_dim;

@@ -174,7 +176,7 @@ void ComputeSeparatedQKVMatmulBackward(const framework::ExecutionContext &ctx,
   auto *value_weight = ctx.Input<Tensor>("ValueWeight");
   auto *value_weight_grad =
       ctx.Output<Tensor>(framework::GradVarName("ValueWeight"));
-  value_weight_grad->mutable_data<T>(ctx.GetPlace());
+  dev_ctx.Alloc<T>(value_weight_grad, value_weight_grad->numel() * sizeof(T));
   kv_compute.ComputeBackward(key,
                              value_weight,

@@ -188,7 +190,7 @@ void ComputeSeparatedQKVMatmulBackward(const framework::ExecutionContext &ctx,
   const auto *query_weight = ctx.Input<Tensor>("QueryWeight");
   auto *query_weight_grad =
       ctx.Output<Tensor>(framework::GradVarName("QueryWeight"));
-  query_weight_grad->mutable_data<T>(ctx.GetPlace());
+  dev_ctx.Alloc<T>(query_weight_grad, query_weight_grad->numel() * sizeof(T));
   int q_m = config.batch_size * config.seq_len_m * config.seq_len_r;
   int q_n = config.num_heads * config.head_dim;

@@ -242,11 +244,11 @@ void ComputeGatingLinearBackward(const framework::ExecutionContext &ctx,
                                  Tensor *fmha_out_grad) {
   const auto *gate_weight = ctx.Input<Tensor>("GateWeight");
   const auto *gate_bias = ctx.Input<Tensor>("GateBias");
+  auto &dev_ctx = ctx.template device_context<phi::GPUContext>();
   // Re-compute gate_bias_out
   Tensor gate_bias_out;
   gate_bias_out.Resize(config.gate_out_dims);
-  gate_bias_out.mutable_data<T>(ctx.GetPlace());
+  dev_ctx.Alloc<T>(&gate_bias_out, gate_bias_out.numel() * sizeof(T));
   int m = config.batch_size * config.seq_len_m * config.seq_len_r;
   int n = config.num_heads * config.head_dim;

@@ -267,8 +269,8 @@ void ComputeGatingLinearBackward(const framework::ExecutionContext &ctx,
   auto *gate_weight_grad =
       ctx.Output<Tensor>(framework::GradVarName("GateWeight"));
   auto *gate_bias_grad = ctx.Output<Tensor>(framework::GradVarName("GateBias"));
-  gate_weight_grad->mutable_data<T>(ctx.GetPlace());
-  gate_bias_grad->mutable_data<T>(ctx.GetPlace());
+  dev_ctx.Alloc<T>(gate_weight_grad, gate_weight_grad->numel() * sizeof(T));
+  dev_ctx.Alloc<T>(gate_bias_grad, gate_bias_grad->numel() * sizeof(T));
   gate_attn_compute.ComputeBackward(query,
                                     gate_weight,

@@ -301,6 +303,7 @@ void ComputeOutputLinearBackward(const framework::ExecutionContext &ctx,
                                  const GateAttentionGradConfig<T> &config,
                                  const Tensor *input,
                                  Tensor *input_grad) {
+  auto &dev_ctx = ctx.template device_context<phi::GPUContext>();
   const auto *out_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
   const auto *out_linear_weight = ctx.Input<Tensor>("OutLinearWeight");

@@ -309,8 +312,10 @@ void ComputeOutputLinearBackward(const framework::ExecutionContext &ctx,
   auto *out_linear_bias_grad =
       ctx.Output<Tensor>(framework::GradVarName("OutLinearBias"));
-  out_linear_weight_grad->mutable_data<T>(ctx.GetPlace());
-  out_linear_bias_grad->mutable_data<T>(ctx.GetPlace());
+  dev_ctx.Alloc<T>(out_linear_weight_grad,
+                   out_linear_weight_grad->numel() * sizeof(T));
+  dev_ctx.Alloc<T>(out_linear_bias_grad,
+                   out_linear_bias_grad->numel() * sizeof(T));
   int m = config.batch_size * config.seq_len_m * config.seq_len_r;
   int n = config.q_dim;
paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu
浏览文件 @
4bbbed9a
...
...
@@ -46,7 +46,7 @@ class FusedGemmEpilogueKernel : public framework::OpKernel<T> {
<<
" , activation = "
<<
activation
;
bool
enable_auxiliary
=
reserve_space
==
nullptr
?
false
:
true
;
out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
(
));
dev_ctx
->
Alloc
<
T
>
(
out
,
out
->
numel
()
*
sizeof
(
T
));
auto
*
out_data
=
out
->
data
<
T
>
();
auto
x_mat_dims
=
...
...
@@ -110,8 +110,7 @@ class FusedGemmEpilogueKernel : public framework::OpKernel<T> {
}
else
{
reserve_space_size
=
phi
::
product
(
out
->
dims
())
*
sizeof
(
T
);
}
reserve_space
->
mutable_data
(
ctx
.
GetPlace
(),
out
->
type
(),
reserve_space_size
);
dev_ctx
->
Alloc
(
reserve_space
,
out
->
type
(),
reserve_space_size
);
void
*
aux_data
=
reinterpret_cast
<
void
*>
(
reserve_space
->
data
<
T
>
());
PADDLE_ENFORCE_GPU_SUCCESS
(
...
...
@@ -493,7 +492,7 @@ class FusedGemmEpilogueGradKernel : public framework::OpKernel<T> {
workspace_size
,
phi
::
Stream
(
reinterpret_cast
<
phi
::
StreamId
>
(
dev_ctx
.
stream
())));
auto
*
dx_data
=
d
x
->
mutable_data
<
T
>
(
ctx
.
GetPlace
(
));
auto
*
dx_data
=
d
ev_ctx
->
Alloc
<
T
>
(
dx
,
dx
->
numel
()
*
sizeof
(
T
));
const
auto
*
y_data
=
y
->
data
<
T
>
();
const
auto
*
dout_data
=
dout
->
data
<
T
>
();
const
auto
*
a_data
=
kXGradAIsDZ
?
dout_data
:
y_data
;
...
...
@@ -601,7 +600,7 @@ class FusedGemmEpilogueGradKernel : public framework::OpKernel<T> {
sizeof
(
epiloque_func_for_dy
)));
if
(
dbias
)
{
auto
*
dbias_data
=
d
bias
->
mutable_data
<
T
>
(
ctx
.
GetPlace
(
));
auto
*
dbias_data
=
d
ev_ctx
->
Alloc
<
T
>
(
dbias
,
dbias
->
numel
()
*
sizeof
(
T
));
PADDLE_ENFORCE_GPU_SUCCESS
(
platform
::
dynload
::
cublasLtMatmulDescSetAttribute
(
dy_operation_desc
,
...
...
@@ -614,7 +613,7 @@ class FusedGemmEpilogueGradKernel : public framework::OpKernel<T> {
        dev_ctx.GetPlace(),
        workspace_size,
        phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
    auto *dy_data = dy->mutable_data<T>(ctx.GetPlace());
    auto *dy_data = dev_ctx->Alloc<T>(dy, dy->numel() * sizeof(T));
    const auto *dout_data = dout->data<T>();
    const auto *x_data = x->data<T>();
    const auto *a_data = kYGradAIsDZ ? dout_data : x_data;
...
...
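The reserve_space change in this file uses the untyped Alloc overload, which takes the data type at run time instead of a template parameter. Below is a hedged sketch of that variant; the helper name and parameter list are illustrative, and the dtype follows the forward output as in the hunk.

#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/dense_tensor.h"

// Sketch only: allocate auxiliary storage whose dtype follows another tensor,
// mirroring dev_ctx->Alloc(reserve_space, out->type(), reserve_space_size).
void AllocReserveSpace(const phi::GPUContext &dev_ctx,
                       const phi::DenseTensor &out,
                       size_t reserve_space_size,
                       phi::DenseTensor *reserve_space) {
  // The byte count is computed by the caller
  // (phi::product(out.dims()) * sizeof(T) in the kernel above).
  dev_ctx.Alloc(reserve_space, out.dtype(), reserve_space_size);
}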
paddle/fluid/operators/fused/fused_multi_transformer_op.cu
Browse file @ 4bbbed9a
...
...
@@ -70,7 +70,7 @@ static void AllReduce(framework::Tensor &tensor, // NOLINT
  int64_t numel = tensor.numel();
  const void *sendbuff = tensor.data<T>();
  auto place = ctx.GetPlace();
  void *recvbuff = tensor.mutable_data<T>(place);
  void *recvbuff = ctx.Alloc<T>(&tensor, tensor.numel() * sizeof(T));
  auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place);
  auto stream = ctx.stream();
  PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(
...
...
@@ -1161,7 +1161,6 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
    using U = LayerNormParamType<T>;
    auto place = ctx.GetPlace();
    auto &dev_ctx = ctx.cuda_device_context();
    auto *time_step = ctx.Input<Tensor>("TimeStep");
...
...
@@ -1181,8 +1180,11 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel<T> {
    auto ln_compute = AttnLayerNorm<T>(dev_ctx, epsilon, bsz_seq, dim_embed);
    Tensor ln_mean, ln_var;
    auto *ln_mean_data = ln_mean.mutable_data<U>({bsz_seq}, place);
    auto *ln_var_data = ln_var.mutable_data<U>({bsz_seq}, place);
    ln_mean.Resize({{bsz_seq}});
    auto *ln_mean_data =
        dev_ctx.Alloc<U>(&ln_mean, ln_mean.numel() * sizeof(U));
    ln_var.Resize({{bsz_seq}});
    auto *ln_var_data = dev_ctx.Alloc<U>(&ln_var, ln_var.numel() * sizeof(U));
    // 2. qkv
    // x: qkv's input [batch_size, seq_len, dim_embed]
...
...
@@ -1207,8 +1209,9 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel<T> {
                                             input_size,
                                             compute_bias);
    Tensor qkv_out;
    qkv_out.Resize({{bsz, seq_len, 3, num_head, dim_head}});
    auto *qkv_out_data =
        qkv_out.mutable_data<T>({bsz, seq_len, 3, num_head, dim_head}, place);
    dev_ctx.Alloc<T>(&qkv_out, qkv_out.numel() * sizeof(T));
    // 3. fmha
    AttnDropoutParam attn_param(
...
...
@@ -1243,26 +1246,32 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel<T> {
    }
    Tensor transpose_out_2, qk_out;
    auto *transpose_out_2_data = transpose_out_2.mutable_data<T>(
        {3, bsz, num_head, seq_len, dim_head}, place);
    auto *qk_out_data =
        qk_out.mutable_data<T>({bsz, num_head, seq_len, out_seq_len}, place);
    transpose_out_2.Resize({{3, bsz, num_head, seq_len, dim_head}});
    auto *transpose_out_2_data = dev_ctx.Alloc<T>(
        &transpose_out_2, transpose_out_2.numel() * sizeof(T));
    qk_out.Resize({{bsz, num_head, seq_len, out_seq_len}});
    auto *qk_out_data = dev_ctx.Alloc<T>(&qk_out, qk_out.numel() * sizeof(T));
    Tensor softmax_out;
    Tensor attn_dropout_mask_out, attn_dropout_out;
    Tensor qktv_out, fmha_out;
    auto *softmax_out_data = softmax_out.mutable_data<T>(
        {bsz, num_head, seq_len, out_seq_len}, place);
    auto *attn_dropout_mask_out_data = attn_dropout_mask_out.mutable_data<T>(
        {bsz, num_head, seq_len, out_seq_len}, place);
    auto *attn_dropout_data_data = attn_dropout_out.mutable_data<T>(
        {bsz, num_head, seq_len, out_seq_len}, place);
    softmax_out.Resize({{bsz, num_head, seq_len, out_seq_len}});
    auto *softmax_out_data =
        dev_ctx.Alloc<T>(&softmax_out, softmax_out.numel() * sizeof(T));
    attn_dropout_mask_out.Resize({{bsz, num_head, seq_len, out_seq_len}});
    auto *attn_dropout_mask_out_data = dev_ctx.Alloc<T>(
        &attn_dropout_mask_out, attn_dropout_mask_out.numel() * sizeof(T));
    attn_dropout_out.Resize({{bsz, num_head, seq_len, out_seq_len}});
    auto *attn_dropout_data_data = dev_ctx.Alloc<T>(
        &attn_dropout_out, attn_dropout_out.numel() * sizeof(T));
    qktv_out.Resize({{bsz, num_head, seq_len, dim_head}});
    auto *qktv_out_data =
        qktv_out.mutable_data<T>({bsz, num_head, seq_len, dim_head}, place);
    dev_ctx.Alloc<T>(&qktv_out, qktv_out.numel() * sizeof(T));
    fmha_out.Resize({{bsz, seq_len, num_head, dim_head}});
    auto *fmha_out_data =
        fmha_out.mutable_data<T>({bsz, seq_len, num_head, dim_head}, place);
    dev_ctx.Alloc<T>(&fmha_out, fmha_out.numel() * sizeof(T));
    // 4. out_linear
    auto out_linear_weights = ctx.MultiInput<Tensor>("OutLinearW");
...
...
@@ -1281,12 +1290,14 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel<T> {
    Tensor bias_dropout_residual_out, dropout_mask_out;
    T *bias_dropout_residual_out_data = nullptr;
    if (pre_layer_norm) {
      bias_dropout_residual_out.Resize({{bsz, seq_len, dim_embed}});
      bias_dropout_residual_out_data =
          bias_dropout_residual_out.mutable_data<T>({bsz, seq_len, dim_embed},
                                                    place);
      dev_ctx.Alloc<T>(&bias_dropout_residual_out,
                       bias_dropout_residual_out.numel() * sizeof(T));
    }
    auto *dropout_mask_out_data = dropout_mask_out.mutable_data<uint8_t>(
        {bsz, seq_len, dim_embed}, place);
    dropout_mask_out.Resize({{bsz, seq_len, dim_embed}});
    auto *dropout_mask_out_data = dev_ctx.Alloc<uint8_t>(
        &dropout_mask_out, dropout_mask_out.numel() * sizeof(uint8_t));
    // 6. ffn matmul1
    auto ffn1_weights = ctx.MultiInput<Tensor>("FFN1Weight");
...
...
@@ -1297,17 +1308,21 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel<T> {
    auto ffn1_linear_compute = AttnMatMul<T>(
        dev_ctx, false, false, bsz_seq, dim_ffn, dim_embed, false);
    Tensor ffn1_out;
    auto *ffn1_out_data = ffn1_out.mutable_data<T>({bsz_seq, dim_ffn}, place);
    ffn1_out.Resize({{bsz_seq, dim_ffn}});
    auto *ffn1_out_data =
        dev_ctx.Alloc<T>(&ffn1_out, ffn1_out.numel() * sizeof(T));
    // 7. ffn act + bias
    DropoutParam ffn1_dropout_param(true, 0, true, true, 0.0, nullptr, 0);
    FusedDropoutHelper<T, uint8_t> fused_act_dropout_helper(
        dev_ctx, bsz_seq, dim_ffn, ffn1_dropout_param);
    Tensor ffn1_dropout_out, ffn1_dropout_mask;
    auto *ffn1_dropout_out_data =
        ffn1_dropout_out.mutable_data<T>({bsz_seq, dim_ffn}, place);
    auto *ffn1_dropout_mask_data =
        ffn1_dropout_mask.mutable_data<uint8_t>({bsz_seq, dim_ffn}, place);
    ffn1_dropout_out.Resize({{bsz_seq, dim_ffn}});
    auto *ffn1_dropout_out_data = dev_ctx.Alloc<T>(
        &ffn1_dropout_out, ffn1_dropout_out.numel() * sizeof(T));
    ffn1_dropout_mask.Resize({{bsz_seq, dim_ffn}});
    auto *ffn1_dropout_mask_data = dev_ctx.Alloc<uint8_t>(
        &ffn1_dropout_mask, ffn1_dropout_mask.numel() * sizeof(uint8_t));
    // 8. ffn2 matmul
    auto ffn2_weights = ctx.MultiInput<Tensor>("FFN2Weight");
...
...
@@ -1322,11 +1337,12 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel<T> {
    // calc
    auto *out = ctx.Output<Tensor>("Out");
    auto *from_data = out->mutable_data<T>(place);
    auto *from_data = dev_ctx.Alloc<T>(out, out->numel() * sizeof(T));
    Tensor *from_tensor = out;
    Tensor tmp_out;
    tmp_out.Resize({{bsz, seq_len, dim_embed}});
    auto *tmp_out_data =
        tmp_out.mutable_data<T>({bsz, seq_len, dim_embed}, place);
    dev_ctx.Alloc<T>(&tmp_out, tmp_out.numel() * sizeof(T));
    auto *x_data = input_x->data<T>();
    Tensor *buf0 = nullptr;
...
...
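Most of the fused_multi_transformer changes swap the shape-setting mutable_data<T>(dims, place) overload for an explicit Resize followed by Alloc, since the device-context call does not set the shape itself. A minimal sketch of that two-step pattern for a scratch tensor follows; the helper name is illustrative.

#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/core/dense_tensor.h"

// Sketch of the Resize-then-Alloc pattern used for the workspace tensors above
// (qkv_out, softmax_out, ffn1_out, ...).
template <typename T>
T *AllocScratch(const phi::GPUContext &dev_ctx,
                phi::DenseTensor *scratch,
                const phi::DDim &dims) {
  // The old overload set the shape inside mutable_data<T>(dims, place).
  scratch->Resize(dims);
  return dev_ctx.Alloc<T>(scratch, scratch->numel() * sizeof(T));
}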
paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu
Browse file @ 4bbbed9a
...
...
@@ -426,7 +426,7 @@ class FusedSeqpoolCVMCUDAKernel : public framework::OpKernel<T> {
  void Compute(const framework::ExecutionContext &ctx) const override {
    auto inputs = ctx.MultiInput<LoDTensor>("X");
    auto outputs = ctx.MultiOutput<framework::Tensor>("Out");
    auto &dev_ctx = ctx.template device_context<phi::GPUContext>();
    const auto slot_size = inputs.size();
    std::vector<const float *> input_data(slot_size);
    std::vector<const size_t *> lods_data(slot_size);
...
...
@@ -478,13 +478,13 @@ class FusedSeqpoolCVMCUDAKernel : public framework::OpKernel<T> {
      } else {
        output->Resize({batch_size, embedding_size - cvm_offset});
      }
      output_data[i] =
          reinterpret_cast<T *>(output->mutable_data<T>(ctx.GetPlace()));
      output_data[i] = reinterpret_cast<T *>(
          dev_ctx.Alloc<T>(output, output->numel() * sizeof(T)));
      mix_lods_v[i] = new paddle::framework::MixVector<size_t>(&lods);
      lods_data[i] = mix_lods_v[i]->CUDAData(ctx.GetPlace());
      seqpool_output_data[i] =
          reinterpret_cast<T *>(seqpool_outputs[i].mutable_data<T>(
              {batch_size, embedding_size}, ctx.GetPlace()));
      seqpool_outputs[i].Resize({batch_size, embedding_size});
      seqpool_output_data[i] = reinterpret_cast<T *>(dev_ctx.Alloc<T>(
          &seqpool_outputs[i], seqpool_outputs[i].numel() * sizeof(T)));
    }
    FusedSeqpoolCVM(ctx,
...
...
@@ -512,7 +512,7 @@ class FusedSeqpoolCVMGradCUDAKernel : public framework::OpKernel<T> {
    auto out_grads = ctx.MultiInput<LoDTensor>(framework::GradVarName("Out"));
    auto in_grads = ctx.MultiOutput<LoDTensor>(framework::GradVarName("X"));
    auto *cvm = ctx.Input<LoDTensor>("CVM");
    auto &dev_ctx = ctx.template device_context<phi::GPUContext>();
    std::string pooltype = ctx.Attr<std::string>("pooltype");
    auto use_cvm = ctx.Attr<bool>("use_cvm");
    const int cvm_offset = ctx.Attr<int>("cvm_offset");
...
...
@@ -559,8 +559,8 @@ class FusedSeqpoolCVMGradCUDAKernel : public framework::OpKernel<T> {
      auto *out_grad = out_grads[i];
      out_grads_data[i] = reinterpret_cast<const T *>(out_grad->data<T>());
      in_grads_data[i] =
          reinterpret_cast<T *>(in_grad->mutable_data<T>(ctx.GetPlace()));
      in_grads_data[i] = reinterpret_cast<T *>(
          dev_ctx.Alloc<T>(in_grad, in_grad->numel() * sizeof(T)));
      mix_lods_v[i] = new paddle::framework::MixVector<size_t>(&lods);
      lods_data[i] = mix_lods_v[i]->CUDAData(ctx.GetPlace());
      cvm_data[i] = reinterpret_cast<const T *>(cvm->data<T>());
...
...
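The seqpool kernels previously never held a device context, so the hunks above first fetch one from the execution context before any Alloc call. A hedged sketch of that setup inside a fluid GPU kernel follows; the class name and the "Out" output are illustrative, not taken from the operator.

#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/backends/gpu/gpu_context.h"

// Sketch only: obtain the phi::GPUContext inside Compute() and allocate the
// output through it instead of calling out->mutable_data<T>(place).
template <typename T>
class ExampleCUDAKernel : public paddle::framework::OpKernel<T> {
 public:
  void Compute(const paddle::framework::ExecutionContext &ctx) const override {
    auto &dev_ctx = ctx.template device_context<phi::GPUContext>();
    auto *out = ctx.Output<paddle::framework::Tensor>("Out");
    dev_ctx.Alloc<T>(out, out->numel() * sizeof(T));
  }
};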
paddle/fluid/operators/fused/fusion_conv_inception_op.cu
Browse file @ 4bbbed9a
...
...
@@ -55,8 +55,10 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> {
        static_cast<size_t>(ctx.Attr<int>("workspace_size_MB"));
    const T *input_data = input->data<T>();
    T *output_data = output->mutable_data<T>(ctx.GetPlace());
    T *temp_data = temp_outs[0]->mutable_data<T>(input->dims(), ctx.GetPlace());
    T *output_data = dev_ctx.Alloc<T>(output, output->numel() * sizeof(T));
    temp_outs[0]->Resize(input->dims());
    T *temp_data =
        dev_ctx.Alloc<T>(temp_outs[0], temp_outs[0]->numel() * sizeof(T));
    DataLayout layout = DataLayout::kNCHW;
    std::vector<int> in_dim = phi::vectorize<int>(input->dims());
...
...
@@ -254,8 +256,9 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> {
    in_datas.push_back(static_cast<const void *>(input_data));
    in_datas.push_back(
        static_cast<const void *>(output_data + (oc0 + oc1) * h * w));
    T *temp2_data = temp_outs[1]->mutable_data<T>(phi::make_ddim(out_dims[2]),
                                                  ctx.GetPlace());
    temp_outs[1]->Resize(phi::make_ddim(out_dims[2]));
    T *temp2_data =
        dev_ctx.Alloc<T>(temp_outs[1], temp_outs[1]->numel() * sizeof(T));
    in_datas.push_back(static_cast<const void *>(temp2_data + oc2 * h * w));
    std::vector<void *> out_datas;
...
...