Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
2afa9b76
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
2afa9b76
编写于
7月 07, 2022
作者:
W
wanghuancoder
提交者:
GitHub
7月 07, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[Eager] Manual fused attention in eager (#43974)
* fused_gate_attention manual code in eager
上级
9aaae254
变更
9
展开全部
隐藏空白更改
内联
并排
Showing
9 changed files
with
1250 additions
and
9 deletions
+1250
-9
paddle/fluid/eager/api/manual/fluid_manual/dygraph_forward_api.h
...fluid/eager/api/manual/fluid_manual/dygraph_forward_api.h
+34
-0
paddle/fluid/eager/api/manual/fluid_manual/forwards/CMakeLists.txt
...uid/eager/api/manual/fluid_manual/forwards/CMakeLists.txt
+8
-0
paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_attention_fwd_func.cc
.../manual/fluid_manual/forwards/fused_attention_fwd_func.cc
+628
-0
paddle/fluid/eager/api/manual/fluid_manual/nodes/CMakeLists.txt
.../fluid/eager/api/manual/fluid_manual/nodes/CMakeLists.txt
+6
-1
paddle/fluid/eager/api/manual/fluid_manual/nodes/fused_attention_node.cc
...ger/api/manual/fluid_manual/nodes/fused_attention_node.cc
+366
-0
paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h
paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h
+202
-0
paddle/fluid/eager/auto_code_generator/eager_generator.cc
paddle/fluid/eager/auto_code_generator/eager_generator.cc
+4
-2
python/paddle/fluid/tests/unittests/test_fused_attention_op.py
...n/paddle/fluid/tests/unittests/test_fused_attention_op.py
+1
-3
python/paddle/fluid/tests/unittests/test_fused_gate_attention_op.py
...dle/fluid/tests/unittests/test_fused_gate_attention_op.py
+1
-3
未找到文件。
paddle/fluid/eager/api/manual/fluid_manual/dygraph_forward_api.h
浏览文件 @
2afa9b76
...
...
@@ -67,3 +67,37 @@ fused_feedforward_dygraph_function(
const
paddle
::
experimental
::
Tensor
&
Ln2Scale
,
const
paddle
::
experimental
::
Tensor
&
Ln2Bias
,
const
paddle
::
framework
::
AttributeMap
&
attr_map
);
// Forward entry point of the manually written eager-mode `fused_attention` op.
//
// Inputs: eleven tensors (X, LnScale, LnBias, QKVW, QKVBias, CacheKV, SrcMask,
// OutLinearW, OutLinearBias, Ln2Scale, Ln2Bias) followed by the op's attribute
// map. Returns a tuple of twenty tensors.
// NOTE(review): the meaning/order of the twenty returned tensors is defined by
// the implementation, which is not visible here -- confirm against the .cc.
std::tuple<paddle::experimental::Tensor,
           paddle::experimental::Tensor,
           paddle::experimental::Tensor,
           paddle::experimental::Tensor,
           paddle::experimental::Tensor,
           paddle::experimental::Tensor,
           paddle::experimental::Tensor,
           paddle::experimental::Tensor,
           paddle::experimental::Tensor,
           paddle::experimental::Tensor,
           paddle::experimental::Tensor,
           paddle::experimental::Tensor,
           paddle::experimental::Tensor,
           paddle::experimental::Tensor,
           paddle::experimental::Tensor,
           paddle::experimental::Tensor,
           paddle::experimental::Tensor,
           paddle::experimental::Tensor,
           paddle::experimental::Tensor,
           paddle::experimental::Tensor>
fused_attention_dygraph_function(
    const paddle::experimental::Tensor& X,
    const paddle::experimental::Tensor& LnScale,
    const paddle::experimental::Tensor& LnBias,
    const paddle::experimental::Tensor& QKVW,
    const paddle::experimental::Tensor& QKVBias,
    const paddle::experimental::Tensor& CacheKV,
    const paddle::experimental::Tensor& SrcMask,
    const paddle::experimental::Tensor& OutLinearW,
    const paddle::experimental::Tensor& OutLinearBias,
    const paddle::experimental::Tensor& Ln2Scale,
    const paddle::experimental::Tensor& Ln2Bias,
    const paddle::framework::AttributeMap& attr_map);
paddle/fluid/eager/api/manual/fluid_manual/forwards/CMakeLists.txt
浏览文件 @
2afa9b76
...
...
@@ -12,6 +12,14 @@ cc_library(
add_dependencies
(
fused_feedforward_fwd_func eager_codegen
)
cc_library
(
fused_attention_fwd_func
SRCS fused_attention_fwd_func.cc
DEPS
${
eager_deps
}
${
fluid_deps
}
${
GLOB_OP_LIB
}
${
GLOB_OPERATOR_DEPS
}
)
add_dependencies
(
fused_attention_fwd_func eager_codegen
)
set
(
fluid_manual_functions
fused_gate_attention_fwd_func fused_feedforward_fwd_func
fused_attention_fwd_func
PARENT_SCOPE
)
paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_attention_fwd_func.cc
0 → 100644
浏览文件 @
2afa9b76
此差异已折叠。
点击以展开。
paddle/fluid/eager/api/manual/fluid_manual/nodes/CMakeLists.txt
浏览文件 @
2afa9b76
...
...
@@ -8,6 +8,11 @@ cc_library(
SRCS fused_feedforward_node.cc
DEPS
${
eager_deps
}
${
fluid_deps
}
)
cc_library
(
fused_attention_node
SRCS fused_attention_node.cc
DEPS
${
eager_deps
}
${
fluid_deps
}
)
set
(
fluid_manual_nodes
fused_gate_attention_node fused_feedforward_node
fused_gate_attention_node fused_feedforward_node
fused_attention_node
PARENT_SCOPE
)
paddle/fluid/eager/api/manual/fluid_manual/nodes/fused_attention_node.cc
0 → 100644
浏览文件 @
2afa9b76
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "glog/logging.h"
#include "paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h"
#include "paddle/fluid/eager/api/utils/global_utils.h"
#include "paddle/fluid/eager/utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/imperative/tracer.h"
#include "paddle/phi/api/all.h"
// Runs the backward pass of `fused_attention` by tracing the legacy
// `fused_attention_grad` operator.
//
// Steps:
//   1. Apply the registered gradient hooks to `grads`; slot 19 of the hooked
//      gradients is fed to the grad op as "Y@GRAD".
//   2. Build the grad op's input map from the tensor wrappers saved on this
//      node. Optional inputs (QKVBias, SrcMask, OutLinearBias and the
//      layer-norm tensors) are only added when their wrapper recovers to a
//      defined tensor; which layer-norm set is used depends on the
//      "pre_layer_norm" attribute (false when the attribute is absent).
//   3. Create a fresh output variable "<Name>@GRAD" for every slot whose
//      output meta is non-empty and not marked stop-gradient.
//   4. Trace the op, then copy every produced "<Name>@GRAD" into its slot of
//      the returned vector.
//
// Output slot layout used below:
//    0 X              1 LnScale        2 LnBias         3 QKVW
//    4 QKVBias        7 OutLinearW     8 OutLinearBias  9 Ln2Scale
//   10 Ln2Bias       11 QKVBiasOut    12 SrcMaskOut    13 LnOut
//   14 BiasDropoutResidualOut         15 QKVOut        16 QKTVOut
//   17 TransposeOut2 18 QKOut         19 SoftmaxOut    20 AttnDropoutOut
//   21 FMHAOut       22 OutLinearOut
// Slots 5 and 6 are never written and stay empty.
//
// @param grads         incoming gradients, one vector per forward output slot.
// @param create_graph  not consulted by this implementation.
// @param is_new_grad   not consulted by this implementation.
// @return 23 slots of gradient tensors; slots without a requested gradient
//         are left empty.
paddle::small_vector<std::vector<paddle::experimental::Tensor>,
                     egr::kSlotSmallVectorSize>
fused_attentionGradNodeCompat::operator()(
    paddle::small_vector<std::vector<paddle::experimental::Tensor>,
                         egr::kSlotSmallVectorSize>& grads,
    bool create_graph,
    bool is_new_grad) {
  using TensorSlots =
      paddle::small_vector<std::vector<paddle::experimental::Tensor>,
                           egr::kSlotSmallVectorSize>;
  using VarList = std::vector<std::shared_ptr<egr::EagerVariable>>;
  using VarMap = std::map<std::string, VarList>;

  VLOG(3) << "Running Eager Backward Node: fused_attentionGradNodeCompat";

  const auto& out_metas = OutputMeta();

  TensorSlots outputs(23);
  TensorSlots hooked_grads0 =
      fused_attentionGradNodeCompat::ApplyGradientHooks(grads);

  bool pre_layer_norm = false;
  if (attr_map_.count("pre_layer_norm")) {
    pre_layer_norm = BOOST_GET_CONST(bool, attr_map_.at("pre_layer_norm"));
  }

  // Intermediates that are both grad-op inputs and candidates for an output
  // gradient: recover each wrapper once and reuse the tensor for both roles.
  const auto AttnDropoutOut =
      egr::EagerUtils::RecoverTensorWrapper(&this->AttnDropoutOut_);
  const auto FMHAOut = egr::EagerUtils::RecoverTensorWrapper(&this->FMHAOut_);
  const auto OutLinearOut =
      egr::EagerUtils::RecoverTensorWrapper(&this->OutLinearOut_);
  const auto QKOut = egr::EagerUtils::RecoverTensorWrapper(&this->QKOut_);
  const auto QKTVOut = egr::EagerUtils::RecoverTensorWrapper(&this->QKTVOut_);
  const auto QKVOut = egr::EagerUtils::RecoverTensorWrapper(&this->QKVOut_);
  const auto SoftmaxOut =
      egr::EagerUtils::RecoverTensorWrapper(&this->SoftmaxOut_);
  const auto TransposeOut2 =
      egr::EagerUtils::RecoverTensorWrapper(&this->TransposeOut2_);

  VarMap ins0 = {
      {"AttnDropoutMaskOut",
       egr::EagerUtils::TrySyncToVars(egr::EagerUtils::RecoverTensorWrapper(
           &this->AttnDropoutMaskOut_))},
      {"AttnDropoutOut", egr::EagerUtils::TrySyncToVars(AttnDropoutOut)},
      {"DropoutMaskOut",
       egr::EagerUtils::TrySyncToVars(
           egr::EagerUtils::RecoverTensorWrapper(&this->DropoutMaskOut_))},
      {"FMHAOut", egr::EagerUtils::TrySyncToVars(FMHAOut)},
      {"OutLinearOut", egr::EagerUtils::TrySyncToVars(OutLinearOut)},
      {"OutLinearW",
       egr::EagerUtils::TrySyncToVars(
           egr::EagerUtils::RecoverTensorWrapper(&this->OutLinearW_))},
      {"QKOut", egr::EagerUtils::TrySyncToVars(QKOut)},
      {"QKTVOut", egr::EagerUtils::TrySyncToVars(QKTVOut)},
      {"QKVOut", egr::EagerUtils::TrySyncToVars(QKVOut)},
      {"QKVW",
       egr::EagerUtils::TrySyncToVars(
           egr::EagerUtils::RecoverTensorWrapper(&this->QKVW_))},
      {"SoftmaxOut", egr::EagerUtils::TrySyncToVars(SoftmaxOut)},
      {"TransposeOut2", egr::EagerUtils::TrySyncToVars(TransposeOut2)},
      {"X",
       egr::EagerUtils::TrySyncToVars(
           egr::EagerUtils::RecoverTensorWrapper(&this->X_))},
      {"Y@GRAD", egr::EagerUtils::TrySyncToVars(hooked_grads0[19])}};

  VarMap outs0;

  // True when the output meta of `slot` is present and not stop-gradient.
  const auto needs_grad = [&out_metas](size_t slot) {
    return (!out_metas[slot].empty()) &&
           (!out_metas[slot][0].IsStopGradient());
  };
  // A single freshly named variable for the grad op to write into.
  const auto new_grad_var = []() -> VarList {
    return {std::make_shared<egr::EagerVariable>(
        egr::Controller::Instance().GenerateUniqueName())};
  };
  // Requests `grad_name` as a grad-op output when `tensor` is defined and the
  // slot takes part in autograd.
  const auto request_grad = [&](const char* grad_name,
                                const paddle::experimental::Tensor& tensor,
                                size_t slot) {
    if (tensor.defined() && needs_grad(slot)) {
      outs0[grad_name] = new_grad_var();
    }
  };

  // Gradients of the always-present inputs depend on the slot meta only.
  if (needs_grad(7)) {
    outs0.insert({"OutLinearW@GRAD", new_grad_var()});
  }
  if (needs_grad(3)) {
    outs0.insert({"QKVW@GRAD", new_grad_var()});
  }
  if (needs_grad(0)) {
    outs0.insert({"X@GRAD", new_grad_var()});
  }

  request_grad("QKVOut@GRAD", QKVOut, 15);
  request_grad("QKTVOut@GRAD", QKTVOut, 16);
  request_grad("TransposeOut2@GRAD", TransposeOut2, 17);
  request_grad("QKOut@GRAD", QKOut, 18);
  request_grad("SoftmaxOut@GRAD", SoftmaxOut, 19);
  request_grad("AttnDropoutOut@GRAD", AttnDropoutOut, 20);
  request_grad("FMHAOut@GRAD", FMHAOut, 21);
  request_grad("OutLinearOut@GRAD", OutLinearOut, 22);

  const auto QKVBias = egr::EagerUtils::RecoverTensorWrapper(&this->QKVBias_);
  if (QKVBias.defined()) {
    ins0["QKVBias"] = egr::EagerUtils::TrySyncToVars(QKVBias);
    const auto QKVBiasOut =
        egr::EagerUtils::RecoverTensorWrapper(&this->QKVBiasOut_);
    ins0["QKVBiasOut"] = egr::EagerUtils::TrySyncToVars(QKVBiasOut);
    request_grad("QKVBias@GRAD", QKVBias, 4);
    request_grad("QKVBiasOut@GRAD", QKVBiasOut, 11);
  }

  const auto SrcMask = egr::EagerUtils::RecoverTensorWrapper(&this->SrcMask_);
  if (SrcMask.defined()) {
    ins0["SrcMask"] = egr::EagerUtils::TrySyncToVars(SrcMask);
    const auto SrcMaskOut =
        egr::EagerUtils::RecoverTensorWrapper(&this->SrcMaskOut_);
    ins0["SrcMaskOut"] = egr::EagerUtils::TrySyncToVars(SrcMaskOut);
    request_grad("SrcMaskOut@GRAD", SrcMaskOut, 12);
  }

  const auto OutLinearBias =
      egr::EagerUtils::RecoverTensorWrapper(&this->OutLinearBias_);
  if (OutLinearBias.defined()) {
    ins0["OutLinearBias"] = egr::EagerUtils::TrySyncToVars(OutLinearBias);
    request_grad("OutLinearBias@GRAD", OutLinearBias, 8);
  }

  if (pre_layer_norm) {
    const auto LnScale =
        egr::EagerUtils::RecoverTensorWrapper(&this->LnScale_);
    if (LnScale.defined()) {
      ins0["LnScale"] = egr::EagerUtils::TrySyncToVars(LnScale);
      request_grad("LnScale@GRAD", LnScale, 1);
    }

    const auto LnBias = egr::EagerUtils::RecoverTensorWrapper(&this->LnBias_);
    if (LnBias.defined()) {
      ins0["LnBias"] = egr::EagerUtils::TrySyncToVars(LnBias);
      request_grad("LnBias@GRAD", LnBias, 2);
    }

    const auto LnOut = egr::EagerUtils::RecoverTensorWrapper(&this->LnOut_);
    if (LnOut.defined()) {
      ins0["LnOut"] = egr::EagerUtils::TrySyncToVars(LnOut);
      request_grad("LnOut@GRAD", LnOut, 13);
    }

    const auto LnMean = egr::EagerUtils::RecoverTensorWrapper(&this->LnMean_);
    if (LnMean.defined()) {
      ins0["LnMean"] = egr::EagerUtils::TrySyncToVars(LnMean);
    }

    const auto LnVariance =
        egr::EagerUtils::RecoverTensorWrapper(&this->LnVariance_);
    if (LnVariance.defined()) {
      ins0["LnVariance"] = egr::EagerUtils::TrySyncToVars(LnVariance);
    }
  } else {
    const auto Ln2Scale =
        egr::EagerUtils::RecoverTensorWrapper(&this->Ln2Scale_);
    if (Ln2Scale.defined()) {
      ins0["Ln2Scale"] = egr::EagerUtils::TrySyncToVars(Ln2Scale);
      request_grad("Ln2Scale@GRAD", Ln2Scale, 9);
    }

    const auto Ln2Bias =
        egr::EagerUtils::RecoverTensorWrapper(&this->Ln2Bias_);
    if (Ln2Bias.defined()) {
      ins0["Ln2Bias"] = egr::EagerUtils::TrySyncToVars(Ln2Bias);
      request_grad("Ln2Bias@GRAD", Ln2Bias, 10);
    }

    // These three inputs are passed unconditionally in this branch.
    const auto BiasDropoutResidualOut =
        egr::EagerUtils::RecoverTensorWrapper(&this->BiasDropoutResidualOut_);
    const auto Ln2Mean =
        egr::EagerUtils::RecoverTensorWrapper(&this->Ln2Mean_);
    const auto Ln2Variance =
        egr::EagerUtils::RecoverTensorWrapper(&this->Ln2Variance_);
    ins0["BiasDropoutResidualOut"] =
        egr::EagerUtils::TrySyncToVars(BiasDropoutResidualOut);
    ins0["Ln2Mean"] = egr::EagerUtils::TrySyncToVars(Ln2Mean);
    ins0["Ln2Variance"] = egr::EagerUtils::TrySyncToVars(Ln2Variance);
    request_grad("BiasDropoutResidualOut@GRAD", BiasDropoutResidualOut, 14);
  }

  auto& attrs_map0 = this->attr_map_;
  // Pass the entire attribute map to TraceOp
  // The underlying kernel will pickup whatever attribute they need at runtime
  egr::Controller::Instance().GetCurrentTracer()->TraceOp(
      "fused_attention_grad",
      ins0,
      outs0,
      attrs_map0,
      egr::Controller::Instance().GetExpectedPlace(),
      &this->default_attr_map_,
      false,
      {});

  // Copies `grad_name` into `slot` if it was requested above. A key is only
  // present in outs0 when its defined()/pre_layer_norm guard held while the
  // map was being built, so no guard needs to be repeated here.
  const auto collect = [&outs0, &outputs](const char* grad_name, size_t slot) {
    const auto found = outs0.find(grad_name);
    if (found != outs0.end()) {
      outputs[slot] = egr::EagerUtils::GetOutputs(found->second);
    }
  };

  collect("OutLinearW@GRAD", 7);
  collect("QKVW@GRAD", 3);
  collect("X@GRAD", 0);
  collect("QKVOut@GRAD", 15);
  collect("QKTVOut@GRAD", 16);
  collect("TransposeOut2@GRAD", 17);
  collect("QKOut@GRAD", 18);
  collect("SoftmaxOut@GRAD", 19);
  collect("AttnDropoutOut@GRAD", 20);
  collect("FMHAOut@GRAD", 21);
  collect("OutLinearOut@GRAD", 22);
  collect("QKVBias@GRAD", 4);
  collect("QKVBiasOut@GRAD", 11);
  collect("SrcMaskOut@GRAD", 12);
  collect("OutLinearBias@GRAD", 8);
  collect("LnScale@GRAD", 1);
  collect("LnBias@GRAD", 2);
  collect("LnOut@GRAD", 13);
  collect("Ln2Scale@GRAD", 9);
  collect("Ln2Bias@GRAD", 10);
  collect("BiasDropoutResidualOut@GRAD", 14);

  if (NeedComplexToRealConversion()) HandleComplexGradToRealGrad(&outputs);
  return outputs;
}
paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h
浏览文件 @
2afa9b76
...
...
@@ -329,3 +329,205 @@ class fused_feedforwardGradNodeCompat : public egr::GradNodeBase {
paddle
::
framework
::
AttributeMap
attr_map_
;
paddle
::
framework
::
AttributeMap
default_attr_map_
;
};
class
fused_attentionGradNodeCompat
:
public
egr
::
GradNodeBase
{
public:
fused_attentionGradNodeCompat
()
:
egr
::
GradNodeBase
()
{
VLOG
(
7
)
<<
" Construct fused_attentionGradNodeCompat "
;
}
fused_attentionGradNodeCompat
(
size_t
bwd_in_slot_num
,
size_t
bwd_out_slot_num
)
:
egr
::
GradNodeBase
(
bwd_in_slot_num
,
bwd_out_slot_num
)
{
VLOG
(
7
)
<<
" Construct fused_attentionGradNodeCompat "
;
}
~
fused_attentionGradNodeCompat
()
override
{
VLOG
(
6
)
<<
" Destruct fused_attentionGradNodeCompat "
;
}
virtual
paddle
::
small_vector
<
std
::
vector
<
paddle
::
experimental
::
Tensor
>
,
egr
::
kSlotSmallVectorSize
>
operator
()(
paddle
::
small_vector
<
std
::
vector
<
paddle
::
experimental
::
Tensor
>
,
// NOLINT
egr
::
kSlotSmallVectorSize
>&
grads
,
// NOLINT
bool
create_graph
=
false
,
bool
is_new_grad
=
false
)
override
;
void
ClearTensorWrappers
()
override
{
AttnDropoutMaskOut_
.
clear
();
AttnDropoutOut_
.
clear
();
BiasDropoutResidualOut_
.
clear
();
DropoutMaskOut_
.
clear
();
FMHAOut_
.
clear
();
Ln2Bias_
.
clear
();
Ln2Mean_
.
clear
();
Ln2Scale_
.
clear
();
Ln2Variance_
.
clear
();
OutLinearBias_
.
clear
();
OutLinearOut_
.
clear
();
OutLinearW_
.
clear
();
QKOut_
.
clear
();
QKTVOut_
.
clear
();
QKVBias_
.
clear
();
QKVBiasOut_
.
clear
();
QKVOut_
.
clear
();
QKVW_
.
clear
();
SoftmaxOut_
.
clear
();
SrcMask_
.
clear
();
SrcMaskOut_
.
clear
();
TransposeOut2_
.
clear
();
X_
.
clear
();
SetIsTensorWrappersCleared
(
true
);
}
std
::
string
name
()
override
{
return
"fused_attentionGradNodeCompat"
;
}
std
::
shared_ptr
<
GradNodeBase
>
Copy
()
const
override
{
{
auto
copied_node
=
std
::
shared_ptr
<
fused_attentionGradNodeCompat
>
(
new
fused_attentionGradNodeCompat
(
*
this
));
return
copied_node
;
}
}
// SetX, SetY, ...
void
SetTensorWrapperAttnDropoutMaskOut
(
const
paddle
::
experimental
::
Tensor
&
AttnDropoutMaskOut
)
{
AttnDropoutMaskOut_
=
egr
::
TensorWrapper
(
AttnDropoutMaskOut
,
false
);
}
void
SetTensorWrapperAttnDropoutOut
(
const
paddle
::
experimental
::
Tensor
&
AttnDropoutOut
)
{
AttnDropoutOut_
=
egr
::
TensorWrapper
(
AttnDropoutOut
,
false
);
}
void
SetTensorWrapperBiasDropoutResidualOut
(
const
paddle
::
experimental
::
Tensor
&
BiasDropoutResidualOut
)
{
BiasDropoutResidualOut_
=
egr
::
TensorWrapper
(
BiasDropoutResidualOut
,
false
);
}
void
SetTensorWrapperDropoutMaskOut
(
const
paddle
::
experimental
::
Tensor
&
DropoutMaskOut
)
{
DropoutMaskOut_
=
egr
::
TensorWrapper
(
DropoutMaskOut
,
false
);
}
void
SetTensorWrapperFMHAOut
(
const
paddle
::
experimental
::
Tensor
&
FMHAOut
)
{
FMHAOut_
=
egr
::
TensorWrapper
(
FMHAOut
,
false
);
}
void
SetTensorWrapperLn2Bias
(
const
paddle
::
experimental
::
Tensor
&
Ln2Bias
)
{
Ln2Bias_
=
egr
::
TensorWrapper
(
Ln2Bias
,
false
);
}
void
SetTensorWrapperLn2Mean
(
const
paddle
::
experimental
::
Tensor
&
Ln2Mean
)
{
Ln2Mean_
=
egr
::
TensorWrapper
(
Ln2Mean
,
false
);
}
void
SetTensorWrapperLn2Scale
(
const
paddle
::
experimental
::
Tensor
&
Ln2Scale
)
{
Ln2Scale_
=
egr
::
TensorWrapper
(
Ln2Scale
,
false
);
}
void
SetTensorWrapperLn2Variance
(
const
paddle
::
experimental
::
Tensor
&
Ln2Variance
)
{
Ln2Variance_
=
egr
::
TensorWrapper
(
Ln2Variance
,
false
);
}
void
SetTensorWrapperOutLinearBias
(
const
paddle
::
experimental
::
Tensor
&
OutLinearBias
)
{
OutLinearBias_
=
egr
::
TensorWrapper
(
OutLinearBias
,
false
);
}
void
SetTensorWrapperOutLinearOut
(
const
paddle
::
experimental
::
Tensor
&
OutLinearOut
)
{
OutLinearOut_
=
egr
::
TensorWrapper
(
OutLinearOut
,
false
);
}
void
SetTensorWrapperOutLinearW
(
const
paddle
::
experimental
::
Tensor
&
OutLinearW
)
{
OutLinearW_
=
egr
::
TensorWrapper
(
OutLinearW
,
false
);
}
void
SetTensorWrapperQKOut
(
const
paddle
::
experimental
::
Tensor
&
QKOut
)
{
QKOut_
=
egr
::
TensorWrapper
(
QKOut
,
false
);
}
void
SetTensorWrapperQKTVOut
(
const
paddle
::
experimental
::
Tensor
&
QKTVOut
)
{
QKTVOut_
=
egr
::
TensorWrapper
(
QKTVOut
,
false
);
}
void
SetTensorWrapperQKVBias
(
const
paddle
::
experimental
::
Tensor
&
QKVBias
)
{
QKVBias_
=
egr
::
TensorWrapper
(
QKVBias
,
false
);
}
void
SetTensorWrapperQKVBiasOut
(
const
paddle
::
experimental
::
Tensor
&
QKVBiasOut
)
{
QKVBiasOut_
=
egr
::
TensorWrapper
(
QKVBiasOut
,
false
);
}
void
SetTensorWrapperQKVOut
(
const
paddle
::
experimental
::
Tensor
&
QKVOut
)
{
QKVOut_
=
egr
::
TensorWrapper
(
QKVOut
,
false
);
}
void
SetTensorWrapperQKVW
(
const
paddle
::
experimental
::
Tensor
&
QKVW
)
{
QKVW_
=
egr
::
TensorWrapper
(
QKVW
,
false
);
}
void
SetTensorWrapperSoftmaxOut
(
const
paddle
::
experimental
::
Tensor
&
SoftmaxOut
)
{
SoftmaxOut_
=
egr
::
TensorWrapper
(
SoftmaxOut
,
false
);
}
void
SetTensorWrapperSrcMask
(
const
paddle
::
experimental
::
Tensor
&
SrcMask
)
{
SrcMask_
=
egr
::
TensorWrapper
(
SrcMask
,
false
);
}
void
SetTensorWrapperSrcMaskOut
(
const
paddle
::
experimental
::
Tensor
&
SrcMaskOut
)
{
SrcMaskOut_
=
egr
::
TensorWrapper
(
SrcMaskOut
,
false
);
}
void
SetTensorWrapperTransposeOut2
(
const
paddle
::
experimental
::
Tensor
&
TransposeOut2
)
{
TransposeOut2_
=
egr
::
TensorWrapper
(
TransposeOut2
,
false
);
}
void
SetTensorWrapperX
(
const
paddle
::
experimental
::
Tensor
&
X
)
{
X_
=
egr
::
TensorWrapper
(
X
,
false
);
}
void
SetTensorWrapperLnScale
(
const
paddle
::
experimental
::
Tensor
&
LnScale
)
{
LnScale_
=
egr
::
TensorWrapper
(
LnScale
,
false
);
}
void
SetTensorWrapperLnBias
(
const
paddle
::
experimental
::
Tensor
&
LnBias
)
{
LnBias_
=
egr
::
TensorWrapper
(
LnBias
,
false
);
}
void
SetTensorWrapperLnOut
(
const
paddle
::
experimental
::
Tensor
&
LnOut
)
{
LnOut_
=
egr
::
TensorWrapper
(
LnOut
,
false
);
}
void
SetTensorWrapperLnMean
(
const
paddle
::
experimental
::
Tensor
&
LnMean
)
{
LnMean_
=
egr
::
TensorWrapper
(
LnMean
,
false
);
}
void
SetTensorWrapperLnVariance
(
const
paddle
::
experimental
::
Tensor
&
LnVariance
)
{
LnVariance_
=
egr
::
TensorWrapper
(
LnVariance
,
false
);
}
// SetAttrMap
void
SetAttrMap
(
paddle
::
framework
::
AttributeMap
&&
attr_map
)
{
attr_map_
=
std
::
move
(
attr_map
);
}
void
SetDefaultAttrMap
(
paddle
::
framework
::
AttributeMap
&&
default_attr_map
)
{
default_attr_map_
=
std
::
move
(
default_attr_map
);
}
private:
// TensorWrappers
egr
::
TensorWrapper
AttnDropoutMaskOut_
;
egr
::
TensorWrapper
AttnDropoutOut_
;
egr
::
TensorWrapper
BiasDropoutResidualOut_
;
egr
::
TensorWrapper
DropoutMaskOut_
;
egr
::
TensorWrapper
FMHAOut_
;
egr
::
TensorWrapper
Ln2Bias_
;
egr
::
TensorWrapper
Ln2Mean_
;
egr
::
TensorWrapper
Ln2Scale_
;
egr
::
TensorWrapper
Ln2Variance_
;
egr
::
TensorWrapper
OutLinearBias_
;
egr
::
TensorWrapper
OutLinearOut_
;
egr
::
TensorWrapper
OutLinearW_
;
egr
::
TensorWrapper
QKOut_
;
egr
::
TensorWrapper
QKTVOut_
;
egr
::
TensorWrapper
QKVBias_
;
egr
::
TensorWrapper
QKVBiasOut_
;
egr
::
TensorWrapper
QKVOut_
;
egr
::
TensorWrapper
QKVW_
;
egr
::
TensorWrapper
SoftmaxOut_
;
egr
::
TensorWrapper
SrcMask_
;
egr
::
TensorWrapper
SrcMaskOut_
;
egr
::
TensorWrapper
TransposeOut2_
;
egr
::
TensorWrapper
X_
;
egr
::
TensorWrapper
LnScale_
;
egr
::
TensorWrapper
LnBias_
;
egr
::
TensorWrapper
LnOut_
;
egr
::
TensorWrapper
LnMean_
;
egr
::
TensorWrapper
LnVariance_
;
// Attribute Map
paddle
::
framework
::
AttributeMap
attr_map_
;
paddle
::
framework
::
AttributeMap
default_attr_map_
;
};
paddle/fluid/eager/auto_code_generator/eager_generator.cc
浏览文件 @
2afa9b76
...
...
@@ -51,8 +51,10 @@ static std::unordered_set<std::string> ops_to_fill_zero_for_empty_grads = {
"split"
,
"rnn"
};
/* --- Black Ops list that's NO NEED to apply code generation --- */
static
std
::
unordered_set
<
std
::
string
>
black_ops_list
=
{
"run_program"
,
"fused_gate_attention"
,
"fused_feedforward"
};
static
std
::
unordered_set
<
std
::
string
>
black_ops_list
=
{
"run_program"
,
"fused_gate_attention"
,
"fused_feedforward"
,
"fused_attention"
};
static
std
::
string
LegalizeVariableName
(
const
std
::
string
&
var_name
)
{
std
::
string
ret
=
var_name
;
...
...
python/paddle/fluid/tests/unittests/test_fused_attention_op.py
浏览文件 @
2afa9b76
...
...
@@ -26,9 +26,7 @@ from paddle import tensor
from
paddle.fluid
import
layers
import
unittest
from
op_test
import
OpTest
from
paddle.fluid.framework
import
default_main_program
,
_enable_legacy_dygraph
_enable_legacy_dygraph
()
from
paddle.fluid.framework
import
default_main_program
default_main_program
().
random_seed
=
42
...
...
python/paddle/fluid/tests/unittests/test_fused_gate_attention_op.py
浏览文件 @
2afa9b76
...
...
@@ -26,11 +26,9 @@ import unittest
from
op_test
import
OpTest
,
convert_float_to_uint16
,
convert_uint16_to_float
from
test_sparse_attention_op
import
get_cuda_version
from
paddle
import
_C_ops
from
paddle.fluid.framework
import
default_main_program
,
_enable_legacy_dygraph
from
paddle.fluid.framework
import
default_main_program
from
paddle.fluid
import
core
_enable_legacy_dygraph
()
@
unittest
.
skipIf
(
not
core
.
is_compiled_with_cuda
(),
"Paddle is not compiled with CUDA"
)
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录