Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
67497119
P
Paddle
项目概览
PaddlePaddle
/
Paddle
1 年多 前同步成功
通知
2305
Star
20932
Fork
5423
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
67497119
编写于
5月 31, 2022
作者:
L
Li Min
提交者:
GitHub
5月 31, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Rename dropout is test (#43098)
* replace dropout_is_test with is_test. * improve atol on a100.
上级
ae45d981
变更
11
隐藏空白更改
内联
并排
Showing
11 changed file
with
78 addition
and
52 deletion
+78
-52
paddle/fluid/operators/fused/fused_attention_op.cc
paddle/fluid/operators/fused/fused_attention_op.cc
+6
-12
paddle/fluid/operators/fused/fused_attention_op.cu
paddle/fluid/operators/fused/fused_attention_op.cu
+2
-2
paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cc
...rators/fused/fused_bias_dropout_residual_layer_norm_op.cc
+5
-6
paddle/fluid/operators/fused/fused_dropout_helper.h
paddle/fluid/operators/fused/fused_dropout_helper.h
+1
-1
paddle/fluid/operators/fused/fused_feedforward_op.cc
paddle/fluid/operators/fused/fused_feedforward_op.cc
+4
-9
paddle/fluid/operators/fused/fused_multi_transformer_op.cc
paddle/fluid/operators/fused/fused_multi_transformer_op.cc
+1
-1
python/paddle/fluid/tests/unittests/test_fused_attention_op.py
...n/paddle/fluid/tests/unittests/test_fused_attention_op.py
+20
-5
python/paddle/fluid/tests/unittests/test_fused_attention_op_api.py
...ddle/fluid/tests/unittests/test_fused_attention_op_api.py
+14
-2
python/paddle/fluid/tests/unittests/test_fused_feedforward_op.py
...paddle/fluid/tests/unittests/test_fused_feedforward_op.py
+6
-1
python/paddle/fluid/tests/unittests/test_fused_transformer_encoder_layer.py
...d/tests/unittests/test_fused_transformer_encoder_layer.py
+11
-3
python/paddle/incubate/nn/functional/fused_transformer.py
python/paddle/incubate/nn/functional/fused_transformer.py
+8
-10
未找到文件。
paddle/fluid/operators/fused/fused_attention_op.cc
浏览文件 @
67497119
...
...
@@ -194,7 +194,7 @@ class FusedAttentionOp : public framework::OperatorWithKernel {
// the same as QKOut's shape.
ctx
->
SetOutputDim
(
"AttnDropoutOut"
,
{
x_dim
[
0
],
y_dim
[
1
],
x_dim
[
1
],
out_seq_len
});
if
(
ctx
->
Attrs
().
Get
<
bool
>
(
"
attn_dropout_
is_test"
)
==
false
)
{
if
(
ctx
->
Attrs
().
Get
<
bool
>
(
"is_test"
)
==
false
)
{
ctx
->
SetOutputDim
(
"AttnDropoutMaskOut"
,
{
x_dim
[
0
],
y_dim
[
1
],
x_dim
[
1
],
out_seq_len
});
}
...
...
@@ -206,7 +206,7 @@ class FusedAttentionOp : public framework::OperatorWithKernel {
ctx
->
SetOutputDim
(
"FMHAOut"
,
{
x_dim
[
0
],
x_dim
[
1
],
y_dim
[
1
],
y_dim
[
2
]});
ctx
->
SetOutputDim
(
"OutLinearOut"
,
ctx
->
GetInputDim
(
"X"
));
if
(
ctx
->
Attrs
().
Get
<
bool
>
(
"
dropout_
is_test"
)
==
false
)
{
if
(
ctx
->
Attrs
().
Get
<
bool
>
(
"is_test"
)
==
false
)
{
ctx
->
SetOutputDim
(
"DropoutMaskOut"
,
ctx
->
GetInputDim
(
"X"
));
}
...
...
@@ -301,7 +301,7 @@ class FusedAttentionOpMaker : public framework::OpProtoAndCheckerMaker {
platform
::
errors
::
InvalidArgument
(
"'attn_dropout_rate' must be between 0.0 and 1.0."
));
});
AddAttr
<
bool
>
(
"
attn_dropout_
is_test"
,
AddAttr
<
bool
>
(
"is_test"
,
"(bool, default false) Set to true for inference only, false "
"for training. Some layers may run faster when this is true."
)
.
SetDefault
(
false
);
...
...
@@ -345,11 +345,6 @@ class FusedAttentionOpMaker : public framework::OpProtoAndCheckerMaker {
platform
::
errors
::
InvalidArgument
(
"'dropout_rate' must be between 0.0 and 1.0."
));
});
AddAttr
<
bool
>
(
"dropout_is_test"
,
"(bool, default false) Set to true for inference only, false "
"for training. Some layers may run faster when this is true."
)
.
SetDefault
(
false
);
AddAttr
<
bool
>
(
"dropout_fix_seed"
,
"A flag indicating whether to use a fixed seed to generate "
"random mask. NOTE: DO NOT set this flag to true in "
...
...
@@ -418,10 +413,9 @@ class FusedAttentionGradOp : public framework::OperatorWithKernel {
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
PADDLE_ENFORCE_EQ
(
ctx
->
Attrs
().
Get
<
bool
>
(
"attn_dropout_is_test"
),
false
,
platform
::
errors
::
InvalidArgument
(
"GradOp is only callable when attn_dropout_is_test is false"
));
PADDLE_ENFORCE_EQ
(
ctx
->
Attrs
().
Get
<
bool
>
(
"is_test"
),
false
,
platform
::
errors
::
InvalidArgument
(
"GradOp is only callable when is_test is false"
));
if
(
ctx
->
Attrs
().
Get
<
bool
>
(
"pre_layer_norm"
)
==
false
)
{
OP_INOUT_CHECK
(
ctx
->
HasInput
(
"Ln2Mean"
),
"Input"
,
"Ln2Mean"
,
...
...
paddle/fluid/operators/fused/fused_attention_op.cu
浏览文件 @
67497119
...
...
@@ -109,7 +109,7 @@ class FusedAttentionOpKernel : public framework::OpKernel<T> {
const
float
ln_epsilon
=
ctx
.
Attr
<
float
>
(
"ln_epsilon"
);
float
attn_dropout_rate
=
ctx
.
Attr
<
float
>
(
"attn_dropout_rate"
);
bool
is_test_1
=
ctx
.
Attr
<
bool
>
(
"
attn_dropout_
is_test"
);
bool
is_test_1
=
ctx
.
Attr
<
bool
>
(
"is_test"
);
auto
&
dropout_implementation_1
=
ctx
.
Attr
<
std
::
string
>
(
"attn_dropout_implementation"
);
bool
is_upscale_in_train_1
=
...
...
@@ -280,7 +280,7 @@ class FusedAttentionGradKernel : public framework::OpKernel<T> {
const
float
ln2epsilon
=
ctx
.
Attr
<
float
>
(
"ln_epsilon"
);
float
attn_dropout_prob
=
ctx
.
Attr
<
float
>
(
"attn_dropout_rate"
);
bool
is_test_1
=
ctx
.
Attr
<
bool
>
(
"
attn_dropout_
is_test"
);
bool
is_test_1
=
ctx
.
Attr
<
bool
>
(
"is_test"
);
auto
&
dropout_implementation_1
=
ctx
.
Attr
<
std
::
string
>
(
"attn_dropout_implementation"
);
bool
is_upscale_in_train_1
=
...
...
paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cc
浏览文件 @
67497119
...
...
@@ -44,7 +44,7 @@ class FusedBiasDropoutResidualLnOp : public framework::OperatorWithKernel {
left
*=
x_dim
[
i
];
}
ctx
->
SetOutputDim
(
"BiasDropoutResidualOut"
,
ctx
->
GetInputDim
(
"X"
));
if
(
ctx
->
Attrs
().
Get
<
bool
>
(
"
dropout_
is_test"
)
==
false
)
{
if
(
ctx
->
Attrs
().
Get
<
bool
>
(
"is_test"
)
==
false
)
{
ctx
->
SetOutputDim
(
"DropoutMaskOut"
,
ctx
->
GetInputDim
(
"X"
));
}
ctx
->
SetOutputDim
(
"LnMean"
,
{
left
});
...
...
@@ -91,7 +91,7 @@ class FusedBiasDropoutResidualLnOpMaker
platform
::
errors
::
InvalidArgument
(
"'dropout_rate' must be between 0.0 and 1.0."
));
});
AddAttr
<
bool
>
(
"
dropout_
is_test"
,
AddAttr
<
bool
>
(
"is_test"
,
"(bool, default false) Set to true for inference only, false "
"for training. Some layers may run faster when this is true."
)
.
SetDefault
(
false
);
...
...
@@ -140,10 +140,9 @@ class FusedBiasDropoutResidualLnGradOp : public framework::OperatorWithKernel {
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
PADDLE_ENFORCE_EQ
(
ctx
->
Attrs
().
Get
<
bool
>
(
"dropout_is_test"
),
false
,
platform
::
errors
::
InvalidArgument
(
"GradOp is only callable when dropout_is_test is false"
));
PADDLE_ENFORCE_EQ
(
ctx
->
Attrs
().
Get
<
bool
>
(
"is_test"
),
false
,
platform
::
errors
::
InvalidArgument
(
"GradOp is only callable when is_test is false"
));
OP_INOUT_CHECK
(
ctx
->
HasInput
(
"LnMean"
),
"Input"
,
"LnMean"
,
"FusedBiasDropoutResidualLnGrad"
);
OP_INOUT_CHECK
(
ctx
->
HasInput
(
"LnVariance"
),
"Input"
,
"LnVariance"
,
...
...
paddle/fluid/operators/fused/fused_dropout_helper.h
浏览文件 @
67497119
...
...
@@ -82,7 +82,7 @@ struct DropoutParam {
auto
&
dropout_implementation
=
context
.
Attr
<
std
::
string
>
(
pre_fix
+
"implementation"
);
is_upscale_in_train
=
(
dropout_implementation
==
"upscale_in_train"
);
is_test
=
context
.
Attr
<
bool
>
(
pre_fix
+
"is_test"
);
is_test
=
context
.
Attr
<
bool
>
(
"is_test"
);
fix_seed
=
context
.
Attr
<
bool
>
(
pre_fix
+
"fix_seed"
);
std
::
string
str_seed
=
"Dropout"
;
...
...
paddle/fluid/operators/fused/fused_feedforward_op.cc
浏览文件 @
67497119
...
...
@@ -61,14 +61,14 @@ class FusedFeedForwardOp : public framework::OperatorWithKernel {
tmp_dim_x
[
dim_x
.
size
()
-
1
]
=
dim_Linear1Weight
[
dim_Linear1Weight
.
size
()
-
1
];
context
->
SetOutputDim
(
"Out"
,
dim_x
);
if
(
context
->
Attrs
().
Get
<
bool
>
(
"
dropout1_
is_test"
)
==
false
)
{
if
(
context
->
Attrs
().
Get
<
bool
>
(
"is_test"
)
==
false
)
{
context
->
SetOutputDim
(
"Dropout1Mask"
,
tmp_dim_x
);
}
context
->
SetOutputDim
(
"Dropout1Out"
,
tmp_dim_x
);
context
->
SetOutputDim
(
"Linear1Out"
,
tmp_dim_x
);
context
->
SetOutputDim
(
"Dropout2Out"
,
dim_x
);
if
(
context
->
Attrs
().
Get
<
bool
>
(
"
dropout2_
is_test"
)
==
false
)
{
if
(
context
->
Attrs
().
Get
<
bool
>
(
"is_test"
)
==
false
)
{
context
->
SetOutputDim
(
"Dropout2Mask"
,
dim_x
);
}
framework
::
DDim
mean_dim
=
...
...
@@ -185,9 +185,7 @@ class FusedFeedForwardOpMaker : public framework::OpProtoAndCheckerMaker {
"dropout2_implementation can only be downgrade_in_infer or "
"upscale_in_train"
));
});
AddAttr
<
bool
>
(
"dropout1_is_test"
,
"the is_test of first dropout"
)
.
SetDefault
(
false
);
AddAttr
<
bool
>
(
"dropout2_is_test"
,
"the is_test of second dropout"
)
AddAttr
<
bool
>
(
"is_test"
,
"the is_test attribute of dropout"
)
.
SetDefault
(
false
);
AddAttr
<
bool
>
(
"dropout1_fix_seed"
,
"the is_test of first dropout"
)
.
SetDefault
(
false
);
...
...
@@ -218,10 +216,7 @@ class FusedFeedForwardOpGrad : public framework::OperatorWithKernel {
protected:
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
PADDLE_ENFORCE_EQ
(
ctx
->
Attrs
().
Get
<
bool
>
(
"dropout1_is_test"
),
false
,
platform
::
errors
::
InvalidArgument
(
"GradOp is only callable when is_test is false"
));
PADDLE_ENFORCE_EQ
(
ctx
->
Attrs
().
Get
<
bool
>
(
"dropout2_is_test"
),
false
,
PADDLE_ENFORCE_EQ
(
ctx
->
Attrs
().
Get
<
bool
>
(
"is_test"
),
false
,
platform
::
errors
::
InvalidArgument
(
"GradOp is only callable when is_test is false"
));
bool
pre_layer_norm
=
ctx
->
Attrs
().
Get
<
bool
>
(
"pre_layer_norm"
);
...
...
paddle/fluid/operators/fused/fused_multi_transformer_op.cc
浏览文件 @
67497119
...
...
@@ -221,7 +221,7 @@ class FusedMultiTransformerOpOpMaker
"'dropout_rate' must be between 0.0 and 1.0."
));
});
AddAttr
<
bool
>
(
"
dropout_
is_test"
,
AddAttr
<
bool
>
(
"is_test"
,
"(bool, default false) Set to true for inference only, false "
"for training. Some layers may run faster when this is true."
)
.
SetDefault
(
false
);
...
...
python/paddle/fluid/tests/unittests/test_fused_attention_op.py
浏览文件 @
67497119
...
...
@@ -36,6 +36,18 @@ class TestFusedAttentionOp(OpTest):
def
setUp
(
self
):
self
.
config
()
self
.
generate_input_data
()
self
.
rtol
=
1e-5
# FIXME(limin29): Because there is a problem with the test precision
# on A100, atol is temporarily set to 1e-2, and it will be
# changed back after the precision problem is solved.
self
.
atol
=
1e-2
# make sure local development precision
if
"V100"
in
paddle
.
device
.
cuda
.
get_device_name
():
self
.
atol
=
1e-4
if
self
.
x_type
is
np
.
float16
:
self
.
atol
=
1e-1
paddle
.
set_default_dtype
(
self
.
x_type
)
self
.
__class__
.
op_type
=
"fused_attention"
# use autograd to check grad in this unittest.
...
...
@@ -274,9 +286,9 @@ class TestFusedAttentionOp(OpTest):
final_out_ref
,
x_grad_ref
=
self
.
GetBaselineOut
()
final_out
,
x_grad
=
self
.
GetFusedAttentionOut
()
np
.
testing
.
assert_allclose
(
final_out_ref
,
final_out
.
numpy
(),
rtol
=
1e-5
,
atol
=
1e-4
)
final_out_ref
,
final_out
.
numpy
(),
rtol
=
self
.
rtol
,
atol
=
self
.
atol
)
np
.
testing
.
assert_allclose
(
x_grad_ref
,
x_grad
.
numpy
(),
rtol
=
1e-5
,
atol
=
1e-4
)
x_grad_ref
,
x_grad
.
numpy
(),
rtol
=
self
.
rtol
,
atol
=
self
.
atol
)
class
TestFusedAttentionOpBiasIsNone
(
TestFusedAttentionOp
):
...
...
@@ -307,9 +319,9 @@ class TestFusedAttentionOpFp16(TestFusedAttentionOp):
final_out_ref
,
x_grad_ref
=
self
.
GetBaselineOut
()
final_out
,
x_grad
=
self
.
GetFusedAttentionOut
()
np
.
testing
.
assert_allclose
(
final_out_ref
,
final_out
.
numpy
(),
rtol
=
1e-5
,
atol
=
1e-1
)
final_out_ref
,
final_out
.
numpy
(),
rtol
=
self
.
rtol
,
atol
=
self
.
atol
)
np
.
testing
.
assert_allclose
(
x_grad_ref
,
x_grad
.
numpy
(),
rtol
=
1e-5
,
atol
=
1e-1
)
x_grad_ref
,
x_grad
.
numpy
(),
rtol
=
self
.
rtol
,
atol
=
self
.
atol
)
class
TestFusedAttentionOpCacheKV
(
TestFusedAttentionOp
):
...
...
@@ -325,7 +337,10 @@ class TestFusedAttentionOpCacheKV(TestFusedAttentionOp):
final_out_ref
=
self
.
GetBaselineOut
()
final_out
,
cache_kv_out
=
self
.
GetFusedAttentionOut
()
np
.
testing
.
assert_allclose
(
final_out_ref
,
final_out
.
numpy
(),
rtol
=
1e-5
,
atol
=
1e-4
)
final_out_ref
,
final_out
.
numpy
(),
rtol
=
self
.
rtol
,
atol
=
self
.
atol
)
if
__name__
==
"__main__"
:
...
...
python/paddle/fluid/tests/unittests/test_fused_attention_op_api.py
浏览文件 @
67497119
...
...
@@ -173,6 +173,17 @@ class TestFusedAttentionAPI(unittest.TestCase):
self
.
config
()
self
.
generate_input_data
()
self
.
rtol
=
1e-5
# FIXME(limin29): Because there is a problem with the test precision
# on A100, atol is temporarily set to 1e-2, and it will be
# changed back after the precision problem is solved.
self
.
atol
=
1e-2
# make sure local development precision
if
"V100"
in
paddle
.
device
.
cuda
.
get_device_name
():
self
.
atol
=
1e-4
if
self
.
x_type
is
np
.
float16
:
self
.
atol
=
1e-1
def
setAttnMask
(
self
):
self
.
has_attn_mask
=
True
...
...
@@ -256,7 +267,8 @@ class TestFusedAttentionAPI(unittest.TestCase):
fused_attn
.
ln_scale
.
numpy
(),
fused_attn_ln_bias
,
fused_attn
.
qkv_weight
.
numpy
(),
fused_attn_qkv_bias
,
fused_attn
.
linear_weight
.
numpy
(),
fused_attn_linear_bias
)
np
.
testing
.
assert_allclose
(
ref_out
,
out
.
numpy
(),
rtol
=
1e-5
,
atol
=
1e-4
)
np
.
testing
.
assert_allclose
(
ref_out
,
out
.
numpy
(),
rtol
=
self
.
rtol
,
atol
=
self
.
atol
)
def
run_static
(
self
):
fused_attn
=
FusedMultiHeadAttention
(
...
...
@@ -341,7 +353,7 @@ class TestFusedAttentionAPI(unittest.TestCase):
self
.
attn_mask
,
ln_scale
,
ln_bias
,
ln_2_scale
,
ln_2_bias
,
qkv_weight
,
qkv_bias
,
linear_weight
,
linear_bias
)
np
.
testing
.
assert_allclose
(
ref_out
,
out
,
rtol
=
1e-5
,
atol
=
1e-4
)
np
.
testing
.
assert_allclose
(
ref_out
,
out
,
rtol
=
self
.
rtol
,
atol
=
self
.
atol
)
def
test_dynamic_api
(
self
):
paddle
.
disable_static
(
place
=
paddle
.
CUDAPlace
(
0
))
...
...
python/paddle/fluid/tests/unittests/test_fused_feedforward_op.py
浏览文件 @
67497119
...
...
@@ -40,7 +40,12 @@ class TestFusedFFNOp(OpTest):
def
getDiff
(
self
):
self
.
rtol
=
1e-3
self
.
atol
=
1e-4
# FIXME(limin29): Because there is a problem with the test precision
# on A100, atol is temporarily set to 1e-2, and it will be
# changed back after the precision problem is solved.
self
.
atol
=
1e-2
if
"V100"
in
paddle
.
device
.
cuda
.
get_device_name
():
self
.
atol
=
1e-4
def
getActivation
(
self
):
self
.
act_method
=
"gelu"
...
...
python/paddle/fluid/tests/unittests/test_fused_transformer_encoder_layer.py
浏览文件 @
67497119
...
...
@@ -49,6 +49,14 @@ class TestFusedTransformerEncoderLayer(unittest.TestCase):
self
.
setPreLayerNorm
()
self
.
setAttnMask
()
self
.
rtol
=
1e-3
# FIXME(limin29): Because there is a problem with the test precision
# on A100, atol is temporarily set to 1e-2, and it will be
# changed back after the precision problem is solved.
self
.
atol
=
1e-2
if
"V100"
in
paddle
.
device
.
cuda
.
get_device_name
():
self
.
atol
=
1e-4
def
fused_weight
(
self
,
weight
,
num_head
):
a
=
paddle
.
transpose
(
weight
,
perm
=
[
1
,
0
])
return
paddle
.
reshape
(
...
...
@@ -151,13 +159,13 @@ class TestFusedTransformerEncoderLayer(unittest.TestCase):
self
.
assertTrue
(
fused_encoder
.
fused_attn
.
extra_repr
(),
correct_attn_str
)
np
.
testing
.
assert_allclose
(
fused_out
.
numpy
(),
base_out
.
numpy
(),
rtol
=
1e-3
,
atol
=
1e-4
)
fused_out
.
numpy
(),
base_out
.
numpy
(),
rtol
=
self
.
rtol
,
atol
=
self
.
atol
)
self
.
assertTrue
(
np
.
allclose
(
fused_out
.
grad
.
numpy
(),
base_out
.
grad
.
numpy
(),
rtol
=
1e-3
,
atol
=
1e-4
))
rtol
=
self
.
rtol
,
atol
=
self
.
atol
))
class
TestFusedTransformerEncoderLayerAct
(
TestFusedTransformerEncoderLayer
):
...
...
python/paddle/incubate/nn/functional/fused_transformer.py
浏览文件 @
67497119
...
...
@@ -298,7 +298,7 @@ def fused_bias_dropout_residual_layer_norm(x,
seed
=
default_main_program
().
random_seed
_
,
_
,
_
,
_
,
final_out
=
_C_ops
.
fused_bias_dropout_residual_layer_norm
(
x
,
residual
,
bias
,
ln_scale
,
ln_bias
,
'dropout_rate'
,
dropout_rate
,
'ln_epsilon'
,
ln_epsilon
,
'
dropout_
is_test'
,
not
training
,
'ln_epsilon'
,
ln_epsilon
,
'is_test'
,
not
training
,
'dropout_fix_seed'
,
seed
is
not
None
,
'dropout_seed'
,
seed
if
seed
is
not
None
else
0
,
'dropout_implementation'
,
mode
)
return
final_out
...
...
@@ -327,7 +327,7 @@ def fused_bias_dropout_residual_layer_norm(x,
attrs
=
{
'ln_epsilon'
:
ln_epsilon
,
'dropout_rate'
:
dropout_rate
,
'
dropout_
is_test'
:
not
training
,
'is_test'
:
not
training
,
'dropout_fix_seed'
:
seed
is
not
None
,
'dropout_seed'
:
seed
if
seed
is
not
None
else
0
,
'dropout_implementation'
:
mode
,
...
...
@@ -513,10 +513,9 @@ def fused_multi_head_attention(x,
attn_mask
,
linear_weight
,
linear_bias
,
ln_scale
,
ln_bias
,
'pre_layer_norm'
,
pre_layer_norm
,
'epsilon'
,
pre_ln_epsilon
,
'dropout_rate'
,
dropout_rate
,
'attn_dropout_rate'
,
attn_dropout_rate
,
'ln_epsilon'
,
ln_epsilon
,
'attn_dropout_is_test'
,
not
training
,
'dropout_is_test'
,
not
training
,
'attn_dropout_fix_seed'
,
seed
is
not
None
,
'dropout_fix_seed'
,
seed
is
not
None
,
'attn_dropout_seed'
,
seed
attn_dropout_rate
,
'ln_epsilon'
,
ln_epsilon
,
'is_test'
,
not
training
,
'attn_dropout_fix_seed'
,
seed
is
not
None
,
'dropout_fix_seed'
,
seed
is
not
None
,
'attn_dropout_seed'
,
seed
if
seed
is
not
None
else
0
,
'dropout_seed'
,
seed
if
seed
is
not
None
else
0
,
'attn_dropout_implementation'
,
mode
,
'dropout_implementation'
,
mode
,
'ring_id'
,
ring_id
)
...
...
@@ -562,8 +561,7 @@ def fused_multi_head_attention(x,
'ln_epsilon'
:
ln_epsilon
,
'dropout_rate'
:
dropout_rate
,
'attn_dropout_rate'
:
attn_dropout_rate
,
'attn_dropout_is_test'
:
not
training
,
'dropout_is_test'
:
not
training
,
'is_test'
:
not
training
,
'attn_dropout_fix_seed'
:
seed
is
not
None
,
'dropout_fix_seed'
:
seed
is
not
None
,
'attn_dropout_seed'
:
seed
if
seed
is
not
None
else
0
,
...
...
@@ -801,7 +799,7 @@ def fused_multi_transformer(x,
time_step
,
attn_mask
,
linear_weights
,
linear_biases
,
ffn_ln_scales
,
ffn_ln_biases
,
ffn1_weights
,
ffn1_biases
,
ffn2_weights
,
ffn2_biases
,
cache_kvs
,
'pre_layer_norm'
,
pre_layer_norm
,
'epsilon'
,
epsilon
,
'dropout_rate'
,
dropout_rate
,
'
dropout_
is_test'
,
not
training
,
'dropout_rate'
,
dropout_rate
,
'is_test'
,
not
training
,
'dropout_implementation'
,
mode
,
'act_method'
,
activation
,
'ring_id'
,
ring_id
)
if
cache_kvs
is
not
None
:
...
...
@@ -848,7 +846,7 @@ def fused_multi_transformer(x,
'pre_layer_norm'
:
pre_layer_norm
,
'epsilon'
:
epsilon
,
'dropout_rate'
:
dropout_rate
,
'
dropout_
is_test'
:
not
training
,
'is_test'
:
not
training
,
'dropout_implementation'
:
mode
,
'act_method'
:
activation
,
'ring_id'
:
ring_id
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录