Unverified commit e4459a40
Authored on Apr 07, 2022 by sneaxiy; committed via GitHub on Apr 07, 2022
Add Output(Step) to DistributedFusedLamb optimizer (#41249)
* add Output(Step) to distributed fused lamb op
* add _set_step
Parent: f78cc3da

Showing 5 changed files with 49 additions and 16 deletions
paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cc   +1  -0
paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu   +4  -0
paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc        +1  -0
paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu        +30 -16
python/paddle/incubate/optimizer/distributed_fused_lamb.py            +13 -0
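The new Output(Step) is an int64 tensor that counts only the optimizer steps whose gradients were free of NaN/Inf; iterations skipped by AMP loss scaling do not advance it (see the kernel change in distributed_fused_lamb_op.cu below). A minimal sketch of that accounting, using a hypothetical helper and plain Python floats rather than the Paddle implementation:

    # Illustrative only: the step counter excludes NaN/Inf iterations.
    import math

    def run_iterations(grad_batches):
        """Return (iterations_seen, step); `step` mimics Output(Step)."""
        step = 0  # corresponds to the Step tensor, filled with 0 by the init op
        for grads in grad_batches:
            found_inf = any(math.isnan(g) or math.isinf(g) for g in grads)
            if found_inf:
                continue      # update skipped, step not advanced
            # ... apply the fused LAMB update here ...
            step += 1         # only successful updates advance the counter
        return len(grad_batches), step

    iters, step = run_iterations([[0.1, 0.2], [float('inf'), 0.3], [0.4, 0.5]])
    print(iters, step)  # 3 iterations, step == 2 (one NaN/Inf step excluded)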
paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cc
@@ -94,6 +94,7 @@ class DistributedFusedLambInitOpMaker
     AddOutput("GradOut", "The output gradient list.").AsDuplicable();
     AddOutput("GlobalScale",
               "The global scale. It is usually the scale factor for AMP.");
+    AddOutput("Step", "The global step which excludes the NaN/Inf step.");
     AddAttr<float>("beta1", "The initial value of Beta1Pow.");
     AddAttr<float>("beta2", "The initial value of Beta2Pow.");
paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu
@@ -698,6 +698,10 @@ class DistributedFusedLambInitOpKernel<platform::CUDADeviceContext, T>
       TensorFillConstant<float>(dev_ctx, global_scale, {1}, 1.0f);
     }
     VLOG(10) << "Init global scale ends";
+    TensorFillConstant<int64_t>(dev_ctx,
+                                ctx.Output<framework::Tensor>("Step"),
+                                {1},
+                                static_cast<int64_t>(0));
     dev_ctx.Wait();
     VLOG(10) << "Wait for H2D copy";
   }
paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc
@@ -110,6 +110,7 @@ class DistributedFusedLambOpMaker : public framework::OpProtoAndCheckerMaker {
         .AsDuplicable();
     AddOutput("FoundInf", "Whether there is NaN/Inf");
+    AddOutput("Step", "The global step which excludes the NaN/Inf step.");
     AddAttr<float>("beta1", "The initial Beta1Pow value.");
     AddAttr<float>("beta2", "The initial Beta2Pow value.");
paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu
@@ -381,8 +381,9 @@ static __global__ void UpdateLambMomentAndTrustRatioDivCUDAKernel(
     const T *__restrict__ square_grad_norm_p,
     const T *__restrict__ global_scale, const T *__restrict__ beta1pow_p,
     const T *__restrict__ beta2pow_p, T *__restrict__ mom1_p,
-    T *__restrict__ mom2_p, T *__restrict__ trust_ratio_div_p, bool *found_inf,
-    T weight_decay, int weight_decay_end_numel, T beta1, T beta2, T epsilon,
+    T *__restrict__ mom2_p, T *__restrict__ trust_ratio_div_p,
+    bool *__restrict__ found_inf, int64_t *__restrict__ step, T weight_decay,
+    int weight_decay_end_numel, T beta1, T beta2, T epsilon,
     T max_global_grad_norm, int num, T rescale_grad) {
   T square_grad_norm = *square_grad_norm_p;
   bool need_update_found_inf =
@@ -392,6 +393,7 @@ static __global__ void UpdateLambMomentAndTrustRatioDivCUDAKernel(
     return;
   } else if (need_update_found_inf) {
     *found_inf = false;
+    ++(*step);
   }
   T scale = rescale_grad / global_scale[0];
@@ -467,8 +469,8 @@ static void MultiTensorUpdateLambMomentAndTrustRatioDiv(
     const platform::CUDADeviceContext &dev_ctx, const int *offsets, int n,
     const T *param_p, const GradT *grad_p, const T *square_grad_norm_p,
     const T *global_scale, const T *beta1pow_p, const T *beta2pow_p, T *mom1_p,
-    T *mom2_p, T *trust_ratio_div_p, bool *found_inf_p, T weight_decay,
-    int weight_decay_end_idx, T beta1, T beta2, T epsilon,
+    T *mom2_p, T *trust_ratio_div_p, bool *found_inf_p, int64_t *step,
+    T weight_decay, int weight_decay_end_idx, T beta1, T beta2, T epsilon,
     T max_global_grad_norm, T rescale_grad) {
   if (n <= 0) return;
   int numel = offsets[n] - offsets[0];
@@ -496,15 +498,24 @@ static void MultiTensorUpdateLambMomentAndTrustRatioDiv(
   auto stream = dev_ctx.stream();
   auto config = platform::GetGpuLaunchConfig1D(dev_ctx, numel, vec_size);
+  if (found_inf_p == nullptr) {
+    PADDLE_ENFORCE_EQ(
+        step, nullptr,
+        platform::errors::InvalidArgument(
+            "Output(Step) cannot be updated twice in one mini-batch."));
+  } else {
+    PADDLE_ENFORCE_NOT_NULL(
+        step,
+        platform::errors::InvalidArgument("Output(Step) cannot be nullptr."));
+  }
-#define PD_LAUNCH_LAMB_MOM_TRUST_RATIO_DIV_KERNEL                             \
-  do {                                                                        \
-    UpdateLambMomentAndTrustRatioDivCUDAKernel<T, GradT, kVecSize><<<         \
-        config.block_per_grid, config.thread_per_block, 0, stream>>>(         \
-        param_p, grad_p, square_grad_norm_p, global_scale, beta1pow_p,        \
-        beta2pow_p, mom1_p, mom2_p, trust_ratio_div_p, found_inf_p,           \
-        weight_decay, weight_decay_end_numel, beta1, beta2, epsilon,          \
-        max_global_grad_norm, numel, rescale_grad);                           \
+#define PD_LAUNCH_LAMB_MOM_TRUST_RATIO_DIV_KERNEL                             \
+  do {                                                                        \
+    UpdateLambMomentAndTrustRatioDivCUDAKernel<T, GradT, kVecSize><<<         \
+        config.block_per_grid, config.thread_per_block, 0, stream>>>(         \
+        param_p, grad_p, square_grad_norm_p, global_scale, beta1pow_p,        \
+        beta2pow_p, mom1_p, mom2_p, trust_ratio_div_p, found_inf_p, step,     \
+        weight_decay, weight_decay_end_numel, beta1, beta2, epsilon,          \
+        max_global_grad_norm, numel, rescale_grad);                           \
   } while (0)
   PD_VEC_LAUNCH_KERNEL(vec_size, PD_LAUNCH_LAMB_MOM_TRUST_RATIO_DIV_KERNEL);
@@ -1315,6 +1326,8 @@ class DistributedFusedLambOpKernel<platform::CUDADeviceContext, T>
     const auto *fp16_partial_fused_offsets =
         fp16_partial_fused_offsets_t->data<int>();
+    auto *step = ctx.Output<framework::Tensor>("Step")->data<int64_t>();
+
     VLOG(1) << "FusedParamOffsets: "
             << FlattenToString(fused_offsets, fused_offsets_t->numel(),
                                fused_offsets_t->place());
@@ -1337,8 +1350,8 @@ class DistributedFusedLambOpKernel<platform::CUDADeviceContext, T>
           dev_ctx, fp32_partial_fused_offsets, fp32_local_param_num,
           fp32_param + fp32_offset, fp32_sum_grad, fp32_square_grad_norm,
           global_scale, beta1pow, beta2pow, moment1, moment2, trust_ratio_div,
-          found_inf, weight_decay, fp32_weight_decay_end_idx, beta1, beta2,
-          epsilon, max_global_grad_norm, rescale_grad);
+          found_inf, step, weight_decay, fp32_weight_decay_end_idx, beta1,
+          beta2, epsilon, max_global_grad_norm, rescale_grad);
       VLOG(10) << "Update FP32 Moment and TrustRatioDiv done";
     }
     float *master_param = nullptr;
@@ -1346,13 +1359,14 @@ class DistributedFusedLambOpKernel<platform::CUDADeviceContext, T>
       master_param = fp32_param + fp32_numel;
       VLOG(10) << "Update FP16 Moment and TrustRatioDiv starts";
       auto tmp_found_inf = has_fp32_param ? nullptr : found_inf;
+      auto tmp_step = has_fp32_param ? nullptr : step;
       MultiTensorUpdateLambMomentAndTrustRatioDiv(
           dev_ctx, fp16_partial_fused_offsets, fp16_local_param_num,
           master_param + fp16_offset, fp16_sum_grad, fp32_square_grad_norm,
           global_scale, beta1pow, beta2pow, moment1 + fp32_numel_each_device,
           moment2 + fp32_numel_each_device,
-          trust_ratio_div + fp32_numel_each_device, tmp_found_inf, weight_decay,
-          fp16_weight_decay_end_idx, beta1, beta2, epsilon,
+          trust_ratio_div + fp32_numel_each_device, tmp_found_inf, tmp_step,
+          weight_decay, fp16_weight_decay_end_idx, beta1, beta2, epsilon,
           max_global_grad_norm, rescale_grad);
       VLOG(10) << "Update FP16 Moment and TrustRatioDiv done";
     }
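The PADDLE_ENFORCE checks and the tmp_found_inf/tmp_step handling above keep Step from being advanced twice when both FP32 and FP16 parameter groups are updated in the same mini-batch: the second call passes nullptr for both pointers. A rough Python paraphrase of that guard, with illustrative names rather than Paddle code:

    # Illustrative paraphrase of the guard added to
    # MultiTensorUpdateLambMomentAndTrustRatioDiv.
    def check_step_argument(found_inf_p, step):
        if found_inf_p is None:
            # FP16 call after an FP32 group already updated FoundInf/Step
            assert step is None, \
                "Output(Step) cannot be updated twice in one mini-batch."
        else:
            assert step is not None, "Output(Step) cannot be nullptr."

    check_step_argument(found_inf_p=object(), step=object())  # first group updates Step
    check_step_argument(found_inf_p=None, step=None)          # second group leaves it alone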
python/paddle/incubate/optimizer/distributed_fused_lamb.py
@@ -75,9 +75,18 @@ class DistributedFusedLamb(Optimizer):
             name=unique_name.generate('found_inf'),
             shape=[1],
             dtype=core.VarDesc.VarType.BOOL)
+        self._step = None
         self._param_to_master_param = {}

+    def _set_step(self, step):
+        self._step = step
+
+    def _get_or_create_step(self):
+        if self._step is None:
+            self._step = self._create_persistable_var('step', dtype='int64')
+        return self._step
+
     def _set_scale(self, scale):
         assert scale is not None
         if not isinstance(scale, Variable):
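On the Python side, _get_or_create_step lazily creates a persistable int64 'step' variable, while _set_step lets a caller inject an already existing counter before the ops are built (presumably so several optimizer instances can share one counter, or so it can be restored externally). A standalone paraphrase of that lazy get-or-create pattern, not the Paddle API:

    # Illustrative only: mirrors the _set_step / _get_or_create_step hooks.
    class StepHolder:
        def __init__(self):
            self._step = None

        def _set_step(self, step):
            self._step = step  # inject an externally owned counter

        def _get_or_create_step(self):
            if self._step is None:
                # stands in for _create_persistable_var('step', dtype='int64')
                self._step = [0]
            return self._step

    shared = [10]
    opt = StepHolder()
    opt._set_step(shared)
    assert opt._get_or_create_step() is shared  # the injected counter is reused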
@@ -189,6 +198,8 @@ class DistributedFusedLamb(Optimizer):
         param_order = self._create_persistable_var('param_order', dtype='int32')
         param_order.is_distributed = True

+        step = self._get_or_create_step()
+
         rank = get_rank()
         nranks = get_world_size()
         scale = self._get_or_create_scale()
@@ -234,6 +245,7 @@ class DistributedFusedLamb(Optimizer):
                 'FP16ShardFusedParamOffsets': [fp16_partial_fused_offsets],
                 'FusedParamOffsets': [fused_offsets],
                 'ParamOrder': [param_order],
+                'Step': [step],
             },
             attrs={
                 'alignment': self._alignment,
@@ -290,6 +302,7 @@ class DistributedFusedLamb(Optimizer):
                 'ParamOut': params,
                 'GradOut': grads,
                 'FoundInf': [self._found_inf],
+                'Step': [step],
             },
             attrs={
                 'weight_decay': self._weight_decay,