BaiXuePrincess / Paddle (fork of PaddlePaddle / Paddle)
Commit 6c9fa665 (unverified)
Authored by wanghuancoder on Dec 25, 2022; committed via GitHub on Dec 25, 2022. Parent commit: 983ae1d7.

    delete legacy dygraph code in python/paddle/optimizer (#49308)
Showing 9 changed files with 348 additions and 600 deletions (+348, -600):
    python/paddle/optimizer/adadelta.py   +23  -23
    python/paddle/optimizer/adam.py       +64  -134
    python/paddle/optimizer/adamax.py     +32  -51
    python/paddle/optimizer/adamw.py      +83  -119
    python/paddle/optimizer/lamb.py       +41  -69
    python/paddle/optimizer/lr.py         +1   -10
    python/paddle/optimizer/momentum.py   +51  -105
    python/paddle/optimizer/optimizer.py  +27  -53
    python/paddle/optimizer/sgd.py        +26  -36
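Every file below follows the same shape of change: the three-way dispatch in each optimizer's _append_optimize_op (new eager ops via _C_ops, legacy dygraph ops via _legacy_C_ops, static graph via block.append_op) loses its middle branch, and the framework._non_static_mode() / _in_legacy_dygraph() checks collapse into a plain framework.in_dygraph_mode() test. A minimal stand-alone sketch of that dispatch change, using stub callables rather than Paddle's real mode checks and ops:

    # Illustrative sketch only: the mode flags and op callables below are
    # stand-ins, not Paddle's real API.

    def append_optimize_op_before(in_dygraph, in_legacy_dygraph, eager_op, legacy_op, graph_op):
        """Pre-#49308 shape: three possible execution paths."""
        if in_dygraph:             # eager mode -> _C_ops.xxx_ style calls
            return eager_op()
        if in_legacy_dygraph:      # legacy dygraph -> _legacy_C_ops.xxx (deleted by this commit)
            return legacy_op()
        return graph_op()          # static graph -> block.append_op(...)

    def append_optimize_op_after(in_dygraph, eager_op, graph_op):
        """Post-#49308 shape: only the eager and static-graph paths remain."""
        if in_dygraph:
            return eager_op()
        else:
            return graph_op()

    if __name__ == "__main__":
        eager, legacy, graph = (lambda: "eager"), (lambda: "legacy"), (lambda: "append_op")
        assert append_optimize_op_before(False, True, eager, legacy, graph) == "legacy"
        assert append_optimize_op_after(True, eager, graph) == "eager"
        assert append_optimize_op_after(False, eager, graph) == "append_op"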
python/paddle/optimizer/adadelta.py

@@ -170,29 +170,29 @@ class Adadelta(Optimizer):
                     self._epsilon,
                 )
             return None
         else:
             if not isinstance(block, framework.Block):
                 raise TypeError("block is not instance of framework.Block.")

             # Create the adadelta optimizer op
             adadelta_op = block.append_op(
                 type=self.type,
                 inputs={
                     "Param": param_and_grad[0],
                     "Grad": param_and_grad[1],
                     "AvgSquaredGrad": avg_squared_grad_acc,
                     "AvgSquaredUpdate": avg_squared_update_acc,
                 },
                 outputs={
                     "ParamOut": param_and_grad[0],
                     "AvgSquaredGradOut": avg_squared_grad_acc,
                     "AvgSquaredUpdateOut": avg_squared_update_acc,
                 },
                 attrs={"epsilon": self._epsilon, "rho": self._rho},
                 stop_gradient=True,
             )

             return adadelta_op

     def _update_param_group(self, parameters):
         self._epsilon = parameters.get('epsilon', self._default_dict['epsilon'])
python/paddle/optimizer/adam.py

@@ -16,7 +16,7 @@ import warnings
 from collections import defaultdict

 import paddle
-from paddle import _C_ops, _legacy_C_ops
+from paddle import _C_ops
 from ..fluid import core, framework, unique_name
 from ..fluid.dygraph import base as imperative_base

@@ -393,98 +393,55 @@ class Adam(Optimizer):
             )
             return None
-        if framework._in_legacy_dygraph():
-            _beta1 = (
-                self._beta1
-                if not isinstance(self._beta1, Variable)
-                else self._beta1.numpy().item(0)
-            )
-            _beta2 = (
-                self._beta2
-                if not isinstance(self._beta2, Variable)
-                else self._beta2.numpy().item(0)
-            )
-            _, _, _, _, _, _ = _legacy_C_ops.adam(
-                param_and_grad[0],
-                param_and_grad[1],
-                lr,
-                moment1,
-                moment2,
-                beta1_pow_acc,
-                beta2_pow_acc,
-                master_weight,
-                param_and_grad[0],
-                moment1,
-                moment2,
-                beta1_pow_acc,
-                beta2_pow_acc,
-                master_weight,
-                'epsilon',
-                self._epsilon,
-                'lazy_mode',
-                self._lazy_mode,
-                'min_row_size_to_use_multithread',
-                1000,
-                'beta1',
-                _beta1,
-                'beta2',
-                _beta2,
-                'multi_precision',
-                find_master,
-            )
-            return None
         inputs = {
             "Param": [param_and_grad[0]],
             "Grad": [param_and_grad[1]],
             "LearningRate": [lr],
             "Moment1": [moment1],
             "Moment2": [moment2],
             "Beta1Pow": [beta1_pow_acc],
             "Beta2Pow": [beta2_pow_acc],
         }
         outputs = {
             "ParamOut": [param_and_grad[0]],
             "Moment1Out": [moment1],
             "Moment2Out": [moment2],
             "Beta1PowOut": [beta1_pow_acc],
             "Beta2PowOut": [beta2_pow_acc],
         }
         attrs = {
             "lazy_mode": self._lazy_mode,
             "min_row_size_to_use_multithread": 1000,
             "multi_precision": find_master,
         }
         if isinstance(self._beta1, Variable):
             inputs['Beta1Tensor'] = self._beta1
         else:
             attrs['beta1'] = self._beta1
         if isinstance(self._beta2, Variable):
             inputs['Beta2Tensor'] = self._beta2
         else:
             attrs['beta2'] = self._beta2
         if isinstance(self._epsilon, Variable):
             inputs['EpsilonTensor'] = self._epsilon
         else:
             attrs['epsilon'] = self._epsilon
         if find_master:
             inputs["MasterParam"] = master_weight
             outputs["MasterParamOut"] = master_weight
         adam_op = block.append_op(
             type=self.type,
             inputs=inputs,
             outputs=outputs,
             attrs=attrs,
             stop_gradient=True,
         )

         return adam_op

     @imperative_base.no_grad
     @framework.dygraph_only

@@ -729,55 +686,28 @@ class Adam(Optimizer):
                     else self._beta2.numpy().item(0)
                 )
-                if framework._non_static_mode():
+                if framework.in_dygraph_mode():
                     master_weight = self._master_weight_dict[key]
                     master_weight = (
                         master_weight[param_group_idx]
                         if master_weight is not None
                         else None
                     )
-                    if in_dygraph_mode():
                     _, _, _, _, _, _ = _C_ops.merged_adam_(
                         self._param_dict[key][param_group_idx],
                         grad_dict[key],
                         lr_dict[key],
                         self._moment1_dict[key][param_group_idx],
                         self._moment2_dict[key][param_group_idx],
                         self._beta1_pow_acc_dict[key][param_group_idx],
                         self._beta2_pow_acc_dict[key][param_group_idx],
                         master_weight,
                         _beta1,
                         _beta2,
                         self._epsilon,
                         find_master,
                         False,
                     )
-                    else:
-                        _, _, _, _, _, _ = _legacy_C_ops.merged_adam(
-                            self._param_dict[key][param_group_idx],
-                            grad_dict[key],
-                            lr_dict[key],
-                            self._moment1_dict[key][param_group_idx],
-                            self._moment2_dict[key][param_group_idx],
-                            self._beta1_pow_acc_dict[key][param_group_idx],
-                            self._beta2_pow_acc_dict[key][param_group_idx],
-                            master_weight,
-                            self._param_dict[key][param_group_idx],
-                            self._moment1_dict[key][param_group_idx],
-                            self._moment2_dict[key][param_group_idx],
-                            self._beta1_pow_acc_dict[key][param_group_idx],
-                            self._beta2_pow_acc_dict[key][param_group_idx],
-                            master_weight,
-                            'epsilon',
-                            self._epsilon,
-                            'beta1',
-                            _beta1,
-                            'beta2',
-                            _beta2,
-                            'multi_precision',
-                            find_master,
-                        )
                 else:
                     inputs = {
                         "Param": self._param_dict[key][param_group_idx],
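The _update_param_group context lines above belong to the parameter-group path of these optimizers. A hedged usage sketch of that public API (layer sizes, learning rates and beta values are made up); in eager mode a step like this ends up in the _C_ops.adam_ / _C_ops.merged_adam_ calls kept by this diff:

    import paddle

    net_a = paddle.nn.Linear(4, 8)
    net_b = paddle.nn.Linear(8, 1)

    # Two parameter groups; the second overrides learning_rate and beta1,
    # which is what Adam._update_param_group reads per group.
    opt = paddle.optimizer.Adam(
        learning_rate=1e-3,
        parameters=[
            {'params': net_a.parameters()},
            {'params': net_b.parameters(), 'learning_rate': 1e-4, 'beta1': 0.8},
        ],
    )

    x = paddle.rand([16, 4])
    loss = net_b(paddle.nn.functional.relu(net_a(x))).mean()
    loss.backward()
    opt.step()        # dispatches to the eager _C_ops path in dygraph mode
    opt.clear_grad()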
python/paddle/optimizer/adamax.py

@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from paddle import _C_ops, _legacy_C_ops
+from paddle import _C_ops
 from ..fluid import framework
 from ..fluid.dygraph import no_grad

@@ -210,24 +210,6 @@ class Adamax(Optimizer):
                 self._beta2,
                 self._epsilon,
             )
-        elif framework._in_legacy_dygraph():
-            _legacy_C_ops.adamax(
-                param_and_grad[0],
-                param_and_grad[1],
-                self._create_param_lr(param_and_grad),
-                moment,
-                inf_norm,
-                beta1_pow_acc,
-                param_and_grad[0],
-                moment,
-                inf_norm,
-                "beta1",
-                self._beta1,
-                "beta2",
-                self._beta2,
-                "epsilon",
-                self._epsilon,
-            )
         else:
             # create the adamax optimize op
             adamax_op = block.append_op(

@@ -271,20 +253,20 @@ class Adamax(Optimizer):
                         beta1_pow_acc, self._beta1, 0.0, True
                     )
                     beta1_pow_acc.copy_(tmp, False)
                     continue
                 else:
                     with param.block.program._optimized_guard(
                         [param, grad]
                     ), name_scope('adamax'):
                         beta1_pow_acc = self._get_accumulator(
                             self._beta1_pow_acc_str, param
                         )
                         block.append_op(
                             type="scale",
                             inputs={"X": beta1_pow_acc},
                             outputs={"Out": beta1_pow_acc},
                             attrs={"scale": self._beta1},
                             stop_gradient=True,
                         )
         else:
             for param, grad in parameters_and_grads['params']:
                 if grad is None or param.stop_gradient is True:

@@ -301,24 +283,23 @@ class Adamax(Optimizer):
                         beta1_pow_acc, self._beta1, 0.0, True
                     )
                     beta1_pow_acc.copy_(tmp, False)
                     continue
                 else:
                     with param.block.program._optimized_guard(
                         [param, grad]
                     ), name_scope('adamax'):
                         beta1_pow_acc = self._get_accumulator(
                             self._beta1_pow_acc_str, param
                         )
                         self._beta1 = parameters_and_grads.get(
                             'beta1', self._default_dict['beta1']
                         )
                         block.append_op(
                             type="scale",
                             inputs={"X": beta1_pow_acc},
                             outputs={"Out": beta1_pow_acc},
                             attrs={"scale": self._beta1},
                             stop_gradient=True,
                         )

     def _update_param_group(self, parameters):
         self._beta1 = parameters.get('beta1', self._default_dict['beta1'])
python/paddle/optimizer/adamw.py

@@ -18,7 +18,7 @@ from collections.abc import Callable
 import paddle

-from .. import _C_ops, _legacy_C_ops
+from .. import _C_ops
 from ..fluid import core, framework, unique_name
 from ..fluid.clip import GradientClipBase
 from ..fluid.dygraph import base as imperative_base

@@ -473,7 +473,7 @@ class AdamW(Optimizer):
         lr = self._create_param_lr(param_and_grad)

         # create the adamw optimize op
-        if framework._non_static_mode():
+        if framework.in_dygraph_mode():
             lr_ratio_ = (
                 1.0
                 if self._lr_ratio is None

@@ -491,126 +491,90 @@ class AdamW(Optimizer):
                 else self._beta2.numpy().item(0)
             )
-            if framework.in_dygraph_mode():
             found_inf = self._get_auxiliary_var('found_inf')
             _, _, _, _, _, _ = _C_ops.adamw_(
                 param_and_grad[0],
                 param_and_grad[1],
                 lr,
                 moment1,
                 moment2,
                 beta1_pow_acc,
                 beta2_pow_acc,
                 master_weight,
                 found_inf,
                 _beta1,
                 _beta2,
                 self._epsilon,
                 lr_ratio_,
                 self._weight_decay,
                 with_decay,
                 self._lazy_mode,
                 1000,
                 find_master,
                 False,
             )
-            else:
-                _, _, _, _, _, _ = _legacy_C_ops.adamw(
-                    param_and_grad[0],
-                    param_and_grad[1],
-                    lr,
-                    moment1,
-                    moment2,
-                    beta1_pow_acc,
-                    beta2_pow_acc,
-                    master_weight,
-                    param_and_grad[0],
-                    moment1,
-                    moment2,
-                    beta1_pow_acc,
-                    beta2_pow_acc,
-                    master_weight,
-                    'epsilon',
-                    self._epsilon,
-                    'lazy_mode',
-                    self._lazy_mode,
-                    'min_row_size_to_use_multithread',
-                    1000,
-                    'beta1',
-                    _beta1,
-                    'beta2',
-                    _beta2,
-                    "with_decay",
-                    with_decay,
-                    'coeff',
-                    self._weight_decay,
-                    'multi_precision',
-                    find_master,
-                    'lr_ratio',
-                    lr_ratio_,
-                )
             return None

         inputs = {
             "Param": [param_and_grad[0]],
             "Grad": [param_and_grad[1]],
             "LearningRate": [lr],
             "Moment1": [moment1],
             "Moment2": [moment2],
             "Beta1Pow": [beta1_pow_acc],
             "Beta2Pow": [beta2_pow_acc],
         }

         # Pass found_inf to adamw, to skip update for not only param, but also momentum and beta_pow
         found_inf = self._get_auxiliary_var('found_inf')
         if found_inf:
             inputs['SkipUpdate'] = found_inf

         outputs = {
             "ParamOut": [param_and_grad[0]],
             "Moment1Out": [moment1],
             "Moment2Out": [moment2],
             "Beta1PowOut": [beta1_pow_acc],
             "Beta2PowOut": [beta2_pow_acc],
         }
         attrs = {
             "lazy_mode": self._lazy_mode,
             "min_row_size_to_use_multithread": 1000,
             "multi_precision": find_master,
             "with_decay": with_decay,
             "coeff": self._weight_decay,
             "lr_ratio": 1.0
             if self._lr_ratio is None
             else self._lr_ratio(param_and_grad[0]),
         }
         if isinstance(self._beta1, Variable):
             inputs['Beta1Tensor'] = self._beta1
         else:
             attrs['beta1'] = self._beta1
         if isinstance(self._beta2, Variable):
             inputs['Beta2Tensor'] = self._beta2
         else:
             attrs['beta2'] = self._beta2
         if isinstance(self._epsilon, Variable):
             inputs['EpsilonTensor'] = self._epsilon
         else:
             attrs['epsilon'] = self._epsilon
         if find_master:
             inputs["MasterParam"] = master_weight
             outputs["MasterParamOut"] = master_weight

         adamw_op = block.append_op(
             type=self.type,
             inputs=inputs,
             outputs=outputs,
             attrs=attrs,
             stop_gradient=True,
         )

         return adamw_op

     def __str__(self):
         return " ".join(["Weight Decay, params:", ",".join(self._params_name)])
python/paddle/optimizer/lamb.py

@@ -13,7 +13,7 @@
 # limitations under the License.

 import paddle
-from paddle import _C_ops, _legacy_C_ops
+from paddle import _C_ops
 from paddle.fluid.executor import global_scope
 from ..fluid import core, framework, unique_name

@@ -313,76 +313,48 @@ class Lamb(Optimizer):
                 find_master,
             )
             return None
-        if framework._non_static_mode():
-            _legacy_C_ops.lamb(
-                param_and_grad[0],
-                param_and_grad[1],
-                lr,
-                moment1,
-                moment2,
-                beta1_pow_acc,
-                beta2_pow_acc,
-                master_weight,
-                param_and_grad[0],
-                moment1,
-                moment2,
-                beta1_pow_acc,
-                beta2_pow_acc,
-                master_weight,
-                'beta1',
-                self._beta1,
-                'beta2',
-                self._beta2,
-                'epsilon',
-                self._epsilon,
-                'weight_decay',
-                weight_decay,
-                'multi_precision',
-                find_master,
-            )
-            return None
         else:
             # create the lamb optimize op
             inputs = {
                 "Param": param_and_grad[0],
                 "Grad": param_and_grad[1],
                 "LearningRate": lr,
                 "Moment1": moment1,
                 "Moment2": moment2,
                 "Beta1Pow": beta1_pow_acc,
                 "Beta2Pow": beta2_pow_acc,
             }
             outputs = {
                 "ParamOut": param_and_grad[0],
                 "Moment1Out": moment1,
                 "Moment2Out": moment2,
                 "Beta1PowOut": beta1_pow_acc,
                 "Beta2PowOut": beta2_pow_acc,
             }
             attrs = {
                 "beta1": self._beta1,
                 "beta2": self._beta2,
                 "epsilon": self._epsilon,
                 "weight_decay": weight_decay,
                 "multi_precision": find_master,
             }
             if find_master:
                 inputs["MasterParam"] = master_weight
                 outputs["MasterParamOut"] = master_weight
             if found_inf:
                 inputs["SkipUpdate"] = found_inf
             lamb_op = block.append_op(
                 type=self.type,
                 inputs=inputs,
                 outputs=outputs,
                 attrs=attrs,
                 stop_gradient=True,
             )
             return lamb_op

     def _update_param_group(self, parameters):
         self._beta1 = parameters.get('beta1', self._default_dict['beta1'])
python/paddle/optimizer/lr.py

@@ -20,8 +20,6 @@ import numpy
 import paddle.fluid.core as core
-from paddle import Tensor
-from ..fluid.framework import _in_legacy_dygraph

 __all__ = [  # noqa
     'LRScheduler',
     'NoamDecay',

@@ -1395,15 +1393,8 @@ class ReduceOnPlateau(LRScheduler):
         else:
             self.last_epoch = epoch

-        if not _in_legacy_dygraph():
-            tmp = core.eager.Tensor
-        else:
-            # need to declarate explicitly
-            from paddle.framework import VarBase as Tensor
-
-            tmp = Tensor
         # loss must be float, numpy.ndarray or 1-D Tensor with shape [1]
-        if isinstance(metrics, (tmp, numpy.ndarray)):
+        if isinstance(metrics, (core.eager.Tensor, numpy.ndarray)):
             assert len(metrics.shape) == 1 and metrics.shape[0] == 1, (
                 "the metrics.shape "
                 "should be (1L,), but the current metrics.shape is {}. Maybe that "
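After this change, ReduceOnPlateau.step() type-checks its metric only against core.eager.Tensor and numpy.ndarray. A hedged usage sketch of the public scheduler API (model size, factor and patience are made-up values); passing a plain Python float sidesteps the shape check entirely:

    import paddle

    linear = paddle.nn.Linear(4, 1)
    scheduler = paddle.optimizer.lr.ReduceOnPlateau(learning_rate=0.1, factor=0.5, patience=2)
    opt = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())

    for epoch in range(5):
        x = paddle.rand([8, 4])
        loss = (linear(x) ** 2).mean()
        loss.backward()
        opt.step()
        opt.clear_grad()
        # metrics may be a float, a numpy.ndarray, or a 1-D Tensor with shape [1]
        scheduler.step(float(loss))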
python/paddle/optimizer/momentum.py

@@ -15,8 +15,8 @@
 import warnings

 import paddle
-from paddle import _C_ops, _legacy_C_ops
-from paddle.fluid.framework import _in_legacy_dygraph, in_dygraph_mode
+from paddle import _C_ops
+from paddle.fluid.framework import in_dygraph_mode
 from paddle.fluid.regularizer import L2DecayRegularizer
 from ..fluid import core, framework, unique_name

@@ -333,30 +333,6 @@ class Momentum(Optimizer):
             else None
         )

-        if _in_legacy_dygraph():
-            if isinstance(param_and_grad, dict):
-                self._update_regularization(param_and_grad['weight_decay'])
-            _, _, _ = _legacy_C_ops.momentum(
-                param_and_grad[0],
-                param_and_grad[1],
-                velocity_acc,
-                lr,
-                master_weight,
-                param_and_grad[0],
-                velocity_acc,
-                master_weight,
-                'mu',
-                self._momentum,
-                'use_nesterov',
-                self._use_nesterov,
-                'regularization_method',
-                regularization_method,
-                'regularization_coeff',
-                regularization_coeff,
-                'multi_precision',
-                find_master,
-            )
-            return None
         if in_dygraph_mode():
             if isinstance(param_and_grad, dict):
                 self._update_regularization(param_and_grad['weight_decay'])

@@ -373,42 +349,42 @@ class Momentum(Optimizer):
                 find_master,
                 self._rescale_grad,
             )
-        else:
         attrs = {
             "mu": self._momentum,
             "use_nesterov": self._use_nesterov,
             "regularization_method": regularization_method,
             "regularization_coeff": regularization_coeff,
             "multi_precision": find_master,
             "rescale_grad": self._rescale_grad,
         }

         inputs = {
             "Param": [param_and_grad[0]],
             "Grad": [param_and_grad[1]],
             "Velocity": [velocity_acc],
             "LearningRate": [lr],
         }

         outputs = {
             "ParamOut": [param_and_grad[0]],
             "VelocityOut": [velocity_acc],
         }

         if find_master:
             inputs["MasterParam"] = master_weight
             outputs["MasterParamOut"] = master_weight

         # create the momentum optimize op
         momentum_op = block.append_op(
             type=self.type,
             inputs=inputs,
             outputs=outputs,
             attrs=attrs,
             stop_gradient=True,
         )

         return momentum_op

     def _multi_tensor_init(self, target_block, parameters, param_group_idx):
         """

@@ -553,50 +529,20 @@ class Momentum(Optimizer):
                 else None
             )

-            if framework._non_static_mode():
             if in_dygraph_mode():
                 _, _, _ = _C_ops.merged_momentum_(
                     self._param_dict[key][param_group_idx],
                     grad_dict[key],
                     self._velocity_dict[key][param_group_idx],
                     lr_dict[key],
                     master_weight,
                     self._momentum,
                     self._use_nesterov,
                     self._regularization_method_dict[key][param_group_idx],
                     self._regularization_coeff_dict[key][param_group_idx],
                     find_master,
                     self._rescale_grad,
                 )
-                else:
-                    _, _, _ = _legacy_C_ops.merged_momentum(
-                        self._param_dict[key][param_group_idx],
-                        grad_dict[key],
-                        self._velocity_dict[key][param_group_idx],
-                        lr_dict[key],
-                        master_weight,
-                        self._param_dict[key][param_group_idx],
-                        self._velocity_dict[key][param_group_idx],
-                        master_weight,
-                        'mu',
-                        self._momentum,
-                        'use_nesterov',
-                        self._use_nesterov,
-                        'regularization_method',
-                        self._regularization_method_dict[key][param_group_idx],
-                        'regularization_coeff',
-                        self._regularization_coeff_dict[key][param_group_idx],
-                        'multi_precision',
-                        find_master,
-                    )
             else:
                 inputs = {
                     "Param": self._param_dict[key][param_group_idx],
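The second half of the momentum diff sits in the multi-tensor path (_append_optimize_multi_tensor), which in eager mode now always calls _C_ops.merged_momentum_. A hedged sketch of the public switch that reaches it (use_multi_tensor is the documented Momentum argument; sizes and hyper-parameters here are made up):

    import paddle

    model = paddle.nn.Sequential(
        paddle.nn.Linear(16, 32),
        paddle.nn.ReLU(),
        paddle.nn.Linear(32, 1),
    )
    # use_multi_tensor=True groups the parameters so one fused
    # (merged_momentum) update handles them all per step.
    opt = paddle.optimizer.Momentum(
        learning_rate=0.01,
        momentum=0.9,
        parameters=model.parameters(),
        use_multi_tensor=True,
    )

    x = paddle.rand([32, 16])
    loss = model(x).mean()
    loss.backward()
    opt.step()
    opt.clear_grad()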
python/paddle/optimizer/optimizer.py

@@ -18,13 +18,12 @@ from collections import defaultdict
 import numpy as np

 import paddle
-from paddle import _C_ops, _legacy_C_ops
+from paddle import _C_ops
 from paddle.fluid import core
 from paddle.fluid.framework import (
     Variable,
     _current_expected_place,
     _in_eager_without_dygraph_check,
-    _in_legacy_dygraph,
     default_main_program,
     device_guard,
     in_dygraph_mode,

@@ -534,17 +533,6 @@ class Optimizer:
                     current_lr.dtype,
                     place,
                 )
-            elif _in_legacy_dygraph():
-                _legacy_C_ops.fill_constant(
-                    current_lr,
-                    'value',
-                    float(value),
-                    'dtype',
-                    current_lr.dtype,
-                    'shape',
-                    list(current_lr.shape),
-                )
             else:
                 global_block = framework.default_main_program().global_block()
                 global_block.append_op(

@@ -1042,28 +1030,16 @@ class Optimizer:
         if self._dtype is None:
             self._dtype = loss.dtype

-        if framework._non_static_mode():
+        if framework.in_dygraph_mode():
             parameter_list = parameters if parameters else self._parameter_list
-            if framework.in_dygraph_mode():
             # It is very time-consuming to call c++ functions in a loop on the python side.
             # We put this part of the code on the c++ side to improve the speed in eager mode.
             params_grads = []
             grads = core.eager.get_all_grads(parameter_list)
             for index, grad in enumerate(grads):
                 if grad is not None:
                     params_grads.append((parameter_list[index], grad))
-            else:
-                # Keep the original code to support legacy mode.
-                # Delete the else branch when the legacy mode exits.
-                params_grads = []
-                for param in parameter_list:
-                    if param.stop_gradient:
-                        continue
-                    if param._grad_ivar() is not None:
-                        # create gradient tensor
-                        grad_var = param._grad_ivar()
-                        params_grads.append((param, grad_var))
         else:
             if callbacks is None:
                 callbacks = [error_clip_callback]

@@ -1207,28 +1183,26 @@ class Optimizer:
         if framework.in_dygraph_mode():
             return _C_ops.add_n([grad, regularization_term])
-        elif framework._in_legacy_dygraph():
-            return _legacy_C_ops.sum([grad, regularization_term])
         else:
             new_grad = grad
             if grad.type == core.VarDesc.VarType.SELECTED_ROWS:
                 # FIXME(zcd): If the grad is SELECTED_ROWS, after regularization,
                 # the grad's type and name will be changed. But the gradient's name
                 # is used in ParallelExecutor Reduce mode, so I add a flag for
                 # the new_grad here.
                 new_grad = grad.block.create_var(
                     name=grad.name + core.kNewGradSuffix(),
                     dtype=param.dtype,
                     shape=param.shape,
                     lod_level=param.lod_level,
                     type=core.VarDesc.VarType.LOD_TENSOR,
                 )

             inputs = {"X": [grad, regularization_term]}
             outputs = {"Out": [new_grad]}
             grad.block.append_op(type='sum', inputs=inputs, outputs=outputs)

             return new_grad

     def append_regularization_ops(
         self, parameters_and_grads, regularization=None
python/paddle/optimizer/sgd.py

@@ -15,11 +15,11 @@
 import warnings

 import paddle
-from paddle import _C_ops, _legacy_C_ops
+from paddle import _C_ops
 from ..fluid import core, framework, unique_name
 from ..fluid.dygraph import no_grad
-from ..fluid.framework import _in_legacy_dygraph, in_dygraph_mode
+from ..fluid.framework import in_dygraph_mode
 from ..fluid.layer_helper import LayerHelper
 from .optimizer import Optimizer

@@ -166,42 +166,32 @@ class SGD(Optimizer):
                 find_master,
             )
             return None
-        if _in_legacy_dygraph():
-            _legacy_C_ops.sgd(
-                param_and_grad[0],
-                lr,
-                param_and_grad[1],
-                master_weight,
-                param_and_grad[0],
-                master_weight,
-            )
-            return None
         else:
             assert isinstance(block, framework.Block)
             # create the optimize op
             inputs = {
                 "Param": param_and_grad[0],
                 "Grad": param_and_grad[1],
                 "LearningRate": lr,
             }

             outputs = {"ParamOut": param_and_grad[0]}

             attrs = {"multi_precision": find_master}

             if find_master:
                 inputs["MasterParam"] = master_weight
                 outputs["MasterParamOut"] = master_weight

             sgd_op = block.append_op(
                 type=self.type,
                 inputs=inputs,
                 outputs=outputs,
                 attrs=attrs,
                 stop_gradient=True,
             )

             return sgd_op

     def _update_param_group(self, parameters):
         parameters = parameters.get('params')
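All of the block.append_op(...) branches kept above are only reached under the static graph. A hedged end-to-end sketch of that mode through the public API (network shape, learning rate and feed data are made up); minimize() is what ultimately appends the optimizer op to the program's block:

    import numpy as np
    import paddle

    paddle.enable_static()

    main_prog = paddle.static.Program()
    startup_prog = paddle.static.Program()
    with paddle.static.program_guard(main_prog, startup_prog):
        x = paddle.static.data(name="x", shape=[None, 4], dtype="float32")
        y = paddle.static.nn.fc(x, size=1)
        loss = paddle.mean(y)
        sgd = paddle.optimizer.SGD(learning_rate=0.01)
        sgd.minimize(loss)  # appends the sgd op via block.append_op

    exe = paddle.static.Executor(paddle.CPUPlace())
    exe.run(startup_prog)
    (loss_val,) = exe.run(
        main_prog,
        feed={"x": np.random.rand(8, 4).astype("float32")},
        fetch_list=[loss],
    )
    print(loss_val)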