Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
53d1d0f0
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
53d1d0f0
编写于
6月 15, 2018
作者:
W
Wu Yi
提交者:
GitHub
6月 15, 2018
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
add LARS support (#10374)
上级
dd55cc16
变更
3
隐藏空白更改
内联
并排
Showing
3 changed files
with
59 additions
and
7 deletions
+59
-7
python/paddle/fluid/layers/learning_rate_scheduler.py
python/paddle/fluid/layers/learning_rate_scheduler.py
+40
-1
python/paddle/fluid/optimizer.py
python/paddle/fluid/optimizer.py
+18
-5
python/paddle/fluid/tests/book/test_recognize_digits.py
python/paddle/fluid/tests/book/test_recognize_digits.py
+1
-1
未找到文件。
python/paddle/fluid/layers/learning_rate_scheduler.py
浏览文件 @
53d1d0f0
...
...
@@ -25,10 +25,11 @@ import nn
import
ops
import
tensor
from
..initializer
import
init_on_cpu
from
..framework
import
default_main_program
,
Parameter
__all__
=
[
'exponential_decay'
,
'natural_exp_decay'
,
'inverse_time_decay'
,
'polynomial_decay'
,
'piecewise_decay'
,
'noam_decay'
'polynomial_decay'
,
'piecewise_decay'
,
'noam_decay'
,
'append_LARS'
]
...
...
@@ -261,3 +262,41 @@ def piecewise_decay(boundaries, values):
tensor
.
assign
(
last_value_var
,
lr
)
return
lr
def append_LARS(params_grads, learning_rate, weight_decay):
    """Apply LARS (Layer-wise Adaptive Rate Scaling) to per-parameter
    learning rates.

    For every ``(param, grad)`` pair, the value stored in
    ``param.optimize_attr['learning_rate']`` is overwritten in place with

    ```python
    learning_rate * param_lr * sqrt(sumsq(param))
        / (sqrt(sumsq(grad)) + weight_decay * sqrt(sumsq(param)))
    ```

    where ``param_lr`` is the value previously stored under
    ``optimize_attr['learning_rate']``. Because the previous value is read
    and then replaced, calling this function twice on the same parameters
    compounds the scaling.

    Args:
        params_grads: An iterable of ``(param, grad)`` pairs. Each ``param``
            must expose an ``optimize_attr`` mapping with a
            ``'learning_rate'`` entry. Pairs whose ``grad`` is ``None`` are
            skipped and their learning rate is left untouched.
        learning_rate: A learning rate Variable. This is the global
            learning rate for LARS.
        weight_decay: A Python `float` number that weights the parameter
            norm in the denominator.

    Returns:
        None. The result is written back to each parameter's
        ``optimize_attr['learning_rate']``.
    """

    def _balanced_weight(param_norm, grad_norm):
        # Denominator of the LARS ratio. The weight_decay == 1.0 case is
        # special-cased so no redundant multiply by 1.0 is emitted.
        if weight_decay == 1.0:
            return grad_norm + param_norm
        return grad_norm + weight_decay * param_norm

    for param, grad in params_grads:
        if grad is None:
            # Nothing to scale against: there is no gradient norm for this
            # parameter, so keep its existing local learning rate.
            continue

        param_lr = param.optimize_attr['learning_rate']
        param_norm = ops.sqrt(nn.reduce_sum(input=ops.square(param)))
        grad_norm = ops.sqrt(nn.reduce_sum(input=ops.square(grad)))

        # A plain float of exactly 1.0 is the "no local scaling" case; skip
        # the extra multiply. Anything else (another float, or a Variable)
        # is folded into the global learning rate first.
        if isinstance(param_lr, float) and param_lr == 1.0:
            base_lr = learning_rate
        else:
            base_lr = learning_rate * param_lr

        # NOTE(review): if both norms are zero the denominator is zero;
        # no epsilon is added here -- confirm whether callers rely on that.
        decayed_lr = base_lr * param_norm \
            / _balanced_weight(param_norm, grad_norm)

        # set back param local learning rate
        param.optimize_attr['learning_rate'] = decayed_lr
python/paddle/fluid/optimizer.py
浏览文件 @
53d1d0f0
...
...
@@ -13,7 +13,7 @@
# limitations under the License.
import
re
from
collections
import
defaultdict
from
paddle.fluid.framework
import
Program
from
paddle.fluid.framework
import
Program
,
Variable
import
framework
import
layers
from
backward
import
append_backward
...
...
@@ -41,7 +41,10 @@ class Optimizer(object):
but need to use one of its implementations.
"""
def
__init__
(
self
,
learning_rate
,
regularization
=
None
):
def
__init__
(
self
,
learning_rate
,
regularization
=
None
,
LARS_weight_decay
=
0.0
):
if
not
isinstance
(
learning_rate
,
float
)
and
\
not
isinstance
(
learning_rate
,
framework
.
Variable
):
raise
TypeError
(
"learning rate should be float or Variable"
)
...
...
@@ -61,6 +64,7 @@ class Optimizer(object):
# {accum_name : { parameter_name : accumulator_for_parameter, ...}, ...}
self
.
_accumulators
=
defaultdict
(
lambda
:
dict
())
self
.
helper
=
None
self
.
_LARS_weight_decay
=
LARS_weight_decay
def
_create_global_learning_rate
(
self
):
lr
=
self
.
global_learning_rate
()
...
...
@@ -100,10 +104,15 @@ class Optimizer(object):
# create learning rate variable for every parameter
param
=
param_and_grad
[
0
]
param_lr
=
param
.
optimize_attr
[
'learning_rate'
]
if
param_lr
==
1.0
:
return
self
.
global_learning_rate
()
if
type
(
param_lr
)
==
Variable
:
# param learning rate has been updated (LARS)
print
(
"returns updated param lr "
,
param_lr
)
return
param_lr
else
:
return
self
.
global_learning_rate
()
*
param_lr
if
param_lr
==
1.0
:
return
self
.
global_learning_rate
()
else
:
return
self
.
global_learning_rate
()
*
param_lr
def
_create_accumulators
(
self
,
block
,
parameters
):
"""Create all accumulators needed by the parameters
...
...
@@ -210,6 +219,10 @@ class Optimizer(object):
self
.
_create_accumulators
(
loss
.
block
,
[
p
[
0
]
for
p
in
parameters_and_grads
])
self
.
_create_global_learning_rate
()
if
self
.
_LARS_weight_decay
>
0.0
:
layers
.
append_LARS
(
parameters_and_grads
,
self
.
global_learning_rate
(),
self
.
_LARS_weight_decay
)
optimize_ops
=
[]
for
param_and_grad
in
parameters_and_grads
:
...
...
python/paddle/fluid/tests/book/test_recognize_digits.py
浏览文件 @
53d1d0f0
...
...
@@ -94,7 +94,7 @@ def train(nn_type,
test_program
=
fluid
.
default_main_program
().
clone
(
for_test
=
True
)
optimizer
=
fluid
.
optimizer
.
Adam
(
learning_rate
=
0.001
)
optimizer
=
fluid
.
optimizer
.
Adam
(
learning_rate
=
0.001
,
LARS_weight_decay
=
0.3
)
optimizer
.
minimize
(
avg_loss
)
place
=
fluid
.
CUDAPlace
(
0
)
if
use_cuda
else
fluid
.
CPUPlace
()
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录