Unverified commit 53d1d0f0, authored by Wu Yi and committed by GitHub

add LARS support (#10374)

Parent dd55cc16
...@@ -25,10 +25,11 @@ import nn
import ops
import tensor
from ..initializer import init_on_cpu
+from ..framework import default_main_program, Parameter

__all__ = [
    'exponential_decay', 'natural_exp_decay', 'inverse_time_decay',
-    'polynomial_decay', 'piecewise_decay', 'noam_decay'
+    'polynomial_decay', 'piecewise_decay', 'noam_decay', 'append_LARS'
]
...@@ -261,3 +262,41 @@ def piecewise_decay(boundaries, values):
                tensor.assign(last_value_var, lr)

    return lr
+
+
+def append_LARS(params_grads, learning_rate, weight_decay):
+    """Applies LARS (LAYER-WISE ADAPTIVE RATE SCALING) to learning rate for
+       each layer.
+
+    ```python
+        learning_rate *= local_gw_ratio * sqrt(sumsq(param))
+                        / (sqrt(sumsq(gradient))+ weight_decay * sqrt(sumsq(param)))
+    ```
+
+    Args:
+        learning_rate: A learning rate Variable. This
+          is the global learning rate for LARS.
+        weight_decay: A Python `float` number.
+
+    Returns:
+        The decayed learning rate
+    """
+
+    def _balanced_weight(param_norm, grad_norm):
+        if weight_decay == 1.0:
+            return grad_norm + param_norm
+        else:
+            return grad_norm + weight_decay * param_norm
+
+    for param, grad in params_grads:
+        param_lr = param.optimize_attr['learning_rate']
+        param_norm = ops.sqrt(nn.reduce_sum(input=ops.square(param)))
+        grad_norm = ops.sqrt(nn.reduce_sum(input=ops.square(grad)))
+        if type(param_lr) == float and param_lr == 1.0:
+            decayed_lr = learning_rate * param_norm \
+                / _balanced_weight(param_norm, grad_norm)
+        else:
+            decayed_lr = learning_rate * param_lr * param_norm \
+                / _balanced_weight(param_norm, grad_norm)
+        # set back param local learning rate
+        param.optimize_attr['learning_rate'] = decayed_lr
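For intuition, here is a minimal standalone NumPy sketch of the per-layer scaling that `append_LARS` builds into the program above. It only illustrates the docstring formula (with `local_gw_ratio` taken as 1, matching the code); it is not part of this commit, and the helper name `lars_scaled_lr` is made up for the example.

```python
import numpy as np

def lars_scaled_lr(global_lr, param, grad, weight_decay):
    """Layer-local learning rate under the LARS rule sketched above."""
    param_norm = np.sqrt(np.sum(np.square(param)))
    grad_norm = np.sqrt(np.sum(np.square(grad)))
    # global_lr * ||param|| / (||grad|| + weight_decay * ||param||)
    return global_lr * param_norm / (grad_norm + weight_decay * param_norm)

# A layer whose gradient norm is large relative to its weight norm
# gets a proportionally smaller effective learning rate.
param = np.full((256, 256), 0.01, dtype=np.float32)
grad = np.full((256, 256), 0.1, dtype=np.float32)
print(lars_scaled_lr(0.1, param, grad, weight_decay=0.0005))  # ~0.01
```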
...@@ -13,7 +13,7 @@
# limitations under the License.
import re
from collections import defaultdict
-from paddle.fluid.framework import Program
+from paddle.fluid.framework import Program, Variable
import framework
import layers
from backward import append_backward
...@@ -41,7 +41,10 @@ class Optimizer(object):
    but need to use one of it's implementation.
    """

-    def __init__(self, learning_rate, regularization=None):
+    def __init__(self,
+                 learning_rate,
+                 regularization=None,
+                 LARS_weight_decay=0.0):
        if not isinstance(learning_rate, float) and \
                not isinstance(learning_rate, framework.Variable):
            raise TypeError("learning rate should be float or Variable")
...@@ -61,6 +64,7 @@ class Optimizer(object):
        # {accum_name : { paramter_name : accumulator_for_parameter, ...}, ...}
        self._accumulators = defaultdict(lambda: dict())
        self.helper = None
+        self._LARS_weight_decay = LARS_weight_decay

    def _create_global_learning_rate(self):
        lr = self.global_learning_rate()
...@@ -100,6 +104,11 @@ class Optimizer(object):
        # create learning rate variable for every parameter
        param = param_and_grad[0]
        param_lr = param.optimize_attr['learning_rate']
-        if param_lr == 1.0:
-            return self.global_learning_rate()
-        else:
+        if type(param_lr) == Variable:
+            # param learning rate has been updated (LARS)
+            print("returns updated param lr ", param_lr)
+            return param_lr
+        else:
+            if param_lr == 1.0:
+                return self.global_learning_rate()
+            else:
...@@ -210,6 +219,10 @@ class Optimizer(object):
        self._create_accumulators(loss.block,
                                  [p[0] for p in parameters_and_grads])
        self._create_global_learning_rate()
+        if self._LARS_weight_decay > 0.0:
+            layers.append_LARS(parameters_and_grads,
+                               self.global_learning_rate(),
+                               self._LARS_weight_decay)

        optimize_ops = []
        for param_and_grad in parameters_and_grads:
...
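With the hook above, LARS is opt-in: passing a positive `LARS_weight_decay` to an optimizer makes `minimize` rewrite each parameter's `optimize_attr['learning_rate']` into a Variable via `layers.append_LARS`. A minimal usage sketch, mirroring the test change below and assuming `avg_loss` is the loss of a network already built in the default main program:

```python
import paddle.fluid as fluid

# Assumes `avg_loss` was produced by a previously constructed network.
optimizer = fluid.optimizer.Adam(learning_rate=0.001, LARS_weight_decay=0.3)
optimizer.minimize(avg_loss)  # LARS path taken because LARS_weight_decay > 0.0
```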
...@@ -94,7 +94,7 @@ def train(nn_type,
    test_program = fluid.default_main_program().clone(for_test=True)

-    optimizer = fluid.optimizer.Adam(learning_rate=0.001)
+    optimizer = fluid.optimizer.Adam(learning_rate=0.001, LARS_weight_decay=0.3)
    optimizer.minimize(avg_loss)

    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
...