Unverified · Commit 53d1d0f0 authored by Wu Yi, committed by GitHub

add LARS support (#10374)

Parent dd55cc16
@@ -25,10 +25,11 @@ import nn
import ops
import tensor
from ..initializer import init_on_cpu
from ..framework import default_main_program, Parameter
__all__ = [
    'exponential_decay', 'natural_exp_decay', 'inverse_time_decay',
    'polynomial_decay', 'piecewise_decay', 'noam_decay'
    'polynomial_decay', 'piecewise_decay', 'noam_decay', 'append_LARS'
]
@@ -261,3 +262,41 @@ def piecewise_decay(boundaries, values):
                tensor.assign(last_value_var, lr)

    return lr
def append_LARS(params_grads, learning_rate, weight_decay):
    """Applies LARS (LAYER-WISE ADAPTIVE RATE SCALING) to the learning rate of
    each layer.

    ```python
    learning_rate *= local_gw_ratio * sqrt(sumsq(param))
                     / (sqrt(sumsq(gradient)) + weight_decay * sqrt(sumsq(param)))
    ```

    Args:
        params_grads: A list of (parameter, gradient) Variable pairs.
        learning_rate: A learning rate Variable. This
            is the global learning rate for LARS.
        weight_decay: A Python `float` number.

    Returns:
        The decayed learning rate
    """

    def _balanced_weight(param_norm, grad_norm):
        if weight_decay == 1.0:
            return grad_norm + param_norm
        else:
            return grad_norm + weight_decay * param_norm

    for param, grad in params_grads:
        param_lr = param.optimize_attr['learning_rate']
        param_norm = ops.sqrt(nn.reduce_sum(input=ops.square(param)))
        grad_norm = ops.sqrt(nn.reduce_sum(input=ops.square(grad)))
        if type(param_lr) == float and param_lr == 1.0:
            decayed_lr = learning_rate * param_norm \
                         / _balanced_weight(param_norm, grad_norm)
        else:
            decayed_lr = learning_rate * param_lr * param_norm \
                         / _balanced_weight(param_norm, grad_norm)
        # set back param local learning rate
        param.optimize_attr['learning_rate'] = decayed_lr
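The scaling above is easy to check with plain numbers. The sketch below is not part of the commit; it uses NumPy instead of the fluid ops, and the tensor shapes and the 0.001/0.3 constants are arbitrary illustration values. It computes the same decayed learning rate that `append_LARS` builds with `ops.sqrt`/`nn.reduce_sum` for one parameter whose local `learning_rate` attribute is 1.0:

```python
import numpy as np

# Arbitrary example values, chosen only for illustration.
param = np.random.randn(256, 128).astype('float32')
grad = 0.01 * np.random.randn(256, 128).astype('float32')
base_lr = 0.001          # global learning rate
weight_decay = 0.3       # LARS weight decay

param_norm = np.sqrt(np.sum(np.square(param)))   # sqrt(sumsq(param))
grad_norm = np.sqrt(np.sum(np.square(grad)))     # sqrt(sumsq(grad))

# Same formula append_LARS uses when the per-parameter lr multiplier is 1.0:
#   lr * ||param|| / (||grad|| + weight_decay * ||param||)
decayed_lr = base_lr * param_norm / (grad_norm + weight_decay * param_norm)
print(decayed_lr)
```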
@@ -13,7 +13,7 @@
# limitations under the License.
import re
from collections import defaultdict
from paddle.fluid.framework import Program
from paddle.fluid.framework import Program, Variable
import framework
import layers
from backward import append_backward
@@ -41,7 +41,10 @@ class Optimizer(object):
    but need to use one of its implementations.
    """
    def __init__(self, learning_rate, regularization=None):
    def __init__(self,
                 learning_rate,
                 regularization=None,
                 LARS_weight_decay=0.0):
        if not isinstance(learning_rate, float) and \
           not isinstance(learning_rate, framework.Variable):
            raise TypeError("learning rate should be float or Variable")
@@ -61,6 +64,7 @@ class Optimizer(object):
        # {accum_name : { parameter_name : accumulator_for_parameter, ...}, ...}
        self._accumulators = defaultdict(lambda: dict())
        self.helper = None
        self._LARS_weight_decay = LARS_weight_decay

    def _create_global_learning_rate(self):
        lr = self.global_learning_rate()
@@ -100,6 +104,11 @@ class Optimizer(object):
        # create learning rate variable for every parameter
        param = param_and_grad[0]
        param_lr = param.optimize_attr['learning_rate']
        if type(param_lr) == Variable:
            # param learning rate has been updated (LARS)
            print("returns updated param lr ", param_lr)
            return param_lr
        else:
            if param_lr == 1.0:
                return self.global_learning_rate()
            else:
@@ -210,6 +219,10 @@ class Optimizer(object):
        self._create_accumulators(loss.block,
                                  [p[0] for p in parameters_and_grads])
        self._create_global_learning_rate()

        if self._LARS_weight_decay > 0.0:
            layers.append_LARS(parameters_and_grads,
                               self.global_learning_rate(),
                               self._LARS_weight_decay)

        optimize_ops = []
        for param_and_grad in parameters_and_grads:
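Taken together, the optimizer changes mean that a positive `LARS_weight_decay` makes `minimize()` call `layers.append_LARS` before the per-parameter update ops are created, so each parameter's `optimize_attr['learning_rate']` becomes a Variable that `_create_param_lr` then returns directly. A minimal usage sketch, assuming the fluid API of this revision (the network and the 0.001/0.3 constants are placeholders, mirroring the test change below):

```python
import paddle.fluid as fluid

# A tiny classifier, only to produce a loss Variable to minimize.
image = fluid.layers.data(name='image', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
prediction = fluid.layers.fc(input=image, size=10, act='softmax')
loss = fluid.layers.cross_entropy(input=prediction, label=label)
avg_loss = fluid.layers.mean(loss)

# LARS_weight_decay > 0.0 enables the per-layer scaling added in this commit.
optimizer = fluid.optimizer.Adam(learning_rate=0.001, LARS_weight_decay=0.3)
optimizer.minimize(avg_loss)
```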
@@ -94,7 +94,7 @@ def train(nn_type,
    test_program = fluid.default_main_program().clone(for_test=True)

    optimizer = fluid.optimizer.Adam(learning_rate=0.001)
    optimizer = fluid.optimizer.Adam(learning_rate=0.001, LARS_weight_decay=0.3)
    optimizer.minimize(avg_loss)

    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()