diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index c0df4c2dc6cd46a90d4205b26123fb6c44fcaf88..0bd950d5f6c15c2bf85b7f4aabfd0dfa194d8fd0 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -361,7 +361,6 @@ paddle.fluid.layers.inverse_time_decay (ArgSpec(args=['learning_rate', 'decay_st
 paddle.fluid.layers.polynomial_decay (ArgSpec(args=['learning_rate', 'decay_steps', 'end_learning_rate', 'power', 'cycle'], varargs=None, keywords=None, defaults=(0.0001, 1.0, False)), ('document', '882634f420f626642f0874481263da40'))
 paddle.fluid.layers.piecewise_decay (ArgSpec(args=['boundaries', 'values'], varargs=None, keywords=None, defaults=None), ('document', 'c717d9d1d78a53c809d01b8bc56f3cae'))
 paddle.fluid.layers.noam_decay (ArgSpec(args=['d_model', 'warmup_steps'], varargs=None, keywords=None, defaults=None), ('document', 'd9a95746353fd574be36dc28d8726c28'))
-paddle.fluid.layers.append_LARS (ArgSpec(args=['params_grads', 'learning_rate', 'weight_decay'], varargs=None, keywords=None, defaults=None), ('document', 'd24fa1e7d62ac8a534fc6a86002f84f8'))
 paddle.fluid.layers.cosine_decay (ArgSpec(args=['learning_rate', 'step_each_epoch', 'epochs'], varargs=None, keywords=None, defaults=None), ('document', '9588c64c26ffaef3c466e404a6af9d9b'))
 paddle.fluid.layers.linear_lr_warmup (ArgSpec(args=['learning_rate', 'warmup_steps', 'start_lr', 'end_lr'], varargs=None, keywords=None, defaults=None), ('document', '2ef3f5ca5cd71ea4217c418e5a7a0565'))
 paddle.fluid.contrib.InitState.__init__ (ArgSpec(args=['self', 'init', 'shape', 'value', 'init_boot', 'need_reorder', 'dtype'], varargs=None, keywords=None, defaults=(None, None, 0.0, None, False, 'float32')), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py
index b7d1eeba80d93d549a019455087bb7cc1d2a1083..d5be9039c6d6fa2c3e9059684ecabd383581668c 100644
--- a/python/paddle/fluid/layers/learning_rate_scheduler.py
+++ b/python/paddle/fluid/layers/learning_rate_scheduler.py
@@ -35,8 +35,8 @@ from ..dygraph import learning_rate_scheduler as imperate_lr

 __all__ = [
     'exponential_decay', 'natural_exp_decay', 'inverse_time_decay',
-    'polynomial_decay', 'piecewise_decay', 'noam_decay', 'append_LARS',
-    'cosine_decay', 'linear_lr_warmup'
+    'polynomial_decay', 'piecewise_decay', 'noam_decay', 'cosine_decay',
+    'linear_lr_warmup'
 ]


@@ -381,50 +381,6 @@ def cosine_decay(learning_rate, step_each_epoch, epochs):
     return decayed_lr


-def append_LARS(params_grads, learning_rate, weight_decay):
-    """
-    Applies LARS (LAYER-WISE ADAPTIVE RATE SCALING) to learning rate for
-    each layer.
-
-    Args:
-        learning_rate: A learning rate Variable. This
-            is the global learning rate for LARS.
-        weight_decay: A Python `float` number.
-
-    Returns:
-        The decayed learning rate
-    Examples:
-        .. code-block:: python
-
-            learning_rate *= local_gw_ratio * sqrt(sumsq(param))
-                / (sqrt(sumsq(gradient))+ weight_decay * sqrt(sumsq(param)))
-    """
-
-    assert not imperative_base.enabled(
-    ), "append_LARS is NOT supported in dygraph mode now"
-
-    def _balanced_weight(param_norm, grad_norm):
-        if weight_decay == 1.0:
-            return grad_norm + param_norm
-        else:
-            return grad_norm + weight_decay * param_norm
-
-    for param, grad in params_grads:
-        with param.block.program.optimized_guard(
-                [param, grad]), name_scope("optimizer"):
-            param_lr = param.optimize_attr['learning_rate']
-            param_norm = ops.sqrt(nn.reduce_sum(input=ops.square(param)))
-            grad_norm = ops.sqrt(nn.reduce_sum(input=ops.square(grad)))
-            if type(param_lr) == float and param_lr == 1.0:
-                decayed_lr = learning_rate * param_norm \
-                    / _balanced_weight(param_norm, grad_norm)
-            else:
-                decayed_lr = learning_rate * param_lr * param_norm \
-                    / _balanced_weight(param_norm, grad_norm)
-            # set back param local learning rate
-            param.optimize_attr['learning_rate'] = decayed_lr
-
-
 def linear_lr_warmup(learning_rate, warmup_steps, start_lr, end_lr):
     """
     Applies linear learning rate warmup before the normal learning rate
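
Note: for reference, the removed append_LARS helper rescaled the global learning rate per layer as learning_rate * ||param|| / (||grad|| + weight_decay * ||param||), matching the formula in its docstring. The snippet below is a minimal NumPy sketch of that scaling for illustration only; the function name and tensor shapes are hypothetical and it is not part of the fluid API.

import numpy as np

def lars_scaled_lr(param, grad, learning_rate, weight_decay):
    # Per-layer LARS scaling as computed by the removed helper:
    # lr * ||param|| / (||grad|| + weight_decay * ||param||)
    param_norm = np.sqrt(np.sum(np.square(param)))
    grad_norm = np.sqrt(np.sum(np.square(grad)))
    return learning_rate * param_norm / (grad_norm + weight_decay * param_norm)

# Example: per-layer learning rates for two layers under a global LR of 0.1.
params = [np.ones((64, 64)), np.ones(64)]
grads = [np.full((64, 64), 0.01), np.full(64, 0.01)]
per_layer_lrs = [lars_scaled_lr(p, g, 0.1, 0.0005) for p, g in zip(params, grads)]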