From 0ca6274451d3693f363f2b8b5d6b29ce722febaf Mon Sep 17 00:00:00 2001 From: dzhwinter Date: Mon, 11 Dec 2017 23:15:35 +0800 Subject: [PATCH] "add global regularization" (#6443) * "add global regularization" * Polish `append_regularization_ops` --- python/paddle/v2/fluid/optimizer.py | 38 +++++++++++---------------- python/paddle/v2/fluid/regularizer.py | 15 ++++++++--- 2 files changed, 27 insertions(+), 26 deletions(-) diff --git a/python/paddle/v2/fluid/optimizer.py b/python/paddle/v2/fluid/optimizer.py index 719e3b2563..bbdfab2df9 100644 --- a/python/paddle/v2/fluid/optimizer.py +++ b/python/paddle/v2/fluid/optimizer.py @@ -18,8 +18,9 @@ class Optimizer(object): but need to use one of it's implementation. """ - def __init__(self, global_step=None): + def __init__(self, global_step=None, regularization=None): self._global_step = global_step + self.regularization = regularization # Dictionary of accumulators. Some optimizer subclasses need to # allocate and manage extra variables associated with the parameters # to train. These variables are called accumulators. @@ -199,7 +200,8 @@ class Optimizer(object): """ params_grads = append_backward_ops(loss, parameter_list, no_grad_set) # Add regularization if any - params_grads = append_regularization_ops(params_grads) + params_grads = append_regularization_ops(params_grads, + self.regularization) optimize_ops = self.create_optimization_pass(params_grads, loss, startup_program) return optimize_ops @@ -209,9 +211,9 @@ class SGDOptimizer(Optimizer): """ Simple SGD optimizer without any state. 
""" - def __init__(self, learning_rate, global_step=None): + def __init__(self, learning_rate, **kwargs): assert learning_rate is not None - super(SGDOptimizer, self).__init__(global_step) + super(SGDOptimizer, self).__init__(**kwargs) self.type = "sgd" self._learning_rate = learning_rate @@ -236,14 +238,10 @@ class MomentumOptimizer(Optimizer): """ _velocity_acc_str = "velocity" - def __init__(self, - learning_rate, - momentum, - use_nesterov=False, - global_step=None): + def __init__(self, learning_rate, momentum, use_nesterov=False, **kwargs): assert learning_rate is not None assert momentum is not None - super(MomentumOptimizer, self).__init__(global_step) + super(MomentumOptimizer, self).__init__(**kwargs) self.type = "momentum" self._learning_rate = learning_rate self._momentum = momentum @@ -284,10 +282,10 @@ class AdagradOptimizer(Optimizer): """ _moment_acc_str = "moment" - def __init__(self, learning_rate, epsilon=1.0e-6, global_step=None): + def __init__(self, learning_rate, epsilon=1.0e-6, **kwargs): assert learning_rate is not None assert epsilon is not None - super(AdagradOptimizer, self).__init__(global_step) + super(AdagradOptimizer, self).__init__(**kwargs) self.type = "adagrad" self._learning_rate = learning_rate self._epsilon = epsilon @@ -331,12 +329,12 @@ class AdamOptimizer(Optimizer): beta1=0.9, beta2=0.999, epsilon=1e-8, - global_step=None): + **kwargs): assert learning_rate is not None assert beta1 is not None assert beta2 is not None assert epsilon is not None - super(AdamOptimizer, self).__init__(global_step) + super(AdamOptimizer, self).__init__(**kwargs) self.type = "adam" self._learning_rate = learning_rate self._beta1 = beta1 @@ -436,12 +434,12 @@ class AdamaxOptimizer(Optimizer): beta1=0.9, beta2=0.999, epsilon=1e-8, - global_step=None): + **kwargs): assert learning_rate is not None assert beta1 is not None assert beta2 is not None assert epsilon is not None - super(AdamaxOptimizer, self).__init__() + super(AdamaxOptimizer, 
self).__init__(**kwargs) self.type = "adamax" self._learning_rate = learning_rate self._beta1 = beta1 @@ -514,16 +512,12 @@ class DecayedAdagradOptimizer(Optimizer): """ _moment_acc_str = "moment" - def __init__(self, - learning_rate, - decay=0.95, - epsilon=1.0e-6, - global_step=None): + def __init__(self, learning_rate, decay=0.95, epsilon=1.0e-6, **kwargs): assert learning_rate is not None assert decay is not None assert epsilon is not None - super(DecayedAdagradOptimizer, self).__init__(global_step) + super(DecayedAdagradOptimizer, self).__init__(**kwargs) self.type = "decayed_adagrad" self._learning_rate = learning_rate self._decay = decay diff --git a/python/paddle/v2/fluid/regularizer.py b/python/paddle/v2/fluid/regularizer.py index bb1ac8911e..d1955b0047 100644 --- a/python/paddle/v2/fluid/regularizer.py +++ b/python/paddle/v2/fluid/regularizer.py @@ -3,7 +3,7 @@ import framework __all__ = ['append_regularization_ops', 'L1Decay', 'L2Decay'] -def append_regularization_ops(parameters_and_grads): +def append_regularization_ops(parameters_and_grads, regularization=None): """Create and add backward regularization Operators Creates and adds backward regularization operators in the BlockDesc. @@ -14,6 +14,8 @@ def append_regularization_ops(parameters_and_grads): Args: parameters_and_grads: A list of (parameters, gradients) pairs that need to be regularized. + regularization: A global regularizer. It is applied to every + parameter that does not have its own regularizer set. 
Returns: list of (parameters, gradients) pair with the regularized gradient @@ -23,14 +25,19 @@ def append_regularization_ops(parameters_and_grads): """ params_and_grads = [] for param, grad in parameters_and_grads: + regularization_term = None + if param.regularizer is not None: + # Add variable for regularization term in grad block + regularization_term = param.regularizer(param, grad.block) + elif regularization is not None: + regularization_term = regularization(param, grad.block) + # If no gradient or no regularization specified, # then we don't need to do anything - if grad is None or param.regularizer is None: + if grad is None or regularization_term is None: params_and_grads.append((param, grad)) continue - # Add variable for regularization term in grad block - regularization_term = param.regularizer(param, grad.block) assert grad.shape == regularization_term.shape grad.block.append_op( -- GitLab