diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index ef7b16a19e10a28bd1cc34496fb908580c5d7330..3c8c83560de254d381c03341d2ad74a8e7bd5887 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -43,11 +43,7 @@ class Optimizer(object): but need to use one of it's implementation. """ - def __init__(self, - learning_rate, - regularization=None, - LARS_weight_decay=0.0, - name=None): + def __init__(self, learning_rate, regularization=None, name=None): if not isinstance(learning_rate, float) and \ not isinstance(learning_rate, framework.Variable): raise TypeError("learning rate should be float or Variable") @@ -68,7 +64,6 @@ class Optimizer(object): # {accum_name : { paramter_name : accumulator_for_parameter, ...}, ...} self._accumulators = defaultdict(lambda: dict()) self.helper = None - self._LARS_weight_decay = LARS_weight_decay def _create_global_learning_rate(self): lr = self._global_learning_rate() @@ -227,10 +222,6 @@ class Optimizer(object): self._create_accumulators(loss.block, [p[0] for p in parameters_and_grads]) self._create_global_learning_rate() - if self._LARS_weight_decay > 0.0: - layers.append_LARS(parameters_and_grads, - self._global_learning_rate(), - self._LARS_weight_decay) optimize_ops = [] for param_and_grad in parameters_and_grads: @@ -287,6 +278,9 @@ class SGDOptimizer(Optimizer): Args: learning_rate (float|Variable): the learning rate used to update parameters. \ Can be a float value or a Variable with one float value as data element. + regularization: A Regularizer, such as + fluid.regularizer.L2DecayRegularizer. + name: A optional name prefix. Examples: .. code-block:: python @@ -295,10 +289,12 @@ class SGDOptimizer(Optimizer): sgd_optimizer.minimize(cost) """ - def __init__(self, learning_rate, **kwargs): + def __init__(self, learning_rate, regularization=None, name=None): assert learning_rate is not None super(SGDOptimizer, self).__init__( - learning_rate=learning_rate, **kwargs) + learning_rate=learning_rate, + regularization=regularization, + name=name) self.type = "sgd" def _append_optimize_op(self, block, param_and_grad): @@ -343,6 +339,9 @@ class MomentumOptimizer(Optimizer): Can be a float value or a Variable with one float value as data element. momentum (float): momentum factor use_nesterov (bool): enables Nesterov momentum + regularization: A Regularizer, such as + fluid.regularizer.L2DecayRegularizer. + name: A optional name prefix. Examples: .. code-block:: python @@ -352,11 +351,18 @@ class MomentumOptimizer(Optimizer): """ _velocity_acc_str = "velocity" - def __init__(self, learning_rate, momentum, use_nesterov=False, **kwargs): + def __init__(self, + learning_rate, + momentum, + use_nesterov=False, + regularization=None, + name=None): assert learning_rate is not None assert momentum is not None super(MomentumOptimizer, self).__init__( - learning_rate=learning_rate, **kwargs) + learning_rate=learning_rate, + regularization=regularization, + name=name) self.type = "momentum" self._momentum = momentum self._use_nesterov = bool(use_nesterov) @@ -412,6 +418,9 @@ class AdagradOptimizer(Optimizer): learning_rate (float|Variable): the learning rate used to update parameters. \ Can be a float value or a Variable with one float value as data element. epsilon (float): a small float value for numerical stability. + regularization: A Regularizer, such as + fluid.regularizer.L2DecayRegularizer. + name: A optional name prefix. Examples: .. code-block:: python @@ -421,11 +430,17 @@ class AdagradOptimizer(Optimizer): """ _moment_acc_str = "moment" - def __init__(self, learning_rate, epsilon=1.0e-6, **kwargs): + def __init__(self, + learning_rate, + epsilon=1.0e-6, + regularization=None, + name=None): assert learning_rate is not None assert epsilon is not None super(AdagradOptimizer, self).__init__( - learning_rate=learning_rate, **kwargs) + learning_rate=learning_rate, + regularization=regularization, + name=name) self.type = "adagrad" self._epsilon = epsilon @@ -485,6 +500,9 @@ class AdamOptimizer(Optimizer): beta1 (float): The exponential decay rate for the 1st moment estimates. beta2 (float): The exponential decay rate for the 2nd moment estimates. epsilon (float): a small float value for numerical stability. + regularization: A Regularizer, such as + fluid.regularizer.L2DecayRegularizer. + name: A optional name prefix. Examples: .. code-block:: python @@ -503,13 +521,16 @@ class AdamOptimizer(Optimizer): beta1=0.9, beta2=0.999, epsilon=1e-8, - **kwargs): + regularization=None, + name=None): assert learning_rate is not None assert beta1 is not None assert beta2 is not None assert epsilon is not None super(AdamOptimizer, self).__init__( - learning_rate=learning_rate, **kwargs) + learning_rate=learning_rate, + regularization=regularization, + name=name) self.type = "adam" self._beta1 = beta1 self._beta2 = beta2 @@ -629,6 +650,9 @@ class AdamaxOptimizer(Optimizer): beta1 (float): The exponential decay rate for the 1st moment estimates. beta2 (float): The exponential decay rate for the 2nd moment estimates. epsilon (float): a small float value for numerical stability. + regularization: A Regularizer, such as + fluid.regularizer.L2DecayRegularizer. + name: A optional name prefix. Examples: .. code-block:: python @@ -645,13 +669,16 @@ class AdamaxOptimizer(Optimizer): beta1=0.9, beta2=0.999, epsilon=1e-8, - **kwargs): + regularization=None, + name=None): assert learning_rate is not None assert beta1 is not None assert beta2 is not None assert epsilon is not None super(AdamaxOptimizer, self).__init__( - learning_rate=learning_rate, **kwargs) + learning_rate=learning_rate, + regularization=regularization, + name=name) self.type = "adamax" self._beta1 = beta1 self._beta2 = beta2 @@ -742,6 +769,9 @@ class DecayedAdagradOptimizer(Optimizer): Can be a float value or a Variable with one float value as data element. decay (float): decay rate. epsilon (float): a small float value for numerical stability. + regularization: A Regularizer, such as + fluid.regularizer.L2DecayRegularizer. + name: A optional name prefix. Examples: .. code-block:: python @@ -751,13 +781,20 @@ class DecayedAdagradOptimizer(Optimizer): """ _moment_acc_str = "moment" - def __init__(self, learning_rate, decay=0.95, epsilon=1.0e-6, **kwargs): + def __init__(self, + learning_rate, + decay=0.95, + epsilon=1.0e-6, + regularization=None, + name=None): assert learning_rate is not None assert decay is not None assert epsilon is not None super(DecayedAdagradOptimizer, self).__init__( - learning_rate=learning_rate, **kwargs) + learning_rate=learning_rate, + regularization=regularization, + name=name) self.type = "decayed_adagrad" self._decay = decay self._epsilon = epsilon @@ -811,6 +848,9 @@ class AdadeltaOptimizer(Optimizer): learning_rate(float): global learning rate rho(float): rho in equation epsilon(float): epsilon in equation + regularization: A Regularizer, such as + fluid.regularizer.L2DecayRegularizer. + name: A optional name prefix. Examples: .. code-block:: python @@ -823,7 +863,12 @@ class AdadeltaOptimizer(Optimizer): _avg_squared_grad_acc_str = "_avg_squared_grad" _avg_squared_update_acc_str = "_avg_squared_update" - def __init__(self, learning_rate, epsilon=1.0e-6, rho=0.95, **kwargs): + def __init__(self, + learning_rate, + epsilon=1.0e-6, + rho=0.95, + regularization=None, + name=None): if learning_rate is None: raise ValueError("learning_rate is not set.") if epsilon is None: @@ -831,7 +876,9 @@ class AdadeltaOptimizer(Optimizer): if rho is None: raise ValueError("rho is not set.") super(AdadeltaOptimizer, self).__init__( - learning_rate=learning_rate, **kwargs) + learning_rate=learning_rate, + regularization=regularization, + name=name) self.type = "adadelta" self._epsilon = epsilon self._rho = rho @@ -932,6 +979,9 @@ class RMSPropOptimizer(Optimizer): the gradient; if False, by the uncentered second moment. Setting this to True may help with training, but is slightly more expensive in terms of computation and memory. Defaults to False. + regularization: A Regularizer, such as + fluid.regularizer.L2DecayRegularizer. + name: A optional name prefix. Raises: ValueError: If learning_rate, rho, epsilon, momentum are None. @@ -953,9 +1003,12 @@ class RMSPropOptimizer(Optimizer): epsilon=1.0e-6, momentum=0.0, centered=False, - **kwargs): + regularization=None, + name=None): super(RMSPropOptimizer, self).__init__( - learning_rate=learning_rate, **kwargs) + learning_rate=learning_rate, + regularization=regularization, + name=name) if learning_rate is None: raise ValueError("learning_rate is not set.") if rho is None: @@ -1061,6 +1114,9 @@ class FtrlOptimizer(Optimizer): l1 (float): l2 (float): lr_power (float): + regularization: A Regularizer, such as + fluid.regularizer.L2DecayRegularizer. + name: A optional name prefix. Raises: ValueError: If learning_rate, rho, epsilon, momentum are None. @@ -1075,9 +1131,17 @@ class FtrlOptimizer(Optimizer): _squared_acc_str = "squared" _linear_acc_str = "linear" - def __init__(self, learning_rate, l1=0.0, l2=0.0, lr_power=-0.5, **kwargs): + def __init__(self, + learning_rate, + l1=0.0, + l2=0.0, + lr_power=-0.5, + regularization=None, + name=None): super(FtrlOptimizer, self).__init__( - learning_rate=learning_rate, **kwargs) + learning_rate=learning_rate, + regularization=regularization, + name=name) if learning_rate is None: raise ValueError("learning_rate is not set.") @@ -1155,7 +1219,9 @@ class ModelAverage(Optimizer): average_window_rate: The rate of average window. min_average_window: The minimum size of average window. max_average_window: The maximum size of average window. - + regularization: A Regularizer, such as + fluid.regularizer.L2DecayRegularizer. + name: A optional name prefix. Examples: .. code-block:: python @@ -1178,8 +1244,10 @@ class ModelAverage(Optimizer): average_window_rate, min_average_window=10000, max_average_window=10000, - **kwargs): - super(ModelAverage, self).__init__(0.0, **kwargs) + regularization=None, + name=None): + super(ModelAverage, self).__init__( + 0.0, regularization=regularization, name=name) self.average_window = average_window_rate self.min_average_window = min_average_window self.max_average_window = max_average_window diff --git a/python/paddle/fluid/regularizer.py b/python/paddle/fluid/regularizer.py index 8f4678649f2146c84150ad2659e497bbf0365d03..a4336e955f21b0b09bf3dadbd437855c06745860 100644 --- a/python/paddle/fluid/regularizer.py +++ b/python/paddle/fluid/regularizer.py @@ -190,14 +190,11 @@ class L1DecayRegularizer(WeightDecayRegularizer): Examples: .. code-block:: python - program = fluid.framework.Program() - block = program.global_block() - mul_x = block.create_parameter( - dtype="float32", - shape=[5, 10], - lod_level=0, - name="mul.x", - regularizer=fluid.regularizer.L1DecayRegularizer(0.5)) + optimizer = fluid.optimizer.Adagrad( + learning_rate=1e-4, + regularization=fluid.regularizer.L1DecayRegularizer( + regularization_coeff=0.1)) + optimizer.minimize(avg_cost) """ def __init__(self, regularization_coeff=0.0):