diff --git a/doc/v2/api/fluid/optimizer.rst b/doc/v2/api/fluid/optimizer.rst
index 9b165f870459b4f9ef2efe24f5604a3fcb96f7f3..2f820595c35c2bccd6a5c8a20c60d796c04c8e97 100644
--- a/doc/v2/api/fluid/optimizer.rst
+++ b/doc/v2/api/fluid/optimizer.rst
@@ -47,3 +47,10 @@ DecayedAdagrad
     :members:
     :noindex:
 
+Adadelta
+--------------
+
+.. autoclass:: paddle.fluid.optimizer.AdadeltaOptimizer
+    :members:
+    :noindex:
+
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 8b8621469d856e63dfd9685d7cd3d4c1d2ada1ce..e8623ee0dae820ece1874ed58ecf962b5f296d94 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -24,7 +24,9 @@ from layer_helper import LayerHelper
 from regularizer import append_regularization_ops
 from clip import append_gradient_clip_ops, error_clip_callback
 
-__all__ = ['SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad']
+__all__ = [
+    'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad', 'Adadelta'
+]
 
 
 class Optimizer(object):
@@ -580,6 +582,88 @@ class DecayedAdagradOptimizer(Optimizer):
         return decayed_adagrad_op
 
 
+class AdadeltaOptimizer(Optimizer):
+    """
+    **Adadelta Optimizer**
+    Simple Adadelta optimizer with average squared grad state and
+    average squared update state.
+    For details of Adadelta, please refer to
+    `ADADELTA: AN ADAPTIVE LEARNING RATE METHOD
+    <https://arxiv.org/abs/1212.5701>`_.
+
+    .. math::
+
+        E(g_t^2) &= \\rho * E(g_{t-1}^2) + (1-\\rho) * g^2 \\\\
+        learning\\_rate &= sqrt( ( E(dx_{t-1}^2) + \\epsilon ) / ( \\
+                          E(g_t^2) + \\epsilon ) ) \\\\
+        E(dx_t^2) &= \\rho * E(dx_{t-1}^2) + (1-\\rho) * (-g*learning\\_rate)^2
+
+    Args:
+        learning_rate(float): global learning rate
+        rho(float): decay rate rho in the equations above
+        epsilon(float): small constant epsilon in the equations above
+
+    Examples:
+        .. code-block:: python
+
+            optimizer = fluid.optimizer.Adadelta(
+                learning_rate=0.0003, epsilon=1.0e-6, rho=0.95)
+            _, params_grads = optimizer.minimize(cost)
+    """
+
+    _avg_squared_grad_acc_str = "_avg_squared_grad"
+    _avg_squared_update_acc_str = "_avg_squared_update"
+
+    def __init__(self, learning_rate, epsilon=1.0e-6, rho=0.95, **kwargs):
+        if learning_rate is None:
+            raise ValueError("learning_rate is not set.")
+        if epsilon is None:
+            raise ValueError("epsilon is not set.")
+        if rho is None:
+            raise ValueError("rho is not set.")
+        super(AdadeltaOptimizer, self).__init__(
+            learning_rate=learning_rate, **kwargs)
+        self.type = "adadelta"
+        self._epsilon = epsilon
+        self._rho = rho
+
+    def _create_accumulators(self, block, parameters):
+        if not isinstance(block, framework.Block):
+            raise TypeError("block is not instance of framework.Block.")
+
+        for p in parameters:
+            self._add_accumulator(self._avg_squared_grad_acc_str, p)
+            self._add_accumulator(self._avg_squared_update_acc_str, p)
+
+    def _append_optimize_op(self, block, param_and_grad):
+        if not isinstance(block, framework.Block):
+            raise TypeError("block is not instance of framework.Block.")
+
+        avg_squared_grad_acc = self._get_accumulator(
+            self._avg_squared_grad_acc_str, param_and_grad[0])
+        avg_squared_update_acc = self._get_accumulator(
+            self._avg_squared_update_acc_str, param_and_grad[0])
+
+        # Create the adadelta optimizer op
+        adadelta_op = block.append_op(
+            type=self.type,
+            inputs={
+                "Param": param_and_grad[0],
+                "Grad": param_and_grad[1],
+                "AvgSquaredGrad": avg_squared_grad_acc,
+                "AvgSquaredUpdate": avg_squared_update_acc
+            },
+            outputs={
+                "ParamOut": param_and_grad[0],
+                "AvgSquaredGradOut": avg_squared_grad_acc,
+                "AvgSquaredUpdateOut": avg_squared_update_acc
+            },
attrs={"epsilon": self._epsilon, + "rho": self._rho}) + + return adadelta_op + + # We short the class name, since users will use the optimizer with the package # name. The sample code: # @@ -594,3 +678,4 @@ Adagrad = AdagradOptimizer Adam = AdamOptimizer Adamax = AdamaxOptimizer DecayedAdagrad = DecayedAdagradOptimizer +Adadelta = AdadeltaOptimizer