diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 214e0a7645dcc914d6acc111907f06cc054d4b62..f40c4cb9278e6e387fd4c2bde9d7ac15e2574649 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -282,7 +282,7 @@ class SGDOptimizer(Optimizer):
     Examples:
         .. code-block:: python
 
-            sgd_optimizer = SGDOptimizer(learning_rate=0.2)
+            sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.2)
             sgd_optimizer.minimize(cost)
     """
 
@@ -338,7 +338,7 @@ class MomentumOptimizer(Optimizer):
     Examples:
         .. code-block:: python
 
-            optimizer = MomentumOptimizer(learning_rate=0.2, momentum=0.1)
+            optimizer = fluid.optimizer.Momentum(learning_rate=0.2, momentum=0.1)
             optimizer.minimize(cost)
     """
     _velocity_acc_str = "velocity"
@@ -383,7 +383,32 @@ class MomentumOptimizer(Optimizer):
 
 
 class AdagradOptimizer(Optimizer):
-    """Simple Adagrad optimizer with moment state
+    """
+    **Adaptive Gradient Algorithm (Adagrad)**
+
+    The update is done as follows:
+
+    .. math::
+
+        moment\_out &= moment + grad * grad
+
+        param\_out &= param - \\frac{learning\_rate * grad}{\sqrt{moment\_out} + \epsilon}
+
+    The original paper (http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
+    does not have the epsilon attribute. It is added here in our implementation,
+    as also proposed in http://cs231n.github.io/neural-networks-3/#ada,
+    for numerical stability to avoid the division-by-zero error.
+
+    Args:
+        learning_rate (float|Variable): the learning rate used to update parameters. \
+        Can be a float value or a Variable with one float value as data element.
+        epsilon (float): a small float value for numerical stability.
+
+    Examples:
+        .. code-block:: python
+
+            optimizer = fluid.optimizer.Adagrad(learning_rate=0.2)
+            optimizer.minimize(cost)
     """
     _moment_acc_str = "moment"
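
For reference, the update rule documented in the new AdagradOptimizer docstring can be sketched in plain NumPy as follows. This is a minimal illustrative sketch, not part of the diff and not Paddle's operator implementation; the function and variable names (adagrad_step, param, grad, moment, lr, epsilon) are assumptions chosen for the example:

    import numpy as np

    def adagrad_step(param, grad, moment, lr=0.2, epsilon=1e-06):
        # Accumulate the squared gradient: moment_out = moment + grad * grad.
        moment_out = moment + grad * grad
        # Divide the scaled gradient by the root of the accumulated squared
        # gradient; epsilon guards against division by zero.
        param_out = param - lr * grad / (np.sqrt(moment_out) + epsilon)
        return param_out, moment_out

    # One illustrative step on toy values.
    param = np.array([1.0, -2.0])
    grad = np.array([0.5, 0.3])
    moment = np.zeros_like(param)
    param, moment = adagrad_step(param, grad, moment)

Parameters with a history of large gradients accumulate a large moment and therefore take smaller steps, which is the adaptive behavior the docstring's math describes.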