diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 46828af1b329c8691c8aa90601c98c9cd19d4875..c8758b2ea98834dadf7132c2a011522ab78305e2 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -449,7 +449,40 @@ class AdagradOptimizer(Optimizer):
 class AdamOptimizer(Optimizer):
-    """Implements the Adam Optimizer
+    """
+    This implements the Adam optimizer from Section 2 of the Adam
+    paper: https://arxiv.org/abs/1412.6980.
+    Adam is a first-order gradient-based optimization method based on
+    adaptive estimates of lower-order moments.
+
+    Adam updates the parameters as follows:
+
+    .. math::
+
+        t & = t + 1
+
+        moment\_1\_out & = {\\beta}_1 * moment\_1 + (1 - {\\beta}_1) * grad
+
+        moment\_2\_out & = {\\beta}_2 * moment\_2 + (1 - {\\beta}_2) * grad * grad
+
+        learning\_rate & = learning\_rate * \\
+                          \\frac{\sqrt{1 - {\\beta}_2^t}}{1 - {\\beta}_1^t}
+
+        param\_out & = param - learning\_rate * \\frac{moment\_1\_out}{\sqrt{moment\_2\_out} + \epsilon}
+
+    Args:
+        learning_rate (float|Variable): The learning rate used to update parameters. \
+            Can be a float value or a Variable with one float value as a data element.
+        beta1 (float): The exponential decay rate for the 1st moment estimates.
+        beta2 (float): The exponential decay rate for the 2nd moment estimates.
+        epsilon (float): A small float value for numerical stability.
+
+    Examples:
+        .. code-block:: python
+
+            optimizer = fluid.optimizer.Adam(learning_rate=0.2)
+            optimizer.minimize(cost)
+    """
     _moment1_acc_str = "moment1"
     _moment2_acc_str = "moment2"
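For cross-checking the equations in the new docstring, here is a minimal NumPy sketch of a single Adam step as described there. The adam_step name and the NumPy dependency are illustrative assumptions, not part of the Paddle API.

    import numpy as np

    def adam_step(param, grad, moment_1, moment_2, t,
                  learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-8):
        # One update following the docstring's equations: advance the timestep,
        # update the biased moment estimates, then apply the bias-corrected step.
        t = t + 1
        moment_1_out = beta1 * moment_1 + (1 - beta1) * grad
        moment_2_out = beta2 * moment_2 + (1 - beta2) * grad * grad
        # Bias-corrected step size, matching the learning_rate update above.
        lr_t = learning_rate * np.sqrt(1 - beta2 ** t) / (1 - beta1 ** t)
        param_out = param - lr_t * moment_1_out / (np.sqrt(moment_2_out) + epsilon)
        return param_out, moment_1_out, moment_2_out, t

This mirrors the formulas only; the actual optimizer keeps the moments as per-parameter accumulators inside the op (see the "moment1" and "moment2" accumulator names in the class body).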