diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 46828af1b329c8691c8aa90601c98c9cd19d4875..c8758b2ea98834dadf7132c2a011522ab78305e2 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -449,7 +449,40 @@ class AdagradOptimizer(Optimizer):
 class AdamOptimizer(Optimizer):
-    """Implements the Adam Optimizer
+    """
+    This implements the Adam optimizer from Section 2 of the Adam
+    paper: https://arxiv.org/abs/1412.6980.
+    Adam is a first-order gradient-based optimization method based on
+    adaptive estimates of lower-order moments.
+
+    Adam updates the parameters as follows:
+
+    .. math::
+
+        t & = t + 1
+
+        moment\_1\_out & = {\\beta}_1 * moment\_1 + (1 - {\\beta}_1) * grad
+
+        moment\_2\_out & = {\\beta}_2 * moment\_2 + (1 - {\\beta}_2) * grad * grad
+
+        learning\_rate & = learning\_rate * \\
+                          \\frac{\sqrt{1 - {\\beta}_2^t}}{1 - {\\beta}_1^t}
+
+        param\_out & = param - learning\_rate * \\frac{moment\_1\_out}{\sqrt{moment\_2\_out} + \epsilon}
+
+    Args:
+        learning_rate (float|Variable): The learning rate used to update parameters. \
+            Can be a float value or a Variable with one float value as a data element.
+        beta1 (float): The exponential decay rate for the 1st moment estimates.
+        beta2 (float): The exponential decay rate for the 2nd moment estimates.
+        epsilon (float): A small float value for numerical stability.
+
+    Examples:
+        .. code-block:: python
+
+            optimizer = fluid.optimizer.Adam(learning_rate=0.2)
+            optimizer.minimize(cost)
+    """
     _moment1_acc_str = "moment1"
     _moment2_acc_str = "moment2"
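For cross-checking the equations in the new docstring, here is a minimal NumPy sketch of a single Adam step as described there. The adam_step name and the NumPy dependency are illustrative assumptions, not part of the Paddle API.

    import numpy as np

    def adam_step(param, grad, moment_1, moment_2, t,
                  learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-8):
        # One update following the docstring's equations: advance the timestep,
        # update the biased moment estimates, then apply the bias-corrected step.
        t = t + 1
        moment_1_out = beta1 * moment_1 + (1 - beta1) * grad
        moment_2_out = beta2 * moment_2 + (1 - beta2) * grad * grad
        # Bias-corrected step size, matching the learning_rate update above.
        lr_t = learning_rate * np.sqrt(1 - beta2 ** t) / (1 - beta1 ** t)
        param_out = param - lr_t * moment_1_out / (np.sqrt(moment_2_out) + epsilon)
        return param_out, moment_1_out, moment_2_out, t

This mirrors the formulas only; the actual optimizer keeps the moments as per-parameter accumulators inside the op (see the "moment1" and "moment2" accumulator names in the class body).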