diff --git a/doc/api/v2/config/optimizer.rst b/doc/api/v2/config/optimizer.rst
index ec6ba0aa46239f3806ca950e8863b953d0c4150b..b32373fdef52a7aa9d64b12cda3f76cb2abf351b 100644
--- a/doc/api/v2/config/optimizer.rst
+++ b/doc/api/v2/config/optimizer.rst
@@ -1,5 +1,3 @@
-.. _api_v2.optimizer:
-
 ==========
 Optimizer
 ==========
diff --git a/python/paddle/v2/optimizer.py b/python/paddle/v2/optimizer.py
index 1a01d95c205c0626374e1814a170ce2d58f23a60..d153d30ea205e572b5bb55b91edfb808f0de1a57 100644
--- a/python/paddle/v2/optimizer.py
+++ b/python/paddle/v2/optimizer.py
@@ -47,6 +47,35 @@ class Optimizer(object):
 
 
 class Momentum(Optimizer):
+    """
+    SGD Optimizer.
+
+    SGD is an optimization method that iteratively updates a neural network's
+    weights to minimize its cost/error. In Paddle's implementation the SGD
+    optimizer is synchronized: all gradients are computed and reduced into one
+    gradient before the update step is applied.
+
+    The neural network considers the learning problem of minimizing an
+    objective function that has the form of a sum
+
+    .. math::
+
+        Q(w) = \\sum_{i}^{n} Q_i(w)
+
+    The value of the function Q is typically the cost of the neural network
+    (for example, the mean square error between prediction and label). Q is
+    parametrized by w, the weights/biases of the network, which are what is
+    learned; i denotes the i-th observation in the (training) data.
+
+    So the SGD method optimizes the weights by
+
+    .. math::
+
+        w = w - \\eta \\nabla Q(w) = w - \\eta \\sum_{i}^{n} \\nabla Q_i(w)
+
+    where :math:`\\eta` is the learning rate and :math:`n` is the batch size.
+    """
+
     def __init__(self, momentum=None, sparse=False, **kwargs):
         learning_method = v1_optimizers.MomentumOptimizer(
             momentum=momentum, sparse=sparse)
@@ -55,6 +84,26 @@ class Momentum(Optimizer):
 
 
 class Adam(Optimizer):
+    """
+    Adam optimizer.
+    For details, please refer to `Adam: A Method for Stochastic Optimization
+    <https://arxiv.org/abs/1412.6980>`_.
+
+    .. math::
+
+        m(w, t) & = \\beta_1 m(w, t-1) + (1 - \\beta_1) \\nabla Q_i(w) \\\\
+        v(w, t) & = \\beta_2 v(w, t-1) + (1 - \\beta_2)(\\nabla Q_i(w))^2 \\\\
+        w & = w - \\frac{\\eta m(w, t)}{\\sqrt{v(w, t) + \\epsilon}}
+
+    :param beta1: the :math:`\\beta_1` in the equation.
+    :type beta1: float
+    :param beta2: the :math:`\\beta_2` in the equation.
+    :type beta2: float
+    :param epsilon: the :math:`\\epsilon` in the equation. It is used to
+                    prevent division by zero.
+    :type epsilon: float
+    """
+
     def __init__(self, beta1=0.9, beta2=0.999, epsilon=1e-8, **kwargs):
         learning_method = v1_optimizers.AdamOptimizer(
             beta1=beta1, beta2=beta2, epsilon=epsilon)
@@ -62,6 +111,24 @@ class Adam(Optimizer):
 
 
 class Adamax(Optimizer):
+    """
+    Adamax optimizer.
+
+    For details, please refer to `Adam: A Method for Stochastic Optimization
+    <https://arxiv.org/abs/1412.6980>`_, which also describes Adamax.
+
+    .. math::
+
+        m_t & = \\beta_1 * m_{t-1} + (1-\\beta_1)* \\nabla Q_i(w) \\\\
+        u_t & = max(\\beta_2*u_{t-1}, abs(\\nabla Q_i(w))) \\\\
+        w_t & = w_{t-1} - (\\eta/(1-\\beta_1^t))*m_t/u_t
+
+    :param beta1: the :math:`\\beta_1` in the equation.
+    :type beta1: float
+    :param beta2: the :math:`\\beta_2` in the equation.
+    :type beta2: float
+    """
+
     def __init__(self, beta1=0.9, beta2=0.999, **kwargs):
         learning_method = v1_optimizers.AdamaxOptimizer(
             beta1=beta1, beta2=beta2)
@@ -69,12 +136,40 @@ class Adamax(Optimizer):
 
 
 class AdaGrad(Optimizer):
+    """
+    AdaGrad (ADAptive GRAdient algorithm) optimizer.
+
+    For details, please refer to `Adaptive Subgradient Methods for
+    Online Learning and Stochastic Optimization
+    <http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf>`_.
+
+    .. math::
+
+        G &= \\sum_{\\tau=1}^{t} g_{\\tau} g_{\\tau}^T \\\\
+        w & = w - \\eta diag(G)^{-\\frac{1}{2}} \\circ g
+    """
+
     def __init__(self, **kwargs):
         learning_method = v1_optimizers.AdaGradOptimizer()
         super(AdaGrad, self).__init__(learning_method=learning_method,
                                       **kwargs)
 
 
 class DecayedAdaGrad(Optimizer):
+    """
+    AdaGrad method with a decayed sum of squared gradients. The equations of
+    this method are as follows.
+
+    .. math::
+
+        E(g_t^2) &= \\rho * E(g_{t-1}^2) + (1-\\rho) * g^2 \\\\
+        learning\\_rate &= 1/sqrt( E(g_t^2) + \\epsilon )
+
+    :param rho: the :math:`\\rho` parameter in the equation.
+    :type rho: float
+    :param epsilon: the :math:`\\epsilon` parameter in the equation.
+    :type epsilon: float
+    """
+
     def __init__(self, rho=0.95, epsilon=1e-06, **kwargs):
         learning_method = v1_optimizers.DecayedAdaGradOptimizer(
             rho=rho, epsilon=epsilon)
@@ -83,6 +178,24 @@ class DecayedAdaGrad(Optimizer):
 
 
 class AdaDelta(Optimizer):
+    """
+    AdaDelta method. For details, please refer to
+    `ADADELTA: An Adaptive Learning Rate Method
+    <https://arxiv.org/abs/1212.5701>`_.
+
+    .. math::
+
+        E(g_t^2) &= \\rho * E(g_{t-1}^2) + (1-\\rho) * g^2 \\\\
+        learning\\_rate &= sqrt( ( E(dx_{t-1}^2) + \\epsilon ) / ( \\
+                          E(g_t^2) + \\epsilon ) ) \\\\
+        E(dx_t^2) &= \\rho * E(dx_{t-1}^2) + (1-\\rho) * (-g*learning\\_rate)^2
+
+    :param rho: the :math:`\\rho` in the equation.
+    :type rho: float
+    :param epsilon: the :math:`\\epsilon` in the equation.
+    :type epsilon: float
+    """
+
     def __init__(self, rho=0.95, epsilon=1e-06, **kwargs):
         learning_method = v1_optimizers.AdaDeltaOptimizer(
             rho=rho, epsilon=epsilon)
@@ -91,6 +204,24 @@ class AdaDelta(Optimizer):
 
 
 class RMSProp(Optimizer):
+    """
+    RMSProp (Root Mean Square Propagation) optimizer. For details, please
+    refer to this `slide
+    <http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf>`_.
+
+    The equations of this method are as follows:
+
+    .. math::
+
+        v(w, t) & = \\rho v(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2 \\\\
+        w & = w - \\frac{\\eta}{\\sqrt{v(w,t) + \\epsilon}} \\nabla Q_{i}(w)
+
+    :param rho: the :math:`\\rho` in the equation; the forgetting factor.
+    :type rho: float
+    :param epsilon: the :math:`\\epsilon` in the equation.
+    :type epsilon: float
+    """
+
     def __init__(self, rho=0.95, epsilon=1e-6, **kwargs):
         learning_method = v1_optimizers.RMSPropOptimizer(
             rho=rho, epsilon=epsilon)
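
The diff above documents each optimizer's update rule but not how the classes are used. As a rough usage sketch (not part of this change), assuming the standard paddle.v2 API (paddle.init, the paddle.layer / paddle.data_type / paddle.parameters model-definition helpers, and paddle.trainer.SGD with its update_equation argument), an optimizer documented here is constructed with the hyper-parameters from its docstring and handed to the trainer::

    import paddle.v2 as paddle

    paddle.init(use_gpu=False, trainer_count=1)

    # A toy classifier, only so the optimizer has parameters to update.
    image = paddle.layer.data(
        name='image', type=paddle.data_type.dense_vector(784))
    label = paddle.layer.data(
        name='label', type=paddle.data_type.integer_value(10))
    predict = paddle.layer.fc(
        input=image, size=10, act=paddle.activation.Softmax())
    cost = paddle.layer.classification_cost(input=predict, label=label)
    parameters = paddle.parameters.create(cost)

    # Any optimizer from this diff works here; extra keyword arguments such
    # as learning_rate are assumed to be forwarded to the v1 settings.
    optimizer = paddle.optimizer.Adam(
        beta1=0.9, beta2=0.999, epsilon=1e-8, learning_rate=1e-3)

    trainer = paddle.trainer.SGD(
        cost=cost, parameters=parameters, update_equation=optimizer)

Swapping Adam for Momentum, RMSProp, or any other class in this diff only changes the optimizer line; the surrounding training setup stays the same.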