diff --git a/doc/api/v2/config/optimizer.rst b/doc/api/v2/config/optimizer.rst
index ec6ba0aa46239f3806ca950e8863b953d0c4150b..b32373fdef52a7aa9d64b12cda3f76cb2abf351b 100644
--- a/doc/api/v2/config/optimizer.rst
+++ b/doc/api/v2/config/optimizer.rst
@@ -1,5 +1,3 @@
-.. _api_v2.optimizer:
-
 ==========
 Optimizer
 ==========
diff --git a/python/paddle/v2/optimizer.py b/python/paddle/v2/optimizer.py
index 1a01d95c205c0626374e1814a170ce2d58f23a60..d153d30ea205e572b5bb55b91edfb808f0de1a57 100644
--- a/python/paddle/v2/optimizer.py
+++ b/python/paddle/v2/optimizer.py
@@ -47,6 +47,35 @@ class Optimizer(object):
 
 
 class Momentum(Optimizer):
+    """
+    SGD Optimizer.
+
+    SGD is an optimization method that iteratively updates a neural network's
+    weights to minimize its cost/error. In Paddle's implementation the SGD
+    optimizer is synchronized: all gradients are computed and reduced into one
+    gradient before the update step is applied.
+
+    The neural network considers the learning problem of minimizing an
+    objective function that has the form of a sum
+
+    .. math::
+
+        Q(w) = \\sum_{i}^{n} Q_i(w)
+
+    The value of the function Q is typically the cost of the neural network
+    (for example, the mean square error between prediction and label). Q is
+    parametrized by w, the weights/biases of the network, which are what is
+    learned; i denotes the i-th observation in the (training) data.
+
+    So the SGD method optimizes the weights by
+
+    .. math::
+
+        w = w - \\eta \\nabla Q(w) = w - \\eta \\sum_{i}^{n} \\nabla Q_i(w)
+
+    where :math:`\\eta` is the learning rate and :math:`n` is the batch size.
+    """
+
     def __init__(self, momentum=None, sparse=False, **kwargs):
         learning_method = v1_optimizers.MomentumOptimizer(
             momentum=momentum, sparse=sparse)
@@ -55,6 +84,26 @@ class Momentum(Optimizer):
 
 
 class Adam(Optimizer):
+    """
+    Adam optimizer.
+    For details, please refer to `Adam: A Method for Stochastic Optimization
+    <https://arxiv.org/abs/1412.6980>`_.
+
+    .. math::
+
+        m(w, t) & = \\beta_1 m(w, t-1) + (1 - \\beta_1) \\nabla Q_i(w) \\\\
+        v(w, t) & = \\beta_2 v(w, t-1) + (1 - \\beta_2)(\\nabla Q_i(w))^2 \\\\
+        w & = w - \\frac{\\eta m(w, t)}{\\sqrt{v(w, t) + \\epsilon}}
+
+    :param beta1: the :math:`\\beta_1` in the equation.
+    :type beta1: float
+    :param beta2: the :math:`\\beta_2` in the equation.
+    :type beta2: float
+    :param epsilon: the :math:`\\epsilon` in the equation. It is used to
+                    prevent division by zero.
+    :type epsilon: float
+    """
+
     def __init__(self, beta1=0.9, beta2=0.999, epsilon=1e-8, **kwargs):
         learning_method = v1_optimizers.AdamOptimizer(
             beta1=beta1, beta2=beta2, epsilon=epsilon)
@@ -62,6 +111,24 @@ class Adam(Optimizer):
 
 
 class Adamax(Optimizer):
+    """
+    Adamax optimizer.
+
+    For details, please refer to `Adam: A Method for Stochastic Optimization
+    <https://arxiv.org/abs/1412.6980>`_, which also describes Adamax.
+
+    .. math::
+
+        m_t & = \\beta_1 * m_{t-1} + (1-\\beta_1)* \\nabla Q_i(w) \\\\
+        u_t & = max(\\beta_2*u_{t-1}, abs(\\nabla Q_i(w))) \\\\
+        w_t & = w_{t-1} - (\\eta/(1-\\beta_1^t))*m_t/u_t
+
+    :param beta1: the :math:`\\beta_1` in the equation.
+    :type beta1: float
+    :param beta2: the :math:`\\beta_2` in the equation.
+    :type beta2: float
+    """
+
     def __init__(self, beta1=0.9, beta2=0.999, **kwargs):
         learning_method = v1_optimizers.AdamaxOptimizer(
             beta1=beta1, beta2=beta2)
@@ -69,12 +136,40 @@ class Adamax(Optimizer):
 
 
 class AdaGrad(Optimizer):
+    """
+    AdaGrad (ADAptive GRAdient algorithm) optimizer.
+
+    For details, please refer to `Adaptive Subgradient Methods for
+    Online Learning and Stochastic Optimization
+    <http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf>`_.
+
+    .. math::
+
+        G &= \\sum_{\\tau=1}^{t} g_{\\tau} g_{\\tau}^T \\\\
+        w & = w - \\eta diag(G)^{-\\frac{1}{2}} \\circ g
+    """
+
     def __init__(self, **kwargs):
         learning_method = v1_optimizers.AdaGradOptimizer()
         super(AdaGrad, self).__init__(learning_method=learning_method,
                                       **kwargs)
 
 
 class DecayedAdaGrad(Optimizer):
+    """
+    AdaGrad method with a decayed sum of squared gradients. The equations of
+    this method are as follows.
+
+    .. math::
+
+        E(g_t^2) &= \\rho * E(g_{t-1}^2) + (1-\\rho) * g^2 \\\\
+        learning\\_rate &= 1/sqrt( E(g_t^2) + \\epsilon )
+
+    :param rho: the :math:`\\rho` parameter in the equation.
+    :type rho: float
+    :param epsilon: the :math:`\\epsilon` parameter in the equation.
+    :type epsilon: float
+    """
+
     def __init__(self, rho=0.95, epsilon=1e-06, **kwargs):
         learning_method = v1_optimizers.DecayedAdaGradOptimizer(
             rho=rho, epsilon=epsilon)
@@ -83,6 +178,24 @@ class DecayedAdaGrad(Optimizer):
 
 
 class AdaDelta(Optimizer):
+    """
+    AdaDelta method. For details, please refer to
+    `ADADELTA: An Adaptive Learning Rate Method
+    <https://arxiv.org/abs/1212.5701>`_.
+
+    .. math::
+
+        E(g_t^2) &= \\rho * E(g_{t-1}^2) + (1-\\rho) * g^2 \\\\
+        learning\\_rate &= sqrt( ( E(dx_{t-1}^2) + \\epsilon ) / ( \\
+                          E(g_t^2) + \\epsilon ) ) \\\\
+        E(dx_t^2) &= \\rho * E(dx_{t-1}^2) + (1-\\rho) * (-g*learning\\_rate)^2
+
+    :param rho: the :math:`\\rho` in the equation.
+    :type rho: float
+    :param epsilon: the :math:`\\epsilon` in the equation.
+    :type epsilon: float
+    """
+
     def __init__(self, rho=0.95, epsilon=1e-06, **kwargs):
         learning_method = v1_optimizers.AdaDeltaOptimizer(
             rho=rho, epsilon=epsilon)
@@ -91,6 +204,24 @@ class AdaDelta(Optimizer):
 
 
 class RMSProp(Optimizer):
+    """
+    RMSProp (Root Mean Square Propagation) optimizer. For details, please
+    refer to this `slide
+    <http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf>`_.
+
+    The equations of this method are as follows:
+
+    .. math::
+
+        v(w, t) & = \\rho v(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2 \\\\
+        w & = w - \\frac{\\eta}{\\sqrt{v(w,t) + \\epsilon}} \\nabla Q_{i}(w)
+
+    :param rho: the :math:`\\rho` in the equation; the forgetting factor.
+    :type rho: float
+    :param epsilon: the :math:`\\epsilon` in the equation.
+    :type epsilon: float
+    """
+
     def __init__(self, rho=0.95, epsilon=1e-6, **kwargs):
         learning_method = v1_optimizers.RMSPropOptimizer(
             rho=rho, epsilon=epsilon)
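
The diff above documents each optimizer's update rule but not how the classes are used. As a rough usage sketch (not part of this change), assuming the standard paddle.v2 API (paddle.init, the paddle.layer / paddle.data_type / paddle.parameters model-definition helpers, and paddle.trainer.SGD with its update_equation argument), an optimizer documented here is constructed with the hyper-parameters from its docstring and handed to the trainer::

    import paddle.v2 as paddle

    paddle.init(use_gpu=False, trainer_count=1)

    # A toy classifier, only so the optimizer has parameters to update.
    image = paddle.layer.data(
        name='image', type=paddle.data_type.dense_vector(784))
    label = paddle.layer.data(
        name='label', type=paddle.data_type.integer_value(10))
    predict = paddle.layer.fc(
        input=image, size=10, act=paddle.activation.Softmax())
    cost = paddle.layer.classification_cost(input=predict, label=label)
    parameters = paddle.parameters.create(cost)

    # Any optimizer from this diff works here; extra keyword arguments such
    # as learning_rate are assumed to be forwarded to the v1 settings.
    optimizer = paddle.optimizer.Adam(
        beta1=0.9, beta2=0.999, epsilon=1e-8, learning_rate=1e-3)

    trainer = paddle.trainer.SGD(
        cost=cost, parameters=parameters, update_equation=optimizer)

Swapping Adam for Momentum, RMSProp, or any other class in this diff only changes the optimizer line; the surrounding training setup stays the same.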