Commit c36bf197 authored by: Q qijun

add optimizer doc

Parent 929090ed
.. _api_v2.optimizer:
==========
Optimizer
==========
......
...@@ -47,6 +47,35 @@ class Optimizer(object):
class Momentum(Optimizer):
"""
SGD Optimizer.
SGD is an optimization method, trying to find a neural network that
minimize the "cost/error" of it by iteration. In paddle's implementation
SGD Optimizer is synchronized, which means all gradients will be wait to
calculate and reduced into one gradient, then do optimize operation.
The neural network consider the learning problem of minimizing an objective
function, that has the form of a sum
.. math::
Q(w) = \\sum_{i}^{n} Q_i(w)
The value of function Q sometimes is the cost of neural network (Mean
Square Error between prediction and label for example). The function Q is
parametrised by w, the weight/bias of neural network. And weights is what to
be learned. The i is the i-th observation in (trainning) data.
So, the SGD method will optimize the weight by
.. math::
w = w - \\eta \\nabla Q(w) = w - \\eta \\sum_{i}^{n} \\nabla Q_i(w)
where :math:`\\eta` is learning rate. And :math:`n` is batch size.
"""
    def __init__(self, momentum=None, sparse=False, **kwargs):
        learning_method = v1_optimizers.MomentumOptimizer(
            momentum=momentum, sparse=sparse)
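As a quick orientation for readers of this doc, here is a minimal usage sketch, assuming the usual paddle.v2 workflow; `cost` and `parameters` are placeholders that would come from the user's own model configuration:

    import paddle.v2 as paddle

    paddle.init(use_gpu=False, trainer_count=1)

    # Momentum with momentum=None degenerates to plain SGD.
    optimizer = paddle.optimizer.Momentum(momentum=0.9, learning_rate=1e-3)

    # The optimizer is handed to the trainer as its update equation.
    trainer = paddle.trainer.SGD(cost=cost,
                                 parameters=parameters,
                                 update_equation=optimizer)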
...@@ -55,6 +84,26 @@ class Momentum(Optimizer):
class Adam(Optimizer):
"""
Adam optimizer.
The details of please refer `Adam: A Method for Stochastic Optimization
<https://arxiv.org/abs/1412.6980>`_
.. math::
m(w, t) & = \\beta_1 m(w, t-1) + (1 - \\beta_1) \\nabla Q_i(w) \\\\
v(w, t) & = \\beta_2 v(w, t-1) + (1 - \\beta_2)(\\nabla Q_i(w)) ^2 \\\\
w & = w - \\frac{\\eta}{\\sqrt{v(w,t) + \\epsilon}}
:param beta1: the :math:`\\beta_1` in equation.
:type beta1: float
:param beta2: the :math:`\\beta_2` in equation.
:type beta2: float
:param epsilon: the :math:`\\epsilon` in equation. It is used to prevent
divided by zero.
:type epsilon: float
"""
    def __init__(self, beta1=0.9, beta2=0.999, epsilon=1e-8, **kwargs):
        learning_method = v1_optimizers.AdamOptimizer(
            beta1=beta1, beta2=beta2, epsilon=epsilon)
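To make the three equations above concrete, here is a small NumPy sketch of one Adam step (a hypothetical helper illustrating the math only, not Paddle's implementation; the published algorithm additionally bias-corrects m and v):

    import numpy as np

    def adam_step(w, grad, m, v, eta=1e-3, beta1=0.9, beta2=0.999, eps=1e-8):
        m = beta1 * m + (1 - beta1) * grad       # m(w, t)
        v = beta2 * v + (1 - beta2) * grad ** 2  # v(w, t)
        w = w - eta * m / np.sqrt(v + eps)       # weight update
        return w, m, v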
...@@ -62,6 +111,24 @@ class Adam(Optimizer):
class Adamax(Optimizer):
"""
Adamax optimizer.
The details of please refer this `Adam: A Method for Stochastic Optimization
<https://arxiv.org/abs/1412.6980>`_
.. math::
m_t & = \\beta_1 * m_{t-1} + (1-\\beta_1)* \\nabla Q_i(w) \\\\
u_t & = max(\\beta_2*u_{t-1}, abs(\\nabla Q_i(w))) \\\\
w_t & = w_{t-1} - (\\eta/(1-\\beta_1^t))*m_t/u_t
:param beta1: the :math:`\\beta_1` in the equation.
:type beta1: float
:param beta2: the :math:`\\beta_2` in the equation.
:type beta2: float
"""
    def __init__(self, beta1=0.9, beta2=0.999, **kwargs):
        learning_method = v1_optimizers.AdamaxOptimizer(
            beta1=beta1, beta2=beta2)
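Compared with the Adam sketch above, the only differences are the infinity-norm accumulator u_t and the bias-corrected step size; again a hypothetical sketch of the equations, not Paddle's implementation:

    import numpy as np

    def adamax_step(w, grad, m, u, t, eta=1e-3, beta1=0.9, beta2=0.999):
        m = beta1 * m + (1 - beta1) * grad        # m_t
        u = np.maximum(beta2 * u, np.abs(grad))   # u_t: exponentially weighted infinity norm
        w = w - (eta / (1 - beta1 ** t)) * m / u  # w_t, with t counting steps from 1
        return w, m, u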
...@@ -69,12 +136,40 @@ class Adamax(Optimizer):
class AdaGrad(Optimizer):
"""
Adagrad(for ADAptive GRAdient algorithm) optimizer.
For details please refer this `Adaptive Subgradient Methods for
Online Learning and Stochastic Optimization
<http://www.magicbroom.info/Papers/DuchiHaSi10.pdf>`_.
.. math::
G &= \\sum_{\\tau=1}^{t} g_{\\tau} g_{\\tau}^T \\\\
w & = w - \\eta diag(G)^{-\\frac{1}{2}} \\circ g
"""
    def __init__(self, **kwargs):
        learning_method = v1_optimizers.AdaGradOptimizer()
        super(AdaGrad, self).__init__(learning_method=learning_method, **kwargs)
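In practice only the diagonal of G is accumulated, so the optimizer state has the same shape as the gradient; a hypothetical sketch of that diagonal form (eps is added here for numerical stability and is not part of the equation above):

    import numpy as np

    def adagrad_step(w, grad, g2_sum, eta=0.01, eps=1e-8):
        g2_sum = g2_sum + grad ** 2                   # diagonal of sum_t g_t g_t^T
        w = w - eta * grad / (np.sqrt(g2_sum) + eps)  # elementwise diag(G)^(-1/2) scaling
        return w, g2_sum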
class DecayedAdaGrad(Optimizer):
"""
AdaGrad method with decayed sum gradients. The equations of this method
show as follow.
.. math::
E(g_t^2) &= \\rho * E(g_{t-1}^2) + (1-\\rho) * g^2 \\\\
learning\\_rate &= 1/sqrt( ( E(g_t^2) + \\epsilon )
:param rho: The :math:`\\rho` parameter in that equation
:type rho: float
:param epsilon: The :math:`\\epsilon` parameter in that equation.
:type epsilon: float
"""
    def __init__(self, rho=0.95, epsilon=1e-06, **kwargs):
        learning_method = v1_optimizers.DecayedAdaGradOptimizer(
            rho=rho, epsilon=epsilon)
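One way to see the effect of the decay: for a constant gradient g and E(g_0^2) = 0, the accumulator works out to E(g_t^2) = (1 - ρ^t) g^2, which converges to g^2, whereas plain AdaGrad's sum grows like t · g^2. The decayed variant therefore keeps the effective learning rate from shrinking toward zero over long runs.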
...@@ -83,6 +178,24 @@ class DecayedAdaGrad(Optimizer):
class AdaDelta(Optimizer):
"""
AdaDelta method. The details of adadelta please refer to this
`ADADELTA: AN ADAPTIVE LEARNING RATE METHOD
<http://www.matthewzeiler.com/pubs/googleTR2012/googleTR2012.pdf>`_.
.. math::
E(g_t^2) &= \\rho * E(g_{t-1}^2) + (1-\\rho) * g^2 \\\\
learning\\_rate &= sqrt( ( E(dx_{t-1}^2) + \\epsilon ) / ( \\
E(g_t^2) + \\epsilon ) ) \\\\
E(dx_t^2) &= \\rho * E(dx_{t-1}^2) + (1-\\rho) * (-g*learning\\_rate)^2
:param rho: :math:`\\rho` in equation
:type rho: float
:param epsilon: :math:`\\rho` in equation
:type epsilon: float
"""
    def __init__(self, rho=0.95, epsilon=1e-06, **kwargs):
        learning_method = v1_optimizers.AdaDeltaOptimizer(
            rho=rho, epsilon=epsilon)
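The distinguishing feature relative to the other RMS-style methods is the second running average over the updates themselves, which turns the step size into a ratio of the two accumulators; a hypothetical sketch of one step following the equations above:

    import numpy as np

    def adadelta_step(w, grad, avg_g2, avg_dx2, rho=0.95, eps=1e-6):
        avg_g2 = rho * avg_g2 + (1 - rho) * grad ** 2     # E(g_t^2)
        rate = np.sqrt((avg_dx2 + eps) / (avg_g2 + eps))  # per-element learning rate
        dx = -rate * grad                                 # update applied to w
        avg_dx2 = rho * avg_dx2 + (1 - rho) * dx ** 2     # E(dx_t^2)
        return w + dx, avg_g2, avg_dx2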
...@@ -91,6 +204,24 @@ class AdaDelta(Optimizer):
class RMSProp(Optimizer):
"""
RMSProp(for Root Mean Square Propagation) optimizer. For details please
refer this `slide <http://www.cs.toronto.edu/~tijmen/csc321/slides/
lecture_slides_lec6.pdf>`_.
The equations of this method as follows:
.. math::
v(w, t) & = \\rho v(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2 \\\\
w & = w - \\frac{\\eta} {\\sqrt{v(w,t) + \\epsilon}} \\nabla Q_{i}(w)
:param rho: the :math:`\\rho` in the equation. The forgetting factor.
:type rho: float
:param epsilon: the :math:`\\epsilon` in the equation.
:type epsilon: float
"""
    def __init__(self, rho=0.95, epsilon=1e-6, **kwargs):
        learning_method = v1_optimizers.RMSPropOptimizer(
            rho=rho, epsilon=epsilon)
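RMSProp keeps only the single accumulator v(w, t) and divides a fixed learning rate by its root; a hypothetical sketch of the update in the same style as the earlier ones (illustration of the equations, not Paddle's implementation):

    import numpy as np

    def rmsprop_step(w, grad, v, eta=1e-3, rho=0.95, eps=1e-6):
        v = rho * v + (1 - rho) * grad ** 2    # v(w, t)
        w = w - eta * grad / np.sqrt(v + eps)  # weight update
        return w, v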
......