Unverified commit 3b5fc2ad, authored by sunzhongkai588, committed by GitHub

Change formula error in paddle.optimizer (#34539)

* fix paddle.optimizer test=document_fix

* fix paddle.optimizer test=document_fix
Parent 145cdb5a
@@ -31,11 +31,11 @@ class Adadelta(Optimizer):
.. math::
- E(g_t^2) &= \\rho * E(g_{t-1}^2) + (1-\\rho) * g^2
+ E(g_t^2) &= \rho * E(g_{t-1}^2) + (1-\rho) * g^2
- learning\_rate &= \sqrt{ ( E(dx_{t-1}^2) + \\epsilon ) / ( E(g_t^2) + \\epsilon ) }
+ learning\_rate &= \sqrt{ ( E(dx_{t-1}^2) + \epsilon ) / ( E(g_t^2) + \epsilon ) }
- E(dx_t^2) &= \\rho * E(dx_{t-1}^2) + (1-\\rho) * (-g*learning\_rate)^2
+ E(dx_t^2) &= \rho * E(dx_{t-1}^2) + (1-\rho) * (-g*learning\_rate)^2
Args:
learning_rate (float|Tensor|LearningRateDecay, optional): The learning rate used to update ``Parameter``.
......
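As a quick illustration of the Adadelta rule documented above (an editorial sketch, not part of this commit), the following NumPy snippet applies one update step; rho, epsilon, and the toy gradient are assumed example values:

    import numpy as np

    rho, epsilon = 0.95, 1e-6
    param = np.random.uniform(-0.1, 0.1, size=(10,))
    avg_sq_grad = np.zeros_like(param)    # E(g^2)
    avg_sq_update = np.zeros_like(param)  # E(dx^2)

    grad = 2.0 * param                    # gradient of the toy loss sum(param**2)
    avg_sq_grad = rho * avg_sq_grad + (1 - rho) * grad ** 2
    lr = np.sqrt((avg_sq_update + epsilon) / (avg_sq_grad + epsilon))
    update = -lr * grad
    avg_sq_update = rho * avg_sq_update + (1 - rho) * update ** 2
    param += update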
@@ -32,7 +32,7 @@ class Adagrad(Optimizer):
moment\_out &= moment + grad * grad
- param\_out &= param - \\frac{learning\_rate * grad}{\sqrt{moment\_out} + \epsilon}
+ param\_out &= param - \frac{learning\_rate * grad}{\sqrt{moment\_out} + \epsilon}
The original paper does not have the ``epsilon`` attribute. It is added here
......
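For reference (editorial addition, not in the commit), the Adagrad rule above amounts to the following NumPy sketch; learning_rate and epsilon are assumed example values:

    import numpy as np

    learning_rate, epsilon = 0.01, 1e-6
    param = np.random.uniform(-0.1, 0.1, size=(10,))
    moment = np.zeros_like(param)

    grad = 2.0 * param                          # toy gradient
    moment += grad * grad                       # moment_out = moment + grad * grad
    param -= learning_rate * grad / (np.sqrt(moment) + epsilon)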
@@ -42,14 +42,14 @@ class Adam(Optimizer):
t & = t + 1
- moment\_1\_out & = {\\beta}_1 * moment\_1 + (1 - {\\beta}_1) * grad
+ moment\_1\_out & = {\beta}_1 * moment\_1 + (1 - {\beta}_1) * grad
- moment\_2\_out & = {\\beta}_2 * moment\_2 + (1 - {\\beta}_2) * grad * grad
+ moment\_2\_out & = {\beta}_2 * moment\_2 + (1 - {\beta}_2) * grad * grad
- learning\_rate & = learning\_rate * \\
- \\frac{\sqrt{1 - {\\beta}_2^t}}{1 - {\\beta}_1^t}
+ learning\_rate & = learning\_rate * \
+ \frac{\sqrt{1 - {\beta}_2^t}}{1 - {\beta}_1^t}
- param\_out & = param - learning\_rate * \\frac{moment\_1}{\sqrt{moment\_2} + \epsilon}
+ param\_out & = param - learning\_rate * \frac{moment\_1}{\sqrt{moment\_2} + \epsilon}
Related paper: `Adam: A Method for Stochastic Optimization <https://arxiv.org/abs/1412.6980>`_
......
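The Adam rule documented above can be written out directly; this NumPy sketch is an editorial illustration (not part of the commit), with beta1, beta2, and epsilon set to the usual defaults as an assumption:

    import numpy as np

    lr, beta1, beta2, epsilon = 0.001, 0.9, 0.999, 1e-8   # assumed defaults
    param = np.random.uniform(-0.1, 0.1, size=(10,))
    m = np.zeros_like(param)   # moment_1
    v = np.zeros_like(param)   # moment_2

    for t in range(1, 4):                       # a few toy steps
        grad = 2.0 * param                      # gradient of the toy loss sum(param**2)
        m = beta1 * m + (1 - beta1) * grad
        v = beta2 * v + (1 - beta2) * grad * grad
        lr_t = lr * np.sqrt(1 - beta2 ** t) / (1 - beta1 ** t)
        param -= lr_t * m / (np.sqrt(v) + epsilon)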
@@ -33,13 +33,13 @@ class Adamax(Optimizer):
t & = t + 1
- moment\_out & = {\\beta}_1 * moment + (1 - {\\beta}_1) * grad
+ moment\_out & = {\beta}_1 * moment + (1 - {\beta}_1) * grad
- inf\_norm\_out & = max({\\beta}_2 * inf\_norm + \epsilon, |grad|)
+ inf\_norm\_out & = max({\beta}_2 * inf\_norm + \epsilon, |grad|)
- learning\_rate & = \\frac{learning\_rate}{1 - {\\beta}_1^t}
+ learning\_rate & = \frac{learning\_rate}{1 - {\beta}_1^t}
- param\_out & = param - learning\_rate * \\frac{moment\_out}{inf\_norm\_out}
+ param\_out & = param - learning\_rate * \frac{moment\_out}{inf\_norm\_out}
Related paper: `Adam: A Method for Stochastic Optimization <https://arxiv.org/abs/1412.6980>`_
......
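For comparison, here is an editorial sketch of the Adamax rule above (not from the diff); all hyperparameter values are assumptions:

    import numpy as np

    lr, beta1, beta2, epsilon = 0.002, 0.9, 0.999, 1e-8   # assumed example values
    param = np.random.uniform(-0.1, 0.1, size=(10,))
    moment = np.zeros_like(param)
    inf_norm = np.zeros_like(param)

    for t in range(1, 4):
        grad = 2.0 * param                      # toy gradient
        moment = beta1 * moment + (1 - beta1) * grad
        inf_norm = np.maximum(beta2 * inf_norm + epsilon, np.abs(grad))
        lr_t = lr / (1 - beta1 ** t)            # no second-moment bias correction in Adamax
        param -= lr_t * moment / inf_norm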
@@ -32,14 +32,14 @@ class AdamW(Adam):
t & = t + 1
- moment\_1\_out & = {\\beta}_1 * moment\_1 + (1 - {\\beta}_1) * grad
+ moment\_1\_out & = {\beta}_1 * moment\_1 + (1 - {\beta}_1) * grad
- moemnt\_2\_out & = {\\beta}_2 * moment\_2 + (1 - {\\beta}_2) * grad * grad
+ moemnt\_2\_out & = {\beta}_2 * moment\_2 + (1 - {\beta}_2) * grad * grad
- learning\_rate & = learning\_rate * \\
- \\frac{\sqrt{1 - {\\beta}_2^t}}{1 - {beta}_1^t}
+ learning\_rate & = learning\_rate *
+ \frac{\sqrt{1 - {\beta}_2^t}}{1 - {beta}_1^t}
- param\_out & = param - learning\_rate * (\\frac{moment\_1}{\sqrt{moment\_2} + \epsilon} + \lambda * param)
+ param\_out & = param - learning\_rate * (\frac{moment\_1}{\sqrt{moment\_2} + \epsilon} + \lambda * param)
Args:
......
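The AdamW rule above differs from Adam only in the decoupled weight-decay term inside the parameter update; this editorial sketch (assumed hyperparameters, not part of the commit) highlights that last line:

    import numpy as np

    lr, beta1, beta2, epsilon = 0.001, 0.9, 0.999, 1e-8   # assumed defaults
    weight_decay = 0.01                                   # lambda in the formula, example value
    param = np.random.uniform(-0.1, 0.1, size=(10,))
    m = np.zeros_like(param)
    v = np.zeros_like(param)

    for t in range(1, 4):
        grad = 2.0 * param                      # toy gradient
        m = beta1 * m + (1 - beta1) * grad
        v = beta2 * v + (1 - beta2) * grad * grad
        lr_t = lr * np.sqrt(1 - beta2 ** t) / (1 - beta1 ** t)
        # decoupled weight decay: the lambda * param term sits outside the adaptive ratio
        param -= lr_t * (m / (np.sqrt(v) + epsilon) + weight_decay * param)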
@@ -34,17 +34,17 @@ class Lamb(Optimizer):
.. math::
- m_t &= \\beta_1 m_{t - 1}+ (1 - \\beta_1)g_t
+ m_t &= \beta_1 m_{t - 1}+ (1 - \beta_1)g_t
- v_t &= \\beta_2 v_{t - 1} + (1 - \\beta_2)g_t^2
+ v_t &= \beta_2 v_{t - 1} + (1 - \beta_2)g_t^2
- m_t &= \\frac{m_t}{\\beta_1^t}
+ m_t &= \frac{m_t}{\beta_1^t}
- v_t &= \\frac{v_t}{\\beta_2^t}
+ v_t &= \frac{v_t}{\beta_2^t}
- r_t &= \\frac{m_t}{\\sqrt{v_t}+\\epsilon}
+ r_t &= \frac{m_t}{\sqrt{v_t}+\epsilon}
- w_t &= w_{t-1} -\\eta_t \\frac{\\left \| w_{t-1}\\right \|}{\\left \| r_t + \\lambda w_{t-1}\\right \|} (r_t + \\lambda w_{t-1})
+ w_t &= w_{t-1} -\eta_t \frac{\left \| w_{t-1}\right \|}{\left \| r_t + \lambda w_{t-1}\right \|} (r_t + \lambda w_{t-1})
where :math:`m` is the 1st moment, and :math:`v` the 2nd moment, :math:`\\eta` the
@@ -76,8 +76,8 @@ class Lamb(Optimizer):
.. code-block:: python
import paddle
import numpy as np
- inp = paddle.uniform(min=-0.1, max=0.1, shape=[10, 10], dtype='float32')
+ inp = paddle.uniform(shape=[10, 10], dtype='float32', min=-0.1, max=0.1)
linear = paddle.nn.Linear(10, 10)
out = linear(inp)
loss = paddle.mean(out)
@@ -88,30 +88,6 @@ class Lamb(Optimizer):
lamb.step()
lamb.clear_grad()
- #Note that the learning_rate of linear_2 is 0.01.
- linear_1 = paddle.nn.Linear(10, 10)
- linear_2 = paddle.nn.Linear(10, 10)
- inp = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1)
- out = linear_1(inp)
- out = linear_2(out)
- loss = paddle.mean(out)
- lamb = paddle.optimizer.Lamb(
-     learning_rate=0.1,
-     parameters=[{
-         'params': linear_1.parameters()
-     }, {
-         'params': linear_2.parameters(),
-         'weight_decay': 0.001,
-         'learning_rate': 0.1,
-         'lamb_weight_decay': 0.02
-     }],
-     weight_decay=0.01,
-     lamb_weight_decay=0.01)
- out.backward()
- lamb.step()
- lamb.clear_grad()
"""
_moment1_acc_str = "moment1"
_moment2_acc_str = "moment2"
......
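To make the trust-ratio step above concrete, here is an editorial NumPy sketch for a single flat parameter (not part of the commit). The bias correction uses the common 1 - beta^t form rather than the docstring's beta^t notation, and all hyperparameter values are assumed examples:

    import numpy as np

    eta, beta1, beta2, epsilon, lamb_wd = 0.001, 0.9, 0.999, 1e-6, 0.01
    w = np.random.uniform(-0.1, 0.1, size=(10,))
    m = np.zeros_like(w)
    v = np.zeros_like(w)

    for t in range(1, 4):
        g = 2.0 * w                              # toy gradient
        m = beta1 * m + (1 - beta1) * g
        v = beta2 * v + (1 - beta2) * g * g
        m_hat = m / (1 - beta1 ** t)
        v_hat = v / (1 - beta2 ** t)
        r = m_hat / (np.sqrt(v_hat) + epsilon)
        update = r + lamb_wd * w
        trust_ratio = np.linalg.norm(w) / np.linalg.norm(update)
        w = w - eta * trust_ratio * update       # layer-wise trust ratio scales the step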
@@ -472,7 +472,7 @@ class InverseTimeDecay(LRScheduler):
.. math::
- new\_learning\_rate = \\frac{learning\_rate}{1 + gamma * epoch}
+ new\_learning\_rate = \frac{learning\_rate}{1 + gamma * epoch}
Args:
learning_rate (float): The initial learning rate. It is a python float number.
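As a small editorial illustration of the inverse-time formula above (gamma and the base rate are example values, not from the diff):

    base_lr, gamma = 0.5, 0.5
    for epoch in range(5):
        lr = base_lr / (1 + gamma * epoch)
        print(epoch, lr)    # 0.5, 0.333..., 0.25, 0.2, 0.1666...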
@@ -555,9 +555,9 @@ class PolynomialDecay(LRScheduler):
.. math::
- decay\_steps & = decay\_steps * math.ceil(\\frac{epoch}{decay\_steps})
+ decay\_steps & = decay\_steps * math.ceil(\frac{epoch}{decay\_steps})
- new\_learning\_rate & = (learning\_rate-end\_lr)*(1-\\frac{epoch}{decay\_steps})^{power}+end\_lr
+ new\_learning\_rate & = (learning\_rate-end\_lr)*(1-\frac{epoch}{decay\_steps})^{power}+end\_lr
If cycle is set to False, then:
@@ -565,7 +565,7 @@ class PolynomialDecay(LRScheduler):
epoch & = min(epoch, decay\_steps)
- new\_learning\_rate & = (learning\_rate-end\_lr)*(1-\\frac{epoch}{decay\_steps})^{power}+end\_lr
+ new\_learning\_rate & = (learning\_rate-end\_lr)*(1-\frac{epoch}{decay\_steps})^{power}+end\_lr
Args:
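The two PolynomialDecay branches above (cycle on and off) can be sketched as follows; this is an editorial example with made-up inputs, and the max(1, ...) guard for epoch == 0 is an added assumption not present in the formula:

    import math

    def polynomial_decay(lr, end_lr, decay_steps, power, epoch, cycle):
        if cycle:
            # stretch decay_steps to the next multiple once epoch passes it
            decay_steps = decay_steps * max(1, math.ceil(epoch / decay_steps))
        else:
            epoch = min(epoch, decay_steps)
        return (lr - end_lr) * (1 - epoch / decay_steps) ** power + end_lr

    print(polynomial_decay(0.5, 0.0, 20, 1.0, 30, cycle=True))    # 0.125
    print(polynomial_decay(0.5, 0.0, 20, 1.0, 30, cycle=False))   # 0.0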
@@ -676,7 +676,7 @@ class LinearWarmup(LRScheduler):
.. math::
- lr = start\_lr + (end\_lr - start\_lr) * \\frac{epoch}{warmup\_steps}
+ lr = start\_lr + (end\_lr - start\_lr) * \frac{epoch}{warmup\_steps}
where start_lr is the initial learning rate, and end_lr is the final learning rate;
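A quick editorial sketch of the warmup formula above (start_lr, end_lr, and warmup_steps are example values, not from the diff):

    start_lr, end_lr, warmup_steps = 0.0, 0.1, 5
    for epoch in range(warmup_steps + 1):
        lr = start_lr + (end_lr - start_lr) * epoch / warmup_steps
        print(epoch, lr)    # 0.0, 0.02, 0.04, 0.06, 0.08, 0.1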
@@ -1407,14 +1407,13 @@ class CosineAnnealingDecay(LRScheduler):
.. math::
- \\begin{aligned}
- \eta_t & = \eta_{min} + \\frac{1}{2}(\eta_{max} - \eta_{min})\left(1
- + \cos\left(\\frac{T_{cur}}{T_{max}}\pi\\right)\\right),
- & T_{cur} \\neq (2k+1)T_{max}; \\
- \eta_{t+1} & = \eta_{t} + \\frac{1}{2}(\eta_{max} - \eta_{min})
- \left(1 - \cos\left(\\frac{1}{T_{max}}\pi\\right)\\right),
- & T_{cur} = (2k+1)T_{max}.
- \end{aligned}
+ \eta_t & = \eta_{min} + \frac{1}{2}(\eta_{max} - \eta_{min})\left(1
+ + \cos\left(\frac{T_{cur}}{T_{max}}\pi\right)\right),
+ & T_{cur} \neq (2k+1)T_{max};
+ \eta_{t+1} & = \eta_{t} + \frac{1}{2}(\eta_{max} - \eta_{min})
+ \left(1 - \cos\left(\frac{1}{T_{max}}\pi\right)\right),
+ & T_{cur} = (2k+1)T_{max}.
It has been proposed in `SGDR: Stochastic Gradient Descent with Warm Restarts <https://arxiv.org/abs/1608.03983>`_.
Note that this only implements the cosine annealing part of SGDR, and not the restarts.
......
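The recursive equations above are equivalent to the usual closed-form cosine annealing curve; the following editorial sketch prints a few values with assumed eta_max, eta_min, and T_max:

    import math

    eta_max, eta_min, T_max = 0.1, 0.0, 10   # example values
    for T_cur in range(T_max + 1):
        lr = eta_min + 0.5 * (eta_max - eta_min) * (1 + math.cos(math.pi * T_cur / T_max))
        print(T_cur, round(lr, 4))           # decays from 0.1 at T_cur=0 to 0.0 at T_cur=T_max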
@@ -30,9 +30,9 @@ class RMSProp(Optimizer):
.. math::
- r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2
+ r(w, t) & = \rho r(w, t-1) + (1 - \rho)(\nabla Q_{i}(w))^2
- w & = w - \\frac{\\eta} {\\sqrt{r(w,t) + \\epsilon}} \\nabla Q_{i}(w)
+ w & = w - \frac{\eta} {\sqrt{r(w,t) + \epsilon}} \nabla Q_{i}(w)
The first equation calculates moving average of the squared gradient for
each weight. Then dividing the gradient by :math:`sqrt{v(w,t)}`.
@@ -42,10 +42,10 @@ class RMSProp(Optimizer):
.. math::
- r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2
+ r(w, t) & = \rho r(w, t-1) + (1 - \rho)(\nabla Q_{i}(w))^2
- v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{r(w,t) +
- \\epsilon}} \\nabla Q_{i}(w)
+ v(w, t) & = \beta v(w, t-1) + \frac{\eta} {\sqrt{r(w,t) +
+ \epsilon}} \nabla Q_{i}(w)
w & = w - v(w, t)
@@ -53,12 +53,12 @@ class RMSProp(Optimizer):
.. math::
- r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2
+ r(w, t) & = \rho r(w, t-1) + (1 - \rho)(\nabla Q_{i}(w))^2
- g(w, t) & = \\rho g(w, t-1) + (1 - \\rho)\\nabla Q_{i}(w)
+ g(w, t) & = \rho g(w, t-1) + (1 - \rho)\nabla Q_{i}(w)
- v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{r(w,t) - (g(w, t))^2 +
- \\epsilon}} \\nabla Q_{i}(w)
+ v(w, t) & = \beta v(w, t-1) + \frac{\eta} {\sqrt{r(w,t) - (g(w, t))^2 +
+ \epsilon}} \nabla Q_{i}(w)
w & = w - v(w, t)
......
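Putting the centered variant above into code, as an editorial sketch (rho, beta, eta, and epsilon are assumed example values, not from the diff):

    import numpy as np

    rho, beta, eta, epsilon = 0.95, 0.9, 0.01, 1e-6
    w = np.random.uniform(-0.1, 0.1, size=(10,))
    r = np.zeros_like(w)        # E[g^2]
    g_avg = np.zeros_like(w)    # E[g], used only by the centered variant
    v = np.zeros_like(w)        # momentum buffer

    for _ in range(3):
        g = 2.0 * w                              # toy gradient
        r = rho * r + (1 - rho) * g * g
        g_avg = rho * g_avg + (1 - rho) * g
        v = beta * v + eta / np.sqrt(r - g_avg ** 2 + epsilon) * g
        w = w - v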