From 3b5fc2adb7acb7b7db92fa065566643897935f27 Mon Sep 17 00:00:00 2001
From: sunzhongkai588 <70642955+sunzhongkai588@users.noreply.github.com>
Date: Mon, 2 Aug 2021 21:11:50 +0800
Subject: [PATCH] Change formula error in paddle.optimizer (#34539)

* fix paddle.optimizer test=document_fix

* fix paddle.optimizer test=document_fix
---
 python/paddle/optimizer/adadelta.py |  6 ++---
 python/paddle/optimizer/adagrad.py  |  2 +-
 python/paddle/optimizer/adam.py     | 10 ++++----
 python/paddle/optimizer/adamax.py   |  8 +++---
 python/paddle/optimizer/adamw.py    | 10 ++++----
 python/paddle/optimizer/lamb.py     | 40 ++++++----------------
 python/paddle/optimizer/lr.py       | 25 +++++++++---------
 python/paddle/optimizer/rmsprop.py  | 18 ++++++-------
 8 files changed, 47 insertions(+), 72 deletions(-)

diff --git a/python/paddle/optimizer/adadelta.py b/python/paddle/optimizer/adadelta.py
index dd088b18ca..32050c12ec 100644
--- a/python/paddle/optimizer/adadelta.py
+++ b/python/paddle/optimizer/adadelta.py
@@ -31,11 +31,11 @@ class Adadelta(Optimizer):
 
     .. math::
 
-        E(g_t^2) &= \\rho * E(g_{t-1}^2) + (1-\\rho) * g^2
+        E(g_t^2) &= \rho * E(g_{t-1}^2) + (1-\rho) * g^2
 
-        learning\_rate &= \sqrt{ ( E(dx_{t-1}^2) + \\epsilon ) / ( E(g_t^2) + \\epsilon ) }
+        learning\_rate &= \sqrt{ ( E(dx_{t-1}^2) + \epsilon ) / ( E(g_t^2) + \epsilon ) }
 
-        E(dx_t^2) &= \\rho * E(dx_{t-1}^2) + (1-\\rho) * (-g*learning\_rate)^2
+        E(dx_t^2) &= \rho * E(dx_{t-1}^2) + (1-\rho) * (-g*learning\_rate)^2
 
     Args:
         learning_rate (float|Tensor|LearningRateDecay, optional): The learning rate used to update ``Parameter``.
diff --git a/python/paddle/optimizer/adagrad.py b/python/paddle/optimizer/adagrad.py
index 6238d32e9c..7ca4ab648a 100644
--- a/python/paddle/optimizer/adagrad.py
+++ b/python/paddle/optimizer/adagrad.py
@@ -32,7 +32,7 @@ class Adagrad(Optimizer):
 
         moment\_out &= moment + grad * grad
 
-        param\_out &= param - \\frac{learning\_rate * grad}{\sqrt{moment\_out} + \epsilon}
+        param\_out &= param - \frac{learning\_rate * grad}{\sqrt{moment\_out} + \epsilon}
 
     The original paper does not have the ``epsilon`` attribute. It is added here
diff --git a/python/paddle/optimizer/adam.py b/python/paddle/optimizer/adam.py
index e59deb5d61..e065ee91c6 100644
--- a/python/paddle/optimizer/adam.py
+++ b/python/paddle/optimizer/adam.py
@@ -42,14 +42,14 @@ class Adam(Optimizer):
 
         t & = t + 1
 
-        moment\_1\_out & = {\\beta}_1 * moment\_1 + (1 - {\\beta}_1) * grad
+        moment\_1\_out & = {\beta}_1 * moment\_1 + (1 - {\beta}_1) * grad
 
-        moment\_2\_out & = {\\beta}_2 * moment\_2 + (1 - {\\beta}_2) * grad * grad
+        moment\_2\_out & = {\beta}_2 * moment\_2 + (1 - {\beta}_2) * grad * grad
 
-        learning\_rate & = learning\_rate * \\
-            \\frac{\sqrt{1 - {\\beta}_2^t}}{1 - {\\beta}_1^t}
+        learning\_rate & = learning\_rate * \
+            \frac{\sqrt{1 - {\beta}_2^t}}{1 - {\beta}_1^t}
 
-        param\_out & = param - learning\_rate * \\frac{moment\_1}{\sqrt{moment\_2} + \epsilon}
+        param\_out & = param - learning\_rate * \frac{moment\_1}{\sqrt{moment\_2} + \epsilon}
 
     Related paper: `Adam: A Method for Stochastic Optimization `_
diff --git a/python/paddle/optimizer/adamax.py b/python/paddle/optimizer/adamax.py
index 867b770372..de70e2e72a 100644
--- a/python/paddle/optimizer/adamax.py
+++ b/python/paddle/optimizer/adamax.py
@@ -33,13 +33,13 @@ class Adamax(Optimizer):
 
         t & = t + 1
 
-        moment\_out & = {\\beta}_1 * moment + (1 - {\\beta}_1) * grad
+        moment\_out & = {\beta}_1 * moment + (1 - {\beta}_1) * grad
 
-        inf\_norm\_out & = max({\\beta}_2 * inf\_norm + \epsilon, |grad|)
+        inf\_norm\_out & = max({\beta}_2 * inf\_norm + \epsilon, |grad|)
 
-        learning\_rate & = \\frac{learning\_rate}{1 - {\\beta}_1^t}
+        learning\_rate & = \frac{learning\_rate}{1 - {\beta}_1^t}
 
-        param\_out & = param - learning\_rate * \\frac{moment\_out}{inf\_norm\_out}
+        param\_out & = param - learning\_rate * \frac{moment\_out}{inf\_norm\_out}
 
     Related paper: `Adam: A Method for Stochastic Optimization `_
diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py
index f830a9096c..11ba49c070 100644
--- a/python/paddle/optimizer/adamw.py
+++ b/python/paddle/optimizer/adamw.py
@@ -32,14 +32,14 @@ class AdamW(Adam):
 
         t & = t + 1
 
-        moment\_1\_out & = {\\beta}_1 * moment\_1 + (1 - {\\beta}_1) * grad
+        moment\_1\_out & = {\beta}_1 * moment\_1 + (1 - {\beta}_1) * grad
 
-        moemnt\_2\_out & = {\\beta}_2 * moment\_2 + (1 - {\\beta}_2) * grad * grad
+        moment\_2\_out & = {\beta}_2 * moment\_2 + (1 - {\beta}_2) * grad * grad
 
-        learning\_rate & = learning\_rate * \\
-            \\frac{\sqrt{1 - {\\beta}_2^t}}{1 - {beta}_1^t}
+        learning\_rate & = learning\_rate *
+            \frac{\sqrt{1 - {\beta}_2^t}}{1 - {\beta}_1^t}
 
-        param\_out & = param - learning\_rate * (\\frac{moment\_1}{\sqrt{moment\_2} + \epsilon} + \lambda * param)
+        param\_out & = param - learning\_rate * (\frac{moment\_1}{\sqrt{moment\_2} + \epsilon} + \lambda * param)
 
     Args:
diff --git a/python/paddle/optimizer/lamb.py b/python/paddle/optimizer/lamb.py
index 1d72c1df2b..43d4d326bd 100644
--- a/python/paddle/optimizer/lamb.py
+++ b/python/paddle/optimizer/lamb.py
@@ -34,17 +34,17 @@ class Lamb(Optimizer):
 
     .. math::
 
-        m_t &= \\beta_1 m_{t - 1}+ (1 - \\beta_1)g_t
+        m_t &= \beta_1 m_{t - 1}+ (1 - \beta_1)g_t
 
-        v_t &= \\beta_2 v_{t - 1} + (1 - \\beta_2)g_t^2
+        v_t &= \beta_2 v_{t - 1} + (1 - \beta_2)g_t^2
 
-        m_t &= \\frac{m_t}{\\beta_1^t}
+        m_t &= \frac{m_t}{\beta_1^t}
 
-        v_t &= \\frac{v_t}{\\beta_2^t}
+        v_t &= \frac{v_t}{\beta_2^t}
 
-        r_t &= \\frac{m_t}{\\sqrt{v_t}+\\epsilon}
+        r_t &= \frac{m_t}{\sqrt{v_t}+\epsilon}
 
-        w_t &= w_{t-1} -\\eta_t \\frac{\\left \| w_{t-1}\\right \|}{\\left \| r_t + \\lambda w_{t-1}\\right \|} (r_t + \\lambda w_{t-1})
+        w_t &= w_{t-1} -\eta_t \frac{\left \| w_{t-1}\right \|}{\left \| r_t + \lambda w_{t-1}\right \|} (r_t + \lambda w_{t-1})
 
     where :math:`m` is the 1st moment, and :math:`v` the 2nd moment, :math:`\\eta` the
@@ -76,8 +76,8 @@ class Lamb(Optimizer):
         .. code-block:: python
 
             import paddle
-            import numpy as np
-            inp = paddle.uniform(min=-0.1, max=0.1, shape=[10, 10], dtype='float32')
+
+            inp = paddle.uniform(shape=[10, 10], dtype='float32', min=-0.1, max=0.1)
             linear = paddle.nn.Linear(10, 10)
             out = linear(inp)
             loss = paddle.mean(out)
@@ -88,30 +88,6 @@ class Lamb(Optimizer):
             lamb.step()
             lamb.clear_grad()
 
-
-            #Note that the learning_rate of linear_2 is 0.01.
-            linear_1 = paddle.nn.Linear(10, 10)
-            linear_2 = paddle.nn.Linear(10, 10)
-            inp = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1)
-            out = linear_1(inp)
-            out = linear_2(out)
-            loss = paddle.mean(out)
-            lamb = paddle.optimizer.Lamb(
-                learning_rate=0.1,
-                parameters=[{
-                    'params': linear_1.parameters()
-                }, {
-                    'params': linear_2.parameters(),
-                    'weight_decay': 0.001,
-                    'learning_rate': 0.1,
-                    'lamb_weight_decay': 0.02
-                }],
-                weight_decay=0.01,
-                lamb_weight_decay=0.01)
-            out.backward()
-            lamb.step()
-            lamb.clear_grad()
-
     """
     _moment1_acc_str = "moment1"
     _moment2_acc_str = "moment2"
diff --git a/python/paddle/optimizer/lr.py b/python/paddle/optimizer/lr.py
index 7cea2645fa..be1786696b 100644
--- a/python/paddle/optimizer/lr.py
+++ b/python/paddle/optimizer/lr.py
@@ -472,7 +472,7 @@ class InverseTimeDecay(LRScheduler):
 
     .. math::
 
-        new\_learning\_rate = \\frac{learning\_rate}{1 + gamma * epoch}
+        new\_learning\_rate = \frac{learning\_rate}{1 + gamma * epoch}
 
     Args:
         learning_rate (float): The initial learning rate. It is a python float number.
@@ -555,9 +555,9 @@ class PolynomialDecay(LRScheduler):
 
     .. math::
 
-        decay\_steps & = decay\_steps * math.ceil(\\frac{epoch}{decay\_steps})
+        decay\_steps & = decay\_steps * math.ceil(\frac{epoch}{decay\_steps})
 
-        new\_learning\_rate & = (learning\_rate-end\_lr)*(1-\\frac{epoch}{decay\_steps})^{power}+end\_lr
+        new\_learning\_rate & = (learning\_rate-end\_lr)*(1-\frac{epoch}{decay\_steps})^{power}+end\_lr
 
     If cycle is set to False, then:
@@ -565,7 +565,7 @@ class PolynomialDecay(LRScheduler):
 
     .. math::
 
         epoch & = min(epoch, decay\_steps)
 
-        new\_learning\_rate & = (learning\_rate-end\_lr)*(1-\\frac{epoch}{decay\_steps})^{power}+end\_lr
+        new\_learning\_rate & = (learning\_rate-end\_lr)*(1-\frac{epoch}{decay\_steps})^{power}+end\_lr
 
 
     Args:
@@ -676,7 +676,7 @@ class LinearWarmup(LRScheduler):
 
     .. math::
 
-        lr = start\_lr + (end\_lr - start\_lr) * \\frac{epoch}{warmup\_steps}
+        lr = start\_lr + (end\_lr - start\_lr) * \frac{epoch}{warmup\_steps}
 
     where start_lr is the initial learning rate, and end_lr is the final learning rate;
@@ -1407,14 +1407,13 @@ class CosineAnnealingDecay(LRScheduler):
 
     .. math::
 
-        \\begin{aligned}
-            \eta_t & = \eta_{min} + \\frac{1}{2}(\eta_{max} - \eta_{min})\left(1
-            + \cos\left(\\frac{T_{cur}}{T_{max}}\pi\\right)\\right),
-            & T_{cur} \\neq (2k+1)T_{max}; \\
-            \eta_{t+1} & = \eta_{t} + \\frac{1}{2}(\eta_{max} - \eta_{min})
-            \left(1 - \cos\left(\\frac{1}{T_{max}}\pi\\right)\\right),
-            & T_{cur} = (2k+1)T_{max}.
-        \end{aligned}
+        \eta_t & = \eta_{min} + \frac{1}{2}(\eta_{max} - \eta_{min})\left(1
+        + \cos\left(\frac{T_{cur}}{T_{max}}\pi\right)\right),
+        & T_{cur} \neq (2k+1)T_{max};
+
+        \eta_{t+1} & = \eta_{t} + \frac{1}{2}(\eta_{max} - \eta_{min})
+        \left(1 - \cos\left(\frac{1}{T_{max}}\pi\right)\right),
+        & T_{cur} = (2k+1)T_{max}.
 
     It has been proposed in `SGDR: Stochastic Gradient Descent with Warm Restarts `_. Note that this only
     implements the cosine annealing part of SGDR, and not the restarts.
diff --git a/python/paddle/optimizer/rmsprop.py b/python/paddle/optimizer/rmsprop.py
index 14249df3f5..6a59052fc0 100644
--- a/python/paddle/optimizer/rmsprop.py
+++ b/python/paddle/optimizer/rmsprop.py
@@ -30,9 +30,9 @@ class RMSProp(Optimizer):
 
     .. math::
 
-        r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2
+        r(w, t) & = \rho r(w, t-1) + (1 - \rho)(\nabla Q_{i}(w))^2
 
-        w & = w - \\frac{\\eta} {\\sqrt{r(w,t) + \\epsilon}} \\nabla Q_{i}(w)
+        w & = w - \frac{\eta} {\sqrt{r(w,t) + \epsilon}} \nabla Q_{i}(w)
 
     The first equation calculates moving average of the squared gradient for
     each weight. Then dividing the gradient by :math:`sqrt{v(w,t)}`.
@@ -42,10 +42,10 @@ class RMSProp(Optimizer):
 
     .. math::
 
-        r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2
+        r(w, t) & = \rho r(w, t-1) + (1 - \rho)(\nabla Q_{i}(w))^2
 
-        v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{r(w,t) +
-            \\epsilon}} \\nabla Q_{i}(w)
+        v(w, t) & = \beta v(w, t-1) + \frac{\eta} {\sqrt{r(w,t) +
+            \epsilon}} \nabla Q_{i}(w)
 
         w & = w - v(w, t)
@@ -53,12 +53,12 @@ class RMSProp(Optimizer):
 
     .. math::
 
-        r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2
+        r(w, t) & = \rho r(w, t-1) + (1 - \rho)(\nabla Q_{i}(w))^2
 
-        g(w, t) & = \\rho g(w, t-1) + (1 - \\rho)\\nabla Q_{i}(w)
+        g(w, t) & = \rho g(w, t-1) + (1 - \rho)\nabla Q_{i}(w)
 
-        v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{r(w,t) - (g(w, t))^2 +
-            \\epsilon}} \\nabla Q_{i}(w)
+        v(w, t) & = \beta v(w, t-1) + \frac{\eta} {\sqrt{r(w,t) - (g(w, t))^2 +
+            \epsilon}} \nabla Q_{i}(w)
 
         w & = w - v(w, t)
-- 
GitLab
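
Note for readers checking the corrected formulas against real usage: below is a minimal sketch that exercises one of the patched docstrings (Adam), in the same style as the paddle.uniform example kept in the Lamb docstring above. The choice of Adam and the beta1/beta2/epsilon values are illustrative assumptions, not part of this patch.

    import paddle

    # Toy data and model, mirroring the docstring example retained in lamb.py.
    inp = paddle.uniform(shape=[10, 10], dtype='float32', min=-0.1, max=0.1)
    linear = paddle.nn.Linear(10, 10)
    loss = paddle.mean(linear(inp))

    # Adam applies the documented update: moment_1 and moment_2 track grad and
    # grad*grad, and the step size is scaled by sqrt(1 - beta2^t) / (1 - beta1^t),
    # matching the corrected formula in adam.py. Hyperparameter values are illustrative.
    adam = paddle.optimizer.Adam(learning_rate=0.001,
                                 beta1=0.9,
                                 beta2=0.999,
                                 epsilon=1e-8,
                                 parameters=linear.parameters())
    loss.backward()
    adam.step()
    adam.clear_grad()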