Commit 7aa7a09b authored by Megvii Engine Team

feat(mge/optimizer): add optimizer adadelta

GitOrigin-RevId: 244bc0d74a0cc7d8d7274e2ff22cb24f0e95f2ca
Parent 205291a3
@@ -6,6 +6,7 @@
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
from .adadelta import Adadelta
from .adagrad import Adagrad
from .adam import Adam
from .lr_scheduler import LRScheduler
...
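The only change to the package `__init__.py` is the re-export above, which makes the new class importable next to the existing optimizers. A minimal sketch, assuming this package is exposed as `megengine.optimizer` (the full import path is not visible in this excerpt):

```python
# Assumes the package shown in the diff above is exposed as megengine.optimizer;
# that path is not visible in this excerpt.
from megengine.optimizer import Adadelta, Adagrad, Adam

opt_cls = Adadelta          # now available alongside the previously exported optimizers
print(opt_cls.__name__)     # -> "Adadelta"
```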
from typing import Iterable, Union

import numpy as np

from ..core import Buffer, Parameter
from ..functional import sqrt
from .internal import add_update_fastpath as add_update
from .optimizer import Optimizer


class Adadelta(Optimizer):
    r"""Implements Adadelta algorithm.

    It has been proposed in `"ADADELTA: An Adaptive Learning Rate Method"
    <https://arxiv.org/abs/1212.5701>`_.

    :param params: iterable of parameters to optimize or dicts defining
        parameter groups.
    :param lr: coefficient that scales delta before it is applied
        to the parameters (default: 1.0).
    :param rho: coefficient used for computing a running average
        of squared gradients (default: 0.9).
    :param eps: term added to the denominator to improve
        numerical stability (default: 1e-6).
    :param weight_decay: weight decay (L2 penalty) (default: 0).
    """

    def __init__(
        self,
        params: Union[Iterable[Parameter], dict],
        lr: float = 1.0,
        rho: float = 0.9,
        eps: float = 1e-6,
        weight_decay: float = 0.0,
    ):
        assert lr >= 0.0, "Invalid learning rate: {}".format(lr)
        assert rho >= 0.0 and rho <= 1.0, "Invalid rho value: {}".format(rho)
        assert eps >= 0.0, "Invalid epsilon value: {}".format(eps)
        assert weight_decay >= 0.0, "Invalid weight_decay value: {}".format(
            weight_decay
        )

        defaults = dict(lr=lr, rho=rho, eps=eps, weight_decay=weight_decay)
        super().__init__(params, defaults)

    def _create_state(self, param_group):
        for param in param_group["params"]:
            self._add_state(param, "square_avg")
            self._add_state(param, "acc_delta")
            self._add_state(param, "step", initializer=0.0)

    def _updates(self, param_group):
        lr = param_group["lr"]
        weight_decay = param_group["weight_decay"]
        rho = param_group["rho"]
        eps = param_group["eps"]

        for param in param_group["params"]:
            if not isinstance(param.grad, Buffer):
                raise TypeError(
                    "grad must be a Buffer, maybe you forgot to call backward()?"
                )

            if not param.requires_grad:
                continue

            step = self._state[param]["step"]
            step = add_update(step, 1)
            grad = param.grad
            if weight_decay != 0.0:
                grad = add_update(grad, param, beta=weight_decay)

            square_avg = self._state[param]["square_avg"]
            acc_delta = self._state[param]["acc_delta"]
            square_avg = add_update(square_avg, grad ** 2, alpha=rho, beta=1 - rho)
            std = sqrt(square_avg + eps)
            delta = sqrt(acc_delta + eps) / std * grad
            add_update(param, delta, beta=-lr)
            acc_delta = add_update(acc_delta, delta ** 2, alpha=rho, beta=1 - rho)
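For reference, the per-parameter update that `_updates` performs (the scheme from the linked ADADELTA paper, with the learning rate and weight decay folded in exactly as in the code above) can be written as:

```latex
% One Adadelta step for a parameter x with gradient g, as implemented in _updates.
\begin{align*}
g_t           &= \nabla f(x_{t-1}) + \lambda\, x_{t-1}           && \text{gradient plus weight decay } \lambda \\
E[g^2]_t      &= \rho\, E[g^2]_{t-1} + (1-\rho)\, g_t^2          && \text{running average of squared gradients} \\
\Delta_t      &= \frac{\sqrt{E[\Delta^2]_{t-1} + \epsilon}}{\sqrt{E[g^2]_t + \epsilon}}\, g_t \\
x_t           &= x_{t-1} - \mathrm{lr}\cdot \Delta_t \\
E[\Delta^2]_t &= \rho\, E[\Delta^2]_{t-1} + (1-\rho)\, \Delta_t^2 && \text{running average of squared deltas}
\end{align*}
```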
@@ -189,72 +189,70 @@ def test_adam():
        _test_optimizer("Adam", case, CheckValue, update_lr=True)


def test_adagrad():
    class CheckValue:
        def __init__(self, net, **kwarg):
            self.s_slots = TensorDict()
            for param in net.parameters():
                self.s_slots[param] = np.zeros(param.shape).astype(np.float32)
            for k, v in kwarg.items():
                setattr(self, k, v)

        def __call__(self, ori_params, new_params, step):
            for param in new_params:
                grad = param.grad.numpy()
                self.s_slots[param] += grad ** 2
                delta = grad / (self.s_slots[param] + self.eps) ** 0.5
                delta *= -(self.lr / (1 + (step - 1) * self.lr_decay))
                assertTensorClose(param.numpy(), ori_params[param] + delta)

    cases = [
        {"lr": 0.01, "eps": 1e-06, "lr_decay": 0.01},
        {"lr": 0.01, "eps": 1e-06, "lr_decay": 0.0},  # without lr_decay
        {
            "lr": 0.01,
            "eps": 1e-06,
            "lr_decay": 0.01,
            "weight_decay": 0.1,
        },  # with weight_decay
    ]
    for case in cases:
        _test_optimizer("Adagrad", case, CheckValue)
        _test_optimizer("Adagrad", case, CheckValue, update_lr=True)


def test_adadelta():
    class CheckValue:
        def __init__(self, net, **kwarg):
            self.s_slots = TensorDict()
            self.a_slots = TensorDict()
            for param in net.parameters():
                self.s_slots[param] = np.zeros(param.shape).astype(np.float32)
                self.a_slots[param] = np.zeros(param.shape).astype(np.float32)
            for k, v in kwarg.items():
                setattr(self, k, v)

        def __call__(self, ori_params, new_params, step):
            for param in new_params:
                grad = param.grad.numpy()
                self.s_slots[param] = self.s_slots[param] * self.rho + grad ** 2 * (
                    1 - self.rho
                )
                delta = (
                    grad
                    * ((self.a_slots[param] + self.eps) ** 0.5)
                    / (self.s_slots[param] + self.eps) ** 0.5
                )
                self.a_slots[param] = self.a_slots[param] * self.rho + delta ** 2 * (
                    1 - self.rho
                )
                delta *= -self.lr
                assertTensorClose(param.numpy(), ori_params[param] + delta)

    cases = [
        {"lr": 1.0, "eps": 1e-06, "rho": 0.9},
        {"lr": 1.0, "eps": 1e-06, "rho": 0.9, "weight_decay": 0.9},  # with weight_decay
    ]
    for case in cases:
        _test_optimizer("Adadelta", case, CheckValue)
        _test_optimizer("Adadelta", case, CheckValue, update_lr=True)