diff --git a/python_module/megengine/optimizer/__init__.py b/python_module/megengine/optimizer/__init__.py
index 328cfb9fcd0e74c9ac44ade12f30df90101dc329..ad783e0605e0308354d5e2ef3ba21327086f3938 100644
--- a/python_module/megengine/optimizer/__init__.py
+++ b/python_module/megengine/optimizer/__init__.py
@@ -6,6 +6,7 @@
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+from .adadelta import Adadelta
 from .adagrad import Adagrad
 from .adam import Adam
 from .lr_scheduler import LRScheduler
diff --git a/python_module/megengine/optimizer/adadelta.py b/python_module/megengine/optimizer/adadelta.py
new file mode 100644
index 0000000000000000000000000000000000000000..7d793608c6683f0d896ec4b550553cac1c5a2651
--- /dev/null
+++ b/python_module/megengine/optimizer/adadelta.py
@@ -0,0 +1,78 @@
+from typing import Iterable, Union
+
+import numpy as np
+
+from ..core import Buffer, Parameter
+from ..functional import sqrt
+from .internal import add_update_fastpath as add_update
+from .optimizer import Optimizer
+
+
+class Adadelta(Optimizer):
+    r"""Implements Adadelta algorithm.
+
+    It has been proposed in `"ADADELTA: An Adaptive Learning Rate Method" <https://arxiv.org/abs/1212.5701>`_.
+
+    :param params: iterable of parameters to optimize or dicts defining
+        parameter groups.
+    :param lr: coefficient that scales delta before it is applied
+        to the parameters (default: 1.0).
+    :param rho: coefficient used for computing a running average
+        of squared gradients (default: 0.9).
+    :param eps: term added to the denominator to improve
+        numerical stability (default: 1e-6).
+    :param weight_decay: weight decay (L2 penalty) (default: 0).
+    """
+
+    def __init__(
+        self,
+        params: Union[Iterable[Parameter], dict],
+        lr: float = 1.0,
+        rho: float = 0.9,
+        eps: float = 1e-6,
+        weight_decay: float = 0.0,
+    ):
+        assert lr >= 0.0, "Invalid learning rate: {}".format(lr)
+        assert rho >= 0.0 and rho <= 1.0, "Invalid rho value: {}".format(rho)
+        assert eps >= 0.0, "Invalid epsilon value: {}".format(eps)
+        assert weight_decay >= 0.0, "Invalid weight_decay value: {}".format(
+            weight_decay
+        )
+
+        defaults = dict(lr=lr, rho=rho, eps=eps, weight_decay=weight_decay)
+        super().__init__(params, defaults)
+
+    def _create_state(self, param_group):
+        for param in param_group["params"]:
+            self._add_state(param, "square_avg")
+            self._add_state(param, "acc_delta")
+            self._add_state(param, "step", initializer=0.0)
+
+    def _updates(self, param_group):
+        lr = param_group["lr"]
+        weight_decay = param_group["weight_decay"]
+        rho = param_group["rho"]
+        eps = param_group["eps"]
+
+        for param in param_group["params"]:
+            if not isinstance(param.grad, Buffer):
+                raise TypeError(
+                    "grad must be a Buffer, maybe you forget to call backward()?"
+                )
+
+            if not param.requires_grad:
+                continue
+
+            step = self._state[param]["step"]
+            step = add_update(step, 1)
+            grad = param.grad
+            if weight_decay != 0.0:
+                grad = add_update(grad, param, beta=weight_decay)
+
+            square_avg = self._state[param]["square_avg"]
+            acc_delta = self._state[param]["acc_delta"]
+            square_avg = add_update(square_avg, grad ** 2, alpha=rho, beta=1 - rho)
+            std = sqrt(square_avg + eps)
+            delta = sqrt(acc_delta + eps) / std * grad
+            add_update(param, delta, beta=-lr)
+            acc_delta = add_update(acc_delta, delta ** 2, alpha=rho, beta=1 - rho)
diff --git a/python_module/test/unit/optimizer/test_optimizer.py b/python_module/test/unit/optimizer/test_optimizer.py
index 0d988d079c0fd4d8492336d52d1b6d81d9be6229..e172df799b5ea6a4f218771942a3bbd2ba33c345 100644
--- a/python_module/test/unit/optimizer/test_optimizer.py
+++ b/python_module/test/unit/optimizer/test_optimizer.py
@@ -189,72 +189,70 @@ def test_adam():
         _test_optimizer("Adam", case, CheckValue, update_lr=True)
 
 
-def test_adam():
+def test_adagrad():
     class CheckValue:
         def __init__(self, net, **kwarg):
-            self.m_slots = TensorDict()
-            self.v_slots = TensorDict()
+            self.s_slots = TensorDict()
             for param in net.parameters():
-                self.m_slots[param] = np.zeros(param.shape).astype(np.float32)
-                self.v_slots[param] = np.zeros(param.shape).astype(np.float32)
+                self.s_slots[param] = np.zeros(param.shape).astype(np.float32)
             for k, v in kwarg.items():
                 setattr(self, k, v)
 
         def __call__(self, ori_params, new_params, step):
            for param in new_params:
                grad = param.grad.numpy()
-                m = self.m_slots[param]
-                v = self.v_slots[param]
-                m *= self.betas[0]
-                m += (1 - self.betas[0]) * grad
-                v *= self.betas[1]
-                v += (1 - self.betas[1]) * grad * grad
-                delta = (m / (1 - self.betas[0] ** step)) / (
-                    np.sqrt(v / (1 - self.betas[1] ** step)) + self.eps
-                )
-                assertTensorClose(param.numpy(), ori_params[param] - self.lr * delta)
+                self.s_slots[param] += grad ** 2
+                delta = grad / (self.s_slots[param] + self.eps) ** 0.5
+                delta *= -(self.lr / (1 + (step - 1) * self.lr_decay))
+                assertTensorClose(param.numpy(), ori_params[param] + delta)
 
     cases = [
-        {"betas": (0.8, 0.9), "eps": 1e-04, "lr": 0.01},
+        {"lr": 0.01, "eps": 1e-06, "lr_decay": 0.01},
+        {"lr": 0.01, "eps": 1e-06, "lr_decay": 0.0},  # without lr_decay
         {
-            "betas": (0.8, 0.9),
-            "eps": 1e-04,
             "lr": 0.01,
+            "eps": 1e-06,
+            "lr_decay": 0.01,
             "weight_decay": 0.1,
         },  # with weight_decay
     ]
     for case in cases:
-        _test_optimizer("Adam", case, CheckValue)
-        _test_optimizer("Adam", case, CheckValue, update_lr=True)
+        _test_optimizer("Adagrad", case, CheckValue)
+        _test_optimizer("Adagrad", case, CheckValue, update_lr=True)
 
 
-def test_adagrad():
+def test_adadelta():
     class CheckValue:
         def __init__(self, net, **kwarg):
             self.s_slots = TensorDict()
+            self.a_slots = TensorDict()
             for param in net.parameters():
                 self.s_slots[param] = np.zeros(param.shape).astype(np.float32)
+                self.a_slots[param] = np.zeros(param.shape).astype(np.float32)
             for k, v in kwarg.items():
                 setattr(self, k, v)
 
         def __call__(self, ori_params, new_params, step):
             for param in new_params:
                 grad = param.grad.numpy()
-                self.s_slots[param] += grad ** 2
-                delta = grad / (self.s_slots[param] + self.eps) ** 0.5
-                delta *= -(self.lr / (1 + (step - 1) * self.lr_decay))
+                self.s_slots[param] = self.s_slots[param] * self.rho + grad ** 2 * (
+                    1 - self.rho
+                )
+                delta = (
+                    grad
+                    * ((self.a_slots[param] + self.eps) ** 0.5)
+                    / (self.s_slots[param] + self.eps) ** 0.5
+                )
+                self.a_slots[param] = self.a_slots[param] * self.rho + delta ** 2 * (
+                    1 - self.rho
+                )
+                delta *= -self.lr
                 assertTensorClose(param.numpy(), ori_params[param] + delta)
 
     cases = [
-        {"lr": 0.01, "eps": 1e-06, "lr_decay": 0.01},
-        {"lr": 0.01, "eps": 1e-06, "lr_decay": 0.0},  # without lr_decay
-        {
-            "lr": 0.01,
-            "eps": 1e-06,
-            "lr_decay": 0.01,
-            "weight_decay": 0.1,
-        },  # with weight_decay
+        {"lr": 1.0, "eps": 1e-06, "rho": 0.9},
+        {"lr": 1.0, "eps": 1e-06, "rho": 0.9, "weight_decay": 0.9},  # with weight_decay
     ]
     for case in cases:
-        _test_optimizer("Adagrad", case, CheckValue)
-        _test_optimizer("Adagrad", case, CheckValue, update_lr=True)
+        _test_optimizer("Adadelta", case, CheckValue)
+        _test_optimizer("Adadelta", case, CheckValue, update_lr=True)
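
For reference, a minimal usage sketch of the optimizer added by this patch (not part of the diff). It assumes the zero_grad()/backward()/step() interface that the base Optimizer class in this module already exposes; net, data, label, and loss_fn are hypothetical placeholders for a model, an input batch, its labels, and a scalar loss function.

    from megengine.optimizer import Adadelta

    # Hypothetical setup: `net` is any megengine Module, `loss_fn` any scalar loss.
    opt = Adadelta(net.parameters(), lr=1.0, rho=0.9, eps=1e-6, weight_decay=0.0)

    opt.zero_grad()                   # clear gradient buffers
    loss = loss_fn(net(data), label)  # forward pass producing a scalar loss
    opt.backward(loss)                # assumed v0.x-style API that fills param.grad
    opt.step()                        # applies the Adadelta update implemented above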