From 30d6b4f65debb3058f50442b413a65b9d2018439 Mon Sep 17 00:00:00 2001
From: Megvii Engine Team
Date: Tue, 2 Feb 2021 12:35:09 +0800
Subject: [PATCH] fix(mge): fix scalar parameter change to 1-dim parameter
 after backward and optimize

GitOrigin-RevId: 1794369a71251475cbe8f839cbf35f91a3adee99
---
 .../python/megengine/autodiff/grad_manager.py  |  2 ++
 .../python/megengine/functional/inplace.py     |  6 +++++-
 .../python/megengine/optimizer/adadelta.py     | 19 +++++++++++--------
 .../python/megengine/optimizer/adagrad.py      | 18 ++++++++++--------
 imperative/python/megengine/optimizer/adam.py  |  2 +-
 imperative/python/megengine/optimizer/sgd.py   |  8 ++++----
 .../python/test/integration/test_optimizer.py  | 18 +++++++++++++-----
 7 files changed, 46 insertions(+), 27 deletions(-)

diff --git a/imperative/python/megengine/autodiff/grad_manager.py b/imperative/python/megengine/autodiff/grad_manager.py
index 6a9c65a3..3c619512 100644
--- a/imperative/python/megengine/autodiff/grad_manager.py
+++ b/imperative/python/megengine/autodiff/grad_manager.py
@@ -279,6 +279,8 @@ class GradManager:
                     tensor.grad = grad
                 else:
                     tensor.grad += grad
+                if tensor.isscalar() and tensor.grad is not None:
+                    tensor.grad.setscalar()
         finally:
             self.release()
             backwarding_grad_manager = cache
diff --git a/imperative/python/megengine/functional/inplace.py b/imperative/python/megengine/functional/inplace.py
index 9a955900..30b96f75 100644
--- a/imperative/python/megengine/functional/inplace.py
+++ b/imperative/python/megengine/functional/inplace.py
@@ -12,4 +12,8 @@ from ..core.ops.builtin import InplaceAdd


 def _inplace_add_(dest, delta, alpha, beta):
-    return dest._reset(apply(InplaceAdd(), dest, delta, alpha, beta)[0])
+    isscalar = dest.isscalar()
+    dest._reset(apply(InplaceAdd(), dest, delta, alpha, beta)[0])
+    if isscalar:
+        dest.setscalar()
+    return dest
diff --git a/imperative/python/megengine/optimizer/adadelta.py b/imperative/python/megengine/optimizer/adadelta.py
index 73f73cf1..60b18593 100644
--- a/imperative/python/megengine/optimizer/adadelta.py
+++ b/imperative/python/megengine/optimizer/adadelta.py
@@ -61,16 +61,19 @@ class Adadelta(Optimizer):
         rho = param_group["rho"]
         eps = param_group["eps"]

+        def make_scalar(val):
+            return tensor(val)
+
         # since `conver_inputs` is disabled for param updates,
         # scalar should be explicitly tansforred to tensor
-        _lr = tensor([lr])
-        _weight_decay = tensor([weight_decay])
-        _rho = tensor([rho])
-        _eps = tensor([eps])
-
-        c05 = tensor([0.5])
-        c1 = tensor([1.0])
-        c2 = tensor([2.0])
+
+        _lr = make_scalar(lr)
+        _weight_decay = make_scalar(weight_decay)
+        _rho = make_scalar(rho)
+        _eps = make_scalar(eps)
+
+        c1, c2, c05 = map(make_scalar, (1.0, 2.0, 0.5))
+
         for param in param_group["params"]:

             if param.grad is None:
diff --git a/imperative/python/megengine/optimizer/adagrad.py b/imperative/python/megengine/optimizer/adagrad.py
index 3332518b..9b309077 100644
--- a/imperative/python/megengine/optimizer/adagrad.py
+++ b/imperative/python/megengine/optimizer/adagrad.py
@@ -60,16 +60,18 @@ class Adagrad(Optimizer):
         weight_decay = param_group["weight_decay"]
         eps = param_group["eps"]

+        def make_scalar(val):
+            return tensor(val)
+
         # since `conver_inputs` is disabled for param updates,
         # scalar should be explicitly tansforred to tensor
-        _lr = tensor([lr])
-        _lr_decay = tensor([lr_decay])
-        _weight_decay = tensor([weight_decay])
-        _eps = tensor([eps])
-
-        c05 = tensor([0.5])
-        c1 = tensor([1.0])
-        c2 = tensor([2.0])
+
+        _lr, _lr_decay = map(make_scalar, (lr, lr_decay))
+        _weight_decay = make_scalar(weight_decay)
+        _eps = make_scalar(eps)
+
+        c1, c2, c05 = map(make_scalar, (1.0, 2.0, 0.5))
+
         for param in param_group["params"]:

             if param.grad is None:
diff --git a/imperative/python/megengine/optimizer/adam.py b/imperative/python/megengine/optimizer/adam.py
index 13d490fa..4bd7bea6 100644
--- a/imperative/python/megengine/optimizer/adam.py
+++ b/imperative/python/megengine/optimizer/adam.py
@@ -61,7 +61,7 @@ class Adam(Optimizer):
         beta0, beta1 = param_group["betas"]

         def make_scalar(val):
-            return tensor([val])
+            return tensor(val)

         # since `conver_inputs` is disabled for param updates,
         # scalar should be explicitly tansforred to tensor
diff --git a/imperative/python/megengine/optimizer/sgd.py b/imperative/python/megengine/optimizer/sgd.py
index 8a61ab88..95e5867c 100644
--- a/imperative/python/megengine/optimizer/sgd.py
+++ b/imperative/python/megengine/optimizer/sgd.py
@@ -57,13 +57,13 @@ class SGD(Optimizer):

         # since `conver_inputs` is disabled for param updates,
         # scalar should be explicitly tansforred to tensor
-        _lr = tensor([lr])
-        _weight_decay = tensor([weight_decay])
-        _momentum = tensor([momentum])
+        _lr = tensor(lr)
+        _weight_decay = tensor(weight_decay)
+        _momentum = tensor(momentum)

         inplace_mode = int(os.getenv("MEGENGINE_INPLACE_UPDATE", "0"))
         if inplace_mode:
-            _neg_lr = tensor([-lr])
+            _neg_lr = tensor(-lr)
             c1 = tensor([1.0])

         for param in param_group["params"]:
diff --git a/imperative/python/test/integration/test_optimizer.py b/imperative/python/test/integration/test_optimizer.py
index 28b84fe8..6210233e 100644
--- a/imperative/python/test/integration/test_optimizer.py
+++ b/imperative/python/test/integration/test_optimizer.py
@@ -32,7 +32,7 @@ class MLP(Module):
 class Simple(Module):
     def __init__(self):
         super().__init__()
-        self.a = Parameter([1.23], dtype=np.float32)
+        self.a = Parameter(1.23, dtype=np.float32)

     def forward(self, x):
         x = x * self.a
@@ -64,6 +64,7 @@ def _test_optimizer(opt_str, test_case, check_class, update_lr=False):

         ori_params = {}
         for param in net.parameters():
+            assert param._tuple_shape is ()
             ori_params[param] = np.copy(param.numpy())
         opt.step()
         step += 1
@@ -95,6 +96,7 @@ def _test_optimizer(opt_str, test_case, check_class, update_lr=False):

         ori_params = {}
         for param in net.parameters():
+            assert param._tuple_shape is ()
             ori_params[param] = np.copy(param.numpy())

         train_func(
@@ -121,7 +123,9 @@ def test_sgd():
                     delta = -self.lr * self.slots[param]
                 else:
                     delta = -self.lr * grad
-                np.testing.assert_almost_equal(param.numpy(), ori_params[param] + delta)
+                np.testing.assert_almost_equal(
+                    param.numpy(), ori_params[param] + delta, decimal=6
+                )

     cases = [
         {"momentum": 0.9, "lr": 0.01},  # SGD with momentum
@@ -157,7 +161,7 @@ def test_adam():
                     np.sqrt(v / (1 - self.betas[1] ** step)) + self.eps
                 )
                 np.testing.assert_almost_equal(
-                    param.numpy(), ori_params[param] - self.lr * delta
+                    param.numpy(), ori_params[param] - self.lr * delta, decimal=6
                 )

     cases = [
@@ -189,7 +193,9 @@ def test_adagrad():
                 self.s_slots[param] += grad ** 2
                 delta = grad / (self.s_slots[param] + self.eps) ** 0.5
                 delta *= -(self.lr / (1 + (step - 1) * self.lr_decay))
-                np.testing.assert_almost_equal(param.numpy(), ori_params[param] + delta)
+                np.testing.assert_almost_equal(
+                    param.numpy(), ori_params[param] + delta, decimal=6
+                )

     cases = [
         {"lr": 0.01, "eps": 1e-06, "lr_decay": 0.01},
@@ -232,7 +238,9 @@ def test_adadelta():
                     1 - self.rho
                 )
                 delta *= -self.lr
-                np.testing.assert_almost_equal(param.numpy(), ori_params[param] + delta)
+                np.testing.assert_almost_equal(
+                    param.numpy(), ori_params[param] + delta, decimal=6
+                )

     cases = [
         {"lr": 1.0, "eps": 1e-06, "rho": 0.9},
-- 
GitLab
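Illustration (not from the patch): a minimal usage sketch of the behaviour the patch locks in, assuming only the public MegEngine APIs already referenced above (Parameter, tensor, GradManager, SGD) and mirroring the private `_tuple_shape` check added to test_optimizer.py. A 0-dim parameter should keep its scalar-ness through backward() and an optimizer step.

    import numpy as np

    from megengine import Parameter, tensor
    from megengine.autodiff import GradManager
    from megengine.optimizer import SGD

    # 0-dim (scalar) parameter, as in the patched Simple module.
    a = Parameter(1.23, dtype=np.float32)

    gm = GradManager()
    gm.attach([a])
    opt = SGD([a], lr=0.01)

    with gm:
        x = tensor(np.random.random((2, 3)).astype(np.float32))
        loss = (x * a).sum()
        gm.backward(loss)

    # grad_manager.py change: the gradient of a scalar tensor stays 0-dim.
    assert a.grad._tuple_shape is ()

    opt.step()
    opt.clear_grad()

    # optimizer / inplace changes: the updated parameter stays 0-dim as well.
    assert a._tuple_shape is ()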