diff --git a/python_module/test/unit/optimizer/test_optimizer.py b/python_module/test/unit/optimizer/test_optimizer.py
index b800d0b63a14b4d8205c6894fc4de188c3e3bae3..8f496c15e1621c33acf765666fa51cd3484bbf63 100644
--- a/python_module/test/unit/optimizer/test_optimizer.py
+++ b/python_module/test/unit/optimizer/test_optimizer.py
@@ -12,249 +12,178 @@ import numpy as np
 from helpers import MLP, graph_mode
 
 import megengine.functional as F
-from megengine import load, save
+from megengine import load, optimizer, save
 from megengine.core import TensorDict, tensor
 from megengine.jit import trace
-from megengine.optimizer import SGD, Adam
 from megengine.test import assertTensorClose
 
 
 def get_input():
-    batch_size = 2
-    input_dim = 28
-    data_shape = (batch_size, input_dim)
-    label_shape = (batch_size,)
-    data = tensor()
-    label = tensor(dtype=np.int32)
+    batch_size, input_dim = 2, 28
+    data_shape, label_shape = (batch_size, input_dim), (batch_size,)
+    data, label = tensor(dtype=np.float32), tensor(dtype=np.int32)
     data.set_value(np.random.random(data_shape).astype(np.float32))
     label.set_value(np.random.randint(0, 10, label_shape))
     return data, data_shape, label, label_shape
 
 
-def test_sgd_simple():
-    data, data_shape, label, label_shape = get_input()
-    mlp = MLP()
-    opt = SGD(mlp.parameters(), lr=0.01, weight_decay=0.1)
-    for idx in range(3):
-        data.set_value(np.random.random(data_shape).astype(np.float32))
-        label.set_value(np.random.randint(0, 10, label_shape))
-        pred = mlp(data)
-        loss = F.square_loss(pred, label.reshape(-1, 1))
-        if idx % 2:
-            opt.zero_grad()
-        else:
-            mlp.zero_grad()
-        opt.backward(loss)
-        grads = TensorDict()
-        orig_params = TensorDict()
-        for param in mlp.parameters():
-            grad = F.grad(loss, param, use_virtual_grad=False)
-            assertTensorClose(grad.numpy(), param.grad.numpy())
-            grads[param] = np.copy(grad.numpy())
-            orig_params[param] = np.copy(param.numpy())
-        opt.step()
-        for param in mlp.parameters():
-            assertTensorClose(
-                param.numpy(), orig_params[param] * 0.999 - grads[param] * 0.01
-            )
-
-
-def test_sgd_momentum():
+@graph_mode("eager", "static")
+def test_optimizer_serialization():
     data, data_shape, label, label_shape = get_input()
     mlp = MLP()
-    opt = SGD(mlp.parameters(), lr=0.01, momentum=0.9)
-    slots = TensorDict()
-    for param in mlp.parameters():
-        slots[param] = np.zeros(param.shape).astype(np.float32)
-    for _ in range(3):
-        data.set_value(np.random.random(data_shape).astype(np.float32))
-        label.set_value(np.random.randint(0, 10, label_shape))
-        pred = mlp(data)
-        loss = F.square_loss(pred, label.reshape(-1, 1))
-        opt.zero_grad()
-        opt.backward(loss)
-        orig_params = TensorDict()
-        grads = TensorDict()
-        for param in mlp.parameters():
-            orig_params[param] = np.copy(param.numpy())
-            grads[param] = np.copy(param.grad.numpy())
-        opt.step()
-        for param in mlp.parameters():
-            slot = slots[param]
-            orig_param = orig_params[param]
-            slot *= 0.9
-            slot -= param.grad.numpy() * 0.01
-            assertTensorClose(param.numpy(), orig_param + slot)
-
-
-# TODO: put opt.step() inside trace
-def test_sgd_momentum_static():
-    _, data_shape, _, label_shape = get_input()
-    mlp = MLP()
-    opt = SGD(mlp.parameters(), lr=0.01, momentum=0.9)
-
-    @trace
-    def f(data, label):
-        pred = mlp(data)
-        loss = F.square_loss(pred, label.reshape(-1, 1))
-        opt.zero_grad()
-        opt.backward(loss)
-
+    opt = optimizer.SGD(mlp.parameters(), lr=0.01, momentum=0.9)
     slots = TensorDict()
     for param in mlp.parameters():
         slots[param] = np.zeros(param.shape).astype(np.float32)
-    for _ in range(3):
-        f(
-            np.random.random(data_shape).astype(np.float32),
-            np.random.randint(0, 10, label_shape).astype(np.int32),
-        )
-        orig_params = TensorDict()
-        grads = TensorDict()
-        for param in mlp.parameters():
-            orig_params[param] = np.copy(param.numpy())
-            grads[param] = np.copy(param.grad.numpy())
-        opt.step()
-        for param in mlp.parameters():
-            slot = slots[param]
-            orig_param = orig_params[param]
-            slot *= 0.9
-            slot -= param.grad.numpy() * 0.01
-            assertTensorClose(param.numpy(), orig_param + slot)
-
 
-def test_update_lr():
-    data, data_shape, label, label_shape = get_input()
-    mlp = MLP()
-    opt = SGD(mlp.parameters(), lr=0.01)
     pred = mlp(data)
     loss = F.square_loss(pred, label.reshape(-1, 1))
     opt.zero_grad()
     opt.backward(loss)
     opt.step()
-    for group in opt.param_groups:
-        group["lr"] += 0.02
-    for _ in range(3):
+    for param in mlp.parameters():
+        slots[param] = slots[param] * 0.9 + param.grad.numpy()
+
+    with BytesIO() as fout:
+        save(opt.state_dict(), fout)
+        fout.seek(0)
+        state_dict = load(fout)
+        opt1 = optimizer.SGD(mlp.parameters(), lr=0.02, momentum=0.8)
+        opt1.load_state_dict(state_dict)
+
         data.set_value(np.random.random(data_shape).astype(np.float32))
         label.set_value(np.random.randint(0, 10, label_shape))
         pred = mlp(data)
         loss = F.square_loss(pred, label.reshape(-1, 1))
-        opt.zero_grad()
-        opt.backward(loss)
+        opt1.zero_grad()
+        opt1.backward(loss)
+        orig_params = TensorDict()
         for param in mlp.parameters():
-            grad = F.grad(loss, param, use_virtual_grad=False)
-            assertTensorClose(grad.numpy(), param.grad.numpy())
-        orig_params = []
+            orig_params[param] = np.copy(param.numpy())
+        opt1.step()
         for param in mlp.parameters():
-            orig_params.append(np.copy(param.numpy()))
-        opt.step()
-        for param, orig_param in zip(mlp.parameters(), orig_params):
-            assertTensorClose(param.numpy(), orig_param - param.grad.numpy() * 0.03)
+            orig_param = orig_params[param]
+            slots[param] = slots[param] * 0.9 + param.grad.numpy()
+            assertTensorClose(param.numpy(), orig_param - 0.01 * slots[param])
 
 
-def test_adam():
+def _test_optimizer(opt_str, test_case, check_class, update_lr=False):
+    iter_num = 3
     data, data_shape, label, label_shape = get_input()
-    mlp = MLP()
-    beta0 = 0.8
-    beta1 = 0.9
-    eps = 1e-4
-    opt = Adam(mlp.parameters(), lr=0.01, betas=(beta0, beta1), eps=eps)
-    m_slots = TensorDict()
-    v_slots = TensorDict()
-    for param in mlp.parameters():
-        m_slots[param] = np.zeros(param.shape).astype(np.float32)
-        v_slots[param] = np.zeros(param.shape).astype(np.float32)
-    step_size = 0
 
-    def check_value():
-        for param in mlp.parameters():
-            grad = param.grad.numpy()
-            orig_param = orig_params[param]
-            m = m_slots[param]
-            v = v_slots[param]
-            m *= beta0
-            m += (1 - beta0) * grad
-            v *= beta1
-            v += (1 - beta1) * grad * grad
-            update = (m / (1 - beta0 ** step_size)) / (
-                np.sqrt(v / (1 - beta1 ** step_size)) + eps
-            )
-            assertTensorClose(param.numpy(), orig_param - 0.01 * update)
+    net = MLP()
+    opt = getattr(optimizer, opt_str)(net.parameters(), **test_case)
+    check_func = check_class(net, **test_case)
 
-    # eager
-    for _ in range(3):
+    step = 0
+
+    # eager graph
+    for i in range(iter_num):
+        if update_lr and i == 1:  # change learning rate
+            for group in opt.param_groups:
+                group["lr"] += 0.01
+            check_func.lr += 0.01
         data.set_value(np.random.random(data_shape).astype(np.float32))
         label.set_value(np.random.randint(0, 10, label_shape))
-        pred = mlp(data)
+        pred = net(data)
         loss = F.square_loss(pred, label.reshape(-1, 1))
         opt.zero_grad()
-        grads = opt.backward(loss)
-        orig_params = TensorDict()
-        for param in mlp.parameters():
-            orig_params[param] = np.copy(param.numpy())
+        opt.backward(loss)
+        ori_params = TensorDict()
+        for param in net.parameters():
+            ori_params[param] = np.copy(param.numpy())
         opt.step()
-        step_size += 1
-        check_value()
+        step += 1
+        check_func(ori_params, net.parameters(), step)
 
-    # static
+    # static graph
     @trace
-    def f(data, label):
-        pred = mlp(data)
+    def train_func(data, label):
+        pred = net(data)
         loss = F.square_loss(pred, label.reshape(-1, 1))
         opt.backward(loss)
 
-    for _ in range(3):
+    for i in range(iter_num):
+        if update_lr and i == 1:  # change learning rate
+            for group in opt.param_groups:
+                group["lr"] += 0.01
+            check_func.lr += 0.01
         opt.zero_grad()
-        orig_params = TensorDict()
-        for param in mlp.parameters():
-            orig_params[param] = np.copy(param.numpy())
-        f(
+        ori_params = TensorDict()
+        for param in net.parameters():
+            ori_params[param] = np.copy(param.numpy())
+        train_func(
            np.random.random(data_shape).astype(np.float32),
            np.random.randint(0, 10, label_shape).astype(np.int32),
        )
         opt.step()
-        step_size += 1
-        check_value()
+        step += 1
+        check_func(ori_params, net.parameters(), step)
+
+
+def test_sgd():
+    class CheckValue:
+        def __init__(self, net, **kwarg):
+            self.slots = TensorDict()
+            for param in net.parameters():
+                self.slots[param] = np.zeros(param.shape).astype(np.float32)
+            for k, v in kwarg.items():
+                setattr(self, k, v)
+
+        def __call__(self, ori_params, new_params, step):
+            for param in new_params:
+                grad = param.grad.numpy()
+                if hasattr(self, "momentum"):
+                    self.slots[param] = grad + self.slots[param] * self.momentum
+                    delta = -self.lr * self.slots[param]
+                else:
+                    delta = -self.lr * grad
+                assertTensorClose(param.numpy(), ori_params[param] + delta)
+
+    cases = [
+        {"momentum": 0.9, "lr": 0.01},  # SGD with momentum
+        {"lr": 0.01},  # simple SGD
+        {"weight_decay": 0.1, "lr": 0.01},  # with weight_decay
+    ]
+    for case in cases:
+        _test_optimizer("SGD", case, CheckValue)
+        _test_optimizer("SGD", case, CheckValue, update_lr=True)
 
 
-@graph_mode("eager", "static")
-def test_optimizer_serialization():
-    data, data_shape, label, label_shape = get_input()
-    mlp = MLP()
-    opt = SGD(mlp.parameters(), lr=0.01, momentum=0.9)
-    slots = TensorDict()
-    for param in mlp.parameters():
-        slots[param] = np.zeros(param.shape).astype(np.float32)
-
-    pred = mlp(data)
-    loss = F.square_loss(pred, label.reshape(-1, 1))
-    opt.zero_grad()
-    opt.backward(loss)
-    opt.step()
-    for param in mlp.parameters():
-        slot = slots[param]
-        slot *= 0.9
-        slot -= param.grad.numpy() * 0.01
-
-    with BytesIO() as fout:
-        save(opt.state_dict(), fout)
-        fout.seek(0)
-        state_dict = load(fout)
-        opt1 = SGD(mlp.parameters(), lr=0.02, momentum=0.8)
-        opt1.load_state_dict(state_dict)
-
-        data.set_value(np.random.random(data_shape).astype(np.float32))
-        label.set_value(np.random.randint(0, 10, label_shape))
-        pred = mlp(data)
-        loss = F.square_loss(pred, label.reshape(-1, 1))
-        opt1.zero_grad()
-        opt1.backward(loss)
-        orig_params = TensorDict()
-        for param in mlp.parameters():
-            orig_params[param] = np.copy(param.numpy())
-        opt1.step()
-        for param in mlp.parameters():
-            orig_param = orig_params[param]
-            slot = slots[param]
-            slot *= 0.9
-            slot -= param.grad.numpy() * 0.01
-            assertTensorClose(param.numpy(), orig_param + slot)
+def test_adam():
+    class CheckValue:
+        def __init__(self, net, **kwarg):
+            self.m_slots = TensorDict()
+            self.v_slots = TensorDict()
+            for param in net.parameters():
+                self.m_slots[param] = np.zeros(param.shape).astype(np.float32)
+                self.v_slots[param] = np.zeros(param.shape).astype(np.float32)
+            for k, v in kwarg.items():
+                setattr(self, k, v)
+
+        def __call__(self, ori_params, new_params, step):
+            for param in new_params:
+                grad = param.grad.numpy()
+                m = self.m_slots[param]
+                v = self.v_slots[param]
+                m *= self.betas[0]
+                m += (1 - self.betas[0]) * grad
+                v *= self.betas[1]
+                v += (1 - self.betas[1]) * grad * grad
+                delta = (m / (1 - self.betas[0] ** step)) / (
+                    np.sqrt(v / (1 - self.betas[1] ** step)) + self.eps
+                )
+                assertTensorClose(param.numpy(), ori_params[param] - self.lr * delta)
+
+    cases = [
+        {"betas": (0.8, 0.9), "eps": 1e-04, "lr": 0.01},
+        {
+            "betas": (0.8, 0.9),
+            "eps": 1e-04,
+            "lr": 0.01,
+            "weight_decay": 0.1,
+        },  # with weight_decay
+    ]
+    for case in cases:
+        _test_optimizer("Adam", case, CheckValue)
+        _test_optimizer("Adam", case, CheckValue, update_lr=True)