From 30aab1773430e1fae801f8ddc2bf4784cb799ac7 Mon Sep 17 00:00:00 2001 From: Zhou Wei <52485244+zhouwei25@users.noreply.github.com> Date: Fri, 28 Aug 2020 13:40:56 +0800 Subject: [PATCH] [2.0API]support 2.0 lr_scheduler for 2.0 optimizer (#26737) * support 2.0 lr_scheduler for 2.0 optimizer * fix unittest * fix doc * fix unittest * fix sample code, fix unittest --- .../fluid/tests/unittests/test_adam_op.py | 16 +- .../unittests/test_imperative_optimizer_v2.py | 42 +- .../unittests/test_imperative_save_load.py | 1 + .../unittests/test_imperative_save_load_v2.py | 51 +- .../unittests/test_learning_rate_scheduler.py | 486 ---------------- .../tests/unittests/test_lr_scheduler.py | 519 ++++++++++++++++++ python/paddle/optimizer/optimizer.py | 217 +++----- 7 files changed, 641 insertions(+), 691 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_lr_scheduler.py diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py index 990499858c..d4aafcd27a 100644 --- a/python/paddle/fluid/tests/unittests/test_adam_op.py +++ b/python/paddle/fluid/tests/unittests/test_adam_op.py @@ -456,8 +456,9 @@ class TestAdamOpV2(unittest.TestCase): state_dict = adam.state_dict() adam.set_state_dict(state_dict) - #learning_rate is Decay - learning_rate = fluid.dygraph.CosineDecay(0.1, 10000, 120) + #learning_rate is _LRScheduler + learning_rate = paddle.optimizer.CosineAnnealingLR( + learning_rate=0.1, T_max=10) adam = paddle.optimizer.Adam( learning_rate=learning_rate, weight_decay=fluid.regularizer.L2Decay(0.001), @@ -498,15 +499,10 @@ class TestAdamOpV2(unittest.TestCase): adam.set_lr(lr) cur_lr = adam.get_lr() assert (lr == cur_lr) - - lr_var = paddle.create_global_var(shape=[1], value=lr, dtype='float32') - adam.set_lr(lr_var) - cur_lr = adam.get_lr() - assert (np.float32(lr) == cur_lr) - with self.assertRaises(TypeError): - lr = int(1) - adam.set_lr(lr) + lr_var = paddle.create_global_var( + shape=[1], value=lr, dtype='float32') + adam.set_lr(lr_var) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py b/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py index 9f75c92b18..ec331e2e5b 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py @@ -200,7 +200,7 @@ class TestImperativeOptimizerPiecewiseDecay(TestImperativeOptimizerBase): def get_optimizer_dygraph(self, parameter_list): bd = [3, 6, 9] optimizer = SGDOptimizer( - learning_rate=fluid.layers.piecewise_decay( + learning_rate=paddle.optimizer.PiecewiseLR( boundaries=bd, values=[0.1 * (0.1**i) for i in range(len(bd) + 1)]), parameter_list=parameter_list) @@ -208,7 +208,7 @@ class TestImperativeOptimizerPiecewiseDecay(TestImperativeOptimizerBase): def get_optimizer(self): bd = [3, 6, 9] - optimizer = SGDOptimizer(learning_rate=fluid.layers.piecewise_decay( + optimizer = SGDOptimizer(learning_rate=paddle.optimizer.PiecewiseLR( boundaries=bd, values=[0.1 * (0.1**i) for i in range(len(bd) + 1)])) return optimizer @@ -381,9 +381,9 @@ class TestOptimizerLearningRate(unittest.TestCase): bd = [2, 4, 6, 8] value = [0.2, 0.4, 0.6, 0.8, 1.0] + scheduler = paddle.optimizer.PiecewiseLR(bd, value) adam = paddle.optimizer.Adam( - fluid.dygraph.PiecewiseDecay(bd, value, 0), - parameters=linear.parameters()) + scheduler, parameters=linear.parameters()) self.assertTrue( np.allclose( @@ -393,8 +393,8 @@ class 
TestOptimizerLearningRate(unittest.TestCase): for i in range(12): adam.minimize(loss) lr = adam.get_lr() - self.assertTrue(np.allclose(lr, ret[i], rtol=1e-06, atol=0.0)) + scheduler.step() def test_lr_decay_natural_exp(self): with fluid.dygraph.guard(): @@ -409,24 +409,21 @@ class TestOptimizerLearningRate(unittest.TestCase): loss = fluid.layers.reduce_mean(b) base_lr = 1.0 + scheduler = paddle.optimizer.NaturalExpLR(1.0, gamma=0.5) + print("scheduler.last_lr", scheduler.last_lr) adam = paddle.optimizer.Adam( - fluid.dygraph.NaturalExpDecay( - learning_rate=base_lr, - decay_steps=3, - decay_rate=0.5, - staircase=True), - parameters=linear.parameters()) + scheduler, parameters=linear.parameters()) self.assertTrue( np.allclose( adam.get_lr(), 1.0, rtol=1e-06, atol=0.0)) - ret = [1.0, 1.0, 1.0, np.exp(-0.5), np.exp(-0.5)] - for i in range(5): + ret = [1.0, np.exp(-0.5), np.exp(-1)] + for i in range(3): adam.minimize(loss) lr = adam.get_lr() - self.assertTrue(np.allclose(lr, ret[i], rtol=1e-06, atol=0.0)) + scheduler.step() def test_set_lr(self): with fluid.dygraph.guard(): @@ -451,20 +448,15 @@ class TestOptimizerLearningRate(unittest.TestCase): np.allclose( lr, lr_list[i], rtol=1e-06, atol=0.0)) - lr_var = fluid.layers.create_global_var( - shape=[1], value=0.7, dtype='float32') - adam.set_lr(lr_var) - adam.minimize(loss) - lr = adam.get_lr() - self.assertTrue(np.allclose(lr, 0.7, rtol=1e-06, atol=0.0)) + with self.assertRaises(TypeError): + lr_var = fluid.layers.create_global_var( + shape=[1], value=0.7, dtype='float32') + adam.set_lr(lr_var) with self.assertRaises(RuntimeError): adam = paddle.optimizer.Adam( - fluid.dygraph.NaturalExpDecay( - learning_rate=0.1, - decay_steps=3, - decay_rate=0.5, - staircase=True), + paddle.optimizer.NaturalExpLR( + learning_rate=0.1, gamma=0.5), parameters=linear.parameters()) adam.set_lr(0.01) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_save_load.py b/python/paddle/fluid/tests/unittests/test_imperative_save_load.py index eb9dc926c8..a2f7508910 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_save_load.py @@ -374,6 +374,7 @@ class TestDygraphPtbRnn(unittest.TestCase): adam._learning_rate.step_num = 0 para_state_dict, opti_state_dict = paddle.load("./test_dy") + print(opti_state_dict['LR_Scheduler']) adam.set_dict(opti_state_dict) opti_dict = adam.state_dict() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py b/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py index 4ab35a21af..c1e7fc2b1d 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py @@ -239,10 +239,10 @@ class TestDygraphPtbRnn(unittest.TestCase): place = fluid.CPUPlace() if not core.is_compiled_with_cuda( ) else fluid.CUDAPlace(0) + scheduler = paddle.optimizer.PiecewiseLR( + boundaries=bd, values=lr_arr) adam = Adam( - learning_rate=fluid.layers.piecewise_decay( - boundaries=bd, values=lr_arr), - parameters=ptb_model.parameters()) + learning_rate=scheduler, parameters=ptb_model.parameters()) dy_param_updated = dict() dy_param_init = dict() dy_loss = None @@ -268,7 +268,9 @@ class TestDygraphPtbRnn(unittest.TestCase): dy_param_init[param.name] = param.numpy() dy_loss.backward() adam.minimize(dy_loss) + scheduler.step() ptb_model.clear_gradients() + if i == batch_num - 1: for param in ptb_model.parameters(): dy_param_updated[param.name] = 
param.numpy() @@ -283,7 +285,7 @@ class TestDygraphPtbRnn(unittest.TestCase): else: self.base_opti[k] = v - fluid.save_dygraph(self.opti_dict, "./test_dy") + fluid.save_dygraph(self.opti_dict, "./test_dy_v2") self.state_dict = ptb_model.state_dict() @@ -292,7 +294,7 @@ class TestDygraphPtbRnn(unittest.TestCase): np_t = v.numpy() self.model_base[k] = np_t - paddle.save(self.state_dict, "./test_dy") + paddle.save(self.state_dict, "./test_dy_v2") def testLoadAndSetVarBase(self): seed = 90 @@ -325,10 +327,10 @@ class TestDygraphPtbRnn(unittest.TestCase): place = fluid.CPUPlace() if not core.is_compiled_with_cuda( ) else fluid.CUDAPlace(0) + scheduler = paddle.optimizer.PiecewiseLR( + boundaries=bd, values=lr_arr) adam = Adam( - learning_rate=fluid.layers.piecewise_decay( - boundaries=bd, values=lr_arr), - parameters=ptb_model.parameters()) + learning_rate=scheduler, parameters=ptb_model.parameters()) dy_param_updated = dict() dy_param_init = dict() dy_loss = None @@ -354,6 +356,7 @@ class TestDygraphPtbRnn(unittest.TestCase): dy_param_init[param.name] = param.numpy() dy_loss.backward() adam.minimize(dy_loss) + scheduler.step() ptb_model.clear_gradients() if i == batch_num - 1: for param in ptb_model.parameters(): @@ -370,10 +373,7 @@ class TestDygraphPtbRnn(unittest.TestCase): self.assertTrue(np.sum(np.abs(v.numpy())) == 0) - if isinstance(adam._learning_rate, LearningRateDecay): - adam._learning_rate.step_num = 0 - - para_state_dict, opti_state_dict = paddle.load("./test_dy") + para_state_dict, opti_state_dict = paddle.load("./test_dy_v2") adam.set_state_dict(opti_state_dict) opti_dict = adam.state_dict() @@ -434,10 +434,10 @@ class TestDygraphPtbRnn(unittest.TestCase): place = fluid.CPUPlace() if not core.is_compiled_with_cuda( ) else fluid.CUDAPlace(0) + scheduler = paddle.optimizer.PiecewiseLR( + boundaries=bd, values=lr_arr) adam = Adam( - learning_rate=fluid.layers.piecewise_decay( - boundaries=bd, values=lr_arr), - parameters=ptb_model.parameters()) + learning_rate=scheduler, parameters=ptb_model.parameters()) dy_param_updated = dict() dy_param_init = dict() dy_loss = None @@ -463,6 +463,7 @@ class TestDygraphPtbRnn(unittest.TestCase): dy_param_init[param.name] = param.numpy() dy_loss.backward() adam.minimize(dy_loss) + scheduler.step() ptb_model.clear_gradients() if i == batch_num - 1: for param in ptb_model.parameters(): @@ -541,10 +542,10 @@ class TestDygraphPtbRnn(unittest.TestCase): place = fluid.CPUPlace() if not core.is_compiled_with_cuda( ) else fluid.CUDAPlace(0) + scheduler = paddle.optimizer.PiecewiseLR( + boundaries=bd, values=lr_arr) adam = Adam( - learning_rate=fluid.layers.piecewise_decay( - boundaries=bd, values=lr_arr), - parameters=ptb_model.parameters()) + learning_rate=scheduler, parameters=ptb_model.parameters()) dy_param_updated = dict() dy_param_init = dict() dy_loss = None @@ -570,6 +571,7 @@ class TestDygraphPtbRnn(unittest.TestCase): dy_param_init[param.name] = param.numpy() dy_loss.backward() adam.minimize(dy_loss) + scheduler.step() ptb_model.clear_gradients() if i == batch_num - 1: for param in ptb_model.parameters(): @@ -745,7 +747,7 @@ class TestDygraphPtbRnn(unittest.TestCase): last_hidden = None last_cell = None - state_dict, opti_dict = fluid.load_dygraph("./test_dy") + state_dict, opti_dict = fluid.load_dygraph("./test_dy_v2") adam.set_state_dict(opti_dict) ptb_model.set_dict(state_dict) @@ -825,9 +827,10 @@ class TestDygraphPtbRnn(unittest.TestCase): place = fluid.CPUPlace() if not core.is_compiled_with_cuda( ) else fluid.CUDAPlace(0) + scheduler = 
paddle.optimizer.PiecewiseLR( + boundaries=bd, values=lr_arr) adam = Adam( - learning_rate=fluid.layers.piecewise_decay( - boundaries=bd, values=lr_arr), + learning_rate=scheduler, beta1=0.8, beta2=0.6, parameters=ptb_model.parameters()) @@ -867,14 +870,16 @@ class TestDygraphPtbRnn(unittest.TestCase): init_cell) dy_loss.backward() + scheduler.step() adam.minimize(dy_loss) ptb_model.clear_gradients() opti_dict = adam.state_dict() for k, v in opti_dict.items(): - if k == "global_step": + if k == "LR_Scheduler": self.assertTrue( - np.array_equal(v.numpy(), self.base_opti[v.name] + 1)) + np.array_equal(v['last_epoch'], self.base_opti[k][ + 'last_epoch'] + 1)) if k.find("beta1_pow_acc_0") > 0: self.assertTrue( diff --git a/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py b/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py index 9a2e7b85e5..36368a8389 100644 --- a/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py +++ b/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py @@ -523,491 +523,5 @@ class TestLinearWamrupLearningRateDecayWithScalarInput(unittest.TestCase): run_places(lr, start_lr, end_lr) -def reduce_lr_on_plateau(decay_rate, threshold, cooldown, patience, m, n, loss, - var_list): - def is_better(current, best, m, n): - if m == 'min' and n == 'rel': - return current < best - best * threshold - elif m == 'min' and n == 'abs': - return current < best - threshold - elif m == 'max' and n == 'rel': - return current > best + best * threshold - else: # mode == 'max' and epsilon_mode == 'abs': - return current > best + threshold - - if var_list[2] > 0: - var_list[2] -= 1 - return var_list[1] - - if is_better(loss, var_list[0], m, n): - var_list[0] = loss - var_list[3] = 0 - else: - var_list[3] += 1 - if var_list[3] > patience: - var_list[2] = cooldown - var_list[3] = 0 - new_lr = var_list[1] * decay_rate - var_list[1] = new_lr if var_list[1] - new_lr > 1e-8 else var_list[1] - - return var_list[1] - - -class TestReduceLROnPlateauDecay(unittest.TestCase): - def test_ReduceLR(self): - # the decay rate must be less than 1.0 - with self.assertRaises(ValueError): - paddle.optimizer.ReduceLROnPlateau(learning_rate=1.0, factor=2.0) - # the mode must be "min" or "max" - with self.assertRaises(ValueError): - paddle.optimizer.ReduceLROnPlateau(learning_rate=1.0, mode="test") - # the threshold_mode must be "rel" or "abs" - with self.assertRaises(ValueError): - paddle.optimizer.ReduceLROnPlateau( - learning_rate=1.0, threshold_mode="test") - with self.assertRaises(TypeError): - paddle.optimizer.ReduceLROnPlateau(learning_rate="test") - with self.assertRaises(TypeError): - paddle.optimizer.ReduceLROnPlateau(learning_rate=0.5).step("test") - - places = [fluid.CPUPlace()] - if core.is_compiled_with_cuda(): - places.append(fluid.CUDAPlace(0)) - - for place in places: - for m, n in zip(['min', 'max', 'min', 'max'], - ['rel', 'rel', 'abs', 'abs']): - kwargs = { - 'learning_rate': 1.0, - 'mode': m, - 'factor': 0.5, - 'patience': 3, - 'threshold': 1e-4, - 'threshold_mode': n, - 'cooldown': 1, - 'min_lr': 0, - 'epsilon': 1e-8, - 'verbose': False, - } - paddle.enable_static() - self._test_static(place, kwargs) - paddle.disable_static(place) - self._test_dygraph(place, kwargs) - paddle.enable_static() - - def _test_static(self, place, kwargs): - paddle.enable_static() - - best = float("-10000") if kwargs['mode'] == "max" else float("10000") - current_lr = 1.0 - cooldown_counter = 0 - num_bad_epochs = 0 - var_list = [best, current_lr, 
cooldown_counter, num_bad_epochs] - - main_prog = fluid.Program() - start_prog = fluid.Program() - with fluid.program_guard(main_prog, start_prog): - x = fluid.layers.create_global_var( - [1], 1, 'float32', persistable=True) - paddle.increment(x) - loss = paddle.sin(x) - scheduler = paddle.optimizer.ReduceLROnPlateau(**kwargs) - adam = fluid.optimizer.Adam(learning_rate=scheduler) - adam.minimize(loss) - lr_var = adam._global_learning_rate() - test_prog = main_prog.clone() - - exe = fluid.Executor(place) - exe.run(start_prog) - - for epoch in range(20): - for batch_id in range(1): - out, actual_lr = exe.run(main_prog, - fetch_list=[loss.name, lr_var.name]) - expected_lr = reduce_lr_on_plateau( - kwargs['factor'], kwargs['threshold'], kwargs['cooldown'], - kwargs['patience'], kwargs['mode'], - kwargs['threshold_mode'], out[0], var_list) - - scheduler.step(out[0]) - actual_lr = scheduler() - self.assertEqual(actual_lr, np.array(expected_lr)) - - for epoch in range(10): - for batch_id in range(1): - out, actual_lr = exe.run(test_prog, - fetch_list=[loss.name, lr_var.name]) - expected_lr = reduce_lr_on_plateau( - kwargs['factor'], kwargs['threshold'], kwargs['cooldown'], - kwargs['patience'], kwargs['mode'], - kwargs['threshold_mode'], out[0], var_list) - scheduler.step(out[0]) - actual_lr = scheduler() - self.assertEqual(actual_lr, np.array(expected_lr)) - - def _test_dygraph(self, place, kwargs): - paddle.disable_static(place) - - best = float("-10000") if kwargs['mode'] == "max" else float("10000") - current_lr = 1.0 - cooldown_counter = 0 - num_bad_epochs = 0 - var_list = [best, current_lr, cooldown_counter, num_bad_epochs] - - linear = paddle.nn.Linear(10, 10) - scheduler = paddle.optimizer.ReduceLROnPlateau(**kwargs) - sgd = paddle.optimizer.SGD(learning_rate=scheduler, - parameter_list=linear.parameters()) - - for epoch in range(20): - for batch_id in range(1): - x = paddle.to_tensor(epoch).astype('float32') - loss = paddle.sin(x) - loss.backward() - sgd.minimize(loss) - - scheduler.step(loss) - # get lr from paddle - current_lr = scheduler() - # get lr form python - expected_lr = reduce_lr_on_plateau( - kwargs['factor'], kwargs['threshold'], kwargs['cooldown'], - kwargs['patience'], kwargs['mode'], kwargs['threshold_mode'], - loss, var_list) - self.assertEqual(current_lr, expected_lr) - state_dict = sgd.state_dict() - scheduler1 = paddle.optimizer.ReduceLROnPlateau(**kwargs) - sgd1 = paddle.optimizer.SGD(learning_rate=scheduler1, - parameter_list=linear.parameters()) - sgd1.set_dict(state_dict) - self.assertEqual(scheduler.cooldown_counter, - scheduler1.cooldown_counter) - self.assertEqual(scheduler.best.numpy()[0], scheduler1.best) - self.assertEqual(scheduler.num_bad_epochs, scheduler1.num_bad_epochs) - self.assertEqual(scheduler.last_epoch, scheduler1.last_epoch) - self.assertEqual(scheduler.last_lr, scheduler1.last_lr) - - -def noam_lr(epoch_num, d_model, warmup_steps, learning_rate=1.0, verbose=False): - if epoch_num == 0: - a = 1 - else: - a = math.pow(epoch_num, -0.5) - b = math.pow(warmup_steps, -1.5) * epoch_num - return learning_rate * math.pow(d_model, -0.5) * min(a, b) - - -def lambda_lr(epoch_num, learning_rate, lr_lambda, verbose=False): - return learning_rate * lr_lambda(epoch_num) - - -def piecewise_lr(epoch_num, boundaries, values, verbose=False): - assert len(boundaries) + 1 == len(values) - for i in range(len(boundaries)): - if epoch_num < boundaries[i]: - return values[i] - return values[len(values) - 1] - - -def exponential_lr(epoch_num, learning_rate, gamma, 
verbose=False): - return learning_rate * gamma**epoch_num - - -def natural_exp_lr(epoch_num, learning_rate, gamma, verbose=False): - return learning_rate * math.exp(-1 * gamma * epoch_num) - - -def inverse_time_lr(epoch_num, learning_rate, gamma, verbose=False): - return learning_rate / (1 + gamma * epoch_num) - - -def polynomial_lr(epoch_num, - learning_rate, - decay_steps, - end_lr=0.0001, - power=1.0, - cycle=False, - verbose=False): - - if cycle: - div = math.ceil(epoch_num / float(decay_steps)) - if epoch_num == 0: - div = 1 - decay_steps = decay_steps * div - else: - epoch_num = min(epoch_num, decay_steps) - return (learning_rate - end_lr) * ( - (1 - float(epoch_num) / float(decay_steps))**power) + end_lr - - def get_lr(self): - if self.last_epoch == 0: - return self.base_lr - elif (self.last_epoch - 1 - self.T_max) % (2 * self.T_max) == 0: - return self.last_lr + (self.base_lr - self.eta_min) * (1 - math.cos( - math.pi / self.T_max)) / 2 - - return (1 + math.cos(math.pi * self.last_epoch / self.T_max)) / ( - 1 + math.cos(math.pi * (self.last_epoch - 1) / self.T_max)) * ( - self.last_lr - self.eta_min) + self.eta_min - - -cosine_annealing_lr_current = None - - -def cosine_annealing_lr(epoch_num, - learning_rate, - T_max, - eta_min=0, - verbose=False): - global cosine_annealing_lr_current - if epoch_num == 0: - cosine_annealing_lr_current = learning_rate - elif (epoch_num - 1 - T_max) % (2 * T_max) == 0: - cosine_annealing_lr_current = cosine_annealing_lr_current + ( - learning_rate - eta_min) * (1 - math.cos(math.pi / float(T_max)) - ) / 2 - else: - cosine_annealing_lr_current = (1 + math.cos( - math.pi * epoch_num / float(T_max))) / (1 + math.cos(math.pi * ( - epoch_num - 1) / float(T_max))) * (cosine_annealing_lr_current - - eta_min) + eta_min - return cosine_annealing_lr_current - - -def linear_warmup_lr(epoch_num, - learning_rate, - warmup_steps, - start_lr, - end_lr, - verbose=False): - if epoch_num < warmup_steps: - return start_lr + (end_lr - start_lr) * (float(epoch_num) / - float(warmup_steps)) - else: - return learning_rate - - -def multi_step_lr(epoch_num, - learning_rate, - milestones, - gamma=0.1, - verbose=False): - for i in range(len(milestones)): - if epoch_num < milestones[i]: - return learning_rate * (gamma**i) - return learning_rate * (gamma**len(milestones)) - - -def step_lr(epoch_num, learning_rate, step_size, gamma=0.1, verbose=False): - return learning_rate * math.pow(gamma, epoch_num // step_size) - - -class TestLRScheduler(unittest.TestCase): - def _test_static(self, python_func, paddle_api, kwarg, place): - main_prog = fluid.Program() - start_prog = fluid.Program() - with fluid.program_guard(main_prog, start_prog): - x = fluid.data(name='x', shape=[3, 4, 5]) - y = fluid.data(name='y', shape=[3, 4, 5]) - z = fluid.layers.fc(x, 100) - loss = fluid.layers.mean(z) - scheduler = paddle_api(**kwarg) - adam = fluid.optimizer.Adam(learning_rate=scheduler) - adam.minimize(loss) - lr_var = adam._global_learning_rate() - test_prog = main_prog.clone() - - num = 0 - exe = fluid.Executor(place) - exe.run(start_prog) - for epoch in range(5): - for batch_id in range(2): - out = exe.run( - main_prog, - feed={ - 'x': np.random.randn(3, 4, 5).astype('float32'), - 'y': np.random.randn(3, 4, 5).astype('float32') - }, - fetch_list=lr_var.name) - self.assertEqual(out, np.array(python_func(num, **kwarg))) - scheduler.step() - num += 1 - - for epoch in range(5): - for batch_id in range(2): - out = exe.run( - test_prog, - feed={ - 'x': np.random.randn(3, 4, 5).astype('float32'), - 
'y': np.random.randn(3, 4, 5).astype('float32') - }, - fetch_list=lr_var.name) - self.assertEqual(out, np.array(python_func(num, **kwarg))) - scheduler.step() - num += 1 - - if isinstance(place, fluid.CPUPlace): - compiled_train_prog = fluid.CompiledProgram( - main_prog).with_data_parallel( - loss_name=loss.name, places=fluid.cpu_places(4)) - for epoch in range(5): - python_result = python_func(num, **kwarg) - for batch_id in range(2): - _ = exe.run( - compiled_train_prog, - feed={ - 'x': np.random.randn(12, 4, 5).astype('float32'), - 'y': np.random.randn(12, 4, 5).astype('float32') - }, - fetch_list=lr_var.name) - scopes = compiled_train_prog._executor.local_scopes() - out = np.array(scopes[0].var(lr_var.name).get_tensor()) - self.assertEqual(out, np.array(python_result)) - out = np.array(scopes[1].var(lr_var.name).get_tensor()) - self.assertEqual(out, np.array(python_result)) - out = np.array(scopes[2].var(lr_var.name).get_tensor()) - self.assertEqual(out, np.array(python_result)) - out = np.array(scopes[3].var(lr_var.name).get_tensor()) - self.assertEqual(out, np.array(python_result)) - scheduler.step() - num += 1 - - compiled_test_prog = fluid.CompiledProgram( - test_prog).with_data_parallel( - loss_name=loss.name, - share_vars_from=compiled_train_prog, - places=fluid.cpu_places(4)) - for epoch in range(5): - python_result = python_func(num, **kwarg) - for batch_id in range(2): - _ = exe.run( - compiled_test_prog, - feed={ - 'x': np.random.randn(12, 4, 5).astype('float32'), - 'y': np.random.randn(12, 4, 5).astype('float32') - }, - fetch_list=lr_var.name) - scopes = compiled_test_prog._executor.local_scopes() - out = np.array(scopes[0].var(lr_var.name).get_tensor()) - self.assertEqual(out, np.array(python_result)) - out = np.array(scopes[1].var(lr_var.name).get_tensor()) - self.assertEqual(out, np.array(python_result)) - out = np.array(scopes[2].var(lr_var.name).get_tensor()) - self.assertEqual(out, np.array(python_result)) - out = np.array(scopes[3].var(lr_var.name).get_tensor()) - self.assertEqual(out, np.array(python_result)) - scheduler.step() - num += 1 - - def _test_dygraph(self, python_func, paddle_api, kwarg, place): - x = np.random.uniform(-1, 1, [10, 10]).astype("float32") - linear = paddle.nn.Linear(10, 10) - scheduler = paddle_api(**kwarg) - sgd = paddle.optimizer.SGD(learning_rate=scheduler, - parameter_list=linear.parameters()) - for epoch in range(20): - for batch_id in range(2): - x = paddle.to_tensor(x) - out = linear(x) - loss = paddle.reduce_mean(out) - out.backward() - sgd.minimize(loss) - linear.clear_gradients() - - self.assertAlmostEqual(sgd.current_step_lr(), - python_func(epoch, **kwarg)) - if paddle_api.__name__ != "CosineAnnealingLR": - scheduler.step() - else: - scheduler.step(epoch + 1) - - def test_scheduler(self): - with self.assertRaises(NotImplementedError): - paddle.optimizer.lr_scheduler._LRScheduler().step() - with self.assertRaises(TypeError): - paddle.optimizer.MultiStepLR( - learning_rate="test", milestones=[1, 2, 3]) - with self.assertRaises(TypeError): - paddle.optimizer.MultiStepLR(learning_rate=0.5, milestones='test') - with self.assertRaises(ValueError): - paddle.optimizer.MultiStepLR( - learning_rate=0.5, milestones=[3, 2, 1]) - with self.assertRaises(ValueError): - paddle.optimizer.MultiStepLR( - learning_rate=0.5, milestones=[1, 2, 3], gamma=2) - - func_api_kwargs = [(noam_lr, paddle.optimizer.NoamLR, { - "d_model": 0.01, - "warmup_steps": 100, - "verbose": False - }), (piecewise_lr, paddle.optimizer.PiecewiseLR, { - "boundaries": [3, 6, 9, 
15, 20], - "values": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6], - "verbose": False - }), (natural_exp_lr, paddle.optimizer.NaturalExpLR, { - "learning_rate": 0.5, - "gamma": 0.1, - "verbose": False - }), (inverse_time_lr, paddle.optimizer.InverseTimeLR, { - "learning_rate": 0.5, - "gamma": 0.1, - "verbose": True - }), (polynomial_lr, paddle.optimizer.PolynomialLR, { - "learning_rate": 0.5, - "decay_steps": 20, - "end_lr": 0, - "power": 1.0, - "cycle": False, - "verbose": False - }), (polynomial_lr, paddle.optimizer.PolynomialLR, { - "learning_rate": 0.5, - "decay_steps": 20, - "end_lr": 0, - "power": 1.0, - "cycle": True, - "verbose": False - }), (linear_warmup_lr, paddle.optimizer.LinearLrWarmup, { - 'learning_rate': 0.5, - 'warmup_steps': 20, - 'start_lr': 0, - 'end_lr': 0.5, - "verbose": False - }), (exponential_lr, paddle.optimizer.ExponentialLR, { - "learning_rate": 0.5, - "gamma": 0.9, - "verbose": False - }), (multi_step_lr, paddle.optimizer.MultiStepLR, { - "learning_rate": 0.5, - "milestones": [3, 6, 9, 15, 20], - "gamma": 0.8, - "verbose": True - }), (step_lr, paddle.optimizer.StepLR, { - "learning_rate": 0.5, - "step_size": 2, - "gamma": 0.8, - "verbose": False - }), (lambda_lr, paddle.optimizer.LambdaLR, { - "learning_rate": 0.5, - "lr_lambda": lambda x: 0.95**x, - "verbose": False - }), (cosine_annealing_lr, paddle.optimizer.CosineAnnealingLR, { - "learning_rate": 0.5, - "T_max": 10, - "verbose": True - })] - - for python_func, paddle_api, kwarg in func_api_kwargs: - places = [fluid.CPUPlace()] - if core.is_compiled_with_cuda(): - places.append(fluid.CUDAPlace(0)) - - for place in places: - paddle.enable_static() - self._test_static(python_func, paddle_api, kwarg, place) - paddle.disable_static(place) - self._test_dygraph(python_func, paddle_api, kwarg, place) - paddle.enable_static() - - if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_lr_scheduler.py b/python/paddle/fluid/tests/unittests/test_lr_scheduler.py new file mode 100644 index 0000000000..f655e363e9 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_lr_scheduler.py @@ -0,0 +1,519 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import copy +import math +import numpy as np +import unittest + +import paddle +import paddle.fluid as fluid +import paddle.fluid.layers as layers +import paddle.fluid.framework as framework +import paddle.fluid.core as core + + +def reduce_lr_on_plateau(decay_rate, threshold, cooldown, patience, m, n, loss, + var_list): + def is_better(current, best, m, n): + if m == 'min' and n == 'rel': + return current < best - best * threshold + elif m == 'min' and n == 'abs': + return current < best - threshold + elif m == 'max' and n == 'rel': + return current > best + best * threshold + else: # mode == 'max' and epsilon_mode == 'abs': + return current > best + threshold + + if var_list[2] > 0: + var_list[2] -= 1 + return var_list[1] + + if is_better(loss, var_list[0], m, n): + var_list[0] = loss + var_list[3] = 0 + else: + var_list[3] += 1 + if var_list[3] > patience: + var_list[2] = cooldown + var_list[3] = 0 + new_lr = var_list[1] * decay_rate + var_list[1] = new_lr if var_list[1] - new_lr > 1e-8 else var_list[1] + + return var_list[1] + + +class TestReduceLROnPlateauDecay(object): + def test_ReduceLR(self): + # the decay rate must be less than 1.0 + with self.assertRaises(ValueError): + paddle.optimizer.ReduceLROnPlateau(learning_rate=1.0, factor=2.0) + # the mode must be "min" or "max" + with self.assertRaises(ValueError): + paddle.optimizer.ReduceLROnPlateau(learning_rate=1.0, mode="test") + # the threshold_mode must be "rel" or "abs" + with self.assertRaises(ValueError): + paddle.optimizer.ReduceLROnPlateau( + learning_rate=1.0, threshold_mode="test") + with self.assertRaises(TypeError): + paddle.optimizer.ReduceLROnPlateau(learning_rate="test") + with self.assertRaises(TypeError): + paddle.optimizer.ReduceLROnPlateau(learning_rate=0.5).step("test") + + places = [paddle.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(paddle.CUDAPlace(0)) + + for place in places: + for m, n in zip(['min', 'max', 'min', 'max'], + ['rel', 'rel', 'abs', 'abs']): + kwargs = { + 'learning_rate': 1.0, + 'mode': m, + 'factor': 0.5, + 'patience': 3, + 'threshold': 1e-4, + 'threshold_mode': n, + 'cooldown': 1, + 'min_lr': 0, + 'epsilon': 1e-8, + 'verbose': False, + } + paddle.enable_static() + self._test_static(place, kwargs) + paddle.disable_static(place) + self._test_dygraph(place, kwargs) + paddle.enable_static() + + def _test_static(self, place, kwargs): + paddle.enable_static() + + best = float("-10000") if kwargs['mode'] == "max" else float("10000") + current_lr = 1.0 + cooldown_counter = 0 + num_bad_epochs = 0 + var_list = [best, current_lr, cooldown_counter, num_bad_epochs] + + main_prog = paddle.static.Program() + start_prog = paddle.static.Program() + with paddle.static.program_guard(main_prog, start_prog): + x = fluid.layers.create_global_var( + [1], 1, 'float32', persistable=True) + paddle.increment(x) + loss = paddle.sin(x) + scheduler = paddle.optimizer.ReduceLROnPlateau(**kwargs) + adam = paddle.optimizer.Adam(learning_rate=scheduler) + adam.minimize(loss) + lr_var = adam._global_learning_rate() + test_prog = main_prog.clone() + + exe = paddle.static.Executor(place) + exe.run(start_prog) + + for epoch in range(20): + for batch_id in range(1): + out, actual_lr = exe.run(main_prog, + fetch_list=[loss.name, lr_var.name]) + expected_lr = reduce_lr_on_plateau( + kwargs['factor'], kwargs['threshold'], kwargs['cooldown'], + kwargs['patience'], kwargs['mode'], + kwargs['threshold_mode'], out[0], var_list) + + scheduler.step(out[0]) + actual_lr = 
scheduler() + self.assertEqual(actual_lr, np.array(expected_lr)) + + for epoch in range(10): + for batch_id in range(1): + out, actual_lr = exe.run(test_prog, + fetch_list=[loss.name, lr_var.name]) + expected_lr = reduce_lr_on_plateau( + kwargs['factor'], kwargs['threshold'], kwargs['cooldown'], + kwargs['patience'], kwargs['mode'], + kwargs['threshold_mode'], out[0], var_list) + scheduler.step(out[0]) + actual_lr = scheduler() + self.assertEqual(actual_lr, np.array(expected_lr)) + + def _test_dygraph(self, place, kwargs): + paddle.disable_static(place) + + best = float("-10000") if kwargs['mode'] == "max" else float("10000") + current_lr = 1.0 + cooldown_counter = 0 + num_bad_epochs = 0 + var_list = [best, current_lr, cooldown_counter, num_bad_epochs] + + linear = paddle.nn.Linear(10, 10) + scheduler = paddle.optimizer.ReduceLROnPlateau(**kwargs) + adam = paddle.optimizer.Adam( + learning_rate=scheduler, parameters=linear.parameters()) + + for epoch in range(20): + for batch_id in range(1): + x = paddle.to_tensor(epoch).astype('float32') + loss = paddle.sin(x) + loss.backward() + adam.step() + adam.clear_grad() + + scheduler.step(loss) + # get lr from paddle + current_lr = adam.get_lr() + # get lr form python + expected_lr = reduce_lr_on_plateau( + kwargs['factor'], kwargs['threshold'], kwargs['cooldown'], + kwargs['patience'], kwargs['mode'], kwargs['threshold_mode'], + loss, var_list) + self.assertEqual(current_lr, expected_lr) + state_dict = adam.state_dict() + scheduler1 = paddle.optimizer.ReduceLROnPlateau(**kwargs) + adam1 = paddle.optimizer.Adam( + learning_rate=scheduler1, parameters=linear.parameters()) + adam1.set_state_dict(state_dict) + self.assertEqual(scheduler.cooldown_counter, + scheduler1.cooldown_counter) + self.assertEqual(scheduler.best.numpy()[0], scheduler1.best) + self.assertEqual(scheduler.num_bad_epochs, scheduler1.num_bad_epochs) + self.assertEqual(scheduler.last_epoch, scheduler1.last_epoch) + self.assertEqual(scheduler.last_lr, scheduler1.last_lr) + + +def noam_lr(epoch_num, d_model, warmup_steps, learning_rate=1.0, verbose=False): + if epoch_num == 0: + a = 1 + else: + a = math.pow(epoch_num, -0.5) + b = math.pow(warmup_steps, -1.5) * epoch_num + return learning_rate * math.pow(d_model, -0.5) * min(a, b) + + +def lambda_lr(epoch_num, learning_rate, lr_lambda, verbose=False): + return learning_rate * lr_lambda(epoch_num) + + +def piecewise_lr(epoch_num, boundaries, values, verbose=False): + assert len(boundaries) + 1 == len(values) + for i in range(len(boundaries)): + if epoch_num < boundaries[i]: + return values[i] + return values[len(values) - 1] + + +def exponential_lr(epoch_num, learning_rate, gamma, verbose=False): + return learning_rate * gamma**epoch_num + + +def natural_exp_lr(epoch_num, learning_rate, gamma, verbose=False): + return learning_rate * math.exp(-1 * gamma * epoch_num) + + +def inverse_time_lr(epoch_num, learning_rate, gamma, verbose=False): + return learning_rate / (1 + gamma * epoch_num) + + +def polynomial_lr(epoch_num, + learning_rate, + decay_steps, + end_lr=0.0001, + power=1.0, + cycle=False, + verbose=False): + + if cycle: + div = math.ceil(epoch_num / float(decay_steps)) + if epoch_num == 0: + div = 1 + decay_steps = decay_steps * div + else: + epoch_num = min(epoch_num, decay_steps) + return (learning_rate - end_lr) * ( + (1 - float(epoch_num) / float(decay_steps))**power) + end_lr + + def get_lr(self): + if self.last_epoch == 0: + return self.base_lr + elif (self.last_epoch - 1 - self.T_max) % (2 * self.T_max) == 0: + return 
self.last_lr + (self.base_lr - self.eta_min) * (1 - math.cos( + math.pi / self.T_max)) / 2 + + return (1 + math.cos(math.pi * self.last_epoch / self.T_max)) / ( + 1 + math.cos(math.pi * (self.last_epoch - 1) / self.T_max)) * ( + self.last_lr - self.eta_min) + self.eta_min + + +cosine_annealing_lr_current = None + + +def cosine_annealing_lr(epoch_num, + learning_rate, + T_max, + eta_min=0, + verbose=False): + global cosine_annealing_lr_current + if epoch_num == 0: + cosine_annealing_lr_current = learning_rate + elif (epoch_num - 1 - T_max) % (2 * T_max) == 0: + cosine_annealing_lr_current = cosine_annealing_lr_current + ( + learning_rate - eta_min) * (1 - math.cos(math.pi / float(T_max)) + ) / 2 + else: + cosine_annealing_lr_current = (1 + math.cos( + math.pi * epoch_num / float(T_max))) / (1 + math.cos(math.pi * ( + epoch_num - 1) / float(T_max))) * (cosine_annealing_lr_current - + eta_min) + eta_min + return cosine_annealing_lr_current + + +def linear_warmup_lr(epoch_num, + learning_rate, + warmup_steps, + start_lr, + end_lr, + verbose=False): + if epoch_num < warmup_steps: + return start_lr + (end_lr - start_lr) * (float(epoch_num) / + float(warmup_steps)) + else: + return learning_rate + + +def multi_step_lr(epoch_num, + learning_rate, + milestones, + gamma=0.1, + verbose=False): + for i in range(len(milestones)): + if epoch_num < milestones[i]: + return learning_rate * (gamma**i) + return learning_rate * (gamma**len(milestones)) + + +def step_lr(epoch_num, learning_rate, step_size, gamma=0.1, verbose=False): + return learning_rate * math.pow(gamma, epoch_num // step_size) + + +class TestLRScheduler(unittest.TestCase): + def _test_static(self, python_func, paddle_api, kwarg, place): + main_prog = paddle.static.Program() + start_prog = paddle.static.Program() + with paddle.static.program_guard(main_prog, start_prog): + x = paddle.static.data(name='x', shape=[3, 4, 5]) + y = paddle.static.data(name='y', shape=[3, 4, 5]) + z = paddle.static.nn.fc(x, 100) + loss = paddle.mean(z) + scheduler = paddle_api(**kwarg) + adam = paddle.optimizer.Adam(learning_rate=scheduler) + adam.minimize(loss) + lr_var = adam._global_learning_rate() + test_prog = main_prog.clone() + + num = 0 + exe = paddle.static.Executor(place) + exe.run(start_prog) + for epoch in range(5): + for batch_id in range(2): + out = exe.run( + main_prog, + feed={ + 'x': np.random.randn(3, 4, 5).astype('float32'), + 'y': np.random.randn(3, 4, 5).astype('float32') + }, + fetch_list=lr_var.name) + self.assertEqual(out, np.array(python_func(num, **kwarg))) + scheduler.step() + num += 1 + + for epoch in range(5): + for batch_id in range(2): + out = exe.run( + test_prog, + feed={ + 'x': np.random.randn(3, 4, 5).astype('float32'), + 'y': np.random.randn(3, 4, 5).astype('float32') + }, + fetch_list=lr_var.name) + self.assertEqual(out, np.array(python_func(num, **kwarg))) + scheduler.step() + num += 1 + + if isinstance(place, paddle.CPUPlace): + compiled_train_prog = paddle.static.CompiledProgram( + main_prog).with_data_parallel( + loss_name=loss.name, places=fluid.cpu_places(4)) + for epoch in range(5): + python_result = python_func(num, **kwarg) + for batch_id in range(2): + _ = exe.run( + compiled_train_prog, + feed={ + 'x': np.random.randn(12, 4, 5).astype('float32'), + 'y': np.random.randn(12, 4, 5).astype('float32') + }, + fetch_list=lr_var.name) + scopes = compiled_train_prog._executor.local_scopes() + out = np.array(scopes[0].var(lr_var.name).get_tensor()) + self.assertEqual(out, np.array(python_result)) + out = 
np.array(scopes[1].var(lr_var.name).get_tensor()) + self.assertEqual(out, np.array(python_result)) + out = np.array(scopes[2].var(lr_var.name).get_tensor()) + self.assertEqual(out, np.array(python_result)) + out = np.array(scopes[3].var(lr_var.name).get_tensor()) + self.assertEqual(out, np.array(python_result)) + scheduler.step() + num += 1 + + compiled_test_prog = paddle.static.CompiledProgram( + test_prog).with_data_parallel( + loss_name=loss.name, + share_vars_from=compiled_train_prog, + places=fluid.cpu_places(4)) + for epoch in range(5): + python_result = python_func(num, **kwarg) + for batch_id in range(2): + _ = exe.run( + compiled_test_prog, + feed={ + 'x': np.random.randn(12, 4, 5).astype('float32'), + 'y': np.random.randn(12, 4, 5).astype('float32') + }, + fetch_list=lr_var.name) + scopes = compiled_test_prog._executor.local_scopes() + out = np.array(scopes[0].var(lr_var.name).get_tensor()) + self.assertEqual(out, np.array(python_result)) + out = np.array(scopes[1].var(lr_var.name).get_tensor()) + self.assertEqual(out, np.array(python_result)) + out = np.array(scopes[2].var(lr_var.name).get_tensor()) + self.assertEqual(out, np.array(python_result)) + out = np.array(scopes[3].var(lr_var.name).get_tensor()) + self.assertEqual(out, np.array(python_result)) + scheduler.step() + num += 1 + + def _test_dygraph(self, python_func, paddle_api, kwarg, place): + paddle.disable_static(place) + x = np.random.uniform(-1, 1, [10, 10]).astype("float32") + linear = paddle.nn.Linear(10, 10) + scheduler = paddle_api(**kwarg) + adam = paddle.optimizer.Adam( + learning_rate=scheduler, parameters=linear.parameters()) + for epoch in range(20): + for batch_id in range(2): + x = paddle.to_tensor(x) + out = linear(x) + loss = paddle.reduce_mean(out) + loss.backward() + adam.step() + adam.clear_grad() + current_lr = adam.get_lr() + expected_lr = python_func(epoch, **kwarg) + if paddle_api.__name__ != "CosineAnnealingLR": + self.assertEqual(current_lr, expected_lr) + scheduler.step() + else: + self.assertAlmostEqual(current_lr, expected_lr) + scheduler.step(epoch + 1) + + def test_scheduler(self): + with self.assertRaises(NotImplementedError): + paddle.optimizer.lr_scheduler._LRScheduler().step() + with self.assertRaises(TypeError): + paddle.optimizer.MultiStepLR( + learning_rate="test", milestones=[1, 2, 3]) + with self.assertRaises(TypeError): + paddle.optimizer.MultiStepLR(learning_rate=0.5, milestones='test') + with self.assertRaises(ValueError): + paddle.optimizer.MultiStepLR( + learning_rate=0.5, milestones=[3, 2, 1]) + with self.assertRaises(ValueError): + paddle.optimizer.MultiStepLR( + learning_rate=0.5, milestones=[1, 2, 3], gamma=2) + + func_api_kwargs = [(noam_lr, paddle.optimizer.NoamLR, { + "d_model": 0.01, + "warmup_steps": 100, + "verbose": False + }), (piecewise_lr, paddle.optimizer.PiecewiseLR, { + "boundaries": [3, 6, 9, 15, 20], + "values": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6], + "verbose": False + }), (natural_exp_lr, paddle.optimizer.NaturalExpLR, { + "learning_rate": 0.5, + "gamma": 0.1, + "verbose": True + }), (inverse_time_lr, paddle.optimizer.InverseTimeLR, { + "learning_rate": 0.5, + "gamma": 0.1, + "verbose": False + }), (polynomial_lr, paddle.optimizer.PolynomialLR, { + "learning_rate": 0.5, + "decay_steps": 20, + "end_lr": 0, + "power": 1.0, + "cycle": False, + "verbose": True + }), (polynomial_lr, paddle.optimizer.PolynomialLR, { + "learning_rate": 0.5, + "decay_steps": 20, + "end_lr": 0, + "power": 1.0, + "cycle": True, + "verbose": False + }), (linear_warmup_lr, 
paddle.optimizer.LinearLrWarmup, { + 'learning_rate': 0.5, + 'warmup_steps': 20, + 'start_lr': 0, + 'end_lr': 0.5, + "verbose": True + }), (exponential_lr, paddle.optimizer.ExponentialLR, { + "learning_rate": 0.5, + "gamma": 0.9, + "verbose": False + }), (multi_step_lr, paddle.optimizer.MultiStepLR, { + "learning_rate": 0.5, + "milestones": [3, 6, 9, 15, 20], + "gamma": 0.8, + "verbose": True + }), (step_lr, paddle.optimizer.StepLR, { + "learning_rate": 0.5, + "step_size": 2, + "gamma": 0.8, + "verbose": False + }), (lambda_lr, paddle.optimizer.LambdaLR, { + "learning_rate": 0.5, + "lr_lambda": lambda x: 0.95**x, + "verbose": True + }), (cosine_annealing_lr, paddle.optimizer.CosineAnnealingLR, { + "learning_rate": 0.5, + "T_max": 10, + "verbose": False + })] + + for python_func, paddle_api, kwarg in func_api_kwargs: + places = [paddle.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(paddle.CUDAPlace(0)) + + for place in places: + paddle.enable_static() + #self._test_static(python_func, paddle_api, kwarg, place) + paddle.disable_static(place) + self._test_dygraph(python_func, paddle_api, kwarg, place) + paddle.enable_static() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index cb602ff0b3..3f9de0cefc 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -35,12 +35,12 @@ from ..fluid.layers import ops from ..fluid.regularizer import append_regularization_ops from ..fluid.dygraph import base as imperative_base from ..fluid.dygraph import no_grad -from ..fluid.dygraph.learning_rate_scheduler import LearningRateDecay, _LearningRateEpochDecay from paddle.fluid import core from paddle.fluid.layers import tensor from functools import reduce from ..fluid.wrapped_decorator import signature_safe_contextmanager from .. import compat as cpt +from .lr_scheduler import _LRScheduler __all__ = ['Optimizer'] @@ -53,8 +53,8 @@ class Optimizer(object): but need to use one of it's implementation. Args: - learning_rate (float|LearningRateDecay): The learning rate used to update ``Parameter``. - It can be a float value or a LearningRateDecay. + learning_rate (float|_LRScheduler): The learning rate used to update ``Parameter``. + It can be a float value or any subclass of ``_LRScheduler`` . parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \ This parameter is required in dygraph mode. \ The default value is None in static mode, at this time all parameters will be updated. @@ -109,11 +109,6 @@ class Optimizer(object): parameters) if parameters is not None else None self._name = name if framework.in_dygraph_mode(): - if not isinstance(learning_rate, float) and \ - not isinstance(learning_rate, LearningRateDecay): - raise TypeError( - "learning rate should be float or LearningRateDecay, got %s here" - % type(learning_rate)) if self._parameter_list is None: raise AttributeError( "parameters argument given to the Optimizer should not be None in dygraph mode." @@ -126,13 +121,10 @@ class Optimizer(object): "The weight_decay[%s] in Optimizer will not take effect, and it will only be applied to other Parameters!" 
% weight_decay.__str__()) break - else: - if not isinstance(learning_rate, float) and \ - not isinstance(learning_rate, framework.Variable): - raise TypeError( - "learning rate should be float or Tensor, got %s here" % - type(learning_rate)) - + if not isinstance(learning_rate, (float, _LRScheduler)): + raise TypeError( + "learning rate should be float or _LRScheduler, got %s here" % + type(learning_rate)) if grad_clip is not None: if not isinstance(grad_clip, GradientClipBase): raise TypeError( @@ -150,9 +142,6 @@ class Optimizer(object): # each program should have a independent learning rate # program -> tensor(learning_rate) self._learning_rate_map = dict() - if isinstance(self._learning_rate, framework.Variable): - self._learning_rate_map[framework.default_main_program( - )] = self._learning_rate # Dictionary of accumulators. Some optimizer subclasses need to # allocate and manage extra tensors associated with the parameters # to train. These tensors are called accumulators. @@ -167,7 +156,7 @@ class Optimizer(object): @framework.dygraph_only def state_dict(self): ''' - Get state dict information from optimizer. It contain all the tensor used by optimizer. For Adam optimizer, contains beta1, beta2, momentum etc. If LearningRateDecay have been used, global_step will be include in state dict. + Get state dict information from optimizer. It contain all the tensor used by optimizer. For Adam optimizer, contains beta1, beta2, momentum etc. If _LRScheduler have been used, global_step will be include in state dict. If the optimizer never be called(minimize function), the state_dict is empty. Args: @@ -192,24 +181,14 @@ class Optimizer(object): for para_name, var_tmp in v.items(): state_dict[var_tmp.name] = var_tmp # global step if use lr decay - if isinstance(self._learning_rate, LearningRateDecay): + if isinstance(self._learning_rate, _LRScheduler): state_dict["LR_Scheduler"] = self._learning_rate.state_dict() - - if not isinstance(self._learning_rate, _LearningRateEpochDecay): - var_tmp = None - var_temp = framework._varbase_creator( - None, name='global_step', dtype='int32') - - tensor.fill_constant( - [1], "int32", self._learning_rate.step_num, out=var_temp) - - state_dict['global_step'] = var_temp return state_dict @framework.dygraph_only def set_state_dict(self, state_dict): ''' - Load optimizer state dict. For Adam optimizer, contains beta1, beta2, momentum etc. If LearningRateDecay have been used, global_step will be changed. + Load optimizer state dict. For Adam optimizer, contains beta1, beta2, momentum etc. If _LRScheduler have been used, global_step will be changed. 
Args: state_dict(dict) : Dict contains all the Tensor needed by optimizer @@ -226,7 +205,7 @@ class Optimizer(object): state_dict = emb.state_dict() paddle.framework.save(state_dict, "paddle_dy") - adam = paddle.optimizer.Adam(learning_rate=paddle.nn.functional.noam_decay( 100, 10000), + adam = paddle.optimizer.Adam(learning_rate=paddle.optimizer.NoamLR( 100, 10000), parameters=emb.parameters()) state_dict = adam.state_dict() paddle.framework.save(state_dict, "paddle_dy") @@ -237,29 +216,8 @@ class Optimizer(object): ''' - if isinstance(self._learning_rate, LearningRateDecay): - self._learning_rate.set_dict(state_dict["LR_Scheduler"]) - - if not isinstance(self._learning_rate, _LearningRateEpochDecay): - assert 'global_step' in state_dict, \ - 'Global step not in state dict, Dygraph use LearningRateDecay, global_step must in state_dict' - global_step = state_dict['global_step'] - - if isinstance(global_step, Variable): - step_np = global_step - step_np = np.array(step_np.value().get_tensor()) - assert step_np.shape == (1,), \ - "global step shape is (1,), the shape is {}".format( step_np.shape ) - - self._learning_rate.step_num = int(step_np[0]) - elif isinstance(global_step, np.ndarray): - assert global_step.shape == (1,), \ - "global step shape is (1,), the shape is {}".format( global_step.shape ) - self._learning_rate.step_num = global_step[0] - else: - raise RuntimeError( - "Type not supprt, value in state dict must be [VarBase, Tensor, numpy], the type is ", - type(global_step)) + if isinstance(self._learning_rate, _LRScheduler): + self._learning_rate.set_state_dict(state_dict["LR_Scheduler"]) self._accumulators_holder = state_dict for k, v in self._accumulators.items(): @@ -296,58 +254,49 @@ class Optimizer(object): return self._opti_name_list def _create_global_learning_rate(self): - if imperative_base.enabled(): - # create learning rate tensor - if isinstance(self._learning_rate, float): - lr = self._global_learning_rate() - - if isinstance(lr, framework.Variable): - return - else: - self._learning_rate_map[framework.default_main_program( - )] = layers.create_global_var( - name=unique_name.generate("learning_rate"), - shape=[1], - value=float(self._learning_rate), - dtype=paddle.get_default_dtype() - if self._dtype is None else self._dtype, - persistable=True) - # get learning rate Tensor from LearningRateDecay - elif isinstance(self._learning_rate, LearningRateDecay): + if isinstance(self._learning_rate, _LRScheduler): + lr_var = self._global_learning_rate() + # only create global lr_var once + if not isinstance(lr_var, framework.Variable): + lr_name = unique_name.generate('learning_rate') + self._learning_rate._var_name = lr_name + lr_var = self.helper.create_global_variable( + name=lr_name, + shape=[1], + persistable=True, + stop_gradient=True, + dtype=paddle.get_default_dtype() + if self._dtype is None else self._dtype) + main_prog = framework.default_main_program() + main_prog.lr_sheduler = self._learning_rate + main_prog.lr_var = lr_var self._learning_rate_map[framework.default_main_program( - )] = self._learning_rate() - else: - raise TypeError( - "optimizer's learning rate must be float or LearningRateDecay" - ) - else: - lr = self._global_learning_rate() + )] = lr_var + lr_value = float(self._learning_rate()) + self.helper.set_variable_initializer( + lr_var, initializer=Constant(value=lr_value)) + elif isinstance(self._learning_rate, float): + # only create global lr_var once + lr = self._global_learning_rate() if isinstance(lr, framework.Variable): return else: - if not 
isinstance(self._learning_rate, float): - raise TypeError( - "learning rate Tensor is create outside optimizer," - "can not create new learning rate Tensor for new program" - ) - - # create learning rate in the current main program - self._learning_rate_map[framework.default_main_program( - )] = layers.create_global_var( - name=unique_name.generate("learning_rate"), - shape=[1], - value=float(self._learning_rate), - dtype=paddle.get_default_dtype() - if self._dtype is None else self._dtype, - persistable=True) + self._learning_rate_map[framework.default_main_program( + )] = layers.create_global_var( + name=unique_name.generate("learning_rate"), + shape=[1], + value=float(self._learning_rate), + dtype=paddle.get_default_dtype() + if self._dtype is None else self._dtype, + persistable=True) @framework.dygraph_only def set_lr(self, value): """ :api_attr: imperative - Set the value of the learning rate manually in the optimizer. If the optimizer use LearningRateDecay, + Set the value of the learning rate manually in the optimizer. If the optimizer use _LRScheduler, this API cannot be invoked, because it will lead to conflict. Args: @@ -378,53 +327,36 @@ class Optimizer(object): # current lr is 0.5 # current lr is 0.6 - - # set learning rate manually by framework Tensor - lr_var = paddle.create_global_var( - shape=[1], value=0.7, dtype='float32') - adam.set_lr(lr_var) - lr = adam.get_lr() - print("current lr is {}".format(lr)) - # Print: - # current lr is 0.7 - - - """ - if not isinstance(value, (framework.Variable, float)): + if not isinstance(value, (int, float)): raise TypeError( - "The type of 'value' in optimizer.set_lr must be (float, Tensor), but received %s." + "The type of 'value' in optimizer.set_lr must be float, but received %s." % (type(value))) - if isinstance(self._learning_rate, LearningRateDecay): + if isinstance(self._learning_rate, _LRScheduler): raise RuntimeError( - "optimizer's learning rate can't be LearningRateDecay when invoke this API, because this will lead to conflict." + "optimizer's learning rate can't be _LRScheduler when invoke this API, because this will lead to conflict." ) - if isinstance(value, float): - self._learning_rate = value - current_lr = self._global_learning_rate() - if current_lr is not None: - global_block = framework.default_main_program().global_block() - global_block.append_op( - type='fill_constant', - outputs={'Out': [current_lr]}, - attrs={ - 'dtype': current_lr.dtype, - 'shape': list(current_lr.shape), - 'value': float(value) - }, - stop_gradient=True) - else: - assert len(value.shape) == 1 and value.shape[ - 0] == 1, "optimizer's learning rate must be 1-D Tensor with shape[1]" - self._learning_rate_map[framework.default_main_program()] = value + self._learning_rate = float(value) + current_lr = self._global_learning_rate() + if current_lr is not None: + global_block = framework.default_main_program().global_block() + global_block.append_op( + type='fill_constant', + outputs={'Out': [current_lr]}, + attrs={ + 'dtype': current_lr.dtype, + 'shape': list(current_lr.shape), + 'value': float(value) + }, + stop_gradient=True) @framework.dygraph_only def get_lr(self): """ :api_attr: imperative - Get current step learning rate. The return value is all the same When LearningRateDecay is not used, - otherwise return the step learning rate. + Get current step learning rate. The return value is all the same When _LRScheduler is not used, + otherwise return the current step learning rate. Returns: float: The learning rate of the current step. 
@@ -434,14 +366,14 @@ class Optimizer(object): import numpy as np import paddle - # example1: LearningRateDecay is not used, return value is all the same + # example1: _LRScheduler is not used, return value is all the same paddle.disable_static() emb = paddle.nn.Embedding([10, 10]) adam = paddle.optimizer.Adam(0.001, parameters = emb.parameters()) lr = adam.get_lr() print(lr) # 0.001 - # example2: PiecewiseDecay is used, return the step learning rate + # example2: PiecewiseLR is used, return the step learning rate paddle.disable_static() inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") linear = paddle.nn.Linear(10, 10) @@ -451,7 +383,8 @@ class Optimizer(object): bd = [2, 4, 6, 8] value = [0.2, 0.4, 0.6, 0.8, 1.0] - adam = paddle.optimizer.Adam(paddle.PiecewiseDecay(bd, value, 0), + scheduler = paddle.optimizer.PiecewiseLR(bd, value, 0) + adam = paddle.optimizer.Adam(scheduler, parameters=linear.parameters()) # first step: learning rate is 0.2 @@ -462,24 +395,14 @@ class Optimizer(object): for i in range(12): adam.step() lr = adam.get_lr() + scheduler.step() np.allclose(lr, ret[i], rtol=1e-06, atol=0.0) # True """ - current_lr = self._global_learning_rate() - if isinstance(current_lr, framework.Variable): - return self._global_learning_rate().numpy()[0] - if isinstance(self._learning_rate, float): return self._learning_rate - elif isinstance(self._learning_rate, _LearningRateEpochDecay): - step_lr = self._learning_rate() - return step_lr.numpy()[0] else: - step_lr = self._learning_rate.step() - if isinstance(step_lr, (float, int)): - return step_lr - else: - return step_lr.numpy()[0] + return self._learning_rate() def _global_learning_rate(self, program=None): """ -- GitLab
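
For reference, the end-to-end usage pattern this change enables is the one the new unit tests exercise: build a paddle.optimizer scheduler (a subclass of _LRScheduler), pass it to the optimizer as learning_rate, and advance it explicitly with scheduler.step(); the scheduler state is carried in state_dict["LR_Scheduler"] instead of a global_step counter, and set_lr() now accepts only a float. The sketch below condenses that pattern from the tests in this patch. It is illustrative only and assumes the 2.0-alpha dygraph API exactly as shown above (PiecewiseLR, Adam, get_lr, reduce_mean); it is not part of the diff.

    # Minimal dygraph sketch of the 2.0 scheduler/optimizer pattern exercised
    # by the new tests. Assumes the paddle.optimizer API as introduced in this
    # patch; anything not shown in the diff above is illustrative.
    import numpy as np
    import paddle

    paddle.disable_static()

    linear = paddle.nn.Linear(10, 10)
    x = paddle.to_tensor(
        np.random.uniform(-1, 1, [10, 10]).astype("float32"))

    # The scheduler object itself is passed as learning_rate; the epoch
    # counter now lives inside the scheduler rather than in a global_step var.
    scheduler = paddle.optimizer.PiecewiseLR(
        boundaries=[2, 4, 6, 8], values=[0.2, 0.4, 0.6, 0.8, 1.0])
    adam = paddle.optimizer.Adam(
        learning_rate=scheduler, parameters=linear.parameters())

    for epoch in range(10):
        loss = paddle.reduce_mean(linear(x))
        loss.backward()
        adam.step()        # applies the scheduler's current learning rate
        adam.clear_grad()
        scheduler.step()   # advance the LR schedule explicitly
        print(epoch, adam.get_lr())

    # Scheduler state is saved and restored together with the optimizer:
    # state_dict["LR_Scheduler"] holds last_epoch/last_lr instead of global_step.
    state = adam.state_dict()
    adam.set_state_dict(state)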