From 63f242b6fb5abb00ea4951705b400567a08f18c8 Mon Sep 17 00:00:00 2001 From: LoneRanger <836253168@qq.com> Date: Wed, 28 Jun 2023 11:49:42 +0800 Subject: [PATCH] replace PiecewiseDecay, StepDecay, MultiStepDecay, LambdaDecay with 2.0 version (#53992) * replace PiecewiseDecay(LearningRateDecay) with PiecewiseDecay(LRScheduler) * fix bug * fix bug * replace the StepDecay,MultiStepDecay,LambdaDecay with 2.0 version --- .../fluid/dygraph/learning_rate_scheduler.py | 304 ------------------ .../fluid/layers/learning_rate_scheduler.py | 8 +- python/paddle/optimizer/lr.py | 2 + .../test_basic_api_transformation.py | 4 +- test/dygraph_to_static/test_yolov3.py | 6 +- test/legacy_test/test_imperative_optimizer.py | 18 +- .../test_learning_rate_scheduler.py | 53 +-- 7 files changed, 50 insertions(+), 345 deletions(-) diff --git a/python/paddle/fluid/dygraph/learning_rate_scheduler.py b/python/paddle/fluid/dygraph/learning_rate_scheduler.py index dd17dbe5272..c0ecda7059d 100644 --- a/python/paddle/fluid/dygraph/learning_rate_scheduler.py +++ b/python/paddle/fluid/dygraph/learning_rate_scheduler.py @@ -23,13 +23,9 @@ from ..data_feeder import check_type __all__ = [ 'NoamDecay', - 'PiecewiseDecay', 'PolynomialDecay', 'LinearLrWarmup', 'ReduceLROnPlateau', - 'StepDecay', - 'MultiStepDecay', - 'LambdaDecay', ] @@ -131,68 +127,6 @@ class LearningRateDecay: raise NotImplementedError() -class PiecewiseDecay(LearningRateDecay): - """ - :api_attr: imperative - - Piecewise decay scheduler. - - The algorithm can be described as the code below. - - .. code-block:: text - - boundaries = [10000, 20000] - values = [1.0, 0.5, 0.1] - if global_step < 10000: - learning_rate = 1.0 - elif 10000 <= global_step < 20000: - learning_rate = 0.5 - else: - learning_rate = 0.1 - - Parameters: - boundaries(list): A list of steps numbers. The type of element in the list is python int. - values(list): A list of learning rate values that will be picked during - different step boundaries. The type of element in the list is python float. - begin(int): The begin step to initialize the global_step in the description above. - step(int, optional): The step size used to calculate the new global_step in the description above. - The default value is 1. - dtype(str, optional): The data type used to create the learning rate variable. The data type can be set as - 'float32', 'float64'. The default value is 'float32'. - - Returns: - None. - - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - import paddle - boundaries = [10000, 20000] - values = [1.0, 0.5, 0.1] - with fluid.dygraph.guard(): - emb = paddle.nn.Embedding(10, 10) - optimizer = fluid.optimizer.SGD( - learning_rate=fluid.dygraph.PiecewiseDecay(boundaries, values, 0), - parameter_list = emb.parameters() ) - """ - - def __init__(self, boundaries, values, begin, step=1, dtype='float32'): - super().__init__(begin, step, dtype) - self.boundaries = boundaries - self.values = values - - self.vars = [] - for value in values: - self.vars.append(value) - - def step(self): - for i in range(len(self.boundaries)): - if self.step_num < self.boundaries[i]: - return self.vars[i] - return self.create_lr_var(self.vars[len(self.values) - 1]) - - class PolynomialDecay(LearningRateDecay): r""" :api_attr: imperative @@ -742,241 +676,3 @@ class _LearningRateEpochDecay(LearningRateDecay): def get_lr(self): raise NotImplementedError - - -class StepDecay(_LearningRateEpochDecay): - """ - :api_attr: imperative - - Decays the learning rate of ``optimizer`` by ``decay_rate`` every ``step_size`` number of epoch. - - The algorithm can be described as the code below. - - .. code-block:: text - - learning_rate = 0.5 - step_size = 30 - decay_rate = 0.1 - - learning_rate = 0.5 if epoch < 30 - learning_rate = 0.05 if 30 <= epoch < 60 - learning_rate = 0.005 if 60 <= epoch < 90 - ... - - Parameters: - learning_rate (float|int): The initial learning rate. It can be set to python float or int number. - step_size (int): Period of learning rate decay. - decay_rate (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * decay_rate`` . - It should be less than 1.0. Default: 0.1. - - Returns: - None. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - import numpy as np - import paddle - with fluid.dygraph.guard(): - x = np.random.uniform(-1, 1, [10, 10]).astype("float32") - linear = paddle.nn.Linear(10, 10) - input = fluid.dygraph.to_variable(x) - scheduler = fluid.dygraph.StepDecay(0.5, step_size=3) - adam = fluid.optimizer.Adam(learning_rate = scheduler, parameter_list = linear.parameters()) - - for epoch in range(9): - for batch_id in range(5): - out = linear(input) - loss = paddle.mean(out) - adam.minimize(loss) - scheduler.epoch() - - print("epoch:{}, current lr is {}" .format(epoch, adam.current_step_lr())) - # epoch:0, current lr is 0.5 - # epoch:1, current lr is 0.5 - # epoch:2, current lr is 0.5 - # epoch:3, current lr is 0.05 - # epoch:4, current lr is 0.05 - # epoch:5, current lr is 0.05 - # epoch:6, current lr is 0.005 - # epoch:7, current lr is 0.005 - # epoch:8, current lr is 0.005 - - """ - - def __init__(self, learning_rate, step_size, decay_rate=0.1): - if not isinstance(step_size, int): - raise TypeError( - "The type of 'step_size' must be 'int', but received %s." - % type(step_size) - ) - if decay_rate >= 1.0: - raise ValueError('decay_rate should be < 1.0.') - - self.step_size = step_size - self.decay_rate = decay_rate - super().__init__(learning_rate) - - def get_lr(self): - decay_rate = self.create_lr_var(self.decay_rate) - i = self.epoch_num // self.step_size - return self.base_lr * (decay_rate**i) - - -class MultiStepDecay(_LearningRateEpochDecay): - """ - :api_attr: imperative - - Decays the learning rate of ``optimizer`` by ``decay_rate`` once ``epoch`` reaches one of the milestones. - - The algorithm can be described as the code below. - - .. 
code-block:: text - - learning_rate = 0.5 - milestones = [30, 50] - decay_rate = 0.1 - if epoch < 30: - learning_rate = 0.5 - elif epoch < 50: - learning_rate = 0.05 - else: - learning_rate = 0.005 - - Parameters: - learning_rate (float|int): The initial learning rate. It can be set to python float or int number. - milestones (tuple|list): List or tuple of each boundaries. Must be increasing. - decay_rate (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * decay_rate`` . - It should be less than 1.0. Default: 0.1. - - Returns: - None. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - import numpy as np - import paddle - with fluid.dygraph.guard(): - x = np.random.uniform(-1, 1, [10, 10]).astype("float32") - linear = paddle.nn.Linear(10, 10) - input = fluid.dygraph.to_variable(x) - scheduler = fluid.dygraph.MultiStepDecay(0.5, milestones=[3, 5]) - adam = fluid.optimizer.Adam(learning_rate = scheduler, parameter_list = linear.parameters()) - - for epoch in range(6): - for batch_id in range(5): - out = linear(input) - loss = paddle.mean(out) - adam.minimize(loss) - scheduler.epoch() - - print("epoch:{}, current lr is {}" .format(epoch, adam.current_step_lr())) - # epoch:0, current lr is 0.5 - # epoch:1, current lr is 0.5 - # epoch:2, current lr is 0.5 - # epoch:3, current lr is 0.05 - # epoch:4, current lr is 0.05 - # epoch:5, current lr is 0.005 - - """ - - def __init__(self, learning_rate, milestones, decay_rate=0.1): - if not isinstance(milestones, (tuple, list)): - raise TypeError( - "The type of 'milestones' in 'MultiStepDecay' must be 'tuple, list', but received %s." - % type(milestones) - ) - - if not all( - [ - milestones[i] < milestones[i + 1] - for i in range(len(milestones) - 1) - ] - ): - raise ValueError('The elements of milestones must be incremented') - if decay_rate >= 1.0: - raise ValueError('decay_rate should be < 1.0.') - - self.milestones = milestones - self.decay_rate = decay_rate - super().__init__(learning_rate) - - def get_lr(self): - decay_rate = self.create_lr_var(self.decay_rate) - for i in range(len(self.milestones)): - if self.epoch_num < self.milestones[i]: - return self.base_lr * (decay_rate**i) - - return self.base_lr * (decay_rate ** len(self.milestones)) - - -class LambdaDecay(_LearningRateEpochDecay): - """ - :api_attr: imperative - - Sets the learning rate of ``optimizer`` to the initial lr times a multiplicative factor, and this multiplicative - factor is computed by function ``lr_lambda`` . ``lr_lambda`` is function which receives ``epoch`` . - - The algorithm can be described as the code below. - - .. code-block:: text - - learning_rate = 0.5 # init learning_rate - lr_lambda = lambda epoch: 0.95 ** epoch - - learning_rate = 0.5 # epoch 0 - learning_rate = 0.475 # epoch 1 - learning_rate = 0.45125 # epoch 2 - - Parameters: - learning_rate (float|int): The initial learning rate. It can be set to python float or int number. - lr_lambda (function): A function which computes a multiplicative factor given an integer parameter ``epoch`` , and - then multiply the initial learning rate by this multiplicative factor. - - Returns: - None. - - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - import numpy as np - import paddle - with fluid.dygraph.guard(): - x = np.random.uniform(-1, 1, [10, 10]).astype("float32") - linear = paddle.nn.Linear(10, 10) - input = fluid.dygraph.to_variable(x) - scheduler = fluid.dygraph.LambdaDecay(0.5, lr_lambda=lambda x: 0.95**x) - adam = fluid.optimizer.Adam(learning_rate = scheduler, parameter_list = linear.parameters()) - - for epoch in range(6): - for batch_id in range(5): - out = linear(input) - loss = paddle.mean(out) - adam.minimize(loss) - scheduler.epoch() - - print("epoch:%d, current lr is %f" .format(epoch, adam.current_step_lr())) - # epoch:0, current lr is 0.5 - # epoch:1, current lr is 0.475 - # epoch:2, current lr is 0.45125 - - """ - - def __init__(self, learning_rate, lr_lambda): - if not callable(lr_lambda): - raise TypeError( - "The type of 'lr_lambda' in 'LambdaDecay' must be 'function', but received %s." - % type(lr_lambda) - ) - - self.lr_lambda = lr_lambda - super().__init__(learning_rate) - - def get_lr(self): - base_lr = self.create_lr_var(self.base_lr) - - return self.base_lr * self.lr_lambda(self.epoch_num) diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py index bc1c8e78038..6605ef7a7b7 100644 --- a/python/paddle/fluid/layers/learning_rate_scheduler.py +++ b/python/paddle/fluid/layers/learning_rate_scheduler.py @@ -410,10 +410,10 @@ def piecewise_decay(boundaries, values): paddle.enable_static() boundaries = [10000, 20000] values = [1.0, 0.5, 0.1] - optimizer = fluid.optimizer.Momentum( + optimizer = paddle.optimizer.Momentum( momentum=0.9, - learning_rate=fluid.layers.piecewise_decay(boundaries=boundaries, values=values), - regularization=paddle.regularizer.L2Decay(1e-4)) + learning_rate=paddle.optimizer.lr.PiecewiseDecay(boundaries, values), + weight_decay=paddle.regularizer.L2Decay(1e-4)) """ @@ -422,7 +422,7 @@ def piecewise_decay(boundaries, values): raise ValueError("len(values) - len(boundaries) should be 1") if in_dygraph_mode(): - decay = imperate_lr.PiecewiseDecay(boundaries, values, 0) + decay = paddle.optimizer.lr.PiecewiseDecay(boundaries, values) return decay else: global_step = _decay_step_counter() diff --git a/python/paddle/optimizer/lr.py b/python/paddle/optimizer/lr.py index aa87a455d56..681ff33ca67 100644 --- a/python/paddle/optimizer/lr.py +++ b/python/paddle/optimizer/lr.py @@ -98,6 +98,8 @@ class LRScheduler: type(learning_rate) ) ) + if learning_rate < 0: + raise ValueError(f"Invalid learning rate: {learning_rate}") self.base_lr = float(learning_rate) self.last_lr = float(learning_rate) self.last_epoch = last_epoch diff --git a/test/dygraph_to_static/test_basic_api_transformation.py b/test/dygraph_to_static/test_basic_api_transformation.py index 62499a9aada..e0a0d9f5923 100644 --- a/test/dygraph_to_static/test_basic_api_transformation.py +++ b/test/dygraph_to_static/test_basic_api_transformation.py @@ -376,9 +376,9 @@ def dyfunc_NoamDecay(): def dyfunc_PiecewiseDecay(): boundaries = [10000, 20000] values = [1.0, 0.5, 0.1] - pd = fluid.dygraph.PiecewiseDecay(boundaries, values, begin=0) + pd = paddle.optimizer.lr.PiecewiseDecay(boundaries, values) lr = pd() - return lr + return paddle.to_tensor(lr) def dyfunc_PolynomialDecay(): diff --git a/test/dygraph_to_static/test_yolov3.py b/test/dygraph_to_static/test_yolov3.py index 640fe7c3fda..eb51fcc20e9 100644 --- a/test/dygraph_to_static/test_yolov3.py +++ b/test/dygraph_to_static/test_yolov3.py @@ -94,11 +94,11 @@ def 
train(to_static): learning_rate = cfg.learning_rate values = [learning_rate * (gamma**i) for i in range(step_num + 1)] - lr = fluid.dygraph.PiecewiseDecay( - boundaries=boundaries, values=values, begin=0 + lr = paddle.optimizer.lr.PiecewiseDecay( + boundaries=boundaries, values=values ) - lr = fluid.layers.linear_lr_warmup( + lr = paddle.optimizer.lr.LinearWarmup( learning_rate=lr, warmup_steps=cfg.warm_up_iter, start_lr=0.0, diff --git a/test/legacy_test/test_imperative_optimizer.py b/test/legacy_test/test_imperative_optimizer.py index 514dd318ed2..36a60dfcf95 100644 --- a/test/legacy_test/test_imperative_optimizer.py +++ b/test/legacy_test/test_imperative_optimizer.py @@ -262,7 +262,7 @@ class TestImperativeOptimizerPiecewiseDecay(TestImperativeOptimizerBase): def get_optimizer(self): bd = [3, 6, 9] optimizer = SGDOptimizer( - learning_rate=fluid.layers.piecewise_decay( + learning_rate=paddle.optimizer.lr.PiecewiseDecay( boundaries=bd, values=[0.1 * (0.1**i) for i in range(len(bd) + 1)], ) @@ -470,20 +470,20 @@ class TestOptimizerLearningRate(unittest.TestCase): bd = [2, 4, 6, 8] value = [0.2, 0.4, 0.6, 0.8, 1.0] - adam = fluid.optimizer.Adam( - fluid.dygraph.PiecewiseDecay(bd, value, 0), - parameter_list=linear.parameters(), + scheduler = paddle.optimizer.lr.PiecewiseDecay(bd, value) + adam = paddle.optimizer.Adam( + scheduler, + parameters=linear.parameters(), ) - np.testing.assert_allclose( - adam.current_step_lr(), 0.2, rtol=1e-06, atol=0.0 - ) + np.testing.assert_allclose(adam.get_lr(), 0.2, rtol=1e-06, atol=0.0) ret = [0.2, 0.2, 0.4, 0.4, 0.6, 0.6, 0.8, 0.8, 1.0, 1.0, 1.0, 1.0] for i in range(12): adam.minimize(loss) - lr = adam.current_step_lr() - + lr = adam.get_lr() + adam.step() + scheduler.step() np.testing.assert_allclose(lr, ret[i], rtol=1e-06, atol=0.0) def test_lr_decay_natural_exp(self): diff --git a/test/legacy_test/test_learning_rate_scheduler.py b/test/legacy_test/test_learning_rate_scheduler.py index f1cc1fe81b7..b38e29b7df0 100644 --- a/test/legacy_test/test_learning_rate_scheduler.py +++ b/test/legacy_test/test_learning_rate_scheduler.py @@ -127,7 +127,7 @@ class TestLearningRateDecayDygraph(unittest.TestCase): learning_rate=0.1, gamma=0.5, ) - Step_scheduler = fluid.dygraph.StepDecay(0.5, step_size=3) + Step_scheduler = paddle.optimizer.lr.StepDecay(0.5, step_size=3) Reducelr_scheduler = fluid.dygraph.ReduceLROnPlateau( learning_rate=1.0, decay_rate=0.5, patience=5, cooldown=3 ) @@ -154,7 +154,7 @@ class TestLearningRateDecayDygraph(unittest.TestCase): adam3.minimize(loss) linear.clear_gradients() - Step_scheduler.epoch() + Step_scheduler.get_lr() Reducelr_scheduler.step(loss) paddle.save(linear.state_dict(), "save_path.pdparams") @@ -163,7 +163,9 @@ class TestLearningRateDecayDygraph(unittest.TestCase): learning_rate=0.1, gamma=0.5, ) - Step_scheduler_test = fluid.dygraph.StepDecay(0.5, step_size=3) + Step_scheduler_test = paddle.optimizer.lr.StepDecay( + 0.5, step_size=3 + ) Reducelr_scheduler_test = fluid.dygraph.ReduceLROnPlateau( learning_rate=1.0, decay_rate=0.5, patience=5, cooldown=3 ) @@ -189,8 +191,8 @@ class TestLearningRateDecayDygraph(unittest.TestCase): ) adam_test.set_dict(opt_state) self.assertEqual( - adam_test._learning_rate.epoch_num, - adam2._learning_rate.epoch_num, + adam_test._learning_rate.last_epoch, + adam2._learning_rate.last_epoch, "epoch_num is different before and after set_dict", ) self.assertEqual( @@ -288,19 +290,20 @@ class TestLearningRateDecayDygraph(unittest.TestCase): decay_rate = 0.2 linear = paddle.nn.Linear(10, 10) - 
scheduler = fluid.dygraph.MultiStepDecay( + scheduler = paddle.optimizer.lr.MultiStepDecay( learning_rate, milestones, decay_rate ) - adam = fluid.optimizer.AdamOptimizer( - learning_rate=scheduler, parameter_list=linear.parameters() + adam = paddle.optimizer.Adam( + learning_rate=scheduler, parameters=linear.parameters() ) for epoch in range(10): right_result = multi_step_decay( epoch, learning_rate, milestones, decay_rate ) - fluid_result = adam.current_step_lr() - scheduler.epoch() + fluid_result = adam.get_lr() + adam.step() + scheduler.step() self.assertAlmostEqual( right_result, fluid_result, @@ -310,35 +313,36 @@ class TestLearningRateDecayDygraph(unittest.TestCase): ) with self.assertRaises(ValueError): - lr = fluid.dygraph.MultiStepDecay( + lr = paddle.optimizer.lr.MultiStepDecay( learning_rate, [30, 50, 20], 0.1 ) with self.assertRaises(ValueError): - lr = fluid.dygraph.MultiStepDecay( + lr = paddle.optimizer.lr.MultiStepDecay( learning_rate, [20, 30, 50], 1 ) with self.assertRaises(TypeError): - lr = fluid.dygraph.MultiStepDecay("test", [20, 30, 50]) + lr = paddle.optimizer.lr.MultiStepDecay("test", [20, 30, 50]) with self.assertRaises(ValueError): - lr = fluid.dygraph.MultiStepDecay(-1, [20, 30, 50]) + lr = paddle.optimizer.lr.MultiStepDecay(-1, [20, 30, 50]) def test_StepDecay(self): with fluid.dygraph.guard(): learning_rate = 0.5 step_size = 3 decay_rate = 0.2 - scheduler = fluid.dygraph.StepDecay( + scheduler = paddle.optimizer.lr.StepDecay( learning_rate, step_size, decay_rate ) for epoch in range(10): right_result = step_decay( epoch, learning_rate, step_size, decay_rate ) - fluid_result = scheduler().numpy().item() - scheduler.epoch() + fluid_result = scheduler() + scheduler.get_lr() + scheduler.step() self.assertAlmostEqual( right_result, fluid_result, @@ -348,16 +352,18 @@ class TestLearningRateDecayDygraph(unittest.TestCase): ) with self.assertRaises(TypeError): - lr = fluid.dygraph.StepDecay(learning_rate, "test", 0.1) + lr = paddle.optimizer.lr.StepDecay(learning_rate, "test", 0.1) with self.assertRaises(ValueError): - lr = fluid.dygraph.StepDecay(learning_rate, 20, 2) + lr = paddle.optimizer.lr.StepDecay(learning_rate, 20, 2) def test_LambdaDecay(self): with fluid.dygraph.guard(): learning_rate = 0.5 lr_lambda = lambda x: 0.95**x - scheduler = fluid.dygraph.LambdaDecay(learning_rate, lr_lambda) + scheduler = paddle.optimizer.lr.LambdaDecay( + learning_rate, lr_lambda + ) linear = paddle.nn.Linear(10, 10) adam = fluid.optimizer.Adam( @@ -366,8 +372,9 @@ class TestLearningRateDecayDygraph(unittest.TestCase): for epoch in range(30): right_result = lambda_decay(epoch, learning_rate, lr_lambda) - fluid_result = scheduler().numpy().item() - scheduler.epoch() + fluid_result = scheduler() + scheduler.get_lr() + scheduler.step() self.assertAlmostEqual( right_result, fluid_result, @@ -377,7 +384,7 @@ class TestLearningRateDecayDygraph(unittest.TestCase): ) with self.assertRaises(TypeError): - lr = fluid.dygraph.LambdaDecay(learning_rate, "test") + lr = paddle.optimizer.lr.LambdaDecay(learning_rate, "test") class TestLearningRateDecay(unittest.TestCase): -- GitLab
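As a quick reference for the migration this patch performs, the sketch below shows how the removed fluid.dygraph schedulers (PiecewiseDecay, StepDecay, MultiStepDecay, LambdaDecay) map onto their paddle.optimizer.lr 2.0 replacements, following the usage exercised in the updated tests. It is a minimal sketch, not part of the patch itself; the layer size, input shape and loop count are illustrative assumptions rather than values taken from the diff.

.. code-block:: python

    import paddle

    # 2.0-style replacement for fluid.dygraph.PiecewiseDecay(boundaries, values, 0)
    scheduler = paddle.optimizer.lr.PiecewiseDecay(
        boundaries=[10000, 20000], values=[1.0, 0.5, 0.1]
    )
    # The other schedulers touched by this patch follow the same pattern:
    #   paddle.optimizer.lr.StepDecay(0.5, step_size=3, gamma=0.1)
    #   paddle.optimizer.lr.MultiStepDecay(0.5, milestones=[3, 5], gamma=0.1)
    #   paddle.optimizer.lr.LambdaDecay(0.5, lr_lambda=lambda epoch: 0.95 ** epoch)

    linear = paddle.nn.Linear(10, 10)      # illustrative model
    adam = paddle.optimizer.Adam(
        learning_rate=scheduler, parameters=linear.parameters()
    )

    x = paddle.uniform([4, 10])            # illustrative input
    for epoch in range(3):
        loss = paddle.mean(linear(x))
        loss.backward()
        adam.step()                        # replaces optimizer.minimize(loss)
        adam.clear_grad()
        scheduler.step()                   # replaces scheduler.epoch()
        print(f"epoch {epoch}, lr = {adam.get_lr()}")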