diff --git a/python/paddle/fluid/dygraph/learning_rate_scheduler.py b/python/paddle/fluid/dygraph/learning_rate_scheduler.py
index 3969cf4f950c15e3e1101334ca5bd0b24e85124f..1bfd4c77a2654cd9f467896c769d4232c231ed20 100644
--- a/python/paddle/fluid/dygraph/learning_rate_scheduler.py
+++ b/python/paddle/fluid/dygraph/learning_rate_scheduler.py
@@ -23,7 +23,7 @@ from ..data_feeder import check_type
 __all__ = [
     'NoamDecay', 'PiecewiseDecay', 'NaturalExpDecay', 'ExponentialDecay',
     'InverseTimeDecay', 'PolynomialDecay', 'CosineDecay', 'LinearLrWarmup',
-    'ReduceLROnPlateau'
+    'ReduceLROnPlateau', 'StepDecay', 'MultiStepDecay'
 ]
 
 
@@ -595,6 +595,8 @@ class NoamDecay(LearningRateDecay):
 
 class LinearLrWarmup(LearningRateDecay):
     """
+    :api_attr: imperative
+
     This operator use the linear learning rate warm up strategy to adjust the learning rate preliminarily before the normal learning rate scheduling.
     For more information, please refer to `Bag of Tricks for Image Classification with Convolutional Neural Networks `_
 
@@ -684,6 +686,8 @@ class LinearLrWarmup(LearningRateDecay):
 
 class ReduceLROnPlateau(LearningRateDecay):
     """
+    :api_attr: imperative
+
     Reduce learning rate when ``loss`` has stopped descending. Models often benefit from reducing the learning rate
     by 2 to 10 times once model performance has no longer improvement.
 
@@ -788,7 +792,6 @@ class ReduceLROnPlateau(LearningRateDecay):
             raise ValueError('threshold mode ' + threshold_mode +
                              ' is unknown!')
         self.threshold_mode = threshold_mode
-
         check_type(learning_rate, 'learning_rate', (float, int, Variable),
                    'ReduceLROnPlateau')
         if isinstance(learning_rate, (float, int)):
@@ -870,3 +873,217 @@ class ReduceLROnPlateau(LearningRateDecay):
         else:
             return current > best + self.threshold
+
+
+class _LearningRateEpochDecay(LearningRateDecay):
+    """
+    :api_attr: imperative
+
+    Base class of learning rate decay, which is updated each epoch.
+
+    Define the common interface of an _LearningRateEpochDecay.
+    User should not use this class directly,
+    but should use one of its implementations, and invoke the method `epoch()` once per epoch.
+    """
+
+    def __init__(self, learning_rate, dtype=None):
+        if not isinstance(learning_rate, (float, int)):
+            raise TypeError(
+                "The type of 'learning_rate' must be 'float, int', but received %s."
+                % type(learning_rate))
+        if learning_rate >= 1.0:
+            raise ValueError(
+                "The initial learning rate must be less than 1.0, but received %f."
+                % learning_rate)
+
+        self.base_lr = float(learning_rate)
+
+        self.epoch_num = -1
+        self.dtype = dtype
+        if dtype is None:
+            self.dtype = "float32"
+        self.learning_rate = self.create_lr_var(self.base_lr)
+
+        self.epoch()
+
+    def __call__(self):
+        """
+        Return the last computed learning rate of the current epoch.
+        """
+        return self.learning_rate
+
+    def epoch(self, epoch=None):
+        """
+        Compute the learning rate and update it when invoked.
+        """
+        if epoch is None:
+            self.epoch_num += 1
+        else:
+            self.epoch_num = epoch
+
+        self.learning_rate = self.get_lr()
+        if isinstance(self.learning_rate, float):
+            self.learning_rate = self.create_lr_var(self.learning_rate)
+
+    def get_lr(self):
+        raise NotImplementedError
+
+
+class StepDecay(_LearningRateEpochDecay):
+    """
+    :api_attr: imperative
+
+    Decays the learning rate of ``optimizer`` by ``decay_rate`` every ``step_size`` epochs.
+
+    The algorithm can be described as the code below.
+
+    .. code-block:: text
+
+        learning_rate = 0.5
+        step_size = 30
+        decay_rate = 0.1
+
+        learning_rate = 0.5     if epoch < 30
+        learning_rate = 0.05    if 30 <= epoch < 60
+        learning_rate = 0.005   if 60 <= epoch < 90
+        ...
+
+    Parameters:
+        learning_rate (float|int): The initial learning rate. It can be set to a python float or int number.
+        step_size (int): Period of learning rate decay.
+        decay_rate (float, optional): The ratio by which the learning rate will be reduced. ``new_lr = origin_lr * decay_rate``.
+            It should be less than 1.0. Default: 0.1.
+
+    Returns:
+        None.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle.fluid as fluid
+            import numpy as np
+            with fluid.dygraph.guard():
+                x = np.random.uniform(-1, 1, [10, 10]).astype("float32")
+                linear = fluid.dygraph.Linear(10, 10)
+                input = fluid.dygraph.to_variable(x)
+                scheduler = fluid.dygraph.StepDecay(0.5, step_size=3)
+                adam = fluid.optimizer.Adam(learning_rate = scheduler, parameter_list = linear.parameters())
+
+                for epoch in range(9):
+                    for batch_id in range(5):
+                        out = linear(input)
+                        loss = fluid.layers.reduce_mean(out)
+                        adam.minimize(loss)
+                    scheduler.epoch()
+
+                    print("epoch:{}, current lr is {}" .format(epoch, adam.current_step_lr()))
+                    # epoch:0, current lr is 0.5
+                    # epoch:1, current lr is 0.5
+                    # epoch:2, current lr is 0.5
+                    # epoch:3, current lr is 0.05
+                    # epoch:4, current lr is 0.05
+                    # epoch:5, current lr is 0.05
+                    # epoch:6, current lr is 0.005
+                    # epoch:7, current lr is 0.005
+                    # epoch:8, current lr is 0.005
+
+    """
+
+    def __init__(self, learning_rate, step_size, decay_rate=0.1):
+        if not isinstance(step_size, int):
+            raise TypeError(
+                "The type of 'step_size' must be 'int', but received %s." %
+                type(step_size))
+        if decay_rate >= 1.0:
+            raise ValueError('decay_rate should be < 1.0.')
+
+        self.step_size = step_size
+        self.decay_rate = decay_rate
+        super(StepDecay, self).__init__(learning_rate)
+
+    def get_lr(self):
+        decay_rate = self.create_lr_var(self.decay_rate)
+        i = self.epoch_num // self.step_size
+        return self.base_lr * (decay_rate**i)
+
+
+class MultiStepDecay(_LearningRateEpochDecay):
+    """
+    :api_attr: imperative
+
+    Decays the learning rate of ``optimizer`` by ``decay_rate`` once ``epoch`` reaches one of the milestones.
+
+    The algorithm can be described as the code below.
+
+    .. code-block:: text
+
+        learning_rate = 0.5
+        milestones = [30, 50]
+        decay_rate = 0.1
+        if epoch < 30:
+            learning_rate = 0.5
+        elif epoch < 50:
+            learning_rate = 0.05
+        else:
+            learning_rate = 0.005
+
+    Parameters:
+        learning_rate (float|int): The initial learning rate. It can be set to a python float or int number.
+        milestones (tuple|list): List or tuple of epoch boundaries, which must be increasing.
+        decay_rate (float, optional): The ratio by which the learning rate will be reduced. ``new_lr = origin_lr * decay_rate``.
+            It should be less than 1.0. Default: 0.1.
+
+    Returns:
+        None.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle.fluid as fluid
+            import numpy as np
+            with fluid.dygraph.guard():
+                x = np.random.uniform(-1, 1, [10, 10]).astype("float32")
+                linear = fluid.dygraph.Linear(10, 10)
+                input = fluid.dygraph.to_variable(x)
+                scheduler = fluid.dygraph.MultiStepDecay(0.5, milestones=[3, 5])
+                adam = fluid.optimizer.Adam(learning_rate = scheduler, parameter_list = linear.parameters())
+
+                for epoch in range(6):
+                    for batch_id in range(5):
+                        out = linear(input)
+                        loss = fluid.layers.reduce_mean(out)
+                        adam.minimize(loss)
+                    scheduler.epoch()
+
+                    print("epoch:{}, current lr is {}" .format(epoch, adam.current_step_lr()))
+                    # epoch:0, current lr is 0.5
+                    # epoch:1, current lr is 0.5
+                    # epoch:2, current lr is 0.5
+                    # epoch:3, current lr is 0.05
+                    # epoch:4, current lr is 0.05
+                    # epoch:5, current lr is 0.005
+
+    """
+
+    def __init__(self, learning_rate, milestones, decay_rate=0.1):
+        if not isinstance(milestones, (tuple, list)):
+            raise TypeError(
+                "The type of 'milestones' in 'MultiStepDecay' must be 'tuple, list', but received %s."
+                % type(milestones))
+
+        if not all([
+                milestones[i] < milestones[i + 1]
+                for i in range(len(milestones) - 1)
+        ]):
+            raise ValueError('The elements of milestones must be increasing.')
+        if decay_rate >= 1.0:
+            raise ValueError('decay_rate should be < 1.0.')
+
+        self.milestones = milestones
+        self.decay_rate = decay_rate
+        super(MultiStepDecay, self).__init__(learning_rate)
+
+    def get_lr(self):
+        decay_rate = self.create_lr_var(self.decay_rate)
+        for i in range(len(self.milestones)):
+            if self.epoch_num < self.milestones[i]:
+                return self.base_lr * (decay_rate**i)
+
+        return self.base_lr * (decay_rate**len(self.milestones))
diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py
index 5188051a9a6bbb40f538af5e0b8c8a4796de6e66..47e62016a20d78ef1209da92d42ceed726e482d6 100644
--- a/python/paddle/fluid/layers/learning_rate_scheduler.py
+++ b/python/paddle/fluid/layers/learning_rate_scheduler.py
@@ -52,9 +52,9 @@ def _decay_step_counter(begin=0):
 
 def noam_decay(d_model, warmup_steps, learning_rate=1.0):
     """
-  :alias_main: paddle.nn.functional.noam_decay
-  :alias: paddle.nn.functional.noam_decay,paddle.nn.functional.learning_rate.noam_decay
-  :old_api: paddle.fluid.layers.noam_decay
+    :alias_main: paddle.nn.functional.noam_decay
+    :alias: paddle.nn.functional.noam_decay,paddle.nn.functional.learning_rate.noam_decay
+    :old_api: paddle.fluid.layers.noam_decay
 
     Noam decay method. The numpy implementation of noam decay as follows.
 
@@ -115,9 +115,9 @@ def noam_decay(d_model, warmup_steps, learning_rate=1.0):
 
 def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False):
     """
-  :alias_main: paddle.nn.functional.exponential_decay
-  :alias: paddle.nn.functional.exponential_decay,paddle.nn.functional.learning_rate.exponential_decay
-  :old_api: paddle.fluid.layers.exponential_decay
+    :alias_main: paddle.nn.functional.exponential_decay
+    :alias: paddle.nn.functional.exponential_decay,paddle.nn.functional.learning_rate.exponential_decay
+    :old_api: paddle.fluid.layers.exponential_decay
 
 Applies exponential decay to the learning rate.
@@ -176,9 +176,9 @@ def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False):
 
 def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False):
     """
-  :alias_main: paddle.nn.functional.natural_exp_decay
-  :alias: paddle.nn.functional.natural_exp_decay,paddle.nn.functional.learning_rate.natural_exp_decay
-  :old_api: paddle.fluid.layers.natural_exp_decay
+    :alias_main: paddle.nn.functional.natural_exp_decay
+    :alias: paddle.nn.functional.natural_exp_decay,paddle.nn.functional.learning_rate.natural_exp_decay
+    :old_api: paddle.fluid.layers.natural_exp_decay
 
 Applies natural exponential decay to the initial learning rate.
 
@@ -237,9 +237,9 @@ Applies natural exponential decay to the initial learning rate.
 
 def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False):
     """
-  :alias_main: paddle.nn.functional.inverse_time_decay
-  :alias: paddle.nn.functional.inverse_time_decay,paddle.nn.functional.learning_rate.inverse_time_decay
-  :old_api: paddle.fluid.layers.inverse_time_decay
+    :alias_main: paddle.nn.functional.inverse_time_decay
+    :alias: paddle.nn.functional.inverse_time_decay,paddle.nn.functional.learning_rate.inverse_time_decay
+    :old_api: paddle.fluid.layers.inverse_time_decay
 
 Applies inverse time decay to the initial learning rate.
 
@@ -302,9 +302,9 @@ def polynomial_decay(learning_rate,
                      power=1.0,
                      cycle=False):
     """
-  :alias_main: paddle.nn.functional.polynomial_decay
-  :alias: paddle.nn.functional.polynomial_decay,paddle.nn.functional.learning_rate.polynomial_decay
-  :old_api: paddle.fluid.layers.polynomial_decay
+    :alias_main: paddle.nn.functional.polynomial_decay
+    :alias: paddle.nn.functional.polynomial_decay,paddle.nn.functional.learning_rate.polynomial_decay
+    :old_api: paddle.fluid.layers.polynomial_decay
 
 Applies polynomial decay to the initial learning rate.
 
@@ -371,9 +371,9 @@ def polynomial_decay(learning_rate,
 
 def piecewise_decay(boundaries, values):
     """
-  :alias_main: paddle.nn.functional.piecewise_decay
-  :alias: paddle.nn.functional.piecewise_decay,paddle.nn.functional.learning_rate.piecewise_decay
-  :old_api: paddle.fluid.layers.piecewise_decay
+    :alias_main: paddle.nn.functional.piecewise_decay
+    :alias: paddle.nn.functional.piecewise_decay,paddle.nn.functional.learning_rate.piecewise_decay
+    :old_api: paddle.fluid.layers.piecewise_decay
 
 Applies piecewise decay to the initial learning rate.
 
@@ -450,9 +450,9 @@ Applies piecewise decay to the initial learning rate.
 
 def cosine_decay(learning_rate, step_each_epoch, epochs):
     """
-  :alias_main: paddle.nn.functional.cosine_decay
-  :alias: paddle.nn.functional.cosine_decay,paddle.nn.functional.learning_rate.cosine_decay
-  :old_api: paddle.fluid.layers.cosine_decay
+    :alias_main: paddle.nn.functional.cosine_decay
+    :alias: paddle.nn.functional.cosine_decay,paddle.nn.functional.learning_rate.cosine_decay
+    :old_api: paddle.fluid.layers.cosine_decay
 
 Applies cosine decay to the learning rate.
@@ -499,9 +499,9 @@ def cosine_decay(learning_rate, step_each_epoch, epochs):
 
 def linear_lr_warmup(learning_rate, warmup_steps, start_lr, end_lr):
     """
-  :alias_main: paddle.nn.functional.linear_lr_warmup
-  :alias: paddle.nn.functional.linear_lr_warmup,paddle.nn.functional.learning_rate.linear_lr_warmup
-  :old_api: paddle.fluid.layers.linear_lr_warmup
+    :alias_main: paddle.nn.functional.linear_lr_warmup
+    :alias: paddle.nn.functional.linear_lr_warmup,paddle.nn.functional.learning_rate.linear_lr_warmup
+    :old_api: paddle.fluid.layers.linear_lr_warmup
 
 This operator use the linear learning rate warm up strategy to adjust the learning rate preliminarily before the normal learning rate scheduling.
 For more information, please refer to `Bag of Tricks for Image Classification with Convolutional Neural Networks `_
diff --git a/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py b/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py
index 8b66035c57ab3abf76bae3191b46de755aa5a3f9..3b19b7bb10edebce8e11042a1e77dce9b69185b6 100644
--- a/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py
+++ b/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py
@@ -98,8 +98,26 @@ def noam_decay(global_step, d_model, warmup_steps, learning_rate=1.0):
     return decayed_lr
 
 
-class TestNoamLearningRateDecayDygraphMode(unittest.TestCase):
-    def test_dygraph_mode(self):
+def linear_lr_warmup(global_step, warmup_steps, start_lr, end_lr):
+    linear_step = end_lr - start_lr
+    decayed_lr = start_lr + linear_step * (global_step / warmup_steps)
+    return decayed_lr
+
+
+def multi_step_decay(global_step, learning_rate, milestones, decay_rate=0.1):
+    for i in range(len(milestones)):
+        if global_step < milestones[i]:
+            return learning_rate * math.pow(decay_rate, i)
+
+    return learning_rate * math.pow(decay_rate, len(milestones))
+
+
+def step_decay(global_step, learning_rate, step_size, decay_rate=0.1):
+    return learning_rate * math.pow(decay_rate, global_step // step_size)
+
+
+class TestLearningRateDecayDygraph(unittest.TestCase):
+    def test_NoamDecay(self):
         with fluid.dygraph.guard():
             d_model = 0.01
             warmup_steps = 200
@@ -117,6 +135,88 @@ class TestNoamLearningRateDecayDygraphMode(unittest.TestCase):
                     msg='Failed lr scheduler in step {0}, Python result is {1}, Fluid result is {2}'.
                     format(step, right_result, fluid_result[0]))
 
+    def test_LinearLrWarmup(self):
+        with fluid.dygraph.guard():
+            lr = fluid.layers.polynomial_decay(
+                learning_rate=1.0,
+                decay_steps=10,
+                end_learning_rate=0.0,
+                power=1.0)
+            lr = fluid.layers.linear_lr_warmup(
+                learning_rate=lr, warmup_steps=2, start_lr=0.0, end_lr=1.0)
+
+            right_result = [0.5, 0.9, 0.8, 0.7, 0.6]
+            for i in range(5):
+
+                t = lr()
+
+                self.assertTrue(
+                    np.allclose((t.numpy())[0].item(), right_result[i]))
+
+            with self.assertRaises(TypeError):
+                lr = fluid.layers.linear_lr_warmup(
+                    learning_rate="fake_lr",
+                    warmup_steps=2,
+                    start_lr=0.0,
+                    end_lr=1.0)
+
+    def test_MultiStepDecay(self):
+        with fluid.dygraph.guard():
+            learning_rate = 0.5
+            milestones = [2, 4, 8]
+            decay_rate = 0.2
+            scheduler = fluid.dygraph.MultiStepDecay(learning_rate, milestones,
+                                                     decay_rate)
+            for epoch in range(10):
+                right_result = multi_step_decay(epoch, learning_rate,
+                                                milestones, decay_rate)
+                fluid_result = scheduler().numpy()[0]
+                scheduler.epoch()
+                self.assertAlmostEqual(
+                    right_result,
+                    fluid_result,
+                    msg='Failed lr scheduler in step {0}, Python result is {1}, Fluid result is {2}'.
+                    format(epoch, right_result, fluid_result))
+
+            with self.assertRaises(ValueError):
+                lr = fluid.dygraph.MultiStepDecay(learning_rate, [30, 50, 20],
+                                                  0.1)
+
+            with self.assertRaises(ValueError):
+                lr = fluid.dygraph.MultiStepDecay(learning_rate, [20, 30, 50],
+                                                  1)
+
+            with self.assertRaises(TypeError):
+                lr = fluid.dygraph.MultiStepDecay("test", [20, 30, 50])
+
+            with self.assertRaises(ValueError):
+                lr = fluid.dygraph.MultiStepDecay(2.0, [20, 30, 50])
+
+    def test_StepDecay(self):
+        with fluid.dygraph.guard():
+            learning_rate = 0.5
+            step_size = 3
+            decay_rate = 0.2
+            scheduler = fluid.dygraph.StepDecay(learning_rate, step_size,
+                                                decay_rate)
+            for epoch in range(10):
+                right_result = step_decay(epoch, learning_rate, step_size,
+                                          decay_rate)
+                fluid_result = scheduler().numpy()[0]
+                scheduler.epoch()
+                self.assertAlmostEqual(
+                    right_result,
+                    fluid_result,
+                    msg='Failed lr scheduler in step {0}, Python result is {1}, Fluid result is {2}'.
+                    format(epoch, right_result, fluid_result))
+
+            with self.assertRaises(TypeError):
+                lr = fluid.dygraph.MultiStepDecay(learning_rate, "test", 0.1)
+
+            with self.assertRaises(ValueError):
+                lr = fluid.dygraph.MultiStepDecay(learning_rate, [20, 30, 50],
+                                                  1)
+
 
 class TestLearningRateDecay(unittest.TestCase):
     def check_decay(self, python_decay_fn, fluid_decay_fn, kwargs):
@@ -171,31 +271,26 @@ class TestLearningRateDecay(unittest.TestCase):
             (natural_exp_decay, layers.natural_exp_decay, common_kwargs_false),
             (inverse_time_decay, layers.inverse_time_decay,
              common_kwargs_true),
             (inverse_time_decay, layers.inverse_time_decay,
-             common_kwargs_false),
-            (polynomial_decay, layers.polynomial_decay, {
-                "learning_rate": 1.0,
-                "decay_steps": 5,
-                "cycle": True
-            }),
-            (polynomial_decay, layers.polynomial_decay, {
-                "learning_rate": 1.0,
-                "decay_steps": 5,
-                "cycle": False
-            }),
-            (piecewise_decay, layers.piecewise_decay, {
-                "boundaries": [3, 6, 9],
-                "values": [0.1, 0.2, 0.3, 0.4]
-            }),
-            (cosine_decay, layers.cosine_decay, {
-                "learning_rate": 0.1,
-                "step_each_epoch": 100,
-                "epochs": 120
-            }),
-            (noam_decay, layers.noam_decay, {
-                "d_model": 0.01,
-                "warmup_steps": 200,
-                "learning_rate": 2.0
-            }),
+             common_kwargs_false), (polynomial_decay, layers.polynomial_decay, {
+                 "learning_rate": 1.0,
+                 "decay_steps": 5,
+                 "cycle": True
+             }), (polynomial_decay, layers.polynomial_decay, {
+                 "learning_rate": 1.0,
+                 "decay_steps": 5,
+                 "cycle": False
+             }), (piecewise_decay, layers.piecewise_decay, {
+                 "boundaries": [3, 6, 9],
+                 "values": [0.1, 0.2, 0.3, 0.4]
+             }), (cosine_decay, layers.cosine_decay, {
+                 "learning_rate": 0.1,
+                 "step_each_epoch": 100,
+                 "epochs": 120
+             }), (noam_decay, layers.noam_decay, {
+                 "d_model": 0.01,
+                 "warmup_steps": 200,
+                 "learning_rate": 2.0
+             })
         ]
 
         for py_decay_fn, fluid_decay_fn, kwargs in decay_fns:
@@ -207,13 +302,7 @@ class TestLearningRateDecay(unittest.TestCase):
             self.check_decay(py_decay_fn, fluid_decay_fn, kwargs)
 
 
-def linear_lr_warmup(global_step, warmup_steps, start_lr, end_lr):
-    linear_step = end_lr - start_lr
-    decayed_lr = start_lr + linear_step * (global_step / warmup_steps)
-    return decayed_lr
-
-
-class TestLinearWamrupLearningRateDecay(TestLearningRateDecay):
+class TestLinearWamrupLearningRateDecay(unittest.TestCase):
     def check_decay_with_place(self, place, python_decay_fn, fluid_decay_fn,
                                kwargs):
         main_prog = fluid.Program()
@@ -304,37 +393,6 @@ class TestLinearWamrupLearningRateDecayWithScalarInput(unittest.TestCase):
         run_places(lr, start_lr, end_lr)
 
 
-class TestLinearWamrupLearningRateDecayDygraphMode(unittest.TestCase):
-    def test_dygraph_mode(self):
-        with fluid.dygraph.guard():
-            lr = fluid.layers.polynomial_decay(
-                learning_rate=1.0,
-                decay_steps=10,
-                end_learning_rate=0.0,
-                power=1.0)
-            lr = fluid.layers.linear_lr_warmup(
-                learning_rate=lr, warmup_steps=2, start_lr=0.0, end_lr=1.0)
-
-            right_result = [0.5, 0.9, 0.8, 0.7, 0.6]
-            for i in range(5):
-
-                t = lr()
-
-                self.assertTrue(
-                    np.allclose((t.numpy())[0].item(), right_result[i]))
-
-
-class TestLinearWamrupLearningRateDecayDygraphModeTypeCheck(unittest.TestCase):
-    def test_dygraph_mode(self):
-        with fluid.dygraph.guard():
-            with self.assertRaises(TypeError):
-                lr = fluid.layers.linear_lr_warmup(
-                    learning_rate="fake_lr",
-                    warmup_steps=2,
-                    start_lr=0.0,
-                    end_lr=1.0)
-
-
 def reduce_lr_on_plateau(decay_rate, threshold, cooldown, patience, m, n,
                          loss, var_list):
     def is_better(current, best, m, n):
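
For reference, the schedulers added in this patch reduce the learning rate purely as a function of the epoch counter. The minimal sketch below (plain Python, outside the patch; the helper names step_decay_lr and multi_step_decay_lr are illustrative only) mirrors the arithmetic used by StepDecay.get_lr(), MultiStepDecay.get_lr(), and the reference functions in the tests:

    import math


    def step_decay_lr(base_lr, epoch, step_size, decay_rate=0.1):
        # Decay once every `step_size` epochs: lr = base_lr * decay_rate ** (epoch // step_size).
        return base_lr * math.pow(decay_rate, epoch // step_size)


    def multi_step_decay_lr(base_lr, epoch, milestones, decay_rate=0.1):
        # Decay once for every milestone the epoch counter has already passed.
        for i, boundary in enumerate(milestones):
            if epoch < boundary:
                return base_lr * math.pow(decay_rate, i)
        return base_lr * math.pow(decay_rate, len(milestones))


    if __name__ == "__main__":
        # Reproduces the values printed in the StepDecay docstring example: 0.5, 0.05, 0.005.
        for epoch in range(9):
            print(epoch, step_decay_lr(0.5, epoch, step_size=3))

Both helpers are stateless, which is why the classes above only need to track `epoch_num` and recompute the learning rate in `get_lr()` each time `epoch()` is called.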