Unverified commit 63f242b6, authored by LoneRanger, committed by GitHub

replace PiecewiseDecay, StepDecay, MultiStepDecay, LambdaDecay with 2.0 version (#53992)

* replace PiecewiseDecay(LearningRateDecay) with PiecewiseDecay(LRScheduler)

* fix bug

* fix bug

* replace the StepDecay,MultiStepDecay,LambdaDecay with 2.0 version
Parent 54b86fd4
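For context, a minimal sketch of the 2.0-style usage this change migrates to. The layer, numbers, and training loop below are illustrative only and not taken from the diff; note that the `paddle.optimizer.lr` schedulers take `gamma` where the removed classes took `decay_rate`, and are advanced with `scheduler.step()` rather than `scheduler.epoch()`.

```python
import paddle

# Illustrative model and schedule; the commented-out schedulers can be
# swapped in the same way (values are placeholders, not from the diff).
linear = paddle.nn.Linear(10, 10)
scheduler = paddle.optimizer.lr.PiecewiseDecay(
    boundaries=[3, 6, 9], values=[0.1, 0.05, 0.01, 0.001]
)
# scheduler = paddle.optimizer.lr.StepDecay(learning_rate=0.5, step_size=3, gamma=0.1)
# scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=0.5, milestones=[3, 5], gamma=0.1)
# scheduler = paddle.optimizer.lr.LambdaDecay(learning_rate=0.5, lr_lambda=lambda e: 0.95**e)

adam = paddle.optimizer.Adam(
    learning_rate=scheduler, parameters=linear.parameters()
)

for epoch in range(9):
    x = paddle.uniform([10, 10], min=-1.0, max=1.0)
    loss = paddle.mean(linear(x))
    loss.backward()
    adam.step()
    adam.clear_grad()
    scheduler.step()  # advance the epoch-based schedule once per epoch
    print(f"epoch:{epoch}, lr={adam.get_lr()}")
```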
@@ -23,13 +23,9 @@ from ..data_feeder import check_type
 __all__ = [
     'NoamDecay',
-    'PiecewiseDecay',
     'PolynomialDecay',
     'LinearLrWarmup',
     'ReduceLROnPlateau',
-    'StepDecay',
-    'MultiStepDecay',
-    'LambdaDecay',
 ]
@@ -131,68 +127,6 @@ class LearningRateDecay:
         raise NotImplementedError()
 
 
-class PiecewiseDecay(LearningRateDecay):
-    """
-    :api_attr: imperative
-
-    Piecewise decay scheduler.
-
-    The algorithm can be described as the code below.
-
-    .. code-block:: text
-
-        boundaries = [10000, 20000]
-        values = [1.0, 0.5, 0.1]
-        if global_step < 10000:
-            learning_rate = 1.0
-        elif 10000 <= global_step < 20000:
-            learning_rate = 0.5
-        else:
-            learning_rate = 0.1
-
-    Parameters:
-        boundaries(list): A list of steps numbers. The type of element in the list is python int.
-        values(list): A list of learning rate values that will be picked during
-            different step boundaries. The type of element in the list is python float.
-        begin(int): The begin step to initialize the global_step in the description above.
-        step(int, optional): The step size used to calculate the new global_step in the description above.
-            The default value is 1.
-        dtype(str, optional): The data type used to create the learning rate variable. The data type can be set as
-            'float32', 'float64'. The default value is 'float32'.
-
-    Returns:
-        None.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            import paddle
-            boundaries = [10000, 20000]
-            values = [1.0, 0.5, 0.1]
-            with fluid.dygraph.guard():
-                emb = paddle.nn.Embedding(10, 10)
-                optimizer = fluid.optimizer.SGD(
-                    learning_rate=fluid.dygraph.PiecewiseDecay(boundaries, values, 0),
-                    parameter_list = emb.parameters() )
-    """
-
-    def __init__(self, boundaries, values, begin, step=1, dtype='float32'):
-        super().__init__(begin, step, dtype)
-        self.boundaries = boundaries
-        self.values = values
-
-        self.vars = []
-        for value in values:
-            self.vars.append(value)
-
-    def step(self):
-        for i in range(len(self.boundaries)):
-            if self.step_num < self.boundaries[i]:
-                return self.vars[i]
-        return self.create_lr_var(self.vars[len(self.values) - 1])
-
-
 class PolynomialDecay(LearningRateDecay):
     r"""
     :api_attr: imperative
@@ -742,241 +676,3 @@ class _LearningRateEpochDecay(LearningRateDecay):
     def get_lr(self):
         raise NotImplementedError
-
-
-class StepDecay(_LearningRateEpochDecay):
-    """
-    :api_attr: imperative
-
-    Decays the learning rate of ``optimizer`` by ``decay_rate`` every ``step_size`` number of epoch.
-
-    The algorithm can be described as the code below.
-
-    .. code-block:: text
-
-        learning_rate = 0.5
-        step_size = 30
-        decay_rate = 0.1
-
-        learning_rate = 0.5   if epoch < 30
-        learning_rate = 0.05  if 30 <= epoch < 60
-        learning_rate = 0.005 if 60 <= epoch < 90
-        ...
-
-    Parameters:
-        learning_rate (float|int): The initial learning rate. It can be set to python float or int number.
-        step_size (int): Period of learning rate decay.
-        decay_rate (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * decay_rate`` .
-            It should be less than 1.0. Default: 0.1.
-
-    Returns:
-        None.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            import numpy as np
-            import paddle
-            with fluid.dygraph.guard():
-                x = np.random.uniform(-1, 1, [10, 10]).astype("float32")
-                linear = paddle.nn.Linear(10, 10)
-                input = fluid.dygraph.to_variable(x)
-                scheduler = fluid.dygraph.StepDecay(0.5, step_size=3)
-                adam = fluid.optimizer.Adam(learning_rate = scheduler, parameter_list = linear.parameters())
-
-                for epoch in range(9):
-                    for batch_id in range(5):
-                        out = linear(input)
-                        loss = paddle.mean(out)
-                        adam.minimize(loss)
-                    scheduler.epoch()
-
-                    print("epoch:{}, current lr is {}" .format(epoch, adam.current_step_lr()))
-                    # epoch:0, current lr is 0.5
-                    # epoch:1, current lr is 0.5
-                    # epoch:2, current lr is 0.5
-                    # epoch:3, current lr is 0.05
-                    # epoch:4, current lr is 0.05
-                    # epoch:5, current lr is 0.05
-                    # epoch:6, current lr is 0.005
-                    # epoch:7, current lr is 0.005
-                    # epoch:8, current lr is 0.005
-    """
-
-    def __init__(self, learning_rate, step_size, decay_rate=0.1):
-        if not isinstance(step_size, int):
-            raise TypeError(
-                "The type of 'step_size' must be 'int', but received %s."
-                % type(step_size)
-            )
-        if decay_rate >= 1.0:
-            raise ValueError('decay_rate should be < 1.0.')
-
-        self.step_size = step_size
-        self.decay_rate = decay_rate
-        super().__init__(learning_rate)
-
-    def get_lr(self):
-        decay_rate = self.create_lr_var(self.decay_rate)
-        i = self.epoch_num // self.step_size
-        return self.base_lr * (decay_rate**i)
-
-
-class MultiStepDecay(_LearningRateEpochDecay):
-    """
-    :api_attr: imperative
-
-    Decays the learning rate of ``optimizer`` by ``decay_rate`` once ``epoch`` reaches one of the milestones.
-
-    The algorithm can be described as the code below.
-
-    .. code-block:: text
-
-        learning_rate = 0.5
-        milestones = [30, 50]
-        decay_rate = 0.1
-        if epoch < 30:
-            learning_rate = 0.5
-        elif epoch < 50:
-            learning_rate = 0.05
-        else:
-            learning_rate = 0.005
-
-    Parameters:
-        learning_rate (float|int): The initial learning rate. It can be set to python float or int number.
-        milestones (tuple|list): List or tuple of each boundaries. Must be increasing.
-        decay_rate (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * decay_rate`` .
-            It should be less than 1.0. Default: 0.1.
-
-    Returns:
-        None.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            import numpy as np
-            import paddle
-            with fluid.dygraph.guard():
-                x = np.random.uniform(-1, 1, [10, 10]).astype("float32")
-                linear = paddle.nn.Linear(10, 10)
-                input = fluid.dygraph.to_variable(x)
-                scheduler = fluid.dygraph.MultiStepDecay(0.5, milestones=[3, 5])
-                adam = fluid.optimizer.Adam(learning_rate = scheduler, parameter_list = linear.parameters())
-
-                for epoch in range(6):
-                    for batch_id in range(5):
-                        out = linear(input)
-                        loss = paddle.mean(out)
-                        adam.minimize(loss)
-                    scheduler.epoch()
-
-                    print("epoch:{}, current lr is {}" .format(epoch, adam.current_step_lr()))
-                    # epoch:0, current lr is 0.5
-                    # epoch:1, current lr is 0.5
-                    # epoch:2, current lr is 0.5
-                    # epoch:3, current lr is 0.05
-                    # epoch:4, current lr is 0.05
-                    # epoch:5, current lr is 0.005
-    """
-
-    def __init__(self, learning_rate, milestones, decay_rate=0.1):
-        if not isinstance(milestones, (tuple, list)):
-            raise TypeError(
-                "The type of 'milestones' in 'MultiStepDecay' must be 'tuple, list', but received %s."
-                % type(milestones)
-            )
-
-        if not all(
-            [
-                milestones[i] < milestones[i + 1]
-                for i in range(len(milestones) - 1)
-            ]
-        ):
-            raise ValueError('The elements of milestones must be incremented')
-        if decay_rate >= 1.0:
-            raise ValueError('decay_rate should be < 1.0.')
-
-        self.milestones = milestones
-        self.decay_rate = decay_rate
-        super().__init__(learning_rate)
-
-    def get_lr(self):
-        decay_rate = self.create_lr_var(self.decay_rate)
-        for i in range(len(self.milestones)):
-            if self.epoch_num < self.milestones[i]:
-                return self.base_lr * (decay_rate**i)
-
-        return self.base_lr * (decay_rate ** len(self.milestones))
-
-
-class LambdaDecay(_LearningRateEpochDecay):
-    """
-    :api_attr: imperative
-
-    Sets the learning rate of ``optimizer`` to the initial lr times a multiplicative factor, and this multiplicative
-    factor is computed by function ``lr_lambda`` . ``lr_lambda`` is function which receives ``epoch`` .
-
-    The algorithm can be described as the code below.
-
-    .. code-block:: text
-
-        learning_rate = 0.5        # init learning_rate
-        lr_lambda = lambda epoch: 0.95 ** epoch
-
-        learning_rate = 0.5        # epoch 0
-        learning_rate = 0.475      # epoch 1
-        learning_rate = 0.45125    # epoch 2
-
-    Parameters:
-        learning_rate (float|int): The initial learning rate. It can be set to python float or int number.
-        lr_lambda (function): A function which computes a multiplicative factor given an integer parameter ``epoch`` , and
-            then multiply the initial learning rate by this multiplicative factor.
-
-    Returns:
-        None.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            import numpy as np
-            import paddle
-            with fluid.dygraph.guard():
-                x = np.random.uniform(-1, 1, [10, 10]).astype("float32")
-                linear = paddle.nn.Linear(10, 10)
-                input = fluid.dygraph.to_variable(x)
-                scheduler = fluid.dygraph.LambdaDecay(0.5, lr_lambda=lambda x: 0.95**x)
-                adam = fluid.optimizer.Adam(learning_rate = scheduler, parameter_list = linear.parameters())
-
-                for epoch in range(6):
-                    for batch_id in range(5):
-                        out = linear(input)
-                        loss = paddle.mean(out)
-                        adam.minimize(loss)
-                    scheduler.epoch()
-
-                    print("epoch:%d, current lr is %f" .format(epoch, adam.current_step_lr()))
-                    # epoch:0, current lr is 0.5
-                    # epoch:1, current lr is 0.475
-                    # epoch:2, current lr is 0.45125
-    """
-
-    def __init__(self, learning_rate, lr_lambda):
-        if not callable(lr_lambda):
-            raise TypeError(
-                "The type of 'lr_lambda' in 'LambdaDecay' must be 'function', but received %s."
-                % type(lr_lambda)
-            )
-
-        self.lr_lambda = lr_lambda
-        super().__init__(learning_rate)
-
-    def get_lr(self):
-        base_lr = self.create_lr_var(self.base_lr)
-
-        return self.base_lr * self.lr_lambda(self.epoch_num)
@@ -410,10 +410,10 @@ def piecewise_decay(boundaries, values):
             paddle.enable_static()
             boundaries = [10000, 20000]
             values = [1.0, 0.5, 0.1]
-            optimizer = fluid.optimizer.Momentum(
+            optimizer = paddle.optimizer.Momentum(
                 momentum=0.9,
-                learning_rate=fluid.layers.piecewise_decay(boundaries=boundaries, values=values),
-                regularization=paddle.regularizer.L2Decay(1e-4))
+                learning_rate=paddle.optimizer.lr.PiecewiseDecay(boundaries, values),
+                weight_decay=paddle.regularizer.L2Decay(1e-4))
    """
@@ -422,7 +422,7 @@ def piecewise_decay(boundaries, values):
         raise ValueError("len(values) - len(boundaries) should be 1")
 
     if in_dygraph_mode():
-        decay = imperate_lr.PiecewiseDecay(boundaries, values, 0)
+        decay = paddle.optimizer.lr.PiecewiseDecay(boundaries, values)
         return decay
     else:
         global_step = _decay_step_counter()
@@ -98,6 +98,8 @@ class LRScheduler:
                     type(learning_rate)
                 )
             )
+        if learning_rate < 0:
+            raise ValueError(f"Invalid learning rate: {learning_rate}")
         self.base_lr = float(learning_rate)
         self.last_lr = float(learning_rate)
         self.last_epoch = last_epoch
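The check added to `LRScheduler.__init__` above makes an invalid setting fail at construction time; a minimal sketch with a hypothetical negative value:

```python
import paddle

# After this change, a negative base learning rate is rejected immediately
# by LRScheduler.__init__ (the -0.1 here is just an illustrative bad value).
try:
    paddle.optimizer.lr.StepDecay(learning_rate=-0.1, step_size=3)
except ValueError as err:
    print(err)  # Invalid learning rate: -0.1
```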
@@ -376,9 +376,9 @@ def dyfunc_NoamDecay():
 def dyfunc_PiecewiseDecay():
     boundaries = [10000, 20000]
     values = [1.0, 0.5, 0.1]
-    pd = fluid.dygraph.PiecewiseDecay(boundaries, values, begin=0)
+    pd = paddle.optimizer.lr.PiecewiseDecay(boundaries, values)
     lr = pd()
-    return lr
+    return paddle.to_tensor(lr)
 
 
 def dyfunc_PolynomialDecay():
@@ -94,11 +94,11 @@ def train(to_static):
     learning_rate = cfg.learning_rate
     values = [learning_rate * (gamma**i) for i in range(step_num + 1)]
 
-    lr = fluid.dygraph.PiecewiseDecay(
-        boundaries=boundaries, values=values, begin=0
+    lr = paddle.optimizer.lr.PiecewiseDecay(
+        boundaries=boundaries, values=values
     )
-    lr = fluid.layers.linear_lr_warmup(
+    lr = paddle.optimizer.lr.LinearWarmup(
         learning_rate=lr,
         warmup_steps=cfg.warm_up_iter,
         start_lr=0.0,
@@ -262,7 +262,7 @@ class TestImperativeOptimizerPiecewiseDecay(TestImperativeOptimizerBase):
     def get_optimizer(self):
         bd = [3, 6, 9]
         optimizer = SGDOptimizer(
-            learning_rate=fluid.layers.piecewise_decay(
+            learning_rate=paddle.optimizer.lr.PiecewiseDecay(
                 boundaries=bd,
                 values=[0.1 * (0.1**i) for i in range(len(bd) + 1)],
             )
@@ -470,20 +470,20 @@ class TestOptimizerLearningRate(unittest.TestCase):
             bd = [2, 4, 6, 8]
             value = [0.2, 0.4, 0.6, 0.8, 1.0]
 
-            adam = fluid.optimizer.Adam(
-                fluid.dygraph.PiecewiseDecay(bd, value, 0),
-                parameter_list=linear.parameters(),
+            scheduler = paddle.optimizer.lr.PiecewiseDecay(bd, value)
+            adam = paddle.optimizer.Adam(
+                scheduler,
+                parameters=linear.parameters(),
             )
 
-            np.testing.assert_allclose(
-                adam.current_step_lr(), 0.2, rtol=1e-06, atol=0.0
-            )
+            np.testing.assert_allclose(adam.get_lr(), 0.2, rtol=1e-06, atol=0.0)
 
             ret = [0.2, 0.2, 0.4, 0.4, 0.6, 0.6, 0.8, 0.8, 1.0, 1.0, 1.0, 1.0]
             for i in range(12):
                 adam.minimize(loss)
-                lr = adam.current_step_lr()
+                lr = adam.get_lr()
+                adam.step()
+                scheduler.step()
                 np.testing.assert_allclose(lr, ret[i], rtol=1e-06, atol=0.0)
 
     def test_lr_decay_natural_exp(self):
@@ -127,7 +127,7 @@ class TestLearningRateDecayDygraph(unittest.TestCase):
                 learning_rate=0.1,
                 gamma=0.5,
             )
-            Step_scheduler = fluid.dygraph.StepDecay(0.5, step_size=3)
+            Step_scheduler = paddle.optimizer.lr.StepDecay(0.5, step_size=3)
             Reducelr_scheduler = fluid.dygraph.ReduceLROnPlateau(
                 learning_rate=1.0, decay_rate=0.5, patience=5, cooldown=3
             )
@@ -154,7 +154,7 @@ class TestLearningRateDecayDygraph(unittest.TestCase):
                 adam3.minimize(loss)
                 linear.clear_gradients()
 
-                Step_scheduler.epoch()
+                Step_scheduler.get_lr()
                 Reducelr_scheduler.step(loss)
 
             paddle.save(linear.state_dict(), "save_path.pdparams")
@@ -163,7 +163,9 @@ class TestLearningRateDecayDygraph(unittest.TestCase):
                 learning_rate=0.1,
                 gamma=0.5,
            )
-            Step_scheduler_test = fluid.dygraph.StepDecay(0.5, step_size=3)
+            Step_scheduler_test = paddle.optimizer.lr.StepDecay(
+                0.5, step_size=3
+            )
             Reducelr_scheduler_test = fluid.dygraph.ReduceLROnPlateau(
                 learning_rate=1.0, decay_rate=0.5, patience=5, cooldown=3
            )
@@ -189,8 +191,8 @@ class TestLearningRateDecayDygraph(unittest.TestCase):
            )
            adam_test.set_dict(opt_state)
            self.assertEqual(
-                adam_test._learning_rate.epoch_num,
-                adam2._learning_rate.epoch_num,
+                adam_test._learning_rate.last_epoch,
+                adam2._learning_rate.last_epoch,
                "epoch_num is different before and after set_dict",
            )
            self.assertEqual(
@@ -288,19 +290,20 @@ class TestLearningRateDecayDygraph(unittest.TestCase):
             decay_rate = 0.2
             linear = paddle.nn.Linear(10, 10)
 
-            scheduler = fluid.dygraph.MultiStepDecay(
+            scheduler = paddle.optimizer.lr.MultiStepDecay(
                 learning_rate, milestones, decay_rate
             )
-            adam = fluid.optimizer.AdamOptimizer(
-                learning_rate=scheduler, parameter_list=linear.parameters()
+            adam = paddle.optimizer.Adam(
+                learning_rate=scheduler, parameters=linear.parameters()
             )
             for epoch in range(10):
                 right_result = multi_step_decay(
                     epoch, learning_rate, milestones, decay_rate
                 )
-                fluid_result = adam.current_step_lr()
-                scheduler.epoch()
+                fluid_result = adam.get_lr()
+                adam.step()
+                scheduler.step()
                 self.assertAlmostEqual(
                     right_result,
                     fluid_result,
@@ -310,35 +313,36 @@ class TestLearningRateDecayDygraph(unittest.TestCase):
                )
 
            with self.assertRaises(ValueError):
-                lr = fluid.dygraph.MultiStepDecay(
+                lr = paddle.optimizer.lr.MultiStepDecay(
                    learning_rate, [30, 50, 20], 0.1
                )
            with self.assertRaises(ValueError):
-                lr = fluid.dygraph.MultiStepDecay(
+                lr = paddle.optimizer.lr.MultiStepDecay(
                    learning_rate, [20, 30, 50], 1
                )
            with self.assertRaises(TypeError):
-                lr = fluid.dygraph.MultiStepDecay("test", [20, 30, 50])
+                lr = paddle.optimizer.lr.MultiStepDecay("test", [20, 30, 50])
            with self.assertRaises(ValueError):
-                lr = fluid.dygraph.MultiStepDecay(-1, [20, 30, 50])
+                lr = paddle.optimizer.lr.MultiStepDecay(-1, [20, 30, 50])
 
    def test_StepDecay(self):
        with fluid.dygraph.guard():
            learning_rate = 0.5
            step_size = 3
            decay_rate = 0.2
-            scheduler = fluid.dygraph.StepDecay(
+            scheduler = paddle.optimizer.lr.StepDecay(
                learning_rate, step_size, decay_rate
            )
            for epoch in range(10):
                right_result = step_decay(
                    epoch, learning_rate, step_size, decay_rate
                )
-                fluid_result = scheduler().numpy().item()
-                scheduler.epoch()
+                fluid_result = scheduler()
+                scheduler.get_lr()
+                scheduler.step()
                self.assertAlmostEqual(
                    right_result,
                    fluid_result,
@@ -348,16 +352,18 @@ class TestLearningRateDecayDygraph(unittest.TestCase):
                )
 
            with self.assertRaises(TypeError):
-                lr = fluid.dygraph.StepDecay(learning_rate, "test", 0.1)
+                lr = paddle.optimizer.lr.StepDecay(learning_rate, "test", 0.1)
            with self.assertRaises(ValueError):
-                lr = fluid.dygraph.StepDecay(learning_rate, 20, 2)
+                lr = paddle.optimizer.lr.StepDecay(learning_rate, 20, 2)
 
    def test_LambdaDecay(self):
        with fluid.dygraph.guard():
            learning_rate = 0.5
            lr_lambda = lambda x: 0.95**x
-            scheduler = fluid.dygraph.LambdaDecay(learning_rate, lr_lambda)
+            scheduler = paddle.optimizer.lr.LambdaDecay(
+                learning_rate, lr_lambda
+            )
 
            linear = paddle.nn.Linear(10, 10)
            adam = fluid.optimizer.Adam(
@@ -366,8 +372,9 @@ class TestLearningRateDecayDygraph(unittest.TestCase):
            for epoch in range(30):
                right_result = lambda_decay(epoch, learning_rate, lr_lambda)
-                fluid_result = scheduler().numpy().item()
-                scheduler.epoch()
+                fluid_result = scheduler()
+                scheduler.get_lr()
+                scheduler.step()
                self.assertAlmostEqual(
                    right_result,
                    fluid_result,
@@ -377,7 +384,7 @@ class TestLearningRateDecayDygraph(unittest.TestCase):
                )
 
            with self.assertRaises(TypeError):
-                lr = fluid.dygraph.LambdaDecay(learning_rate, "test")
+                lr = paddle.optimizer.lr.LambdaDecay(learning_rate, "test")
 
 
 class TestLearningRateDecay(unittest.TestCase):