diff --git a/ppcls/optimizer/learning_rate.py b/ppcls/optimizer/learning_rate.py
index 1a4561133f948831b9ca0d69821a3394f092fae7..a2a850be6bc622074b3f6e03078171c0890386e7 100644
--- a/ppcls/optimizer/learning_rate.py
+++ b/ppcls/optimizer/learning_rate.py
@@ -15,117 +15,218 @@
 from __future__ import (absolute_import, division, print_function,
                         unicode_literals)
 
-from paddle.optimizer import lr
-from paddle.optimizer.lr import LRScheduler
+from abc import abstractmethod
+from typing import Union
+from paddle.optimizer import lr
 from ppcls.utils import logger
 
 
-class Linear(object):
+class LRBase(object):
+    """Base class for custom learning rates
+
+    Args:
+        epochs (int): total epoch(s)
+        step_each_epoch (int): number of iterations within an epoch
+        learning_rate (float): learning rate
+        warmup_epoch (int): number of warmup epoch(s)
+        warmup_start_lr (float): start learning rate within warmup
+        last_epoch (int): last epoch
+        by_epoch (bool): learning rate decays by epoch when by_epoch is True, else by iter
+        verbose (bool): If True, prints a message to stdout for each update. Defaults to False
     """
-    Linear learning rate decay
+
+    def __init__(self,
+                 epochs: int,
+                 step_each_epoch: int,
+                 learning_rate: float,
+                 warmup_epoch: int,
+                 warmup_start_lr: float,
+                 last_epoch: int,
+                 by_epoch: bool,
+                 verbose: bool=False) -> None:
+        """Initialize and record the necessary parameters
+        """
+        super(LRBase, self).__init__()
+        if warmup_epoch >= epochs:
+            msg = f"When using warm up, the value of \"Global.epochs\" must be greater than value of \"Optimizer.lr.warmup_epoch\". The value of \"Optimizer.lr.warmup_epoch\" has been set to {epochs}."
+            logger.warning(msg)
+            warmup_epoch = epochs
+        self.epochs = epochs
+        self.step_each_epoch = step_each_epoch
+        self.learning_rate = learning_rate
+        self.warmup_epoch = warmup_epoch
+        self.warmup_steps = round(
+            self.warmup_epoch *
+            self.step_each_epoch) if by_epoch else self.warmup_epoch
+        self.warmup_start_lr = warmup_start_lr
+        self.last_epoch = last_epoch
+        self.by_epoch = by_epoch
+        self.verbose = verbose
+
+    @abstractmethod
+    def __call__(self, *kargs, **kwargs) -> lr.LRScheduler:
+        """generate an learning rate scheduler
+
+        Returns:
+            lr.LinearWarmup: learning rate scheduler
+        """
+        pass
+
+    def linear_warmup(
+            self,
+            learning_rate: Union[float, lr.LRScheduler]) -> lr.LinearWarmup:
+        """Add an Linear Warmup before learning_rate
+
+        Args:
+            learning_rate (Union[float, lr.LRScheduler]): original learning rate without warmup
+
+        Returns:
+            lr.LinearWarmup: learning rate scheduler with warmup
+        """
+        warmup_lr = lr.LinearWarmup(
+            learning_rate=learning_rate,
+            warmup_steps=self.warmup_steps,
+            start_lr=self.warmup_start_lr,
+            end_lr=self.learning_rate,
+            last_epoch=self.last_epoch,
+            verbose=self.verbose)
+        return warmup_lr
+
+
+class Constant(LRBase):
+    """Constant learning rate
+
     Args:
-        lr (float): The initial learning rate. It is a python float number.
-        epochs(int): The decay step size. It determines the decay cycle.
-        end_lr(float, optional): The minimum final learning rate. Default: 0.0001.
-        power(float, optional): Power of polynomial. Default: 1.0.
-        warmup_epoch(int): The epoch numbers for LinearWarmup. Default: 0.
-        warmup_start_lr(float): Initial learning rate of warm up. Default: 0.0.
-        last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
+        epochs (int): total epoch(s)
+        step_each_epoch (int): number of iterations within an epoch
+        learning_rate (float): learning rate
+        warmup_epoch (int): number of warmup epoch(s)
+        warmup_start_lr (float): start learning rate within warmup
+        last_epoch (int): last epoch
+        by_epoch (bool): learning rate decays by epoch when by_epoch is True, else by iter
     """
 
     def __init__(self,
+                 epochs,
+                 step_each_epoch,
                  learning_rate,
+                 warmup_epoch=0,
+                 warmup_start_lr=0.0,
+                 last_epoch=-1,
+                 by_epoch=False,
+                 **kwargs):
+        super(Constant, self).__init__(epochs, step_each_epoch, learning_rate,
+                                       warmup_epoch, warmup_start_lr,
+                                       last_epoch, by_epoch)
+
+    def __call__(self):
+        learning_rate = lr.LRScheduler(
+            learning_rate=self.learning_rate, last_epoch=self.last_epoch)
+
+        def make_get_lr():
+            def get_lr(self):
+                return self.learning_rate
+
+            return get_lr
+
+        setattr(learning_rate, "get_lr", make_get_lr())
+
+        if self.warmup_steps > 0:
+            learning_rate = self.linear_warmup(learning_rate)
+
+        setattr(learning_rate, "by_epoch", self.by_epoch)
+        return learning_rate
+
+
+class Linear(LRBase):
+    """Linear learning rate decay
+
+    Args:
+        epochs (int): total epoch(s)
+        step_each_epoch (int): number of iterations within an epoch
+        learning_rate (float): learning rate
+        end_lr (float, optional): The minimum final learning rate. Defaults to 0.0.
+        power (float, optional): Power of polynomial. Defaults to 1.0.
+        warmup_epoch (int): number of warmup epoch(s)
+        warmup_start_lr (float): start learning rate within warmup
+        last_epoch (int): last epoch
+        by_epoch (bool): learning rate decays by epoch when by_epoch is True, else by iter
+    """
+
+    def __init__(self,
                  epochs,
                  step_each_epoch,
+                 learning_rate,
                  end_lr=0.0,
                  power=1.0,
+                 cycle=False,
                  warmup_epoch=0,
                  warmup_start_lr=0.0,
                  last_epoch=-1,
+                 by_epoch=False,
                  **kwargs):
-        super().__init__()
-        if warmup_epoch >= epochs:
-            msg = f"When using warm up, the value of \"Global.epochs\" must be greater than value of \"Optimizer.lr.warmup_epoch\". The value of \"Optimizer.lr.warmup_epoch\" has been set to {epochs}."
-            logger.warning(msg)
-            warmup_epoch = epochs
-        self.learning_rate = learning_rate
-        self.steps = (epochs - warmup_epoch) * step_each_epoch
+        super(Linear, self).__init__(epochs, step_each_epoch, learning_rate,
+                                     warmup_epoch, warmup_start_lr, last_epoch,
+                                     by_epoch)
+        self.decay_steps = (epochs - self.warmup_epoch) * step_each_epoch
         self.end_lr = end_lr
         self.power = power
-        self.last_epoch = last_epoch
-        self.warmup_steps = round(warmup_epoch * step_each_epoch)
-        self.warmup_start_lr = warmup_start_lr
+        self.cycle = cycle
+        self.warmup_steps = round(self.warmup_epoch * step_each_epoch)
+        if self.by_epoch:
+            self.decay_steps = self.epochs - self.warmup_epoch
 
     def __call__(self):
         learning_rate = lr.PolynomialDecay(
             learning_rate=self.learning_rate,
-            decay_steps=self.steps,
+            decay_steps=self.decay_steps,
             end_lr=self.end_lr,
             power=self.power,
+            cycle=self.cycle,
             last_epoch=self.
-            last_epoch) if self.steps > 0 else self.learning_rate
-        if self.warmup_steps > 0:
-            learning_rate = lr.LinearWarmup(
-                learning_rate=learning_rate,
-                warmup_steps=self.warmup_steps,
-                start_lr=self.warmup_start_lr,
-                end_lr=self.learning_rate,
-                last_epoch=self.last_epoch)
-        return learning_rate
+            last_epoch) if self.decay_steps > 0 else self.learning_rate
+
+        if self.warmup_steps > 0:
+            learning_rate = self.linear_warmup(learning_rate)
 
-
-class Constant(LRScheduler):
-    """
-    Constant learning rate
-    Args:
-        lr (float): The initial learning rate. It is a python float number.
-        last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
-    """
+        setattr(learning_rate, "by_epoch", self.by_epoch)
+        return learning_rate
 
-    def __init__(self, learning_rate, last_epoch=-1, **kwargs):
-        self.learning_rate = learning_rate
-        self.last_epoch = last_epoch
-        super().__init__()
 
-    def get_lr(self):
-        return self.learning_rate
+class Cosine(LRBase):
+    """Cosine learning rate decay
 
+    ``lr = 0.05 * (math.cos(epoch * (math.pi / epochs)) + 1)``
 
-class Cosine(object):
-    """
-    Cosine learning rate decay
-    lr = 0.05 * (math.cos(epoch * (math.pi / epochs)) + 1)
     Args:
-        lr(float): initial learning rate
-        step_each_epoch(int): steps each epoch
-        epochs(int): total training epochs
-        eta_min(float): Minimum learning rate. Default: 0.0.
-        warmup_epoch(int): The epoch numbers for LinearWarmup. Default: 0.
-        warmup_start_lr(float): Initial learning rate of warm up. Default: 0.0.
-        last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
+        epochs (int): total epoch(s)
+        step_each_epoch (int): number of iterations within an epoch
+        learning_rate (float): learning rate
+        eta_min (float, optional): Minimum learning rate. Defaults to 0.0.
+        warmup_epoch (int, optional): The epoch numbers for LinearWarmup. Defaults to 0.
+        warmup_start_lr (float, optional): start learning rate within warmup. Defaults to 0.0.
+        last_epoch (int, optional): last epoch. Defaults to -1.
+        by_epoch (bool, optional): learning rate decays by epoch when by_epoch is True, else by iter. Defaults to False.
     """
 
     def __init__(self,
-                 learning_rate,
-                 step_each_epoch,
                  epochs,
+                 step_each_epoch,
+                 learning_rate,
                  eta_min=0.0,
                  warmup_epoch=0,
                  warmup_start_lr=0.0,
                  last_epoch=-1,
+                 by_epoch=False,
                  **kwargs):
-        super().__init__()
-        if warmup_epoch >= epochs:
-            msg = f"When using warm up, the value of \"Global.epochs\" must be greater than value of \"Optimizer.lr.warmup_epoch\". The value of \"Optimizer.lr.warmup_epoch\" has been set to {epochs}."
-            logger.warning(msg)
-            warmup_epoch = epochs
-        self.learning_rate = learning_rate
-        self.T_max = (epochs - warmup_epoch) * step_each_epoch
+        super(Cosine, self).__init__(epochs, step_each_epoch, learning_rate,
+                                     warmup_epoch, warmup_start_lr, last_epoch,
+                                     by_epoch)
+        self.T_max = (self.epochs - self.warmup_epoch) * self.step_each_epoch
         self.eta_min = eta_min
-        self.last_epoch = last_epoch
-        self.warmup_steps = round(warmup_epoch * step_each_epoch)
-        self.warmup_start_lr = warmup_start_lr
+        if self.by_epoch:
+            self.T_max = self.epochs - self.warmup_epoch
 
     def __call__(self):
         learning_rate = lr.CosineAnnealingDecay(
@@ -134,51 +235,47 @@ class Cosine(object):
             eta_min=self.eta_min,
             last_epoch=self.
             last_epoch) if self.T_max > 0 else self.learning_rate
+
         if self.warmup_steps > 0:
-            learning_rate = lr.LinearWarmup(
-                learning_rate=learning_rate,
-                warmup_steps=self.warmup_steps,
-                start_lr=self.warmup_start_lr,
-                end_lr=self.learning_rate,
-                last_epoch=self.last_epoch)
+            learning_rate = self.linear_warmup(learning_rate)
+
+        setattr(learning_rate, "by_epoch", self.by_epoch)
         return learning_rate
 
 
-class Step(object):
-    """
-    Piecewise learning rate decay
+class Step(LRBase):
+    """Step learning rate decay
+
     Args:
-        step_each_epoch(int): steps each epoch
-        learning_rate (float): The initial learning rate. It is a python float number.
+        epochs (int): total epoch(s)
+        step_each_epoch (int): number of iterations within an epoch
+        learning_rate (float): learning rate
         step_size (int): the interval to update.
-        gamma (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * gamma`` .
-            It should be less than 1.0. Default: 0.1.
-        warmup_epoch(int): The epoch numbers for LinearWarmup. Default: 0.
-        warmup_start_lr(float): Initial learning rate of warm up. Default: 0.0.
-        last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
+        gamma (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * gamma``. It should be less than 1.0. Default: 0.1.
+        warmup_epoch (int, optional): The epoch numbers for LinearWarmup. Defaults to 0.
+        warmup_start_lr (float, optional): start learning rate within warmup. Defaults to 0.0.
+        last_epoch (int, optional): last epoch. Defaults to -1.
+        by_epoch (bool, optional): learning rate decays by epoch when by_epoch is True, else by iter. Defaults to False.
     """
 
     def __init__(self,
+                 epochs,
+                 step_each_epoch,
                  learning_rate,
                  step_size,
-                 step_each_epoch,
-                 epochs,
                  gamma,
                  warmup_epoch=0,
                  warmup_start_lr=0.0,
                  last_epoch=-1,
+                 by_epoch=False,
                  **kwargs):
-        super().__init__()
-        if warmup_epoch >= epochs:
-            msg = f"When using warm up, the value of \"Global.epochs\" must be greater than value of \"Optimizer.lr.warmup_epoch\". The value of \"Optimizer.lr.warmup_epoch\" has been set to {epochs}."
-            logger.warning(msg)
-            warmup_epoch = epochs
-        self.step_size = step_each_epoch * step_size
-        self.learning_rate = learning_rate
+        super(Step, self).__init__(epochs, step_each_epoch, learning_rate,
+                                   warmup_epoch, warmup_start_lr, last_epoch,
+                                   by_epoch)
+        self.step_size = step_size * step_each_epoch
         self.gamma = gamma
-        self.last_epoch = last_epoch
-        self.warmup_steps = round(warmup_epoch * step_each_epoch)
-        self.warmup_start_lr = warmup_start_lr
+        if self.by_epoch:
+            self.step_size = step_size
 
     def __call__(self):
         learning_rate = lr.StepDecay(
@@ -186,177 +283,102 @@ class Step(object):
             step_size=self.step_size,
             gamma=self.gamma,
             last_epoch=self.last_epoch)
+
         if self.warmup_steps > 0:
-            learning_rate = lr.LinearWarmup(
-                learning_rate=learning_rate,
-                warmup_steps=self.warmup_steps,
-                start_lr=self.warmup_start_lr,
-                end_lr=self.learning_rate,
-                last_epoch=self.last_epoch)
+            learning_rate = self.linear_warmup(learning_rate)
+
+        setattr(learning_rate, "by_epoch", self.by_epoch)
         return learning_rate
 
 
-class Piecewise(object):
-    """
-    Piecewise learning rate decay
+class Piecewise(LRBase):
+    """Piecewise learning rate decay
+
     Args:
-        boundaries(list): A list of steps numbers. The type of element in the list is python int.
-        values(list): A list of learning rate values that will be picked during different epoch boundaries.
-            The type of element in the list is python float.
-        warmup_epoch(int): The epoch numbers for LinearWarmup. Default: 0.
-        warmup_start_lr(float): Initial learning rate of warm up. Default: 0.0.
-        by_epoch(bool): Whether lr decay by epoch. Default: False.
-        last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
+        epochs (int): total epoch(s)
+        step_each_epoch (int): number of iterations within an epoch
+        decay_epochs (List[int]): A list of steps numbers. The type of element in the list is python int.
+        values (List[float]): A list of learning rate values that will be picked during different epoch boundaries.
+        warmup_epoch (int, optional): The epoch numbers for LinearWarmup. Defaults to 0.
+        warmup_start_lr (float, optional): start learning rate within warmup. Defaults to 0.0.
+        last_epoch (int, optional): last epoch. Defaults to -1.
+        by_epoch (bool, optional): learning rate decays by epoch when by_epoch is True, else by iter. Defaults to False.
    """
 
     def __init__(self,
+                 epochs,
                  step_each_epoch,
                  decay_epochs,
                  values,
-                 epochs,
                  warmup_epoch=0,
                  warmup_start_lr=0.0,
-                 by_epoch=False,
                  last_epoch=-1,
+                 by_epoch=False,
                  **kwargs):
-        super().__init__()
-        if warmup_epoch >= epochs:
-            msg = f"When using warm up, the value of \"Global.epochs\" must be greater than value of \"Optimizer.lr.warmup_epoch\". The value of \"Optimizer.lr.warmup_epoch\" has been set to {epochs}."
-            logger.warning(msg)
-            warmup_epoch = epochs
-        self.boundaries_steps = [step_each_epoch * e for e in decay_epochs]
-        self.boundaries_epoch = decay_epochs
+        super(Piecewise,
+              self).__init__(epochs, step_each_epoch, values[0], warmup_epoch,
+                             warmup_start_lr, last_epoch, by_epoch)
         self.values = values
-        self.last_epoch = last_epoch
-        self.warmup_steps = round(warmup_epoch * step_each_epoch)
-        self.warmup_epoch = warmup_epoch
-        self.warmup_start_lr = warmup_start_lr
-        self.by_epoch = by_epoch
+        self.boundaries_steps = [e * step_each_epoch for e in decay_epochs]
+        if self.by_epoch is True:
+            self.boundaries_steps = decay_epochs
 
     def __call__(self):
-        if self.by_epoch:
-            learning_rate = lr.PiecewiseDecay(
-                boundaries=self.boundaries_epoch,
-                values=self.values,
-                last_epoch=self.last_epoch)
-            if self.warmup_epoch > 0:
-                learning_rate = lr.LinearWarmup(
-                    learning_rate=learning_rate,
-                    warmup_steps=self.warmup_epoch,
-                    start_lr=self.warmup_start_lr,
-                    end_lr=self.values[0],
-                    last_epoch=self.last_epoch)
-        else:
-            learning_rate = lr.PiecewiseDecay(
-                boundaries=self.boundaries_steps,
-                values=self.values,
-                last_epoch=self.last_epoch)
-            if self.warmup_steps > 0:
-                learning_rate = lr.LinearWarmup(
-                    learning_rate=learning_rate,
-                    warmup_steps=self.warmup_steps,
-                    start_lr=self.warmup_start_lr,
-                    end_lr=self.values[0],
-                    last_epoch=self.last_epoch)
+        learning_rate = lr.PiecewiseDecay(
+            boundaries=self.boundaries_steps,
+            values=self.values,
+            last_epoch=self.last_epoch)
+
+        if self.warmup_steps > 0:
+            learning_rate = self.linear_warmup(learning_rate)
+        setattr(learning_rate, "by_epoch", self.by_epoch)
         return learning_rate
 
 
-class MultiStepDecay(LRScheduler):
-    """
-    Update the learning rate by ``gamma`` once ``epoch`` reaches one of the milestones.
-    The algorithm can be described as the code below.
-    .. code-block:: text
-        learning_rate = 0.5
-        milestones = [30, 50]
-        gamma = 0.1
-        if epoch < 30:
-            learning_rate = 0.5
-        elif epoch < 50:
-            learning_rate = 0.05
-        else:
-            learning_rate = 0.005
+class MultiStepDecay(LRBase):
+    """MultiStepDecay learning rate decay
+
     Args:
-        learning_rate (float): The initial learning rate. It is a python float number.
-        milestones (tuple|list): List or tuple of each boundaries. Must be increasing.
-        gamma (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * gamma`` .
-            It should be less than 1.0. Default: 0.1.
-        last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
-        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
-
-    Returns:
-        ``MultiStepDecay`` instance to schedule learning rate.
-    Examples:
-
-        .. code-block:: python
-            import paddle
-            import numpy as np
-            # train on default dynamic graph mode
-            linear = paddle.nn.Linear(10, 10)
-            scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8, verbose=True)
-            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
-            for epoch in range(20):
-                for batch_id in range(5):
-                    x = paddle.uniform([10, 10])
-                    out = linear(x)
-                    loss = paddle.mean(out)
-                    loss.backward()
-                    sgd.step()
-                    sgd.clear_gradients()
-                scheduler.step()    # If you update learning rate each step
-            # scheduler.step()        # If you update learning rate each epoch
-            # train on static graph mode
-            paddle.enable_static()
-            main_prog = paddle.static.Program()
-            start_prog = paddle.static.Program()
-            with paddle.static.program_guard(main_prog, start_prog):
-                x = paddle.static.data(name='x', shape=[None, 4, 5])
-                y = paddle.static.data(name='y', shape=[None, 4, 5])
-                z = paddle.static.nn.fc(x, 100)
-                loss = paddle.mean(z)
-                scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8, verbose=True)
-                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
-                sgd.minimize(loss)
-            exe = paddle.static.Executor()
-            exe.run(start_prog)
-            for epoch in range(20):
-                for batch_id in range(5):
-                    out = exe.run(
-                        main_prog,
-                        feed={
-                            'x': np.random.randn(3, 4, 5).astype('float32'),
-                            'y': np.random.randn(3, 4, 5).astype('float32')
-                        },
-                        fetch_list=loss.name)
-                scheduler.step()    # If you update learning rate each step
-            # scheduler.step()        # If you update learning rate each epoch
+        epochs (int): total epoch(s)
+        step_each_epoch (int): number of iterations within an epoch
+        learning_rate (float): learning rate
+        milestones (List[int]): List of each boundaries. Must be increasing.
+        gamma (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * gamma``. It should be less than 1.0. Defaults to 0.1.
+        warmup_epoch (int, optional): The epoch numbers for LinearWarmup. Defaults to 0.
+        warmup_start_lr (float, optional): start learning rate within warmup. Defaults to 0.0.
+        last_epoch (int, optional): last epoch. Defaults to -1.
+        by_epoch (bool, optional): learning rate decays by epoch when by_epoch is True, else by iter. Defaults to False.
     """
 
     def __init__(self,
-                 learning_rate,
-                 milestones,
                  epochs,
                  step_each_epoch,
+                 learning_rate,
+                 milestones,
                  gamma=0.1,
+                 warmup_epoch=0,
+                 warmup_start_lr=0.0,
                  last_epoch=-1,
-                 verbose=False):
-        if not isinstance(milestones, (tuple, list)):
-            raise TypeError(
-                "The type of 'milestones' in 'MultiStepDecay' must be 'tuple, list', but received %s."
-                % type(milestones))
-        if not all([
-                milestones[i] < milestones[i + 1]
-                for i in range(len(milestones) - 1)
-        ]):
-            raise ValueError('The elements of milestones must be incremented')
-        if gamma >= 1.0:
-            raise ValueError('gamma should be < 1.0.')
+                 by_epoch=False,
+                 **kwargs):
+        super(MultiStepDecay, self).__init__(
+            epochs, step_each_epoch, learning_rate, warmup_epoch,
+            warmup_start_lr, last_epoch, by_epoch)
         self.milestones = [x * step_each_epoch for x in milestones]
         self.gamma = gamma
-        super().__init__(learning_rate, last_epoch, verbose)
+        if self.by_epoch:
+            self.milestones = milestones
 
-    def get_lr(self):
-        for i in range(len(self.milestones)):
-            if self.last_epoch < self.milestones[i]:
-                return self.base_lr * (self.gamma**i)
-        return self.base_lr * (self.gamma**len(self.milestones))
+    def __call__(self):
+        learning_rate = lr.MultiStepDecay(
+            learning_rate=self.learning_rate,
+            milestones=self.milestones,
+            gamma=self.gamma,
+            last_epoch=self.last_epoch)
+
+        if self.warmup_steps > 0:
+            learning_rate = self.linear_warmup(learning_rate)
+
+        setattr(learning_rate, "by_epoch", self.by_epoch)
+        return learning_rate
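
Reviewer note (not part of the patch): a minimal usage sketch of the refactored builders, assuming the module is importable as ppcls.optimizer.learning_rate as in the paths above. The toy network, the Momentum optimizer and the hyper-parameter values are illustrative only; the config-key comments mirror the Global.epochs / Optimizer.lr.* keys referenced in the warning message in the diff.

    # Sketch: build one LRBase subclass, wrap it into a paddle optimizer, and
    # drive it per iteration or per epoch depending on the attached by_epoch flag.
    import paddle
    from ppcls.optimizer.learning_rate import Cosine

    model = paddle.nn.Linear(10, 10)          # toy network (assumption)
    lr_builder = Cosine(
        epochs=20,                            # Global.epochs
        step_each_epoch=100,                  # iterations per epoch, e.g. len(dataloader)
        learning_rate=0.1,                    # Optimizer.lr.learning_rate
        warmup_epoch=2,                       # Optimizer.lr.warmup_epoch
        by_epoch=False)                       # decay per iteration
    lr_sch = lr_builder()                     # a paddle.optimizer.lr scheduler (LinearWarmup-wrapped here)

    opt = paddle.optimizer.Momentum(
        learning_rate=lr_sch, momentum=0.9, parameters=model.parameters())

    for epoch in range(20):
        for it in range(100):
            loss = paddle.mean(model(paddle.rand([4, 10])))
            loss.backward()
            opt.step()
            opt.clear_grad()
            if not lr_sch.by_epoch:           # attribute attached by the builder via setattr
                lr_sch.step()                 # step the schedule per iteration
        if lr_sch.by_epoch:
            lr_sch.step()                     # step the schedule per epoch

Every __call__ in the patch returns a plain paddle scheduler with a by_epoch attribute bolted on, so the caller decides where to invoke step() without needing to know which concrete decay was configured.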