diff --git a/demo/optimizer.py b/demo/optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..73f441f897d22c10d2d6e05afaa7491b227b27d4
--- /dev/null
+++ b/demo/optimizer.py
@@ -0,0 +1,300 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+
+import paddle.fluid as fluid
+import paddle.fluid.layers.ops as ops
+from paddle.fluid.initializer import init_on_cpu
+from paddle.fluid.layers.learning_rate_scheduler import _decay_step_counter
+
+
+def cosine_decay(learning_rate, step_each_epoch, epochs=120):
+    """Applies cosine decay to the learning rate.
+    lr = learning_rate * (math.cos(epoch * (math.pi / epochs)) + 1) / 2
+    """
+    global_step = _decay_step_counter()
+
+    with init_on_cpu():
+        epoch = ops.floor(global_step / step_each_epoch)
+        decayed_lr = learning_rate * \
+            (ops.cos(epoch * (math.pi / epochs)) + 1) / 2
+    return decayed_lr
+
+
+def cosine_decay_with_warmup(learning_rate, step_each_epoch, epochs=120):
+    """Applies cosine decay to the learning rate, with linear warmup.
+    lr = learning_rate * (math.cos(epoch * (math.pi / epochs)) + 1) / 2
+    The learning rate is updated every mini-batch and starts with a linear warmup.
+    """
+    global_step = _decay_step_counter()
+    lr = fluid.layers.tensor.create_global_var(
+        shape=[1],
+        value=0.0,
+        dtype='float32',
+        persistable=True,
+        name="learning_rate")
+
+    warmup_epoch = fluid.layers.fill_constant(
+        shape=[1], dtype='float32', value=float(5), force_cpu=True)
+
+    with init_on_cpu():
+        epoch = ops.floor(global_step / step_each_epoch)
+        with fluid.layers.control_flow.Switch() as switch:
+            with switch.case(epoch < warmup_epoch):
+                decayed_lr = learning_rate * (global_step /
+                                              (step_each_epoch * warmup_epoch))
+                fluid.layers.tensor.assign(input=decayed_lr, output=lr)
+            with switch.default():
+                decayed_lr = learning_rate * \
+                    (ops.cos((global_step - warmup_epoch * step_each_epoch) * (math.pi / (epochs * step_each_epoch))) + 1) / 2
+                fluid.layers.tensor.assign(input=decayed_lr, output=lr)
+    return lr
+
+
+def exponential_decay_with_warmup(learning_rate,
+                                  step_each_epoch,
+                                  decay_epochs,
+                                  decay_rate=0.97,
+                                  warm_up_epoch=5.0):
+    """Applies exponential decay to the learning rate, with linear warmup.
+    """
+    global_step = _decay_step_counter()
+    lr = fluid.layers.tensor.create_global_var(
+        shape=[1],
+        value=0.0,
+        dtype='float32',
+        persistable=True,
+        name="learning_rate")
+
+    warmup_epoch = fluid.layers.fill_constant(
+        shape=[1], dtype='float32', value=float(warm_up_epoch), force_cpu=True)
+
+    with init_on_cpu():
+        epoch = ops.floor(global_step / step_each_epoch)
+        with fluid.layers.control_flow.Switch() as switch:
+            with switch.case(epoch < warmup_epoch):
+                decayed_lr = learning_rate * (global_step /
+                                              (step_each_epoch * warmup_epoch))
+                fluid.layers.assign(input=decayed_lr, output=lr)
+            with switch.default():
+                div_res = (global_step - warmup_epoch * step_each_epoch
+                           ) / decay_epochs
+                div_res = ops.floor(div_res)
+                decayed_lr = learning_rate * (decay_rate**div_res)
+                fluid.layers.assign(input=decayed_lr, output=lr)
+
+    return lr
+
+
+def lr_warmup(learning_rate, warmup_steps, start_lr, end_lr):
+    """ Applies linear learning rate warmup for distributed training
+        Argument learning_rate can be a float or a Variable
+        lr = start_lr + (end_lr - start_lr) * (step / warmup_steps)
+    """
+    assert (isinstance(end_lr, float))
+    assert (isinstance(start_lr, float))
+    linear_step = end_lr - start_lr
+    with fluid.default_main_program()._lr_schedule_guard():
+        lr = fluid.layers.tensor.create_global_var(
+            shape=[1],
+            value=0.0,
+            dtype='float32',
+            persistable=True,
+            name="learning_rate_warmup")
+
+        global_step = fluid.layers.learning_rate_scheduler._decay_step_counter(
+        )
+
+        with fluid.layers.control_flow.Switch() as switch:
+            with switch.case(global_step < warmup_steps):
+                decayed_lr = start_lr + linear_step * (global_step /
+                                                       warmup_steps)
+                fluid.layers.tensor.assign(decayed_lr, lr)
+            with switch.default():
+                fluid.layers.tensor.assign(learning_rate, lr)
+
+    return lr
+
+
+class Optimizer(object):
+    """A class used to represent several optimizer methods
+
+    Attributes:
+        batch_size: batch size on all devices.
+        lr: learning rate.
+        lr_strategy: learning rate decay strategy.
+        l2_decay: l2_decay parameter.
+        momentum_rate: momentum rate when using Momentum optimizer.
+        step_epochs: piecewise decay steps.
+        num_epochs: number of total epochs.
+
+        total_images: total number of training images.
+        step: total steps in an epoch.
+
+    """
+
+    def __init__(self, args):
+        self.batch_size = args.batch_size
+        self.lr = args.lr
+        self.lr_strategy = args.lr_strategy
+        self.l2_decay = args.l2_decay
+        self.momentum_rate = args.momentum_rate
+        self.step_epochs = args.step_epochs
+        self.num_epochs = args.num_epochs
+        self.warm_up_epochs = args.warm_up_epochs
+        self.decay_epochs = args.decay_epochs
+        self.decay_rate = args.decay_rate
+        self.total_images = args.total_images
+
+        self.step = int(math.ceil(float(self.total_images) / self.batch_size))
+
+    def piecewise_decay(self):
+        """piecewise decay with Momentum optimizer
+
+        Returns:
+            a piecewise_decay optimizer
+        """
+        bd = [self.step * e for e in self.step_epochs]
+        lr = [self.lr * (0.1**i) for i in range(len(bd) + 1)]
+        learning_rate = fluid.layers.piecewise_decay(boundaries=bd, values=lr)
+        optimizer = fluid.optimizer.Momentum(
+            learning_rate=learning_rate,
+            momentum=self.momentum_rate,
+            regularization=fluid.regularizer.L2Decay(self.l2_decay))
+        return optimizer
+
+    def cosine_decay(self):
+        """cosine decay with Momentum optimizer
+
+        Returns:
+            a cosine_decay optimizer
+        """
+
+        learning_rate = fluid.layers.cosine_decay(
+            learning_rate=self.lr,
+            step_each_epoch=self.step,
+            epochs=self.num_epochs)
+        optimizer = fluid.optimizer.Momentum(
+            learning_rate=learning_rate,
+            momentum=self.momentum_rate,
+            regularization=fluid.regularizer.L2Decay(self.l2_decay))
+        return optimizer
+
+    def cosine_decay_warmup(self):
+        """cosine decay with warmup
+
+        Returns:
+            a cosine_decay_with_warmup optimizer
+        """
+
+        learning_rate = cosine_decay_with_warmup(
+            learning_rate=self.lr,
+            step_each_epoch=self.step,
+            epochs=self.num_epochs)
+        optimizer = fluid.optimizer.Momentum(
+            learning_rate=learning_rate,
+            momentum=self.momentum_rate,
+            regularization=fluid.regularizer.L2Decay(self.l2_decay))
+        return optimizer
+
+    def exponential_decay_warmup(self):
+        """exponential decay with warmup
+
+        Returns:
+            an exponential_decay_with_warmup optimizer
+        """
+
+        learning_rate = exponential_decay_with_warmup(
+            learning_rate=self.lr,
+            step_each_epoch=self.step,
+            decay_epochs=self.step * self.decay_epochs,
+            decay_rate=self.decay_rate,
+            warm_up_epoch=self.warm_up_epochs)
+        optimizer = fluid.optimizer.RMSProp(
+            learning_rate=learning_rate,
+            regularization=fluid.regularizer.L2Decay(self.l2_decay),
+            momentum=self.momentum_rate,
+            rho=0.9,
+            epsilon=0.001)
+        return optimizer
+
+    def linear_decay(self):
+        """linear decay with Momentum optimizer
+
+        Returns:
+            a linear_decay optimizer
+        """
+
+        end_lr = 0
+        learning_rate = fluid.layers.polynomial_decay(
+            self.lr, self.step, end_lr, power=1)
+        optimizer = fluid.optimizer.Momentum(
+            learning_rate=learning_rate,
+            momentum=self.momentum_rate,
+            regularization=fluid.regularizer.L2Decay(self.l2_decay))
+
+        return optimizer
+
+    def adam_decay(self):
+        """Adam optimizer
+
+        Returns:
+            an adam_decay optimizer
+        """
+
+        return fluid.optimizer.Adam(learning_rate=self.lr)
+
+    def cosine_decay_RMSProp(self):
+        """cosine decay with RMSProp optimizer
+
+        Returns:
+            a cosine_decay_RMSProp optimizer
+        """
+
+        learning_rate = fluid.layers.cosine_decay(
+            learning_rate=self.lr,
+            step_each_epoch=self.step,
+            epochs=self.num_epochs)
+        optimizer = fluid.optimizer.RMSProp(
+            learning_rate=learning_rate,
+            momentum=self.momentum_rate,
+            regularization=fluid.regularizer.L2Decay(self.l2_decay),
+            # Apply epsilon=1 on ImageNet dataset.
+            epsilon=1)
+        return optimizer
+
+    def default_decay(self):
+        """default decay
+
+        Returns:
+            default decay optimizer
+        """
+
+        optimizer = fluid.optimizer.Momentum(
+            learning_rate=self.lr,
+            momentum=self.momentum_rate,
+            regularization=fluid.regularizer.L2Decay(self.l2_decay))
+        return optimizer
+
+
+def create_optimizer(args):
+    Opt = Optimizer(args)
+    optimizer = getattr(Opt, args.lr_strategy)()
+
+    return optimizer
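
For context on how this module is meant to be driven: `create_optimizer(args)` looks up the `Optimizer` method named by `args.lr_strategy`, so `args` must carry every attribute read in `Optimizer.__init__`. The sketch below shows one minimal way to wire that up; the flag names and defaults are assumptions that mirror the attribute names above, not the demo's actual argument parser.

```python
# Hypothetical wiring sketch -- flag names and defaults are assumed, not
# taken from the demo's real train script.
import argparse

from optimizer import create_optimizer  # the module added in this patch

parser = argparse.ArgumentParser()
parser.add_argument('--batch_size', type=int, default=256)
parser.add_argument('--lr', type=float, default=0.1)
parser.add_argument('--lr_strategy', type=str, default='piecewise_decay')
parser.add_argument('--l2_decay', type=float, default=1e-4)
parser.add_argument('--momentum_rate', type=float, default=0.9)
parser.add_argument('--step_epochs', nargs='+', type=int, default=[30, 60, 90])
parser.add_argument('--num_epochs', type=int, default=120)
parser.add_argument('--warm_up_epochs', type=float, default=5.0)
parser.add_argument('--decay_epochs', type=float, default=2.4)
parser.add_argument('--decay_rate', type=float, default=0.97)
parser.add_argument('--total_images', type=int, default=1281167)
args = parser.parse_args()

# Builds the LR schedule and optimizer selected by --lr_strategy; the schedule
# ops are added to the default fluid program, so call this while building the
# training program and then pass the result to optimizer.minimize(loss).
optimizer = create_optimizer(args)
```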
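Because the schedules are built out of fluid ops inside the training program, their values are not trivial to inspect before training. The plain-Python mirror below reproduces only the arithmetic of `cosine_decay_with_warmup` (with the fixed 5-epoch warmup the patch hard-codes via `fill_constant`); it is a reference for sanity-checking the curve, not part of the patch.

```python
import math


def cosine_warmup_lr(step, base_lr, step_each_epoch, epochs, warmup_epochs=5):
    """Plain-Python mirror of the schedule built by cosine_decay_with_warmup."""
    warmup_steps = warmup_epochs * step_each_epoch
    if step < warmup_steps:
        # Linear ramp from 0 up to base_lr over the warmup epochs.
        return base_lr * step / warmup_steps
    # Half-cosine decay over the full training horizon, shifted past warmup.
    progress = (step - warmup_steps) * math.pi / (epochs * step_each_epoch)
    return base_lr * (math.cos(progress) + 1) / 2


# e.g. with 5000 iterations per epoch and base_lr = 0.1:
print(cosine_warmup_lr(5 * 5000, 0.1, 5000, 120))   # 0.1 (end of warmup)
print(cosine_warmup_lr(60 * 5000, 0.1, 5000, 120))  # ~0.057 (mid-training)
```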