diff --git a/python/paddle/fluid/dygraph/learning_rate_scheduler.py b/python/paddle/fluid/dygraph/learning_rate_scheduler.py
index 91bd7836e19b0bbdf86f4d2d8b95847b1074ba3b..047e35deb4144041e163d002dfb6908a0dcfe956 100644
--- a/python/paddle/fluid/dygraph/learning_rate_scheduler.py
+++ b/python/paddle/fluid/dygraph/learning_rate_scheduler.py
@@ -517,7 +517,7 @@ class NoamDecay(LearningRateDecay):
 
     .. math::
 
-        decayed\_learning\_rate = d_{model}^{-0.5} * min(global\_step^{-0.5}, global\_step * warmup\_steps^{-1.5})
+        decayed\_learning\_rate = learning\_rate * d_{model}^{-0.5} * min(global\_step^{-0.5}, global\_step * warmup\_steps^{-1.5})
 
     Please reference `attention is all you need <https://arxiv.org/pdf/1706.03762.pdf>`_
 
@@ -531,6 +531,9 @@
             The default value is 1.
         dtype(str, optional): The data type used to create the learning rate variable. The data type
            can be set as 'float32', 'float64'. The default value is 'float32'.
+        learning_rate(Variable|float|int): The initial learning rate. If the type
+            is Variable, it's a tensor with shape [1], the data type can be
+            float32 or float64. It also can be set to python int number. Default 1.0
 
     Returns:
         None.
@@ -550,8 +553,15 @@
                  parameter_list = emb.parameters())
     """
 
-    def __init__(self, d_model, warmup_steps, begin=1, step=1, dtype='float32'):
+    def __init__(self,
+                 d_model,
+                 warmup_steps,
+                 begin=1,
+                 step=1,
+                 dtype='float32',
+                 learning_rate=1.0):
         super(NoamDecay, self).__init__(begin, step, dtype)
+        self.learning_rate = learning_rate
         self.d_model = d_model
         self.warmup_steps = warmup_steps
 
@@ -559,7 +569,8 @@
         from .. import layers
         a = self.create_lr_var(self.step_num**-0.5)
         b = self.create_lr_var((self.warmup_steps**-1.5) * self.step_num)
-        lr_value = (self.d_model**-0.5) * layers.elementwise_min(a, b)
+        lr_value = self.learning_rate * (self.d_model
+                                         **-0.5) * layers.elementwise_min(a, b)
         return lr_value
 
 
diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py
index eb1040712d6b608f9e6cb04a3cfc561218c9be59..76e4fe5fcf4a2e04d1134ae445ff3ff043b4d5b1 100644
--- a/python/paddle/fluid/layers/learning_rate_scheduler.py
+++ b/python/paddle/fluid/layers/learning_rate_scheduler.py
@@ -49,7 +49,7 @@ def _decay_step_counter(begin=0):
     return global_step
 
 
-def noam_decay(d_model, warmup_steps):
+def noam_decay(d_model, warmup_steps, learning_rate=1.0):
     """
     Noam decay method. The numpy implementation of noam decay as follows.
 
@@ -58,11 +58,12 @@
       import paddle.fluid as fluid
      import numpy as np
      # set hyper parameters
+      base_lr = 0.01
      d_model = 2
      current_steps = 20
      warmup_steps = 200
      # compute
-      lr_value = np.power(d_model, -0.5) * np.min([
+      lr_value = base_lr * np.power(d_model, -0.5) * np.min([
                      np.power(current_steps, -0.5),
                      np.power(warmup_steps, -1.5) * current_steps])
 
@@ -74,6 +75,10 @@
         warmup_steps(Variable): A super parameter.
 
+        learning_rate(Variable|float|int): The initial learning rate. If the type
+            is Variable, it's a tensor with shape [1], the data type can be
+            float32 or float64. It also can be set to python int number. Default 1.0
+
     Returns:
         The decayed learning rate.
 
     Examples:
@@ -84,18 +89,21 @@
           learning_rate = 0.01
           lr = fluid.layers.learning_rate_scheduler.noam_decay(
                          1/(warmup_steps *(learning_rate ** 2)),
-                         warmup_steps)
+                         warmup_steps,
+                         learning_rate)
     """
     with default_main_program()._lr_schedule_guard():
         if in_dygraph_mode():
-            decay = imperate_lr.NoamDecay(d_model, warmup_steps)
+            decay = imperate_lr.NoamDecay(
+                d_model, warmup_steps, learning_rate=learning_rate)
             return decay
         else:
             global_step = _decay_step_counter(1)
 
             a = global_step**-0.5
             b = (warmup_steps**-1.5) * global_step
-            lr_value = (d_model**-0.5) * nn.elementwise_min(a, b)
+            lr_value = learning_rate * (d_model**-0.5) * nn.elementwise_min(a,
+                                                                            b)
 
             return lr_value
 
diff --git a/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py b/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py
index e3f79448e7394f1148416a70b08c2bdb128905ce..076009788619410c94518b6e0a2b3f81a3b86b12 100644
--- a/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py
+++ b/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py
@@ -89,6 +89,34 @@ def cosine_decay(global_step, learning_rate, step_each_epoch, epochs):
     return decayed_lr
 
 
+def noam_decay(global_step, d_model, warmup_steps, learning_rate=1.0):
+    a = math.pow(global_step, -0.5)
+    b = math.pow(warmup_steps, -1.5) * global_step
+    decayed_lr = learning_rate * math.pow(d_model, -0.5) * min(a, b)
+
+    return decayed_lr
+
+
+class TestNoamLearningRateDecayDygraphMode(unittest.TestCase):
+    def test_dygraph_mode(self):
+        with fluid.dygraph.guard():
+            d_model = 0.01
+            warmup_steps = 200
+            learning_rate = 2.0
+            lr = fluid.layers.noam_decay(d_model, warmup_steps, learning_rate)
+            for step in range(5):
+                step += 1
+                right_result = noam_decay(step, d_model, warmup_steps,
+                                          learning_rate)
+                fluid_result = lr()
+
+                self.assertAlmostEqual(
+                    right_result,
+                    fluid_result[0],
+                    msg='Failed lr scheduler in step {0}, Python result is {1}, Fluid result is {2}'.
+                    format(step, right_result, fluid_result[0]))
+
+
 class TestLearningRateDecay(unittest.TestCase):
     def check_decay(self, python_decay_fn, fluid_decay_fn, kwargs):
         places = [fluid.CPUPlace()]
@@ -112,6 +140,9 @@
         exe.run(startup_prog)
 
         for step in range(10):
+            # Step of NoamDecay starts from 1.
+            if python_decay_fn.__name__ == 'noam_decay':
+                step += 1
             lr_val, = exe.run(main_prog, feed={}, fetch_list=[decayed_lr])
             python_decayed_lr = python_decay_fn(
                 global_step=float(step), **kwargs)
@@ -159,6 +190,11 @@
                 "step_each_epoch": 100,
                 "epochs": 120
             }),
+            (noam_decay, layers.noam_decay, {
+                "d_model": 0.01,
+                "warmup_steps": 200,
+                "learning_rate": 2.0
+            }),
         ]
 
         for py_decay_fn, fluid_decay_fn, kwargs in decay_fns:
@@ -195,6 +231,9 @@ class TestLinearWamrupLearningRateDecay(TestLearningRateDecay):
         exe.run(startup_prog)
 
         for step in range(20):
+            # Step of NoamDecay starts from 1.
+            if fluid_decay_fn.__name__ == 'noam_decay':
+                step += 1
             lr_val, = exe.run(main_prog, feed={}, fetch_list=[decayed_lr])
             if step < warmup_steps:
                 python_decayed_lr = linear_lr_warmup(
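
Note (not part of the patch): a minimal pure-Python sketch of the decayed-rate formula that both the static-graph noam_decay and the dygraph NoamDecay now compute, mirroring the reference helper added to the unit test above. The helper name noam_decay_reference and the final print loop are illustrative only; the numbers copy the values used in TestNoamLearningRateDecayDygraphMode.

import math

def noam_decay_reference(global_step, d_model, warmup_steps, learning_rate=1.0):
    # lr = learning_rate * d_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5)
    a = math.pow(global_step, -0.5)
    b = math.pow(warmup_steps, -1.5) * global_step
    return learning_rate * math.pow(d_model, -0.5) * min(a, b)

# Values mirror the new dygraph test; NoamDecay's step count starts from 1.
d_model, warmup_steps, base_lr = 0.01, 200, 2.0
for step in range(1, 6):
    print(step, noam_decay_reference(step, d_model, warmup_steps, base_lr))

With learning_rate left at its default of 1.0 the expression reduces to the old formula, which is why the added parameter is backward compatible.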