Add parameter(learning_rate) in NoamDecay (#23156)

* Add parameter(learning_rate) in NoamDecay test=develop

Add parameter(learning_rate) in NoamDecay (#23156)
* Add parameter(learning_rate) in NoamDecay test=develop
d6f72c4f · Aurelius84 · GitHub · af926306 · d6f72c4f · d6f72c4f
3 changed files
--- a/python/paddle/fluid/dygraph/learning_rate_scheduler.py
+++ b/python/paddle/fluid/dygraph/learning_rate_scheduler.py
@@ -517,7 +517,7 @@ class NoamDecay(LearningRateDecay):

    .. math::

-        decayed\_learning\_rate = d_{model}^{-0.5} * min(global\_step^{-0.5}, global\_step * warmup\_steps^{-1.5})
+        decayed\_learning\_rate = learning\_rate * d_{model}^{-0.5} * min(global\_step^{-0.5}, global\_step * warmup\_steps^{-1.5})

    Please reference `attention is all you need <https://arxiv.org/pdf/1706.03762.pdf>`_ 

@@ -531,6 +531,9 @@ class NoamDecay(LearningRateDecay):
            The default value is 1.
        dtype(str, optional): The data type used to create the learning rate variable. The data type can be set as
            'float32', 'float64'. The default value is 'float32'.
+        learning_rate(Variable|float|int): The initial learning rate. If the type
+            is Variable, it's a tensor with shape [1], and the data type can be
+            float32 or float64. A Python float or int is also accepted. Default: 1.0.

    Returns:
        None.
@@ -550,8 +553,15 @@ class NoamDecay(LearningRateDecay):
                  parameter_list = emb.parameters())
    """

-    def __init__(self, d_model, warmup_steps, begin=1, step=1, dtype='float32'):
+    def __init__(self,
+                 d_model,
+                 warmup_steps,
+                 begin=1,
+                 step=1,
+                 dtype='float32',
+                 learning_rate=1.0):
        super(NoamDecay, self).__init__(begin, step, dtype)
+        self.learning_rate = learning_rate
        self.d_model = d_model
        self.warmup_steps = warmup_steps

@@ -559,7 +569,8 @@ class NoamDecay(LearningRateDecay):
        from .. import layers
        a = self.create_lr_var(self.step_num**-0.5)
        b = self.create_lr_var((self.warmup_steps**-1.5) * self.step_num)
-        lr_value = (self.d_model**-0.5) * layers.elementwise_min(a, b)
+        lr_value = self.learning_rate * (self.d_model
+                                         **-0.5) * layers.elementwise_min(a, b)
        return lr_value



--- a/python/paddle/fluid/layers/learning_rate_scheduler.py
+++ b/python/paddle/fluid/layers/learning_rate_scheduler.py
@@ -49,7 +49,7 @@ def _decay_step_counter(begin=0):
    return global_step


-def noam_decay(d_model, warmup_steps):
+def noam_decay(d_model, warmup_steps, learning_rate=1.0):
    """
    Noam decay method. The numpy implementation of noam decay as follows.

@@ -58,11 +58,12 @@ def noam_decay(d_model, warmup_steps):
      import paddle.fluid as fluid
      import numpy as np
      # set hyper parameters
+      base_lr = 0.01
      d_model = 2
      current_steps = 20
      warmup_steps = 200
      # compute
-      lr_value = np.power(d_model, -0.5) * np.min([
+      lr_value = base_lr * np.power(d_model, -0.5) * np.min([
                              np.power(current_steps, -0.5),
                              np.power(warmup_steps, -1.5) * current_steps])

@@ -74,6 +75,10 @@ def noam_decay(d_model, warmup_steps):

        warmup_steps(Variable): A super parameter.

+        learning_rate(Variable|float|int): The initial learning rate. If the type
+            is Variable, it's a tensor with shape [1], and the data type can be
+            float32 or float64. A Python float or int is also accepted. Default: 1.0.
+
    Returns:
        The decayed learning rate.
    Examples:
@@ -84,18 +89,21 @@ def noam_decay(d_model, warmup_steps):
          learning_rate = 0.01
          lr = fluid.layers.learning_rate_scheduler.noam_decay(
                         1/(warmup_steps *(learning_rate ** 2)),
-                         warmup_steps)
+                         warmup_steps,
+                         learning_rate)
    """
    with default_main_program()._lr_schedule_guard():
        if in_dygraph_mode():
-            decay = imperate_lr.NoamDecay(d_model, warmup_steps)
+            decay = imperate_lr.NoamDecay(
+                d_model, warmup_steps, learning_rate=learning_rate)
            return decay
        else:
            global_step = _decay_step_counter(1)

            a = global_step**-0.5
            b = (warmup_steps**-1.5) * global_step
-            lr_value = (d_model**-0.5) * nn.elementwise_min(a, b)
+            lr_value = learning_rate * (d_model**-0.5) * nn.elementwise_min(a,
+                                                                            b)

            return lr_value


--- a/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py
+++ b/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py
@@ -89,6 +89,34 @@ def cosine_decay(global_step, learning_rate, step_each_epoch, epochs):
    return decayed_lr


+def noam_decay(global_step, d_model, warmup_steps, learning_rate=1.0):
+    a = math.pow(global_step, -0.5)  # global_step^-0.5 term
+    b = math.pow(warmup_steps, -1.5) * global_step  # warmup_steps^-1.5 * step
+    decayed_lr = learning_rate * math.pow(d_model, -0.5) * min(a, b)
+
+    return decayed_lr
+
+
+class TestNoamLearningRateDecayDygraphMode(unittest.TestCase):
+    def test_dygraph_mode(self):
+        with fluid.dygraph.guard():
+            d_model = 0.01
+            warmup_steps = 200
+            learning_rate = 2.0  # differs from the 1.0 default
+            lr = fluid.layers.noam_decay(d_model, warmup_steps, learning_rate)
+            for step in range(5):
+                step += 1  # Step of NoamDecay starts from 1.
+                right_result = noam_decay(step, d_model, warmup_steps,
+                                          learning_rate)
+                fluid_result = lr()  # NOTE(review): presumably advances one step per call; confirm
+
+                self.assertAlmostEqual(
+                    right_result,
+                    fluid_result[0],
+                    msg='Failed lr scheduler in step {0}, Python result is {1}, Fluid result is {2}'.
+                    format(step, right_result, fluid_result[0]))
+
+
 class TestLearningRateDecay(unittest.TestCase):
    def check_decay(self, python_decay_fn, fluid_decay_fn, kwargs):
        places = [fluid.CPUPlace()]
@@ -112,6 +140,9 @@ class TestLearningRateDecay(unittest.TestCase):
        exe.run(startup_prog)

        for step in range(10):
+            # Step of NoamDecay starts from 1.
+            if python_decay_fn.__name__ == 'noam_decay':
+                step += 1
            lr_val, = exe.run(main_prog, feed={}, fetch_list=[decayed_lr])
            python_decayed_lr = python_decay_fn(
                global_step=float(step), **kwargs)
@@ -159,6 +190,11 @@ class TestLearningRateDecay(unittest.TestCase):
                "step_each_epoch": 100,
                "epochs": 120
            }),
+            (noam_decay, layers.noam_decay, {
+                "d_model": 0.01,
+                "warmup_steps": 200,
+                "learning_rate": 2.0
+            }),
        ]

        for py_decay_fn, fluid_decay_fn, kwargs in decay_fns:
@@ -195,6 +231,9 @@ class TestLinearWamrupLearningRateDecay(TestLearningRateDecay):
        exe.run(startup_prog)

        for step in range(20):
+            # Step of NoamDecay starts from 1.
+            if fluid_decay_fn.__name__ == 'noam_decay':
+                step += 1
            lr_val, = exe.run(main_prog, feed={}, fetch_list=[decayed_lr])
            if step < warmup_steps:
                python_decayed_lr = linear_lr_warmup(