diff --git a/python/paddle/fluid/dygraph/learning_rate_scheduler.py b/python/paddle/fluid/dygraph/learning_rate_scheduler.py
index 91bd7836e19b0bbdf86f4d2d8b95847b1074ba3b..047e35deb4144041e163d002dfb6908a0dcfe956 100644
--- a/python/paddle/fluid/dygraph/learning_rate_scheduler.py
+++ b/python/paddle/fluid/dygraph/learning_rate_scheduler.py
@@ -517,7 +517,7 @@ class NoamDecay(LearningRateDecay):
 
     .. math::
 
-        decayed\_learning\_rate = d_{model}^{-0.5} * min(global\_step^{-0.5}, global\_step * warmup\_steps^{-1.5})
+        decayed\_learning\_rate = learning\_rate * d_{model}^{-0.5} * min(global\_step^{-0.5}, global\_step * warmup\_steps^{-1.5})
 
     Please reference `attention is all you need <https://arxiv.org/pdf/1706.03762.pdf>`_
 
@@ -531,6 +531,9 @@
             The default value is 1.
         dtype(str, optional): The data type used to create the learning rate variable. The data type
            can be set as 'float32', 'float64'. The default value is 'float32'.
+        learning_rate(Variable|float|int): The initial learning rate. If the type
+            is Variable, it's a tensor with shape [1], the data type can be
+            float32 or float64. It also can be set to python int number. Default 1.0
 
     Returns:
         None.
@@ -550,8 +553,15 @@
                  parameter_list = emb.parameters())
     """
 
-    def __init__(self, d_model, warmup_steps, begin=1, step=1, dtype='float32'):
+    def __init__(self,
+                 d_model,
+                 warmup_steps,
+                 begin=1,
+                 step=1,
+                 dtype='float32',
+                 learning_rate=1.0):
         super(NoamDecay, self).__init__(begin, step, dtype)
+        self.learning_rate = learning_rate
         self.d_model = d_model
         self.warmup_steps = warmup_steps
 
@@ -559,7 +569,8 @@
         from .. import layers
         a = self.create_lr_var(self.step_num**-0.5)
         b = self.create_lr_var((self.warmup_steps**-1.5) * self.step_num)
-        lr_value = (self.d_model**-0.5) * layers.elementwise_min(a, b)
+        lr_value = self.learning_rate * (self.d_model
+                                         **-0.5) * layers.elementwise_min(a, b)
         return lr_value
 
 
diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py
index eb1040712d6b608f9e6cb04a3cfc561218c9be59..76e4fe5fcf4a2e04d1134ae445ff3ff043b4d5b1 100644
--- a/python/paddle/fluid/layers/learning_rate_scheduler.py
+++ b/python/paddle/fluid/layers/learning_rate_scheduler.py
@@ -49,7 +49,7 @@ def _decay_step_counter(begin=0):
     return global_step
 
 
-def noam_decay(d_model, warmup_steps):
+def noam_decay(d_model, warmup_steps, learning_rate=1.0):
     """
     Noam decay method. The numpy implementation of noam decay as follows.
 
@@ -58,11 +58,12 @@
       import paddle.fluid as fluid
      import numpy as np
      # set hyper parameters
+      base_lr = 0.01
      d_model = 2
      current_steps = 20
      warmup_steps = 200
      # compute
-      lr_value = np.power(d_model, -0.5) * np.min([
+      lr_value = base_lr * np.power(d_model, -0.5) * np.min([
                      np.power(current_steps, -0.5),
                      np.power(warmup_steps, -1.5) * current_steps])
 
@@ -74,6 +75,10 @@
         warmup_steps(Variable): A super parameter.
 
+        learning_rate(Variable|float|int): The initial learning rate. If the type
+            is Variable, it's a tensor with shape [1], the data type can be
+            float32 or float64. It also can be set to python int number. Default 1.0
+
     Returns:
         The decayed learning rate.
 
     Examples:
@@ -84,18 +89,21 @@
           learning_rate = 0.01
           lr = fluid.layers.learning_rate_scheduler.noam_decay(
                          1/(warmup_steps *(learning_rate ** 2)),
-                         warmup_steps)
+                         warmup_steps,
+                         learning_rate)
     """
     with default_main_program()._lr_schedule_guard():
         if in_dygraph_mode():
-            decay = imperate_lr.NoamDecay(d_model, warmup_steps)
+            decay = imperate_lr.NoamDecay(
+                d_model, warmup_steps, learning_rate=learning_rate)
             return decay
         else:
             global_step = _decay_step_counter(1)
 
             a = global_step**-0.5
             b = (warmup_steps**-1.5) * global_step
-            lr_value = (d_model**-0.5) * nn.elementwise_min(a, b)
+            lr_value = learning_rate * (d_model**-0.5) * nn.elementwise_min(a,
+                                                                            b)
 
             return lr_value
 
diff --git a/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py b/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py
index e3f79448e7394f1148416a70b08c2bdb128905ce..076009788619410c94518b6e0a2b3f81a3b86b12 100644
--- a/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py
+++ b/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py
@@ -89,6 +89,34 @@ def cosine_decay(global_step, learning_rate, step_each_epoch, epochs):
     return decayed_lr
 
 
+def noam_decay(global_step, d_model, warmup_steps, learning_rate=1.0):
+    a = math.pow(global_step, -0.5)
+    b = math.pow(warmup_steps, -1.5) * global_step
+    decayed_lr = learning_rate * math.pow(d_model, -0.5) * min(a, b)
+
+    return decayed_lr
+
+
+class TestNoamLearningRateDecayDygraphMode(unittest.TestCase):
+    def test_dygraph_mode(self):
+        with fluid.dygraph.guard():
+            d_model = 0.01
+            warmup_steps = 200
+            learning_rate = 2.0
+            lr = fluid.layers.noam_decay(d_model, warmup_steps, learning_rate)
+            for step in range(5):
+                step += 1
+                right_result = noam_decay(step, d_model, warmup_steps,
+                                          learning_rate)
+                fluid_result = lr()
+
+                self.assertAlmostEqual(
+                    right_result,
+                    fluid_result[0],
+                    msg='Failed lr scheduler in step {0}, Python result is {1}, Fluid result is {2}'.
+                    format(step, right_result, fluid_result[0]))
+
+
 class TestLearningRateDecay(unittest.TestCase):
     def check_decay(self, python_decay_fn, fluid_decay_fn, kwargs):
         places = [fluid.CPUPlace()]
@@ -112,6 +140,9 @@
         exe.run(startup_prog)
 
         for step in range(10):
+            # Step of NoamDecay starts from 1.
+            if python_decay_fn.__name__ == 'noam_decay':
+                step += 1
             lr_val, = exe.run(main_prog, feed={}, fetch_list=[decayed_lr])
             python_decayed_lr = python_decay_fn(
                 global_step=float(step), **kwargs)
@@ -159,6 +190,11 @@
                 "step_each_epoch": 100,
                 "epochs": 120
             }),
+            (noam_decay, layers.noam_decay, {
+                "d_model": 0.01,
+                "warmup_steps": 200,
+                "learning_rate": 2.0
+            }),
         ]
 
         for py_decay_fn, fluid_decay_fn, kwargs in decay_fns:
@@ -195,6 +231,9 @@ class TestLinearWamrupLearningRateDecay(TestLearningRateDecay):
         exe.run(startup_prog)
 
         for step in range(20):
+            # Step of NoamDecay starts from 1.
+            if fluid_decay_fn.__name__ == 'noam_decay':
+                step += 1
             lr_val, = exe.run(main_prog, feed={}, fetch_list=[decayed_lr])
             if step < warmup_steps:
                 python_decayed_lr = linear_lr_warmup(
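
Note (not part of the patch): a minimal pure-Python sketch of the decayed-rate formula that both the static-graph noam_decay and the dygraph NoamDecay now compute, mirroring the reference helper added to the unit test above. The helper name noam_decay_reference and the final print loop are illustrative only; the numbers copy the values used in TestNoamLearningRateDecayDygraphMode.

import math

def noam_decay_reference(global_step, d_model, warmup_steps, learning_rate=1.0):
    # lr = learning_rate * d_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5)
    a = math.pow(global_step, -0.5)
    b = math.pow(warmup_steps, -1.5) * global_step
    return learning_rate * math.pow(d_model, -0.5) * min(a, b)

# Values mirror the new dygraph test; NoamDecay's step count starts from 1.
d_model, warmup_steps, base_lr = 0.01, 200, 2.0
for step in range(1, 6):
    print(step, noam_decay_reference(step, d_model, warmup_steps, base_lr))

With learning_rate left at its default of 1.0 the expression reduces to the old formula, which is why the added parameter is backward compatible.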