Unverified · Commit d6f72c4f authored by: Aurelius84, committed by: GitHub

Add parameter(learning_rate) in NoamDecay (#23156)

* Add parameter(learning_rate) in NoamDecay test=develop
Parent af926306
@@ -517,7 +517,7 @@ class NoamDecay(LearningRateDecay):
.. math::
decayed\_learning\_rate = d_{model}^{-0.5} * min(global\_step^{-0.5}, global\_step * warmup\_steps^{-1.5})
decayed\_learning\_rate = learning\_rate * d_{model}^{-0.5} * min(global\_step^{-0.5}, global\_step * warmup\_steps^{-1.5})
Please reference `attention is all you need <https://arxiv.org/pdf/1706.03762.pdf>`_
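For reference, here is a minimal NumPy sketch of the updated schedule (the helper name noam_lr and the sample values are illustrative, not part of the patch); the new learning_rate argument simply scales the original Noam curve.

import numpy as np

def noam_lr(global_step, d_model, warmup_steps, learning_rate=1.0):
    # learning_rate scales the classic Noam schedule; everything else is unchanged
    return learning_rate * np.power(d_model, -0.5) * min(
        np.power(global_step, -0.5),
        np.power(warmup_steps, -1.5) * global_step)

# e.g. noam_lr(20, d_model=2, warmup_steps=200, learning_rate=0.01)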
@@ -531,6 +531,9 @@ class NoamDecay(LearningRateDecay):
The default value is 1.
dtype(str, optional): The data type used to create the learning rate variable. The data type can be set as
'float32', 'float64'. The default value is 'float32'.
learning_rate(Variable|float|int): The initial learning rate. If the type
is Variable, it is a tensor with shape [1] and its data type can be
float32 or float64. It can also be set to a Python int or float. The default value is 1.0.
Returns:
None.
@@ -550,8 +553,15 @@ class NoamDecay(LearningRateDecay):
parameter_list = emb.parameters())
"""
def __init__(self, d_model, warmup_steps, begin=1, step=1, dtype='float32'):
def __init__(self,
d_model,
warmup_steps,
begin=1,
step=1,
dtype='float32',
learning_rate=1.0):
super(NoamDecay, self).__init__(begin, step, dtype)
self.learning_rate = learning_rate
self.d_model = d_model
self.warmup_steps = warmup_steps
@@ -559,7 +569,8 @@ class NoamDecay(LearningRateDecay):
from .. import layers
a = self.create_lr_var(self.step_num**-0.5)
b = self.create_lr_var((self.warmup_steps**-1.5) * self.step_num)
lr_value = (self.d_model**-0.5) * layers.elementwise_min(a, b)
lr_value = self.learning_rate * (self.d_model
**-0.5) * layers.elementwise_min(a, b)
return lr_value
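A hedged usage sketch of the dygraph path, mirroring the class docstring example above (the Embedding size, the SGD optimizer, and the hyperparameter values are illustrative assumptions): the NoamDecay object, now constructed with learning_rate, is handed to the optimizer as its learning rate.

import paddle.fluid as fluid

with fluid.dygraph.guard():
    emb = fluid.dygraph.Embedding([10, 10])  # illustrative parameters to optimize
    sgd = fluid.optimizer.SGD(
        learning_rate=fluid.dygraph.NoamDecay(
            d_model=0.01, warmup_steps=200, learning_rate=2.0),
        parameter_list=emb.parameters())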
@@ -49,7 +49,7 @@ def _decay_step_counter(begin=0):
return global_step
def noam_decay(d_model, warmup_steps):
def noam_decay(d_model, warmup_steps, learning_rate=1.0):
"""
Noam decay method. The numpy implementation of noam decay as follows.
@@ -58,11 +58,12 @@ def noam_decay(d_model, warmup_steps):
import paddle.fluid as fluid
import numpy as np
# set hyper parameters
base_lr = 0.01
d_model = 2
current_steps = 20
warmup_steps = 200
# compute
lr_value = np.power(d_model, -0.5) * np.min([
lr_value = base_lr * np.power(d_model, -0.5) * np.min([
np.power(current_steps, -0.5),
np.power(warmup_steps, -1.5) * current_steps])
@@ -74,6 +75,10 @@ def noam_decay(d_model, warmup_steps):
warmup_steps(Variable): A hyperparameter.
learning_rate(Variable|float|int): The initial learning rate. If the type
is Variable, it is a tensor with shape [1] and its data type can be
float32 or float64. It can also be set to a Python int or float. The default value is 1.0.
Returns:
The decayed learning rate.
Examples:
@@ -84,18 +89,21 @@ def noam_decay(d_model, warmup_steps):
learning_rate = 0.01
lr = fluid.layers.learning_rate_scheduler.noam_decay(
1/(warmup_steps *(learning_rate ** 2)),
warmup_steps)
warmup_steps,
learning_rate)
"""
with default_main_program()._lr_schedule_guard():
if in_dygraph_mode():
decay = imperate_lr.NoamDecay(d_model, warmup_steps)
decay = imperate_lr.NoamDecay(
d_model, warmup_steps, learning_rate=learning_rate)
return decay
else:
global_step = _decay_step_counter(1)
a = global_step**-0.5
b = (warmup_steps**-1.5) * global_step
lr_value = (d_model**-0.5) * nn.elementwise_min(a, b)
lr_value = learning_rate * (d_model**-0.5) * nn.elementwise_min(a,
b)
return lr_value
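A rough static-graph sketch, assuming the usual Executor workflow (place, step count, and hyperparameter values are illustrative): each run of the main program advances the step counter created by _decay_step_counter, so the fetched value follows the scaled Noam schedule.

import paddle.fluid as fluid

lr = fluid.layers.noam_decay(d_model=0.01, warmup_steps=200, learning_rate=2.0)
exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())
for _ in range(3):
    lr_val, = exe.run(fluid.default_main_program(), fetch_list=[lr])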
@@ -89,6 +89,34 @@ def cosine_decay(global_step, learning_rate, step_each_epoch, epochs):
return decayed_lr
def noam_decay(global_step, d_model, warmup_steps, learning_rate=1.0):
a = math.pow(global_step, -0.5)
b = math.pow(warmup_steps, -1.5) * global_step
decayed_lr = learning_rate * math.pow(d_model, -0.5) * min(a, b)
return decayed_lr
class TestNoamLearningRateDecayDygraphMode(unittest.TestCase):
def test_dygraph_mode(self):
with fluid.dygraph.guard():
d_model = 0.01
warmup_steps = 200
learning_rate = 2.0
lr = fluid.layers.noam_decay(d_model, warmup_steps, learning_rate)
for step in range(5):
step += 1
right_result = noam_decay(step, d_model, warmup_steps,
learning_rate)
fluid_result = lr()
self.assertAlmostEqual(
right_result,
fluid_result[0],
msg='Failed lr scheduler in step {0}, Python result is {1}, Fluid result is {2}'.
format(step, right_result, fluid_result[0]))
class TestLearningRateDecay(unittest.TestCase):
def check_decay(self, python_decay_fn, fluid_decay_fn, kwargs):
places = [fluid.CPUPlace()]
@@ -112,6 +140,9 @@ class TestLearningRateDecay(unittest.TestCase):
exe.run(startup_prog)
for step in range(10):
# Step of NoamDecay starts from 1.
if python_decay_fn.__name__ == 'noam_decay':
step += 1
lr_val, = exe.run(main_prog, feed={}, fetch_list=[decayed_lr])
python_decayed_lr = python_decay_fn(
global_step=float(step), **kwargs)
@@ -159,6 +190,11 @@ class TestLearningRateDecay(unittest.TestCase):
"step_each_epoch": 100,
"epochs": 120
}),
(noam_decay, layers.noam_decay, {
"d_model": 0.01,
"warmup_steps": 200,
"learning_rate": 2.0
}),
]
for py_decay_fn, fluid_decay_fn, kwargs in decay_fns:
@@ -195,6 +231,9 @@ class TestLinearWamrupLearningRateDecay(TestLearningRateDecay):
exe.run(startup_prog)
for step in range(20):
# Step of NoamDecay starts from 1.
if fluid_decay_fn.__name__ == 'noam_decay':
step += 1
lr_val, = exe.run(main_prog, feed={}, fetch_list=[decayed_lr])
if step < warmup_steps:
python_decayed_lr = linear_lr_warmup(