Unverified commit d0f8cbcc, authored by Zeyu Chen, committed by GitHub

Merge pull request #36 from Steffy-zxf/add-linear-warmup-decay-function

Fix the missing definition of the linear_warmup_decay function
@@ -19,6 +19,8 @@ from __future__ import print_function
 import numpy as np
 import paddle.fluid as fluid
+import paddle.fluid.layers.learning_rate_scheduler as lr_scheduler
+from paddle.fluid.layers import control_flow
 
 
 def adam_weight_decay_optimization(loss,
@@ -35,7 +37,7 @@ def adam_weight_decay_optimization(loss,
                                        warmup_steps)
     elif scheduler == 'linear_decay':
         scheduled_lr = linear_warmup_decay(learning_rate, warmup_steps,
-                                           num_train_steps)
+                                           main_program)
     else:
         raise ValueError("Unkown learning rate scheduler, should be "
                          "'noam_decay' or 'linear_decay'")
@@ -76,3 +78,26 @@ def adam_weight_decay_optimization(loss,
             fluid.layers.assign(output=param, input=updated_param)
 
     return scheduled_lr
+
+
+def linear_warmup_decay(init_lr, num_warmup_steps, main_program):
+    with main_program._lr_schedule_guard():
+        global_step = lr_scheduler._decay_step_counter()
+
+        lr = fluid.layers.create_global_var(
+            shape=[1],
+            value=0.0,
+            dtype='float32',
+            persistable=True,
+            name="learning_rate")
+
+        with control_flow.Switch() as switch:
+            with switch.case(global_step < num_warmup_steps):
+                decayed_lr = init_lr * global_step * 1.0 / num_warmup_steps
+                fluid.layers.assign(decayed_lr, lr)
+            with switch.default():
+                last_value_var = fluid.layers.fill_constant(
+                    shape=[1], dtype='float32', value=float(init_lr))
+                fluid.layers.assign(last_value_var, lr)
+
+        return lr
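For reference, the schedule added above ramps the learning rate linearly from 0 to init_lr over the warmup steps and then holds it constant; despite the name, no decay is applied after warmup in this version. Below is a minimal plain-Python sketch (not part of the PR) of that piecewise rule; the helper name and the numbers in the comments are illustrative only.

    def sketch_linear_warmup(init_lr, num_warmup_steps, global_step):
        """Plain-Python mirror of the rule linear_warmup_decay builds in the Paddle graph."""
        if global_step < num_warmup_steps:
            # linear ramp: 0 -> init_lr over the warmup phase
            return init_lr * global_step * 1.0 / num_warmup_steps
        # after warmup the rate stays at init_lr (no decay branch here)
        return float(init_lr)

    # e.g. with init_lr=5e-5 and 100 warmup steps:
    # step 0 -> 0.0, step 50 -> 2.5e-05, step 100 and later -> 5e-05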
@@ -89,7 +89,7 @@ class AdamWeightDecayStrategy(DefaultStrategy):
     def __init__(self,
                  learning_rate=1e-4,
                  lr_scheduler="linear_decay",
-                 warmup_proportion=0.0,
+                 warmup_proportion=0.1,
                  weight_decay=0.01,
                  optimizer_name="adam"):
         super(AdamWeightDecayStrategy, self).__init__(
@@ -117,6 +117,12 @@ class AdamWeightDecayStrategy(DefaultStrategy):
     def execute(self, loss, main_program, data_reader, config):
         # calculate wamrup step
         dev_count = self._get_dev_count(config)
+        data_reader.data_generator(
+            batch_size=config.batch_size, phase='train', shuffle=True)
+        data_reader.data_generator(
+            batch_size=config.batch_size, phase='val', shuffle=False)
+        data_reader.data_generator(
+            batch_size=config.batch_size, phase='dev', shuffle=False)
         num_train_examples = data_reader.get_num_examples(phase='train')
         max_train_steps = config.num_epoch * num_train_examples // config.batch_size // dev_count
         warmup_steps = int(max_train_steps * self.warmup_proportion)
...
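As a quick sanity check of the warmup arithmetic in execute above, here is a hypothetical run in plain Python; the dataset size, batch size, epoch count, and device count are made-up values, not taken from the PR.

    # assumed values for illustration only
    num_train_examples = 10000
    batch_size = 32
    num_epoch = 3
    dev_count = 1
    warmup_proportion = 0.1   # the new default in AdamWeightDecayStrategy

    max_train_steps = num_epoch * num_train_examples // batch_size // dev_count
    warmup_steps = int(max_train_steps * warmup_proportion)
    print(max_train_steps, warmup_steps)   # 937 93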