Unverified · Commit fee616e8 · Authored by Jiaqi Liu, committed by GitHub

Add some lr schedulers with warmup (#5256)

* add -linear-schedule-with-warmup

* update lr scheduler usage in run_glue

* add import info

* add other 5 scheduler with warmup

* add copyright, update usage in run_glue

* simplify argument, make warmup arg support float and int

* Add some LambdaDecay scheduler with warmup, update usage in run_glue

* update classname and unify two cosine decays into one

* update usage in run_glue

* fix typo

* update WarmUp to Warmup, and update class name about Const, and update doc, and usage

* update usage of decay class

* update usage of decay class
Parent a22fa4b3
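The recurring change in the diffs below: the inlined `paddle.optimizer.lr.LambdaDecay` lambda (linear warmup followed by linear decay) is replaced by a single `LinearDecayWithWarmup` call. A condensed, self-contained sketch of the two forms; the variable names mirror the training scripts below and the concrete values are placeholders:

import paddle
from paddlenlp.transformers import LinearDecayWithWarmup

# Placeholder values; in the scripts below these come from command-line args.
learning_rate, num_training_steps, warmup_steps = 5e-5, 1000, 100

# Before: warmup + linear decay written out as a LambdaDecay lambda
lr_scheduler = paddle.optimizer.lr.LambdaDecay(
    learning_rate,
    lambda step: float(step) / max(1, warmup_steps)
    if step < warmup_steps else max(
        0.0, float(num_training_steps - step) /
        max(1, num_training_steps - warmup_steps)))

# After: the same schedule through the new scheduler class; the warmup
# argument may be an int (number of steps) or a float (proportion of steps).
lr_scheduler = LinearDecayWithWarmup(
    learning_rate, num_training_steps, warmup_steps)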
......@@ -29,6 +29,7 @@ from paddlenlp.data import Stack, Tuple, Pad
from paddlenlp.data.sampler import SamplerHelper
from paddlenlp.transformers import BertForSequenceClassification, BertTokenizer
from paddlenlp.transformers import ErnieForSequenceClassification, ErnieTokenizer
from paddlenlp.transformers import LinearDecayWithWarmup
from paddlenlp.metrics import Mcc, PearsonAndSpearman
from paddlenlp.utils.log import logger
......@@ -381,17 +382,11 @@ def do_train(args):
# Create the training-backward program, this pass will not be
# executed in the validation
num_training_steps = args.max_steps if args.max_steps > 0 else len(
train_data_loader) * args.num_train_epochs
with paddle.static.program_guard(main_program, startup_program):
lr_scheduler = paddle.optimizer.lr.LambdaDecay(
args.learning_rate,
lambda current_step, num_warmup_steps=args.warmup_steps,
num_training_steps=args.max_steps if args.max_steps > 0 else
(len(train_data_loader) * args.num_train_epochs): float(
current_step) / float(max(1, num_warmup_steps))
if current_step < num_warmup_steps else max(
0.0,
float(num_training_steps - current_step) / float(
max(1, num_training_steps - num_warmup_steps))))
lr_scheduler = LinearDecayWithWarmup(
args.learning_rate, num_training_steps, args.warmup_steps)
optimizer = paddle.optimizer.AdamW(
learning_rate=lr_scheduler,
epsilon=args.adam_epsilon,
......
......@@ -31,6 +31,7 @@ from paddle.io import DataLoader, Dataset
from paddlenlp.transformers import BertForPretraining, BertModel, BertPretrainingCriterion
from paddlenlp.transformers import BertTokenizer
from paddlenlp.transformers import LinearDecayWithWarmup
from data import create_data_holder, create_pretraining_dataset
MODEL_CLASSES = {"bert": (BertForPretraining, BertTokenizer)}
......@@ -232,7 +233,8 @@ def dist_optimizer(args, optimizer):
if args.use_amp:
dist_strategy.amp = True
custom_black_list = ['lookup_table', 'lookup_table_v2'] if args.use_pure_fp16 else None
custom_black_list = ['lookup_table',
'lookup_table_v2'] if args.use_pure_fp16 else None
dist_strategy.amp_configs = {
'custom_white_list': ['softmax', 'layer_norm', 'gelu'],
'init_loss_scaling': args.scale_loss,
......@@ -305,16 +307,11 @@ def do_train(args):
masked_lm_labels, next_sentence_labels, masked_lm_scale)
# Define the dynamic learning rate scheduler and optimizer
lr_scheduler = paddle.optimizer.lr.LambdaDecay(
args.learning_rate,
lambda current_step, num_warmup_steps=args.warmup_steps,
num_training_steps=args.max_steps if args.max_steps > 0 else
(len(train_data_loader) * args.num_train_epochs): float(
current_step) / float(max(1, num_warmup_steps))
if current_step < num_warmup_steps else max(
0.0,
float(num_training_steps - current_step) / float(
max(1, num_training_steps - num_warmup_steps))))
num_training_steps = args.max_steps if args.max_steps > 0 else len(
train_data_loader) * args.num_train_epochs
lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
args.warmup_steps)
optimizer = paddle.optimizer.AdamW(
learning_rate=lr_scheduler,
......@@ -327,7 +324,7 @@ def do_train(args):
],
multi_precision=args.use_pure_fp16)
if worker_num == 1 and args.use_amp:
custom_black_list=(['lookup_table', 'lookup_table_v2']
custom_black_list = (['lookup_table', 'lookup_table_v2']
if args.use_pure_fp16 else None)
amp_list = paddle.static.amp.AutoMixedPrecisionLists(
custom_white_list=['softmax', 'layer_norm', 'gelu'],
......
......@@ -28,6 +28,7 @@ from paddle.io import DataLoader, Dataset
from paddlenlp.transformers import BertForPretraining, BertModel, BertPretrainingCriterion
from paddlenlp.transformers import ErnieForPretraining, ErnieModel, ErniePretrainingCriterion
from paddlenlp.transformers import BertTokenizer, ErnieTokenizer
from paddlenlp.transformers import LinearDecayWithWarmup
from data import create_data_holder, create_pretraining_dataset
MODEL_CLASSES = {
......@@ -222,16 +223,11 @@ def do_train(args):
masked_lm_labels, next_sentence_labels, masked_lm_scale)
# Define the dynamic learning rate scheduler and optimizer
lr_scheduler = paddle.optimizer.lr.LambdaDecay(
args.learning_rate,
lambda current_step, num_warmup_steps=args.warmup_steps,
num_training_steps=args.max_steps if args.max_steps > 0 else
(len(train_data_loader) * args.num_train_epochs): float(
current_step) / float(max(1, num_warmup_steps))
if current_step < num_warmup_steps else max(
0.0,
float(num_training_steps - current_step) / float(
max(1, num_training_steps - num_warmup_steps))))
num_training_steps = args.max_steps if args.max_steps > 0 else len(
train_data_loader) * args.num_train_epochs
lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
args.warmup_steps)
optimizer = paddle.optimizer.AdamW(
learning_rate=lr_scheduler,
......@@ -244,7 +240,7 @@ def do_train(args):
],
multi_precision=args.use_pure_fp16)
if args.use_amp:
custom_black_list=(['lookup_table', 'lookup_table_v2']
custom_black_list = (['lookup_table', 'lookup_table_v2']
if args.use_pure_fp16 else None)
amp_list = paddle.static.amp.AutoMixedPrecisionLists(
custom_white_list=['layer_norm', 'softmax', 'gelu'],
......
......@@ -9,13 +9,13 @@ import paddle.nn as nn
import paddle.nn.functional as F
import paddle.distributed as dist
from paddle.io import DataLoader, DistributedBatchSampler, BatchSampler
from paddle.optimizer.lr import LambdaDecay
from paddle.optimizer import AdamW
from paddle.metric import Accuracy
from paddlenlp.datasets import MapDatasetWrapper
from paddlenlp.data import Stack, Tuple, Pad
from paddlenlp.transformers import BertTokenizer, BertForSequenceClassification, BertForTokenClassification
from paddlenlp.transformers import LinearDecayWithWarmup
from args import parse_args, set_default_args
import data
......@@ -54,14 +54,6 @@ def save_ckpt(model, optimizer, output_dir, name):
paddle.save(optimizer.state_dict(), opt_path)
def compute_lr_factor(current_step, warmup_steps, max_train_steps):
if current_step < warmup_steps:
factor = float(current_step) / warmup_steps
else:
factor = 1 - float(current_step) / max_train_steps
return factor
class DGULossFunction(nn.Layer):
def __init__(self, task_name):
super(DGULossFunction, self).__init__()
......@@ -117,16 +109,14 @@ def print_logs(args, step, logits, labels, loss, total_time, metric):
def train(args, model, train_data_loader, dev_data_loader, metric, rank):
num_examples = len(train_data_loader) * args.batch_size * args.n_gpu
max_train_steps = args.epochs * len(train_data_loader)
warmup_steps = int(max_train_steps * args.warmup_proportion)
if rank == 0:
print("Num train examples: %d" % num_examples)
print("Max train steps: %d" % max_train_steps)
print("Num warmup steps: %d" % warmup_steps)
factor_fn = partial(
compute_lr_factor,
warmup_steps=warmup_steps,
max_train_steps=max_train_steps)
lr_scheduler = LambdaDecay(args.learning_rate, factor_fn)
print("Warmup proportion: %d" % args.warmup_proportion)
lr_scheduler = LinearDecayWithWarmup(args.learning_rate, max_train_steps,
args.warmup_proportion)
optimizer = AdamW(
learning_rate=lr_scheduler,
parameters=model.parameters(),
......
......@@ -32,6 +32,7 @@ from paddlenlp.data.sampler import SamplerHelper
from paddlenlp.transformers import BertForSequenceClassification, BertTokenizer
from paddlenlp.transformers import ElectraForSequenceClassification, ElectraTokenizer
from paddlenlp.transformers import ErnieForSequenceClassification, ErnieTokenizer
from paddlenlp.transformers import LinearDecayWithWarmup
from paddlenlp.metrics import AccuracyAndF1, Mcc, PearsonAndSpearman
FORMAT = '%(asctime)s-%(levelname)s: %(message)s'
......@@ -355,17 +356,10 @@ def do_train(args):
num_training_steps = args.max_steps if args.max_steps > 0 else (
len(train_data_loader) * args.num_train_epochs)
warmup_steps = args.warmup_steps if args.warmup_steps > 0 else (
int(math.floor(num_training_steps * args.warmup_proportion)))
lr_scheduler = paddle.optimizer.lr.LambdaDecay(
args.learning_rate,
lambda current_step, num_warmup_steps=warmup_steps,
num_training_steps=num_training_steps : float(
current_step) / float(max(1, num_warmup_steps))
if current_step < num_warmup_steps else max(
0.0,
float(num_training_steps - current_step) / float(
max(1, num_training_steps - num_warmup_steps))))
warmup = args.warmup_steps if args.warmup_steps > 0 else args.warmup_proportion
lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
warmup)
optimizer = paddle.optimizer.AdamW(
learning_rate=lr_scheduler,
......
......@@ -32,6 +32,7 @@ from paddlenlp.data import Stack, Tuple, Pad
from paddlenlp.transformers import BertForSequenceClassification, BertTokenizer
from paddlenlp.transformers import ElectraForSequenceClassification, ElectraTokenizer
from paddlenlp.transformers import ErnieForSequenceClassification, ErnieTokenizer
from paddlenlp.transformers import LinearDecayWithWarmup
from paddlenlp.metrics import AccuracyAndF1, Mcc, PearsonAndSpearman
FORMAT = '%(asctime)s-%(levelname)s: %(message)s'
......@@ -162,11 +163,13 @@ def parse_args():
type=str,
default="gpu",
help="Device for selecting for the training.")
parser.add_argument("--use_amp",
parser.add_argument(
"--use_amp",
type=distutils.util.strtobool,
default=False,
help="Enable mixed precision training.")
parser.add_argument("--scale_loss",
parser.add_argument(
"--scale_loss",
type=float,
default=2**15,
help="The value of scale_loss for fp16.")
......@@ -360,17 +363,10 @@ def do_train(args):
num_training_steps = args.max_steps if args.max_steps > 0 else (
len(train_data_loader) * args.num_train_epochs)
warmup_steps = args.warmup_steps if args.warmup_steps > 0 else (
int(math.floor(num_training_steps * args.warmup_proportion)))
lr_scheduler = paddle.optimizer.lr.LambdaDecay(
args.learning_rate,
lambda current_step, num_warmup_steps=warmup_steps,
num_training_steps=num_training_steps : float(
current_step) / float(max(1, num_warmup_steps))
if current_step < num_warmup_steps else max(
0.0,
float(num_training_steps - current_step) / float(
max(1, num_training_steps - num_warmup_steps))))
warmup = args.warmup_steps if args.warmup_steps > 0 else args.warmup_proportion
lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
warmup)
optimizer = paddle.optimizer.AdamW(
learning_rate=lr_scheduler,
......
......@@ -34,6 +34,7 @@ from paddlenlp.data import Stack, Tuple, Pad
from paddlenlp.transformers import BertForPretraining, BertModel, BertPretrainingCriterion
from paddlenlp.transformers import ErnieForPretraining, ErnieModel, ErniePretrainingCriterion
from paddlenlp.transformers import BertTokenizer, ErnieTokenizer
from paddlenlp.transformers import LinearDecayWithWarmup
FORMAT = '%(asctime)s-%(levelname)s: %(message)s'
logging.basicConfig(level=logging.INFO, format=FORMAT)
......@@ -147,11 +148,13 @@ def parse_args():
type=str,
default="gpu",
help="Device for selecting for the training.")
parser.add_argument("--use_amp",
parser.add_argument(
"--use_amp",
type=distutils.util.strtobool,
default=False,
help="Enable mixed precision training.")
parser.add_argument("--scale_loss",
parser.add_argument(
"--scale_loss",
type=float,
default=2**15,
help="The value of scale_loss for fp16.")
......@@ -301,17 +304,11 @@ def do_train(args):
# If the default last_epoch is used, lr of the first iteration is 0.
# Use `last_epoch = 0` to be consistent with nv bert.
lr_scheduler = paddle.optimizer.lr.LambdaDecay(
args.learning_rate,
lambda current_step, num_warmup_steps=args.warmup_steps,
num_training_steps=args.max_steps if args.max_steps > 0 else
(len(train_data_loader) * args.num_train_epochs): float(
current_step) / float(max(1, num_warmup_steps))
if current_step < num_warmup_steps else max(
0.0,
float(num_training_steps - current_step) / float(
max(1, num_training_steps - num_warmup_steps))),
last_epoch=0)
num_training_steps = args.max_steps if args.max_steps > 0 else len(
train_data_loader) * args.num_train_epochs
lr_scheduler = LinearDecayWithWarmup(
args.learning_rate, num_training_steps, args.warmup_steps, last_epoch=0)
optimizer = paddle.optimizer.AdamW(
learning_rate=lr_scheduler,
......
......@@ -31,6 +31,7 @@ from paddlenlp.datasets import GlueCoLA, GlueSST2, GlueMRPC, GlueSTSB, GlueQQP,
from paddlenlp.data import Stack, Tuple, Pad
from paddlenlp.data.sampler import SamplerHelper
from paddlenlp.transformers import ElectraForSequenceClassification, ElectraTokenizer
from paddlenlp.transformers import LinearDecayWithWarmup
from paddlenlp.utils.log import logger
from paddlenlp.metrics import AccuracyAndF1, Mcc, PearsonAndSpearman
......@@ -226,16 +227,9 @@ def do_train(args):
num_training_steps = args.max_steps if args.max_steps > 0 else (
len(train_data_loader) * args.num_train_epochs)
warmup_steps = int(math.floor(num_training_steps * args.warmup_proportion))
lr_scheduler = paddle.optimizer.lr.LambdaDecay(
args.learning_rate,
lambda current_step, num_warmup_steps=warmup_steps,
num_training_steps=num_training_steps : float(
current_step) / float(max(1, num_warmup_steps))
if current_step < num_warmup_steps else max(
0.0,
float(num_training_steps - current_step) / float(
max(1, num_training_steps - num_warmup_steps))))
lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
args.warmup_proportion)
optimizer = paddle.optimizer.AdamW(
learning_rate=lr_scheduler,
......
......@@ -32,6 +32,7 @@ from paddle.io import DataLoader, Dataset
from paddlenlp.transformers import ElectraForTotalPretraining, ElectraModel, ElectraPretrainingCriterion
from paddlenlp.transformers import ElectraDiscriminator, ElectraGenerator
from paddlenlp.transformers import ElectraTokenizer
from paddlenlp.transformers import LinearDecayWithWarmup
FORMAT = '%(asctime)s-%(levelname)s: %(message)s'
logging.basicConfig(level=logging.INFO, format=FORMAT)
......@@ -470,15 +471,9 @@ def do_train(args):
num_training_steps = args.max_steps if args.max_steps > 0 else (
len(train_data_loader) * args.num_train_epochs)
lr_scheduler = paddle.optimizer.lr.LambdaDecay(
args.learning_rate,
lambda current_step, num_warmup_steps=args.warmup_steps,
num_training_steps=num_training_steps: float(
current_step) / float(max(1, num_warmup_steps))
if current_step < num_warmup_steps else max(
0.0,
float(num_training_steps - current_step) / float(
max(1, num_training_steps - num_warmup_steps))))
lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
args.warmup_steps)
clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
optimizer = paddle.optimizer.AdamW(
......
......@@ -29,6 +29,7 @@ import paddlenlp as ppnlp
from paddlenlp.datasets import SQuAD, DuReaderRobust, CMRC, DRCD
from paddlenlp.data import Pad, Stack, Tuple
from paddlenlp.transformers import BertForQuestionAnswering, BertTokenizer, ErnieForQuestionAnswering, ErnieTokenizer
from paddlenlp.transformers import LinearDecayWithWarmup
from paddlenlp.metrics.squad import squad_evaluate, compute_predictions
TASK_CLASSES = {"dureader-robust": DuReaderRobust, "cmrc": CMRC, "drcd": DRCD}
......@@ -177,16 +178,11 @@ def do_train(args):
if paddle.distributed.get_world_size() > 1:
model = paddle.DataParallel(model)
lr_scheduler = paddle.optimizer.lr.LambdaDecay(
args.learning_rate,
lambda current_step, warmup_proportion=args.warmup_proportion,
num_training_steps=args.max_steps if args.max_steps > 0 else
(len(train_data_loader)*args.num_train_epochs): float(
current_step) / float(max(1, warmup_proportion*num_training_steps))
if current_step < warmup_proportion*num_training_steps else max(
0.0,
float(num_training_steps - current_step) / float(
max(1, num_training_steps - warmup_proportion*num_training_steps))))
num_training_steps = args.max_steps if args.max_steps > 0 else len(
train_data_loader) * args.num_train_epochs
lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
args.warmup_proportion)
optimizer = paddle.optimizer.AdamW(
learning_rate=lr_scheduler,
......
......@@ -28,6 +28,7 @@ import paddlenlp as ppnlp
from paddlenlp.data import Pad, Stack, Tuple
from paddlenlp.transformers import BertForSequenceClassification, BertTokenizer
from paddlenlp.transformers import LinearDecayWithWarmup
MODEL_CLASSES = {"bert": (BertForSequenceClassification, BertTokenizer)}
......@@ -210,16 +211,11 @@ def do_train(args):
if paddle.distributed.get_world_size() > 1:
model = paddle.DataParallel(model)
lr_scheduler = paddle.optimizer.lr.LambdaDecay(
args.learning_rate,
lambda current_step, warmup_proportion=args.warmup_proportion,
num_training_steps=args.max_steps if args.max_steps > 0 else
(len(train_ds.examples)//args.batch_size*args.num_train_epochs): float(
current_step) / float(max(1, warmup_proportion*num_training_steps))
if current_step < warmup_proportion*num_training_steps else max(
0.0,
float(num_training_steps - current_step) / float(
max(1, num_training_steps - warmup_proportion*num_training_steps))))
num_training_steps = args.max_steps if args.max_steps > 0 else len(
train_ds.examples) // args.batch_size * args.num_train_epochs
lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
args.warmup_proportion)
optimizer = paddle.optimizer.AdamW(
learning_rate=lr_scheduler,
......
......@@ -28,6 +28,7 @@ import paddlenlp as ppnlp
from paddlenlp.data import Pad, Stack, Tuple
from paddlenlp.transformers import BertForQuestionAnswering, BertTokenizer, ErnieForQuestionAnswering, ErnieTokenizer
from paddlenlp.transformers import LinearDecayWithWarmup
from paddlenlp.metrics.dureader import dureader_evaluate, compute_predictions
MODEL_CLASSES = {
......@@ -168,16 +169,11 @@ def do_train(args):
if paddle.distributed.get_world_size() > 1:
model = paddle.DataParallel(model)
lr_scheduler = paddle.optimizer.lr.LambdaDecay(
args.learning_rate,
lambda current_step, warmup_proportion=args.warmup_proportion,
num_training_steps=args.max_steps if args.max_steps > 0 else
(len(train_dataset.examples)//args.batch_size*args.num_train_epochs): float(
current_step) / float(max(1, warmup_proportion*num_training_steps))
if current_step < warmup_proportion*num_training_steps else max(
0.0,
float(num_training_steps - current_step) / float(
max(1, num_training_steps - warmup_proportion*num_training_steps))))
num_training_steps = args.max_steps if args.max_steps > 0 else len(
train_dataset.examples) // args.batch_size * args.num_train_epochs
lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
args.warmup_proportion)
optimizer = paddle.optimizer.AdamW(
learning_rate=lr_scheduler,
......
......@@ -28,6 +28,7 @@ import paddlenlp as ppnlp
from paddlenlp.data import Pad, Stack, Tuple
from paddlenlp.transformers import BertForQuestionAnswering, BertTokenizer, ErnieForQuestionAnswering, ErnieTokenizer
from paddlenlp.transformers import LinearDecayWithWarmup
from paddlenlp.metrics.squad import squad_evaluate, compute_predictions
MODEL_CLASSES = {
......@@ -168,16 +169,11 @@ def do_train(args):
if paddle.distributed.get_world_size() > 1:
model = paddle.DataParallel(model)
lr_scheduler = paddle.optimizer.lr.LambdaDecay(
args.learning_rate,
lambda current_step, warmup_proportion=args.warmup_proportion,
num_training_steps=args.max_steps if args.max_steps > 0 else
(len(train_dataset.examples)//args.batch_size*args.num_train_epochs): float(
current_step) / float(max(1, warmup_proportion*num_training_steps))
if current_step < warmup_proportion*num_training_steps else max(
0.0,
float(num_training_steps - current_step) / float(
max(1, num_training_steps - warmup_proportion*num_training_steps))))
num_training_steps = args.max_steps if args.max_steps > 0 else len(
train_dataset.examples) // args.batch_size * args.num_train_epochs
lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
args.warmup_proportion)
optimizer = paddle.optimizer.AdamW(
learning_rate=lr_scheduler,
......
......@@ -28,6 +28,7 @@ from paddle.metric import Metric, Accuracy, Precision, Recall
from paddlenlp.data import Stack, Tuple, Pad
from paddlenlp.data.sampler import SamplerHelper
from paddlenlp.transformers import BertModel, BertForSequenceClassification, BertTokenizer
from paddlenlp.transformers import LinearDecayWithWarmup
from paddlenlp.utils.log import logger
from paddlenlp.metrics import AccuracyAndF1, Mcc, PearsonAndSpearman
import paddlenlp.datasets as datasets
......@@ -459,16 +460,11 @@ def do_train(args):
num_heads=model.bert.config['num_attention_heads'])
reorder_neuron_head(ofa_model.model, head_importance, neuron_importance)
lr_scheduler = paddle.optimizer.lr.LambdaDecay(
args.learning_rate,
lambda current_step, num_warmup_steps=args.warmup_steps,
num_training_steps=args.max_steps if args.max_steps > 0 else
(len(train_data_loader) * args.num_train_epochs): float(
current_step) / float(max(1, num_warmup_steps))
if current_step < num_warmup_steps else max(
0.0,
float(num_training_steps - current_step) / float(
max(1, num_training_steps - num_warmup_steps))))
num_training_steps = args.max_steps if args.max_steps > 0 else len(
train_data_loader) * args.num_train_epochs
lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
args.warmup_steps)
optimizer = paddle.optimizer.AdamW(
learning_rate=lr_scheduler,
......
......@@ -27,6 +27,7 @@ import paddlenlp as ppnlp
from paddlenlp.datasets import MSRA_NER
from paddlenlp.data import Stack, Tuple, Pad
from paddlenlp.transformers import BertForTokenClassification, BertTokenizer
from paddlenlp.transformers import LinearDecayWithWarmup
from paddlenlp.metrics import ChunkEvaluator
......@@ -291,16 +292,11 @@ def do_train(args):
if paddle.distributed.get_world_size() > 1:
model = paddle.DataParallel(model)
lr_scheduler = paddle.optimizer.lr.LambdaDecay(
args.learning_rate,
lambda current_step, num_warmup_steps=args.warmup_steps,
num_training_steps=args.max_steps if args.max_steps > 0 else
(len(train_data_loader) * args.num_train_epochs): float(
current_step) / float(max(1, num_warmup_steps))
if current_step < num_warmup_steps else max(
0.0,
float(num_training_steps - current_step) / float(
max(1, num_training_steps - num_warmup_steps))))
num_training_steps = args.max_steps if args.max_steps > 0 else len(
train_data_loader) * args.num_train_epochs
lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
args.warmup_steps)
optimizer = paddle.optimizer.AdamW(
learning_rate=lr_scheduler,
......
......@@ -22,8 +22,9 @@ import numpy as np
import paddle
import paddle.nn.functional as F
from paddlenlp.data import Stack, Tuple, Pad
import paddlenlp as ppnlp
from paddlenlp.data import Stack, Tuple, Pad
from paddlenlp.transformers import LinearDecayWithWarmup
# yapf: disable
parser = argparse.ArgumentParser()
......@@ -214,19 +215,10 @@ def do_train():
model = paddle.DataParallel(model)
num_training_steps = len(train_data_loader) * args.epochs
num_warmup_steps = int(args.warmup_proportion * num_training_steps)
def get_lr_factor(current_step):
if current_step < num_warmup_steps:
return float(current_step) / float(max(1, num_warmup_steps))
else:
return max(0.0,
float(num_training_steps - current_step) /
float(max(1, num_training_steps - num_warmup_steps)))
lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
args.warmup_proportion)
lr_scheduler = paddle.optimizer.lr.LambdaDecay(
args.learning_rate,
lr_lambda=lambda current_step: get_lr_factor(current_step))
optimizer = paddle.optimizer.AdamW(
learning_rate=lr_scheduler,
parameters=model.parameters(),
......
......@@ -10,6 +10,7 @@ import paddle.nn as nn
from paddle.io import DataLoader
from paddlenlp.transformers import ErnieForGeneration
from paddlenlp.transformers import ErnieTokenizer, ErnieTinyTokenizer, BertTokenizer, ElectraTokenizer, RobertaTokenizer
from paddlenlp.transformers import LinearDecayWithWarmup
from paddlenlp.datasets import Poetry
from paddlenlp.data import Stack, Tuple, Pad
from paddlenlp.metrics import Rouge1, Rouge2
......@@ -175,16 +176,10 @@ def train():
if paddle.distributed.get_world_size() > 1:
model = paddle.DataParallel(model)
max_steps = (len(train_data_loader) * args.num_epochs)
lr_scheduler = paddle.optimizer.lr.LambdaDecay(
args.learning_rate,
lambda current_step, num_warmup_steps=max_steps*args.warmup_proportion,
num_training_steps=max_steps: float(
current_step) / float(max(1, num_warmup_steps))
if current_step < num_warmup_steps else max(
0.0,
float(num_training_steps - current_step) / float(
max(1, num_training_steps - num_warmup_steps))))
max_steps = len(train_data_loader) * args.num_epochs
lr_scheduler = LinearDecayWithWarmup(args.learning_rate, max_steps,
args.warmup_proportion)
optimizer = paddle.optimizer.AdamW(
learning_rate=lr_scheduler,
......
......@@ -22,8 +22,9 @@ import numpy as np
import paddle
import paddle.nn.functional as F
from paddlenlp.data import Stack, Tuple, Pad
import paddlenlp as ppnlp
from paddlenlp.data import Stack, Tuple, Pad
from paddlenlp.transformers import LinearDecayWithWarmup
from model import SentenceTransformer
......@@ -231,19 +232,10 @@ def do_train():
model = paddle.DataParallel(model)
num_training_steps = len(train_data_loader) * args.epochs
num_warmup_steps = int(args.warmup_proption * num_training_steps)
def get_lr_factor(current_step):
if current_step < num_warmup_steps:
return float(current_step) / float(max(1, num_warmup_steps))
else:
return max(0.0,
float(num_training_steps - current_step) /
float(max(1, num_training_steps - num_warmup_steps)))
lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
args.warmup_proportion)
lr_scheduler = paddle.optimizer.lr.LambdaDecay(
args.learning_rate,
lr_lambda=lambda current_step: get_lr_factor(current_step))
optimizer = paddle.optimizer.AdamW(
learning_rate=lr_scheduler,
parameters=model.parameters(),
......
......@@ -27,6 +27,7 @@ from paddlenlp.datasets import GlueQNLI, GlueSST2
from paddlenlp.data import Stack, Tuple, Pad
from paddlenlp.data.sampler import SamplerHelper
from paddlenlp.transformers import BertForSequenceClassification, BertTokenizer
from paddlenlp.transformers import LinearDecayWithWarmup
FORMAT = '%(asctime)s-%(levelname)s: %(message)s'
logging.basicConfig(level=logging.INFO, format=FORMAT)
......@@ -332,16 +333,12 @@ def do_train(args):
# Create the training-backward program, this pass will not be
# executed in the validation
with paddle.static.program_guard(main_program, startup_program):
lr_scheduler = paddle.optimizer.lr.LambdaDecay(
args.learning_rate,
lambda current_step, num_warmup_steps=args.warmup_steps,
num_training_steps=args.max_steps if args.max_steps > 0 else
(len(train_data_loader) * args.num_train_epochs): float(
current_step) / float(max(1, num_warmup_steps))
if current_step < num_warmup_steps else max(
0.0,
float(num_training_steps - current_step) / float(
max(1, num_training_steps - num_warmup_steps))))
num_training_steps = args.max_steps if args.max_steps > 0 else len(
train_data_loader) * args.num_train_epochs
lr_scheduler = LinearDecayWithWarmup(
args.learning_rate, num_training_steps, args.warmup_steps)
optimizer = paddle.optimizer.AdamW(
learning_rate=lr_scheduler,
epsilon=args.adam_epsilon,
......
......@@ -28,6 +28,7 @@ from paddle.io import DataLoader, Dataset
from paddlenlp.transformers import BertForPretraining, BertModel, BertPretrainingCriterion
from paddlenlp.transformers import BertTokenizer
from paddlenlp.transformers import LinearDecayWithWarmup
from data import create_data_holder, create_pretraining_dataset
MODEL_CLASSES = {"bert": (BertForPretraining, BertTokenizer)}
......@@ -198,17 +199,11 @@ def do_train(args):
loss = criterion(prediction_scores, seq_relationship_score,
masked_lm_labels, next_sentence_labels, masked_lm_scale)
num_training_steps = args.max_steps if args.max_steps > 0 else len(
train_data_loader) * args.num_train_epochs
# Define the dynamic learning rate scheduler and optimizer
lr_scheduler = paddle.optimizer.lr.LambdaDecay(
args.learning_rate,
lambda current_step, num_warmup_steps=args.warmup_steps,
num_training_steps=args.max_steps if args.max_steps > 0 else
(len(train_data_loader) * args.num_train_epochs): float(
current_step) / float(max(1, num_warmup_steps))
if current_step < num_warmup_steps else max(
0.0,
float(num_training_steps - current_step) / float(
max(1, num_training_steps - num_warmup_steps))))
lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
args.warmup_steps)
optimizer = paddle.optimizer.AdamW(
learning_rate=lr_scheduler,
......
......@@ -27,6 +27,7 @@ import paddle
from paddle.io import DataLoader, Dataset
from paddlenlp.transformers import BertForPretraining, BertModel, BertPretrainingCriterion
from paddlenlp.transformers import BertTokenizer
from paddlenlp.transformers import LinearDecayWithWarmup
from data import create_data_holder, create_pretraining_dataset
MODEL_CLASSES = {"bert": (BertForPretraining, BertTokenizer)}
......@@ -210,17 +211,11 @@ def do_train(args):
loss = criterion(prediction_scores, seq_relationship_score,
masked_lm_labels, next_sentence_labels, masked_lm_scale)
num_training_steps = args.max_steps if args.max_steps > 0 else len(
train_data_loader) * args.num_train_epochs
# Define the dynamic learning rate scheduler and optimizer
lr_scheduler = paddle.optimizer.lr.LambdaDecay(
args.learning_rate,
lambda current_step, num_warmup_steps=args.warmup_steps,
num_training_steps=args.max_steps if args.max_steps > 0 else
(len(train_data_loader) * args.num_train_epochs): float(
current_step) / float(max(1, num_warmup_steps))
if current_step < num_warmup_steps else max(
0.0,
float(num_training_steps - current_step) / float(
max(1, num_training_steps - num_warmup_steps))))
lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
args.warmup_steps)
optimizer = paddle.optimizer.AdamW(
learning_rate=lr_scheduler,
......
......@@ -27,3 +27,4 @@ from .electra.modeling import *
from .electra.tokenizer import *
from .transformer.modeling import *
from .ernie_gen.modeling import ErnieForGeneration
from .optimization import *
......@@ -31,6 +31,7 @@ from paddlenlp.data import *
from paddlenlp.data.sampler import SamplerHelper
from paddlenlp.transformers.model_bert import *
from paddlenlp.transformers.tokenizer_bert import BertTokenizer
from paddlenlp.transformers import LinearDecayWithWarmup
TASK_CLASSES = {
"qnli": (GlueQNLI, paddle.metric.Accuracy), # (dataset, metric)
......@@ -307,16 +308,11 @@ def do_train(args):
if paddle.distributed.get_world_size() > 1:
model = paddle.DataParallel(model)
lr_scheduler = paddle.optimizer.lr.LambdaDecay(
args.learning_rate,
lambda current_step, num_warmup_steps=args.warmup_steps,
num_training_steps=args.max_steps if args.max_steps > 0 else
(len(train_data_loader) * args.num_train_epochs): float(
current_step) / float(max(1, num_warmup_steps))
if current_step < num_warmup_steps else max(
0.0,
float(num_training_steps - current_step) / float(
max(1, num_training_steps - num_warmup_steps))))
num_training_steps = args.max_steps if args.max_steps > 0 else len(
train_data_loader) * args.num_train_epochs
lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
args.warmup_steps)
optimizer = paddle.optimizer.AdamW(
learning_rate=lr_scheduler,
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import math
from paddle.optimizer.lr import LambdaDecay
__all__ = [
'LinearDecayWithWarmup', 'ConstScheduleWithWarmup', 'CosineDecayWithWarmup',
'PolyDecayWithWarmup'
]
def is_integer(number):
if sys.version > '3':
return isinstance(number, int)
return isinstance(number, (int, long))
class LinearDecayWithWarmup(LambdaDecay):
"""
Creates a learning rate scheduler that increases the learning rate linearly
from 0 to the given `learning_rate`. After this warmup period, the learning
rate decreases linearly from the base learning rate to 0.
Args:
learning_rate (float): The base learning rate. It is a python float
number.
total_steps (int): The number of training steps.
warmup (int|float): If int, it means the number of steps for warmup.
If float, it means the proportion of warmup in total training steps.
last_epoch (int, optional): The index of last epoch. It can be set to
restart training. If None, it means initial learning rate.
Default: -1.
verbose (bool, optional): If True, prints a message to stdout for each
update. Default: False.
Examples:
.. code-block:: python
from paddlenlp.transformers import LinearDecayWithWarmup
lr, warmup_steps, max_steps = 0.1, 100, 1000
lr_scheduler = LinearDecayWithWarmup(lr, max_steps, warmup_steps)
"""
def __init__(self,
learning_rate,
total_steps,
warmup,
last_epoch=-1,
verbose=False):
warmup_steps = warmup if is_integer(warmup) else int(
math.floor(warmup * total_steps))
def lr_lambda(current_step):
if current_step < warmup_steps:
return float(current_step) / float(max(1, warmup_steps))
return max(0.0,
float(total_steps - current_step) /
float(max(1, total_steps - warmup_steps)))
super(LinearDecayWithWarmup, self).__init__(learning_rate, lr_lambda,
last_epoch, verbose)
class ConstScheduleWithWarmup(LambdaDecay):
"""
Creates a learning rate scheduler that increases the learning rate linearly
from 0 to the given `learning_rate` during the warmup period and keeps the
learning rate constant after that.
Args:
learning_rate (float): The base learning rate. It is a python float
number.
warmup (int|float): If int, it means the number of steps for warmup.
If float, it means the proportion of warmup in total training steps.
total_steps (int, optional): The number of training steps. If `warmup`
is a float number, `total_steps` must be provided.
last_epoch (int, optional): The index of last epoch. It can be set to
restart training. If None, it means initial learning rate.
Default: -1.
Examples:
.. code-block:: python
from paddlenlp.transformers import ConstScheduleWithWarmup
lr, warmup_steps = 0.1, 100
lr_scheduler = ConstScheduleWithWarmup(lr, warmup_steps)
"""
def __init__(self,
learning_rate,
warmup,
total_steps=None,
last_epoch=-1,
verbose=False):
if is_integer(warmup):
warmup_steps = warmup
elif total_steps:
warmup_steps = int(math.floor(warmup * total_steps))
else:
raise ValueError(
"Please provide total steps if `warmup` is a float number , or provide integer for argument `warmup`."
)
def lr_lambda(current_step):
if current_step < warmup_steps:
return float(current_step) / float(max(1.0, warmup_steps))
return 1.0
super(ConstScheduleWithWarmup, self).__init__(learning_rate, lr_lambda,
last_epoch, verbose)
class CosineDecayWithWarmup(LambdaDecay):
"""
Creates a learning rate scheduler that increases the learning rate linearly
from 0 to the given `learning_rate`. After this warmup period, the learning
rate decreases following the values of the cosine function. If
`with_hard_restarts` is True, the cosine function can have several hard
restarts.
Args:
learning_rate (float): The base learning rate. It is a python float
number.
total_steps (int): The number of training steps.
warmup (int|float): If int, it means the number of steps for warmup.
If float, it means the proportion of warmup in total training steps.
with_hard_restarts (bool, optional): Whether the cosine function has
several hard restarts. Default: False.
num_cycles (int|float, optional): If `with_hard_restarts` is False, it
means the number of waves in the cosine scheduler and should be an
integer, defaulting to 1. If `with_hard_restarts` is True, it means
the number of hard restarts to use and should be a float, defaulting
to 0.5. Default: None.
last_epoch (int, optional): The index of last epoch. It can be set to
restart training. If None, it means initial learning rate.
Default: -1.
Examples:
.. code-block:: python
from paddlenlp.transformers import CosineDecayWithWarmup
lr, warmup_steps, max_steps = 0.1, 100, 1000
lr_scheduler = CosineDecayWithWarmup(lr, max_steps, warmup_steps)
"""
def __init__(self,
learning_rate,
total_steps,
warmup,
with_hard_restarts=False,
num_cycles=None,
last_epoch=-1,
verbose=False):
warmup_steps = warmup if is_integer(warmup) else int(
math.floor(warmup * total_steps))
# Input check
if num_cycles is not None:
assert not with_hard_restarts and isinstance(num_cycles, int) or with_hard_restarts and isinstance(num_cycles, float), \
"`num_circles` should be an integer while `with_hard_restarts` is False, an float while `with_hard_restarts` is True."
else:
num_cycles = 1 if not with_hard_restarts else 0.5
def lr_lambda(current_step):
if current_step < warmup_steps:
return float(current_step) / float(max(1, warmup_steps))
progress = float(current_step - warmup_steps) / float(
max(1, total_steps - warmup_steps))
if with_hard_restarts:
if progress >= 1.0:
return 0.0
return max(0.0, 0.5 * (1.0 + math.cos(math.pi * (
(float(num_cycles) * progress) % 1.0))))
return max(0.0, 0.5 * (
1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)))
super(CosineDecayWithWarmup, self).__init__(learning_rate, lr_lambda,
last_epoch, verbose)
class PolyDecayWithWarmup(LambdaDecay):
"""
Creates a learning rate scheduler that increases the learning rate linearly
from 0 to the given `learning_rate`. After this warmup period, the learning
rate decreases as a polynomial decay from the base learning rate to the end
learning rate `lr_end`.
Args:
learning_rate (float): The base learning rate. It is a python float
number.
total_steps (int): The number of training steps.
warmup (int|float): If int, it means the number of steps for warmup.
If float, it means the proportion of warmup in total training steps.
lr_end (float, optional): The end learning rate. Default: 1e-7.
power (float, optional): Power factor. Default: 1.0.
last_epoch (int, optional): The index of last epoch. It can be set to
restart training. If None, it means initial learning rate.
Default: -1.
Examples:
.. code-block:: python
from paddlenlp.transformers import PolyDecayWithWarmup
lr, lr_end, warmup_steps, max_steps = 0.1, 1e-6, 100, 1000
lr_scheduler = PolyDecayWithWarmup(lr, max_steps, warmup_steps, lr_end)
"""
def __init__(self,
learning_rate,
total_steps,
warmup,
lr_end=1e-7,
power=1.0,
last_epoch=-1,
verbose=False):
lr_init = learning_rate
assert lr_init > lr_end, f"`lr_end` must be smaller than `learning_rate`. But `lr_end` is {lr_end} while `learning_rate` is {lr_init}."
warmup_steps = warmup if is_integer(warmup) else int(
math.floor(warmup * total_steps))
def lr_lambda(current_step):
if current_step < warmup_steps:
return float(current_step) / float(max(1, warmup_steps))
elif current_step > total_steps:
return lr_end / lr_init  # multiplied by lr_init, this equals lr_end
else:
lr_range = lr_init - lr_end
decay_steps = total_steps - warmup_steps
pct_remaining = 1 - (current_step - warmup_steps) / decay_steps
decay = lr_range * pct_remaining**power + lr_end
return decay / lr_init  # multiplied by lr_init, this equals decay
super(PolyDecayWithWarmup, self).__init__(lr_init, lr_lambda,
last_epoch, verbose)
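For reference, a minimal dynamic-graph sketch of how these schedulers plug into an optimizer, mirroring the training scripts in this commit; the tiny model and the concrete numbers below are placeholders for illustration:

import paddle
from paddlenlp.transformers import LinearDecayWithWarmup

# Placeholder values; in the scripts above these come from command-line args.
learning_rate, num_training_steps, warmup = 5e-5, 1000, 0.1  # float warmup = proportion of total steps

model = paddle.nn.Linear(10, 2)  # stand-in for a real transformer model
lr_scheduler = LinearDecayWithWarmup(learning_rate, num_training_steps, warmup)
optimizer = paddle.optimizer.AdamW(
    learning_rate=lr_scheduler, parameters=model.parameters())

x = paddle.randn([4, 10])
for step in range(num_training_steps):
    loss = model(x).mean()
    loss.backward()
    optimizer.step()
    lr_scheduler.step()  # advance the schedule once per optimization step
    optimizer.clear_grad()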