From fee616e868262a7b791f3801bd0d1380f56d0e36 Mon Sep 17 00:00:00 2001 From: Jiaqi Liu Date: Fri, 5 Feb 2021 17:32:32 +0800 Subject: [PATCH] Add some lr schedulers with warmup (#5256) * add -linear-schedule-with-warmup * update lr scheduler usage in run_glue * add import info * add other 5 scheduler with warmup * add copyright, update usage in run_glue * simplify argument, make warmup arg support float and int * Add some LambdaDecay scheduler with warmup, update usage in run_glue * update classname and unify two cosine decays into one * update usage in run_glue * fix typo * update WarmUp to Warmup, and update class name about Const, and update doc, and usage * update usage of decay class * update usage of decay class --- PaddleNLP/benchmark/bert/run_glue.py | 15 +- PaddleNLP/benchmark/bert/run_pretrain.py | 23 +- .../benchmark/bert/run_pretrain_single.py | 20 +- PaddleNLP/examples/dialogue/dgu/main.py | 22 +- PaddleNLP/examples/glue/run_glue.py | 16 +- .../examples/language_model/bert/run_glue.py | 34 +-- .../language_model/bert/run_pretrain.py | 39 ++- .../language_model/electra/run_glue.py | 14 +- .../language_model/electra/run_pretrain.py | 13 +- .../DuReader-robust/run_du.py | 16 +- .../DuReader-yesno/run_du.py | 16 +- .../DuReader/run_du.py | 16 +- .../SQuAD/run_squad.py | 16 +- .../model_compression/run_glue_ofa.py | 16 +- .../msra_ner/run_msra_ner.py | 16 +- .../pretrained_models/train.py | 20 +- .../text_generation/ernie-gen/train.py | 15 +- .../sentence_transformers/train.py | 20 +- PaddleNLP/legacy/benchmark/bert/run_glue.py | 17 +- .../legacy/benchmark/bert/run_pretrain.py | 15 +- .../benchmark/bert/run_pretrain_single.py | 15 +- PaddleNLP/paddlenlp/transformers/__init__.py | 1 + .../transformers/converter/run_glue_pp.py | 16 +- .../paddlenlp/transformers/optimization.py | 260 ++++++++++++++++++ 24 files changed, 412 insertions(+), 259 deletions(-) create mode 100644 PaddleNLP/paddlenlp/transformers/optimization.py diff --git a/PaddleNLP/benchmark/bert/run_glue.py b/PaddleNLP/benchmark/bert/run_glue.py index 89211b20..fdf7f877 100644 --- a/PaddleNLP/benchmark/bert/run_glue.py +++ b/PaddleNLP/benchmark/bert/run_glue.py @@ -29,6 +29,7 @@ from paddlenlp.data import Stack, Tuple, Pad from paddlenlp.data.sampler import SamplerHelper from paddlenlp.transformers import BertForSequenceClassification, BertTokenizer from paddlenlp.transformers import ErnieForSequenceClassification, ErnieTokenizer +from paddlenlp.transformers import LinearDecayWithWarmup from paddlenlp.metrics import Mcc, PearsonAndSpearman from paddlenlp.utils.log import logger @@ -381,17 +382,11 @@ def do_train(args): # Create the training-backward program, this pass will not be # executed in the validation + num_training_steps = args.max_steps if args.max_steps > 0 else len( + train_data_loader) * args.num_train_epochs with paddle.static.program_guard(main_program, startup_program): - lr_scheduler = paddle.optimizer.lr.LambdaDecay( - args.learning_rate, - lambda current_step, num_warmup_steps=args.warmup_steps, - num_training_steps=args.max_steps if args.max_steps > 0 else - (len(train_data_loader) * args.num_train_epochs): float( - current_step) / float(max(1, num_warmup_steps)) - if current_step < num_warmup_steps else max( - 0.0, - float(num_training_steps - current_step) / float( - max(1, num_training_steps - num_warmup_steps)))) + lr_scheduler = LinearDecayWithWarmup( + args.learning_rate, num_training_steps, args.warmup_steps) optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, 
epsilon=args.adam_epsilon, diff --git a/PaddleNLP/benchmark/bert/run_pretrain.py b/PaddleNLP/benchmark/bert/run_pretrain.py index b5adea2a..2c7ebf09 100644 --- a/PaddleNLP/benchmark/bert/run_pretrain.py +++ b/PaddleNLP/benchmark/bert/run_pretrain.py @@ -31,6 +31,7 @@ from paddle.io import DataLoader, Dataset from paddlenlp.transformers import BertForPretraining, BertModel, BertPretrainingCriterion from paddlenlp.transformers import BertTokenizer +from paddlenlp.transformers import LinearDecayWithWarmup from data import create_data_holder, create_pretraining_dataset MODEL_CLASSES = {"bert": (BertForPretraining, BertTokenizer)} @@ -232,7 +233,8 @@ def dist_optimizer(args, optimizer): if args.use_amp: dist_strategy.amp = True - custom_black_list = ['lookup_table', 'lookup_table_v2'] if args.use_pure_fp16 else None + custom_black_list = ['lookup_table', + 'lookup_table_v2'] if args.use_pure_fp16 else None dist_strategy.amp_configs = { 'custom_white_list': ['softmax', 'layer_norm', 'gelu'], 'init_loss_scaling': args.scale_loss, @@ -305,16 +307,11 @@ def do_train(args): masked_lm_labels, next_sentence_labels, masked_lm_scale) # Define the dynamic learing_reate scheduler and optimizer - lr_scheduler = paddle.optimizer.lr.LambdaDecay( - args.learning_rate, - lambda current_step, num_warmup_steps=args.warmup_steps, - num_training_steps=args.max_steps if args.max_steps > 0 else - (len(train_data_loader) * args.num_train_epochs): float( - current_step) / float(max(1, num_warmup_steps)) - if current_step < num_warmup_steps else max( - 0.0, - float(num_training_steps - current_step) / float( - max(1, num_training_steps - num_warmup_steps)))) + num_training_steps = args.max_steps if args.max_steps > 0 else len( + train_data_loader) * args.num_train_epochs + + lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, + args.warmup_steps) optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, @@ -327,8 +324,8 @@ def do_train(args): ], multi_precision=args.use_pure_fp16) if worker_num == 1 and args.use_amp: - custom_black_list=(['lookup_table', 'lookup_table_v2'] - if args.use_pure_fp16 else None) + custom_black_list = (['lookup_table', 'lookup_table_v2'] + if args.use_pure_fp16 else None) amp_list = paddle.static.amp.AutoMixedPrecisionLists( custom_white_list=['softmax', 'layer_norm', 'gelu'], custom_black_list=custom_black_list) diff --git a/PaddleNLP/benchmark/bert/run_pretrain_single.py b/PaddleNLP/benchmark/bert/run_pretrain_single.py index dd16c00d..03430af4 100644 --- a/PaddleNLP/benchmark/bert/run_pretrain_single.py +++ b/PaddleNLP/benchmark/bert/run_pretrain_single.py @@ -28,6 +28,7 @@ from paddle.io import DataLoader, Dataset from paddlenlp.transformers import BertForPretraining, BertModel, BertPretrainingCriterion from paddlenlp.transformers import ErnieForPretraining, ErnieModel, ErniePretrainingCriterion from paddlenlp.transformers import BertTokenizer, ErnieTokenizer +from paddlenlp.transformers import LinearDecayWithWarmup from data import create_data_holder, create_pretraining_dataset MODEL_CLASSES = { @@ -222,16 +223,11 @@ def do_train(args): masked_lm_labels, next_sentence_labels, masked_lm_scale) # Define the dynamic learing_reate scheduler and optimizer - lr_scheduler = paddle.optimizer.lr.LambdaDecay( - args.learning_rate, - lambda current_step, num_warmup_steps=args.warmup_steps, - num_training_steps=args.max_steps if args.max_steps > 0 else - (len(train_data_loader) * args.num_train_epochs): float( - current_step) / float(max(1, num_warmup_steps)) - if 
current_step < num_warmup_steps else max( - 0.0, - float(num_training_steps - current_step) / float( - max(1, num_training_steps - num_warmup_steps)))) + num_training_steps = args.max_steps if args.max_steps > 0 else len( + train_data_loader) * args.num_train_epochs + + lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, + args.warmup_steps) optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, @@ -244,8 +240,8 @@ def do_train(args): ], multi_precision=args.use_pure_fp16) if args.use_amp: - custom_black_list=(['lookup_table', 'lookup_table_v2'] - if args.use_pure_fp16 else None) + custom_black_list = (['lookup_table', 'lookup_table_v2'] + if args.use_pure_fp16 else None) amp_list = paddle.static.amp.AutoMixedPrecisionLists( custom_white_list=['layer_norm', 'softmax', 'gelu'], custom_black_list=custom_black_list) diff --git a/PaddleNLP/examples/dialogue/dgu/main.py b/PaddleNLP/examples/dialogue/dgu/main.py index 352f0ef0..bfcde0b2 100644 --- a/PaddleNLP/examples/dialogue/dgu/main.py +++ b/PaddleNLP/examples/dialogue/dgu/main.py @@ -9,13 +9,13 @@ import paddle.nn as nn import paddle.nn.functional as F import paddle.distributed as dist from paddle.io import DataLoader, DistributedBatchSampler, BatchSampler -from paddle.optimizer.lr import LambdaDecay from paddle.optimizer import AdamW from paddle.metric import Accuracy from paddlenlp.datasets import MapDatasetWrapper from paddlenlp.data import Stack, Tuple, Pad from paddlenlp.transformers import BertTokenizer, BertForSequenceClassification, BertForTokenClassification +from paddlenlp.transformers import LinearDecayWithWarmup from args import parse_args, set_default_args import data @@ -54,14 +54,6 @@ def save_ckpt(model, optimizer, output_dir, name): paddle.save(optimizer.state_dict(), opt_path) -def compute_lr_factor(current_step, warmup_steps, max_train_steps): - if current_step < warmup_steps: - factor = float(current_step) / warmup_steps - else: - factor = 1 - float(current_step) / max_train_steps - return factor - - class DGULossFunction(nn.Layer): def __init__(self, task_name): super(DGULossFunction, self).__init__() @@ -117,16 +109,14 @@ def print_logs(args, step, logits, labels, loss, total_time, metric): def train(args, model, train_data_loader, dev_data_loader, metric, rank): num_examples = len(train_data_loader) * args.batch_size * args.n_gpu max_train_steps = args.epochs * len(train_data_loader) - warmup_steps = int(max_train_steps * args.warmup_proportion) if rank == 0: print("Num train examples: %d" % num_examples) print("Max train steps: %d" % max_train_steps) - print("Num warmup steps: %d" % warmup_steps) - factor_fn = partial( - compute_lr_factor, - warmup_steps=warmup_steps, - max_train_steps=max_train_steps) - lr_scheduler = LambdaDecay(args.learning_rate, factor_fn) + print("Warmup proportion: %d" % args.warmup_proportion) + + lr_scheduler = LinearDecayWithWarmup(args.learning_rate, max_train_steps, + args.warmup_proportion) + optimizer = AdamW( learning_rate=lr_scheduler, parameters=model.parameters(), diff --git a/PaddleNLP/examples/glue/run_glue.py b/PaddleNLP/examples/glue/run_glue.py index 93de366f..bac9e0a0 100644 --- a/PaddleNLP/examples/glue/run_glue.py +++ b/PaddleNLP/examples/glue/run_glue.py @@ -32,6 +32,7 @@ from paddlenlp.data.sampler import SamplerHelper from paddlenlp.transformers import BertForSequenceClassification, BertTokenizer from paddlenlp.transformers import ElectraForSequenceClassification, ElectraTokenizer from paddlenlp.transformers import 
ErnieForSequenceClassification, ErnieTokenizer +from paddlenlp.transformers import LinearDecayWithWarmup from paddlenlp.metrics import AccuracyAndF1, Mcc, PearsonAndSpearman FORMAT = '%(asctime)s-%(levelname)s: %(message)s' @@ -355,17 +356,10 @@ def do_train(args): num_training_steps = args.max_steps if args.max_steps > 0 else ( len(train_data_loader) * args.num_train_epochs) - warmup_steps = args.warmup_steps if args.warmup_steps > 0 else ( - int(math.floor(num_training_steps * args.warmup_proportion))) - lr_scheduler = paddle.optimizer.lr.LambdaDecay( - args.learning_rate, - lambda current_step, num_warmup_steps=warmup_steps, - num_training_steps=num_training_steps : float( - current_step) / float(max(1, num_warmup_steps)) - if current_step < num_warmup_steps else max( - 0.0, - float(num_training_steps - current_step) / float( - max(1, num_training_steps - num_warmup_steps)))) + warmup = args.warmup_steps if args.warmup_steps > 0 else args.warmup_proportion + + lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, + warmup) optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, diff --git a/PaddleNLP/examples/language_model/bert/run_glue.py b/PaddleNLP/examples/language_model/bert/run_glue.py index f0d36bc5..f71b701c 100644 --- a/PaddleNLP/examples/language_model/bert/run_glue.py +++ b/PaddleNLP/examples/language_model/bert/run_glue.py @@ -32,6 +32,7 @@ from paddlenlp.data import Stack, Tuple, Pad from paddlenlp.transformers import BertForSequenceClassification, BertTokenizer from paddlenlp.transformers import ElectraForSequenceClassification, ElectraTokenizer from paddlenlp.transformers import ErnieForSequenceClassification, ErnieTokenizer +from paddlenlp.transformers import LinearDecayWithWarmup from paddlenlp.metrics import AccuracyAndF1, Mcc, PearsonAndSpearman FORMAT = '%(asctime)s-%(levelname)s: %(message)s' @@ -162,14 +163,16 @@ def parse_args(): type=str, default="gpu", help="Device for selecting for the training.") - parser.add_argument("--use_amp", - type=distutils.util.strtobool, - default=False, - help="Enable mixed precision training.") - parser.add_argument("--scale_loss", - type=float, - default=2**15, - help="The value of scale_loss for fp16.") + parser.add_argument( + "--use_amp", + type=distutils.util.strtobool, + default=False, + help="Enable mixed precision training.") + parser.add_argument( + "--scale_loss", + type=float, + default=2**15, + help="The value of scale_loss for fp16.") args = parser.parse_args() return args @@ -360,17 +363,10 @@ def do_train(args): num_training_steps = args.max_steps if args.max_steps > 0 else ( len(train_data_loader) * args.num_train_epochs) - warmup_steps = args.warmup_steps if args.warmup_steps > 0 else ( - int(math.floor(num_training_steps * args.warmup_proportion))) - lr_scheduler = paddle.optimizer.lr.LambdaDecay( - args.learning_rate, - lambda current_step, num_warmup_steps=warmup_steps, - num_training_steps=num_training_steps : float( - current_step) / float(max(1, num_warmup_steps)) - if current_step < num_warmup_steps else max( - 0.0, - float(num_training_steps - current_step) / float( - max(1, num_training_steps - num_warmup_steps)))) + warmup = args.warmup_steps if args.warmup_steps > 0 else args.warmup_proportion + + lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, + warmup) optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, diff --git a/PaddleNLP/examples/language_model/bert/run_pretrain.py b/PaddleNLP/examples/language_model/bert/run_pretrain.py index 
57978bff..551e77b8 100644 --- a/PaddleNLP/examples/language_model/bert/run_pretrain.py +++ b/PaddleNLP/examples/language_model/bert/run_pretrain.py @@ -34,6 +34,7 @@ from paddlenlp.data import Stack, Tuple, Pad from paddlenlp.transformers import BertForPretraining, BertModel, BertPretrainingCriterion from paddlenlp.transformers import ErnieForPretraining, ErnieModel, ErniePretrainingCriterion from paddlenlp.transformers import BertTokenizer, ErnieTokenizer +from paddlenlp.transformers import LinearDecayWithWarmup FORMAT = '%(asctime)s-%(levelname)s: %(message)s' logging.basicConfig(level=logging.INFO, format=FORMAT) @@ -147,14 +148,16 @@ def parse_args(): type=str, default="gpu", help="Device for selecting for the training.") - parser.add_argument("--use_amp", - type=distutils.util.strtobool, - default=False, - help="Enable mixed precision training.") - parser.add_argument("--scale_loss", - type=float, - default=2**15, - help="The value of scale_loss for fp16.") + parser.add_argument( + "--use_amp", + type=distutils.util.strtobool, + default=False, + help="Enable mixed precision training.") + parser.add_argument( + "--scale_loss", + type=float, + default=2**15, + help="The value of scale_loss for fp16.") args = parser.parse_args() return args @@ -301,17 +304,11 @@ def do_train(args): # If use defalut last_epoch, lr of the first iteration is 0. # Use `last_epoch = 0` to be consistent with nv bert. - lr_scheduler = paddle.optimizer.lr.LambdaDecay( - args.learning_rate, - lambda current_step, num_warmup_steps=args.warmup_steps, - num_training_steps=args.max_steps if args.max_steps > 0 else - (len(train_data_loader) * args.num_train_epochs): float( - current_step) / float(max(1, num_warmup_steps)) - if current_step < num_warmup_steps else max( - 0.0, - float(num_training_steps - current_step) / float( - max(1, num_training_steps - num_warmup_steps))), - last_epoch=0) + num_training_steps = args.max_steps if args.max_steps > 0 else len( + train_data_loader) * args.num_train_epochs + + lr_scheduler = LinearDecayWithWarmup( + args.learning_rate, num_training_steps, args.warmup_steps, last_epoch=0) optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, @@ -390,8 +387,8 @@ def do_train(args): attention_mask=input_mask, masked_positions=masked_lm_positions) loss = criterion(prediction_scores, seq_relationship_score, - masked_lm_labels, next_sentence_labels, - masked_lm_scale) + masked_lm_labels, next_sentence_labels, + masked_lm_scale) if args.use_amp: scaler.scale(loss).backward() scaler.minimize(optimizer, loss) diff --git a/PaddleNLP/examples/language_model/electra/run_glue.py b/PaddleNLP/examples/language_model/electra/run_glue.py index 96d4deda..91603a9c 100644 --- a/PaddleNLP/examples/language_model/electra/run_glue.py +++ b/PaddleNLP/examples/language_model/electra/run_glue.py @@ -31,6 +31,7 @@ from paddlenlp.datasets import GlueCoLA, GlueSST2, GlueMRPC, GlueSTSB, GlueQQP, from paddlenlp.data import Stack, Tuple, Pad from paddlenlp.data.sampler import SamplerHelper from paddlenlp.transformers import ElectraForSequenceClassification, ElectraTokenizer +from paddlenlp.transformers import LinearDecayWithWarmup from paddlenlp.utils.log import logger from paddlenlp.metrics import AccuracyAndF1, Mcc, PearsonAndSpearman @@ -226,16 +227,9 @@ def do_train(args): num_training_steps = args.max_steps if args.max_steps > 0 else ( len(train_data_loader) * args.num_train_epochs) - warmup_steps = int(math.floor(num_training_steps * args.warmup_proportion)) - lr_scheduler = 
paddle.optimizer.lr.LambdaDecay( - args.learning_rate, - lambda current_step, num_warmup_steps=warmup_steps, - num_training_steps=num_training_steps : float( - current_step) / float(max(1, num_warmup_steps)) - if current_step < num_warmup_steps else max( - 0.0, - float(num_training_steps - current_step) / float( - max(1, num_training_steps - num_warmup_steps)))) + + lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, + args.warmup_proportion) optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, diff --git a/PaddleNLP/examples/language_model/electra/run_pretrain.py b/PaddleNLP/examples/language_model/electra/run_pretrain.py index aee9704a..3a117ede 100644 --- a/PaddleNLP/examples/language_model/electra/run_pretrain.py +++ b/PaddleNLP/examples/language_model/electra/run_pretrain.py @@ -32,6 +32,7 @@ from paddle.io import DataLoader, Dataset from paddlenlp.transformers import ElectraForTotalPretraining, ElectraModel, ElectraPretrainingCriterion from paddlenlp.transformers import ElectraDiscriminator, ElectraGenerator from paddlenlp.transformers import ElectraTokenizer +from paddlenlp.transformers import LinearDecayWithWarmup FORMAT = '%(asctime)s-%(levelname)s: %(message)s' logging.basicConfig(level=logging.INFO, format=FORMAT) @@ -470,15 +471,9 @@ def do_train(args): num_training_steps = args.max_steps if args.max_steps > 0 else ( len(train_data_loader) * args.num_train_epochs) - lr_scheduler = paddle.optimizer.lr.LambdaDecay( - args.learning_rate, - lambda current_step, num_warmup_steps=args.warmup_steps, - num_training_steps=num_training_steps: float( - current_step) / float(max(1, num_warmup_steps)) - if current_step < num_warmup_steps else max( - 0.0, - float(num_training_steps - current_step) / float( - max(1, num_training_steps - num_warmup_steps)))) + + lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, + args.warmup_steps) clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) optimizer = paddle.optimizer.AdamW( diff --git a/PaddleNLP/examples/machine_reading_comprehension/DuReader-robust/run_du.py b/PaddleNLP/examples/machine_reading_comprehension/DuReader-robust/run_du.py index 6b2614db..65a96242 100644 --- a/PaddleNLP/examples/machine_reading_comprehension/DuReader-robust/run_du.py +++ b/PaddleNLP/examples/machine_reading_comprehension/DuReader-robust/run_du.py @@ -29,6 +29,7 @@ import paddlenlp as ppnlp from paddlenlp.datasets import SQuAD, DuReaderRobust, CMRC, DRCD from paddlenlp.data import Pad, Stack, Tuple from paddlenlp.transformers import BertForQuestionAnswering, BertTokenizer, ErnieForQuestionAnswering, ErnieTokenizer +from paddlenlp.transformers import LinearDecayWithWarmup from paddlenlp.metrics.squad import squad_evaluate, compute_predictions TASK_CLASSES = {"dureader-robust": DuReaderRobust, "cmrc": CMRC, "drcd": DRCD} @@ -177,16 +178,11 @@ def do_train(args): if paddle.distributed.get_world_size() > 1: model = paddle.DataParallel(model) - lr_scheduler = paddle.optimizer.lr.LambdaDecay( - args.learning_rate, - lambda current_step, warmup_proportion=args.warmup_proportion, - num_training_steps=args.max_steps if args.max_steps > 0 else - (len(train_data_loader)*args.num_train_epochs): float( - current_step) / float(max(1, warmup_proportion*num_training_steps)) - if current_step < warmup_proportion*num_training_steps else max( - 0.0, - float(num_training_steps - current_step) / float( - max(1, num_training_steps - warmup_proportion*num_training_steps)))) + num_training_steps = args.max_steps if args.max_steps > 
0 else len( + train_data_loader) * args.num_train_epochs + + lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, + args.warmup_proportion) optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, diff --git a/PaddleNLP/examples/machine_reading_comprehension/DuReader-yesno/run_du.py b/PaddleNLP/examples/machine_reading_comprehension/DuReader-yesno/run_du.py index 6a4ed4ac..e754fc1f 100644 --- a/PaddleNLP/examples/machine_reading_comprehension/DuReader-yesno/run_du.py +++ b/PaddleNLP/examples/machine_reading_comprehension/DuReader-yesno/run_du.py @@ -28,6 +28,7 @@ import paddlenlp as ppnlp from paddlenlp.data import Pad, Stack, Tuple from paddlenlp.transformers import BertForSequenceClassification, BertTokenizer +from paddlenlp.transformers import LinearDecayWithWarmup MODEL_CLASSES = {"bert": (BertForSequenceClassification, BertTokenizer)} @@ -210,16 +211,11 @@ def do_train(args): if paddle.distributed.get_world_size() > 1: model = paddle.DataParallel(model) - lr_scheduler = paddle.optimizer.lr.LambdaDecay( - args.learning_rate, - lambda current_step, warmup_proportion=args.warmup_proportion, - num_training_steps=args.max_steps if args.max_steps > 0 else - (len(train_ds.examples)//args.batch_size*args.num_train_epochs): float( - current_step) / float(max(1, warmup_proportion*num_training_steps)) - if current_step < warmup_proportion*num_training_steps else max( - 0.0, - float(num_training_steps - current_step) / float( - max(1, num_training_steps - warmup_proportion*num_training_steps)))) + num_training_steps = args.max_steps if args.max_steps > 0 else len( + train_ds.examples) // args.batch_size * args.num_train_epochs + + lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, + args.warmup_proportion) optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, diff --git a/PaddleNLP/examples/machine_reading_comprehension/DuReader/run_du.py b/PaddleNLP/examples/machine_reading_comprehension/DuReader/run_du.py index 3b5e446e..44d9a0ec 100644 --- a/PaddleNLP/examples/machine_reading_comprehension/DuReader/run_du.py +++ b/PaddleNLP/examples/machine_reading_comprehension/DuReader/run_du.py @@ -28,6 +28,7 @@ import paddlenlp as ppnlp from paddlenlp.data import Pad, Stack, Tuple from paddlenlp.transformers import BertForQuestionAnswering, BertTokenizer, ErnieForQuestionAnswering, ErnieTokenizer +from paddlenlp.transformers import LinearDecayWithWarmup from paddlenlp.metrics.dureader import dureader_evaluate, compute_predictions MODEL_CLASSES = { @@ -168,16 +169,11 @@ def do_train(args): if paddle.distributed.get_world_size() > 1: model = paddle.DataParallel(model) - lr_scheduler = paddle.optimizer.lr.LambdaDecay( - args.learning_rate, - lambda current_step, warmup_proportion=args.warmup_proportion, - num_training_steps=args.max_steps if args.max_steps > 0 else - (len(train_dataset.examples)//args.batch_size*args.num_train_epochs): float( - current_step) / float(max(1, warmup_proportion*num_training_steps)) - if current_step < warmup_proportion*num_training_steps else max( - 0.0, - float(num_training_steps - current_step) / float( - max(1, num_training_steps - warmup_proportion*num_training_steps)))) + num_training_steps = args.max_steps if args.max_steps > 0 else len( + train_dataset.examples) // args.batch_size * args.num_train_epochs + + lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, + args.warmup_proportion) optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, diff --git 
a/PaddleNLP/examples/machine_reading_comprehension/SQuAD/run_squad.py b/PaddleNLP/examples/machine_reading_comprehension/SQuAD/run_squad.py index 04a11573..2e02b2e0 100644 --- a/PaddleNLP/examples/machine_reading_comprehension/SQuAD/run_squad.py +++ b/PaddleNLP/examples/machine_reading_comprehension/SQuAD/run_squad.py @@ -28,6 +28,7 @@ import paddlenlp as ppnlp from paddlenlp.data import Pad, Stack, Tuple from paddlenlp.transformers import BertForQuestionAnswering, BertTokenizer, ErnieForQuestionAnswering, ErnieTokenizer +from paddlenlp.transformers import LinearDecayWithWarmup from paddlenlp.metrics.squad import squad_evaluate, compute_predictions MODEL_CLASSES = { @@ -168,16 +169,11 @@ def do_train(args): if paddle.distributed.get_world_size() > 1: model = paddle.DataParallel(model) - lr_scheduler = paddle.optimizer.lr.LambdaDecay( - args.learning_rate, - lambda current_step, warmup_proportion=args.warmup_proportion, - num_training_steps=args.max_steps if args.max_steps > 0 else - (len(train_dataset.examples)//args.batch_size*args.num_train_epochs): float( - current_step) / float(max(1, warmup_proportion*num_training_steps)) - if current_step < warmup_proportion*num_training_steps else max( - 0.0, - float(num_training_steps - current_step) / float( - max(1, num_training_steps - warmup_proportion*num_training_steps)))) + num_training_steps = args.max_steps if args.max_steps > 0 else len( + train_dataset.examples) // args.batch_size * args.num_train_epochs + + lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, + args.warmup_proportion) optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, diff --git a/PaddleNLP/examples/model_compression/run_glue_ofa.py b/PaddleNLP/examples/model_compression/run_glue_ofa.py index ac51dff7..267ca344 100644 --- a/PaddleNLP/examples/model_compression/run_glue_ofa.py +++ b/PaddleNLP/examples/model_compression/run_glue_ofa.py @@ -28,6 +28,7 @@ from paddle.metric import Metric, Accuracy, Precision, Recall from paddlenlp.data import Stack, Tuple, Pad from paddlenlp.data.sampler import SamplerHelper from paddlenlp.transformers import BertModel, BertForSequenceClassification, BertTokenizer +from paddlenlp.transformers import LinearDecayWithWarmup from paddlenlp.utils.log import logger from paddlenlp.metrics import AccuracyAndF1, Mcc, PearsonAndSpearman import paddlenlp.datasets as datasets @@ -459,16 +460,11 @@ def do_train(args): num_heads=model.bert.config['num_attention_heads']) reorder_neuron_head(ofa_model.model, head_importance, neuron_importance) - lr_scheduler = paddle.optimizer.lr.LambdaDecay( - args.learning_rate, - lambda current_step, num_warmup_steps=args.warmup_steps, - num_training_steps=args.max_steps if args.max_steps > 0 else - (len(train_data_loader) * args.num_train_epochs): float( - current_step) / float(max(1, num_warmup_steps)) - if current_step < num_warmup_steps else max( - 0.0, - float(num_training_steps - current_step) / float( - max(1, num_training_steps - num_warmup_steps)))) + num_training_steps = args.max_steps if args.max_steps > 0 else len( + train_data_loader) * args.num_train_epochs + + lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, + args.warmup_steps) optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, diff --git a/PaddleNLP/examples/named_entity_recognition/msra_ner/run_msra_ner.py b/PaddleNLP/examples/named_entity_recognition/msra_ner/run_msra_ner.py index a3bf4bf7..99339e67 100644 --- 
a/PaddleNLP/examples/named_entity_recognition/msra_ner/run_msra_ner.py +++ b/PaddleNLP/examples/named_entity_recognition/msra_ner/run_msra_ner.py @@ -27,6 +27,7 @@ import paddlenlp as ppnlp from paddlenlp.datasets import MSRA_NER from paddlenlp.data import Stack, Tuple, Pad from paddlenlp.transformers import BertForTokenClassification, BertTokenizer +from paddlenlp.transformers import LinearDecayWithWarmup from paddlenlp.metrics import ChunkEvaluator @@ -291,16 +292,11 @@ def do_train(args): if paddle.distributed.get_world_size() > 1: model = paddle.DataParallel(model) - lr_scheduler = paddle.optimizer.lr.LambdaDecay( - args.learning_rate, - lambda current_step, num_warmup_steps=args.warmup_steps, - num_training_steps=args.max_steps if args.max_steps > 0 else - (len(train_data_loader) * args.num_train_epochs): float( - current_step) / float(max(1, num_warmup_steps)) - if current_step < num_warmup_steps else max( - 0.0, - float(num_training_steps - current_step) / float( - max(1, num_training_steps - num_warmup_steps)))) + num_training_steps = args.max_steps if args.max_steps > 0 else len( + train_data_loader) * args.num_train_epochs + + lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, + args.warmup_steps) optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, diff --git a/PaddleNLP/examples/text_classification/pretrained_models/train.py b/PaddleNLP/examples/text_classification/pretrained_models/train.py index 73bec44e..395d167c 100644 --- a/PaddleNLP/examples/text_classification/pretrained_models/train.py +++ b/PaddleNLP/examples/text_classification/pretrained_models/train.py @@ -22,8 +22,9 @@ import numpy as np import paddle import paddle.nn.functional as F -from paddlenlp.data import Stack, Tuple, Pad import paddlenlp as ppnlp +from paddlenlp.data import Stack, Tuple, Pad +from paddlenlp.transformers import LinearDecayWithWarmup # yapf: disable parser = argparse.ArgumentParser() @@ -214,19 +215,10 @@ def do_train(): model = paddle.DataParallel(model) num_training_steps = len(train_data_loader) * args.epochs - num_warmup_steps = int(args.warmup_proportion * num_training_steps) - - def get_lr_factor(current_step): - if current_step < num_warmup_steps: - return float(current_step) / float(max(1, num_warmup_steps)) - else: - return max(0.0, - float(num_training_steps - current_step) / - float(max(1, num_training_steps - num_warmup_steps))) - - lr_scheduler = paddle.optimizer.lr.LambdaDecay( - args.learning_rate, - lr_lambda=lambda current_step: get_lr_factor(current_step)) + + lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, + args.warmup_proportion) + optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, parameters=model.parameters(), diff --git a/PaddleNLP/examples/text_generation/ernie-gen/train.py b/PaddleNLP/examples/text_generation/ernie-gen/train.py index 507ac860..2505f91f 100644 --- a/PaddleNLP/examples/text_generation/ernie-gen/train.py +++ b/PaddleNLP/examples/text_generation/ernie-gen/train.py @@ -10,6 +10,7 @@ import paddle.nn as nn from paddle.io import DataLoader from paddlenlp.transformers import ErnieForGeneration from paddlenlp.transformers import ErnieTokenizer, ErnieTinyTokenizer, BertTokenizer, ElectraTokenizer, RobertaTokenizer +from paddlenlp.transformers import LinearDecayWithWarmup from paddlenlp.datasets import Poetry from paddlenlp.data import Stack, Tuple, Pad from paddlenlp.metrics import Rouge1, Rouge2 @@ -175,16 +176,10 @@ def train(): if paddle.distributed.get_world_size() > 1: model = 
paddle.DataParallel(model) - max_steps = (len(train_data_loader) * args.num_epochs) - lr_scheduler = paddle.optimizer.lr.LambdaDecay( - args.learning_rate, - lambda current_step, num_warmup_steps=max_steps*args.warmup_proportion, - num_training_steps=max_steps: float( - current_step) / float(max(1, num_warmup_steps)) - if current_step < num_warmup_steps else max( - 0.0, - float(num_training_steps - current_step) / float( - max(1, num_training_steps - num_warmup_steps)))) + max_steps = len(train_data_loader) * args.num_epochs + + lr_scheduler = LinearDecayWithWarmup(args.learning_rate, max_steps, + args.warmup_proportion) optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, diff --git a/PaddleNLP/examples/text_matching/sentence_transformers/train.py b/PaddleNLP/examples/text_matching/sentence_transformers/train.py index f37f86de..bf85f170 100644 --- a/PaddleNLP/examples/text_matching/sentence_transformers/train.py +++ b/PaddleNLP/examples/text_matching/sentence_transformers/train.py @@ -22,8 +22,9 @@ import numpy as np import paddle import paddle.nn.functional as F -from paddlenlp.data import Stack, Tuple, Pad import paddlenlp as ppnlp +from paddlenlp.data import Stack, Tuple, Pad +from paddlenlp.transformers import LinearDecayWithWarmup from model import SentenceTransformer @@ -231,19 +232,10 @@ def do_train(): model = paddle.DataParallel(model) num_training_steps = len(train_data_loader) * args.epochs - num_warmup_steps = int(args.warmup_proption * num_training_steps) - - def get_lr_factor(current_step): - if current_step < num_warmup_steps: - return float(current_step) / float(max(1, num_warmup_steps)) - else: - return max(0.0, - float(num_training_steps - current_step) / - float(max(1, num_training_steps - num_warmup_steps))) - - lr_scheduler = paddle.optimizer.lr.LambdaDecay( - args.learning_rate, - lr_lambda=lambda current_step: get_lr_factor(current_step)) + + lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, + args.warmup_proportion) + optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, parameters=model.parameters(), diff --git a/PaddleNLP/legacy/benchmark/bert/run_glue.py b/PaddleNLP/legacy/benchmark/bert/run_glue.py index 49e87f6a..4cfc7cac 100644 --- a/PaddleNLP/legacy/benchmark/bert/run_glue.py +++ b/PaddleNLP/legacy/benchmark/bert/run_glue.py @@ -27,6 +27,7 @@ from paddlenlp.datasets import GlueQNLI, GlueSST2 from paddlenlp.data import Stack, Tuple, Pad from paddlenlp.data.sampler import SamplerHelper from paddlenlp.transformers import BertForSequenceClassification, BertTokenizer +from paddlenlp.transformers import LinearDecayWithWarmup FORMAT = '%(asctime)s-%(levelname)s: %(message)s' logging.basicConfig(level=logging.INFO, format=FORMAT) @@ -332,16 +333,12 @@ def do_train(args): # Create the training-backward program, this pass will not be # executed in the validation with paddle.static.program_guard(main_program, startup_program): - lr_scheduler = paddle.optimizer.lr.LambdaDecay( - args.learning_rate, - lambda current_step, num_warmup_steps=args.warmup_steps, - num_training_steps=args.max_steps if args.max_steps > 0 else - (len(train_data_loader) * args.num_train_epochs): float( - current_step) / float(max(1, num_warmup_steps)) - if current_step < num_warmup_steps else max( - 0.0, - float(num_training_steps - current_step) / float( - max(1, num_training_steps - num_warmup_steps)))) + num_training_steps = args.max_steps if args.max_steps > 0 else len( + train_data_loader) * args.num_train_epochs + + lr_scheduler = 
LinearDecayWithWarmup( + args.learning_rate, num_training_steps, args.warmup_steps) + optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, epsilon=args.adam_epsilon, diff --git a/PaddleNLP/legacy/benchmark/bert/run_pretrain.py b/PaddleNLP/legacy/benchmark/bert/run_pretrain.py index 45e88381..9d412fc0 100644 --- a/PaddleNLP/legacy/benchmark/bert/run_pretrain.py +++ b/PaddleNLP/legacy/benchmark/bert/run_pretrain.py @@ -28,6 +28,7 @@ from paddle.io import DataLoader, Dataset from paddlenlp.transformers import BertForPretraining, BertModel, BertPretrainingCriterion from paddlenlp.transformers import BertTokenizer +from paddlenlp.transformers import LinearDecayWithWarmup from data import create_data_holder, create_pretraining_dataset MODEL_CLASSES = {"bert": (BertForPretraining, BertTokenizer)} @@ -198,17 +199,11 @@ def do_train(args): loss = criterion(prediction_scores, seq_relationship_score, masked_lm_labels, next_sentence_labels, masked_lm_scale) + num_training_steps = args.max_steps if args.max_steps > 0 else len( + train_data_loader) * args.num_train_epochs # Define the dynamic learing_reate scheduler and optimizer - lr_scheduler = paddle.optimizer.lr.LambdaDecay( - args.learning_rate, - lambda current_step, num_warmup_steps=args.warmup_steps, - num_training_steps=args.max_steps if args.max_steps > 0 else - (len(train_data_loader) * args.num_train_epochs): float( - current_step) / float(max(1, num_warmup_steps)) - if current_step < num_warmup_steps else max( - 0.0, - float(num_training_steps - current_step) / float( - max(1, num_training_steps - num_warmup_steps)))) + lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, + args.warmup_steps) optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, diff --git a/PaddleNLP/legacy/benchmark/bert/run_pretrain_single.py b/PaddleNLP/legacy/benchmark/bert/run_pretrain_single.py index 81a3524c..66ac012a 100644 --- a/PaddleNLP/legacy/benchmark/bert/run_pretrain_single.py +++ b/PaddleNLP/legacy/benchmark/bert/run_pretrain_single.py @@ -27,6 +27,7 @@ import paddle from paddle.io import DataLoader, Dataset from paddlenlp.transformers import BertForPretraining, BertModel, BertPretrainingCriterion from paddlenlp.transformers import BertTokenizer +from paddlenlp.transformers import LinearDecayWithWarmup from data import create_data_holder, create_pretraining_dataset MODEL_CLASSES = {"bert": (BertForPretraining, BertTokenizer)} @@ -210,17 +211,11 @@ def do_train(args): loss = criterion(prediction_scores, seq_relationship_score, masked_lm_labels, next_sentence_labels, masked_lm_scale) + num_training_steps = args.max_steps if args.max_steps > 0 else len( + train_data_loader) * args.num_train_epochs # Define the dynamic learing_reate scheduler and optimizer - lr_scheduler = paddle.optimizer.lr.LambdaDecay( - args.learning_rate, - lambda current_step, num_warmup_steps=args.warmup_steps, - num_training_steps=args.max_steps if args.max_steps > 0 else - (len(train_data_loader) * args.num_train_epochs): float( - current_step) / float(max(1, num_warmup_steps)) - if current_step < num_warmup_steps else max( - 0.0, - float(num_training_steps - current_step) / float( - max(1, num_training_steps - num_warmup_steps)))) + lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, + args.warmup_steps) optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, diff --git a/PaddleNLP/paddlenlp/transformers/__init__.py b/PaddleNLP/paddlenlp/transformers/__init__.py index 86daabd0..5c8f7466 100644 --- 
a/PaddleNLP/paddlenlp/transformers/__init__.py +++ b/PaddleNLP/paddlenlp/transformers/__init__.py @@ -27,3 +27,4 @@ from .electra.modeling import * from .electra.tokenizer import * from .transformer.modeling import * from .ernie_gen.modeling import ErnieForGeneration +from .optimization import * diff --git a/PaddleNLP/paddlenlp/transformers/converter/run_glue_pp.py b/PaddleNLP/paddlenlp/transformers/converter/run_glue_pp.py index fccbc082..c5c180e6 100644 --- a/PaddleNLP/paddlenlp/transformers/converter/run_glue_pp.py +++ b/PaddleNLP/paddlenlp/transformers/converter/run_glue_pp.py @@ -31,6 +31,7 @@ from paddlenlp.data import * from paddlenlp.data.sampler import SamplerHelper from paddlenlp.transformers.model_bert import * from paddlenlp.transformers.tokenizer_bert import BertTokenizer +from paddlenlp.transformers import LinearDecayWithWarmup TASK_CLASSES = { "qnli": (GlueQNLI, paddle.metric.Accuracy), # (dataset, metric) @@ -307,16 +308,11 @@ def do_train(args): if paddle.distributed.get_world_size() > 1: model = paddle.DataParallel(model) - lr_scheduler = paddle.optimizer.lr.LambdaDecay( - args.learning_rate, - lambda current_step, num_warmup_steps=args.warmup_steps, - num_training_steps=args.max_steps if args.max_steps > 0 else - (len(train_data_loader) * args.num_train_epochs): float( - current_step) / float(max(1, num_warmup_steps)) - if current_step < num_warmup_steps else max( - 0.0, - float(num_training_steps - current_step) / float( - max(1, num_training_steps - num_warmup_steps)))) + num_training_steps = args.max_steps if args.max_steps > 0 else len( + train_data_loader) * args.num_train_epochs + + lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, + args.warmup_steps) optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, diff --git a/PaddleNLP/paddlenlp/transformers/optimization.py b/PaddleNLP/paddlenlp/transformers/optimization.py new file mode 100644 index 00000000..925cde60 --- /dev/null +++ b/PaddleNLP/paddlenlp/transformers/optimization.py @@ -0,0 +1,260 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import math + +from paddle.optimizer.lr import LambdaDecay + +__all__ = [ + 'LinearDecayWithWarmup', 'ConstScheduleWithWarmup', 'CosineDecayWithWarmup', + 'PolyDecayWithWarmup' +] + + +def is_integer(number): + if sys.version > '3': + return isinstance(number, int) + return isinstance(number, (int, long)) + + +class LinearDecayWithWarmup(LambdaDecay): + """ + Create a learning rate scheduler, which increases learning rate linearly + from 0 to given `learning_rate`, after this warmup period learning rate + would be decreased linearly from the base learning rate to 0. + + Args: + learning_rate (float): The base learning rate. It is a python float + number. + total_steps (int): The number of training steps. + warmup (int|float): If int, it means the number of steps for warmup. + If float, it means the proportion of warmup in total training steps. 
+        last_epoch (int, optional): The index of last epoch. It can be set to
+            restart training. If None, it means initial learning rate.
+            Default: -1.
+        verbose (bool, optional): If True, prints a message to stdout for each
+            update. Default: False.
+
+    Examples:
+
+        .. code-block:: python
+
+            from paddlenlp.transformers import LinearDecayWithWarmup
+            lr, warmup_steps, max_steps = 0.1, 100, 1000
+            lr_scheduler = LinearDecayWithWarmup(lr, max_steps, warmup_steps)
+
+    """
+
+    def __init__(self,
+                 learning_rate,
+                 total_steps,
+                 warmup,
+                 last_epoch=-1,
+                 verbose=False):
+        warmup_steps = warmup if is_integer(warmup) else int(
+            math.floor(warmup * total_steps))
+
+        def lr_lambda(current_step):
+            if current_step < warmup_steps:
+                return float(current_step) / float(max(1, warmup_steps))
+            return max(0.0,
+                       float(total_steps - current_step) /
+                       float(max(1, total_steps - warmup_steps)))
+
+        super(LinearDecayWithWarmup, self).__init__(learning_rate, lr_lambda,
+                                                    last_epoch, verbose)
+
+
+class ConstScheduleWithWarmup(LambdaDecay):
+    """
+    Create a learning rate scheduler, which increases learning rate linearly
+    from 0 to given `learning_rate` during warmup periods and keeps learning
+    rate a constant after that.
+
+    Args:
+        learning_rate (float): The base learning rate. It is a python float
+            number.
+        warmup (int|float): If int, it means the number of steps for warmup.
+            If float, it means the proportion of warmup in total training steps.
+        total_steps (int, optional): The number of training steps. If `warmup`
+            is a float number, `total_steps` must be provided.
+        last_epoch (int, optional): The index of last epoch. It can be set to
+            restart training. If None, it means initial learning rate.
+            Default: -1.
+
+    Examples:
+
+        .. code-block:: python
+
+            from paddlenlp.transformers import ConstScheduleWithWarmup
+            lr, warmup_steps = 0.1, 100
+            lr_scheduler = ConstScheduleWithWarmup(lr, warmup_steps)
+
+    """
+
+    def __init__(self,
+                 learning_rate,
+                 warmup,
+                 total_steps=None,
+                 last_epoch=-1,
+                 verbose=False):
+        if is_integer(warmup):
+            warmup_steps = warmup
+        elif total_steps:
+            warmup_steps = int(math.floor(warmup * total_steps))
+        else:
+            raise ValueError(
+                "Please provide total_steps if `warmup` is a float number, or provide an integer for argument `warmup`."
+            )
+
+        def lr_lambda(current_step):
+            if current_step < warmup_steps:
+                return float(current_step) / float(max(1.0, warmup_steps))
+            return 1.0
+
+        super(ConstScheduleWithWarmup, self).__init__(learning_rate, lr_lambda,
+                                                      last_epoch, verbose)
+
+
+class CosineDecayWithWarmup(LambdaDecay):
+    """
+    Create a learning rate scheduler, which increases learning rate linearly
+    from 0 to given `learning_rate`, after this warmup period learning rate
+    would be decreased following the values of the cosine function. If
+    `with_hard_restarts` is True, the cosine function could have several hard
+    restarts.
+
+    Args:
+        learning_rate (float): The base learning rate. It is a python float
+            number.
+        total_steps (int): The number of training steps.
+        warmup (int|float): If int, it means the number of steps for warmup.
+            If float, it means the proportion of warmup in total training steps.
+        with_hard_restarts (bool): Whether the cosine function has several hard
+            restarts. Default: False.
+        num_cycles (int|float, optional): If `with_hard_restarts` is False, it
+            means the number of waves in the cosine scheduler and should be an
+            integer number and defaults to 1.
+            If `with_hard_restarts` is True, it means the number of hard
+            restarts to use and should be a float number and defaults to 0.5.
+            Default: None.
+        last_epoch (int, optional): The index of last epoch. It can be set to
+            restart training. If None, it means initial learning rate.
+            Default: -1.
+
+    Examples:
+
+        .. code-block:: python
+
+            from paddlenlp.transformers import CosineDecayWithWarmup
+            lr, warmup_steps, max_steps = 0.1, 100, 1000
+            lr_scheduler = CosineDecayWithWarmup(lr, max_steps, warmup_steps)
+
+    """
+
+    def __init__(self,
+                 learning_rate,
+                 total_steps,
+                 warmup,
+                 with_hard_restarts=False,
+                 num_cycles=None,
+                 last_epoch=-1,
+                 verbose=False):
+        warmup_steps = warmup if is_integer(warmup) else int(
+            math.floor(warmup * total_steps))
+        # Input check
+        if num_cycles is not None:
+            assert not with_hard_restarts and isinstance(num_cycles, int) or with_hard_restarts and isinstance(num_cycles, float), \
+                "`num_cycles` should be an integer when `with_hard_restarts` is False, and a float when `with_hard_restarts` is True."
+        else:
+            num_cycles = 1 if not with_hard_restarts else 0.5
+
+        def lr_lambda(current_step):
+            if current_step < warmup_steps:
+                return float(current_step) / float(max(1, warmup_steps))
+
+            progress = float(current_step - warmup_steps) / float(
+                max(1, total_steps - warmup_steps))
+
+            if with_hard_restarts:
+                if progress >= 1.0:
+                    return 0.0
+                return max(0.0, 0.5 * (1.0 + math.cos(math.pi * (
+                    (float(num_cycles) * progress) % 1.0))))
+
+            return max(0.0, 0.5 * (
+                1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)))
+
+        super(CosineDecayWithWarmup, self).__init__(learning_rate, lr_lambda,
+                                                    last_epoch, verbose)
+
+
+class PolyDecayWithWarmup(LambdaDecay):
+    """
+    Create a learning rate scheduler, which increases learning rate linearly
+    from 0 to given `lr_init`, after this warmup period learning rate would
+    be decreased as a polynomial decay from the base learning rate to the end
+    learning rate `lr_end`.
+
+    Args:
+        learning_rate (float): The base learning rate. It is a python float
+            number.
+        total_steps (int): The number of training steps.
+        warmup (int|float): If int, it means the number of steps for warmup.
+            If float, it means the proportion of warmup in total training steps.
+        lr_end (float, optional): The end learning rate. Default: 1e-7.
+        power (float, optional): Power factor. Default: 1.0.
+        last_epoch (int, optional): The index of last epoch. It can be set to
+            restart training. If None, it means initial learning rate.
+            Default: -1.
+
+    Examples:
+
+        .. code-block:: python
+
+            from paddlenlp.transformers import PolyDecayWithWarmup
+            lr, lr_end, warmup_steps, max_steps = 0.1, 1e-6, 100, 1000
+            lr_scheduler = PolyDecayWithWarmup(lr, max_steps, warmup_steps, lr_end)
+
+    """
+
+    def __init__(self,
+                 learning_rate,
+                 total_steps,
+                 warmup,
+                 lr_end=1e-7,
+                 power=1.0,
+                 last_epoch=-1,
+                 verbose=False):
+        lr_init = learning_rate
+        assert lr_init > lr_end, f"`lr_end` must be smaller than `learning_rate`. But `lr_end` is {lr_end} while `learning_rate` is {lr_init}."
+        warmup_steps = warmup if is_integer(warmup) else int(
+            math.floor(warmup * total_steps))
+
+        def lr_lambda(current_step):
+            if current_step < warmup_steps:
+                return float(current_step) / float(max(1, warmup_steps))
+            elif current_step > total_steps:
+                return lr_end / lr_init  # multiplied by lr_init, this gives lr_end
+            else:
+                lr_range = lr_init - lr_end
+                decay_steps = total_steps - warmup_steps
+                pct_remaining = 1 - (current_step - warmup_steps) / decay_steps
+                decay = lr_range * pct_remaining**power + lr_end
+                return decay / lr_init  # multiplied by lr_init, this gives `decay`
+
+        super(PolyDecayWithWarmup, self).__init__(lr_init, lr_lambda,
+                                                  last_epoch, verbose)
-- 
GitLab
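As a quick reference, a minimal dynamic-graph sketch of driving the new schedulers end to end; the toy model, batch shape, and hyperparameter values below are illustrative placeholders, while the scheduler construction and the `AdamW(learning_rate=lr_scheduler, ...)` wiring mirror the usage this patch introduces across the example scripts.

.. code-block:: python

    import paddle
    from paddlenlp.transformers import LinearDecayWithWarmup

    # Placeholder hyperparameters; the example scripts read these from argparse.
    learning_rate = 5e-5
    num_training_steps = 1000

    # `warmup` accepts an int (number of warmup steps) or a float
    # (proportion of total training steps), as implemented in optimization.py.
    lr_scheduler = LinearDecayWithWarmup(learning_rate, num_training_steps, 0.1)

    # The other schedulers added by this patch follow the same pattern:
    #   ConstScheduleWithWarmup(learning_rate, 100)
    #   CosineDecayWithWarmup(learning_rate, num_training_steps, 100)
    #   PolyDecayWithWarmup(learning_rate, num_training_steps, 100, lr_end=1e-7)

    # A toy model keeps the sketch runnable; any paddle.nn.Layer works here.
    model = paddle.nn.Linear(16, 2)
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler, parameters=model.parameters())

    for step in range(num_training_steps):
        x = paddle.randn([8, 16])
        loss = model(x).mean()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()  # advance warmup/decay once per optimizer step
        optimizer.clear_grad()

Passing a float for `warmup` is equivalent to `int(math.floor(warmup * total_steps))` warmup steps, which is why run_glue.py can forward either `--warmup_steps` or `--warmup_proportion` through the same argument.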