Unverified commit fee616e8, authored by Jiaqi Liu, committed via GitHub

Add some lr schedulers with warmup (#5256)

* add linear-schedule-with-warmup

* update lr scheduler usage in run_glue

* add import info

* add other 5 scheduler with warmup

* add copyright, update usage in run_glue

* simplify argument, make warmup arg support float and int

* Add some LambdaDecay scheduler with warmup, update usage in run_glue

* update classname and unify two cosine decays into one

* update usage in run_glue

* fix typo

* rename WarmUp to Warmup, update the Const class names, docs, and usage

* update usage of decay class

* update usage of decay class
Parent a22fa4b3
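In short, the commit moves the hand-written warmup + linear-decay LambdaDecay closures that were duplicated across the training scripts into a reusable LinearDecayWithWarmup scheduler (plus const/cosine/poly variants). A minimal sketch of the before/after pattern, with placeholder values (the real scripts take these from `args`):

    import paddle
    from paddlenlp.transformers import LinearDecayWithWarmup

    learning_rate, max_steps = 5e-5, 1000
    warmup = 100  # int -> number of warmup steps; a float would mean a proportion of max_steps

    # Old pattern: a hand-written LambdaDecay closure repeated in every script.
    lr_scheduler_old = paddle.optimizer.lr.LambdaDecay(
        learning_rate,
        lambda step: float(step) / max(1, warmup) if step < warmup else max(
            0.0, float(max_steps - step) / float(max(1, max_steps - warmup))))

    # New pattern: the warmup/decay logic lives in paddlenlp.transformers.optimization.
    lr_scheduler = LinearDecayWithWarmup(learning_rate, max_steps, warmup)

    for step in range(max_steps):
        # ... forward/backward/optimizer.step() would go here ...
        lr_scheduler.step()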
@@ -29,6 +29,7 @@ from paddlenlp.data import Stack, Tuple, Pad
 from paddlenlp.data.sampler import SamplerHelper
 from paddlenlp.transformers import BertForSequenceClassification, BertTokenizer
 from paddlenlp.transformers import ErnieForSequenceClassification, ErnieTokenizer
+from paddlenlp.transformers import LinearDecayWithWarmup
 from paddlenlp.metrics import Mcc, PearsonAndSpearman
 from paddlenlp.utils.log import logger
@@ -381,17 +382,11 @@ def do_train(args):
     # Create the training-backward program, this pass will not be
     # executed in the validation
+    num_training_steps = args.max_steps if args.max_steps > 0 else len(
+        train_data_loader) * args.num_train_epochs
     with paddle.static.program_guard(main_program, startup_program):
-        lr_scheduler = paddle.optimizer.lr.LambdaDecay(
-            args.learning_rate,
-            lambda current_step, num_warmup_steps=args.warmup_steps,
-            num_training_steps=args.max_steps if args.max_steps > 0 else
-            (len(train_data_loader) * args.num_train_epochs): float(
-                current_step) / float(max(1, num_warmup_steps))
-            if current_step < num_warmup_steps else max(
-                0.0,
-                float(num_training_steps - current_step) / float(
-                    max(1, num_training_steps - num_warmup_steps))))
+        lr_scheduler = LinearDecayWithWarmup(
+            args.learning_rate, num_training_steps, args.warmup_steps)
         optimizer = paddle.optimizer.AdamW(
             learning_rate=lr_scheduler,
             epsilon=args.adam_epsilon,
......
@@ -31,6 +31,7 @@ from paddle.io import DataLoader, Dataset
 from paddlenlp.transformers import BertForPretraining, BertModel, BertPretrainingCriterion
 from paddlenlp.transformers import BertTokenizer
+from paddlenlp.transformers import LinearDecayWithWarmup
 from data import create_data_holder, create_pretraining_dataset
 MODEL_CLASSES = {"bert": (BertForPretraining, BertTokenizer)}
@@ -232,7 +233,8 @@ def dist_optimizer(args, optimizer):
     if args.use_amp:
         dist_strategy.amp = True
-        custom_black_list = ['lookup_table', 'lookup_table_v2'] if args.use_pure_fp16 else None
+        custom_black_list = ['lookup_table',
+                             'lookup_table_v2'] if args.use_pure_fp16 else None
         dist_strategy.amp_configs = {
             'custom_white_list': ['softmax', 'layer_norm', 'gelu'],
             'init_loss_scaling': args.scale_loss,
@@ -305,16 +307,11 @@ def do_train(args):
                      masked_lm_labels, next_sentence_labels, masked_lm_scale)
     # Define the dynamic learing_reate scheduler and optimizer
-    lr_scheduler = paddle.optimizer.lr.LambdaDecay(
-        args.learning_rate,
-        lambda current_step, num_warmup_steps=args.warmup_steps,
-        num_training_steps=args.max_steps if args.max_steps > 0 else
-        (len(train_data_loader) * args.num_train_epochs): float(
-            current_step) / float(max(1, num_warmup_steps))
-        if current_step < num_warmup_steps else max(
-            0.0,
-            float(num_training_steps - current_step) / float(
-                max(1, num_training_steps - num_warmup_steps))))
+    num_training_steps = args.max_steps if args.max_steps > 0 else len(
+        train_data_loader) * args.num_train_epochs
+
+    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
+                                         args.warmup_steps)
     optimizer = paddle.optimizer.AdamW(
         learning_rate=lr_scheduler,
@@ -327,8 +324,8 @@ def do_train(args):
         ],
         multi_precision=args.use_pure_fp16)
     if worker_num == 1 and args.use_amp:
-        custom_black_list=(['lookup_table', 'lookup_table_v2']
-                           if args.use_pure_fp16 else None)
+        custom_black_list = (['lookup_table', 'lookup_table_v2']
+                             if args.use_pure_fp16 else None)
         amp_list = paddle.static.amp.AutoMixedPrecisionLists(
             custom_white_list=['softmax', 'layer_norm', 'gelu'],
             custom_black_list=custom_black_list)
......
@@ -28,6 +28,7 @@ from paddle.io import DataLoader, Dataset
 from paddlenlp.transformers import BertForPretraining, BertModel, BertPretrainingCriterion
 from paddlenlp.transformers import ErnieForPretraining, ErnieModel, ErniePretrainingCriterion
 from paddlenlp.transformers import BertTokenizer, ErnieTokenizer
+from paddlenlp.transformers import LinearDecayWithWarmup
 from data import create_data_holder, create_pretraining_dataset
 MODEL_CLASSES = {
@@ -222,16 +223,11 @@ def do_train(args):
                      masked_lm_labels, next_sentence_labels, masked_lm_scale)
     # Define the dynamic learing_reate scheduler and optimizer
-    lr_scheduler = paddle.optimizer.lr.LambdaDecay(
-        args.learning_rate,
-        lambda current_step, num_warmup_steps=args.warmup_steps,
-        num_training_steps=args.max_steps if args.max_steps > 0 else
-        (len(train_data_loader) * args.num_train_epochs): float(
-            current_step) / float(max(1, num_warmup_steps))
-        if current_step < num_warmup_steps else max(
-            0.0,
-            float(num_training_steps - current_step) / float(
-                max(1, num_training_steps - num_warmup_steps))))
+    num_training_steps = args.max_steps if args.max_steps > 0 else len(
+        train_data_loader) * args.num_train_epochs
+
+    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
+                                         args.warmup_steps)
     optimizer = paddle.optimizer.AdamW(
         learning_rate=lr_scheduler,
@@ -244,8 +240,8 @@ def do_train(args):
         ],
         multi_precision=args.use_pure_fp16)
     if args.use_amp:
-        custom_black_list=(['lookup_table', 'lookup_table_v2']
-                           if args.use_pure_fp16 else None)
+        custom_black_list = (['lookup_table', 'lookup_table_v2']
+                             if args.use_pure_fp16 else None)
         amp_list = paddle.static.amp.AutoMixedPrecisionLists(
             custom_white_list=['layer_norm', 'softmax', 'gelu'],
             custom_black_list=custom_black_list)
......
@@ -9,13 +9,13 @@ import paddle.nn as nn
 import paddle.nn.functional as F
 import paddle.distributed as dist
 from paddle.io import DataLoader, DistributedBatchSampler, BatchSampler
-from paddle.optimizer.lr import LambdaDecay
 from paddle.optimizer import AdamW
 from paddle.metric import Accuracy
 from paddlenlp.datasets import MapDatasetWrapper
 from paddlenlp.data import Stack, Tuple, Pad
 from paddlenlp.transformers import BertTokenizer, BertForSequenceClassification, BertForTokenClassification
+from paddlenlp.transformers import LinearDecayWithWarmup
 from args import parse_args, set_default_args
 import data
@@ -54,14 +54,6 @@ def save_ckpt(model, optimizer, output_dir, name):
     paddle.save(optimizer.state_dict(), opt_path)
-
-def compute_lr_factor(current_step, warmup_steps, max_train_steps):
-    if current_step < warmup_steps:
-        factor = float(current_step) / warmup_steps
-    else:
-        factor = 1 - float(current_step) / max_train_steps
-    return factor
-
 class DGULossFunction(nn.Layer):
     def __init__(self, task_name):
         super(DGULossFunction, self).__init__()
@@ -117,16 +109,14 @@ def print_logs(args, step, logits, labels, loss, total_time, metric):
 def train(args, model, train_data_loader, dev_data_loader, metric, rank):
     num_examples = len(train_data_loader) * args.batch_size * args.n_gpu
     max_train_steps = args.epochs * len(train_data_loader)
-    warmup_steps = int(max_train_steps * args.warmup_proportion)
     if rank == 0:
         print("Num train examples: %d" % num_examples)
         print("Max train steps: %d" % max_train_steps)
-        print("Num warmup steps: %d" % warmup_steps)
-    factor_fn = partial(
-        compute_lr_factor,
-        warmup_steps=warmup_steps,
-        max_train_steps=max_train_steps)
-    lr_scheduler = LambdaDecay(args.learning_rate, factor_fn)
+        print("Warmup proportion: %d" % args.warmup_proportion)
+
+    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, max_train_steps,
+                                         args.warmup_proportion)
     optimizer = AdamW(
         learning_rate=lr_scheduler,
         parameters=model.parameters(),
......
@@ -32,6 +32,7 @@ from paddlenlp.data.sampler import SamplerHelper
 from paddlenlp.transformers import BertForSequenceClassification, BertTokenizer
 from paddlenlp.transformers import ElectraForSequenceClassification, ElectraTokenizer
 from paddlenlp.transformers import ErnieForSequenceClassification, ErnieTokenizer
+from paddlenlp.transformers import LinearDecayWithWarmup
 from paddlenlp.metrics import AccuracyAndF1, Mcc, PearsonAndSpearman
 FORMAT = '%(asctime)s-%(levelname)s: %(message)s'
@@ -355,17 +356,10 @@ def do_train(args):
     num_training_steps = args.max_steps if args.max_steps > 0 else (
         len(train_data_loader) * args.num_train_epochs)
-    warmup_steps = args.warmup_steps if args.warmup_steps > 0 else (
-        int(math.floor(num_training_steps * args.warmup_proportion)))
-    lr_scheduler = paddle.optimizer.lr.LambdaDecay(
-        args.learning_rate,
-        lambda current_step, num_warmup_steps=warmup_steps,
-        num_training_steps=num_training_steps: float(
-            current_step) / float(max(1, num_warmup_steps))
-        if current_step < num_warmup_steps else max(
-            0.0,
-            float(num_training_steps - current_step) / float(
-                max(1, num_training_steps - num_warmup_steps))))
+    warmup = args.warmup_steps if args.warmup_steps > 0 else args.warmup_proportion
+
+    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
+                                         warmup)
     optimizer = paddle.optimizer.AdamW(
         learning_rate=lr_scheduler,
......
@@ -32,6 +32,7 @@ from paddlenlp.data import Stack, Tuple, Pad
 from paddlenlp.transformers import BertForSequenceClassification, BertTokenizer
 from paddlenlp.transformers import ElectraForSequenceClassification, ElectraTokenizer
 from paddlenlp.transformers import ErnieForSequenceClassification, ErnieTokenizer
+from paddlenlp.transformers import LinearDecayWithWarmup
 from paddlenlp.metrics import AccuracyAndF1, Mcc, PearsonAndSpearman
 FORMAT = '%(asctime)s-%(levelname)s: %(message)s'
@@ -162,14 +163,16 @@ def parse_args():
         type=str,
         default="gpu",
         help="Device for selecting for the training.")
-    parser.add_argument("--use_amp",
-                        type=distutils.util.strtobool,
-                        default=False,
-                        help="Enable mixed precision training.")
-    parser.add_argument("--scale_loss",
-                        type=float,
-                        default=2**15,
-                        help="The value of scale_loss for fp16.")
+    parser.add_argument(
+        "--use_amp",
+        type=distutils.util.strtobool,
+        default=False,
+        help="Enable mixed precision training.")
+    parser.add_argument(
+        "--scale_loss",
+        type=float,
+        default=2**15,
+        help="The value of scale_loss for fp16.")
     args = parser.parse_args()
     return args
@@ -360,17 +363,10 @@ def do_train(args):
     num_training_steps = args.max_steps if args.max_steps > 0 else (
         len(train_data_loader) * args.num_train_epochs)
-    warmup_steps = args.warmup_steps if args.warmup_steps > 0 else (
-        int(math.floor(num_training_steps * args.warmup_proportion)))
-    lr_scheduler = paddle.optimizer.lr.LambdaDecay(
-        args.learning_rate,
-        lambda current_step, num_warmup_steps=warmup_steps,
-        num_training_steps=num_training_steps: float(
-            current_step) / float(max(1, num_warmup_steps))
-        if current_step < num_warmup_steps else max(
-            0.0,
-            float(num_training_steps - current_step) / float(
-                max(1, num_training_steps - num_warmup_steps))))
+    warmup = args.warmup_steps if args.warmup_steps > 0 else args.warmup_proportion
+
+    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
+                                         warmup)
     optimizer = paddle.optimizer.AdamW(
         learning_rate=lr_scheduler,
......
@@ -34,6 +34,7 @@ from paddlenlp.data import Stack, Tuple, Pad
 from paddlenlp.transformers import BertForPretraining, BertModel, BertPretrainingCriterion
 from paddlenlp.transformers import ErnieForPretraining, ErnieModel, ErniePretrainingCriterion
 from paddlenlp.transformers import BertTokenizer, ErnieTokenizer
+from paddlenlp.transformers import LinearDecayWithWarmup
 FORMAT = '%(asctime)s-%(levelname)s: %(message)s'
 logging.basicConfig(level=logging.INFO, format=FORMAT)
@@ -147,14 +148,16 @@ def parse_args():
         type=str,
         default="gpu",
         help="Device for selecting for the training.")
-    parser.add_argument("--use_amp",
-                        type=distutils.util.strtobool,
-                        default=False,
-                        help="Enable mixed precision training.")
-    parser.add_argument("--scale_loss",
-                        type=float,
-                        default=2**15,
-                        help="The value of scale_loss for fp16.")
+    parser.add_argument(
+        "--use_amp",
+        type=distutils.util.strtobool,
+        default=False,
+        help="Enable mixed precision training.")
+    parser.add_argument(
+        "--scale_loss",
+        type=float,
+        default=2**15,
+        help="The value of scale_loss for fp16.")
     args = parser.parse_args()
     return args
@@ -301,17 +304,11 @@ def do_train(args):
     # If use defalut last_epoch, lr of the first iteration is 0.
     # Use `last_epoch = 0` to be consistent with nv bert.
-    lr_scheduler = paddle.optimizer.lr.LambdaDecay(
-        args.learning_rate,
-        lambda current_step, num_warmup_steps=args.warmup_steps,
-        num_training_steps=args.max_steps if args.max_steps > 0 else
-        (len(train_data_loader) * args.num_train_epochs): float(
-            current_step) / float(max(1, num_warmup_steps))
-        if current_step < num_warmup_steps else max(
-            0.0,
-            float(num_training_steps - current_step) / float(
-                max(1, num_training_steps - num_warmup_steps))),
-        last_epoch=0)
+    num_training_steps = args.max_steps if args.max_steps > 0 else len(
+        train_data_loader) * args.num_train_epochs
+
+    lr_scheduler = LinearDecayWithWarmup(
+        args.learning_rate, num_training_steps, args.warmup_steps, last_epoch=0)
     optimizer = paddle.optimizer.AdamW(
         learning_rate=lr_scheduler,
@@ -390,8 +387,8 @@ def do_train(args):
                 attention_mask=input_mask,
                 masked_positions=masked_lm_positions)
             loss = criterion(prediction_scores, seq_relationship_score,
-                masked_lm_labels, next_sentence_labels,
-                masked_lm_scale)
+                             masked_lm_labels, next_sentence_labels,
+                             masked_lm_scale)
             if args.use_amp:
                 scaler.scale(loss).backward()
                 scaler.minimize(optimizer, loss)
......
@@ -31,6 +31,7 @@ from paddlenlp.datasets import GlueCoLA, GlueSST2, GlueMRPC, GlueSTSB, GlueQQP,
 from paddlenlp.data import Stack, Tuple, Pad
 from paddlenlp.data.sampler import SamplerHelper
 from paddlenlp.transformers import ElectraForSequenceClassification, ElectraTokenizer
+from paddlenlp.transformers import LinearDecayWithWarmup
 from paddlenlp.utils.log import logger
 from paddlenlp.metrics import AccuracyAndF1, Mcc, PearsonAndSpearman
@@ -226,16 +227,9 @@ def do_train(args):
     num_training_steps = args.max_steps if args.max_steps > 0 else (
         len(train_data_loader) * args.num_train_epochs)
-    warmup_steps = int(math.floor(num_training_steps * args.warmup_proportion))
-    lr_scheduler = paddle.optimizer.lr.LambdaDecay(
-        args.learning_rate,
-        lambda current_step, num_warmup_steps=warmup_steps,
-        num_training_steps=num_training_steps: float(
-            current_step) / float(max(1, num_warmup_steps))
-        if current_step < num_warmup_steps else max(
-            0.0,
-            float(num_training_steps - current_step) / float(
-                max(1, num_training_steps - num_warmup_steps))))
+
+    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
+                                         args.warmup_proportion)
     optimizer = paddle.optimizer.AdamW(
         learning_rate=lr_scheduler,
......
@@ -32,6 +32,7 @@ from paddle.io import DataLoader, Dataset
 from paddlenlp.transformers import ElectraForTotalPretraining, ElectraModel, ElectraPretrainingCriterion
 from paddlenlp.transformers import ElectraDiscriminator, ElectraGenerator
 from paddlenlp.transformers import ElectraTokenizer
+from paddlenlp.transformers import LinearDecayWithWarmup
 FORMAT = '%(asctime)s-%(levelname)s: %(message)s'
 logging.basicConfig(level=logging.INFO, format=FORMAT)
@@ -470,15 +471,9 @@ def do_train(args):
     num_training_steps = args.max_steps if args.max_steps > 0 else (
         len(train_data_loader) * args.num_train_epochs)
-    lr_scheduler = paddle.optimizer.lr.LambdaDecay(
-        args.learning_rate,
-        lambda current_step, num_warmup_steps=args.warmup_steps,
-        num_training_steps=num_training_steps: float(
-            current_step) / float(max(1, num_warmup_steps))
-        if current_step < num_warmup_steps else max(
-            0.0,
-            float(num_training_steps - current_step) / float(
-                max(1, num_training_steps - num_warmup_steps))))
+
+    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
+                                         args.warmup_steps)
     clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
     optimizer = paddle.optimizer.AdamW(
......
@@ -29,6 +29,7 @@ import paddlenlp as ppnlp
 from paddlenlp.datasets import SQuAD, DuReaderRobust, CMRC, DRCD
 from paddlenlp.data import Pad, Stack, Tuple
 from paddlenlp.transformers import BertForQuestionAnswering, BertTokenizer, ErnieForQuestionAnswering, ErnieTokenizer
+from paddlenlp.transformers import LinearDecayWithWarmup
 from paddlenlp.metrics.squad import squad_evaluate, compute_predictions
 TASK_CLASSES = {"dureader-robust": DuReaderRobust, "cmrc": CMRC, "drcd": DRCD}
@@ -177,16 +178,11 @@ def do_train(args):
     if paddle.distributed.get_world_size() > 1:
         model = paddle.DataParallel(model)
-    lr_scheduler = paddle.optimizer.lr.LambdaDecay(
-        args.learning_rate,
-        lambda current_step, warmup_proportion=args.warmup_proportion,
-        num_training_steps=args.max_steps if args.max_steps > 0 else
-        (len(train_data_loader)*args.num_train_epochs): float(
-            current_step) / float(max(1, warmup_proportion*num_training_steps))
-        if current_step < warmup_proportion*num_training_steps else max(
-            0.0,
-            float(num_training_steps - current_step) / float(
-                max(1, num_training_steps - warmup_proportion*num_training_steps))))
+    num_training_steps = args.max_steps if args.max_steps > 0 else len(
+        train_data_loader) * args.num_train_epochs
+
+    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
+                                         args.warmup_proportion)
     optimizer = paddle.optimizer.AdamW(
         learning_rate=lr_scheduler,
......
@@ -28,6 +28,7 @@ import paddlenlp as ppnlp
 from paddlenlp.data import Pad, Stack, Tuple
 from paddlenlp.transformers import BertForSequenceClassification, BertTokenizer
+from paddlenlp.transformers import LinearDecayWithWarmup
 MODEL_CLASSES = {"bert": (BertForSequenceClassification, BertTokenizer)}
@@ -210,16 +211,11 @@ def do_train(args):
     if paddle.distributed.get_world_size() > 1:
         model = paddle.DataParallel(model)
-    lr_scheduler = paddle.optimizer.lr.LambdaDecay(
-        args.learning_rate,
-        lambda current_step, warmup_proportion=args.warmup_proportion,
-        num_training_steps=args.max_steps if args.max_steps > 0 else
-        (len(train_ds.examples)//args.batch_size*args.num_train_epochs): float(
-            current_step) / float(max(1, warmup_proportion*num_training_steps))
-        if current_step < warmup_proportion*num_training_steps else max(
-            0.0,
-            float(num_training_steps - current_step) / float(
-                max(1, num_training_steps - warmup_proportion*num_training_steps))))
+    num_training_steps = args.max_steps if args.max_steps > 0 else len(
+        train_ds.examples) // args.batch_size * args.num_train_epochs
+
+    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
+                                         args.warmup_proportion)
     optimizer = paddle.optimizer.AdamW(
         learning_rate=lr_scheduler,
......
@@ -28,6 +28,7 @@ import paddlenlp as ppnlp
 from paddlenlp.data import Pad, Stack, Tuple
 from paddlenlp.transformers import BertForQuestionAnswering, BertTokenizer, ErnieForQuestionAnswering, ErnieTokenizer
+from paddlenlp.transformers import LinearDecayWithWarmup
 from paddlenlp.metrics.dureader import dureader_evaluate, compute_predictions
 MODEL_CLASSES = {
@@ -168,16 +169,11 @@ def do_train(args):
     if paddle.distributed.get_world_size() > 1:
         model = paddle.DataParallel(model)
-    lr_scheduler = paddle.optimizer.lr.LambdaDecay(
-        args.learning_rate,
-        lambda current_step, warmup_proportion=args.warmup_proportion,
-        num_training_steps=args.max_steps if args.max_steps > 0 else
-        (len(train_dataset.examples)//args.batch_size*args.num_train_epochs): float(
-            current_step) / float(max(1, warmup_proportion*num_training_steps))
-        if current_step < warmup_proportion*num_training_steps else max(
-            0.0,
-            float(num_training_steps - current_step) / float(
-                max(1, num_training_steps - warmup_proportion*num_training_steps))))
+    num_training_steps = args.max_steps if args.max_steps > 0 else len(
+        train_dataset.examples) // args.batch_size * args.num_train_epochs
+
+    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
+                                         args.warmup_proportion)
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
......
@@ -28,6 +28,7 @@ import paddlenlp as ppnlp
 from paddlenlp.data import Pad, Stack, Tuple
 from paddlenlp.transformers import BertForQuestionAnswering, BertTokenizer, ErnieForQuestionAnswering, ErnieTokenizer
+from paddlenlp.transformers import LinearDecayWithWarmup
 from paddlenlp.metrics.squad import squad_evaluate, compute_predictions
 MODEL_CLASSES = {
@@ -168,16 +169,11 @@ def do_train(args):
     if paddle.distributed.get_world_size() > 1:
         model = paddle.DataParallel(model)
-    lr_scheduler = paddle.optimizer.lr.LambdaDecay(
-        args.learning_rate,
-        lambda current_step, warmup_proportion=args.warmup_proportion,
-        num_training_steps=args.max_steps if args.max_steps > 0 else
-        (len(train_dataset.examples)//args.batch_size*args.num_train_epochs): float(
-            current_step) / float(max(1, warmup_proportion*num_training_steps))
-        if current_step < warmup_proportion*num_training_steps else max(
-            0.0,
-            float(num_training_steps - current_step) / float(
-                max(1, num_training_steps - warmup_proportion*num_training_steps))))
+    num_training_steps = args.max_steps if args.max_steps > 0 else len(
+        train_dataset.examples) // args.batch_size * args.num_train_epochs
+
+    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
+                                         args.warmup_proportion)
     optimizer = paddle.optimizer.AdamW(
         learning_rate=lr_scheduler,
......
@@ -28,6 +28,7 @@ from paddle.metric import Metric, Accuracy, Precision, Recall
 from paddlenlp.data import Stack, Tuple, Pad
 from paddlenlp.data.sampler import SamplerHelper
 from paddlenlp.transformers import BertModel, BertForSequenceClassification, BertTokenizer
+from paddlenlp.transformers import LinearDecayWithWarmup
 from paddlenlp.utils.log import logger
 from paddlenlp.metrics import AccuracyAndF1, Mcc, PearsonAndSpearman
 import paddlenlp.datasets as datasets
@@ -459,16 +460,11 @@ def do_train(args):
         num_heads=model.bert.config['num_attention_heads'])
     reorder_neuron_head(ofa_model.model, head_importance, neuron_importance)
-    lr_scheduler = paddle.optimizer.lr.LambdaDecay(
-        args.learning_rate,
-        lambda current_step, num_warmup_steps=args.warmup_steps,
-        num_training_steps=args.max_steps if args.max_steps > 0 else
-        (len(train_data_loader) * args.num_train_epochs): float(
-            current_step) / float(max(1, num_warmup_steps))
-        if current_step < num_warmup_steps else max(
-            0.0,
-            float(num_training_steps - current_step) / float(
-                max(1, num_training_steps - num_warmup_steps))))
+    num_training_steps = args.max_steps if args.max_steps > 0 else len(
+        train_data_loader) * args.num_train_epochs
+
+    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
+                                         args.warmup_steps)
     optimizer = paddle.optimizer.AdamW(
         learning_rate=lr_scheduler,
......
@@ -27,6 +27,7 @@ import paddlenlp as ppnlp
 from paddlenlp.datasets import MSRA_NER
 from paddlenlp.data import Stack, Tuple, Pad
 from paddlenlp.transformers import BertForTokenClassification, BertTokenizer
+from paddlenlp.transformers import LinearDecayWithWarmup
 from paddlenlp.metrics import ChunkEvaluator
@@ -291,16 +292,11 @@ def do_train(args):
     if paddle.distributed.get_world_size() > 1:
         model = paddle.DataParallel(model)
-    lr_scheduler = paddle.optimizer.lr.LambdaDecay(
-        args.learning_rate,
-        lambda current_step, num_warmup_steps=args.warmup_steps,
-        num_training_steps=args.max_steps if args.max_steps > 0 else
-        (len(train_data_loader) * args.num_train_epochs): float(
-            current_step) / float(max(1, num_warmup_steps))
-        if current_step < num_warmup_steps else max(
-            0.0,
-            float(num_training_steps - current_step) / float(
-                max(1, num_training_steps - num_warmup_steps))))
+    num_training_steps = args.max_steps if args.max_steps > 0 else len(
+        train_data_loader) * args.num_train_epochs
+
+    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
+                                         args.warmup_steps)
     optimizer = paddle.optimizer.AdamW(
         learning_rate=lr_scheduler,
......
@@ -22,8 +22,9 @@ import numpy as np
 import paddle
 import paddle.nn.functional as F
-from paddlenlp.data import Stack, Tuple, Pad
 import paddlenlp as ppnlp
+from paddlenlp.data import Stack, Tuple, Pad
+from paddlenlp.transformers import LinearDecayWithWarmup
 # yapf: disable
 parser = argparse.ArgumentParser()
@@ -214,19 +215,10 @@ def do_train():
         model = paddle.DataParallel(model)
     num_training_steps = len(train_data_loader) * args.epochs
-    num_warmup_steps = int(args.warmup_proportion * num_training_steps)
-
-    def get_lr_factor(current_step):
-        if current_step < num_warmup_steps:
-            return float(current_step) / float(max(1, num_warmup_steps))
-        else:
-            return max(0.0,
-                       float(num_training_steps - current_step) /
-                       float(max(1, num_training_steps - num_warmup_steps)))
-
-    lr_scheduler = paddle.optimizer.lr.LambdaDecay(
-        args.learning_rate,
-        lr_lambda=lambda current_step: get_lr_factor(current_step))
+
+    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
+                                         args.warmup_proportion)
     optimizer = paddle.optimizer.AdamW(
         learning_rate=lr_scheduler,
         parameters=model.parameters(),
......
@@ -10,6 +10,7 @@ import paddle.nn as nn
 from paddle.io import DataLoader
 from paddlenlp.transformers import ErnieForGeneration
 from paddlenlp.transformers import ErnieTokenizer, ErnieTinyTokenizer, BertTokenizer, ElectraTokenizer, RobertaTokenizer
+from paddlenlp.transformers import LinearDecayWithWarmup
 from paddlenlp.datasets import Poetry
 from paddlenlp.data import Stack, Tuple, Pad
 from paddlenlp.metrics import Rouge1, Rouge2
@@ -175,16 +176,10 @@ def train():
     if paddle.distributed.get_world_size() > 1:
         model = paddle.DataParallel(model)
-    max_steps = (len(train_data_loader) * args.num_epochs)
-    lr_scheduler = paddle.optimizer.lr.LambdaDecay(
-        args.learning_rate,
-        lambda current_step, num_warmup_steps=max_steps*args.warmup_proportion,
-        num_training_steps=max_steps: float(
-            current_step) / float(max(1, num_warmup_steps))
-        if current_step < num_warmup_steps else max(
-            0.0,
-            float(num_training_steps - current_step) / float(
-                max(1, num_training_steps - num_warmup_steps))))
+    max_steps = len(train_data_loader) * args.num_epochs
+
+    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, max_steps,
+                                         args.warmup_proportion)
     optimizer = paddle.optimizer.AdamW(
         learning_rate=lr_scheduler,
......
@@ -22,8 +22,9 @@ import numpy as np
 import paddle
 import paddle.nn.functional as F
-from paddlenlp.data import Stack, Tuple, Pad
 import paddlenlp as ppnlp
+from paddlenlp.data import Stack, Tuple, Pad
+from paddlenlp.transformers import LinearDecayWithWarmup
 from model import SentenceTransformer
@@ -231,19 +232,10 @@ def do_train():
         model = paddle.DataParallel(model)
     num_training_steps = len(train_data_loader) * args.epochs
-    num_warmup_steps = int(args.warmup_proption * num_training_steps)
-
-    def get_lr_factor(current_step):
-        if current_step < num_warmup_steps:
-            return float(current_step) / float(max(1, num_warmup_steps))
-        else:
-            return max(0.0,
-                       float(num_training_steps - current_step) /
-                       float(max(1, num_training_steps - num_warmup_steps)))
-
-    lr_scheduler = paddle.optimizer.lr.LambdaDecay(
-        args.learning_rate,
-        lr_lambda=lambda current_step: get_lr_factor(current_step))
+
+    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
+                                         args.warmup_proportion)
     optimizer = paddle.optimizer.AdamW(
         learning_rate=lr_scheduler,
         parameters=model.parameters(),
......
@@ -27,6 +27,7 @@ from paddlenlp.datasets import GlueQNLI, GlueSST2
 from paddlenlp.data import Stack, Tuple, Pad
 from paddlenlp.data.sampler import SamplerHelper
 from paddlenlp.transformers import BertForSequenceClassification, BertTokenizer
+from paddlenlp.transformers import LinearDecayWithWarmup
 FORMAT = '%(asctime)s-%(levelname)s: %(message)s'
 logging.basicConfig(level=logging.INFO, format=FORMAT)
@@ -332,16 +333,12 @@ def do_train(args):
     # Create the training-backward program, this pass will not be
     # executed in the validation
     with paddle.static.program_guard(main_program, startup_program):
-        lr_scheduler = paddle.optimizer.lr.LambdaDecay(
-            args.learning_rate,
-            lambda current_step, num_warmup_steps=args.warmup_steps,
-            num_training_steps=args.max_steps if args.max_steps > 0 else
-            (len(train_data_loader) * args.num_train_epochs): float(
-                current_step) / float(max(1, num_warmup_steps))
-            if current_step < num_warmup_steps else max(
-                0.0,
-                float(num_training_steps - current_step) / float(
-                    max(1, num_training_steps - num_warmup_steps))))
+        num_training_steps = args.max_steps if args.max_steps > 0 else len(
+            train_data_loader) * args.num_train_epochs
+
+        lr_scheduler = LinearDecayWithWarmup(
+            args.learning_rate, num_training_steps, args.warmup_steps)
         optimizer = paddle.optimizer.AdamW(
             learning_rate=lr_scheduler,
             epsilon=args.adam_epsilon,
......
@@ -28,6 +28,7 @@ from paddle.io import DataLoader, Dataset
 from paddlenlp.transformers import BertForPretraining, BertModel, BertPretrainingCriterion
 from paddlenlp.transformers import BertTokenizer
+from paddlenlp.transformers import LinearDecayWithWarmup
 from data import create_data_holder, create_pretraining_dataset
 MODEL_CLASSES = {"bert": (BertForPretraining, BertTokenizer)}
@@ -198,17 +199,11 @@ def do_train(args):
     loss = criterion(prediction_scores, seq_relationship_score,
                      masked_lm_labels, next_sentence_labels, masked_lm_scale)
+    num_training_steps = args.max_steps if args.max_steps > 0 else len(
+        train_data_loader) * args.num_train_epochs
     # Define the dynamic learing_reate scheduler and optimizer
-    lr_scheduler = paddle.optimizer.lr.LambdaDecay(
-        args.learning_rate,
-        lambda current_step, num_warmup_steps=args.warmup_steps,
-        num_training_steps=args.max_steps if args.max_steps > 0 else
-        (len(train_data_loader) * args.num_train_epochs): float(
-            current_step) / float(max(1, num_warmup_steps))
-        if current_step < num_warmup_steps else max(
-            0.0,
-            float(num_training_steps - current_step) / float(
-                max(1, num_training_steps - num_warmup_steps))))
+    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
+                                         args.warmup_steps)
     optimizer = paddle.optimizer.AdamW(
         learning_rate=lr_scheduler,
......
@@ -27,6 +27,7 @@ import paddle
 from paddle.io import DataLoader, Dataset
 from paddlenlp.transformers import BertForPretraining, BertModel, BertPretrainingCriterion
 from paddlenlp.transformers import BertTokenizer
+from paddlenlp.transformers import LinearDecayWithWarmup
 from data import create_data_holder, create_pretraining_dataset
 MODEL_CLASSES = {"bert": (BertForPretraining, BertTokenizer)}
@@ -210,17 +211,11 @@ def do_train(args):
     loss = criterion(prediction_scores, seq_relationship_score,
                      masked_lm_labels, next_sentence_labels, masked_lm_scale)
+    num_training_steps = args.max_steps if args.max_steps > 0 else len(
+        train_data_loader) * args.num_train_epochs
     # Define the dynamic learing_reate scheduler and optimizer
-    lr_scheduler = paddle.optimizer.lr.LambdaDecay(
-        args.learning_rate,
-        lambda current_step, num_warmup_steps=args.warmup_steps,
-        num_training_steps=args.max_steps if args.max_steps > 0 else
-        (len(train_data_loader) * args.num_train_epochs): float(
-            current_step) / float(max(1, num_warmup_steps))
-        if current_step < num_warmup_steps else max(
-            0.0,
-            float(num_training_steps - current_step) / float(
-                max(1, num_training_steps - num_warmup_steps))))
+    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
+                                         args.warmup_steps)
     optimizer = paddle.optimizer.AdamW(
         learning_rate=lr_scheduler,
......
@@ -27,3 +27,4 @@ from .electra.modeling import *
 from .electra.tokenizer import *
 from .transformer.modeling import *
 from .ernie_gen.modeling import ErnieForGeneration
+from .optimization import *
@@ -31,6 +31,7 @@ from paddlenlp.data import *
 from paddlenlp.data.sampler import SamplerHelper
 from paddlenlp.transformers.model_bert import *
 from paddlenlp.transformers.tokenizer_bert import BertTokenizer
+from paddlenlp.transformers import LinearDecayWithWarmup
 TASK_CLASSES = {
     "qnli": (GlueQNLI, paddle.metric.Accuracy),  # (dataset, metric)
@@ -307,16 +308,11 @@ def do_train(args):
     if paddle.distributed.get_world_size() > 1:
         model = paddle.DataParallel(model)
-    lr_scheduler = paddle.optimizer.lr.LambdaDecay(
-        args.learning_rate,
-        lambda current_step, num_warmup_steps=args.warmup_steps,
-        num_training_steps=args.max_steps if args.max_steps > 0 else
-        (len(train_data_loader) * args.num_train_epochs): float(
-            current_step) / float(max(1, num_warmup_steps))
-        if current_step < num_warmup_steps else max(
-            0.0,
-            float(num_training_steps - current_step) / float(
-                max(1, num_training_steps - num_warmup_steps))))
+    num_training_steps = args.max_steps if args.max_steps > 0 else len(
+        train_data_loader) * args.num_train_epochs
+
+    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
+                                         args.warmup_steps)
     optimizer = paddle.optimizer.AdamW(
         learning_rate=lr_scheduler,
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import math

from paddle.optimizer.lr import LambdaDecay

__all__ = [
    'LinearDecayWithWarmup', 'ConstScheduleWithWarmup', 'CosineDecayWithWarmup',
    'PolyDecayWithWarmup'
]


def is_integer(number):
    if sys.version > '3':
        return isinstance(number, int)
    return isinstance(number, (int, long))


class LinearDecayWithWarmup(LambdaDecay):
    """
    Creates a learning rate scheduler that increases the learning rate linearly
    from 0 to the given `learning_rate` during the warmup period, and then
    decreases it linearly from the base learning rate to 0.

    Args:
        learning_rate (float): The base learning rate. It is a python float
            number.
        total_steps (int): The number of training steps.
        warmup (int|float): If int, it means the number of steps for warmup.
            If float, it means the proportion of warmup in total training steps.
        last_epoch (int, optional): The index of last epoch. It can be set to
            restart training. If None, it means initial learning rate.
            Default: -1.
        verbose (bool, optional): If True, prints a message to stdout for each
            update. Default: False.

    Examples:
        .. code-block:: python

            from paddlenlp.transformers import LinearDecayWithWarmup
            lr, warmup_steps, max_steps = 0.1, 100, 1000
            lr_scheduler = LinearDecayWithWarmup(lr, max_steps, warmup_steps)
    """

    def __init__(self,
                 learning_rate,
                 total_steps,
                 warmup,
                 last_epoch=-1,
                 verbose=False):
        warmup_steps = warmup if is_integer(warmup) else int(
            math.floor(warmup * total_steps))

        def lr_lambda(current_step):
            if current_step < warmup_steps:
                return float(current_step) / float(max(1, warmup_steps))
            return max(0.0,
                       float(total_steps - current_step) /
                       float(max(1, total_steps - warmup_steps)))

        super(LinearDecayWithWarmup, self).__init__(learning_rate, lr_lambda,
                                                    last_epoch, verbose)
class ConstScheduleWithWarmup(LambdaDecay):
    """
    Creates a learning rate scheduler that increases the learning rate linearly
    from 0 to the given `learning_rate` during the warmup period, and keeps the
    learning rate constant after that.

    Args:
        learning_rate (float): The base learning rate. It is a python float
            number.
        warmup (int|float): If int, it means the number of steps for warmup.
            If float, it means the proportion of warmup in total training steps.
        total_steps (int, optional): The number of training steps. If `warmup`
            is a float number, `total_steps` must be provided.
        last_epoch (int, optional): The index of last epoch. It can be set to
            restart training. If None, it means initial learning rate.
            Default: -1.

    Examples:
        .. code-block:: python

            from paddlenlp.transformers import ConstScheduleWithWarmup
            lr, warmup_steps = 0.1, 100
            lr_scheduler = ConstScheduleWithWarmup(lr, warmup_steps)
    """

    def __init__(self,
                 learning_rate,
                 warmup,
                 total_steps=None,
                 last_epoch=-1,
                 verbose=False):
        if is_integer(warmup):
            warmup_steps = warmup
        elif total_steps:
            warmup_steps = int(math.floor(warmup * total_steps))
        else:
            raise ValueError(
                "Please provide `total_steps` if `warmup` is a float number, "
                "or provide an integer for argument `warmup`.")

        def lr_lambda(current_step):
            if current_step < warmup_steps:
                return float(current_step) / float(max(1.0, warmup_steps))
            return 1.0

        super(ConstScheduleWithWarmup, self).__init__(learning_rate, lr_lambda,
                                                      last_epoch, verbose)
class CosineDecayWithWarmup(LambdaDecay):
    """
    Creates a learning rate scheduler that increases the learning rate linearly
    from 0 to the given `learning_rate` during the warmup period, and then
    decreases it following the values of the cosine function. If
    `with_hard_restarts` is True, the cosine function can have several hard
    restarts.

    Args:
        learning_rate (float): The base learning rate. It is a python float
            number.
        total_steps (int): The number of training steps.
        warmup (int|float): If int, it means the number of steps for warmup.
            If float, it means the proportion of warmup in total training steps.
        with_hard_restarts (bool): Whether the cosine function has several hard
            restarts. Default: False.
        num_cycles (int|float, optional): If `with_hard_restarts` is False, it
            means the number of waves in the cosine schedule and should be an
            integer number, defaulting to 1. If `with_hard_restarts` is True,
            it means the number of hard restarts to use and should be a float
            number, defaulting to 0.5. Default: None.
        last_epoch (int, optional): The index of last epoch. It can be set to
            restart training. If None, it means initial learning rate.
            Default: -1.

    Examples:
        .. code-block:: python

            from paddlenlp.transformers import CosineDecayWithWarmup
            lr, warmup_steps, max_steps = 0.1, 100, 1000
            lr_scheduler = CosineDecayWithWarmup(lr, max_steps, warmup_steps)
    """

    def __init__(self,
                 learning_rate,
                 total_steps,
                 warmup,
                 with_hard_restarts=False,
                 num_cycles=None,
                 last_epoch=-1,
                 verbose=False):
        warmup_steps = warmup if is_integer(warmup) else int(
            math.floor(warmup * total_steps))

        # Input check
        if num_cycles is not None:
            assert not with_hard_restarts and isinstance(num_cycles, int) or \
                with_hard_restarts and isinstance(num_cycles, float), \
                "`num_cycles` should be an integer when `with_hard_restarts` is False, " \
                "and a float when `with_hard_restarts` is True."
        else:
            num_cycles = 1 if not with_hard_restarts else 0.5

        def lr_lambda(current_step):
            if current_step < warmup_steps:
                return float(current_step) / float(max(1, warmup_steps))

            progress = float(current_step - warmup_steps) / float(
                max(1, total_steps - warmup_steps))

            if with_hard_restarts:
                if progress >= 1.0:
                    return 0.0
                return max(0.0, 0.5 * (1.0 + math.cos(math.pi * (
                    (float(num_cycles) * progress) % 1.0))))

            return max(0.0, 0.5 * (
                1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)))

        super(CosineDecayWithWarmup, self).__init__(learning_rate, lr_lambda,
                                                    last_epoch, verbose)
class PolyDecayWithWarmup(LambdaDecay):
    """
    Creates a learning rate scheduler that increases the learning rate linearly
    from 0 to the given `lr_init` during the warmup period, and then decreases
    it as a polynomial decay from the base learning rate to the end learning
    rate `lr_end`.

    Args:
        learning_rate (float): The base learning rate. It is a python float
            number.
        total_steps (int): The number of training steps.
        warmup (int|float): If int, it means the number of steps for warmup.
            If float, it means the proportion of warmup in total training steps.
        lr_end (float, optional): The end learning rate. Default: 1e-7.
        power (float, optional): Power factor. Default: 1.0.
        last_epoch (int, optional): The index of last epoch. It can be set to
            restart training. If None, it means initial learning rate.
            Default: -1.

    Examples:
        .. code-block:: python

            from paddlenlp.transformers import PolyDecayWithWarmup
            lr, lr_end, warmup_steps, max_steps = 0.1, 1e-6, 100, 1000
            lr_scheduler = PolyDecayWithWarmup(lr, max_steps, warmup_steps, lr_end)
    """

    def __init__(self,
                 learning_rate,
                 total_steps,
                 warmup,
                 lr_end=1e-7,
                 power=1.0,
                 last_epoch=-1,
                 verbose=False):
        lr_init = learning_rate
        assert lr_init > lr_end, f"`lr_end` must be smaller than `learning_rate`. But `lr_end` is {lr_end} while `learning_rate` is {lr_init}."
        warmup_steps = warmup if is_integer(warmup) else int(
            math.floor(warmup * total_steps))

        def lr_lambda(current_step):
            if current_step < warmup_steps:
                return float(current_step) / float(max(1, warmup_steps))
            elif current_step > total_steps:
                return lr_end / lr_init  # multiplying by lr_init gives lr_end
            else:
                lr_range = lr_init - lr_end
                decay_steps = total_steps - warmup_steps
                pct_remaining = 1 - (current_step - warmup_steps) / decay_steps
                decay = lr_range * pct_remaining**power + lr_end
                return decay / lr_init  # multiplying by lr_init gives decay

        super(PolyDecayWithWarmup, self).__init__(lr_init, lr_lambda,
                                                  last_epoch, verbose)