diff --git a/PaddleNLP/examples/dialogue/dgu/main.py b/PaddleNLP/examples/dialogue/dgu/main.py
index b7a12664550b60990974993bb0477ad805394d78..352f0ef0f6931a6584b1f280667a9ef993ee12da 100644
--- a/PaddleNLP/examples/dialogue/dgu/main.py
+++ b/PaddleNLP/examples/dialogue/dgu/main.py
@@ -128,22 +128,13 @@ def train(args, model, train_data_loader, dev_data_loader, metric, rank):
         max_train_steps=max_train_steps)
     lr_scheduler = LambdaDecay(args.learning_rate, factor_fn)
     optimizer = AdamW(
-        learning_rate=lr_scheduler,
-        parameters=model.parameters(),
-        weight_decay=args.weight_decay,
-        apply_decay_param_fun=lambda x: x in [
-            params.name for params in model.parameters()
-            if not any(nd in params.name for nd in ['bias', 'norm'])],
-        grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm)
-    )
-    optimizer = paddle.optimizer.AdamW(
         learning_rate=lr_scheduler,
         parameters=model.parameters(),
         weight_decay=args.weight_decay,
         apply_decay_param_fun=lambda x: x in [
             p.name for n, p in model.named_parameters()
-            if not any(nd in n for nd in ["bias", "norm"])
-        ])
+            if not any(nd in n for nd in ["bias", "norm"])],
+        grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm))
     loss_fn = DGULossFunction(args.task_name)
 
     load_ckpt(args, model, optimizer)