diff --git a/demo/finetune_classifier.py b/demo/finetune_classifier.py
index 4e9540d143e2d7bd407cf852a520e3c2c3a3135e..ce14d3ef5116cbf7b01ea2f1ec0a0257870f9a40 100644
--- a/demo/finetune_classifier.py
+++ b/demo/finetune_classifier.py
@@ -177,15 +177,15 @@ if args.use_lr_decay:
         lr_scheduler,
         parameters=model.parameters(),
         weight_decay=args.wd,
-        apply_decay_param_fun=lambda n: param_name_to_exclue_from_weight_decay.match(n),
+        apply_decay_param_fun=lambda n: not param_name_to_exclue_from_weight_decay.match(n),
         grad_clip=g_clip)
 else:
     lr_scheduler = None
-    opt = P.optimizer.Adam(
+    opt = P.optimizer.AdamW(
         args.lr,
         parameters=model.parameters(),
         weight_decay=args.wd,
-        apply_decay_param_fun=lambda n: param_name_to_exclue_from_weight_decay.match(n),
+        apply_decay_param_fun=lambda n: not param_name_to_exclue_from_weight_decay.match(n),
         grad_clip=g_clip)
 
 scaler = P.amp.GradScaler(enable=args.use_amp)
@@ -209,7 +209,8 @@ with LogWriter(
                 lr_scheduler and lr_scheduler.step()
 
                 if step % 10 == 0:
-                    _lr = lr_scheduler.get_lr()
+                    _lr = lr_scheduler.get_lr(
+                    ) if args.use_lr_decay else args.lr
                     if args.use_amp:
                         _l = (loss / scaler._scale).numpy()
                         msg = '[step-%d] train loss %.5f lr %.3e scaling %.3e' % (
diff --git a/demo/finetune_classifier_distributed.py b/demo/finetune_classifier_distributed.py
index d1df8675369125baf3c1605347cfa0848b0903de..d4b1195d0664a716ebf5b9bc03df251698f660f5 100644
--- a/demo/finetune_classifier_distributed.py
+++ b/demo/finetune_classifier_distributed.py
@@ -144,7 +144,7 @@ lr_scheduler = P.optimizer.lr.LambdaDecay(
 opt = P.optimizer.AdamW(
     learning_rate=lr_scheduler,
     parameters=model.parameters(),
-    apply_decay_param_fun=lambda n: param_name_to_exclue_from_weight_decay.match(n),
+    apply_decay_param_fun=lambda n: not param_name_to_exclue_from_weight_decay.match(n),
     weight_decay=args.wd,
     grad_clip=g_clip)
 scaler = P.amp.GradScaler(enable=args.use_amp)
diff --git a/demo/finetune_ner.py b/demo/finetune_ner.py
index 7489f16dac0b236bd703bd20565cf0500b5f48ab..6929afdcb2492f58cdaa7c3ecdb5140fe0b843e2 100644
--- a/demo/finetune_ner.py
+++ b/demo/finetune_ner.py
@@ -210,7 +210,7 @@ opt = P.optimizer.AdamW(
     lr_scheduler,
     parameters=model.parameters(),
     weight_decay=args.wd,
-    apply_decay_param_fun=lambda n: param_name_to_exclue_from_weight_decay.match(n),
+    apply_decay_param_fun=lambda n: not param_name_to_exclue_from_weight_decay.match(n),
     grad_clip=g_clip)
 scaler = P.amp.GradScaler(enable=args.use_amp)
 
diff --git a/demo/finetune_sentiment_analysis.py b/demo/finetune_sentiment_analysis.py
index 015d29d4d2fe0acdf34d0032ebdffd3946515e62..16087fa0a59bd6fbfa1cfd460fd8a96dfa9bc34f 100644
--- a/demo/finetune_sentiment_analysis.py
+++ b/demo/finetune_sentiment_analysis.py
@@ -126,7 +126,7 @@ if not args.eval:
         lr_scheduler,
         parameters=model.parameters(),
         weight_decay=args.wd,
-        apply_decay_param_fun=lambda n: param_name_to_exclue_from_weight_decay.match(n),
+        apply_decay_param_fun=lambda n: not param_name_to_exclue_from_weight_decay.match(n),
         grad_clip=g_clip)
     scaler = P.amp.GradScaler(enable=args.use_amp)
     with LogWriter(logdir=str(create_if_not_exists(args.save_dir /
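
In `paddle.optimizer.AdamW`, `apply_decay_param_fun` is called with each parameter's name and decoupled weight decay is applied only to parameters for which it returns True, so the exclusion regex has to be negated; without the `not`, decay was applied only to the parameters meant to be excluded. A minimal sketch of the corrected wiring, assuming Paddle 2.x; the regex pattern and the `Linear` layer below are illustrative stand-ins, not the demo scripts' actual model:

    # Sketch only: pattern and model are placeholders for the demo code.
    import re
    import paddle

    # Names matching this pattern should be EXCLUDED from weight decay
    # (the demo scripts exclude biases and layer-norm parameters).
    param_name_to_exclue_from_weight_decay = re.compile(r'.*layer_norm|.*b_0')

    model = paddle.nn.Linear(8, 2)
    opt = paddle.optimizer.AdamW(
        learning_rate=1e-4,
        parameters=model.parameters(),
        weight_decay=0.01,
        # AdamW decays a parameter only when this returns True, hence the `not`:
        # excluded names yield a Match object, which `not` turns into False.
        apply_decay_param_fun=lambda n: not param_name_to_exclue_from_weight_decay.match(n))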