diff --git a/PaddleNLP/language_model/train.py b/PaddleNLP/language_model/train.py
index f12e9431f105f4ab05817294e90831d00fbe1416..a96c5844df7da79652b72e1edf6754f85b056eef 100644
--- a/PaddleNLP/language_model/train.py
+++ b/PaddleNLP/language_model/train.py
@@ -137,7 +137,7 @@ def main():
         res_vars = res_vars[:-1]
     loss, last_hidden, last_cell, feed_order = res_vars
 
-    clip1 = fluid.clip.GradientClipByGlobalNorm(
+    clip = fluid.clip.GradientClipByGlobalNorm(
         clip_norm=config.max_grad_norm)
 
     learning_rate = fluid.layers.create_global_var(
@@ -148,7 +148,7 @@ def main():
         persistable=True)
 
     optimizer = fluid.optimizer.SGD(learning_rate=learning_rate,
-                                    grad_clip=clip1)
+                                    grad_clip=clip)
     optimizer.minimize(loss)
 
     # define inference program
diff --git a/PaddleNLP/pretrain_language_models/BERT/optimization.py b/PaddleNLP/pretrain_language_models/BERT/optimization.py
index 7a6dc3d366d4c4e18fd83d0d4046ad88d86dc2fd..0900c840c19949dee4f51a732643c9e2b39241af 100755
--- a/PaddleNLP/pretrain_language_models/BERT/optimization.py
+++ b/PaddleNLP/pretrain_language_models/BERT/optimization.py
@@ -102,10 +102,9 @@ def optimization(loss,
             raise ValueError("Unkown learning rate scheduler, should be "
                              "'noam_decay' or 'linear_warmup_decay'")
 
-    clip1 = fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0)
+    clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0)
 
-    optimizer = fluid.optimizer.Adam(
-        learning_rate=scheduled_lr, grad_clip=clip1)
+    optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr, grad_clip=clip)
 
     def exclude_from_weight_decay(param):
         name = param.name.rstrip(".master")
diff --git a/PaddleNLP/pretrain_language_models/XLNet/optimization.py b/PaddleNLP/pretrain_language_models/XLNet/optimization.py
index 0911b33caa417b934e19d78af87d694120cd773a..109773eeb565ef1de940c3edfe77dad45f4aa395 100644
--- a/PaddleNLP/pretrain_language_models/XLNet/optimization.py
+++ b/PaddleNLP/pretrain_language_models/XLNet/optimization.py
@@ -110,10 +110,9 @@ def optimization(loss,
                 return True
         return False
 
-    clip1 = fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0)
+    clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0)
 
-    optimizer = fluid.optimizer.Adam(
-        learning_rate=scheduled_lr, grad_clip=clip1)
+    optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr, grad_clip=clip)
 
     param_list = dict()
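
For reference, a minimal, self-contained sketch of the gradient-clipping pattern used at the call sites touched above: build a `GradientClipByGlobalNorm` object and pass it to the optimizer through its `grad_clip` argument. The toy regression network, feed names, and learning rate below are illustrative placeholders, not code from this repository.

```python
# Minimal sketch (assumes PaddlePaddle 1.8-era fluid APIs, as used in these files).
import paddle.fluid as fluid

main_prog = fluid.Program()
startup_prog = fluid.Program()
with fluid.program_guard(main_prog, startup_prog):
    # Toy regression network: placeholder inputs, one FC layer, MSE loss.
    x = fluid.data(name="x", shape=[None, 13], dtype="float32")
    y = fluid.data(name="y", shape=[None, 1], dtype="float32")
    pred = fluid.layers.fc(input=x, size=1)
    loss = fluid.layers.reduce_mean(fluid.layers.square_error_cost(pred, y))

    # Clip all gradients so their global L2 norm does not exceed 1.0,
    # then hand the clip object to the optimizer via `grad_clip`.
    clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0)
    optimizer = fluid.optimizer.SGD(learning_rate=0.01, grad_clip=clip)
    optimizer.minimize(loss)
```

Passing `grad_clip` at construction scopes the clipping to that optimizer instance, which is why each file builds its own clip object right next to the optimizer it configures.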