From 923d11fd48e15d69fc4ffbf8c9e9e7a123866c2c Mon Sep 17 00:00:00 2001
From: Zhou Wei <52485244+zhouwei25@users.noreply.github.com>
Date: Fri, 3 Apr 2020 20:45:25 +0800
Subject: [PATCH] fix models because the gradient clip strategy has been upgraded (#4515)

* fix models because the gradient clip strategy has been upgraded,test=develop

* fix models because the gradient clip strategy has been upgraded,test=develop
---
 PaddleCV/ocr_recognition/crnn_ctc_model.py       | 4 ----
 PaddleNLP/dialogue_domain_classification/nets.py | 5 -----
 PaddleRec/gru4rec/dy_graph/gru4rec_dy.py         | 2 +-
 dygraph/bert/optimization.py                     | 2 +-
 dygraph/ocr_recognition/train.py                 | 2 +-
 dygraph/ptb_lm/ptb_dy.py                         | 6 +++---
 dygraph/seq2seq/train.py                         | 4 ++--
 7 files changed, 8 insertions(+), 17 deletions(-)

diff --git a/PaddleCV/ocr_recognition/crnn_ctc_model.py b/PaddleCV/ocr_recognition/crnn_ctc_model.py
index 55db15ca..9c7c0419 100755
--- a/PaddleCV/ocr_recognition/crnn_ctc_model.py
+++ b/PaddleCV/ocr_recognition/crnn_ctc_model.py
@@ -114,7 +114,6 @@ def encoder_net(images,
                 num_classes,
                 rnn_hidden_size=200,
                 regularizer=None,
-                gradient_clip=None,
                 is_test=False,
                 use_cudnn=False):
     conv_features = ocr_convs(
@@ -130,16 +129,13 @@ def encoder_net(images,

     para_attr = fluid.ParamAttr(
         regularizer=regularizer,
-        gradient_clip=gradient_clip,
         initializer=fluid.initializer.Normal(0.0, 0.02))
     bias_attr = fluid.ParamAttr(
         regularizer=regularizer,
-        gradient_clip=gradient_clip,
         initializer=fluid.initializer.Normal(0.0, 0.02),
         learning_rate=2.0)
     bias_attr_nobias = fluid.ParamAttr(
         regularizer=regularizer,
-        gradient_clip=gradient_clip,
         initializer=fluid.initializer.Normal(0.0, 0.02))

     fc_1 = fluid.layers.fc(input=sliced_feature,
diff --git a/PaddleNLP/dialogue_domain_classification/nets.py b/PaddleNLP/dialogue_domain_classification/nets.py
index 77912b3b..13a69538 100755
--- a/PaddleNLP/dialogue_domain_classification/nets.py
+++ b/PaddleNLP/dialogue_domain_classification/nets.py
@@ -33,26 +33,21 @@ def textcnn_net_multi_label(data,
    """
    init_bound = 0.1
    initializer = fluid.initializer.Uniform(low=-init_bound, high=init_bound)
-    #gradient_clip = fluid.clip.GradientClipByNorm(10.0)
-    gradient_clip = None
    regularizer = fluid.regularizer.L2DecayRegularizer(
        regularization_coeff=1e-4)
    seg_param_attrs = fluid.ParamAttr(name="seg_weight",
                                      learning_rate=640.0,
                                      initializer=initializer,
-                                      gradient_clip=gradient_clip,
                                      trainable=True)
    fc_param_attrs_1 = fluid.ParamAttr(name="fc_weight_1",
                                       learning_rate=1.0,
                                       regularizer=regularizer,
                                       initializer=initializer,
-                                       gradient_clip=gradient_clip,
                                       trainable=True)
    fc_param_attrs_2 = fluid.ParamAttr(name="fc_weight_2",
                                       learning_rate=1.0,
                                       regularizer=regularizer,
                                       initializer=initializer,
-                                       gradient_clip=gradient_clip,
                                       trainable=True)

    if win_sizes is None:
diff --git a/PaddleRec/gru4rec/dy_graph/gru4rec_dy.py b/PaddleRec/gru4rec/dy_graph/gru4rec_dy.py
index 2a87e0bf..54455a57 100644
--- a/PaddleRec/gru4rec/dy_graph/gru4rec_dy.py
+++ b/PaddleRec/gru4rec/dy_graph/gru4rec_dy.py
@@ -408,7 +408,7 @@ def train_ptb_lm():
        if args.ce:
            print("kpis\ttest_ppl\t%0.3f" % ppl[0])

-    grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(max_grad_norm)
+    grad_clip = fluid.clip.GradientClipByGlobalNorm(max_grad_norm)
    for epoch_id in range(max_epoch):
        ptb_model.train()
        total_loss = 0.0
diff --git a/dygraph/bert/optimization.py b/dygraph/bert/optimization.py
index 5c4c02b7..7f87fa8c 100755
--- a/dygraph/bert/optimization.py
+++ b/dygraph/bert/optimization.py
@@ -134,7 +134,7 @@ class Optimizer(object):

        param_list = dict()
        clip_norm_thres = 1.0
-        #grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(clip_norm_thres)
+        #grad_clip = fluid.clip.GradientClipByGlobalNorm(clip_norm_thres)

        if use_data_parallel:
            loss = model.scale_loss(loss)
diff --git a/dygraph/ocr_recognition/train.py b/dygraph/ocr_recognition/train.py
index 6e5792a1..e01b01c1 100644
--- a/dygraph/ocr_recognition/train.py
+++ b/dygraph/ocr_recognition/train.py
@@ -74,7 +74,7 @@ def train(args):
        learning_rate = LR
        optimizer = fluid.optimizer.Adam(learning_rate=learning_rate,
                                         parameter_list=ocr_attention.parameters())
-        grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(args.gradient_clip)
+        grad_clip = fluid.clip.GradientClipByGlobalNorm(args.gradient_clip)

        train_reader = data_reader.data_reader(
            args.batch_size,
diff --git a/dygraph/ptb_lm/ptb_dy.py b/dygraph/ptb_lm/ptb_dy.py
index fc468b84..de85084b 100644
--- a/dygraph/ptb_lm/ptb_dy.py
+++ b/dygraph/ptb_lm/ptb_dy.py
@@ -32,8 +32,8 @@ import time

from args import *

-#import fluid.dygraph_grad_clip as dygraph_clip
-#from fluid.dygraph_grad_clip import *
+#import fluid.clip as clip
+#from fluid.clip import *

import sys
if sys.version[0] == '2':
@@ -371,7 +371,7 @@ def train_ptb_lm():
    ce_time = []
    ce_ppl = []

-    grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(max_grad_norm)
+    grad_clip = fluid.clip.GradientClipByGlobalNorm(max_grad_norm)
    for epoch_id in range(max_epoch):
        ptb_model.train()
        total_loss = 0.0
diff --git a/dygraph/seq2seq/train.py b/dygraph/seq2seq/train.py
index 89d18168..19b17ae7 100755
--- a/dygraph/seq2seq/train.py
+++ b/dygraph/seq2seq/train.py
@@ -27,7 +27,7 @@ import contextlib

import paddle
import paddle.fluid as fluid
-from paddle.fluid.dygraph_grad_clip import GradClipByGlobalNorm
+from paddle.fluid.clip import GradientClipByGlobalNorm

import reader

@@ -84,7 +84,7 @@ def main():
                            num_layers=num_layers,
                            init_scale=init_scale,
                            dropout=dropout)
-        gloabl_norm_clip = GradClipByGlobalNorm(max_grad_norm)
+        gloabl_norm_clip = GradientClipByGlobalNorm(max_grad_norm)
        lr = args.learning_rate
        opt_type = args.optimizer
        if opt_type == "sgd":
-- 
GitLab
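Editor's note (not part of the upstream patch): the hunks above migrate from the removed fluid.dygraph_grad_clip.GradClipByGlobalNorm to fluid.clip.GradientClipByGlobalNorm and drop the per-parameter gradient_clip argument from fluid.ParamAttr. The diff only shows where the clip object is constructed, not how it is consumed; the sketch below is a minimal dygraph example under the assumption that the clip is attached to the optimizer through its grad_clip argument, with a hypothetical Linear model standing in for the real networks.

    import numpy as np
    import paddle.fluid as fluid

    with fluid.dygraph.guard():
        # Hypothetical stand-in model for illustration only.
        model = fluid.dygraph.Linear(8, 2)

        # Upgraded API: the clip comes from fluid.clip, not fluid.dygraph_grad_clip.
        grad_clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0)

        # Assumption: clipping is configured on the optimizer instead of on ParamAttr.
        sgd = fluid.optimizer.SGDOptimizer(learning_rate=0.01,
                                           parameter_list=model.parameters(),
                                           grad_clip=grad_clip)

        x = fluid.dygraph.to_variable(np.random.rand(4, 8).astype('float32'))
        loss = fluid.layers.reduce_mean(model(x))
        loss.backward()
        sgd.minimize(loss)          # gradients are clipped by global norm before the update
        model.clear_gradients()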