diff --git a/PaddleCV/ocr_recognition/attention_model.py b/PaddleCV/ocr_recognition/attention_model.py
index 963d2168fd6ec4f53724573895344825c18558a3..4a2dad271ed5ff91f251138e4df1504af4a8a5f6 100755
--- a/PaddleCV/ocr_recognition/attention_model.py
+++ b/PaddleCV/ocr_recognition/attention_model.py
@@ -188,7 +188,7 @@ def attention_train_net(args, data_shape, num_classes):
     prediction = gru_decoder_with_attention(trg_embedding, encoded_vector,
                                             encoded_proj, decoder_boot,
                                             decoder_size, num_classes)
-    fluid.clip.set_gradient_clip(fluid.clip.GradientClipByValue(args.gradient_clip))
+    fluid.clip.set_gradient_clip(fluid.clip.GradientClipByGlobalNorm(args.gradient_clip))
     label_out = fluid.layers.cast(x=label_out, dtype='int64')
 
     _, maxid = fluid.layers.topk(input=prediction, k=1)
diff --git a/PaddleCV/ocr_recognition/run_attention.sh b/PaddleCV/ocr_recognition/run_attention.sh
index 50ddba7119d3b9ad09150fe8c677a22ea7732ab2..beae85cf719d9ec01f599812e7412dadb9e5b681 100644
--- a/PaddleCV/ocr_recognition/run_attention.sh
+++ b/PaddleCV/ocr_recognition/run_attention.sh
@@ -1,7 +1,7 @@
 export CUDA_VISIBLE_DEVICES=0
 nohup python train.py \
 --lr=1.0 \
---gradient_clip=10 \
+--gradient_clip=5.0 \
 --model="attention" \
 --log_period=10 \
 > attention.log 2>&1 &
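
For reference, the sketch below illustrates the general difference between the two clipping strategies this diff switches between. It is a standalone NumPy illustration, not code from this repository; the function names and gradient values are made up. Clipping by value clamps each gradient element to [-clip, clip] independently, which can change the direction of the update, while clipping by global norm rescales all gradients together so their combined L2 norm stays below the threshold (here lowered from 10 to 5.0), preserving the update direction.

```python
# Standalone illustration (NumPy only); helper names and values are
# hypothetical and not taken from attention_model.py.
import numpy as np

def clip_by_value(grads, clip):
    # Old behavior: clamp every element to [-clip, clip] independently.
    return [np.clip(g, -clip, clip) for g in grads]

def clip_by_global_norm(grads, clip_norm):
    # New behavior: rescale all gradients together so the global L2 norm
    # of the concatenated gradients is at most clip_norm.
    global_norm = np.sqrt(sum(np.sum(np.square(g)) for g in grads))
    scale = clip_norm / max(global_norm, clip_norm)
    return [g * scale for g in grads]

grads = [np.array([3.0, -4.0]), np.array([12.0])]   # global norm = 13
print(clip_by_value(grads, 5.0))        # [ 3., -4.] and [ 5.]: direction changes
print(clip_by_global_norm(grads, 5.0))  # everything scaled by 5/13: direction preserved
```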