diff --git a/doc/doc_ch/algorithm_rec_visionlan.md b/doc/doc_ch/algorithm_rec_visionlan.md index 0c4fe86e58831f4f5480483f5c21ff1da4176d2b..df039491d49e192349d57b44cc448c57e4211098 100644 --- a/doc/doc_ch/algorithm_rec_visionlan.md +++ b/doc/doc_ch/algorithm_rec_visionlan.md @@ -101,7 +101,7 @@ python3 tools/export_model.py -c configs/rec/rec_r45_visionlan.yml -o Global.pre 执行如下命令进行模型推理: ```shell -python3 tools/infer/predict_rec.py --image_dir='./doc/imgs_words/en/word_2.png' --rec_model_dir='./inference/rec_r45_visionlan/' --rec_algorithm='VisionLAN' --rec_image_shape='3,64,256' --rec_char_dict_path='./ppocr/utils/dict36.txt' +python3 tools/infer/predict_rec.py --image_dir='./doc/imgs_words/en/word_2.png' --rec_model_dir='./inference/rec_r45_visionlan/' --rec_algorithm='VisionLAN' --rec_image_shape='3,64,256' --rec_char_dict_path='./ppocr/utils/ic15_dict.txt' --use_space_char=False # 预测文件夹下所有图像时,可修改image_dir为文件夹,如 --image_dir='./doc/imgs_words_en/'。 ``` @@ -110,7 +110,7 @@ python3 tools/infer/predict_rec.py --image_dir='./doc/imgs_words/en/word_2.png' 执行命令后,上面图像的预测结果(识别的文本和得分)会打印到屏幕上,示例如下: 结果如下: ```shell -Predicts of ./doc/imgs_words/en/word_2.png:('yourself', 0.97076982) +Predicts of ./doc/imgs_words/en/word_2.png:('yourself', 0.9999493) ``` **注意**: diff --git a/doc/doc_en/algorithm_rec_visionlan_en.md b/doc/doc_en/algorithm_rec_visionlan_en.md index ebd02d52f4252c672b4a76c940ccdd621f5354ef..70c2ccc470af0a03485d9d234e86e384c087617f 100644 --- a/doc/doc_en/algorithm_rec_visionlan_en.md +++ b/doc/doc_en/algorithm_rec_visionlan_en.md @@ -90,7 +90,7 @@ After the conversion is successful, there are three files in the directory: For VisionLAN text recognition model inference, the following commands can be executed: ``` -python3 tools/infer/predict_rec.py --image_dir='./doc/imgs_words/en/word_2.png' --rec_model_dir='./inference/rec_r45_visionlan/' --rec_algorithm='VisionLAN' --rec_image_shape='3,64,256' --rec_char_dict_path='./ppocr/utils/dict36.txt' +python3 tools/infer/predict_rec.py --image_dir='./doc/imgs_words/en/word_2.png' --rec_model_dir='./inference/rec_r45_visionlan/' --rec_algorithm='VisionLAN' --rec_image_shape='3,64,256' --rec_char_dict_path='./ppocr/utils/ic15_dict.txt' --use_space_char=False ``` ![](../imgs_words/en/word_2.png) @@ -98,7 +98,7 @@ python3 tools/infer/predict_rec.py --image_dir='./doc/imgs_words/en/word_2.png' After executing the command, the prediction result (recognized text and score) of the image above is printed to the screen, an example is as follows: The result is as follows: ```shell -Predicts of ./doc/imgs_words/en/word_2.png:('yourself', 0.97076982) +Predicts of ./doc/imgs_words/en/word_2.png:('yourself', 0.9999493) ``` diff --git a/ppocr/losses/__init__.py b/ppocr/losses/__init__.py index 8986e5e5b9f488b023781176011024276c437e11..1851fc84e4ee7ec69f9c5261446fea50bec493a0 100755 --- a/ppocr/losses/__init__.py +++ b/ppocr/losses/__init__.py @@ -67,7 +67,7 @@ def build_loss(config): 'ClsLoss', 'AttentionLoss', 'SRNLoss', 'PGLoss', 'CombinedLoss', 'CELoss', 'TableAttentionLoss', 'SARLoss', 'AsterLoss', 'SDMGRLoss', 'VQASerTokenLayoutLMLoss', 'LossFromOutput', 'PRENLoss', 'MultiLoss', - 'TableMasterLoss', 'SPINAttentionLoss', 'VLLoss','StrokeFocusLoss' + 'TableMasterLoss', 'SPINAttentionLoss', 'VLLoss', 'StrokeFocusLoss' ] config = copy.deepcopy(config) module_name = config.pop('name') diff --git a/ppocr/postprocess/rec_postprocess.py b/ppocr/postprocess/rec_postprocess.py index 7b994f810d6747a91aceec82641f433d816b3feb..fc9fccfb143bf31ec66989e279d0bcc1c9baa5cc 100644 --- a/ppocr/postprocess/rec_postprocess.py +++ b/ppocr/postprocess/rec_postprocess.py @@ -780,7 +780,7 @@ class VLLabelDecode(BaseRecLabelDecode): ) + length[i])].topk(1)[0][:, 0] preds_prob = paddle.exp( paddle.log(preds_prob).sum() / (preds_prob.shape[0] + 1e-6)) - text.append((preds_text, preds_prob)) + text.append((preds_text, preds_prob.numpy()[0])) if label is None: return text label = self.decode(label) diff --git a/tools/program.py b/tools/program.py index 34845f005f81aa20553cec98d231e8358698cff7..a61a5c7861d3fd3daf42a9196f6886f375241b32 100755 --- a/tools/program.py +++ b/tools/program.py @@ -490,7 +490,7 @@ def eval(model, break images = batch[0] start = time.time() - + # use amp if scaler: with paddle.amp.auto_cast(level='O2'): @@ -508,10 +508,10 @@ def eval(model, 1, 2, 0).astype(np.uint8) fm_lr = (lr_img[i].numpy() * 255).transpose( 1, 2, 0).astype(np.uint8) - cv2.imwrite("output/images/{}_{}_sr.jpg".format(sum_images, - i), fm_sr) - cv2.imwrite("output/images/{}_{}_lr.jpg".format(sum_images, - i), fm_lr) + cv2.imwrite("output/images/{}_{}_sr.jpg".format( + sum_images, i), fm_sr) + cv2.imwrite("output/images/{}_{}_lr.jpg".format( + sum_images, i), fm_lr) else: preds = model(images) else: @@ -529,10 +529,10 @@ def eval(model, 1, 2, 0).astype(np.uint8) fm_lr = (lr_img[i].numpy() * 255).transpose( 1, 2, 0).astype(np.uint8) - cv2.imwrite("output/images/{}_{}_sr.jpg".format(sum_images, - i), fm_sr) - cv2.imwrite("output/images/{}_{}_lr.jpg".format(sum_images, - i), fm_lr) + cv2.imwrite("output/images/{}_{}_sr.jpg".format( + sum_images, i), fm_sr) + cv2.imwrite("output/images/{}_{}_lr.jpg".format( + sum_images, i), fm_lr) else: preds = model(images) @@ -652,7 +652,7 @@ def preprocess(is_train=False): 'EAST', 'DB', 'SAST', 'Rosetta', 'CRNN', 'STARNet', 'RARE', 'SRN', 'CLS', 'PGNet', 'Distillation', 'NRTR', 'TableAttn', 'SAR', 'PSE', 'SEED', 'SDMGR', 'LayoutXLM', 'LayoutLM', 'LayoutLMv2', 'PREN', 'FCE', - 'SVTR', 'ViTSTR', 'ABINet', 'DB++', 'TableMaster', 'SPIN', 'VisionLAN', + 'SVTR', 'ViTSTR', 'ABINet', 'DB++', 'TableMaster', 'SPIN', 'VisionLAN', 'Gestalt' ]