diff --git a/configs/vqa/re/layoutxlm.yml b/configs/vqa/re/layoutxlm.yml index 06f4bf97ddc3572bc00d0eae1bac5b80d75e64a0..bb367f0e64db6b67a77a555c61049c6b580e23a2 100644 --- a/configs/vqa/re/layoutxlm.yml +++ b/configs/vqa/re/layoutxlm.yml @@ -6,12 +6,12 @@ Global: save_model_dir: ./output/re_layoutxlm/ save_epoch_step: 2000 # evaluation is run every 10 iterations after the 0th iteration - eval_batch_step: [ 0, 38 ] + eval_batch_step: [ 0, 19 ] cal_metric_during_train: False - pretrained_model: &pretrained_model layoutxlm-base-uncased + pretrained_model: &pretrained_model layoutxlm-base-uncased # This field can only be changed by modifying the configuration file save_inference_dir: use_visualdl: False - infer_img: ppstructure/vqa/images/input/zh_val_21.jpg + infer_img: doc/vqa/input/zh_val_21.jpg save_res_path: ./output/re/ Architecture: diff --git a/configs/vqa/ser/layoutlm.yml b/configs/vqa/ser/layoutlm.yml index e33a6a237a6666032cb3de8cbdc58056408e1ab6..a635fc7dbec1e1364aca85fe660c6fe44433da5b 100644 --- a/configs/vqa/ser/layoutlm.yml +++ b/configs/vqa/ser/layoutlm.yml @@ -8,10 +8,10 @@ Global: # evaluation is run every 10 iterations after the 0th iteration eval_batch_step: [ 0, 19 ] cal_metric_during_train: False - pretrained_model: &pretrained_model layoutlm-base-uncased + pretrained_model: &pretrained_model layoutlm-base-uncased # This field can only be changed by modifying the configuration file save_inference_dir: use_visualdl: False - infer_img: ppstructure/vqa/images/input/zh_val_0.jpg + infer_img: doc/vqa/input/zh_val_0.jpg save_res_path: ./output/ser/predicts_layoutlm.txt Architecture: diff --git a/configs/vqa/ser/layoutxlm.yml b/configs/vqa/ser/layoutxlm.yml index 1197e0ae737d4b97909c209a62fd027ee2942946..1c1eac2289384990fc914f85d9f2a9233cda7440 100644 --- a/configs/vqa/ser/layoutxlm.yml +++ b/configs/vqa/ser/layoutxlm.yml @@ -8,10 +8,10 @@ Global: # evaluation is run every 10 iterations after the 0th iteration eval_batch_step: [ 0, 19 ] cal_metric_during_train: False - pretrained_model: &pretrained_model layoutxlm-base-uncased + pretrained_model: &pretrained_model layoutxlm-base-uncased # This field can only be changed by modifying the configuration file save_inference_dir: use_visualdl: False - infer_img: ppstructure/vqa/images/input/zh_val_42.jpg + infer_img: doc/vqa/input/zh_val_42.jpg save_res_path: ./output/ser Architecture: diff --git a/ppstructure/vqa/images/input/zh_val_0.jpg b/doc/vqa/input/zh_val_0.jpg similarity index 100% rename from ppstructure/vqa/images/input/zh_val_0.jpg rename to doc/vqa/input/zh_val_0.jpg diff --git a/ppstructure/vqa/images/input/zh_val_21.jpg b/doc/vqa/input/zh_val_21.jpg similarity index 100% rename from ppstructure/vqa/images/input/zh_val_21.jpg rename to doc/vqa/input/zh_val_21.jpg diff --git a/ppstructure/vqa/images/input/zh_val_40.jpg b/doc/vqa/input/zh_val_40.jpg similarity index 100% rename from ppstructure/vqa/images/input/zh_val_40.jpg rename to doc/vqa/input/zh_val_40.jpg diff --git a/ppstructure/vqa/images/input/zh_val_42.jpg b/doc/vqa/input/zh_val_42.jpg similarity index 100% rename from ppstructure/vqa/images/input/zh_val_42.jpg rename to doc/vqa/input/zh_val_42.jpg diff --git a/ppstructure/vqa/images/result_re/zh_val_21_re.jpg b/doc/vqa/result_re/zh_val_21_re.jpg similarity index 100% rename from ppstructure/vqa/images/result_re/zh_val_21_re.jpg rename to doc/vqa/result_re/zh_val_21_re.jpg diff --git a/ppstructure/vqa/images/result_re/zh_val_40_re.jpg b/doc/vqa/result_re/zh_val_40_re.jpg similarity index 100% rename from ppstructure/vqa/images/result_re/zh_val_40_re.jpg rename to doc/vqa/result_re/zh_val_40_re.jpg diff --git a/ppstructure/vqa/images/result_ser/zh_val_0_ser.jpg b/doc/vqa/result_ser/zh_val_0_ser.jpg similarity index 100% rename from ppstructure/vqa/images/result_ser/zh_val_0_ser.jpg rename to doc/vqa/result_ser/zh_val_0_ser.jpg diff --git a/ppstructure/vqa/images/result_ser/zh_val_42_ser.jpg b/doc/vqa/result_ser/zh_val_42_ser.jpg similarity index 100% rename from ppstructure/vqa/images/result_ser/zh_val_42_ser.jpg rename to doc/vqa/result_ser/zh_val_42_ser.jpg diff --git a/ppocr/data/imaug/label_ops.py b/ppocr/data/imaug/label_ops.py index d80f4ec792fd632f5772883e52e53c847b52b581..4cbd790051d58107395617aafed0e7148f1db05c 100644 --- a/ppocr/data/imaug/label_ops.py +++ b/ppocr/data/imaug/label_ops.py @@ -787,7 +787,7 @@ class SARLabelEncode(BaseRecLabelEncode): class VQATokenLabelEncode(object): """ - 基于NLP的标签编码 + Label encode for NLP VQA methods """ def __init__(self, diff --git a/ppocr/data/simple_dataset.py b/ppocr/data/simple_dataset.py index 1dc86d6d18ad92fb130f319c66883adb7fe0e203..08b00d09feea2f8169f35fe07ba9c189d51203a2 100644 --- a/ppocr/data/simple_dataset.py +++ b/ppocr/data/simple_dataset.py @@ -122,7 +122,7 @@ class SimpleDataSet(Dataset): self.logger.error( "When parsing line {}, error happened with msg: {}".format( data_line, traceback.format_exc())) - # outs = None + outs = None if outs is None: # during evaluation, we should fix the idx to get same results for many times of evaluation. rnd_idx = np.random.randint(self.__len__( diff --git a/ppocr/losses/vqa_token_layoutlm_loss.py b/ppocr/losses/vqa_token_layoutlm_loss.py index 7ad311f58649ad282d3bed915a83653bfcaa07e3..244893d97d0e422c5ca270bdece689e13aba2b07 100755 --- a/ppocr/losses/vqa_token_layoutlm_loss.py +++ b/ppocr/losses/vqa_token_layoutlm_loss.py @@ -1,4 +1,4 @@ -# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/ppocr/postprocess/vqa_token_re_layoutlm_postprocess.py b/ppocr/postprocess/vqa_token_re_layoutlm_postprocess.py index 1fbea0faaa11c6aa529d893cf8ca768fc7a36029..1d55d13d76b496ba0a5b540ba915889ce9146a8e 100644 --- a/ppocr/postprocess/vqa_token_re_layoutlm_postprocess.py +++ b/ppocr/postprocess/vqa_token_re_layoutlm_postprocess.py @@ -34,7 +34,7 @@ class VQAReTokenLayoutLMPostProcess(object): entity_idx_dict_batch = kwargs['entity_idx_dict_batch'] pred_relations = preds['pred_relations'] - # 进行 relations 到 ocr信息的转换 + # merge relations and ocr info results = [] for pred_relation, ser_result, entity_idx_dict in zip( pred_relations, ser_results, entity_idx_dict_batch): diff --git a/ppstructure/vqa/README.md b/ppstructure/vqa/README.md index 58095665bd49a439421560e67c35e4af20b1313c..ca3e2bcc0aae073f0c5ee6e8485464bad9756307 100644 --- a/ppstructure/vqa/README.md +++ b/ppstructure/vqa/README.md @@ -34,7 +34,7 @@ PP-Structure 里的 DOC-VQA算法基于PaddleNLP自然语言处理算法库进 ### 2.1 SER -![](./images/result_ser/zh_val_0_ser.jpg) | ![](./images/result_ser/zh_val_42_ser.jpg) +![](../../doc/vqa/result_ser/zh_val_0_ser.jpg) | ![](../../doc/vqa/result_ser/zh_val_42_ser.jpg) ---|--- 图中不同颜色的框表示不同的类别,对于XFUN数据集,有`QUESTION`, `ANSWER`, `HEADER` 3种类别 @@ -48,7 +48,7 @@ PP-Structure 里的 DOC-VQA算法基于PaddleNLP自然语言处理算法库进 ### 2.2 RE -![](./images/result_re/zh_val_21_re.jpg) | ![](./images/result_re/zh_val_40_re.jpg) +![](../../doc/vqa/result_re/zh_val_21_re.jpg) | ![](../../doc/vqa/result_re/zh_val_40_re.jpg) ---|--- @@ -164,7 +164,7 @@ CUDA_VISIBLE_DEVICES=0 python3 tools/eval.py -c configs/vqa/ser/layoutxlm.yml -o 使用如下命令即可完成`OCR引擎 + SER`的串联预测 ```shell -CUDA_VISIBLE_DEVICES=0 python3 tools/infer_vqa_token_ser.py -c configs/vqa/ser/layoutxlm.yml -o Architecture.Backbone.checkpoints=PP-Layout_v1.0_ser_pretrained/ Global.infer_img=ppstructure/vqa/images/input/zh_val_42.jpg +CUDA_VISIBLE_DEVICES=0 python3 tools/infer_vqa_token_ser.py -c configs/vqa/ser/layoutxlm.yml -o Architecture.Backbone.checkpoints=PP-Layout_v1.0_ser_pretrained/ Global.infer_img=doc/vqa/input/zh_val_42.jpg ``` 最终会在`config.Global.save_res_path`字段所配置的目录下保存预测结果可视化图像以及预测结果文本文件,预测结果文本文件名为`infer_results.txt`。 @@ -219,7 +219,7 @@ CUDA_VISIBLE_DEVICES=0 python3 tools/eval.py -c configs/vqa/re/layoutxlm.yml -o 使用如下命令即可完成`OCR引擎 + SER + RE`的串联预测 ```shell export CUDA_VISIBLE_DEVICES=0 -python3 tools/infer_vqa_token_ser_re.py -c configs/vqa/re/layoutxlm.yml -o Architecture.Backbone.checkpoints=PP-Layout_v1.0_re_pretrained/ Global.infer_img=ppstructure/vqa/images/input/zh_val_21.jpg -c_ser configs/vqa/ser/layoutxlm.yml -o_ser Architecture.Backbone.checkpoints=PP-Layout_v1.0_ser_pretrained/ +python3 tools/infer_vqa_token_ser_re.py -c configs/vqa/re/layoutxlm.yml -o Architecture.Backbone.checkpoints=PP-Layout_v1.0_re_pretrained/ Global.infer_img=doc/vqa/input/zh_val_21.jpg -c_ser configs/vqa/ser/layoutxlm.yml -o_ser Architecture.Backbone.checkpoints=PP-Layout_v1.0_ser_pretrained/ ``` 最终会在`config.Global.save_res_path`字段所配置的目录下保存预测结果可视化图像以及预测结果文本文件,预测结果文本文件名为`infer_results.txt`。 diff --git a/tools/infer_vqa_token_ser_re.py b/tools/infer_vqa_token_ser_re.py index ce9a36c55233384ba79794932626b2e1c9251a9f..fd62ace8aef35db168537580513139e429e88cc3 100755 --- a/tools/infer_vqa_token_ser_re.py +++ b/tools/infer_vqa_token_ser_re.py @@ -104,7 +104,7 @@ def make_input(ser_inputs, ser_results): ser_inputs[8] = entities_batch ser_inputs.append(relations_batch) - + # remove ocr_info segment_offset_id and label in ser input ser_inputs.pop(7) ser_inputs.pop(6) ser_inputs.pop(1)