diff --git a/configs/vqa/re/layoutlmv2.yml b/configs/vqa/re/layoutlmv2.yml
index 2fa5fd1165c20bbfa8d8505bbb53d48744daebef..737dbf6b600b1b414a7f66f422e59f46154d91a9 100644
--- a/configs/vqa/re/layoutlmv2.yml
+++ b/configs/vqa/re/layoutlmv2.yml
@@ -11,7 +11,7 @@ Global:
   save_inference_dir:
   use_visualdl: False
   seed: 2048
-  infer_img: doc/vqa/input/zh_val_21.jpg
+  infer_img: ppstructure/docs/vqa/input/zh_val_21.jpg
   save_res_path: ./output/re/
 
 Architecture:
@@ -21,7 +21,7 @@ Architecture:
   Backbone:
     name: LayoutLMv2ForRe
     pretrained: True
-    checkpoints:
+    checkpoints:
 
 Loss:
   name: LossFromOutput
@@ -52,7 +52,7 @@ Train:
     name: SimpleDataSet
     data_dir: train_data/XFUND/zh_train/image
     label_file_list:
-      - train_data/XFUND/zh_train/xfun_normalize_train.json
+      - train_data/XFUND/zh_train/train.json
     ratio_list: [ 1.0 ]
     transforms:
       - DecodeImage: # load image
@@ -61,7 +61,7 @@ Train:
       - VQATokenLabelEncode: # Class handling label
           contains_re: True
           algorithm: *algorithm
-          class_path: &class_path ppstructure/vqa/labels/labels_ser.txt
+          class_path: &class_path train_data/XFUND/class_list_xfun.txt
       - VQATokenPad:
           max_seq_len: &max_seq_len 512
           return_attention_mask: True
@@ -77,7 +77,7 @@ Train:
           order: 'hwc'
       - ToCHWImage:
       - KeepKeys:
-          keep_keys: [ 'input_ids', 'bbox', 'image', 'attention_mask', 'token_type_ids','entities', 'relations'] # dataloader will return list in this order
+          keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids','image', 'entities', 'relations'] # dataloader will return list in this order
   loader:
     shuffle: True
     drop_last: False
@@ -90,7 +90,7 @@ Eval:
     name: SimpleDataSet
     data_dir: train_data/XFUND/zh_val/image
     label_file_list:
-      - train_data/XFUND/zh_val/xfun_normalize_val.json
+      - train_data/XFUND/zh_val/val.json
     transforms:
       - DecodeImage: # load image
           img_mode: RGB
@@ -114,7 +114,7 @@ Eval:
           order: 'hwc'
       - ToCHWImage:
       - KeepKeys:
-          keep_keys: [ 'input_ids', 'bbox', 'image', 'attention_mask', 'token_type_ids','entities', 'relations'] # dataloader will return list in this order
+          keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image','entities', 'relations'] # dataloader will return list in this order
   loader:
     shuffle: False
     drop_last: False
diff --git a/configs/vqa/re/layoutxlm.yml b/configs/vqa/re/layoutxlm.yml
index ff16120ac1be92e989ebfda6af3ccf346dde89cd..d8585bb72593d55578ff3c6cd1401b5a843bb683 100644
--- a/configs/vqa/re/layoutxlm.yml
+++ b/configs/vqa/re/layoutxlm.yml
@@ -11,7 +11,7 @@ Global:
   save_inference_dir:
   use_visualdl: False
   seed: 2022
-  infer_img: doc/vqa/input/zh_val_21.jpg
+  infer_img: ppstructure/docs/vqa/input/zh_val_21.jpg
   save_res_path: ./output/re/
 
 Architecture:
@@ -52,7 +52,7 @@ Train:
     name: SimpleDataSet
     data_dir: train_data/XFUND/zh_train/image
     label_file_list:
-      - train_data/XFUND/zh_train/xfun_normalize_train.json
+      - train_data/XFUND/zh_train/train.json
     ratio_list: [ 1.0 ]
     transforms:
       - DecodeImage: # load image
@@ -61,7 +61,7 @@ Train:
       - VQATokenLabelEncode: # Class handling label
           contains_re: True
           algorithm: *algorithm
-          class_path: &class_path ppstructure/vqa/labels/labels_ser.txt
+          class_path: &class_path train_data/XFUND/class_list_xfun.txt
       - VQATokenPad:
           max_seq_len: &max_seq_len 512
           return_attention_mask: True
@@ -77,7 +77,7 @@ Train:
           order: 'hwc'
       - ToCHWImage:
       - KeepKeys:
-          keep_keys: [ 'input_ids', 'bbox', 'image', 'attention_mask', 'token_type_ids','entities', 'relations'] # dataloader will return list in this order
+          keep_keys: [ 'input_ids', 'bbox','attention_mask', 'token_type_ids', 'image', 'entities', 'relations'] # dataloader will return list in this order
   loader:
     shuffle: True
     drop_last: False
@@ -90,7 +90,7 @@ Eval:
     name: SimpleDataSet
     data_dir: train_data/XFUND/zh_val/image
     label_file_list:
-      - train_data/XFUND/zh_val/xfun_normalize_val.json
+      - train_data/XFUND/zh_val/val.json
     transforms:
       - DecodeImage: # load image
           img_mode: RGB
@@ -114,7 +114,7 @@ Eval:
           order: 'hwc'
       - ToCHWImage:
      - KeepKeys:
-          keep_keys: [ 'input_ids', 'bbox', 'image', 'attention_mask', 'token_type_ids','entities', 'relations'] # dataloader will return list in this order
+          keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'entities', 'relations'] # dataloader will return list in this order
   loader:
     shuffle: False
     drop_last: False
diff --git a/configs/vqa/ser/layoutlm.yml b/configs/vqa/ser/layoutlm.yml
index 47ab093e1fce5cb38a75409eb1d9ac67c6426ba4..53e114defd4cdfa427ae27b647603744302eb0e8 100644
--- a/configs/vqa/ser/layoutlm.yml
+++ b/configs/vqa/ser/layoutlm.yml
@@ -11,7 +11,7 @@ Global:
   save_inference_dir:
   use_visualdl: False
   seed: 2022
-  infer_img: doc/vqa/input/zh_val_0.jpg
+  infer_img: ppstructure/docs/vqa/input/zh_val_42.jpg
   save_res_path: ./output/ser/
 
 Architecture:
@@ -77,7 +77,7 @@ Train:
           order: 'hwc'
       - ToCHWImage:
       - KeepKeys:
-          keep_keys: [ 'input_ids','labels', 'bbox', 'image', 'attention_mask', 'token_type_ids'] # dataloader will return list in this order
+          keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'labels'] # dataloader will return list in this order
   loader:
     shuffle: True
     drop_last: False
@@ -112,7 +112,7 @@ Eval:
           order: 'hwc'
       - ToCHWImage:
       - KeepKeys:
-          keep_keys: [ 'input_ids', 'labels', 'bbox', 'image', 'attention_mask', 'token_type_ids'] # dataloader will return list in this order
+          keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'labels'] # dataloader will return list in this order
   loader:
     shuffle: False
     drop_last: False
diff --git a/configs/vqa/ser/layoutlmv2.yml b/configs/vqa/ser/layoutlmv2.yml
index d6a9c03e5ec9683a3a6423ed22a98f361769541f..e48c7469567a740ca74240f0ca9f782ed5bb3c6d 100644
--- a/configs/vqa/ser/layoutlmv2.yml
+++ b/configs/vqa/ser/layoutlmv2.yml
@@ -11,7 +11,7 @@ Global:
   save_inference_dir:
   use_visualdl: False
   seed: 2022
-  infer_img: doc/vqa/input/zh_val_0.jpg
+  infer_img: ppstructure/docs/vqa/input/zh_val_42.jpg
   save_res_path: ./output/ser/
 
 Architecture:
@@ -78,7 +78,7 @@ Train:
           order: 'hwc'
       - ToCHWImage:
       - KeepKeys:
-          keep_keys: [ 'input_ids','labels', 'bbox', 'image', 'attention_mask', 'token_type_ids'] # dataloader will return list in this order
+          keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'labels'] # dataloader will return list in this order
   loader:
     shuffle: True
     drop_last: False
@@ -113,7 +113,7 @@ Eval:
           order: 'hwc'
       - ToCHWImage:
       - KeepKeys:
-          keep_keys: [ 'input_ids', 'labels', 'bbox', 'image', 'attention_mask', 'token_type_ids'] # dataloader will return list in this order
+          keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'labels'] # dataloader will return list in this order
   loader:
     shuffle: False
     drop_last: False
diff --git a/configs/vqa/ser/layoutxlm.yml b/configs/vqa/ser/layoutxlm.yml
index 3686989ccf7481a28584fd49c3969c1a69cd04d4..fa9df192afbc1d638c220cba3ef3640715585b37 100644
--- a/configs/vqa/ser/layoutxlm.yml
+++ b/configs/vqa/ser/layoutxlm.yml
@@ -43,7 +43,7 @@ Optimizer:
 
 PostProcess:
   name: VQASerTokenLayoutLMPostProcess
-  class_path: &class_path ppstructure/vqa/labels/labels_ser.txt
+  class_path: &class_path train_data/XFUND/class_list_xfun.txt
 
 Metric:
   name: VQASerTokenMetric
@@ -78,7 +78,7 @@ Train:
           order: 'hwc'
       - ToCHWImage:
       - KeepKeys:
-          keep_keys: [ 'input_ids','labels', 'bbox', 'image', 'attention_mask', 'token_type_ids'] # dataloader will return list in this order
+          keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'labels'] # dataloader will return list in this order
   loader:
     shuffle: True
     drop_last: False
@@ -113,7 +113,7 @@ Eval:
           order: 'hwc'
       - ToCHWImage:
       - KeepKeys:
-          keep_keys: [ 'input_ids', 'labels', 'bbox', 'image', 'attention_mask', 'token_type_ids'] # dataloader will return list in this order
+          keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'labels'] # dataloader will return list in this order
   loader:
     shuffle: False
     drop_last: False
diff --git a/ppocr/data/imaug/label_ops.py b/ppocr/data/imaug/label_ops.py
index c95b326224515c43b03f90ee51c809006399dfff..0723e97ae719690ef2e6a500b327b039c7a46f66 100644
--- a/ppocr/data/imaug/label_ops.py
+++ b/ppocr/data/imaug/label_ops.py
@@ -912,7 +912,7 @@ class VQATokenLabelEncode(object):
             label = info['label']
             gt_label = self._parse_label(label, encode_res)
 
-# construct entities for re
+            # construct entities for re
             if train_re:
                 if gt_label[0] != self.label2id_map["O"]:
                     entity_id_to_index_map[info["id"]] = len(entities)
diff --git a/ppocr/losses/vqa_token_layoutlm_loss.py b/ppocr/losses/vqa_token_layoutlm_loss.py
index 244893d97d0e422c5ca270bdece689e13aba2b07..f9cd4634731a26dd990d6ffac3d8defc8cdf7e97 100755
--- a/ppocr/losses/vqa_token_layoutlm_loss.py
+++ b/ppocr/losses/vqa_token_layoutlm_loss.py
@@ -27,8 +27,8 @@ class VQASerTokenLayoutLMLoss(nn.Layer):
         self.ignore_index = self.loss_class.ignore_index
 
     def forward(self, predicts, batch):
-        labels = batch[1]
-        attention_mask = batch[4]
+        labels = batch[5]
+        attention_mask = batch[2]
         if attention_mask is not None:
             active_loss = attention_mask.reshape([-1, ]) == 1
             active_outputs = predicts.reshape(
diff --git a/ppocr/modeling/backbones/vqa_layoutlm.py b/ppocr/modeling/backbones/vqa_layoutlm.py
index ede5b7a35af65fac351277cefccd89b251f5cdb7..1542519651a5199ab1d08cc950919cb84baebc0d 100644
--- a/ppocr/modeling/backbones/vqa_layoutlm.py
+++ b/ppocr/modeling/backbones/vqa_layoutlm.py
@@ -74,9 +74,9 @@ class LayoutLMForSer(NLPBaseModel):
     def forward(self, x):
         x = self.model(
             input_ids=x[0],
-            bbox=x[2],
-            attention_mask=x[4],
-            token_type_ids=x[5],
+            bbox=x[1],
+            attention_mask=x[2],
+            token_type_ids=x[3],
             position_ids=None,
             output_hidden_states=False)
         return x
@@ -96,13 +96,15 @@ class LayoutLMv2ForSer(NLPBaseModel):
     def forward(self, x):
         x = self.model(
             input_ids=x[0],
-            bbox=x[2],
-            image=x[3],
-            attention_mask=x[4],
-            token_type_ids=x[5],
+            bbox=x[1],
+            attention_mask=x[2],
+            token_type_ids=x[3],
+            image=x[4],
             position_ids=None,
             head_mask=None,
             labels=None)
+        if not self.training:
+            return x
         return x[0]
 
 
@@ -119,14 +121,16 @@ class LayoutXLMForSer(NLPBaseModel):
 
     def forward(self, x):
         x = self.model(
-            input_ids=x[0],
-            bbox=x[2],
-            image=x[3],
-            attention_mask=x[4],
-            token_type_ids=x[5],
-            position_ids=None,
-            head_mask=None,
-            labels=None)
+            input_ids=x[0],
+            bbox=x[1],
+            attention_mask=x[2],
+            token_type_ids=x[3],
+            image=x[4],
+            position_ids=None,
+            head_mask=None,
+            labels=None)
+        if not self.training:
+            return x
         return x[0]
 
 
@@ -140,12 +144,12 @@ class LayoutLMv2ForRe(NLPBaseModel):
         x = self.model(
             input_ids=x[0],
             bbox=x[1],
-            labels=None,
-            image=x[2],
-            attention_mask=x[3],
-            token_type_ids=x[4],
+            attention_mask=x[2],
+            token_type_ids=x[3],
+            image=x[4],
             position_ids=None,
             head_mask=None,
+            labels=None,
             entities=x[5],
             relations=x[6])
         return x
@@ -161,12 +165,12 @@ class LayoutXLMForRe(NLPBaseModel):
         x = self.model(
             input_ids=x[0],
             bbox=x[1],
-            labels=None,
-            image=x[2],
-            attention_mask=x[3],
-            token_type_ids=x[4],
+            attention_mask=x[2],
+            token_type_ids=x[3],
+            image=x[4],
             position_ids=None,
             head_mask=None,
+            labels=None,
             entities=x[5],
             relations=x[6])
         return x
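
Note: the config, loss, and backbone changes above all follow from a single convention: the dataloader now emits fields in one canonical order, so the positional indices in `VQASerTokenLayoutLMLoss` (`batch[5]`, `batch[2]`) and in the backbone `forward` calls (`x[1]`..`x[4]`) line up with the new `keep_keys`. A minimal sketch of that mapping (pure Python, illustration only, not part of the patch):

```python
# New dataloader field order used by the SER configs; the RE configs replace the
# trailing 'labels' with 'entities' and 'relations'.
SER_KEYS = ["input_ids", "bbox", "attention_mask", "token_type_ids", "image", "labels"]

def as_dict(batch):
    """Name-indexed view of a positional SER batch, matching the new keep_keys order."""
    return dict(zip(SER_KEYS, batch))

# labels now live at index 5 and attention_mask at index 2, exactly what the loss assumes.
assert SER_KEYS.index("labels") == 5 and SER_KEYS.index("attention_mask") == 2
```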
diff --git a/ppocr/postprocess/vqa_token_ser_layoutlm_postprocess.py b/ppocr/postprocess/vqa_token_ser_layoutlm_postprocess.py
index 782cdea6c58c69e0d728787e0e21e200c9e13790..90bc52733df2d2b5020cf5756a96b84e903282e1 100644
--- a/ppocr/postprocess/vqa_token_ser_layoutlm_postprocess.py
+++ b/ppocr/postprocess/vqa_token_ser_layoutlm_postprocess.py
@@ -41,11 +41,13 @@ class VQASerTokenLayoutLMPostProcess(object):
             self.id2label_map_for_show[val] = key
 
     def __call__(self, preds, batch=None, *args, **kwargs):
+        if isinstance(preds, tuple):
+            preds = preds[0]
         if isinstance(preds, paddle.Tensor):
             preds = preds.numpy()
 
         if batch is not None:
-            return self._metric(preds, batch[1])
+            return self._metric(preds, batch[5])
         else:
             return self._infer(preds, **kwargs)
 
@@ -63,11 +65,10 @@ class VQASerTokenLayoutLMPostProcess(object):
                             j]])
         return decode_out_list, label_decode_out_list
 
-    def _infer(self, preds, attention_masks, segment_offset_ids, ocr_infos):
+    def _infer(self, preds, segment_offset_ids, ocr_infos):
         results = []
 
-        for pred, attention_mask, segment_offset_id, ocr_info in zip(
-                preds, attention_masks, segment_offset_ids, ocr_infos):
+        for pred, segment_offset_id, ocr_info in zip(preds, segment_offset_ids, ocr_infos):
             pred = np.argmax(pred, axis=1)
             pred = [self.id2label_map[idx] for idx in pred]
 
diff --git a/ppstructure/docs/models_list.md b/ppstructure/docs/models_list.md
index c7dab999ff6e370c56c5495e22e91f117b3d1275..71d0566bb0e70dc907416cf426ce88acac538e79 100644
--- a/ppstructure/docs/models_list.md
+++ b/ppstructure/docs/models_list.md
@@ -1,11 +1,12 @@
 # PP-Structure 系列模型列表
 
-- [1. 版面分析模型](#1)
-- [2. OCR和表格识别模型](#2)
-  - [2.1 OCR](#21)
-  - [2.2 表格识别模型](#22)
-- [3. VQA模型](#3)
-- [4. KIE模型](#4)
+- [PP-Structure 系列模型列表](#pp-structure-系列模型列表)
+  - [1. 版面分析模型](#1-版面分析模型)
+  - [2. OCR和表格识别模型](#2-ocr和表格识别模型)
+    - [2.1 OCR](#21-ocr)
+    - [2.2 表格识别模型](#22-表格识别模型)
+  - [3. VQA模型](#3-vqa模型)
+  - [4. KIE模型](#4-kie模型)
 
@@ -42,11 +43,11 @@
 
 |模型名称|模型简介|推理模型大小|下载地址|
 | --- | --- | --- | --- |
-|ser_LayoutXLM_xfun_zh|基于LayoutXLM在xfun中文数据集上训练的SER模型|1.4G|[推理模型 coming soon]() / [训练模型](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutXLM_xfun_zh.tar) |
-|re_LayoutXLM_xfun_zh|基于LayoutXLM在xfun中文数据集上训练的RE模型|1.4G|[推理模型 coming soon]() / [训练模型](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh.tar) |
-|ser_LayoutLMv2_xfun_zh|基于LayoutLMv2在xfun中文数据集上训练的SER模型|778M|[推理模型 coming soon]() / [训练模型](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLMv2_xfun_zh.tar) |
+|ser_LayoutXLM_xfun_zh|基于LayoutXLM在xfun中文数据集上训练的SER模型|1.4G|[推理模型](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh.tar) |
+|re_LayoutXLM_xfun_zh|基于LayoutXLM在xfun中文数据集上训练的RE模型|1.4G|[推理模型 coming soon]() / [训练模型](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutXLM_xfun_zh.tar) |
+|ser_LayoutLMv2_xfun_zh|基于LayoutLMv2在xfun中文数据集上训练的SER模型|778M|[推理模型](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLMv2_xfun_zh_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLMv2_xfun_zh.tar) |
 |re_LayoutLMv2_xfun_zh|基于LayoutLMv2在xfun中文数据集上训练的RE模型|765M|[推理模型 coming soon]() / [训练模型](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutLMv2_xfun_zh.tar) |
-|ser_LayoutLM_xfun_zh|基于LayoutLM在xfun中文数据集上训练的SER模型|430M|[推理模型 coming soon]() / [训练模型](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLM_xfun_zh.tar) |
+|ser_LayoutLM_xfun_zh|基于LayoutLM在xfun中文数据集上训练的SER模型|430M|[推理模型](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLM_xfun_zh_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLM_xfun_zh.tar) |
 
 
 ## 4. KIE模型
diff --git a/ppstructure/docs/models_list_en.md b/ppstructure/docs/models_list_en.md
index b92c10c241df72c85649b64f915b4266cd3fe410..b3a07555051ef08495019ef3c40d2ca4bfbad03c 100644
--- a/ppstructure/docs/models_list_en.md
+++ b/ppstructure/docs/models_list_en.md
@@ -1,11 +1,12 @@
 # PP-Structure Model list
 
-- [1. Layout Analysis](#1)
-- [2. OCR and Table Recognition](#2)
-  - [2.1 OCR](#21)
-  - [2.2 Table Recognition](#22)
-- [3. VQA](#3)
-- [4. KIE](#4)
+- [PP-Structure Model list](#pp-structure-model-list)
+  - [1. Layout Analysis](#1-layout-analysis)
+  - [2. OCR and Table Recognition](#2-ocr-and-table-recognition)
+    - [2.1 OCR](#21-ocr)
+    - [2.2 Table Recognition](#22-table-recognition)
+  - [3. VQA](#3-vqa)
+  - [4. KIE](#4-kie)
 
@@ -42,11 +43,11 @@ If you need to use other OCR models, you can download the model in [PP-OCR model
 
 |model| description |inference model size|download|
 | --- |----------------------------------------------------------------| --- | --- |
-|ser_LayoutXLM_xfun_zh| SER model trained on xfun Chinese dataset based on LayoutXLM |1.4G|[inference model coming soon]() / [trained model](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutXLM_xfun_zh.tar) |
-|re_LayoutXLM_xfun_zh| Re model trained on xfun Chinese dataset based on LayoutXLM |1.4G|[inference model coming soon]() / [trained model](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh.tar) |
-|ser_LayoutLMv2_xfun_zh| SER model trained on xfun Chinese dataset based on LayoutXLMv2 |778M|[inference model coming soon]() / [trained model](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLMv2_xfun_zh.tar) |
+|ser_LayoutXLM_xfun_zh| SER model trained on xfun Chinese dataset based on LayoutXLM |1.4G|[inference model](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh.tar) |
+|re_LayoutXLM_xfun_zh| Re model trained on xfun Chinese dataset based on LayoutXLM |1.4G|[inference model coming soon]() / [trained model](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutXLM_xfun_zh.tar) |
+|ser_LayoutLMv2_xfun_zh| SER model trained on xfun Chinese dataset based on LayoutXLMv2 |778M|[inference model](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLMv2_xfun_zh_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLMv2_xfun_zh.tar) |
 |re_LayoutLMv2_xfun_zh| Re model trained on xfun Chinese dataset based on LayoutXLMv2 |765M|[inference model coming soon]() / [trained model](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutLMv2_xfun_zh.tar) |
-|ser_LayoutLM_xfun_zh| SER model trained on xfun Chinese dataset based on LayoutLM |430M|[inference model coming soon]() / [trained model](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLM_xfun_zh.tar) |
+|ser_LayoutLM_xfun_zh| SER model trained on xfun Chinese dataset based on LayoutLM |430M|[inference model](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLM_xfun_zh_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLM_xfun_zh.tar) |
 
 ## 4. KIE
diff --git a/ppstructure/utility.py b/ppstructure/utility.py
index 1ad902e7e6be95a6901e3774420fad337f594861..7ad3426740a14719083c23bf525591c176036da8 100644
--- a/ppstructure/utility.py
+++ b/ppstructure/utility.py
@@ -40,6 +40,14 @@ def init_args():
         type=ast.literal_eval,
         default=None,
         help='label map according to ppstructure/layout/README_ch.md')
+
+    # params for vqa
+    parser.add_argument("--vqa_algorithm", type=str, default='LayoutXLM')
+    parser.add_argument("--ser_model_dir", type=str)
+    parser.add_argument(
+        "--ser_dict_path",
+        type=str,
+        default="../train_data/XFUND/class_list_xfun.txt")
     # params for inference
     parser.add_argument(
         "--mode",
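
Note: the new `--vqa_algorithm`, `--ser_model_dir` and `--ser_dict_path` flags land on the same argparse namespace the other ppstructure tools use, so they can also be filled in programmatically. A hedged sketch (assumes a PaddleOCR development checkout on `PYTHONPATH`; paths are placeholders):

```python
from ppstructure.utility import parse_args

args = parse_args()  # vqa_algorithm defaults to 'LayoutXLM'
args.ser_model_dir = "./output/ser/infer"                      # exported SER inference model
args.ser_dict_path = "./train_data/XFUND/class_list_xfun.txt"  # SER label list
print(args.vqa_algorithm, args.ser_model_dir, args.ser_dict_path)
```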
diff --git a/ppstructure/vqa/README.md b/ppstructure/vqa/README.md
index 711ffa313865cd5a210143819cd4604dc28ef4f4..cfb421005062de46a6d4ff4a093c657017847d46 100644
--- a/ppstructure/vqa/README.md
+++ b/ppstructure/vqa/README.md
@@ -1,19 +1,16 @@
 English | [简体中文](README_ch.md)
 
-- [Document Visual Question Answering (Doc-VQA)](#Document-Visual-Question-Answering)
-  - [1. Introduction](#1-Introduction)
+- [Document Visual Question Answering](#document-visual-question-answering)
+  - [1 Introduction](#1-introduction)
   - [2. Performance](#2-performance)
-  - [3. Effect demo](#3-Effect-demo)
+  - [3. Effect demo](#3-effect-demo)
     - [3.1 SER](#31-ser)
     - [3.2 RE](#32-re)
-  - [4. Install](#4-Install)
-    - [4.1 Installation dependencies](#41-Install-dependencies)
-    - [4.2 Install PaddleOCR](#42-Install-PaddleOCR)
-  - [5. Usage](#5-Usage)
-    - [5.1 Data and Model Preparation](#51-Data-and-Model-Preparation)
-    - [5.2 SER](#52-ser)
+  - [4. Install](#4-install)
+    - [4.1 Install dependencies](#41-install-dependencies)
     - [5.3 RE](#53-re)
-  - [6. Reference](#6-Reference-Links)
+  - [6. Reference Links](#6-reference-links)
+  - [License](#license)
 
 # Document Visual Question Answering
@@ -187,17 +184,17 @@ CUDA_VISIBLE_DEVICES=0 python3 tools/eval.py -c configs/vqa/ser/layoutxlm.yml -o
 ````
 Finally, `precision`, `recall`, `hmean` and other indicators will be printed
 
-* Use `OCR engine + SER` tandem prediction
+* `OCR + SER` tandem prediction based on the training engine
 
-Use the following command to complete the series prediction of `OCR engine + SER`, taking the pretrained SER model as an example:
+Use the following command to complete the tandem prediction of `OCR + SER` based on the training engine, taking the SER model based on LayoutXLM as an example:
 
 ```shell
-CUDA_VISIBLE_DEVICES=0 python3 tools/infer_vqa_token_ser.py -c configs/vqa/ser/layoutxlm.yml -o Architecture.Backbone.checkpoints=pretrain/ser_LayoutXLM_xfun_zh/Global.infer_img=doc/vqa/input/zh_val_42.jpg
+CUDA_VISIBLE_DEVICES=0 python3 tools/infer_vqa_token_ser.py -c configs/vqa/ser/layoutxlm.yml -o Architecture.Backbone.checkpoints=pretrain/ser_LayoutXLM_xfun_zh/ Global.infer_img=doc/vqa/input/zh_val_42.jpg
 ````
 Finally, the prediction result visualization image and the prediction result text file will be saved in the directory configured by the `config.Global.save_res_path` field. The prediction result text file is named `infer_results.txt`.
 
-* End-to-end evaluation of `OCR engine + SER` prediction system
+* End-to-end evaluation of `OCR + SER` prediction system
 
 First use the `tools/infer_vqa_token_ser.py` script to complete the prediction of the dataset, then use the following command to evaluate.
@@ -205,6 +202,24 @@ First use the `tools/infer_vqa_token_ser.py` script to complete the prediction o
 export CUDA_VISIBLE_DEVICES=0
 python3 tools/eval_with_label_end2end.py --gt_json_path XFUND/zh_val/xfun_normalize_val.json --pred_json_path output_res/infer_results.txt
 ````
+* Export model
+
+Use the following command to export the SER model, taking the SER model based on LayoutXLM as an example:
+
+```shell
+python3.7 tools/export_model.py -c configs/vqa/ser/layoutxlm.yml -o Architecture.Backbone.checkpoints=pretrain/ser_LayoutXLM_xfun_zh/ Global.save_inference_dir=output/ser/infer
+```
+The converted model will be stored in the directory specified by the `Global.save_inference_dir` field.
+
+* `OCR + SER` tandem prediction based on the prediction engine
+
+Use the following command to complete the tandem prediction of `OCR + SER` based on the prediction engine, taking the SER model based on LayoutXLM as an example:
+
+```shell
+cd ppstructure
+CUDA_VISIBLE_DEVICES=0 python3.7 vqa/predict_vqa_token_ser.py --vqa_algorithm=LayoutXLM --ser_model_dir=../output/ser/infer --ser_dict_path=../train_data/XFUND/class_list_xfun.txt --image_dir=docs/vqa/input/zh_val_42.jpg --output=output
+```
+After the prediction is successful, the visualization images and results will be saved in the directory specified by the `output` field.
 
 ### 5.3 RE
@@ -247,11 +262,19 @@ Finally, `precision`, `recall`, `hmean` and other indicators will be printed
 
 Use the following command to complete the series prediction of `OCR engine + SER + RE`, taking the pretrained SER and RE models as an example:
 ```shell
 export CUDA_VISIBLE_DEVICES=0
-python3 tools/infer_vqa_token_ser_re.py -c configs/vqa/re/layoutxlm.yml -o Architecture.Backbone.checkpoints=pretrain/re_LayoutXLM_xfun_zh/Global.infer_img=doc/vqa/input/zh_val_21.jpg -c_ser configs/vqa/ser/layoutxlm. yml -o_ser Architecture.Backbone.checkpoints=pretrain/ser_LayoutXLM_xfun_zh/
+python3 tools/infer_vqa_token_ser_re.py -c configs/vqa/re/layoutxlm.yml -o Architecture.Backbone.checkpoints=pretrain/re_LayoutXLM_xfun_zh/ Global.infer_img=ppstructure/docs/vqa/input/zh_val_21.jpg -c_ser configs/vqa/ser/layoutxlm.yml -o_ser Architecture.Backbone.checkpoints=pretrain/ser_LayoutXLM_xfun_zh/
 ````
 Finally, the prediction result visualization image and the prediction result text file will be saved in the directory configured by the `config.Global.save_res_path` field. The prediction result text file is named `infer_results.txt`.
 
+* Export model
+
+coming soon
+
+* `OCR + SER + RE` tandem prediction based on the prediction engine
+
+coming soon
+
 ## 6. Reference Links
 
 - LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding, https://arxiv.org/pdf/2104.08836.pdf
diff --git a/ppstructure/vqa/README_ch.md b/ppstructure/vqa/README_ch.md
index 297ba64f82e70eafd4a0b1fee0764899799219ad..52113b80b52b216dad4d7654863944b639eacbfc 100644
--- a/ppstructure/vqa/README_ch.md
+++ b/ppstructure/vqa/README_ch.md
@@ -14,6 +14,7 @@
     - [5.2 SER](#52-ser)
     - [5.3 RE](#53-re)
   - [6. 参考链接](#6-参考链接)
+  - [License](#license)
 
 # 文档视觉问答(DOC-VQA)
@@ -183,16 +184,16 @@ CUDA_VISIBLE_DEVICES=0 python3 tools/eval.py -c configs/vqa/ser/layoutxlm.yml -o
 ```
 最终会打印出`precision`, `recall`, `hmean`等指标
 
-* 使用`OCR引擎 + SER`串联预测
+* 基于训练引擎的`OCR + SER`串联预测
 
-使用如下命令即可完成`OCR引擎 + SER`的串联预测, 以SER预训练模型为例:
+使用如下命令即可完成基于训练引擎的`OCR + SER`的串联预测, 以基于LayoutXLM的SER模型为例:
 
 ```shell
 CUDA_VISIBLE_DEVICES=0 python3 tools/infer_vqa_token_ser.py -c configs/vqa/ser/layoutxlm.yml -o Architecture.Backbone.checkpoints=pretrain/ser_LayoutXLM_xfun_zh/ Global.infer_img=doc/vqa/input/zh_val_42.jpg
 ```
 最终会在`config.Global.save_res_path`字段所配置的目录下保存预测结果可视化图像以及预测结果文本文件,预测结果文本文件名为`infer_results.txt`。
 
-* 对`OCR引擎 + SER`预测系统进行端到端评估
+* 对`OCR + SER`预测系统进行端到端评估
 
 首先使用 `tools/infer_vqa_token_ser.py` 脚本完成数据集的预测,然后使用下面的命令进行评估。
@@ -200,6 +201,24 @@ CUDA_VISIBLE_DEVICES=0 python3 tools/infer_vqa_token_ser.py -c configs/vqa/ser/l
 export CUDA_VISIBLE_DEVICES=0
 python3 tools/eval_with_label_end2end.py --gt_json_path XFUND/zh_val/xfun_normalize_val.json --pred_json_path output_res/infer_results.txt
 ```
+* 模型导出
+
+使用如下命令即可完成SER模型的模型导出, 以基于LayoutXLM的SER模型为例:
+
+```shell
+python3.7 tools/export_model.py -c configs/vqa/ser/layoutxlm.yml -o Architecture.Backbone.checkpoints=pretrain/ser_LayoutXLM_xfun_zh/ Global.save_inference_dir=output/ser/infer
+```
+转换后的模型会存放在`Global.save_inference_dir`字段指定的目录下。
+
+* 基于预测引擎的`OCR + SER`串联预测
+
+使用如下命令即可完成基于预测引擎的`OCR + SER`的串联预测, 以基于LayoutXLM的SER模型为例:
+
+```shell
+cd ppstructure
+CUDA_VISIBLE_DEVICES=0 python3.7 vqa/predict_vqa_token_ser.py --vqa_algorithm=LayoutXLM --ser_model_dir=../output/ser/infer --ser_dict_path=../train_data/XFUND/class_list_xfun.txt --image_dir=docs/vqa/input/zh_val_42.jpg --output=output
+```
+预测成功后,可视化图片和结果会保存在`output`字段指定的目录下。
 
 ### 5.3 RE
@@ -236,16 +255,24 @@ CUDA_VISIBLE_DEVICES=0 python3 tools/eval.py -c configs/vqa/re/layoutxlm.yml -o
 ```
 最终会打印出`precision`, `recall`, `hmean`等指标
 
-* 使用`OCR引擎 + SER + RE`串联预测
+* 基于训练引擎的`OCR + SER + RE`串联预测
 
-使用如下命令即可完成`OCR引擎 + SER + RE`的串联预测, 以预训练SER和RE模型为例:
+使用如下命令即可完成基于训练引擎的`OCR + SER + RE`串联预测, 以基于LayoutXLM的SER和RE模型为例:
 
 ```shell
 export CUDA_VISIBLE_DEVICES=0
-python3 tools/infer_vqa_token_ser_re.py -c configs/vqa/re/layoutxlm.yml -o Architecture.Backbone.checkpoints=pretrain/re_LayoutXLM_xfun_zh/ Global.infer_img=doc/vqa/input/zh_val_21.jpg -c_ser configs/vqa/ser/layoutxlm.yml -o_ser Architecture.Backbone.checkpoints=pretrain/ser_LayoutXLM_xfun_zh/
+python3 tools/infer_vqa_token_ser_re.py -c configs/vqa/re/layoutxlm.yml -o Architecture.Backbone.checkpoints=pretrain/re_LayoutXLM_xfun_zh/ Global.infer_img=ppstructure/docs/vqa/input/zh_val_21.jpg -c_ser configs/vqa/ser/layoutxlm.yml -o_ser Architecture.Backbone.checkpoints=pretrain/ser_LayoutXLM_xfun_zh/
 ```
 最终会在`config.Global.save_res_path`字段所配置的目录下保存预测结果可视化图像以及预测结果文本文件,预测结果文本文件名为`infer_results.txt`。
 
+* 模型导出
+
+coming soon
+
+* 基于预测引擎的`OCR + SER + RE`串联预测
+
+coming soon
+
 ## 6. 参考链接
 
 - LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding, https://arxiv.org/pdf/2104.08836.pdf
diff --git a/ppstructure/vqa/predict_vqa_token_ser.py b/ppstructure/vqa/predict_vqa_token_ser.py
new file mode 100644
index 0000000000000000000000000000000000000000..f55c87571332e9309583ce41b5c222f5f06c38bd
--- /dev/null
+++ b/ppstructure/vqa/predict_vqa_token_ser.py
@@ -0,0 +1,161 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import sys
+
+__dir__ = os.path.dirname(os.path.abspath(__file__))
+sys.path.append(__dir__)
+sys.path.append(os.path.abspath(os.path.join(__dir__, '../..')))
+
+os.environ["FLAGS_allocator_strategy"] = 'auto_growth'
+
+import cv2
+import json
+import numpy as np
+import time
+
+import tools.infer.utility as utility
+from ppocr.data import create_operators, transform
+from ppocr.postprocess import build_post_process
+from ppocr.utils.logging import get_logger
+from ppocr.utils.visual import draw_ser_results
+from ppocr.utils.utility import get_image_file_list, check_and_read_gif
+from ppstructure.utility import parse_args
+
+from paddleocr import PaddleOCR
+
+logger = get_logger()
+
+
+class SerPredictor(object):
+    def __init__(self, args):
+        self.ocr_engine = PaddleOCR(use_angle_cls=False, show_log=False)
+
+        pre_process_list = [{
+            'VQATokenLabelEncode': {
+                'algorithm': args.vqa_algorithm,
+                'class_path': args.ser_dict_path,
+                'contains_re': False,
+                'ocr_engine': self.ocr_engine
+            }
+        }, {
+            'VQATokenPad': {
+                'max_seq_len': 512,
+                'return_attention_mask': True
+            }
+        }, {
+            'VQASerTokenChunk': {
+                'max_seq_len': 512,
+                'return_attention_mask': True
+            }
+        }, {
+            'Resize': {
+                'size': [224, 224]
+            }
+        }, {
+            'NormalizeImage': {
+                'std': [58.395, 57.12, 57.375],
+                'mean': [123.675, 116.28, 103.53],
+                'scale': '1',
+                'order': 'hwc'
+            }
+        }, {
+            'ToCHWImage': None
+        }, {
+            'KeepKeys': {
+                'keep_keys': [
+                    'input_ids', 'bbox', 'attention_mask', 'token_type_ids',
+                    'image', 'labels', 'segment_offset_id', 'ocr_info',
+                    'entities'
+                ]
+            }
+        }]
+        postprocess_params = {
+            'name': 'VQASerTokenLayoutLMPostProcess',
+            "class_path": args.ser_dict_path,
+        }
+
+        self.preprocess_op = create_operators(pre_process_list,
+                                              {'infer_mode': True})
+        self.postprocess_op = build_post_process(postprocess_params)
+        self.predictor, self.input_tensor, self.output_tensors, self.config = \
+            utility.create_predictor(args, 'ser', logger)
+
+    def __call__(self, img):
+        ori_im = img.copy()
+        data = {'image': img}
+        data = transform(data, self.preprocess_op)
+        img = data[0]
+        if img is None:
+            return None, 0
+        img = np.expand_dims(img, axis=0)
+        img = img.copy()
+        starttime = time.time()
+
+        # feed each exported input (input_ids, bbox, attention_mask, token_type_ids, image)
+        for idx in range(len(self.input_tensor)):
+            expand_input = np.expand_dims(data[idx], axis=0)
+            self.input_tensor[idx].copy_from_cpu(expand_input)
+
+        self.predictor.run()
+
+        outputs = []
+        for output_tensor in self.output_tensors:
+            output = output_tensor.copy_to_cpu()
+            outputs.append(output)
+        preds = outputs[0]
+
+        post_result = self.postprocess_op(
+            preds, segment_offset_ids=[data[6]], ocr_infos=[data[7]])
+        elapse = time.time() - starttime
+        return post_result, elapse
+
+
+def main(args):
+    image_file_list = get_image_file_list(args.image_dir)
+    ser_predictor = SerPredictor(args)
+    count = 0
+    total_time = 0
+
+    os.makedirs(args.output, exist_ok=True)
+    with open(
+            os.path.join(args.output, 'infer.txt'), mode='w',
+            encoding='utf-8') as f_w:
+        for image_file in image_file_list:
+            img, flag = check_and_read_gif(image_file)
+            if not flag:
+                img = cv2.imread(image_file)
+            if img is None:
+                logger.info("error in loading image:{}".format(image_file))
+                continue
+            if not flag:
+                img = img[:, :, ::-1]  # convert BGR (cv2) to RGB before preprocessing
+            ser_res, elapse = ser_predictor(img)
+            ser_res = ser_res[0]
+
+            res_str = '{}\t{}\n'.format(
+                image_file,
+                json.dumps(
+                    {
+                        "ocr_info": ser_res,
+                    }, ensure_ascii=False))
+            f_w.write(res_str)
+
+            img_res = draw_ser_results(
+                image_file,
+                ser_res,
+                font_path="../doc/fonts/simfang.ttf")
+
+            img_save_path = os.path.join(args.output,
+                                         os.path.basename(image_file))
+            cv2.imwrite(img_save_path, img_res)
+            logger.info("save vis result to {}".format(img_save_path))
+            if count > 0:
+                total_time += elapse
+            count += 1
+            logger.info("Predict time of {}: {}".format(image_file, elapse))
+
+
+if __name__ == "__main__":
+    main(parse_args())
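
Note: besides the CLI shown in the README, the new `SerPredictor` can be driven directly from Python. A hedged sketch (assumes the repository root is importable and the flags are supplied on the command line as in the README; the image path is a placeholder):

```python
import cv2
from ppstructure.utility import parse_args
from ppstructure.vqa.predict_vqa_token_ser import SerPredictor

args = parse_args()  # pass --ser_model_dir / --ser_dict_path / --image_dir as in the README
ser = SerPredictor(args)

img = cv2.imread("docs/vqa/input/zh_val_42.jpg")[:, :, ::-1]  # BGR -> RGB, as main() does
results, elapse = ser(img)
print(results[0], "elapsed: {:.3f}s".format(elapse))
```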
logger.info("error in loading image:{}".format(image_file)) + continue + ser_res, elapse = ser_predictor(img) + ser_res = ser_res[0] + + res_str = '{}\t{}\n'.format(image_file,json.dumps({"ocr_info": ser_res,}, ensure_ascii=False)) + f_w.write(res_str) + + img_res = draw_ser_results(image_file, ser_res, font_path="../doc/fonts/simfang.ttf",) + + img_save_path = os.path.join(args.output, + os.path.basename(image_file)) + cv2.imwrite(img_save_path, img_res) + logger.info("save vis result to {}".format(img_save_path)) + if count > 0: + total_time += elapse + count += 1 + logger.info("Predict time of {}: {}".format(image_file, elapse)) + +if __name__ == "__main__": + main(parse_args()) diff --git a/ppstructure/vqa/requirements.txt b/ppstructure/vqa/requirements.txt index 0042ec0baedcc3e7bbecb922d10b93c95219219d..fcd882274c4402ba2a1d34f20ee6e2befa157121 100644 --- a/ppstructure/vqa/requirements.txt +++ b/ppstructure/vqa/requirements.txt @@ -1,4 +1,7 @@ sentencepiece yacs seqeval -paddlenlp>=2.2.1 \ No newline at end of file +paddlenlp>=2.2.1 +pypandoc +attrdict +python_docx \ No newline at end of file diff --git a/tools/export_model.py b/tools/export_model.py index b10d41d5b288258ad895cefa7d8cc243eff10546..752732ff7eb2410f943212b3f8722c179df1e060 100755 --- a/tools/export_model.py +++ b/tools/export_model.py @@ -97,6 +97,22 @@ def export_single_model(model, shape=[None, 1, 32, 100], dtype="float32"), ] model = to_static(model, input_spec=other_shape) + elif arch_config["algorithm"] in ["LayoutLM", "LayoutLMv2", "LayoutXLM"]: + input_spec=[ + paddle.static.InputSpec( + shape=[None, 512], dtype="int64"), # input_ids + paddle.static.InputSpec( + shape=[None, 512, 4], dtype="int64"), # bbox + paddle.static.InputSpec( + shape=[None, 512], dtype="int64"), # attention_mask + paddle.static.InputSpec( + shape=[None, 512], dtype="int64"), # token_type_ids + paddle.static.InputSpec( + shape=[None, 3, 224, 224], dtype="int64"), # image + ] + if arch_config["algorithm"] == "LayoutLM": + input_spec.pop(4) + model = to_static(model, input_spec=[input_spec]) else: infer_shape = [3, -1, -1] if arch_config["model_type"] == "rec": @@ -172,7 +188,7 @@ def main(): config["Architecture"]["Head"]["out_channels"] = char_num model = build_model(config["Architecture"]) - load_model(config, model) + load_model(config, model, model_type=config['Architecture']["model_type"]) model.eval() save_path = config["Global"]["save_inference_dir"] diff --git a/tools/infer/utility.py b/tools/infer/utility.py index 366212f228eec33f11c825bfaf1e360258af9b2e..aa5d2371b6ea62c3150b7f336dfd01ae721d444b 100644 --- a/tools/infer/utility.py +++ b/tools/infer/utility.py @@ -153,6 +153,8 @@ def create_predictor(args, mode, logger): model_dir = args.rec_model_dir elif mode == 'table': model_dir = args.table_model_dir + elif mode == 'ser': + model_dir = args.ser_model_dir else: model_dir = args.e2e_model_dir @@ -316,8 +318,13 @@ def create_predictor(args, mode, logger): # create predictor predictor = inference.create_predictor(config) input_names = predictor.get_input_names() - for name in input_names: - input_tensor = predictor.get_input_handle(name) + if mode in ['ser','re']: + input_tensor = [] + for name in input_names: + input_tensor.append(predictor.get_input_handle(name)) + else: + for name in input_names: + input_tensor = predictor.get_input_handle(name) output_tensors = get_output_tensors(args, mode, predictor) return predictor, input_tensor, output_tensors, config diff --git a/tools/infer_vqa_token_ser.py b/tools/infer_vqa_token_ser.py 
diff --git a/tools/infer_vqa_token_ser.py b/tools/infer_vqa_token_ser.py
index 39ada64a99847a910158b74672c89398ba08f032..1e661ec2c72fa2c076310d6fc4d9d8f4c6a346b4 100755
--- a/tools/infer_vqa_token_ser.py
+++ b/tools/infer_vqa_token_ser.py
@@ -44,6 +44,7 @@ def to_tensor(data):
     from collections import defaultdict
     data_dict = defaultdict(list)
     to_tensor_idxs = []
+
     for idx, v in enumerate(data):
         if isinstance(v, (np.ndarray, paddle.Tensor, numbers.Number)):
             if idx not in to_tensor_idxs:
@@ -57,6 +58,7 @@ class SerPredictor(object):
     def __init__(self, config):
         global_config = config['Global']
+        self.algorithm = config['Architecture']["algorithm"]
 
         # build post process
         self.post_process_class = build_post_process(config['PostProcess'],
@@ -70,7 +72,7 @@ class SerPredictor(object):
 
         from paddleocr import PaddleOCR
 
-        self.ocr_engine = PaddleOCR(use_angle_cls=False, show_log=False)
+        self.ocr_engine = PaddleOCR(use_angle_cls=False, show_log=False, use_gpu=global_config['use_gpu'])
 
         # create data ops
         transforms = []
@@ -80,8 +82,8 @@ class SerPredictor(object):
                 op[op_name]['ocr_engine'] = self.ocr_engine
             elif op_name == 'KeepKeys':
                 op[op_name]['keep_keys'] = [
-                    'input_ids', 'labels', 'bbox', 'image', 'attention_mask',
-                    'token_type_ids', 'segment_offset_id', 'ocr_info',
+                    'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'labels',
+                    'segment_offset_id', 'ocr_info',
                     'entities'
                 ]
 
@@ -99,9 +101,11 @@ class SerPredictor(object):
         batch = transform(data, self.ops)
         batch = to_tensor(batch)
         preds = self.model(batch)
+        if self.algorithm in ['LayoutLMv2', 'LayoutXLM']:
+            preds = preds[0]
+
         post_result = self.post_process_class(
             preds,
-            attention_masks=batch[4],
             segment_offset_ids=batch[6],
             ocr_infos=batch[7])
         return post_result, batch
@@ -138,8 +142,6 @@ if __name__ == '__main__':
         save_img_path = os.path.join(
             config['Global']['save_res_path'],
             os.path.splitext(os.path.basename(img_path))[0] + "_ser.jpg")
-        logger.info("process: [{}/{}], save result to {}".format(
-            idx, len(infer_imgs), save_img_path))
 
         result, _ = ser_engine(data)
         result = result[0]
@@ -149,3 +151,7 @@ if __name__ == '__main__':
                 }, ensure_ascii=False) + "\n")
             img_res = draw_ser_results(img_path, result)
             cv2.imwrite(save_img_path, img_res)
+
+            logger.info("process: [{}/{}], save result to {}".format(
+                idx, len(infer_imgs), save_img_path))
+
diff --git a/tools/infer_vqa_token_ser_re.py b/tools/infer_vqa_token_ser_re.py
index 6210f7f3c24227c9d366b08ce93ccfe4df849ce1..d5ae634ceabd089f88a7f4d8e109029267010374 100755
--- a/tools/infer_vqa_token_ser_re.py
+++ b/tools/infer_vqa_token_ser_re.py
@@ -38,7 +38,7 @@ from ppocr.utils.save_load import load_model
 from ppocr.utils.visual import draw_re_results
 from ppocr.utils.logging import get_logger
 from ppocr.utils.utility import get_image_file_list, load_vqa_bio_label_maps, print_dict
-from tools.program import ArgsParser, load_config, merge_config, check_gpu
+from tools.program import ArgsParser, load_config, merge_config
 from tools.infer_vqa_token_ser import SerPredictor
 
 
@@ -107,7 +107,7 @@ def make_input(ser_inputs, ser_results):
     # remove ocr_info segment_offset_id and label in ser input
     ser_inputs.pop(7)
     ser_inputs.pop(6)
-    ser_inputs.pop(1)
+    ser_inputs.pop(5)
 
     return ser_inputs, entity_idx_dict_batch
 
@@ -131,9 +131,7 @@ class SerRePredictor(object):
         self.model.eval()
 
     def __call__(self, img_path):
-        ser_results, ser_inputs = self.ser_engine(img_path)
-        paddle.save(ser_inputs, 'ser_inputs.npy')
-        paddle.save(ser_results, 'ser_results.npy')
+        ser_results, ser_inputs = self.ser_engine({'img_path': img_path})
         re_input, entity_idx_dict_batch = make_input(ser_inputs, ser_results)
         preds = self.model(re_input)
         post_result = self.post_process_class(
@@ -155,7 +153,6 @@ def preprocess():
 
     # check if set use_gpu=True in paddlepaddle cpu version
     use_gpu = config['Global']['use_gpu']
-    check_gpu(use_gpu)
 
     device = 'gpu:{}'.format(dist.ParallelEnv().dev_id) if use_gpu else 'cpu'
     device = paddle.set_device(device)
@@ -185,9 +182,7 @@ if __name__ == '__main__':
     for idx, img_path in enumerate(infer_imgs):
         save_img_path = os.path.join(
             config['Global']['save_res_path'],
-            os.path.splitext(os.path.basename(img_path))[0] + "_ser.jpg")
-        logger.info("process: [{}/{}], save result to {}".format(
-            idx, len(infer_imgs), save_img_path))
+            os.path.splitext(os.path.basename(img_path))[0] + "_ser_re.jpg")
 
         result = ser_re_engine(img_path)
         result = result[0]
@@ -197,3 +192,6 @@ if __name__ == '__main__':
                 }, ensure_ascii=False) + "\n")
             img_res = draw_re_results(img_path, result)
             cv2.imwrite(save_img_path, img_res)
+
+        logger.info("process: [{}/{}], save result to {}".format(
+            idx, len(infer_imgs), save_img_path))
\ No newline at end of file
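
Note: the `ser_inputs.pop(1)` to `ser_inputs.pop(5)` change in `make_input` is the same reordering story: with the new SER `keep_keys`, index 5 holds `labels`, which is what must be dropped before the remaining inputs are handed to the RE model. A quick pure-Python check of the index arithmetic:

```python
# SER inputs as produced by tools/infer_vqa_token_ser.py under the new key order.
ser_keys = ["input_ids", "bbox", "attention_mask", "token_type_ids", "image",
            "labels", "segment_offset_id", "ocr_info", "entities"]
for idx in (7, 6, 5):  # ocr_info, segment_offset_id, labels
    ser_keys.pop(idx)
print(ser_keys)  # ['input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'entities']
```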