diff --git a/configs/table/SLANet_ch.yml b/configs/table/SLANet_ch.yml new file mode 100644 index 0000000000000000000000000000000000000000..997ff0a77b5ea824957abc1d32a7ba7f70abc12c --- /dev/null +++ b/configs/table/SLANet_ch.yml @@ -0,0 +1,141 @@ +Global: + use_gpu: True + epoch_num: 400 + log_smooth_window: 20 + print_batch_step: 20 + save_model_dir: ./output/SLANet_ch + save_epoch_step: 400 + # evaluation is run every 331 iterations after the 0th iteration + eval_batch_step: [0, 331] + cal_metric_during_train: True + pretrained_model: + checkpoints: + save_inference_dir: ./output/SLANet_ch/infer + use_visualdl: False + infer_img: doc/table/table.jpg + # for data or label process + character_dict_path: ppocr/utils/dict/table_structure_dict_ch.txt + character_type: en + max_text_length: &max_text_length 500 + box_format: &box_format xyxyxyxy # 'xywh', 'xyxy', 'xyxyxyxy' + infer_mode: False + use_sync_bn: True + save_res_path: output/infer + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + clip_norm: 5.0 + lr: + learning_rate: 0.001 + regularizer: + name: 'L2' + factor: 0.00000 + +Architecture: + model_type: table + algorithm: SLANet + Backbone: + name: PPLCNet + scale: 1.0 + pretrained: True + use_ssld: True + Neck: + name: CSPPAN + out_channels: 96 + Head: + name: SLAHead + hidden_size: 256 + max_text_length: *max_text_length + loc_reg_num: &loc_reg_num 8 + +Loss: + name: SLALoss + structure_weight: 1.0 + loc_weight: 2.0 + loc_loss: smooth_l1 + +PostProcess: + name: TableLabelDecode + merge_no_span_structure: &merge_no_span_structure True + +Metric: + name: TableMetric + main_indicator: acc + compute_bbox_metric: False + loc_reg_num: *loc_reg_num + box_format: *box_format + del_thead_tbody: True + +Train: + dataset: + name: PubTabDataSet + data_dir: train_data/table/train/ + label_file_list: [train_data/table/train.txt] + transforms: + - DecodeImage: + img_mode: BGR + channel_first: False + - TableLabelEncode: + learn_empty_box: False + 
merge_no_span_structure: *merge_no_span_structure + replace_empty_cell_token: False + loc_reg_num: *loc_reg_num + max_text_length: *max_text_length + - TableBoxEncode: + in_box_format: *box_format + out_box_format: *box_format + - ResizeTableImage: + max_len: 488 + - NormalizeImage: + scale: 1./255. + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + - PaddingTableImage: + size: [488, 488] + - ToCHWImage: + - KeepKeys: + keep_keys: [ 'image', 'structure', 'bboxes', 'bbox_masks', 'shape' ] + loader: + shuffle: True + batch_size_per_card: 48 + drop_last: True + num_workers: 1 + +Eval: + dataset: + name: PubTabDataSet + data_dir: train_data/table/val/ + label_file_list: [train_data/table/val.txt] + transforms: + - DecodeImage: + img_mode: BGR + channel_first: False + - TableLabelEncode: + learn_empty_box: False + merge_no_span_structure: *merge_no_span_structure + replace_empty_cell_token: False + loc_reg_num: *loc_reg_num + max_text_length: *max_text_length + - TableBoxEncode: + in_box_format: *box_format + out_box_format: *box_format + - ResizeTableImage: + max_len: 488 + - NormalizeImage: + scale: 1./255. 
+ mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + - PaddingTableImage: + size: [488, 488] + - ToCHWImage: + - KeepKeys: + keep_keys: [ 'image', 'structure', 'bboxes', 'bbox_masks', 'shape' ] + loader: + shuffle: False + drop_last: False + batch_size_per_card: 48 + num_workers: 1 diff --git a/paddleocr.py b/paddleocr.py index d78046802eb8b8af42ae2718697a5cfc1e7186de..f6fb095af34a58cc91b9fd0f22b2e95bf833e010 100644 --- a/paddleocr.py +++ b/paddleocr.py @@ -636,4 +636,6 @@ def main(): for item in result: item.pop('img') + item.pop('res') logger.info(item) + logger.info('result save to {}'.format(args.output)) diff --git a/ppocr/utils/dict/kie_dict/xfund_class_list.txt b/ppocr/utils/dict/kie_dict/xfund_class_list.txt new file mode 100644 index 0000000000000000000000000000000000000000..faded9f9b8f56bd258909bec9b8f1755aa688367 --- /dev/null +++ b/ppocr/utils/dict/kie_dict/xfund_class_list.txt @@ -0,0 +1,4 @@ +OTHER +QUESTION +ANSWER +HEADER diff --git a/ppocr/utils/save_load.py b/ppocr/utils/save_load.py index 0c652c8fdc88bd066d7202bb57c046aefbc20cc4..f86125521d19342f63a9fcb3bdcaed02cc4c6463 100644 --- a/ppocr/utils/save_load.py +++ b/ppocr/utils/save_load.py @@ -194,6 +194,9 @@ def save_model(model, _mkdir_if_not_exist(model_path, logger) model_prefix = os.path.join(model_path, prefix) paddle.save(optimizer.state_dict(), model_prefix + '.pdopt') + + is_nlp_model = config['Architecture']["model_type"] == 'kie' and config[ + "Architecture"]["algorithm"] not in ["SDMGR"] if is_nlp_model is not True: paddle.save(model.state_dict(), model_prefix + '.pdparams') metric_prefix = model_prefix diff --git a/ppstructure/README.md b/ppstructure/README.md index cff057e81909e620eaa86ffe464433cc3a5d6f21..66df10b2ec4d52fb743c40893d5fc5aa7d6ab5be 100644 --- a/ppstructure/README.md +++ b/ppstructure/README.md @@ -106,9 +106,9 @@ PP-Structure Series Model List (Updating) |model name|description|model size|download| | --- | --- | --- | --- | 
-|ch_PP-OCRv3_det_slim|[New] slim quantization with distillation lightweight model, supporting Chinese, English, multilingual text detection| 1.1M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_distill_train.tar)| -|ch_PP-OCRv3_rec_slim |[New] Slim qunatization with distillation lightweight model, supporting Chinese, English text recognition| 4.9M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_train.tar) | -|ch_ppstructure_mobile_v2.0_SLANet|Chinese table recognition model trained on PubTabNet dataset based on SLANet|9.3M|[inference model](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_train.tar) | +|ch_PP-OCRv3_det| [New] Lightweight model, supporting Chinese, English, multilingual text detection | 3.8M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar)| +|ch_PP-OCRv3_rec| [New] Lightweight model, supporting Chinese, English, multilingual text recognition | 12.4M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_train.tar) | +|ch_ppstructure_mobile_v2.0_SLANet|Chinese table recognition model based on SLANet|9.3M|[inference model](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_train.tar) | ### 
7.3 KIE model diff --git a/ppstructure/README_ch.md b/ppstructure/README_ch.md index efd25eb2cbda585c3fc2e192cd8184ccc7e10c0d..597cceafdf4fa94433da31a87b5cf4fa663c30fb 100644 --- a/ppstructure/README_ch.md +++ b/ppstructure/README_ch.md @@ -120,9 +120,9 @@ PP-Structure系列模型列表(更新中) |模型名称|模型简介|模型大小|下载地址| | --- | --- | --- | --- | -|ch_PP-OCRv3_det_slim|【最新】slim量化+蒸馏版超轻量模型,支持中英文、多语种文本检测| 1.1M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_distill_train.tar)| -|ch_PP-OCRv3_rec_slim |【最新】slim量化版超轻量模型,支持中英文、数字识别| 4.9M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_train.tar) | -|ch_ppstructure_mobile_v2.0_SLANet|基于SLANet在PubTabNet数据集上训练的中文表格识别模型|9.3M|[推理模型](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_train.tar) | +|ch_PP-OCRv3_det| 【最新】超轻量模型,支持中英文、多语种文本检测 | 3.8M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar)| +|ch_PP-OCRv3_rec|【最新】超轻量模型,支持中英文、数字识别|12.4M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_train.tar) | +|ch_ppstructure_mobile_v2.0_SLANet|基于SLANet的中文表格识别模型|9.3M|[推理模型](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_train.tar) | diff --git a/ppstructure/docs/inference.md b/ppstructure/docs/inference.md index 
b050900760067402b2b738ed8d0e94d6788aca4f..cf11960c1ccde00f102db1a33f2b0d0e5dc9c985 100644 --- a/ppstructure/docs/inference.md +++ b/ppstructure/docs/inference.md @@ -4,7 +4,7 @@ - [1.1 版面分析+表格识别](#1.1) - [1.2 版面分析](#1.2) - [1.3 表格识别](#1.3) -- [2. DocVQA](#2) +- [2. 关键信息抽取](#2) ## 1. Structure @@ -16,23 +16,26 @@ cd ppstructure 下载模型 ```bash mkdir inference && cd inference -# 下载PP-OCRv2文本检测模型并解压 -wget https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_slim_quant_infer.tar && tar xf ch_PP-OCRv2_det_slim_quant_infer.tar -# 下载PP-OCRv2文本识别模型并解压 -wget https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_infer.tar && tar xf ch_PP-OCRv2_rec_slim_quant_infer.tar -# 下载超轻量级英文表格预测模型并解压 -wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar && tar xf en_ppocr_mobile_v2.0_table_structure_infer.tar +# 下载PP-Structurev2版面分析模型并解压 +wget https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_layout_infer.tar && tar xf picodet_lcnet_x1_0_layout_infer.tar +# 下载PP-OCRv3文本检测模型并解压 +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar && tar xf ch_PP-OCRv3_det_infer.tar +# 下载PP-OCRv3文本识别模型并解压 +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar && tar xf ch_PP-OCRv3_rec_infer.tar +# 下载PP-Structurev2表格识别模型并解压 +wget https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_infer.tar && tar xf ch_ppstructure_mobile_v2.0_SLANet_infer.tar cd .. 
``` ### 1.1 版面分析+表格识别 ```bash -python3 predict_system.py --det_model_dir=inference/ch_PP-OCRv2_det_slim_quant_infer \ - --rec_model_dir=inference/ch_PP-OCRv2_rec_slim_quant_infer \ - --table_model_dir=inference/en_ppocr_mobile_v2.0_table_structure_infer \ +python3 predict_system.py --det_model_dir=inference/ch_PP-OCRv3_det_infer \ + --rec_model_dir=inference/ch_PP-OCRv3_rec_infer \ + --table_model_dir=inference/ch_ppstructure_mobile_v2.0_SLANet_infer \ + --layout_model_dir=inference/picodet_lcnet_x1_0_layout_infer \ --image_dir=./docs/table/1.png \ --rec_char_dict_path=../ppocr/utils/ppocr_keys_v1.txt \ - --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt \ + --table_char_dict_path=../ppocr/utils/dict/table_structure_dict_ch.txt \ --output=../output \ --vis_font_path=../doc/fonts/simfang.ttf ``` @@ -41,19 +44,23 @@ python3 predict_system.py --det_model_dir=inference/ch_PP-OCRv2_det_slim_quant_i ### 1.2 版面分析 ```bash -python3 predict_system.py --image_dir=./docs/table/1.png --table=false --ocr=false --output=../output/ +python3 predict_system.py --layout_model_dir=inference/picodet_lcnet_x1_0_layout_infer \ + --image_dir=./docs/table/1.png \ + --output=../output \ + --table=false \ + --ocr=false ``` 运行完成后,每张图片会在`output`字段指定的目录下的`structure`目录下有一个同名目录,图片区域会被裁剪之后保存下来,图片名为表格在图片里的坐标。版面分析结果会存储在`res.txt`文件中。 ### 1.3 表格识别 ```bash -python3 predict_system.py --det_model_dir=inference/ch_PP-OCRv2_det_slim_quant_infer \ - --rec_model_dir=inference/ch_PP-OCRv2_rec_slim_quant_infer \ - --table_model_dir=inference/en_ppocr_mobile_v2.0_table_structure_infer \ +python3 predict_system.py --det_model_dir=inference/ch_PP-OCRv3_det_infer \ + --rec_model_dir=inference/ch_PP-OCRv3_rec_infer \ + --table_model_dir=inference/ch_ppstructure_mobile_v2.0_SLANet_infer \ --image_dir=./docs/table/table.jpg \ --rec_char_dict_path=../ppocr/utils/ppocr_keys_v1.txt \ - --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt \ + 
--table_char_dict_path=../ppocr/utils/dict/table_structure_dict_ch.txt \ --output=../output \ --vis_font_path=../doc/fonts/simfang.ttf \ --layout=false @@ -61,20 +68,22 @@ python3 predict_system.py --det_model_dir=inference/ch_PP-OCRv2_det_slim_quant_i 运行完成后,每张图片会在`output`字段指定的目录下的`structure`目录下有一个同名目录,表格会存储为一个excel,excel文件名为`[0,0,img_h,img_w]`。 -## 2. DocVQA +## 2. 关键信息抽取 ```bash cd ppstructure -# 下载模型 mkdir inference && cd inference -# 下载SER xfun 模型并解压 -wget https://paddleocr.bj.bcebos.com/pplayout/PP-Layout_v1.0_ser_pretrained.tar && tar xf PP-Layout_v1.0_ser_pretrained.tar +# 下载SER XFUND 模型并解压 +wget https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_infer.tar && tar -xf ser_vi_layoutxlm_xfund_infer.tar cd .. - -python3 predict_system.py --model_name_or_path=kie/PP-Layout_v1.0_ser_pretrained/ \ - --mode=kie \ - --image_dir=kie/images/input/zh_val_0.jpg \ - --vis_font_path=../doc/fonts/simfang.ttf +python3 kie/predict_kie_token_ser.py \ + --kie_algorithm=LayoutXLM \ + --ser_model_dir=../inference/ser_vi_layoutxlm_xfund_infer \ + --image_dir=./docs/kie/input/zh_val_42.jpg \ + --ser_dict_path=../ppocr/utils/dict/kie_dict/xfund_class_list.txt \ + --vis_font_path=../doc/fonts/simfang.ttf \ + --ocr_order_method="tb-yx" ``` + 运行完成后,每张图片会在`output`字段指定的目录下的`kie`目录下存放可视化之后的图片,图片名和输入图片名一致。 diff --git a/ppstructure/docs/inference_en.md b/ppstructure/docs/inference_en.md index ad16f048e3b08a45d6e6d76e630ba48483f263d4..357e26a11f7e86a342bb3dbf24ea3c721705ae98 100644 --- a/ppstructure/docs/inference_en.md +++ b/ppstructure/docs/inference_en.md @@ -4,7 +4,7 @@ - [1.1 layout analysis + table recognition](#1.1) - [1.2 layout analysis](#1.2) - [1.3 table recognition](#1.3) -- [2. DocVQA](#2) +- [2. KIE](#2) ## 1. 
Structure @@ -18,23 +18,26 @@ download model ```bash mkdir inference && cd inference -# Download the PP-OCRv2 text detection model and unzip it -wget https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_slim_quant_infer.tar && tar xf ch_PP-OCRv2_det_slim_quant_infer.tar -# Download the PP-OCRv2 text recognition model and unzip it -wget https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_infer.tar && tar xf ch_PP-OCRv2_rec_slim_quant_infer.tar -# Download the ultra-lightweight English table structure model and unzip it -wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar && tar xf en_ppocr_mobile_v2.0_table_structure_infer.tar +# Download the PP-Structurev2 layout analysis model and unzip it +wget https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_layout_infer.tar && tar xf picodet_lcnet_x1_0_layout_infer.tar +# Download the PP-OCRv3 text detection model and unzip it +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar && tar xf ch_PP-OCRv3_det_infer.tar +# Download the PP-OCRv3 text recognition model and unzip it +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar && tar xf ch_PP-OCRv3_rec_infer.tar +# Download the PP-Structurev2 table recognition model and unzip it +wget https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_infer.tar && tar xf ch_ppstructure_mobile_v2.0_SLANet_infer.tar cd .. 
``` ### 1.1 layout analysis + table recognition ```bash -python3 predict_system.py --det_model_dir=inference/ch_PP-OCRv2_det_slim_quant_infer \ - --rec_model_dir=inference/ch_PP-OCRv2_rec_slim_quant_infer \ - --table_model_dir=inference/en_ppocr_mobile_v2.0_table_structure_infer \ +python3 predict_system.py --det_model_dir=inference/ch_PP-OCRv3_det_infer \ + --rec_model_dir=inference/ch_PP-OCRv3_rec_infer \ + --table_model_dir=inference/ch_ppstructure_mobile_v2.0_SLANet_infer \ + --layout_model_dir=inference/picodet_lcnet_x1_0_layout_infer \ --image_dir=./docs/table/1.png \ --rec_char_dict_path=../ppocr/utils/ppocr_keys_v1.txt \ - --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt \ + --table_char_dict_path=../ppocr/utils/dict/table_structure_dict_ch.txt \ --output=../output \ --vis_font_path=../doc/fonts/simfang.ttf ``` @@ -43,19 +46,23 @@ After the operation is completed, each image will have a directory with the same ### 1.2 layout analysis ```bash -python3 predict_system.py --image_dir=./docs/table/1.png --table=false --ocr=false --output=../output/ +python3 predict_system.py --layout_model_dir=inference/picodet_lcnet_x1_0_layout_infer \ + --image_dir=./docs/table/1.png \ + --output=../output \ + --table=false \ + --ocr=false ``` After the operation is completed, each image will have a directory with the same name in the `structure` directory under the directory specified by the `output` field. Each picture in image will be cropped and saved. The filename of picture area is their coordinates in the image. 
Layout analysis results will be stored in the `res.txt` file ### 1.3 table recognition ```bash -python3 predict_system.py --det_model_dir=inference/ch_PP-OCRv2_det_slim_quant_infer \ - --rec_model_dir=inference/ch_PP-OCRv2_rec_slim_quant_infer \ - --table_model_dir=inference/en_ppocr_mobile_v2.0_table_structure_infer \ +python3 predict_system.py --det_model_dir=inference/ch_PP-OCRv3_det_infer \ + --rec_model_dir=inference/ch_PP-OCRv3_rec_infer \ + --table_model_dir=inference/ch_ppstructure_mobile_v2.0_SLANet_infer \ --image_dir=./docs/table/table.jpg \ --rec_char_dict_path=../ppocr/utils/ppocr_keys_v1.txt \ - --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt \ + --table_char_dict_path=../ppocr/utils/dict/table_structure_dict_ch.txt \ --output=../output \ --vis_font_path=../doc/fonts/simfang.ttf \ --layout=false @@ -63,19 +70,22 @@ python3 predict_system.py --det_model_dir=inference/ch_PP-OCRv2_det_slim_quant_i After the operation is completed, each image will have a directory with the same name in the `structure` directory under the directory specified by the `output` field. Each table in the image will be stored as an excel. The filename of excel is their coordinates in the image. -## 2. DocVQA +## 2. KIE ```bash cd ppstructure -# download model mkdir inference && cd inference -wget https://paddleocr.bj.bcebos.com/pplayout/PP-Layout_v1.0_ser_pretrained.tar && tar xf PP-Layout_v1.0_ser_pretrained.tar +# download model +wget https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_infer.tar && tar -xf ser_vi_layoutxlm_xfund_infer.tar cd .. 
- -python3 predict_system.py --model_name_or_path=kie/PP-Layout_v1.0_ser_pretrained/ \ - --mode=kie \ - --image_dir=kie/images/input/zh_val_0.jpg \ - --vis_font_path=../doc/fonts/simfang.ttf +python3 kie/predict_kie_token_ser.py \ + --kie_algorithm=LayoutXLM \ + --ser_model_dir=../inference/ser_vi_layoutxlm_xfund_infer \ + --image_dir=./docs/kie/input/zh_val_42.jpg \ + --ser_dict_path=../ppocr/utils/dict/kie_dict/xfund_class_list.txt \ + --vis_font_path=../doc/fonts/simfang.ttf \ + --ocr_order_method="tb-yx" ``` + After the operation is completed, each image will store the visualized image in the `kie` directory under the directory specified by the `output` field, and the image name is the same as the input image name. diff --git a/ppstructure/docs/installation.md b/ppstructure/docs/installation.md index 3649e729d04ec83ba2d97571af993d75358eec73..0635580234abe2716769441d845df6386fbf5b86 100644 --- a/ppstructure/docs/installation.md +++ b/ppstructure/docs/installation.md @@ -1,7 +1,7 @@ - [快速安装](#快速安装) - [1. PaddlePaddle 和 PaddleOCR](#1-paddlepaddle-和-paddleocr) - [2. 安装其他依赖](#2-安装其他依赖) - - [2.1 VQA所需依赖](#21--kie所需依赖) + - [2.1 KIE所需依赖](#21-kie所需依赖) # 快速安装 @@ -11,16 +11,11 @@ ## 2. 安装其他依赖 -### 2.1 VQA所需依赖 -* paddleocr +### 2.1 KIE所需依赖 -```bash -pip3 install paddleocr -``` +* paddleocr -* PaddleNLP ```bash -git clone https://github.com/PaddlePaddle/PaddleNLP -b develop -cd PaddleNLP -pip3 install -e . +pip install paddleocr -U +pip install -r ./kie/requirements.txt ``` diff --git a/ppstructure/docs/installation_en.md b/ppstructure/docs/installation_en.md index 02b02db0c58f60a5296734b93563510732a7286d..de8bb5f6fc06fbd4f21cb0ca00ec80cce109ebf7 100644 --- a/ppstructure/docs/installation_en.md +++ b/ppstructure/docs/installation_en.md @@ -2,7 +2,7 @@ - [1. PaddlePaddle 和 PaddleOCR](#1) - [2. Install other dependencies](#2) - - [2.1 VQA](#21) + - [2.1 KIE](#21) @@ -14,17 +14,11 @@ Please refer to [PaddleOCR installation documentation](../../doc/doc_en/installa ## 2. 
Install other dependencies -### 2.1 VQA +### 2.1 KIE * paddleocr ```bash -pip3 install paddleocr -``` - -* PaddleNLP -```bash -git clone https://github.com/PaddlePaddle/PaddleNLP -b develop -cd PaddleNLP -pip3 install -e . +pip install paddleocr -U +pip install -r ./kie/requirements.txt ``` diff --git a/ppstructure/docs/models_list.md b/ppstructure/docs/models_list.md index 0b2f41deb5588c82238e93d835dc8c606e4fde2e..935d12d756eec467574f9ae32d48c70a3ea054c3 100644 --- a/ppstructure/docs/models_list.md +++ b/ppstructure/docs/models_list.md @@ -10,13 +10,17 @@ ## 1. 版面分析模型 -|模型名称|模型简介|下载地址|label_map| -| --- | --- | --- | --- | -| ppyolov2_r50vd_dcn_365e_publaynet | PubLayNet 数据集训练的版面分析模型,可以划分**文字、标题、表格、图片以及列表**5类区域 | [推理模型](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_publaynet.tar) / [训练模型](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_publaynet_pretrained.pdparams) |{0: "Text", 1: "Title", 2: "List", 3:"Table", 4:"Figure"}| -| ppyolov2_r50vd_dcn_365e_tableBank_word | TableBank Word 数据集训练的版面分析模型,只能检测表格 | [推理模型](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_tableBank_word.tar) | {0:"Table"}| -| ppyolov2_r50vd_dcn_365e_tableBank_latex | TableBank Latex 数据集训练的版面分析模型,只能检测表格 | [推理模型](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_tableBank_latex.tar) | {0:"Table"}| +|模型名称|模型简介|推理模型大小|下载地址|dict path| +| --- | --- | --- | --- | --- | +| picodet_lcnet_x1_0_fgd_layout | 基于PicoDet LCNet_x1_0和FGD蒸馏在PubLayNet 数据集训练的英文版面分析模型,可以划分**文字、标题、表格、图片以及列表**5类区域 | 9.7M | [推理模型](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout.pdparams) | [PubLayNet dict](../../ppocr/utils/dict/layout_dict/layout_publaynet_dict.txt) | +| ppyolov2_r50vd_dcn_365e_publaynet | 
基于PP-YOLOv2在PubLayNet数据集上训练的英文版面分析模型 | 221M | [推理模型](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_publaynet.tar) / [训练模型](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_publaynet_pretrained.pdparams) | 同上 | +| picodet_lcnet_x1_0_fgd_layout_cdla | CDLA数据集训练的中文版面分析模型,可以划分为**文本、标题、图片、图片标题、表格、表格标题、页眉、页脚、引用、公式**10类区域 | 9.7M | [推理模型](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_cdla_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_cdla.pdparams) | [CDLA dict](../../ppocr/utils/dict/layout_dict/layout_cdla_dict.txt) | +| picodet_lcnet_x1_0_fgd_layout_table | 表格数据集训练的版面分析模型,支持中英文文档表格区域的检测 | 9.7M | [推理模型](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_table_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_table.pdparams) | [Table dict](../../ppocr/utils/dict/layout_dict/layout_table_dict.txt) | +| ppyolov2_r50vd_dcn_365e_tableBank_word | 基于PP-YOLOv2在TableBank Word 数据集训练的版面分析模型,支持英文文档表格区域的检测 | 221M | [推理模型](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_tableBank_word.tar) | 同上 | +| ppyolov2_r50vd_dcn_365e_tableBank_latex | 基于PP-YOLOv2在TableBank Latex数据集训练的版面分析模型,支持英文文档表格区域的检测 | 221M | [推理模型](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_tableBank_latex.tar) | 同上 | + ## 2. 
OCR和表格识别模型 @@ -24,8 +28,8 @@ |模型名称|模型简介|推理模型大小|下载地址| | --- | --- | --- | --- | -|en_ppocr_mobile_v2.0_table_det|PubLayNet数据集训练的英文表格场景的文字检测|4.7M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/table/en_ppocr_mobile_v2.0_table_det_train.tar) | -|en_ppocr_mobile_v2.0_table_rec|PubLayNet数据集训练的英文表格场景的文字识别|6.9M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/table/en_ppocr_mobile_v2.0_table_rec_train.tar) | +|en_ppocr_mobile_v2.0_table_det|PubTabNet数据集训练的英文表格场景的文字检测|4.7M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/table/en_ppocr_mobile_v2.0_table_det_train.tar) | +|en_ppocr_mobile_v2.0_table_rec|PubTabNet数据集训练的英文表格场景的文字识别|6.9M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/table/en_ppocr_mobile_v2.0_table_rec_train.tar) | 如需要使用其他OCR模型,可以在 [PP-OCR model_list](../../doc/doc_ch/models_list.md) 下载模型或者使用自己训练好的模型配置到 `det_model_dir`, `rec_model_dir`两个字段即可。 @@ -36,7 +40,7 @@ | --- | --- | --- | --- | |en_ppocr_mobile_v2.0_table_structure|基于TableRec-RARE在PubTabNet数据集上训练的英文表格识别模型|6.8M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/table/en_ppocr_mobile_v2.0_table_structure_train.tar) | |en_ppstructure_mobile_v2.0_SLANet|基于SLANet在PubTabNet数据集上训练的英文表格识别模型|9.2M|[推理模型](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/en_ppstructure_mobile_v2.0_SLANet_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/en_ppstructure_mobile_v2.0_SLANet_train.tar) | 
-|ch_ppstructure_mobile_v2.0_SLANet|基于SLANet在PubTabNet数据集上训练的中文表格识别模型|9.3M|[推理模型](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_train.tar) | +|ch_ppstructure_mobile_v2.0_SLANet|基于SLANet的中文表格识别模型|9.3M|[推理模型](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_train.tar) | diff --git a/ppstructure/docs/models_list_en.md b/ppstructure/docs/models_list_en.md index 7ba1d30464287eaf67a0265464fcc261e3b4407f..85531fb753c4e32f0cdc9296ab97a9faebbb0ebd 100644 --- a/ppstructure/docs/models_list_en.md +++ b/ppstructure/docs/models_list_en.md @@ -4,18 +4,20 @@ - [2. OCR and Table Recognition](#2-ocr-and-table-recognition) - [2.1 OCR](#21-ocr) - [2.2 Table Recognition](#22-table-recognition) -- [3. VQA](#3-kie) -- [4. KIE](#4-kie) - +- [3. KIE](#3-kie) + ## 1. 
Layout Analysis -|model name| description |download|label_map| -| --- |---------------------------------------------------------------------------------------------------------------------------------------------------------| --- | --- | -| ppyolov2_r50vd_dcn_365e_publaynet | The layout analysis model trained on the PubLayNet dataset, the model can recognition 5 types of areas such as **text, title, table, picture and list** | [inference model](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_publaynet.tar) / [trained model](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_publaynet_pretrained.pdparams) |{0: "Text", 1: "Title", 2: "List", 3:"Table", 4:"Figure"}| -| ppyolov2_r50vd_dcn_365e_tableBank_word | The layout analysis model trained on the TableBank Word dataset, the model can only detect tables | [inference model](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_tableBank_word.tar) | {0:"Table"}| -| ppyolov2_r50vd_dcn_365e_tableBank_latex | The layout analysis model trained on the TableBank Latex dataset, the model can only detect tables | [inference model](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_tableBank_latex.tar) | {0:"Table"}| +|model name| description | inference model size |download|dict path| +| --- |---------------------------------------------------------------------------------------------------------------------------------------------------------| --- | --- | --- | +| picodet_lcnet_x1_0_fgd_layout | The layout analysis English model trained on the PubLayNet dataset based on PicoDet LCNet_x1_0 and FGD . 
the model can recognize 5 types of areas such as **Text, Title, Table, Picture and List** | 9.7M | [inference model](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout.pdparams) | [PubLayNet dict](../../ppocr/utils/dict/layout_dict/layout_publaynet_dict.txt) | +| ppyolov2_r50vd_dcn_365e_publaynet | The layout analysis English model trained on the PubLayNet dataset based on PP-YOLOv2 | 221M | [inference model](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_publaynet.tar) / [trained model](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_publaynet_pretrained.pdparams) | same as above | +| picodet_lcnet_x1_0_fgd_layout_cdla | The layout analysis Chinese model trained on the CDLA dataset, the model can recognize 10 types of areas such as **Text、Title、Figure、Figure caption、Table、Table caption、Header、Footer、Reference、Equation** | 9.7M | [inference model](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_cdla_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_cdla.pdparams) | [CDLA dict](../../ppocr/utils/dict/layout_dict/layout_cdla_dict.txt) | +| picodet_lcnet_x1_0_fgd_layout_table | The layout analysis model trained on the table dataset, the model can detect tables in Chinese and English documents | 9.7M | [inference model](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_table_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_table.pdparams) | [Table dict](../../ppocr/utils/dict/layout_dict/layout_table_dict.txt) | +| ppyolov2_r50vd_dcn_365e_tableBank_word | The layout analysis model trained on the TableBank Word dataset based on 
PP-YOLOv2, the model can detect tables in English documents | 221M | [inference model](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_tableBank_word.tar) | same as above | +| ppyolov2_r50vd_dcn_365e_tableBank_latex | The layout analysis model trained on the TableBank Latex dataset based on PP-YOLOv2, the model can detect tables in English documents | 221M | [inference model](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_tableBank_latex.tar) | same as above | ## 2. OCR and Table Recognition @@ -37,22 +39,28 @@ If you need to use other OCR models, you can download the model in [PP-OCR model | --- |-----------------------------------------------------------------------------| --- | --- | |en_ppocr_mobile_v2.0_table_structure| English table recognition model trained on PubTabNet dataset based on TableRec-RARE |6.8M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/table/en_ppocr_mobile_v2.0_table_structure_train.tar) | |en_ppstructure_mobile_v2.0_SLANet|English table recognition model trained on PubTabNet dataset based on SLANet|9.2M|[inference model](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/en_ppstructure_mobile_v2.0_SLANet_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/en_ppstructure_mobile_v2.0_SLANet_train.tar) | -|ch_ppstructure_mobile_v2.0_SLANet|Chinese table recognition model trained on PubTabNet dataset based on SLANet|9.3M|[inference model](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_train.tar) | +|ch_ppstructure_mobile_v2.0_SLANet|Chinese table recognition model based on SLANet|9.3M|[inference 
model](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_train.tar) | -## 3. VQA - -|model| description |inference model size|download| -| --- |----------------------------------------------------------------| --- | --- | -|ser_LayoutXLM_xfun_zh| SER model trained on xfun Chinese dataset based on LayoutXLM |1.4G|[inference model](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh.tar) | -|re_LayoutXLM_xfun_zh| Re model trained on xfun Chinese dataset based on LayoutXLM |1.4G|[inference model coming soon]() / [trained model](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutXLM_xfun_zh.tar) | -|ser_LayoutLMv2_xfun_zh| SER model trained on xfun Chinese dataset based on LayoutXLMv2 |778M|[inference model](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLMv2_xfun_zh_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLMv2_xfun_zh.tar) | -|re_LayoutLMv2_xfun_zh| Re model trained on xfun Chinese dataset based on LayoutXLMv2 |765M|[inference model coming soon]() / [trained model](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutLMv2_xfun_zh.tar) | -|ser_LayoutLM_xfun_zh| SER model trained on xfun Chinese dataset based on LayoutLM |430M|[inference model](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLM_xfun_zh_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLM_xfun_zh.tar) | - - -## 4. KIE - -|model|description|model size|download| -| --- | --- | --- | --- | -|SDMGR|Key Information Extraction Model|78M|[inference model coming soon]() / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/kie/kie_vgg16.tar)| +## 3. KIE + +On XFUND_zh dataset, Accuracy and time cost of different models on V100 GPU are as follows. 
+ +|Model|Backbone|Task|Config|Hmean|Time cost(ms)|Download link| +| --- | --- | --- | --- | --- | --- |--- | +|VI-LayoutXLM| VI-LayoutXLM-base | SER | [ser_vi_layoutxlm_xfund_zh_udml.yml](../../configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh_udml.yml)|**93.19%**| 15.49| [trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_pretrained.tar)| +|LayoutXLM| LayoutXLM-base | SER | [ser_layoutxlm_xfund_zh.yml](../../configs/kie/layoutlm_series/ser_layoutxlm_xfund_zh.yml)|90.38%| 19.49 |[trained model](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh.tar)| +|LayoutLM| LayoutLM-base | SER | [ser_layoutlm_xfund_zh.yml](../../configs/kie/layoutlm_series/ser_layoutlm_xfund_zh.yml)|77.31%|-|[trained model](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLM_xfun_zh.tar)| +|LayoutLMv2| LayoutLMv2-base | SER | [ser_layoutlmv2_xfund_zh.yml](../../configs/kie/layoutlm_series/ser_layoutlmv2_xfund_zh.yml)|85.44%|31.46|[trained model](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLMv2_xfun_zh.tar)| +|VI-LayoutXLM| VI-LayoutXLM-base | RE | [re_vi_layoutxlm_xfund_zh_udml.yml](../../configs/kie/vi_layoutxlm/re_vi_layoutxlm_xfund_zh_udml.yml)|**83.92%**|15.49|[trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/re_vi_layoutxlm_xfund_pretrained.tar)| +|LayoutXLM| LayoutXLM-base | RE | [re_layoutxlm_xfund_zh.yml](../../configs/kie/layoutlm_series/re_layoutxlm_xfund_zh.yml)|74.83%|19.49|[trained model](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutXLM_xfun_zh.tar)| +|LayoutLMv2| LayoutLMv2-base | RE | [re_layoutlmv2_xfund_zh.yml](../../configs/kie/layoutlm_series/re_layoutlmv2_xfund_zh.yml)|67.77%|31.46|[trained model](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutLMv2_xfun_zh.tar)| + +* Note: The above time cost information just considers inference time without preprocess or postprocess, test environment: `V100 GPU + CUDA 10.2 + CUDNN 8.1.1 + TRT 7.2.3.4` + + +On wildreceipt dataset, 
the algorithm result is as follows: + +|Model|Backbone|Config|Hmean|Download link| +| --- | --- | --- | --- | --- | +|SDMGR|VGG16|[configs/kie/sdmgr/kie_unet_sdmgr.yml](../../configs/kie/sdmgr/kie_unet_sdmgr.yml)|86.7%|[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/kie/kie_vgg16.tar)| diff --git a/ppstructure/docs/quickstart.md b/ppstructure/docs/quickstart.md index 9a538a6f11d99e9caa4c3483421aaccc344079de..38a37bebcb561228ddfb2b2970b9ddbaebbcb19a 100644 --- a/ppstructure/docs/quickstart.md +++ b/ppstructure/docs/quickstart.md @@ -7,16 +7,22 @@ - [2.1.2 版面分析+表格识别](#212-版面分析表格识别) - [2.1.3 版面分析](#213-版面分析) - [2.1.4 表格识别](#214-表格识别) - - [2.1.5 DocVQA](#215-dockie) + - [2.1.5 关键信息抽取](#215-关键信息抽取) + - [2.1.6 版面恢复](#216-版面恢复) - [2.2 代码使用](#22-代码使用) - - [2.2.1 图像方向分类版面分析表格识别](#221-图像方向分类版面分析表格识别) + + - [2.2.1 图像方向分类+版面分析+表格识别](#221-图像方向分类版面分析表格识别) - [2.2.2 版面分析+表格识别](#222-版面分析表格识别) - [2.2.3 版面分析](#223-版面分析) - [2.2.4 表格识别](#224-表格识别) - - [2.2.5 DocVQA](#225-dockie) + + - [2.2.5 关键信息抽取](#225-关键信息抽取) + - [2.2.6 版面恢复](#226-版面恢复) + - [2.3 返回结果说明](#23-返回结果说明) - [2.3.1 版面分析+表格识别](#231-版面分析表格识别) - - [2.3.2 DocVQA](#232-dockie) + - [2.3.2 关键信息抽取](#232-关键信息抽取) + - [2.4 参数说明](#24-参数说明) @@ -24,11 +30,12 @@ ## 1. 
安装依赖包 ```bash -# 安装 paddleocr,推荐使用2.5+版本 -pip3 install "paddleocr>=2.5" -# 安装 DocVQA依赖包paddlenlp(如不需要DocVQA功能,可跳过) -pip install paddlenlp - +# 安装 paddleocr,推荐使用2.6版本 +pip3 install "paddleocr>=2.6" +# 安装 关键信息抽取 依赖包(如不需要KIE功能,可跳过) +pip install -r kie/requirements.txt +# 安装 图像方向分类依赖包paddleclas(如不需要图像方向分类功能,可跳过) +pip3 install paddleclas ``` @@ -62,15 +69,24 @@ paddleocr --image_dir=PaddleOCR/ppstructure/docs/table/table.jpg --type=structur ``` -#### 2.1.5 DocVQA -请参考:[文档视觉问答](../kie/README.md)。 +#### 2.1.5 关键信息抽取 +请参考:[关键信息抽取教程](../kie/README_ch.md)。 + + + +#### 2.1.6 版面恢复 + +```bash +paddleocr --image_dir=PaddleOCR/ppstructure/docs/table/1.png --type=structure --recovery=true +``` + ### 2.2 代码使用 -#### 2.2.1 图像方向分类版面分析表格识别 +#### 2.2.1 图像方向分类+版面分析+表格识别 ```python import os @@ -149,6 +165,7 @@ for line in result: ``` + #### 2.2.4 表格识别 ```python @@ -170,9 +187,36 @@ for line in result: ``` -#### 2.2.5 DocVQA +#### 2.2.5 关键信息抽取 + +请参考:[关键信息抽取教程](../kie/README_ch.md)。 -请参考:[文档视觉问答](../kie/README.md)。 + + +#### 2.2.6 版面恢复 + +```python +import os +import cv2 +from paddleocr import PPStructure,save_structure_res +from paddleocr.ppstructure.recovery.recovery_to_doc import sorted_layout_boxes, convert_info_docx + +table_engine = PPStructure(layout=False, show_log=True) + +save_folder = './output' +img_path = 'PaddleOCR/ppstructure/docs/table/1.png' +img = cv2.imread(img_path) +result = table_engine(img) +save_structure_res(result, save_folder, os.path.basename(img_path).split('.')[0]) + +for line in result: + line.pop('img') + print(line) + +h, w, _ = img.shape +res = sorted_layout_boxes(result, w) +convert_info_docx(img, res, save_folder, os.path.basename(img_path).split('.')[0]) +``` ### 2.3 返回结果说明 @@ -208,9 +252,9 @@ dict 里各个字段说明如下 ``` -#### 2.3.2 DocVQA +#### 2.3.2 关键信息抽取 -请参考:[文档视觉问答](../kie/README.md)。 +请参考:[关键信息抽取教程](../kie/README_ch.md)。 ### 2.4 参数说明 @@ -235,6 +279,7 @@ dict 里各个字段说明如下 | table | 前向中是否执行表格识别 | True | | ocr | 对于版面分析中的非表格区域,是否执行ocr。当layout为False时会被自动设置为False| 
True | | recovery | 前向中是否执行版面恢复| False | +| save_pdf | 版面恢复导出docx文件的同时,是否导出pdf文件 | False | | structure_version | 模型版本,可选 PP-structure和PP-structurev2 | PP-structure | 大部分参数和PaddleOCR whl包保持一致,见 [whl包文档](../../doc/doc_ch/whl.md) diff --git a/ppstructure/docs/quickstart_en.md b/ppstructure/docs/quickstart_en.md index cf9d12ff9c1dadef95fedd3a02acb2146607aa96..dbfbf43b01c94bd6f9c729f2f6edcd1dd6aee056 100644 --- a/ppstructure/docs/quickstart_en.md +++ b/ppstructure/docs/quickstart_en.md @@ -7,16 +7,19 @@ - [2.1.2 layout analysis + table recognition](#212-layout-analysis--table-recognition) - [2.1.3 layout analysis](#213-layout-analysis) - [2.1.4 table recognition](#214-table-recognition) - - [2.1.5 DocVQA](#215-dockie) + - [2.1.5 Key Information Extraction](#215-Key-Information-Extraction) + - [2.1.6 layout recovery](#216-layout-recovery) - [2.2 Use by code](#22-use-by-code) - [2.2.1 image orientation + layout analysis + table recognition](#221-image-orientation--layout-analysis--table-recognition) - [2.2.2 layout analysis + table recognition](#222-layout-analysis--table-recognition) - [2.2.3 layout analysis](#223-layout-analysis) - [2.2.4 table recognition](#224-table-recognition) - [2.2.5 DocVQA](#225-dockie) + - [2.2.5 Key Information Extraction](#225-Key-Information-Extraction) + - [2.2.6 layout recovery](#226-layout-recovery) - [2.3 Result description](#23-result-description) - [2.3.1 layout analysis + table recognition](#231-layout-analysis--table-recognition) - - [2.3.2 DocVQA](#232-dockie) + - [2.3.2 Key Information Extraction](#232-Key-Information-Extraction) - [2.4 Parameter Description](#24-parameter-description) @@ -24,14 +27,16 @@ ## 1. 
Install package ```bash -# Install paddleocr, version 2.5+ is recommended -pip3 install "paddleocr>=2.5" -# Install the DocVQA dependency package paddlenlp (if you do not use the DocVQA, you can skip it) -pip install paddlenlp - +# Install paddleocr, version 2.6 is recommended +pip3 install "paddleocr>=2.6" +# Install the KIE dependency packages (if you do not use the KIE, you can skip it) +pip install -r kie/requirements.txt +# Install the image direction classification dependency package paddleclas (if you do not use the image direction classification, you can skip it) +pip3 install paddleclas ``` + ## 2. Use @@ -62,9 +67,15 @@ paddleocr --image_dir=PaddleOCR/ppstructure/docs/table/table.jpg --type=structur ``` -#### 2.1.5 DocVQA +#### 2.1.5 Key Information Extraction -Please refer to: [Documentation Visual Q&A](../kie/README.md) . +Please refer to: [Key Information Extraction](../kie/README.md) . + + +#### 2.1.6 layout recovery +```bash +paddleocr --image_dir=PaddleOCR/ppstructure/docs/table/1.png --type=structure --recovery=true +``` ### 2.2 Use by code @@ -120,7 +131,7 @@ for line in result: from PIL import Image -font_path = 'PaddleOCR/doc/fonts/simfang.ttf' # PaddleOCR下提供字体包 +font_path = 'PaddleOCR/doc/fonts/simfang.ttf' # font provided in PaddleOCR image = Image.open(img_path).convert('RGB') im_show = draw_structure_result(image, result,font_path=font_path) im_show = Image.fromarray(im_show) @@ -170,9 +181,35 @@ for line in result: ``` -#### 2.2.5 DocVQA +#### 2.2.5 Key Information Extraction + +Please refer to: [Key Information Extraction](../kie/README.md) . + + +#### 2.2.6 layout recovery + +```python +import os +import cv2 +from paddleocr import PPStructure,save_structure_res +from paddleocr.ppstructure.recovery.recovery_to_doc import sorted_layout_boxes, convert_info_docx -Please refer to: [Documentation Visual Q&A](../kie/README.md) . 
+table_engine = PPStructure(layout=False, show_log=True) + +save_folder = './output' +img_path = 'PaddleOCR/ppstructure/docs/table/1.png' +img = cv2.imread(img_path) +result = table_engine(img) +save_structure_res(result, save_folder, os.path.basename(img_path).split('.')[0]) + +for line in result: + line.pop('img') + print(line) + +h, w, _ = img.shape +res = sorted_layout_boxes(result, w) +convert_info_docx(img, res, save_folder, os.path.basename(img_path).split('.')[0]) +``` ### 2.3 Result description @@ -208,9 +245,9 @@ After the recognition is completed, each image will have a directory with the sa ``` -#### 2.3.2 DocVQA +#### 2.3.2 Key Information Extraction -Please refer to: [Documentation Visual Q&A](../kie/README.md) . +Please refer to: [Key Information Extraction](../kie/README.md) . ### 2.4 Parameter Description @@ -235,6 +272,7 @@ Please refer to: [Documentation Visual Q&A](../kie/README.md) . | table | Whether to perform table recognition in forward | True | | ocr | Whether to perform ocr for non-table areas in layout analysis. 
When layout is False, it will be automatically set to False| True | | recovery | Whether to perform layout recovery in forward| False | +| save_pdf | Whether to convert docx to pdf during layout recovery| False | | structure_version | Structure version, optional PP-structure and PP-structurev2 | PP-structure | Most of the parameters are consistent with the PaddleOCR whl package, see [whl package documentation](../../doc/doc_en/whl.md) diff --git a/ppstructure/docs/recovery/recovery.jpg b/ppstructure/docs/recovery/recovery.jpg new file mode 100644 index 0000000000000000000000000000000000000000..a3817ab70eff5b380072701b70ab227ae6c8184c Binary files /dev/null and b/ppstructure/docs/recovery/recovery.jpg differ diff --git a/ppstructure/docs/table/recovery.jpg b/ppstructure/docs/table/recovery.jpg deleted file mode 100644 index bee2e2fb3499ec4b348e2b2f1475a87c9c562190..0000000000000000000000000000000000000000 Binary files a/ppstructure/docs/table/recovery.jpg and /dev/null differ diff --git a/ppstructure/kie/README.md b/ppstructure/kie/README.md index 9e1b72e772f03a9dadd202268c39cba11f8f121e..adb19a3ca729821ab16bf8f0f8ec14c2376de1de 100644 --- a/ppstructure/kie/README.md +++ b/ppstructure/kie/README.md @@ -246,7 +246,7 @@ For training, evaluation and inference tutorial for text recognition models, ple If you want to finish the KIE tasks in your scene, and don't know what to prepare, please refer to [End cdoc](../../doc/doc_en/recognition.md). -关于怎样在自己的场景中完成关键信息抽取任务,请参考:[Guide to End-to-end KIE](./how_to_do_kie_en.md)。 +To complete the key information extraction task in your own scenario from data preparation to model selection, please refer to: [Guide to End-to-end KIE](./how_to_do_kie_en.md). ## 5. 
Reference diff --git a/ppstructure/layout/README.md b/ppstructure/layout/README.md index 3762544b834d752a705216ca3f93d326aa1391ad..45386da348fc8d2d76da64cffbd2d1a4482812af 100644 --- a/ppstructure/layout/README.md +++ b/ppstructure/layout/README.md @@ -63,7 +63,7 @@ python3 -m pip install "paddlepaddle>=2.2" -i https://mirror.baidu.com/pypi/simp git clone https://github.com/PaddlePaddle/PaddleDetection.git ``` -- **(2)安装其他依赖 ** +- **(2)安装其他依赖** ```bash cd PaddleDetection @@ -138,7 +138,7 @@ json文件包含所有图像的标注,数据以字典嵌套的方式存放, ``` { - + 'segmentation': # 物体的分割标注 'area': 60518.099043117836, # 物体的区域面积 'iscrowd': 0, # iscrowd @@ -166,15 +166,17 @@ json文件包含所有图像的标注,数据以字典嵌套的方式存放, 提供了训练脚本、评估脚本和预测脚本,本节将以PubLayNet预训练模型为例进行讲解。 -如果不希望训练,直接体验后面的模型评估、预测、动转静、推理的流程,可以下载提供的预训练模型,并跳过本部分。 +如果不希望训练,直接体验后面的模型评估、预测、动转静、推理的流程,可以下载提供的预训练模型(PubLayNet数据集),并跳过本部分。 ``` mkdir pretrained_model cd pretrained_model -# 下载并解压PubLayNet预训练模型 +# 下载PubLayNet预训练模型 wget https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_layout.pdparams ``` +下载更多[版面分析模型](../docs/models_list.md)(中文CDLA数据集预训练模型、表格预训练模型) + ### 4.1. 
启动训练 开始训练: @@ -184,7 +186,7 @@ wget https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_ 如果你希望训练自己的数据集,需要修改配置文件中的数据配置、类别数。 -以`configs/picodet/legacy_model/application/layout_detection/picodet_lcnet_x1_0_layout.yml` 为例,修改的内容如下所示。 +以`configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml` 为例,修改的内容如下所示。 ```yaml metric: COCO @@ -223,16 +225,20 @@ TestDataset: # 训练日志会自动保存到 log 目录中 # 单卡训练 +export CUDA_VISIBLE_DEVICES=0 python3 tools/train.py \ - -c configs/picodet/legacy_model/application/layout_detection/picodet_lcnet_x1_0_layout.yml \ + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ --eval # 多卡训练,通过--gpus参数指定卡号 +export CUDA_VISIBLE_DEVICES=0,1,2,3 python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py \ - -c configs/picodet/legacy_model/application/layout_detection/picodet_lcnet_x1_0_layout.yml \ + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ --eval ``` +**注意:**如果训练时显存out memory,将TrainReader中batch_size调小,同时LearningRate中base_lr等比例减小。发布的config均由8卡训练得到,如果改变GPU卡数为1,那么base_lr需要减小8倍。 + 正常启动训练后,会看到以下log输出: ``` @@ -254,9 +260,11 @@ PaddleDetection支持了基于FGD([Focal and Global Knowledge Distillation for D 更换数据集,修改【TODO】配置中的数据配置、类别数,具体可以参考4.1。启动训练: ```bash -python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py \ - -c configs/picodet/legacy_model/application/layout_detection/picodet_lcnet_x1_0_layout.yml \ - --slim_config configs/picodet/legacy_model/application/layout_detection/picodet_lcnet_x2_5_layout.yml \ +# 单卡训练 +export CUDA_VISIBLE_DEVICES=0 +python3 tools/train.py \ + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + --slim_config configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x2_5_layout.yml \ --eval ``` @@ -267,13 +275,13 @@ python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py \ ### 5.1. 
指标评估 -训练中模型参数默认保存在`output/picodet_lcnet_x1_0_layout`目录下。在评估指标时,需要设置`weights`指向保存的参数文件。评估数据集可以通过 `configs/picodet/legacy_model/application/layout_detection/picodet_lcnet_x1_0_layout.yml` 修改`EvalDataset`中的 `image_dir`、`anno_path`和`dataset_dir` 设置。 +训练中模型参数默认保存在`output/picodet_lcnet_x1_0_layout`目录下。在评估指标时,需要设置`weights`指向保存的参数文件。评估数据集可以通过 `configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml` 修改`EvalDataset`中的 `image_dir`、`anno_path`和`dataset_dir` 设置。 ```bash # GPU 评估, weights 为待测权重 python3 tools/eval.py \ - -c configs/picodet/legacy_model/application/layout_detection/picodet_lcnet_x1_0_layout.yml \ - -o weigths=./output/picodet_lcnet_x1_0_layout/best_model + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + -o weights=./output/picodet_lcnet_x1_0_layout/best_model ``` 会输出以下信息,打印出mAP、AP0.5等信息。 @@ -299,8 +307,8 @@ python3 tools/eval.py \ ``` python3 tools/eval.py \ - -c configs/picodet/legacy_model/application/layout_detection/picodet_lcnet_x1_0_layout.yml \ - --slim_config configs/picodet/legacy_model/application/layout_detection/picodet_lcnet_x2_5_layout.yml \ + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + --slim_config configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x2_5_layout.yml \ -o weights=output/picodet_lcnet_x2_5_layout/best_model ``` @@ -311,18 +319,17 @@ python3 tools/eval.py \ ### 5.2. 
测试版面分析结果 -预测使用的配置文件必须与训练一致,如您通过 `python3 tools/train.py -c configs/picodet/legacy_model/application/layout_detection/picodet_lcnet_x1_0_layout.yml` 完成了模型的训练过程。 - -使用 PaddleDetection 训练好的模型,您可以使用如下命令进行中文模型预测。 +预测使用的配置文件必须与训练一致,如您通过 `python3 tools/train.py -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml` 完成了模型的训练过程。 +使用 PaddleDetection 训练好的模型,您可以使用如下命令进行模型预测。 ```bash python3 tools/infer.py \ - -c configs/picodet/legacy_model/application/layout_detection/picodet_lcnet_x1_0_layout.yml \ + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ -o weights='output/picodet_lcnet_x1_0_layout/best_model.pdparams' \ --infer_img='docs/images/layout.jpg' \ --output_dir=output_dir/ \ - --draw_threshold=0.4 + --draw_threshold=0.5 ``` - `--infer_img`: 推理单张图片,也可以通过`--infer_dir`推理文件中的所有图片。 @@ -335,16 +342,15 @@ python3 tools/infer.py \ ``` python3 tools/infer.py \ - -c configs/picodet/legacy_model/application/layout_detection/picodet_lcnet_x1_0_layout.yml \ - --slim_config configs/picodet/legacy_model/application/layout_detection/picodet_lcnet_x2_5_layout.yml \ + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + --slim_config configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x2_5_layout.yml \ -o weights='output/picodet_lcnet_x2_5_layout/best_model.pdparams' \ --infer_img='docs/images/layout.jpg' \ --output_dir=output_dir/ \ - --draw_threshold=0.4 + --draw_threshold=0.5 ``` - ## 6. 
模型导出与预测 @@ -356,7 +362,7 @@ inference 模型(`paddle.jit.save`保存的模型) 一般是模型训练, ```bash python3 tools/export_model.py \ - -c configs/picodet/legacy_model/application/layout_detection/picodet_lcnet_x1_0_layout.yml \ + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ -o weights=output/picodet_lcnet_x1_0_layout/best_model \ --output_dir=output_inference/ ``` @@ -377,8 +383,8 @@ FGD蒸馏模型转inference模型步骤如下: ```bash python3 tools/export_model.py \ - -c configs/picodet/legacy_model/application/publayernet_lcnet_x1_5/picodet_student.yml \ - --slim_config configs/picodet/legacy_model/application/publayernet_lcnet_x1_5/picodet_teacher.yml \ + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + --slim_config configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x2_5_layout.yml \ -o weights=./output/picodet_lcnet_x2_5_layout/best_model \ --output_dir=output_inference/ ``` @@ -404,7 +410,7 @@ python3 deploy/python/infer.py \ ------------------------------------------ ----------- Model Configuration ----------- Model Arch: PicoDet -Transform Order: +Transform Order: --transform op: Resize --transform op: NormalizeImage --transform op: Permute @@ -466,4 +472,3 @@ preprocess_time(ms): 2172.50, inference_time(ms): 11.90, postprocess_time(ms): 1 year={2022} } ``` - diff --git a/ppstructure/layout/__init__.py b/ppstructure/layout/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1d11e265597c7c8e39098a228108da3bb954b892 --- /dev/null +++ b/ppstructure/layout/__init__.py @@ -0,0 +1,13 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ppstructure/recovery/README.md b/ppstructure/recovery/README.md index 713d0307dbbd66664db15d19df484af76efea75a..90a6a2c3c4189dc885d698e4cac2d1a24a49d1df 100644 --- a/ppstructure/recovery/README.md +++ b/ppstructure/recovery/README.md @@ -6,10 +6,12 @@ English | [简体中文](README_ch.md) - [2.1 Installation dependencies](#2.1) - [2.2 Install PaddleOCR](#2.2) - [3. Quick Start](#3) + - [3.1 Download models](#3.1) + - [3.2 Layout recovery](#3.2) -## 1. Introduction +## 1. Introduction Layout recovery means that after OCR recognition, the content is still arranged like the original document pictures, and the paragraphs are output to word document in the same order. @@ -17,8 +19,9 @@ Layout recovery combines [layout analysis](../layout/README.md)、[table recogni The following figure shows the result: