提交 7fd35af4 编写于 作者: qq_25193841's avatar qq_25193841

Merge remote-tracking branch 'origin/dygraph' into dy1

Global:
use_gpu: True
epoch_num: 400
log_smooth_window: 20
print_batch_step: 20
save_model_dir: ./output/SLANet_ch
save_epoch_step: 400
# evaluation is run every 331 iterations after the 0th iteration
eval_batch_step: [0, 331]
cal_metric_during_train: True
pretrained_model:
checkpoints:
save_inference_dir: ./output/SLANet_ch/infer
use_visualdl: False
infer_img: doc/table/table.jpg
# for data or label process
character_dict_path: ppocr/utils/dict/table_structure_dict_ch.txt
character_type: en
max_text_length: &max_text_length 500
box_format: &box_format xyxyxyxy # 'xywh', 'xyxy', 'xyxyxyxy'
infer_mode: False
use_sync_bn: True
save_res_path: output/infer
Optimizer:
name: Adam
beta1: 0.9
beta2: 0.999
clip_norm: 5.0
lr:
learning_rate: 0.001
regularizer:
name: 'L2'
factor: 0.00000
Architecture:
model_type: table
algorithm: SLANet
Backbone:
name: PPLCNet
scale: 1.0
pretrained: True
use_ssld: True
Neck:
name: CSPPAN
out_channels: 96
Head:
name: SLAHead
hidden_size: 256
max_text_length: *max_text_length
loc_reg_num: &loc_reg_num 8
Loss:
name: SLALoss
structure_weight: 1.0
loc_weight: 2.0
loc_loss: smooth_l1
PostProcess:
name: TableLabelDecode
merge_no_span_structure: &merge_no_span_structure True
Metric:
name: TableMetric
main_indicator: acc
compute_bbox_metric: False
loc_reg_num: *loc_reg_num
box_format: *box_format
del_thead_tbody: True
Train:
dataset:
name: PubTabDataSet
data_dir: train_data/table/train/
label_file_list: [train_data/table/train.txt]
transforms:
- DecodeImage:
img_mode: BGR
channel_first: False
- TableLabelEncode:
learn_empty_box: False
merge_no_span_structure: *merge_no_span_structure
replace_empty_cell_token: False
loc_reg_num: *loc_reg_num
max_text_length: *max_text_length
- TableBoxEncode:
in_box_format: *box_format
out_box_format: *box_format
- ResizeTableImage:
max_len: 488
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: 'hwc'
- PaddingTableImage:
size: [488, 488]
- ToCHWImage:
- KeepKeys:
keep_keys: [ 'image', 'structure', 'bboxes', 'bbox_masks', 'shape' ]
loader:
shuffle: True
batch_size_per_card: 48
drop_last: True
num_workers: 1
Eval:
dataset:
name: PubTabDataSet
data_dir: train_data/table/val/
label_file_list: [train_data/table/val.txt]
transforms:
- DecodeImage:
img_mode: BGR
channel_first: False
- TableLabelEncode:
learn_empty_box: False
merge_no_span_structure: *merge_no_span_structure
replace_empty_cell_token: False
loc_reg_num: *loc_reg_num
max_text_length: *max_text_length
- TableBoxEncode:
in_box_format: *box_format
out_box_format: *box_format
- ResizeTableImage:
max_len: 488
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: 'hwc'
- PaddingTableImage:
size: [488, 488]
- ToCHWImage:
- KeepKeys:
keep_keys: [ 'image', 'structure', 'bboxes', 'bbox_masks', 'shape' ]
loader:
shuffle: False
drop_last: False
batch_size_per_card: 48
num_workers: 1
...@@ -636,4 +636,6 @@ def main(): ...@@ -636,4 +636,6 @@ def main():
for item in result: for item in result:
item.pop('img') item.pop('img')
item.pop('res')
logger.info(item) logger.info(item)
logger.info('result save to {}'.format(args.output))
...@@ -194,6 +194,9 @@ def save_model(model, ...@@ -194,6 +194,9 @@ def save_model(model,
_mkdir_if_not_exist(model_path, logger) _mkdir_if_not_exist(model_path, logger)
model_prefix = os.path.join(model_path, prefix) model_prefix = os.path.join(model_path, prefix)
paddle.save(optimizer.state_dict(), model_prefix + '.pdopt') paddle.save(optimizer.state_dict(), model_prefix + '.pdopt')
is_nlp_model = config['Architecture']["model_type"] == 'kie' and config[
"Architecture"]["algorithm"] not in ["SDMGR"]
if is_nlp_model is not True: if is_nlp_model is not True:
paddle.save(model.state_dict(), model_prefix + '.pdparams') paddle.save(model.state_dict(), model_prefix + '.pdparams')
metric_prefix = model_prefix metric_prefix = model_prefix
......
...@@ -106,9 +106,9 @@ PP-Structure Series Model List (Updating) ...@@ -106,9 +106,9 @@ PP-Structure Series Model List (Updating)
|model name|description|model size|download| |model name|description|model size|download|
| --- | --- | --- | --- | | --- | --- | --- | --- |
|ch_PP-OCRv3_det_slim|[New] slim quantization with distillation lightweight model, supporting Chinese, English, multilingual text detection| 1.1M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_distill_train.tar)| |ch_PP-OCRv3_det| [New] Lightweight model, supporting Chinese, English, multilingual text detection | 3.8M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar)|
|ch_PP-OCRv3_rec_slim |[New] Slim quantization with distillation lightweight model, supporting Chinese, English text recognition| 4.9M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_train.tar) | |ch_PP-OCRv3_rec| [New] Lightweight model, supporting Chinese, English, multilingual text recognition | 12.4M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_train.tar) |
|ch_ppstructure_mobile_v2.0_SLANet|Chinese table recognition model trained on PubTabNet dataset based on SLANet|9.3M|[inference model](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_train.tar) | |ch_ppstructure_mobile_v2.0_SLANet|Chinese table recognition model based on SLANet|9.3M|[inference model](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_train.tar) |
### 7.3 KIE model ### 7.3 KIE model
......
...@@ -120,9 +120,9 @@ PP-Structure系列模型列表(更新中) ...@@ -120,9 +120,9 @@ PP-Structure系列模型列表(更新中)
|模型名称|模型简介|模型大小|下载地址| |模型名称|模型简介|模型大小|下载地址|
| --- | --- | --- | --- | | --- | --- | --- | --- |
|ch_PP-OCRv3_det_slim|【最新】slim量化+蒸馏版超轻量模型,支持中英文、多语种文本检测| 1.1M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_distill_train.tar)| |ch_PP-OCRv3_det| 【最新】超轻量模型,支持中英文、多语种文本检测 | 3.8M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar)|
|ch_PP-OCRv3_rec_slim |【最新】slim量化版超轻量模型,支持中英文、数字识别| 4.9M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_train.tar) | |ch_PP-OCRv3_rec|【最新】超轻量模型,支持中英文、数字识别|12.4M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_train.tar) |
|ch_ppstructure_mobile_v2.0_SLANet|基于SLANet在PubTabNet数据集上训练的中文表格识别模型|9.3M|[推理模型](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_train.tar) | |ch_ppstructure_mobile_v2.0_SLANet|基于SLANet的中文表格识别模型|9.3M|[推理模型](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_train.tar) |
<a name="73"></a> <a name="73"></a>
......
...@@ -4,7 +4,7 @@ ...@@ -4,7 +4,7 @@
- [1.1 版面分析+表格识别](#1.1) - [1.1 版面分析+表格识别](#1.1)
- [1.2 版面分析](#1.2) - [1.2 版面分析](#1.2)
- [1.3 表格识别](#1.3) - [1.3 表格识别](#1.3)
- [2. DocVQA](#2) - [2. 关键信息抽取](#2)
<a name="1"></a> <a name="1"></a>
## 1. Structure ## 1. Structure
...@@ -16,23 +16,26 @@ cd ppstructure ...@@ -16,23 +16,26 @@ cd ppstructure
下载模型 下载模型
```bash ```bash
mkdir inference && cd inference mkdir inference && cd inference
# 下载PP-OCRv2文本检测模型并解压 # 下载PP-Structurev2版面分析模型并解压
wget https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_slim_quant_infer.tar && tar xf ch_PP-OCRv2_det_slim_quant_infer.tar wget https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_layout_infer.tar && tar xf picodet_lcnet_x1_0_layout_infer.tar
# 下载PP-OCRv2文本识别模型并解压 # 下载PP-OCRv3文本检测模型并解压
wget https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_infer.tar && tar xf ch_PP-OCRv2_rec_slim_quant_infer.tar wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar && tar xf ch_PP-OCRv3_det_infer.tar
# 下载超轻量级英文表格预测模型并解压 # 下载PP-OCRv3文本识别模型并解压
wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar && tar xf en_ppocr_mobile_v2.0_table_structure_infer.tar wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar && tar xf ch_PP-OCRv3_rec_infer.tar
# 下载PP-Structurev2表格识别模型并解压
wget https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_infer.tar && tar xf ch_ppstructure_mobile_v2.0_SLANet_infer.tar
cd .. cd ..
``` ```
<a name="1.1"></a> <a name="1.1"></a>
### 1.1 版面分析+表格识别 ### 1.1 版面分析+表格识别
```bash ```bash
python3 predict_system.py --det_model_dir=inference/ch_PP-OCRv2_det_slim_quant_infer \ python3 predict_system.py --det_model_dir=inference/ch_PP-OCRv3_det_infer \
--rec_model_dir=inference/ch_PP-OCRv2_rec_slim_quant_infer \ --rec_model_dir=inference/ch_PP-OCRv3_rec_infer \
--table_model_dir=inference/en_ppocr_mobile_v2.0_table_structure_infer \ --table_model_dir=inference/ch_ppstructure_mobile_v2.0_SLANet_infer \
--layout_model_dir=inference/picodet_lcnet_x1_0_layout_infer \
--image_dir=./docs/table/1.png \ --image_dir=./docs/table/1.png \
--rec_char_dict_path=../ppocr/utils/ppocr_keys_v1.txt \ --rec_char_dict_path=../ppocr/utils/ppocr_keys_v1.txt \
--table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt \ --table_char_dict_path=../ppocr/utils/dict/table_structure_dict_ch.txt \
--output=../output \ --output=../output \
--vis_font_path=../doc/fonts/simfang.ttf --vis_font_path=../doc/fonts/simfang.ttf
``` ```
...@@ -41,19 +44,23 @@ python3 predict_system.py --det_model_dir=inference/ch_PP-OCRv2_det_slim_quant_i ...@@ -41,19 +44,23 @@ python3 predict_system.py --det_model_dir=inference/ch_PP-OCRv2_det_slim_quant_i
<a name="1.2"></a> <a name="1.2"></a>
### 1.2 版面分析 ### 1.2 版面分析
```bash ```bash
python3 predict_system.py --image_dir=./docs/table/1.png --table=false --ocr=false --output=../output/ python3 predict_system.py --layout_model_dir=inference/picodet_lcnet_x1_0_layout_infer \
--image_dir=./docs/table/1.png \
--output=../output \
--table=false \
--ocr=false
``` ```
运行完成后,每张图片会在`output`字段指定的目录下的`structure`目录下有一个同名目录,图片区域会被裁剪之后保存下来,图片名为表格在图片里的坐标。版面分析结果会存储在`res.txt`文件中。 运行完成后,每张图片会在`output`字段指定的目录下的`structure`目录下有一个同名目录,图片区域会被裁剪之后保存下来,图片名为表格在图片里的坐标。版面分析结果会存储在`res.txt`文件中。
<a name="1.3"></a> <a name="1.3"></a>
### 1.3 表格识别 ### 1.3 表格识别
```bash ```bash
python3 predict_system.py --det_model_dir=inference/ch_PP-OCRv2_det_slim_quant_infer \ python3 predict_system.py --det_model_dir=inference/ch_PP-OCRv3_det_infer \
--rec_model_dir=inference/ch_PP-OCRv2_rec_slim_quant_infer \ --rec_model_dir=inference/ch_PP-OCRv3_rec_infer \
--table_model_dir=inference/en_ppocr_mobile_v2.0_table_structure_infer \ --table_model_dir=inference/ch_ppstructure_mobile_v2.0_SLANet_infer \
--image_dir=./docs/table/table.jpg \ --image_dir=./docs/table/table.jpg \
--rec_char_dict_path=../ppocr/utils/ppocr_keys_v1.txt \ --rec_char_dict_path=../ppocr/utils/ppocr_keys_v1.txt \
--table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt \ --table_char_dict_path=../ppocr/utils/dict/table_structure_dict_ch.txt \
--output=../output \ --output=../output \
--vis_font_path=../doc/fonts/simfang.ttf \ --vis_font_path=../doc/fonts/simfang.ttf \
--layout=false --layout=false
...@@ -61,20 +68,22 @@ python3 predict_system.py --det_model_dir=inference/ch_PP-OCRv2_det_slim_quant_i ...@@ -61,20 +68,22 @@ python3 predict_system.py --det_model_dir=inference/ch_PP-OCRv2_det_slim_quant_i
运行完成后,每张图片会在`output`字段指定的目录下的`structure`目录下有一个同名目录,表格会存储为一个excel,excel文件名为`[0,0,img_h,img_w]`。 运行完成后,每张图片会在`output`字段指定的目录下的`structure`目录下有一个同名目录,表格会存储为一个excel,excel文件名为`[0,0,img_h,img_w]`。
<a name="2"></a> <a name="2"></a>
## 2. DocVQA ## 2. 关键信息抽取
```bash ```bash
cd ppstructure cd ppstructure
# 下载模型
mkdir inference && cd inference mkdir inference && cd inference
# 下载SER xfun 模型并解压 # 下载SER XFUND 模型并解压
wget https://paddleocr.bj.bcebos.com/pplayout/PP-Layout_v1.0_ser_pretrained.tar && tar xf PP-Layout_v1.0_ser_pretrained.tar wget https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_infer.tar && tar -xf ser_vi_layoutxlm_xfund_infer.tar
cd .. cd ..
python3 kie/predict_kie_token_ser.py \
python3 predict_system.py --model_name_or_path=kie/PP-Layout_v1.0_ser_pretrained/ \ --kie_algorithm=LayoutXLM \
--mode=kie \ --ser_model_dir=../inference/ser_vi_layoutxlm_xfund_infer \
--image_dir=kie/images/input/zh_val_0.jpg \ --image_dir=./docs/kie/input/zh_val_42.jpg \
--vis_font_path=../doc/fonts/simfang.ttf --ser_dict_path=../ppocr/utils/dict/kie_dict/xfund_class_list.txt \
--vis_font_path=../doc/fonts/simfang.ttf \
--ocr_order_method="tb-yx"
``` ```
运行完成后,每张图片会在`output`字段指定的目录下的`kie`目录下存放可视化之后的图片,图片名和输入图片名一致。 运行完成后,每张图片会在`output`字段指定的目录下的`kie`目录下存放可视化之后的图片,图片名和输入图片名一致。
...@@ -4,7 +4,7 @@ ...@@ -4,7 +4,7 @@
- [1.1 layout analysis + table recognition](#1.1) - [1.1 layout analysis + table recognition](#1.1)
- [1.2 layout analysis](#1.2) - [1.2 layout analysis](#1.2)
- [1.3 table recognition](#1.3) - [1.3 table recognition](#1.3)
- [2. DocVQA](#2) - [2. KIE](#2)
<a name="1"></a> <a name="1"></a>
## 1. Structure ## 1. Structure
...@@ -18,23 +18,26 @@ download model ...@@ -18,23 +18,26 @@ download model
```bash ```bash
mkdir inference && cd inference mkdir inference && cd inference
# Download the PP-OCRv2 text detection model and unzip it # Download the PP-Structurev2 layout analysis model and unzip it
wget https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_slim_quant_infer.tar && tar xf ch_PP-OCRv2_det_slim_quant_infer.tar wget https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_layout_infer.tar && tar xf picodet_lcnet_x1_0_layout_infer.tar
# Download the PP-OCRv2 text recognition model and unzip it # Download the PP-OCRv3 text detection model and unzip it
wget https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_infer.tar && tar xf ch_PP-OCRv2_rec_slim_quant_infer.tar wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar && tar xf ch_PP-OCRv3_det_infer.tar
# Download the ultra-lightweight English table structure model and unzip it # Download the PP-OCRv3 text recognition model and unzip it
wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar && tar xf en_ppocr_mobile_v2.0_table_structure_infer.tar wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar && tar xf ch_PP-OCRv3_rec_infer.tar
# Download the PP-Structurev2 table recognition model and unzip it
wget https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_infer.tar && tar xf ch_ppstructure_mobile_v2.0_SLANet_infer.tar
cd .. cd ..
``` ```
<a name="1.1"></a> <a name="1.1"></a>
### 1.1 layout analysis + table recognition ### 1.1 layout analysis + table recognition
```bash ```bash
python3 predict_system.py --det_model_dir=inference/ch_PP-OCRv2_det_slim_quant_infer \ python3 predict_system.py --det_model_dir=inference/ch_PP-OCRv3_det_infer \
--rec_model_dir=inference/ch_PP-OCRv2_rec_slim_quant_infer \ --rec_model_dir=inference/ch_PP-OCRv3_rec_infer \
--table_model_dir=inference/en_ppocr_mobile_v2.0_table_structure_infer \ --table_model_dir=inference/ch_ppstructure_mobile_v2.0_SLANet_infer \
--layout_model_dir=inference/picodet_lcnet_x1_0_layout_infer \
--image_dir=./docs/table/1.png \ --image_dir=./docs/table/1.png \
--rec_char_dict_path=../ppocr/utils/ppocr_keys_v1.txt \ --rec_char_dict_path=../ppocr/utils/ppocr_keys_v1.txt \
--table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt \ --table_char_dict_path=../ppocr/utils/dict/table_structure_dict_ch.txt \
--output=../output \ --output=../output \
--vis_font_path=../doc/fonts/simfang.ttf --vis_font_path=../doc/fonts/simfang.ttf
``` ```
...@@ -43,19 +46,23 @@ After the operation is completed, each image will have a directory with the same ...@@ -43,19 +46,23 @@ After the operation is completed, each image will have a directory with the same
<a name="1.2"></a> <a name="1.2"></a>
### 1.2 layout analysis ### 1.2 layout analysis
```bash ```bash
python3 predict_system.py --image_dir=./docs/table/1.png --table=false --ocr=false --output=../output/ python3 predict_system.py --layout_model_dir=inference/picodet_lcnet_x1_0_layout_infer \
--image_dir=./docs/table/1.png \
--output=../output \
--table=false \
--ocr=false
``` ```
After the operation is completed, each image will have a directory with the same name in the `structure` directory under the directory specified by the `output` field. Each picture region in the image will be cropped and saved, and the filename of each cropped region is its coordinates in the image. Layout analysis results will be stored in the `res.txt` file. After the operation is completed, each image will have a directory with the same name in the `structure` directory under the directory specified by the `output` field. Each picture region in the image will be cropped and saved, and the filename of each cropped region is its coordinates in the image. Layout analysis results will be stored in the `res.txt` file.
<a name="1.3"></a> <a name="1.3"></a>
### 1.3 table recognition ### 1.3 table recognition
```bash ```bash
python3 predict_system.py --det_model_dir=inference/ch_PP-OCRv2_det_slim_quant_infer \ python3 predict_system.py --det_model_dir=inference/ch_PP-OCRv3_det_infer \
--rec_model_dir=inference/ch_PP-OCRv2_rec_slim_quant_infer \ --rec_model_dir=inference/ch_PP-OCRv3_rec_infer \
--table_model_dir=inference/en_ppocr_mobile_v2.0_table_structure_infer \ --table_model_dir=inference/ch_ppstructure_mobile_v2.0_SLANet_infer \
--image_dir=./docs/table/table.jpg \ --image_dir=./docs/table/table.jpg \
--rec_char_dict_path=../ppocr/utils/ppocr_keys_v1.txt \ --rec_char_dict_path=../ppocr/utils/ppocr_keys_v1.txt \
--table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt \ --table_char_dict_path=../ppocr/utils/dict/table_structure_dict_ch.txt \
--output=../output \ --output=../output \
--vis_font_path=../doc/fonts/simfang.ttf \ --vis_font_path=../doc/fonts/simfang.ttf \
--layout=false --layout=false
...@@ -63,19 +70,22 @@ python3 predict_system.py --det_model_dir=inference/ch_PP-OCRv2_det_slim_quant_i ...@@ -63,19 +70,22 @@ python3 predict_system.py --det_model_dir=inference/ch_PP-OCRv2_det_slim_quant_i
After the operation is completed, each image will have a directory with the same name in the `structure` directory under the directory specified by the `output` field. Each table in the image will be stored as an Excel file, and the filename of each Excel file is the table's coordinates in the image. After the operation is completed, each image will have a directory with the same name in the `structure` directory under the directory specified by the `output` field. Each table in the image will be stored as an Excel file, and the filename of each Excel file is the table's coordinates in the image.
<a name="2"></a> <a name="2"></a>
## 2. DocVQA ## 2. KIE
```bash ```bash
cd ppstructure cd ppstructure
# download model
mkdir inference && cd inference mkdir inference && cd inference
wget https://paddleocr.bj.bcebos.com/pplayout/PP-Layout_v1.0_ser_pretrained.tar && tar xf PP-Layout_v1.0_ser_pretrained.tar # download model
wget https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_infer.tar && tar -xf ser_vi_layoutxlm_xfund_infer.tar
cd .. cd ..
python3 kie/predict_kie_token_ser.py \
python3 predict_system.py --model_name_or_path=kie/PP-Layout_v1.0_ser_pretrained/ \ --kie_algorithm=LayoutXLM \
--mode=kie \ --ser_model_dir=../inference/ser_vi_layoutxlm_xfund_infer \
--image_dir=kie/images/input/zh_val_0.jpg \ --image_dir=./docs/kie/input/zh_val_42.jpg \
--vis_font_path=../doc/fonts/simfang.ttf --ser_dict_path=../ppocr/utils/dict/kie_dict/xfund_class_list.txt \
--vis_font_path=../doc/fonts/simfang.ttf \
--ocr_order_method="tb-yx"
``` ```
After the operation is completed, each image will store the visualized image in the `kie` directory under the directory specified by the `output` field, and the image name is the same as the input image name. After the operation is completed, each image will store the visualized image in the `kie` directory under the directory specified by the `output` field, and the image name is the same as the input image name.
- [快速安装](#快速安装) - [快速安装](#快速安装)
- [1. PaddlePaddle 和 PaddleOCR](#1-paddlepaddle-和-paddleocr) - [1. PaddlePaddle 和 PaddleOCR](#1-paddlepaddle-和-paddleocr)
- [2. 安装其他依赖](#2-安装其他依赖) - [2. 安装其他依赖](#2-安装其他依赖)
- [2.1 VQA所需依赖](#21--kie所需依赖) - [2.1 KIE所需依赖](#21-kie所需依赖)
# 快速安装 # 快速安装
...@@ -11,16 +11,11 @@ ...@@ -11,16 +11,11 @@
## 2. 安装其他依赖 ## 2. 安装其他依赖
### 2.1 VQA所需依赖 ### 2.1 KIE所需依赖
* paddleocr
```bash * paddleocr
pip3 install paddleocr
```
* PaddleNLP
```bash ```bash
git clone https://github.com/PaddlePaddle/PaddleNLP -b develop pip install paddleocr -U
cd PaddleNLP pip install -r ./kie/requirements.txt
pip3 install -e .
``` ```
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
- [1. PaddlePaddle and PaddleOCR](#1) - [1. PaddlePaddle and PaddleOCR](#1)
- [2. Install other dependencies](#2) - [2. Install other dependencies](#2)
- [2.1 VQA](#21) - [2.1 KIE](#21)
<a name="1"></a> <a name="1"></a>
...@@ -14,17 +14,11 @@ Please refer to [PaddleOCR installation documentation](../../doc/doc_en/installa ...@@ -14,17 +14,11 @@ Please refer to [PaddleOCR installation documentation](../../doc/doc_en/installa
## 2. Install other dependencies ## 2. Install other dependencies
<a name="21"></a> <a name="21"></a>
### 2.1 VQA ### 2.1 KIE
* paddleocr * paddleocr
```bash ```bash
pip3 install paddleocr pip install paddleocr -U
``` pip install -r ./kie/requirements.txt
* PaddleNLP
```bash
git clone https://github.com/PaddlePaddle/PaddleNLP -b develop
cd PaddleNLP
pip3 install -e .
``` ```
...@@ -10,13 +10,17 @@ ...@@ -10,13 +10,17 @@
<a name="1"></a> <a name="1"></a>
## 1. 版面分析模型 ## 1. 版面分析模型
|模型名称|模型简介|下载地址|label_map| |模型名称|模型简介|推理模型大小|下载地址|dict path|
| --- | --- | --- | --- | | --- | --- | --- | --- | --- |
| ppyolov2_r50vd_dcn_365e_publaynet | PubLayNet 数据集训练的版面分析模型,可以划分**文字、标题、表格、图片以及列表**5类区域 | [推理模型](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_publaynet.tar) / [训练模型](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_publaynet_pretrained.pdparams) |{0: "Text", 1: "Title", 2: "List", 3:"Table", 4:"Figure"}| | picodet_lcnet_x1_0_fgd_layout | 基于PicoDet LCNet_x1_0和FGD蒸馏在PubLayNet 数据集训练的英文版面分析模型,可以划分**文字、标题、表格、图片以及列表**5类区域 | 9.7M | [推理模型](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout.pdparams) | [PubLayNet dict](../../ppocr/utils/dict/layout_dict/layout_publaynet_dict.txt) |
| ppyolov2_r50vd_dcn_365e_tableBank_word | TableBank Word 数据集训练的版面分析模型,只能检测表格 | [推理模型](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_tableBank_word.tar) | {0:"Table"}| | ppyolov2_r50vd_dcn_365e_publaynet | 基于PP-YOLOv2在PubLayNet数据集上训练的英文版面分析模型 | 221M | [推理模型](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_publaynet.tar) / [训练模型](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_publaynet_pretrained.pdparams) | 同上 |
| ppyolov2_r50vd_dcn_365e_tableBank_latex | TableBank Latex 数据集训练的版面分析模型,只能检测表格 | [推理模型](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_tableBank_latex.tar) | {0:"Table"}| | picodet_lcnet_x1_0_fgd_layout_cdla | CDLA数据集训练的中文版面分析模型,可以划分为**文本、标题、图片、图片标题、表格、表格标题、页眉、页脚、引用、公式**10类区域 | 9.7M | [推理模型](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_cdla_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_cdla.pdparams) | [CDLA dict](../../ppocr/utils/dict/layout_dict/layout_cdla_dict.txt) |
| picodet_lcnet_x1_0_fgd_layout_table | 表格数据集训练的版面分析模型,支持中英文文档表格区域的检测 | 9.7M | [推理模型](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_table_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_table.pdparams) | [Table dict](../../ppocr/utils/dict/layout_dict/layout_table_dict.txt) |
| ppyolov2_r50vd_dcn_365e_tableBank_word | 基于PP-YOLOv2在TableBank Word 数据集训练的版面分析模型,支持英文文档表格区域的检测 | 221M | [推理模型](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_tableBank_word.tar) | 同上 |
| ppyolov2_r50vd_dcn_365e_tableBank_latex | 基于PP-YOLOv2在TableBank Latex数据集训练的版面分析模型,支持英文文档表格区域的检测 | 221M | [推理模型](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_tableBank_latex.tar) | 同上 |
<a name="2"></a> <a name="2"></a>
## 2. OCR和表格识别模型 ## 2. OCR和表格识别模型
<a name="21"></a> <a name="21"></a>
...@@ -24,8 +28,8 @@ ...@@ -24,8 +28,8 @@
|模型名称|模型简介|推理模型大小|下载地址| |模型名称|模型简介|推理模型大小|下载地址|
| --- | --- | --- | --- | | --- | --- | --- | --- |
|en_ppocr_mobile_v2.0_table_det|PubLayNet数据集训练的英文表格场景的文字检测|4.7M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/table/en_ppocr_mobile_v2.0_table_det_train.tar) | |en_ppocr_mobile_v2.0_table_det|PubTabNet数据集训练的英文表格场景的文字检测|4.7M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/table/en_ppocr_mobile_v2.0_table_det_train.tar) |
|en_ppocr_mobile_v2.0_table_rec|PubLayNet数据集训练的英文表格场景的文字识别|6.9M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/table/en_ppocr_mobile_v2.0_table_rec_train.tar) | |en_ppocr_mobile_v2.0_table_rec|PubTabNet数据集训练的英文表格场景的文字识别|6.9M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/table/en_ppocr_mobile_v2.0_table_rec_train.tar) |
如需要使用其他OCR模型,可以在 [PP-OCR model_list](../../doc/doc_ch/models_list.md) 下载模型或者使用自己训练好的模型配置到 `det_model_dir`, `rec_model_dir`两个字段即可。 如需要使用其他OCR模型,可以在 [PP-OCR model_list](../../doc/doc_ch/models_list.md) 下载模型或者使用自己训练好的模型配置到 `det_model_dir`, `rec_model_dir`两个字段即可。
...@@ -36,7 +40,7 @@ ...@@ -36,7 +40,7 @@
| --- | --- | --- | --- | | --- | --- | --- | --- |
|en_ppocr_mobile_v2.0_table_structure|基于TableRec-RARE在PubTabNet数据集上训练的英文表格识别模型|6.8M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/table/en_ppocr_mobile_v2.0_table_structure_train.tar) | |en_ppocr_mobile_v2.0_table_structure|基于TableRec-RARE在PubTabNet数据集上训练的英文表格识别模型|6.8M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/table/en_ppocr_mobile_v2.0_table_structure_train.tar) |
|en_ppstructure_mobile_v2.0_SLANet|基于SLANet在PubTabNet数据集上训练的英文表格识别模型|9.2M|[推理模型](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/en_ppstructure_mobile_v2.0_SLANet_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/en_ppstructure_mobile_v2.0_SLANet_train.tar) | |en_ppstructure_mobile_v2.0_SLANet|基于SLANet在PubTabNet数据集上训练的英文表格识别模型|9.2M|[推理模型](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/en_ppstructure_mobile_v2.0_SLANet_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/en_ppstructure_mobile_v2.0_SLANet_train.tar) |
|ch_ppstructure_mobile_v2.0_SLANet|基于SLANet在PubTabNet数据集上训练的中文表格识别模型|9.3M|[推理模型](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_train.tar) | |ch_ppstructure_mobile_v2.0_SLANet|基于SLANet的中文表格识别模型|9.3M|[推理模型](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_train.tar) |
<a name="3"></a> <a name="3"></a>
......
...@@ -4,18 +4,20 @@ ...@@ -4,18 +4,20 @@
- [2. OCR and Table Recognition](#2-ocr-and-table-recognition) - [2. OCR and Table Recognition](#2-ocr-and-table-recognition)
- [2.1 OCR](#21-ocr) - [2.1 OCR](#21-ocr)
- [2.2 Table Recognition](#22-table-recognition) - [2.2 Table Recognition](#22-table-recognition)
- [3. VQA](#3-kie) - [3. KIE](#3-kie)
- [4. KIE](#4-kie)
<a name="1"></a> <a name="1"></a>
## 1. Layout Analysis ## 1. Layout Analysis
|model name| description |download|label_map| |model name| description | inference model size |download|dict path|
| --- |---------------------------------------------------------------------------------------------------------------------------------------------------------| --- | --- | | --- |---------------------------------------------------------------------------------------------------------------------------------------------------------| --- | --- | --- |
| ppyolov2_r50vd_dcn_365e_publaynet | The layout analysis model trained on the PubLayNet dataset, the model can recognize 5 types of areas such as **text, title, table, picture and list** | [inference model](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_publaynet.tar) / [trained model](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_publaynet_pretrained.pdparams) |{0: "Text", 1: "Title", 2: "List", 3:"Table", 4:"Figure"}| | picodet_lcnet_x1_0_fgd_layout | The layout analysis English model trained on the PubLayNet dataset based on PicoDet LCNet_x1_0 and FGD. The model can recognize 5 types of areas such as **Text, Title, Table, Picture and List** | 9.7M | [inference model](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout.pdparams) | [PubLayNet dict](../../ppocr/utils/dict/layout_dict/layout_publaynet_dict.txt) |
| ppyolov2_r50vd_dcn_365e_tableBank_word | The layout analysis model trained on the TableBank Word dataset, the model can only detect tables | [inference model](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_tableBank_word.tar) | {0:"Table"}| | ppyolov2_r50vd_dcn_365e_publaynet | The layout analysis English model trained on the PubLayNet dataset based on PP-YOLOv2 | 221M | [inference model](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_publaynet.tar) / [trained model](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_publaynet_pretrained.pdparams) | same as above |
| ppyolov2_r50vd_dcn_365e_tableBank_latex | The layout analysis model trained on the TableBank Latex dataset, the model can only detect tables | [inference model](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_tableBank_latex.tar) | {0:"Table"}| | picodet_lcnet_x1_0_fgd_layout_cdla | The layout analysis Chinese model trained on the CDLA dataset, the model can recognize 10 types of areas such as **Text、Title、Figure、Figure caption、Table、Table caption、Header、Footer、Reference、Equation** | 9.7M | [inference model](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_cdla_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_cdla.pdparams) | [CDLA dict](../../ppocr/utils/dict/layout_dict/layout_cdla_dict.txt) |
| picodet_lcnet_x1_0_fgd_layout_table | The layout analysis model trained on the table dataset, the model can detect tables in Chinese and English documents | 9.7M | [inference model](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_table_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_table.pdparams) | [Table dict](../../ppocr/utils/dict/layout_dict/layout_table_dict.txt) |
| ppyolov2_r50vd_dcn_365e_tableBank_word | The layout analysis model trained on the TableBank Word dataset based on PP-YOLOv2, the model can detect tables in English documents | 221M | [inference model](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_tableBank_word.tar) | same as above |
| ppyolov2_r50vd_dcn_365e_tableBank_latex | The layout analysis model trained on the TableBank Latex dataset based on PP-YOLOv2, the model can detect tables in English documents | 221M | [inference model](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_tableBank_latex.tar) | same as above |
<a name="2"></a> <a name="2"></a>
## 2. OCR and Table Recognition ## 2. OCR and Table Recognition
...@@ -37,22 +39,28 @@ If you need to use other OCR models, you can download the model in [PP-OCR model ...@@ -37,22 +39,28 @@ If you need to use other OCR models, you can download the model in [PP-OCR model
| --- |-----------------------------------------------------------------------------| --- | --- | | --- |-----------------------------------------------------------------------------| --- | --- |
|en_ppocr_mobile_v2.0_table_structure| English table recognition model trained on PubTabNet dataset based on TableRec-RARE |6.8M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/table/en_ppocr_mobile_v2.0_table_structure_train.tar) | |en_ppocr_mobile_v2.0_table_structure| English table recognition model trained on PubTabNet dataset based on TableRec-RARE |6.8M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/table/en_ppocr_mobile_v2.0_table_structure_train.tar) |
|en_ppstructure_mobile_v2.0_SLANet|English table recognition model trained on PubTabNet dataset based on SLANet|9.2M|[inference model](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/en_ppstructure_mobile_v2.0_SLANet_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/en_ppstructure_mobile_v2.0_SLANet_train.tar) | |en_ppstructure_mobile_v2.0_SLANet|English table recognition model trained on PubTabNet dataset based on SLANet|9.2M|[inference model](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/en_ppstructure_mobile_v2.0_SLANet_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/en_ppstructure_mobile_v2.0_SLANet_train.tar) |
|ch_ppstructure_mobile_v2.0_SLANet|Chinese table recognition model trained on PubTabNet dataset based on SLANet|9.3M|[inference model](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_train.tar) | |ch_ppstructure_mobile_v2.0_SLANet|Chinese table recognition model based on SLANet|9.3M|[inference model](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_train.tar) |
<a name="3"></a> <a name="3"></a>
## 3. VQA ## 3. KIE
|model| description |inference model size|download| On XFUND_zh dataset, Accuracy and time cost of different models on V100 GPU are as follows.
| --- |----------------------------------------------------------------| --- | --- |
|ser_LayoutXLM_xfun_zh| SER model trained on xfun Chinese dataset based on LayoutXLM |1.4G|[inference model](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh.tar) | |Model|Backbone|Task|Config|Hmean|Time cost(ms)|Download link|
|re_LayoutXLM_xfun_zh| Re model trained on xfun Chinese dataset based on LayoutXLM |1.4G|[inference model coming soon]() / [trained model](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutXLM_xfun_zh.tar) | | --- | --- | --- | --- | --- | --- |--- |
|ser_LayoutLMv2_xfun_zh| SER model trained on xfun Chinese dataset based on LayoutXLMv2 |778M|[inference model](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLMv2_xfun_zh_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLMv2_xfun_zh.tar) | |VI-LayoutXLM| VI-LayoutXLM-base | SER | [ser_vi_layoutxlm_xfund_zh_udml.yml](../../configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh_udml.yml)|**93.19%**| 15.49| [trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_pretrained.tar)|
|re_LayoutLMv2_xfun_zh| Re model trained on xfun Chinese dataset based on LayoutXLMv2 |765M|[inference model coming soon]() / [trained model](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutLMv2_xfun_zh.tar) | |LayoutXLM| LayoutXLM-base | SER | [ser_layoutxlm_xfund_zh.yml](../../configs/kie/layoutlm_series/ser_layoutxlm_xfund_zh.yml)|90.38%| 19.49 |[trained model](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh.tar)|
|ser_LayoutLM_xfun_zh| SER model trained on xfun Chinese dataset based on LayoutLM |430M|[inference model](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLM_xfun_zh_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLM_xfun_zh.tar) | |LayoutLM| LayoutLM-base | SER | [ser_layoutlm_xfund_zh.yml](../../configs/kie/layoutlm_series/ser_layoutlm_xfund_zh.yml)|77.31%|-|[trained model](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLM_xfun_zh.tar)|
|LayoutLMv2| LayoutLMv2-base | SER | [ser_layoutlmv2_xfund_zh.yml](../../configs/kie/layoutlm_series/ser_layoutlmv2_xfund_zh.yml)|85.44%|31.46|[trained model](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLMv2_xfun_zh.tar)|
<a name="4"></a> |VI-LayoutXLM| VI-LayoutXLM-base | RE | [re_vi_layoutxlm_xfund_zh_udml.yml](../../configs/kie/vi_layoutxlm/re_vi_layoutxlm_xfund_zh_udml.yml)|**83.92%**|15.49|[trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/re_vi_layoutxlm_xfund_pretrained.tar)|
## 4. KIE |LayoutXLM| LayoutXLM-base | RE | [re_layoutxlm_xfund_zh.yml](../../configs/kie/layoutlm_series/re_layoutxlm_xfund_zh.yml)|74.83%|19.49|[trained model](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutXLM_xfun_zh.tar)|
|LayoutLMv2| LayoutLMv2-base | RE | [re_layoutlmv2_xfund_zh.yml](../../configs/kie/layoutlm_series/re_layoutlmv2_xfund_zh.yml)|67.77%|31.46|[trained model](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutLMv2_xfun_zh.tar)|
|model|description|model size|download|
| --- | --- | --- | --- | * Note: The above time cost information just considers inference time without preprocess or postprocess, test environment: `V100 GPU + CUDA 10.2 + CUDNN 8.1.1 + TRT 7.2.3.4`
|SDMGR|Key Information Extraction Model|78M|[inference model coming soon]() / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/kie/kie_vgg16.tar)|
On wildreceipt dataset, the algorithm result is as follows:
|Model|Backbone|Config|Hmean|Download link|
| --- | --- | --- | --- | --- |
|SDMGR|VGG6|[configs/kie/sdmgr/kie_unet_sdmgr.yml](../../configs/kie/sdmgr/kie_unet_sdmgr.yml)|86.7%|[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/kie/kie_vgg16.tar)|
...@@ -7,16 +7,22 @@ ...@@ -7,16 +7,22 @@
- [2.1.2 版面分析+表格识别](#212-版面分析表格识别) - [2.1.2 版面分析+表格识别](#212-版面分析表格识别)
- [2.1.3 版面分析](#213-版面分析) - [2.1.3 版面分析](#213-版面分析)
- [2.1.4 表格识别](#214-表格识别) - [2.1.4 表格识别](#214-表格识别)
- [2.1.5 DocVQA](#215-dockie) - [2.1.5 关键信息抽取](#215-关键信息抽取)
- [2.1.6 版面恢复](#216-版面恢复)
- [2.2 代码使用](#22-代码使用) - [2.2 代码使用](#22-代码使用)
- [2.2.1 图像方向分类版面分析表格识别](#221-图像方向分类版面分析表格识别)
- [2.2.1 图像方向分类+版面分析+表格识别](#221-图像方向分类版面分析表格识别)
- [2.2.2 版面分析+表格识别](#222-版面分析表格识别) - [2.2.2 版面分析+表格识别](#222-版面分析表格识别)
- [2.2.3 版面分析](#223-版面分析) - [2.2.3 版面分析](#223-版面分析)
- [2.2.4 表格识别](#224-表格识别) - [2.2.4 表格识别](#224-表格识别)
- [2.2.5 DocVQA](#225-dockie)
- [2.2.5 关键信息抽取](#225-关键信息抽取)
- [2.2.6 版面恢复](#226-版面恢复)
- [2.3 返回结果说明](#23-返回结果说明) - [2.3 返回结果说明](#23-返回结果说明)
- [2.3.1 版面分析+表格识别](#231-版面分析表格识别) - [2.3.1 版面分析+表格识别](#231-版面分析表格识别)
- [2.3.2 DocVQA](#232-dockie) - [2.3.2 关键信息抽取](#232-关键信息抽取)
- [2.4 参数说明](#24-参数说明) - [2.4 参数说明](#24-参数说明)
...@@ -24,11 +30,12 @@ ...@@ -24,11 +30,12 @@
## 1. 安装依赖包 ## 1. 安装依赖包
```bash ```bash
# 安装 paddleocr,推荐使用2.5+版本 # 安装 paddleocr,推荐使用2.6版本
pip3 install "paddleocr>=2.5" pip3 install "paddleocr>=2.6"
# 安装 DocVQA依赖包paddlenlp(如不需要DocVQA功能,可跳过) # 安装 关键信息抽取 依赖包(如不需要KIE功能,可跳过)
pip install paddlenlp pip install -r kie/requirements.txt
# 安装 图像方向分类依赖包paddleclas(如不需要图像方向分类功能,可跳过)
pip3 install paddleclas
``` ```
<a name="2"></a> <a name="2"></a>
...@@ -62,15 +69,24 @@ paddleocr --image_dir=PaddleOCR/ppstructure/docs/table/table.jpg --type=structur ...@@ -62,15 +69,24 @@ paddleocr --image_dir=PaddleOCR/ppstructure/docs/table/table.jpg --type=structur
``` ```
<a name="215"></a> <a name="215"></a>
#### 2.1.5 DocVQA
请参考:[文档视觉问答](../kie/README.md) #### 2.1.5 关键信息抽取
请参考:[关键信息抽取教程](../kie/README_ch.md)
<a name="216"></a>
#### 2.1.6 版面恢复
```bash
paddleocr --image_dir=PaddleOCR/ppstructure/docs/table/1.png --type=structure --recovery=true
```
<a name="22"></a> <a name="22"></a>
### 2.2 代码使用 ### 2.2 代码使用
<a name="221"></a> <a name="221"></a>
#### 2.2.1 图像方向分类版面分析表格识别 #### 2.2.1 图像方向分类+版面分析+表格识别
```python ```python
import os import os
...@@ -149,6 +165,7 @@ for line in result: ...@@ -149,6 +165,7 @@ for line in result:
``` ```
<a name="224"></a> <a name="224"></a>
#### 2.2.4 表格识别 #### 2.2.4 表格识别
```python ```python
...@@ -170,9 +187,36 @@ for line in result: ...@@ -170,9 +187,36 @@ for line in result:
``` ```
<a name="225"></a> <a name="225"></a>
#### 2.2.5 DocVQA #### 2.2.5 关键信息抽取
请参考:[关键信息抽取教程](../kie/README_ch.md)
请参考:[文档视觉问答](../kie/README.md) <a name="226"></a>
#### 2.2.6 版面恢复
```python
import os
import cv2
from paddleocr import PPStructure,save_structure_res
from paddleocr.ppstructure.recovery.recovery_to_doc import sorted_layout_boxes, convert_info_docx
table_engine = PPStructure(layout=False, show_log=True)
save_folder = './output'
img_path = 'PaddleOCR/ppstructure/docs/table/1.png'
img = cv2.imread(img_path)
result = table_engine(img)
save_structure_res(result, save_folder, os.path.basename(img_path).split('.')[0])
for line in result:
line.pop('img')
print(line)
h, w, _ = img.shape
res = sorted_layout_boxes(result, w)
convert_info_docx(img, res, save_folder, os.path.basename(img_path).split('.')[0])
```
<a name="23"></a> <a name="23"></a>
### 2.3 返回结果说明 ### 2.3 返回结果说明
...@@ -208,9 +252,9 @@ dict 里各个字段说明如下 ...@@ -208,9 +252,9 @@ dict 里各个字段说明如下
``` ```
<a name="232"></a> <a name="232"></a>
#### 2.3.2 DocVQA #### 2.3.2 关键信息抽取
请参考:[文档视觉问答](../kie/README.md) 请参考:[关键信息抽取教程](../kie/README_ch.md)
<a name="24"></a> <a name="24"></a>
### 2.4 参数说明 ### 2.4 参数说明
...@@ -235,6 +279,7 @@ dict 里各个字段说明如下 ...@@ -235,6 +279,7 @@ dict 里各个字段说明如下
| table | 前向中是否执行表格识别 | True | | table | 前向中是否执行表格识别 | True |
| ocr | 对于版面分析中的非表格区域,是否执行ocr。当layout为False时会被自动设置为False| True | | ocr | 对于版面分析中的非表格区域,是否执行ocr。当layout为False时会被自动设置为False| True |
| recovery | 前向中是否执行版面恢复| False | | recovery | 前向中是否执行版面恢复| False |
| save_pdf | 版面恢复导出docx文件的同时,是否导出pdf文件 | False |
| structure_version | 模型版本,可选 PP-structure和PP-structurev2 | PP-structure | | structure_version | 模型版本,可选 PP-structure和PP-structurev2 | PP-structure |
大部分参数和PaddleOCR whl包保持一致,见 [whl包文档](../../doc/doc_ch/whl.md) 大部分参数和PaddleOCR whl包保持一致,见 [whl包文档](../../doc/doc_ch/whl.md)
...@@ -7,16 +7,19 @@ ...@@ -7,16 +7,19 @@
- [2.1.2 layout analysis + table recognition](#212-layout-analysis--table-recognition) - [2.1.2 layout analysis + table recognition](#212-layout-analysis--table-recognition)
- [2.1.3 layout analysis](#213-layout-analysis) - [2.1.3 layout analysis](#213-layout-analysis)
- [2.1.4 table recognition](#214-table-recognition) - [2.1.4 table recognition](#214-table-recognition)
- [2.1.5 DocVQA](#215-dockie) - [2.1.5 Key Information Extraction](#215-Key-Information-Extraction)
- [2.1.6 layout recovery](#216-layout-recovery)
- [2.2 Use by code](#22-use-by-code) - [2.2 Use by code](#22-use-by-code)
- [2.2.1 image orientation + layout analysis + table recognition](#221-image-orientation--layout-analysis--table-recognition) - [2.2.1 image orientation + layout analysis + table recognition](#221-image-orientation--layout-analysis--table-recognition)
- [2.2.2 layout analysis + table recognition](#222-layout-analysis--table-recognition) - [2.2.2 layout analysis + table recognition](#222-layout-analysis--table-recognition)
- [2.2.3 layout analysis](#223-layout-analysis) - [2.2.3 layout analysis](#223-layout-analysis)
- [2.2.4 table recognition](#224-table-recognition) - [2.2.4 table recognition](#224-table-recognition)
- [2.2.5 DocVQA](#225-dockie) - [2.2.5 DocVQA](#225-dockie)
- [2.2.5 Key Information Extraction](#225-Key-Information-Extraction)
- [2.2.6 layout recovery](#226-layout-recovery)
- [2.3 Result description](#23-result-description) - [2.3 Result description](#23-result-description)
- [2.3.1 layout analysis + table recognition](#231-layout-analysis--table-recognition) - [2.3.1 layout analysis + table recognition](#231-layout-analysis--table-recognition)
- [2.3.2 DocVQA](#232-dockie) - [2.3.2 Key Information Extraction](#232-Key-Information-Extraction)
- [2.4 Parameter Description](#24-parameter-description) - [2.4 Parameter Description](#24-parameter-description)
...@@ -24,14 +27,16 @@ ...@@ -24,14 +27,16 @@
## 1. Install package ## 1. Install package
```bash ```bash
# Install paddleocr, version 2.5+ is recommended # Install paddleocr, version 2.6 is recommended
pip3 install "paddleocr>=2.5" pip3 install "paddleocr>=2.6"
# Install the DocVQA dependency package paddlenlp (if you do not use the DocVQA, you can skip it) # Install the KIE dependency packages (if you do not use the KIE, you can skip it)
pip install paddlenlp pip install -r kie/requirements.txt
# Install the image direction classification dependency package paddleclas (if you do not use the image direction classification, you can skip it)
pip3 install paddleclas
``` ```
<a name="2"></a> <a name="2"></a>
## 2. Use ## 2. Use
<a name="21"></a> <a name="21"></a>
...@@ -62,9 +67,15 @@ paddleocr --image_dir=PaddleOCR/ppstructure/docs/table/table.jpg --type=structur ...@@ -62,9 +67,15 @@ paddleocr --image_dir=PaddleOCR/ppstructure/docs/table/table.jpg --type=structur
``` ```
<a name="215"></a> <a name="215"></a>
#### 2.1.5 DocVQA #### 2.1.5 Key Information Extraction
Please refer to: [Documentation Visual Q&A](../kie/README.md) . Please refer to: [Key Information Extraction](../kie/README.md) .
<a name="216"></a>
#### 2.1.6 layout recovery
```bash
paddleocr --image_dir=PaddleOCR/ppstructure/docs/table/1.png --type=structure --recovery=true
```
<a name="22"></a> <a name="22"></a>
### 2.2 Use by code ### 2.2 Use by code
...@@ -120,7 +131,7 @@ for line in result: ...@@ -120,7 +131,7 @@ for line in result:
from PIL import Image from PIL import Image
font_path = 'PaddleOCR/doc/fonts/simfang.ttf' # PaddleOCR下提供字体包 font_path = 'PaddleOCR/doc/fonts/simfang.ttf' # font provided in PaddleOCR
image = Image.open(img_path).convert('RGB') image = Image.open(img_path).convert('RGB')
im_show = draw_structure_result(image, result,font_path=font_path) im_show = draw_structure_result(image, result,font_path=font_path)
im_show = Image.fromarray(im_show) im_show = Image.fromarray(im_show)
...@@ -170,9 +181,35 @@ for line in result: ...@@ -170,9 +181,35 @@ for line in result:
``` ```
<a name="225"></a> <a name="225"></a>
#### 2.2.5 DocVQA #### 2.2.5 Key Information Extraction
Please refer to: [Key Information Extraction](../kie/README.md) .
<a name="226"></a>
#### 2.2.6 layout recovery
```python
import os
import cv2
from paddleocr import PPStructure,save_structure_res
from paddleocr.ppstructure.recovery.recovery_to_doc import sorted_layout_boxes, convert_info_docx
Please refer to: [Documentation Visual Q&A](../kie/README.md) . table_engine = PPStructure(layout=False, show_log=True)
save_folder = './output'
img_path = 'PaddleOCR/ppstructure/docs/table/1.png'
img = cv2.imread(img_path)
result = table_engine(img)
save_structure_res(result, save_folder, os.path.basename(img_path).split('.')[0])
for line in result:
line.pop('img')
print(line)
h, w, _ = img.shape
res = sorted_layout_boxes(result, w)
convert_info_docx(img, res, save_folder, os.path.basename(img_path).split('.')[0])
```
<a name="23"></a> <a name="23"></a>
### 2.3 Result description ### 2.3 Result description
...@@ -208,9 +245,9 @@ After the recognition is completed, each image will have a directory with the sa ...@@ -208,9 +245,9 @@ After the recognition is completed, each image will have a directory with the sa
``` ```
<a name="232"></a> <a name="232"></a>
#### 2.3.2 DocVQA #### 2.3.2 Key Information Extraction
Please refer to: [Documentation Visual Q&A](../kie/README.md) . Please refer to: [Key Information Extraction](../kie/README.md) .
<a name="24"></a> <a name="24"></a>
### 2.4 Parameter Description ### 2.4 Parameter Description
...@@ -235,6 +272,7 @@ Please refer to: [Documentation Visual Q&A](../kie/README.md) . ...@@ -235,6 +272,7 @@ Please refer to: [Documentation Visual Q&A](../kie/README.md) .
| table | Whether to perform table recognition in forward | True | | table | Whether to perform table recognition in forward | True |
| ocr | Whether to perform ocr for non-table areas in layout analysis. When layout is False, it will be automatically set to False| True | | ocr | Whether to perform ocr for non-table areas in layout analysis. When layout is False, it will be automatically set to False| True |
| recovery | Whether to perform layout recovery in forward| False | | recovery | Whether to perform layout recovery in forward| False |
| save_pdf | Whether to also export a pdf file when exporting the docx file during layout recovery| False |
| structure_version | Structure version, optional PP-structure and PP-structurev2 | PP-structure | | structure_version | Structure version, optional PP-structure and PP-structurev2 | PP-structure |
Most of the parameters are consistent with the PaddleOCR whl package, see [whl package documentation](../../doc/doc_en/whl.md) Most of the parameters are consistent with the PaddleOCR whl package, see [whl package documentation](../../doc/doc_en/whl.md)
...@@ -246,7 +246,7 @@ For training, evaluation and inference tutorial for text recognition models, ple ...@@ -246,7 +246,7 @@ For training, evaluation and inference tutorial for text recognition models, ple
If you want to finish the KIE tasks in your scene, and don't know what to prepare, please refer to [End cdoc](../../doc/doc_en/recognition.md). If you want to finish the KIE tasks in your scene, and don't know what to prepare, please refer to [End cdoc](../../doc/doc_en/recognition.md).
关于怎样在自己的场景中完成关键信息抽取任务,请参考:[Guide to End-to-end KIE](./how_to_do_kie_en.md) To complete the key information extraction task in your own scenario from data preparation to model selection, please refer to: [Guide to End-to-end KIE](./how_to_do_kie_en.md)
## 5. Reference ## 5. Reference
......
...@@ -63,7 +63,7 @@ python3 -m pip install "paddlepaddle>=2.2" -i https://mirror.baidu.com/pypi/simp ...@@ -63,7 +63,7 @@ python3 -m pip install "paddlepaddle>=2.2" -i https://mirror.baidu.com/pypi/simp
git clone https://github.com/PaddlePaddle/PaddleDetection.git git clone https://github.com/PaddlePaddle/PaddleDetection.git
``` ```
- **(2)安装其他依赖 ** - **(2)安装其他依赖**
```bash ```bash
cd PaddleDetection cd PaddleDetection
...@@ -138,7 +138,7 @@ json文件包含所有图像的标注,数据以字典嵌套的方式存放, ...@@ -138,7 +138,7 @@ json文件包含所有图像的标注,数据以字典嵌套的方式存放,
``` ```
{ {
'segmentation': # 物体的分割标注 'segmentation': # 物体的分割标注
'area': 60518.099043117836, # 物体的区域面积 'area': 60518.099043117836, # 物体的区域面积
'iscrowd': 0, # iscrowd 'iscrowd': 0, # iscrowd
...@@ -166,15 +166,17 @@ json文件包含所有图像的标注,数据以字典嵌套的方式存放, ...@@ -166,15 +166,17 @@ json文件包含所有图像的标注,数据以字典嵌套的方式存放,
提供了训练脚本、评估脚本和预测脚本,本节将以PubLayNet预训练模型为例进行讲解。 提供了训练脚本、评估脚本和预测脚本,本节将以PubLayNet预训练模型为例进行讲解。
如果不希望训练,直接体验后面的模型评估、预测、动转静、推理的流程,可以下载提供的预训练模型,并跳过本部分。 如果不希望训练,直接体验后面的模型评估、预测、动转静、推理的流程,可以下载提供的预训练模型(PubLayNet数据集),并跳过本部分。
``` ```
mkdir pretrained_model mkdir pretrained_model
cd pretrained_model cd pretrained_model
# 下载并解压PubLayNet预训练模型 # 下载PubLayNet预训练模型
wget https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_layout.pdparams wget https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_layout.pdparams
``` ```
下载更多[版面分析模型](../docs/models_list.md)(中文CDLA数据集预训练模型、表格预训练模型)
### 4.1. 启动训练 ### 4.1. 启动训练
开始训练: 开始训练:
...@@ -184,7 +186,7 @@ wget https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_ ...@@ -184,7 +186,7 @@ wget https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_
如果你希望训练自己的数据集,需要修改配置文件中的数据配置、类别数。 如果你希望训练自己的数据集,需要修改配置文件中的数据配置、类别数。
`configs/picodet/legacy_model/application/layout_detection/picodet_lcnet_x1_0_layout.yml` 为例,修改的内容如下所示。 `configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml` 为例,修改的内容如下所示。
```yaml ```yaml
metric: COCO metric: COCO
...@@ -223,16 +225,20 @@ TestDataset: ...@@ -223,16 +225,20 @@ TestDataset:
# 训练日志会自动保存到 log 目录中 # 训练日志会自动保存到 log 目录中
# 单卡训练 # 单卡训练
export CUDA_VISIBLE_DEVICES=0
python3 tools/train.py \ python3 tools/train.py \
-c configs/picodet/legacy_model/application/layout_detection/picodet_lcnet_x1_0_layout.yml \ -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \
--eval --eval
# 多卡训练,通过--gpus参数指定卡号 # 多卡训练,通过--gpus参数指定卡号
export CUDA_VISIBLE_DEVICES=0,1,2,3
python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py \ python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py \
-c configs/picodet/legacy_model/application/layout_detection/picodet_lcnet_x1_0_layout.yml \ -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \
--eval --eval
``` ```
**注意:**如果训练时显存不足(out of memory),将TrainReader中batch_size调小,同时LearningRate中base_lr等比例减小。发布的config均由8卡训练得到,如果改变GPU卡数为1,那么base_lr需要减小8倍。
正常启动训练后,会看到以下log输出: 正常启动训练后,会看到以下log输出:
``` ```
...@@ -254,9 +260,11 @@ PaddleDetection支持了基于FGD([Focal and Global Knowledge Distillation for D ...@@ -254,9 +260,11 @@ PaddleDetection支持了基于FGD([Focal and Global Knowledge Distillation for D
更换数据集,修改【TODO】配置中的数据配置、类别数,具体可以参考4.1。启动训练: 更换数据集,修改【TODO】配置中的数据配置、类别数,具体可以参考4.1。启动训练:
```bash ```bash
python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py \ # 单卡训练
-c configs/picodet/legacy_model/application/layout_detection/picodet_lcnet_x1_0_layout.yml \ export CUDA_VISIBLE_DEVICES=0
--slim_config configs/picodet/legacy_model/application/layout_detection/picodet_lcnet_x2_5_layout.yml \ python3 tools/train.py \
-c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \
--slim_config configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x2_5_layout.yml \
--eval --eval
``` ```
...@@ -267,13 +275,13 @@ python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py \ ...@@ -267,13 +275,13 @@ python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py \
### 5.1. 指标评估 ### 5.1. 指标评估
训练中模型参数默认保存在`output/picodet_lcnet_x1_0_layout`目录下。在评估指标时,需要设置`weights`指向保存的参数文件。评估数据集可以通过 `configs/picodet/legacy_model/application/layout_detection/picodet_lcnet_x1_0_layout.yml` 修改`EvalDataset`中的 `image_dir``anno_path``dataset_dir` 设置。 训练中模型参数默认保存在`output/picodet_lcnet_x1_0_layout`目录下。在评估指标时,需要设置`weights`指向保存的参数文件。评估数据集可以通过 `configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml` 修改`EvalDataset`中的 `image_dir``anno_path``dataset_dir` 设置。
```bash ```bash
# GPU 评估, weights 为待测权重 # GPU 评估, weights 为待测权重
python3 tools/eval.py \ python3 tools/eval.py \
-c configs/picodet/legacy_model/application/layout_detection/picodet_lcnet_x1_0_layout.yml \ -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \
-o weigths=./output/picodet_lcnet_x1_0_layout/best_model -o weights=./output/picodet_lcnet_x1_0_layout/best_model
``` ```
会输出以下信息,打印出mAP、AP0.5等信息。 会输出以下信息,打印出mAP、AP0.5等信息。
...@@ -299,8 +307,8 @@ python3 tools/eval.py \ ...@@ -299,8 +307,8 @@ python3 tools/eval.py \
``` ```
python3 tools/eval.py \ python3 tools/eval.py \
-c configs/picodet/legacy_model/application/layout_detection/picodet_lcnet_x1_0_layout.yml \ -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \
--slim_config configs/picodet/legacy_model/application/layout_detection/picodet_lcnet_x2_5_layout.yml \ --slim_config configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x2_5_layout.yml \
-o weights=output/picodet_lcnet_x2_5_layout/best_model -o weights=output/picodet_lcnet_x2_5_layout/best_model
``` ```
...@@ -311,18 +319,17 @@ python3 tools/eval.py \ ...@@ -311,18 +319,17 @@ python3 tools/eval.py \
### 5.2. 测试版面分析结果 ### 5.2. 测试版面分析结果
预测使用的配置文件必须与训练一致,如您通过 `python3 tools/train.py -c configs/picodet/legacy_model/application/layout_detection/picodet_lcnet_x1_0_layout.yml` 完成了模型的训练过程。 预测使用的配置文件必须与训练一致,如您通过 `python3 tools/train.py -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml` 完成了模型的训练过程。
使用 PaddleDetection 训练好的模型,您可以使用如下命令进行中文模型预测。
使用 PaddleDetection 训练好的模型,您可以使用如下命令进行模型预测。
```bash ```bash
python3 tools/infer.py \ python3 tools/infer.py \
-c configs/picodet/legacy_model/application/layout_detection/picodet_lcnet_x1_0_layout.yml \ -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \
-o weights='output/picodet_lcnet_x1_0_layout/best_model.pdparams' \ -o weights='output/picodet_lcnet_x1_0_layout/best_model.pdparams' \
--infer_img='docs/images/layout.jpg' \ --infer_img='docs/images/layout.jpg' \
--output_dir=output_dir/ \ --output_dir=output_dir/ \
--draw_threshold=0.4 --draw_threshold=0.5
``` ```
- `--infer_img`: 推理单张图片,也可以通过`--infer_dir`推理文件中的所有图片。 - `--infer_img`: 推理单张图片,也可以通过`--infer_dir`推理文件中的所有图片。
...@@ -335,16 +342,15 @@ python3 tools/infer.py \ ...@@ -335,16 +342,15 @@ python3 tools/infer.py \
``` ```
python3 tools/infer.py \ python3 tools/infer.py \
-c configs/picodet/legacy_model/application/layout_detection/picodet_lcnet_x1_0_layout.yml \ -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \
--slim_config configs/picodet/legacy_model/application/layout_detection/picodet_lcnet_x2_5_layout.yml \ --slim_config configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x2_5_layout.yml \
-o weights='output/picodet_lcnet_x2_5_layout/best_model.pdparams' \ -o weights='output/picodet_lcnet_x2_5_layout/best_model.pdparams' \
--infer_img='docs/images/layout.jpg' \ --infer_img='docs/images/layout.jpg' \
--output_dir=output_dir/ \ --output_dir=output_dir/ \
--draw_threshold=0.4 --draw_threshold=0.5
``` ```
## 6. 模型导出与预测 ## 6. 模型导出与预测
...@@ -356,7 +362,7 @@ inference 模型(`paddle.jit.save`保存的模型) 一般是模型训练, ...@@ -356,7 +362,7 @@ inference 模型(`paddle.jit.save`保存的模型) 一般是模型训练,
```bash ```bash
python3 tools/export_model.py \ python3 tools/export_model.py \
-c configs/picodet/legacy_model/application/layout_detection/picodet_lcnet_x1_0_layout.yml \ -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \
-o weights=output/picodet_lcnet_x1_0_layout/best_model \ -o weights=output/picodet_lcnet_x1_0_layout/best_model \
--output_dir=output_inference/ --output_dir=output_inference/
``` ```
...@@ -377,8 +383,8 @@ FGD蒸馏模型转inference模型步骤如下: ...@@ -377,8 +383,8 @@ FGD蒸馏模型转inference模型步骤如下:
```bash ```bash
python3 tools/export_model.py \ python3 tools/export_model.py \
-c configs/picodet/legacy_model/application/publayernet_lcnet_x1_5/picodet_student.yml \ -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \
--slim_config configs/picodet/legacy_model/application/publayernet_lcnet_x1_5/picodet_teacher.yml \ --slim_config configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x2_5_layout.yml \
-o weights=./output/picodet_lcnet_x2_5_layout/best_model \ -o weights=./output/picodet_lcnet_x2_5_layout/best_model \
--output_dir=output_inference/ --output_dir=output_inference/
``` ```
...@@ -404,7 +410,7 @@ python3 deploy/python/infer.py \ ...@@ -404,7 +410,7 @@ python3 deploy/python/infer.py \
------------------------------------------ ------------------------------------------
----------- Model Configuration ----------- ----------- Model Configuration -----------
Model Arch: PicoDet Model Arch: PicoDet
Transform Order: Transform Order:
--transform op: Resize --transform op: Resize
--transform op: NormalizeImage --transform op: NormalizeImage
--transform op: Permute --transform op: Permute
...@@ -466,4 +472,3 @@ preprocess_time(ms): 2172.50, inference_time(ms): 11.90, postprocess_time(ms): 1 ...@@ -466,4 +472,3 @@ preprocess_time(ms): 2172.50, inference_time(ms): 11.90, postprocess_time(ms): 1
year={2022} year={2022}
} }
``` ```
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
...@@ -6,10 +6,12 @@ English | [简体中文](README_ch.md) ...@@ -6,10 +6,12 @@ English | [简体中文](README_ch.md)
- [2.1 Installation dependencies](#2.1) - [2.1 Installation dependencies](#2.1)
- [2.2 Install PaddleOCR](#2.2) - [2.2 Install PaddleOCR](#2.2)
- [3. Quick Start](#3) - [3. Quick Start](#3)
- [3.1 Download models](#3.1)
- [3.2 Layout recovery](#3.2)
<a name="1"></a> <a name="1"></a>
## 1. Introduction ## 1. Introduction
Layout recovery means that after OCR recognition, the content is still arranged like the original document pictures, and the paragraphs are output to word document in the same order. Layout recovery means that after OCR recognition, the content is still arranged like the original document pictures, and the paragraphs are output to word document in the same order.
...@@ -17,8 +19,9 @@ Layout recovery combines [layout analysis](../layout/README.md)、[table recogni ...@@ -17,8 +19,9 @@ Layout recovery combines [layout analysis](../layout/README.md)、[table recogni
The following figure shows the result: The following figure shows the result:
<div align="center"> <div align="center">
<img src="../docs/table/recovery.jpg" width = "700" /> <img src="../docs/recovery/recovery.jpg" width = "700" />
</div> </div>
<a name="2"></a> <a name="2"></a>
## 2. Install ## 2. Install
...@@ -33,14 +36,14 @@ The following figure shows the result: ...@@ -33,14 +36,14 @@ The following figure shows the result:
python3 -m pip install --upgrade pip python3 -m pip install --upgrade pip
# GPU installation # GPU installation
python3 -m pip install "paddlepaddle-gpu>=2.2" -i https://mirror.baidu.com/pypi/simple python3 -m pip install "paddlepaddle-gpu" -i https://mirror.baidu.com/pypi/simple
# CPU installation # CPU installation
python3 -m pip install "paddlepaddle>=2.2" -i https://mirror.baidu.com/pypi/simple python3 -m pip install "paddlepaddle" -i https://mirror.baidu.com/pypi/simple
```` ````
For more requirements, please refer to the instructions in [Installation Documentation](https://www.paddlepaddle.org.cn/install/quick). For more requirements, please refer to the instructions in [Installation Documentation](https://www.paddlepaddle.org.cn/en/install/quick?docurl=/documentation/docs/en/install/pip/macos-pip_en.html).
<a name="2.2"></a> <a name="2.2"></a>
...@@ -67,38 +70,61 @@ python3 -m pip install -r ppstructure/recovery/requirements.txt ...@@ -67,38 +70,61 @@ python3 -m pip install -r ppstructure/recovery/requirements.txt
## 3. Quick Start ## 3. Quick Start
```python <a name="3.1"></a>
### 3.1 Download models
If input is English document, download English models:
```bash
cd PaddleOCR/ppstructure cd PaddleOCR/ppstructure
# download model # download model
mkdir inference && cd inference mkdir inference && cd inference
# Download the detection model of the ultra-lightweight English PP-OCRv3 model and unzip it # Download the detection model of the ultra-lightweight English PP-OCRv3 model and unzip it
wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar && tar xf ch_PP-OCRv3_det_infer.tar wget https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar && tar xf en_PP-OCRv3_det_infer.tar
# Download the recognition model of the ultra-lightweight English PP-OCRv3 model and unzip it # Download the recognition model of the ultra-lightweight English PP-OCRv3 model and unzip it
wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar && tar xf ch_PP-OCRv3_rec_infer.tar wget https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar && tar xf en_PP-OCRv3_rec_infer.tar
# Download the ultra-lightweight English table recognition model and unzip it # Download the ultra-lightweight English table recognition model and unzip it
wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar && tar xf en_ppocr_mobile_v2.0_table_structure_infer.tar wget https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/en_ppstructure_mobile_v2.0_SLANet_infer.tar && tar xf en_ppstructure_mobile_v2.0_SLANet_infer.tar
# Download the layout model of publaynet dataset and unzip it # Download the layout model of publaynet dataset and unzip it
wget wget https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_infer.tar && tar xf picodet_lcnet_x1_0_fgd_layout_infer.tar
https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_layout_infer.tar && tar picodet_lcnet_x1_0_layout_infer.tar
cd .. cd ..
# run ```
If input is Chinese document, download Chinese models:
[Chinese and English ultra-lightweight PP-OCRv3 model](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/README.md#pp-ocr-series-model-listupdate-on-september-8th), [table recognition model](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/ppstructure/docs/models_list.md#22-表格识别模型), [layout analysis model](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/ppstructure/docs/models_list.md#1-版面分析模型)
<a name="3.2"></a>
### 3.2 Layout recovery
```bash
python3 predict_system.py \ python3 predict_system.py \
--image_dir=./docs/table/1.png \ --image_dir=./docs/table/1.png \
--det_model_dir=inference/en_PP-OCRv3_det_infer \ --det_model_dir=inference/en_PP-OCRv3_det_infer \
--rec_model_dir=inference/en_PP-OCRv3_rec_infe \ --rec_model_dir=inference/en_PP-OCRv3_rec_infer \
--rec_char_dict_path=../ppocr/utils/en_dict.txt \ --rec_char_dict_path=../ppocr/utils/en_dict.txt \
--output=../output/ \ --table_model_dir=inference/en_ppstructure_mobile_v2.0_SLANet_infer \
--table_model_dir=inference/ch_ppstructure_mobile_v2.0_SLANet_infer \
--table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt \ --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt \
--table_max_len=488 \ --layout_model_dir=inference/picodet_lcnet_x1_0_fgd_layout_infer \
--layout_model_dir=inference/picodet_lcnet_x1_0_layout_infer \
--layout_dict_path=../ppocr/utils/dict/layout_dict/layout_publaynet_dict.txt \ --layout_dict_path=../ppocr/utils/dict/layout_dict/layout_publaynet_dict.txt \
--vis_font_path=../doc/fonts/simfang.ttf \ --vis_font_path=../doc/fonts/simfang.ttf \
--recovery=True \ --recovery=True \
--save_pdf=False --save_pdf=False \
--output=../output/
``` ```
After running, the docx of each picture will be saved in the directory specified by the output field After running, the docx of each picture will be saved in the directory specified by the output field
Recovery table to Word code[table_process.py] reference:https://github.com/pqzx/html2docx.git Field:
\ No newline at end of file
- image_dir:test file, can be a picture, picture directory, pdf file, or pdf file directory
- det_model_dir:OCR detection model path
- rec_model_dir:OCR recognition model path
- rec_char_dict_path:OCR recognition dict path. If the Chinese model is used, change to "../ppocr/utils/ppocr_keys_v1.txt". And if you trained the model on your own dataset, change to the trained dictionary
- table_model_dir:table recognition model path
- table_char_dict_path:table recognition dict path. If the Chinese model is used, no need to change
- layout_model_dir:layout analysis model path
- layout_dict_path:layout analysis dict path. If the Chinese model is used, change to "../ppocr/utils/dict/layout_dict/layout_cdla_dict.txt"
- recovery:whether to enable layout recovery, default False
- save_pdf:whether to also save a pdf file when recovering the layout, default False
- output:save the recovery result path
...@@ -8,19 +8,22 @@ ...@@ -8,19 +8,22 @@
- [2.2 安装PaddleOCR](#2.2) - [2.2 安装PaddleOCR](#2.2)
- [3. 使用](#3) - [3. 使用](#3)
- [3.1 下载模型](#3.1)
- [3.2 版面恢复](#3.2)
<a name="1"></a> <a name="1"></a>
## 1. 简介 ## 1. 简介
版面恢复就是在OCR识别后,内容仍然像原文档图片那样排列着,段落不变、顺序不变的输出到word文档中等。 版面恢复就是在OCR识别后,内容仍然像原文档图片那样排列着,段落不变、顺序不变的输出到word文档中等。
版面恢复结合了[版面分析](../layout/README_ch.md)[表格识别](../table/README_ch.md)技术,从而更好地恢复图片、表格、标题等内容,下图展示了版面恢复的结果: 版面恢复结合了[版面分析](../layout/README_ch.md)[表格识别](../table/README_ch.md)技术,从而更好地恢复图片、表格、标题等内容,支持pdf文档、文档图片格式的输入文件,下图展示了版面恢复的结果:
<div align="center"> <div align="center">
<img src="../docs/table/recovery.jpg" width = "700" /> <img src="../docs/recovery/recovery.jpg" width = "700" />
</div> </div>
<a name="2"></a> <a name="2"></a>
## 2. 安装 ## 2. 安装
...@@ -35,10 +38,10 @@ ...@@ -35,10 +38,10 @@
python3 -m pip install --upgrade pip python3 -m pip install --upgrade pip
# GPU安装 # GPU安装
python3 -m pip install "paddlepaddle-gpu>=2.3" -i https://mirror.baidu.com/pypi/simple python3 -m pip install "paddlepaddle-gpu" -i https://mirror.baidu.com/pypi/simple
# CPU安装 # CPU安装
python3 -m pip install "paddlepaddle>=2.3" -i https://mirror.baidu.com/pypi/simple python3 -m pip install "paddlepaddle" -i https://mirror.baidu.com/pypi/simple
``` ```
...@@ -69,40 +72,66 @@ python3 -m pip install -r ppstructure/recovery/requirements.txt ...@@ -69,40 +72,66 @@ python3 -m pip install -r ppstructure/recovery/requirements.txt
## 3. 使用 ## 3. 使用
恢复给定文档的版面: <a name="3.1"></a>
### 3.1 下载模型
如果输入为英文文档类型,下载英文模型
```python ```bash
cd PaddleOCR/ppstructure cd PaddleOCR/ppstructure
# 下载模型 # 下载模型
mkdir inference && cd inference mkdir inference && cd inference
# 下载超英文轻量级PP-OCRv3模型的检测模型并解压 # 下载英文超轻量PP-OCRv3检测模型并解压
wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar && tar xf ch_PP-OCRv3_det_infer.tar wget https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar && tar xf en_PP-OCRv3_det_infer.tar
# 下载英文轻量级PP-OCRv3模型的识别模型并解压 # 下载英文超轻量PP-OCRv3识别模型并解压
wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar && tar xf ch_PP-OCRv3_rec_infer.tar wget https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar && tar xf en_PP-OCRv3_rec_infer.tar
# 下载超轻量级英文表格英寸模型并解压 # 下载英文表格识别模型并解压
wget https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_infer.tar && tar xf ch_ppstructure_mobile_v2.0_SLANet_infer.tar wget https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/en_ppstructure_mobile_v2.0_SLANet_infer.tar && tar xf en_ppstructure_mobile_v2.0_SLANet_infer.tar
# 下载英文版面分析模型 # 下载英文版面分析模型
wget https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_layout_infer.tar && tar picodet_lcnet_x1_0_layout_infer.tar wget https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_infer.tar && tar xf picodet_lcnet_x1_0_fgd_layout_infer.tar
cd .. cd ..
```
如果输入为中文文档类型,在下述链接中下载中文模型即可:
# 执行预测 [PP-OCRv3中英文超轻量文本检测和识别模型](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/README_ch.md#pp-ocr%E7%B3%BB%E5%88%97%E6%A8%A1%E5%9E%8B%E5%88%97%E8%A1%A8%E6%9B%B4%E6%96%B0%E4%B8%AD)[表格识别模型](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/ppstructure/docs/models_list.md#22-表格识别模型)[版面分析模型](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/ppstructure/docs/models_list.md#1-版面分析模型)
<a name="3.2"></a>
### 3.2 版面恢复
使用下载的模型恢复给定文档的版面,以英文模型为例,执行如下命令:
```bash
python3 predict_system.py \ python3 predict_system.py \
--image_dir=./docs/table/1.png \ --image_dir=./docs/table/1.png \
--det_model_dir=inference/en_PP-OCRv3_det_infer \ --det_model_dir=inference/en_PP-OCRv3_det_infer \
--rec_model_dir=inference/en_PP-OCRv3_rec_infe \ --rec_model_dir=inference/en_PP-OCRv3_rec_infer \
--rec_char_dict_path=../ppocr/utils/en_dict.txt \ --rec_char_dict_path=../ppocr/utils/en_dict.txt \
--output=../output/ \ --table_model_dir=inference/en_ppstructure_mobile_v2.0_SLANet_infer \
--table_model_dir=inference/ch_ppstructure_mobile_v2.0_SLANet_infer \
--table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt \ --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt \
--table_max_len=488 \ --layout_model_dir=inference/picodet_lcnet_x1_0_fgd_layout_infer \
--layout_model_dir=inference/picodet_lcnet_x1_0_layout_infer \
--layout_dict_path=../ppocr/utils/dict/layout_dict/layout_publaynet_dict.txt \ --layout_dict_path=../ppocr/utils/dict/layout_dict/layout_publaynet_dict.txt \
--vis_font_path=../doc/fonts/simfang.ttf \ --vis_font_path=../doc/fonts/simfang.ttf \
--recovery=True \ --recovery=True \
--save_pdf=False --save_pdf=False \
--output=../output/
``` ```
运行完成后,每张图片的docx文档会保存到`output`字段指定的目录下 运行完成后,恢复版面的docx文档会保存到`output`字段指定的目录下
表格恢复到Word代码[table_process.py]来自:https://github.com/pqzx/html2docx.git 字段含义:
- image_dir:测试文件,可以是图片、图片目录、pdf文件、pdf文件目录
- det_model_dir:OCR检测模型路径
- rec_model_dir:OCR识别模型路径
- rec_char_dict_path:OCR识别字典,如果更换为中文模型,需要更改为"../ppocr/utils/ppocr_keys_v1.txt",如果您在自己的数据集上训练的模型,则更改为训练的字典的文件
- table_model_dir:表格识别模型路径
- table_char_dict_path:表格识别字典,如果更换为中文模型,不需要更换字典
- layout_model_dir:版面分析模型路径
- layout_dict_path:版面分析字典,如果更换为中文模型,需要更改为"../ppocr/utils/dict/layout_dict/layout_cdla_dict.txt"
- recovery:是否进行版面恢复,默认False
- save_pdf:进行版面恢复导出docx文档的同时,是否保存为pdf文件,默认为False
- output:版面恢复结果保存路径
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
...@@ -24,7 +24,7 @@ from docx.enum.section import WD_SECTION ...@@ -24,7 +24,7 @@ from docx.enum.section import WD_SECTION
from docx.oxml.ns import qn from docx.oxml.ns import qn
from docx.enum.table import WD_TABLE_ALIGNMENT from docx.enum.table import WD_TABLE_ALIGNMENT
from table_process import HtmlToDocx from ppstructure.recovery.table_process import HtmlToDocx
from ppocr.utils.logging import get_logger from ppocr.utils.logging import get_logger
logger = get_logger() logger = get_logger()
...@@ -69,7 +69,7 @@ def convert_info_docx(img, res, save_folder, img_name, save_pdf): ...@@ -69,7 +69,7 @@ def convert_info_docx(img, res, save_folder, img_name, save_pdf):
new_table = deepcopy(table) new_table = deepcopy(table)
new_table.alignment = WD_TABLE_ALIGNMENT.CENTER new_table.alignment = WD_TABLE_ALIGNMENT.CENTER
paragraph.add_run().element.addnext(new_table._tbl) paragraph.add_run().element.addnext(new_table._tbl)
else: else:
paragraph = doc.add_paragraph() paragraph = doc.add_paragraph()
paragraph_format = paragraph.paragraph_format paragraph_format = paragraph.paragraph_format
...@@ -86,10 +86,10 @@ def convert_info_docx(img, res, save_folder, img_name, save_pdf): ...@@ -86,10 +86,10 @@ def convert_info_docx(img, res, save_folder, img_name, save_pdf):
# save to pdf # save to pdf
if save_pdf: if save_pdf:
pdf = os.path.join(save_folder, '{}.pdf'.format(img_name)) pdf_path = os.path.join(save_folder, '{}.pdf'.format(img_name))
from docx2pdf import convert from docx2pdf import convert
convert(docx_path, pdf_path) convert(docx_path, pdf_path)
logger.info('pdf save to {}'.format(pdf)) logger.info('pdf save to {}'.format(pdf_path))
def sorted_layout_boxes(res, w): def sorted_layout_boxes(res, w):
...@@ -112,7 +112,7 @@ def sorted_layout_boxes(res, w): ...@@ -112,7 +112,7 @@ def sorted_layout_boxes(res, w):
res_left = [] res_left = []
res_right = [] res_right = []
i = 0 i = 0
while True: while True:
if i >= num_boxes: if i >= num_boxes:
break break
...@@ -137,7 +137,7 @@ def sorted_layout_boxes(res, w): ...@@ -137,7 +137,7 @@ def sorted_layout_boxes(res, w):
res_left = [] res_left = []
res_right = [] res_right = []
break break
elif _boxes[i]['bbox'][0] < w / 4 and _boxes[i]['bbox'][2] < 3*w / 4: elif _boxes[i]['bbox'][0] < w / 4 and _boxes[i]['bbox'][2] < 3 * w / 4:
_boxes[i]['layout'] = 'double' _boxes[i]['layout'] = 'double'
res_left.append(_boxes[i]) res_left.append(_boxes[i])
i += 1 i += 1
...@@ -157,4 +157,4 @@ def sorted_layout_boxes(res, w): ...@@ -157,4 +157,4 @@ def sorted_layout_boxes(res, w):
new_res += res_left new_res += res_left
if res_right: if res_right:
new_res += res_right new_res += res_right
return new_res return new_res
\ No newline at end of file
...@@ -59,16 +59,16 @@ cd PaddleOCR/ppstructure ...@@ -59,16 +59,16 @@ cd PaddleOCR/ppstructure
# download model # download model
mkdir inference && cd inference mkdir inference && cd inference
# Download the PP-OCRv3 text detection model and unzip it # Download the PP-OCRv3 text detection model and unzip it
wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_infer.tar && tar xf ch_PP-OCRv3_det_slim_infer.tar wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar && tar xf ch_PP-OCRv3_det_infer.tar
# Download the PP-OCRv3 text recognition model and unzip it # Download the PP-OCRv3 text recognition model and unzip it
wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.tar && tar xf ch_PP-OCRv3_rec_slim_infer.tar wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar && tar xf ch_PP-OCRv3_rec_infer.tar
# Download the PP-Structurev2 form recognition model and unzip it # Download the PP-Structurev2 form recognition model and unzip it
wget https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_infer.tar && tar xf ch_ppstructure_mobile_v2.0_SLANet_infer.tar wget https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_infer.tar && tar xf ch_ppstructure_mobile_v2.0_SLANet_infer.tar
cd .. cd ..
# run # run
python3.7 table/predict_table.py \ python3.7 table/predict_table.py \
--det_model_dir=inference/ch_PP-OCRv3_det_slim_infer \ --det_model_dir=inference/ch_PP-OCRv3_det_infer \
--rec_model_dir=inference/ch_PP-OCRv3_rec_slim_infer \ --rec_model_dir=inference/ch_PP-OCRv3_rec_infer \
--table_model_dir=inference/ch_ppstructure_mobile_v2.0_SLANet_infer \ --table_model_dir=inference/ch_ppstructure_mobile_v2.0_SLANet_infer \
--rec_char_dict_path=../ppocr/utils/ppocr_keys_v1.txt \ --rec_char_dict_path=../ppocr/utils/ppocr_keys_v1.txt \
--table_char_dict_path=../ppocr/utils/dict/table_structure_dict_ch.txt \ --table_char_dict_path=../ppocr/utils/dict/table_structure_dict_ch.txt \
......
...@@ -64,16 +64,16 @@ cd PaddleOCR/ppstructure ...@@ -64,16 +64,16 @@ cd PaddleOCR/ppstructure
# 下载模型 # 下载模型
mkdir inference && cd inference mkdir inference && cd inference
# 下载PP-OCRv3文本检测模型并解压 # 下载PP-OCRv3文本检测模型并解压
wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_infer.tar && tar xf ch_PP-OCRv3_det_slim_infer.tar wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar && tar xf ch_PP-OCRv3_det_infer.tar
# 下载PP-OCRv3文本识别模型并解压 # 下载PP-OCRv3文本识别模型并解压
wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.tar && tar xf ch_PP-OCRv3_rec_slim_infer.tar wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar && tar xf ch_PP-OCRv3_rec_infer.tar
# 下载PP-Structurev2表格识别模型并解压 # 下载PP-Structurev2表格识别模型并解压
wget https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_infer.tar && tar xf ch_ppstructure_mobile_v2.0_SLANet_infer.tar wget https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_infer.tar && tar xf ch_ppstructure_mobile_v2.0_SLANet_infer.tar
cd .. cd ..
# 执行表格识别 # 执行表格识别
python table/predict_table.py \ python table/predict_table.py \
--det_model_dir=inference/ch_PP-OCRv3_det_slim_infer \ --det_model_dir=inference/ch_PP-OCRv3_det_infer \
--rec_model_dir=inference/ch_PP-OCRv3_rec_slim_infer \ --rec_model_dir=inference/ch_PP-OCRv3_rec_infer \
--table_model_dir=inference/ch_ppstructure_mobile_v2.0_SLANet_infer \ --table_model_dir=inference/ch_ppstructure_mobile_v2.0_SLANet_infer \
--rec_char_dict_path=../ppocr/utils/ppocr_keys_v1.txt \ --rec_char_dict_path=../ppocr/utils/ppocr_keys_v1.txt \
--table_char_dict_path=../ppocr/utils/dict/table_structure_dict_ch.txt \ --table_char_dict_path=../ppocr/utils/dict/table_structure_dict_ch.txt \
......
...@@ -38,7 +38,7 @@ def init_args(): ...@@ -38,7 +38,7 @@ def init_args():
parser.add_argument( parser.add_argument(
"--layout_dict_path", "--layout_dict_path",
type=str, type=str,
default="../ppocr/utils/dict/layout_dict/layout_pubalynet_dict.txt") default="../ppocr/utils/dict/layout_dict/layout_publaynet_dict.txt")
parser.add_argument( parser.add_argument(
"--layout_score_threshold", "--layout_score_threshold",
type=float, type=float,
...@@ -84,13 +84,18 @@ def init_args(): ...@@ -84,13 +84,18 @@ def init_args():
type=str2bool, type=str2bool,
default=True, default=True,
help='In the forward, whether the non-table area is recognition by ocr') help='In the forward, whether the non-table area is recognition by ocr')
# param for recovery
parser.add_argument( parser.add_argument(
"--recovery", "--recovery",
type=bool, type=str2bool,
default=False, default=False,
help='Whether to enable layout of recovery') help='Whether to enable layout of recovery')
parser.add_argument( parser.add_argument(
"--save_pdf", type=bool, default=False, help='Whether to save pdf file') "--save_pdf",
type=str2bool,
default=False,
help='Whether to save pdf file')
return parser return parser
......
...@@ -108,7 +108,6 @@ if [ ${MODE} = "benchmark_train" ];then ...@@ -108,7 +108,6 @@ if [ ${MODE} = "benchmark_train" ];then
fi fi
if [ ${model_name} == "layoutxlm_ser" ] || [ ${model_name} == "vi_layoutxlm_ser" ]; then if [ ${model_name} == "layoutxlm_ser" ] || [ ${model_name} == "vi_layoutxlm_ser" ]; then
pip install -r ppstructure/kie/requirements.txt pip install -r ppstructure/kie/requirements.txt
pip install paddlenlp\>=2.3.5 --force-reinstall -i https://mirrors.aliyun.com/pypi/simple/
wget -nc -P ./train_data/ https://paddleocr.bj.bcebos.com/ppstructure/dataset/XFUND.tar --no-check-certificate wget -nc -P ./train_data/ https://paddleocr.bj.bcebos.com/ppstructure/dataset/XFUND.tar --no-check-certificate
cd ./train_data/ && tar xf XFUND.tar cd ./train_data/ && tar xf XFUND.tar
# expand gt.txt 10 times # expand gt.txt 10 times
......
...@@ -225,23 +225,24 @@ def create_predictor(args, mode, logger): ...@@ -225,23 +225,24 @@ def create_predictor(args, mode, logger):
min_subgraph_size, # skip the minmum trt subgraph min_subgraph_size, # skip the minmum trt subgraph
use_calib_mode=False) use_calib_mode=False)
# collect shape # collect shape
if args.shape_info_filename is not None: if args.shape_info_filename is not None:
if not os.path.exists(args.shape_info_filename): if not os.path.exists(args.shape_info_filename):
config.collect_shape_range_info(args.shape_info_filename) config.collect_shape_range_info(
logger.info( args.shape_info_filename)
f"collect dynamic shape info into : {args.shape_info_filename}" logger.info(
) f"collect dynamic shape info into : {args.shape_info_filename}"
)
else:
logger.info(
f"dynamic shape info file( {args.shape_info_filename} ) already exists, not need to generate again."
)
config.enable_tuned_tensorrt_dynamic_shape(
args.shape_info_filename, True)
else: else:
logger.info( logger.info(
f"dynamic shape info file( {args.shape_info_filename} ) already exists, not need to generate again." f"when using tensorrt, dynamic shape is a suggested option, you can use '--shape_info_filename=shape.txt' for offline dygnamic shape tuning"
) )
config.enable_tuned_tensorrt_dynamic_shape(
args.shape_info_filename, True)
else:
logger.info(
f"when using tensorrt, dynamic shape is a suggested option, you can use '--shape_info_filename=shape.txt' for offline dygnamic shape tuning"
)
elif args.use_xpu: elif args.use_xpu:
config.enable_xpu(10 * 1024 * 1024) config.enable_xpu(10 * 1024 * 1024)
......
...@@ -39,7 +39,7 @@ from ppocr.utils.visual import draw_re_results ...@@ -39,7 +39,7 @@ from ppocr.utils.visual import draw_re_results
from ppocr.utils.logging import get_logger from ppocr.utils.logging import get_logger
from ppocr.utils.utility import get_image_file_list, load_vqa_bio_label_maps, print_dict from ppocr.utils.utility import get_image_file_list, load_vqa_bio_label_maps, print_dict
from tools.program import ArgsParser, load_config, merge_config from tools.program import ArgsParser, load_config, merge_config
from tools.infer_vqa_token_ser import SerPredictor from tools.infer_kie_token_ser import SerPredictor
class ReArgsParser(ArgsParser): class ReArgsParser(ArgsParser):
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册