From 6938804ceaacdcd135dc5b28d874d5d70b63f5f2 Mon Sep 17 00:00:00 2001
From: zhoujun
Date: Mon, 24 Jan 2022 12:50:24 +0800
Subject: [PATCH] cp 5296 (#5329)

* update cmd

* update
---
 ppstructure/docs/quickstart.md | 73 +++++++++++++++++++---------------
 ppstructure/table/README.md    | 31 ++++++++++-----
 ppstructure/table/README_ch.md | 61 ++++++++++++++++++++--------
 3 files changed, 106 insertions(+), 59 deletions(-)

diff --git a/ppstructure/docs/quickstart.md b/ppstructure/docs/quickstart.md
index 446c577e..8d9c0a31 100644
--- a/ppstructure/docs/quickstart.md
+++ b/ppstructure/docs/quickstart.md
@@ -1,13 +1,13 @@
# PP-Structure Quick Start

-* [1. Install the PaddleOCR whl package](#1)
-* [2. Quick use](#2)
-  + [2.1 Use by command line](#21)
-  + [2.2 Use by Python script](#22)
-  + [2.3 Description of returned results](#23)
-  + [2.4 Parameter description](#24)
-* [3. Use by Python script](#3)
-
+- [PP-Structure Quick Start](#pp-structure-快速开始)
+  - [1. Install dependencies](#1-安装依赖包)
+  - [2. Quick use](#2-便捷使用)
+    - [2.1 Use by command line](#21-命令行使用)
+    - [2.2 Use by Python script](#22-python脚本使用)
+    - [2.3 Description of returned results](#23-返回结果说明)
+    - [2.4 Parameter description](#24-参数说明)
+  - [3. Use by Python script](#3-python脚本使用)

@@ -33,6 +33,7 @@ pip3 install -e .
### 2.1 Use by command line

* layout analysis + table recognition
+
```bash
paddleocr --image_dir=../doc/table/1.png --type=structure
```

* VQA

coming soon

### 2.2 Use by Python script

* layout analysis + table recognition
+
```python
import os
import cv2
@@ -79,9 +81,11 @@ coming soon

### 2.3 Description of returned results

+
The return of PP-Structure is a list of dicts, for example:

* layout analysis + table recognition
+
```shell
[
  {   'type': 'Text',
      'bbox': [34, 432, 345, 462],
      'res': ([[36.0, 437.0, 341.0, 437.0, 341.0, 446.0, 36.0, 447.0], [...]], [...])
  }
]
```
+
The fields of each dict are described as follows:

-| Field | Description |
-| --------------- | -------------|
-|type|type of the image region|
-|bbox|coordinates of the image region in the original image, given as [top-left x, top-left y, bottom-right x, bottom-right y]|
-|res|OCR or table recognition result of the image region.<br> Table: the HTML string of the table;<br> OCR: a tuple of the detected coordinates and recognition results of each single line of text|
+| Field | Description                                                                                                              |
+| ----- | ------------------------------------------------------------------------------------------------------------------------ |
+| type  | type of the image region                                                                                                   |
+| bbox  | coordinates of the image region in the original image, given as [top-left x, top-left y, bottom-right x, bottom-right y]   |
+| res   | OCR or table recognition result of the image region.<br> Table: the HTML string of the table;<br> OCR: a tuple of the detected coordinates and recognition results of each single line of text |

* VQA

coming soon

### 2.4 Parameter description

-| Field | Description | Default |
-| --------------- | ---------------------------------------- | ------------------------------------------- |
-| output | directory where excel files and recognition results are saved | ./output/table |
-| table_max_len | resize scale of the long side of the image for table structure model prediction | 488 |
-| table_model_dir | path of the table structure inference model | None |
-| table_char_type | path of the dictionary used by the table structure model | ../ppocr/utils/dict/table_structure_dict.txt |
-| model_name_or_path | path of the VQA SER model | None |
-| max_seq_length | maximum token length supported by the VQA SER model | 512 |
-| label_map_path | path of the VQA SER label file | ./vqa/labels/labels_ser.txt |
-| mode | pipeline prediction mode; structure: layout analysis + table recognition; vqa: SER document information extraction | structure |
+| Field              | Description                                                                                                         | Default                                      |
+| ------------------ | ------------------------------------------------------------------------------------------------------------------- | -------------------------------------------- |
+| output             | directory where excel files and recognition results are saved                                                        | ./output/table                               |
+| table_max_len      | resize scale of the long side of the image for table structure model prediction                                      | 488                                          |
+| table_model_dir    | path of the table structure inference model                                                                          | None                                         |
+| table_char_type    | path of the dictionary used by the table structure model                                                             | ../ppocr/utils/dict/table_structure_dict.txt |
+| model_name_or_path | path of the VQA SER model                                                                                            | None                                         |
+| max_seq_length     | maximum token length supported by the VQA SER model                                                                  | 512                                          |
+| label_map_path     | path of the VQA SER label file                                                                                       | ./vqa/labels/labels_ser.txt                  |
+| mode               | pipeline prediction mode; structure: layout analysis + table recognition; vqa: SER document information extraction   | structure                                    |

Most parameters are consistent with the paddleocr whl package; see the [whl package docs](../doc/doc_ch/whl.md).

After running, each image gets a directory of the same name under the directory specified by the `output` field; each table in the image is stored as an excel file, the detected image regions are cropped and saved, and the excel files and cropped images are named after the table's coordinates in the image.

* layout analysis + table recognition

```bash
cd ppstructure

# download models
mkdir inference && cd inference
-# Download the detection model of the ultra-lightweight Chinese OCR model and unzip it
-wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar && tar xf ch_ppocr_mobile_v2.0_det_infer.tar
-# Download the recognition model of the ultra-lightweight Chinese OCR model and unzip it
-wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar && tar xf ch_ppocr_mobile_v2.0_rec_infer.tar
-# Download the ultra-lightweight English table inference model and unzip it
+# Download the PP-OCRv2 text detection model and unzip it
+wget https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_slim_quant_infer.tar && tar xf ch_PP-OCRv2_det_slim_quant_infer.tar
+# Download the PP-OCRv2 text recognition model and unzip it
+wget https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_infer.tar && tar xf ch_PP-OCRv2_rec_slim_quant_infer.tar
+# Download the ultra-lightweight English table structure prediction model and unzip it
wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar && tar xf en_ppocr_mobile_v2.0_table_structure_infer.tar
cd ..
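# The command below wires the three downloaded models together:
# --det_model_dir/--rec_model_dir detect and recognize single-line text, and
# --table_model_dir predicts the table structure; results land under --output
# (see sections 2.3 and 2.4 above for the output format and parameter meanings).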
-python3 predict_system.py --det_model_dir=inference/ch_ppocr_mobile_v2.0_det_infer \
-    --rec_model_dir=inference/ch_ppocr_mobile_v2.0_rec_infer \
+python3 predict_system.py --det_model_dir=inference/ch_PP-OCRv2_det_slim_quant_infer \
+    --rec_model_dir=inference/ch_PP-OCRv2_rec_slim_quant_infer \
    --table_model_dir=inference/en_ppocr_mobile_v2.0_table_structure_infer \
    --image_dir=../doc/table/1.png \
    --rec_char_dict_path=../ppocr/utils/ppocr_keys_v1.txt \
    --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt \
    --output=../output/table \
    --vis_font_path=../doc/fonts/simfang.ttf
```

-After running, each image gets a directory of the same name under the `talbe` directory under the directory specified by the `output` field; each table in the image is stored as an excel file, the image regions are cropped and saved, and the excel files and cropped images are named after the table's coordinates in the image.
+After running, each image gets a directory of the same name under the `table` directory under the directory specified by the `output` field; each table in the image is stored as an excel file, the image regions are cropped and saved, and the excel files and cropped images are named after the table's coordinates in the image.

* VQA

```bash
cd ppstructure

# download models
mkdir inference && cd inference
# Download the SER model and unzip it
wget https://paddleocr.bj.bcebos.com/pplayout/PP-Layout_v1.0_ser_pretrained.tar && tar xf PP-Layout_v1.0_ser_pretrained.tar
cd ..

python3 predict_system.py --model_name_or_path=vqa/PP-Layout_v1.0_ser_pretrained/ \
    --mode=vqa \
    --image_dir=vqa/images/input/zh_val_0.jpg \
    --vis_font_path=../doc/fonts/simfang.ttf
```

After running, the visualized result of each image is saved in the `vqa` directory under the directory specified by the `output` field, with the same name as the input image.
diff --git a/ppstructure/table/README.md b/ppstructure/table/README.md
index 30a11a20..150ed34e 100644
--- a/ppstructure/table/README.md
+++ b/ppstructure/table/README.md
@@ -1,7 +1,9 @@
# Table Recognition

## 1. pipeline
+
The table recognition mainly contains three models:
+
1. Single-line text detection - DB
2. Single-line text recognition - CRNN
3. Table structure and cell coordinate prediction - RARE

The table recognition flow chart is as follows:

1. The coordinates of single-line texts are detected by the text detection model and then sent to the recognition model to get recognition results.
2. The table structure and cell coordinates are predicted by the table structure model.
3. The recognition result of each cell is assembled from the single-line text coordinates, their recognition results, and the cell coordinates.
4. The cell recognition result and the table structure together construct the html string of the table.

## 2. Performance

-We evaluated the algorithm on the PubTabNet[1] eval dataset, and the performance is as follows:
+We evaluated the algorithm on the PubTabNet [1] eval dataset, and the performance is as follows:

-|Method|[TEDS (Tree-Edit-Distance-based Similarity)](https://github.com/ibm-aur-nlp/PubTabNet/tree/master/src)|
-| --- | --- |
-| EDD [2] | 88.3 |
-| Ours | 93.32 |
+| Method  | [TEDS (Tree-Edit-Distance-based Similarity)](https://github.com/ibm-aur-nlp/PubTabNet/tree/master/src) |
+| ------- | -------------------------------------------------------------------------------------------------------- |
+| EDD [2] | 88.3                                                                                                       |
+| Ours    | 93.32                                                                                                      |

## 3. How to use

### 3.1 Quick start

```bash
cd PaddleOCR/ppstructure

# download models
mkdir inference && cd inference
# Download the English table text detection model and unzip it
wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_det_infer.tar && tar xf en_ppocr_mobile_v2.0_table_det_infer.tar
# Download the English table text recognition model and unzip it
wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_rec_infer.tar && tar xf en_ppocr_mobile_v2.0_table_rec_infer.tar
# Download the English table structure model and unzip it
wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar && tar xf en_ppocr_mobile_v2.0_table_structure_infer.tar
cd ..
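# Each tar above unpacks into a directory under inference/ whose name matches
# the *_model_dir flags used below; a Paddle inference model directory of this
# era typically holds inference.pdmodel and inference.pdiparams files.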
# run
python3 table/predict_table.py --det_model_dir=inference/en_ppocr_mobile_v2.0_table_det_infer --rec_model_dir=inference/en_ppocr_mobile_v2.0_table_rec_infer --table_model_dir=inference/en_ppocr_mobile_v2.0_table_structure_infer --image_dir=../doc/table/table.jpg --rec_char_dict_path=../ppocr/utils/dict/table_dict.txt --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt --det_limit_side_len=736 --det_limit_type=min --output ../output/table
```

Note: the above models are trained on the PubTabNet dataset and only support English scanning scenarios. To recognize other scenarios, you need to train a model yourself and replace the three fields `det_model_dir`, `rec_model_dir`, `table_model_dir`.

After running, the excel sheet of each image is saved in the directory specified by the `output` field.

### 3.2 Train

This section only introduces the training of the table structure model. For model training of [text detection](../../doc/doc_en/detection_en.md) and [text recognition](../../doc/doc_en/recognition_en.md), please refer to the corresponding documents.

#### Data preparation

The training data uses the public dataset [PubTabNet](https://arxiv.org/abs/1911.10683), which can be downloaded from the official [website](https://github.com/ibm-aur-nlp/PubTabNet). The PubTabNet dataset contains about 500,000 images, as well as annotations in html format.

#### Start training

*If you installed the CPU version of paddle, please set the `use_gpu` field in the configuration file to false.*

```shell
# single GPU training
python3 tools/train.py -c configs/table/table_mv3.yml
# resume training from a saved model specified by Global.checkpoints
python3 tools/train.py -c configs/table/table_mv3.yml -o Global.checkpoints=./your/trained/model
```

### 3.3 Eval

The table model uses [TEDS (Tree-Edit-Distance-based Similarity)](https://github.com/ibm-aur-nlp/PubTabNet/tree/master/src) as the evaluation metric. Before model evaluation, the three models in the pipeline need to be exported as inference models (we have provided them), and the gt for evaluation needs to be prepared. An example of the gt is as follows:

```json
{"PMC4289340_004_00.png": [
  ["<html>", "<body>", "<table>", "<thead>", "<tr>", "<td>", "</td>", "<td>", "</td>", "<td>", "</td>", "</tr>", "</thead>", "<tbody>", "<tr>", "<td>", "</td>", "<td>", "</td>", "<td>", "</td>", "</tr>", "</tbody>", "</table>", "</body>", "</html>"],
  [[1, 4, 29, 13], [137, 4, 161, 13], [215, 4, 236, 13], [1, 17, 30, 27], [137, 17, 147, 27], [215, 17, 225, 27]],
  [["<b>", "F", "e", "a", "t", "u", "r", "e", "</b>"], ["<b>", "G", "b", "3", " ", "+", "</b>"], ["<b>", "G", "b", "3", " ", "-", "</b>"], ["<b>", "P", "a", "t", "i", "e", "n", "t", "s", "</b>"], ["6", "2"], ["4", "5"]]
]}
```

In the gt json, the key is the image name, the value is the corresponding gt, and the gt is a list composed of three items:

1. the list of HTML strings of the table structure
2. the coordinates of each cell (not including cells with empty text)
3. the text in each cell (not including cells with empty text)

Use the following command to evaluate. After the evaluation is completed, the teds metric will be output.

```bash
cd PaddleOCR/ppstructure
python3 table/eval_table.py --det_model_dir=path/to/det_model_dir --rec_model_dir=path/to/rec_model_dir --table_model_dir=path/to/table_model_dir --image_dir=../doc/table/1.png --rec_char_dict_path=../ppocr/utils/dict/table_dict.txt --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt --det_limit_side_len=736 --det_limit_type=min --gt_path=path/to/gt.json
```

If the PubTabNet eval dataset is used, the output will be

```bash
teds: 93.32
```

### 3.4 Inference

```bash
cd PaddleOCR/ppstructure
python3 table/predict_table.py --det_model_dir=path/to/det_model_dir --rec_model_dir=path/to/rec_model_dir --table_model_dir=path/to/table_model_dir --image_dir=../doc/table/1.png --rec_char_dict_path=../ppocr/utils/dict/table_dict.txt --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt --rec_char_type=EN --det_limit_side_len=736 --det_limit_type=min --output ../output/table
```

After running, the excel sheet of each image is saved in the directory specified by the `output` field.

Reference

1. https://github.com/ibm-aur-nlp/PubTabNet
2. https://arxiv.org/pdf/1911.10683
diff --git a/ppstructure/table/README_ch.md b/ppstructure/table/README_ch.md
index 33276b36..d0fae97d 100644
--- a/ppstructure/table/README_ch.md
+++ b/ppstructure/table/README_ch.md
@@ -1,17 +1,23 @@
# Table Recognition

-* [1. Table recognition pipeline](#1)
-* [2. Performance](#2)
-* [3. Usage](#3)
-  + [3.1 Quick start](#31)
-  + [3.2 Training](#32)
-  + [3.3 Evaluation](#33)
-  + [3.4 Prediction](#34)
+- [Table Recognition](#表格识别)
+  - [1. Table recognition pipeline](#1-表格识别-pipeline)
+  - [2. Performance](#2-性能)
+  - [3. Usage](#3-使用)
+    - [3.1 Quick start](#31-快速开始)
+    - [3.2 Training](#32-训练)
+      - [Data preparation](#数据准备)
+      - [Start training](#启动训练)
+      - [Resume training](#断点训练)
+    - [3.3 Evaluation](#33-评估)
+    - [3.4 Prediction](#34-预测)
+

## 1. Table recognition pipeline

The table recognition mainly contains three models:
+
1. Single-line text detection - DB
2. Single-line text recognition - CRNN
3. Table structure and cell coordinate prediction - RARE

The flow is as follows:

1. The coordinates of single-line texts are detected by the text detection model and then sent to the recognition model to get recognition results.
2. The table structure information and cell coordinates are obtained from the table structure and cell coordinate prediction model.
3. The recognition result of each cell is assembled from the single-line text coordinates, their recognition results, and the cell coordinates.
4. The cell recognition results and the table structure together construct the html string of the table.

## 2. Performance

We evaluated the algorithm on the PubTabNet [1] eval dataset, and the performance is as follows:

-| Method | [TEDS (Tree-Edit-Distance-based Similarity)](https://github.com/ibm-aur-nlp/PubTabNet/tree/master/src) |
-| --- | --- |
-| EDD [2] | 88.3 |
-| Ours | 93.32 |
+| Method  | [TEDS (Tree-Edit-Distance-based Similarity)](https://github.com/ibm-aur-nlp/PubTabNet/tree/master/src) |
+| ------- | -------------------------------------------------------------------------------------------------------- |
+| EDD [2] | 88.3                                                                                                       |
+| Ours    | 93.32                                                                                                      |

## 3. Usage

### 3.1 Quick start

```bash
cd PaddleOCR/ppstructure

# download models
mkdir inference && cd inference
# Download the English table text detection model and unzip it
wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_det_infer.tar && tar xf en_ppocr_mobile_v2.0_table_det_infer.tar
# Download the English table text recognition model and unzip it
wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_rec_infer.tar && tar xf en_ppocr_mobile_v2.0_table_rec_infer.tar
# Download the English table structure model and unzip it
wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar && tar xf en_ppocr_mobile_v2.0_table_structure_infer.tar
cd ..
# run prediction
python3 table/predict_table.py --det_model_dir=inference/en_ppocr_mobile_v2.0_table_det_infer --rec_model_dir=inference/en_ppocr_mobile_v2.0_table_rec_infer --table_model_dir=inference/en_ppocr_mobile_v2.0_table_structure_infer --image_dir=../doc/table/table.jpg --rec_char_dict_path=../ppocr/utils/dict/table_dict.txt --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt --det_limit_side_len=736 --det_limit_type=min --output ../output/table
```

After running, the excel table of each image is saved under the directory specified by the `output` field.

Note: the above models are table recognition models trained on the PubTabNet dataset and only support English scanning scenarios. To recognize other scenarios, you need to train a model yourself and replace the three fields `det_model_dir`, `rec_model_dir`, `table_model_dir`.

### 3.2 Training

This section only introduces the training of the table structure model. For training the [text detection](../../doc/doc_ch/detection.md) and [text recognition](../../doc/doc_ch/recognition.md) models, please refer to the corresponding documents.

#### Data preparation

The training data uses the public PubTabNet dataset ([paper](https://arxiv.org/abs/1911.10683), [download](https://github.com/ibm-aur-nlp/PubTabNet)). The PubTabNet dataset contains about 500,000 images of tables, along with html-format annotations for each image.

#### Start training

*If you installed the CPU version of paddle, please set the `use_gpu` field in the configuration file to false.*

```shell
# single-machine single-GPU training
python3 tools/train.py -c configs/table/table_mv3.yml
# single-machine multi-GPU training; set the GPU IDs to use with the --gpus parameter
python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/table/table_mv3.yml
```

#### Resume training

If the training program is interrupted and you want to resume from the interrupted model, specify the path of the model to load via Global.checkpoints:

```shell
python3 tools/train.py -c configs/table/table_mv3.yml -o Global.checkpoints=./your/trained/model
```

**Note**: `Global.checkpoints` has higher priority than `Global.pretrain_weights`; when both parameters are specified, the model specified by `Global.checkpoints` is loaded first, and if that model path is wrong, the model specified by `Global.pretrain_weights` is loaded.

### 3.3 Evaluation

The table model uses [TEDS (Tree-Edit-Distance-based Similarity)](https://github.com/ibm-aur-nlp/PubTabNet/tree/master/src) as the evaluation metric. Before model evaluation, the three models in the pipeline need to be exported as inference models (we have provided them), and the gt for evaluation needs to be prepared. An example of the gt is as follows:

```json
{"PMC4289340_004_00.png": [
  ["<html>", "<body>", "<table>", "<thead>", "<tr>", "<td>", "</td>", "<td>", "</td>", "<td>", "</td>", "</tr>", "</thead>", "<tbody>", "<tr>", "<td>", "</td>", "<td>", "</td>", "<td>", "</td>", "</tr>", "</tbody>", "</table>", "</body>", "</html>"],
  [[1, 4, 29, 13], [137, 4, 161, 13], [215, 4, 236, 13], [1, 17, 30, 27], [137, 17, 147, 27], [215, 17, 225, 27]],
  [["<b>", "F", "e", "a", "t", "u", "r", "e", "</b>"], ["<b>", "G", "b", "3", " ", "+", "</b>"], ["<b>", "G", "b", "3", " ", "-", "</b>"], ["<b>", "P", "a", "t", "i", "e", "n", "t", "s", "</b>"], ["6", "2"], ["4", "5"]]
]}
```

In the json, the key is the image name, the value is the corresponding gt, and the gt is a list composed of three items:

1. the list of html strings of the table structure
2. the coordinates of each cell (not including cells with empty text)
3. the text in each cell (not including cells with empty text)

Once prepared, use the following command to evaluate; after evaluation completes, the teds metric is output.

```bash
cd PaddleOCR/ppstructure
python3 table/eval_table.py --det_model_dir=path/to/det_model_dir --rec_model_dir=path/to/rec_model_dir --table_model_dir=path/to/table_model_dir --image_dir=../doc/table/1.png --rec_char_dict_path=../ppocr/utils/dict/table_dict.txt --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt --det_limit_side_len=736 --det_limit_type=min --gt_path=path/to/gt.json
```

If the PubTabNet eval dataset is used, the output will be

```bash
teds: 93.32
```

### 3.4 Prediction

```bash
cd PaddleOCR/ppstructure
python3 table/predict_table.py --det_model_dir=path/to/det_model_dir --rec_model_dir=path/to/rec_model_dir --table_model_dir=path/to/table_model_dir --image_dir=../doc/table/1.png --rec_char_dict_path=../ppocr/utils/dict/table_dict.txt --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt --rec_char_type=EN --det_limit_side_len=736 --det_limit_type=min --output ../output/table
```

Reference

1. https://github.com/ibm-aur-nlp/PubTabNet
2. https://arxiv.org/pdf/1911.10683
--
GitLab