From 9fd1e9d630ac65420926eca5ec475715d3d77709 Mon Sep 17 00:00:00 2001 From: WenmuZhou Date: Thu, 29 Jul 2021 17:59:44 +0800 Subject: [PATCH] opt tableocr doc --- ppstructure/table/README.md | 11 +++++++---- ppstructure/table/README_ch.md | 24 ++++++++++++++---------- ppstructure/table/eval_table.py | 10 +++++----- 3 files changed, 26 insertions(+), 19 deletions(-) diff --git a/ppstructure/table/README.md b/ppstructure/table/README.md index afcbe169..c538db27 100644 --- a/ppstructure/table/README.md +++ b/ppstructure/table/README.md @@ -49,28 +49,31 @@ python3 tools/train.py -c configs/table/table_mv3.yml -o Global.checkpoints=./yo **Note**: The priority of `Global.checkpoints` is higher than that of `Global.pretrain_weights`, that is, when two parameters are specified at the same time, the model specified by `Global.checkpoints` will be loaded first. If the model path specified by `Global.checkpoints` is wrong, the one specified by `Global.pretrain_weights` will be loaded. ### 2.2 Eval -First cd to the PaddleOCR/ppstructure directory The table uses TEDS (Tree-Edit-Distance-based Similarity) as the evaluation metric of the model. Before the model evaluation, the three models in the pipeline need to be exported as inference models (we have provided them), and the gt for evaluation needs to be prepared. Examples of gt are as follows: ```json -{"PMC4289340_004_00.png": [["", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "
", "", "", "
", "", "", "
", "", ""], [[1, 4, 29, 13], [137, 4, 161, 13], [215, 4, 236, 13], [1, 17, 30, 27], [137, 17, 147, 27], [215, 17, 225, 27]], [["", "F", "e", "a", "t", "u", "r", "e", ""], ["", "G", "b", "3", " ", "+", ""], ["", "G", "b", "3", " ", "-", ""], ["", "P", "a", "t", "i", "e", "n", "t", "s", ""], ["6", "2"], ["4", "5"]]]} +{"PMC4289340_004_00.png": [ + ["", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "
", "", "", "
", "", "", "
", "", ""], + [[1, 4, 29, 13], [137, 4, 161, 13], [215, 4, 236, 13], [1, 17, 30, 27], [137, 17, 147, 27], [215, 17, 225, 27]], + [["", "F", "e", "a", "t", "u", "r", "e", ""], ["", "G", "b", "3", " ", "+", ""], ["", "G", "b", "3", " ", "-", ""], ["", "P", "a", "t", "i", "e", "n", "t", "s", ""], ["6", "2"], ["4", "5"]] +]} ``` In gt json, the key is the image name, the value is the corresponding gt, and gt is a list composed of four items, and each item is 1. HTML string list of table structure 2. The coordinates of each cell (not including the empty text in the cell) 3. The text information in each cell (not including the empty text in the cell) -4. The text information in each cell (including the empty text in the cell) Use the following command to evaluate. After the evaluation is completed, the teds indicator will be output. ```python +cd PaddleOCR/ppstructure python3 table/eval_table.py --det_model_dir=path/to/det_model_dir --rec_model_dir=path/to/rec_model_dir --table_model_dir=path/to/table_model_dir --image_dir=../doc/table/1.png --rec_char_dict_path=../ppocr/utils/dict/table_dict.txt --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt --rec_char_type=EN --det_limit_side_len=736 --det_limit_type=min --gt_path=path/to/gt.json ``` ### 2.3 Inference -First cd to the PaddleOCR/ppstructure directory ```python +cd PaddleOCR/ppstructure python3 table/predict_table.py --det_model_dir=path/to/det_model_dir --rec_model_dir=path/to/rec_model_dir --table_model_dir=path/to/table_model_dir --image_dir=../doc/table/1.png --rec_char_dict_path=../ppocr/utils/dict/table_dict.txt --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt --rec_char_type=EN --det_limit_side_len=736 --det_limit_type=min --output ../output/table ``` After running, the excel sheet of each picture will be saved in the directory specified by the output field \ No newline at end of file diff --git a/ppstructure/table/README_ch.md b/ppstructure/table/README_ch.md index 
4b912f3e..5981dab4 100644 --- a/ppstructure/table/README_ch.md +++ b/ppstructure/table/README_ch.md @@ -1,6 +1,6 @@ -# 表格结构和内容预测 +# Table OCR -## 1. pipeline +## 1. Table OCR pipeline 表格的ocr主要包含三个模型 1. 单行文本检测-DB 2. 单行文本识别-CRNN @@ -10,7 +10,9 @@ ![tableocr_pipeline](../../doc/table/tableocr_pipeline.jpg) -1. 图片由单行文字检测检测模型到单行文字的坐标,然后送入识别模型拿到识别结果。 +流程说明: + +1. 图片由单行文字检测模型检测到单行文字的坐标,然后送入识别模型拿到识别结果。 2. 图片由表格结构和cell坐标预测模型拿到表格的结构信息和单元格的坐标信息。 3. 由单行文字的坐标、识别结果和单元格的坐标一起组合出单元格的识别结果。 4. 单元格的识别结果和表格结构一起构造表格的html字符串。 @@ -21,7 +23,7 @@ 在这一章节中,我们仅介绍表格结构模型的训练,[文字检测](../../doc/doc_ch/detection.md)和[文字识别](../../doc/doc_ch/recognition.md)的模型训练请参考对应的文档。 #### 数据准备 -训练数据使用公开数据集[PubTabNet](https://arxiv.org/abs/1911.10683),可以从[官网](https://github.com/ibm-aur-nlp/PubTabNet)下载。PubTabNet数据集包含约50万张表格数据的图像,以及图像对应的html格式的注释。 +训练数据使用公开数据集PubTabNet ([论文](https://arxiv.org/abs/1911.10683),[下载地址](https://github.com/ibm-aur-nlp/PubTabNet))。PubTabNet数据集包含约50万张表格数据的图像,以及图像对应的html格式的注释。 #### 启动训练 *如果您安装的是cpu版本,请将配置文件中的 `use_gpu` 字段修改为false* @@ -45,28 +47,30 @@ python3 tools/train.py -c configs/table/table_mv3.yml -o Global.checkpoints=./yo ### 2.2 评估 -先cd到PaddleOCR/ppstructure目录下 表格使用 TEDS(Tree-Edit-Distance-based Similarity) 作为模型的评估指标。在进行模型评估之前,需要将pipeline中的三个模型分别导出为inference模型(我们已经提供好),还需要准备评估的gt, gt示例如下: ```json -{"PMC4289340_004_00.png": [["", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "
", "", "", "
", "", "", "
", "", ""], [[1, 4, 29, 13], [137, 4, 161, 13], [215, 4, 236, 13], [1, 17, 30, 27], [137, 17, 147, 27], [215, 17, 225, 27]], [["", "F", "e", "a", "t", "u", "r", "e", ""], ["", "G", "b", "3", " ", "+", ""], ["", "G", "b", "3", " ", "-", ""], ["", "P", "a", "t", "i", "e", "n", "t", "s", ""], ["6", "2"], ["4", "5"]]]} +{"PMC4289340_004_00.png": [ + ["", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "
", "", "", "
", "", "", "
", "", ""], + [[1, 4, 29, 13], [137, 4, 161, 13], [215, 4, 236, 13], [1, 17, 30, 27], [137, 17, 147, 27], [215, 17, 225, 27]], + [["", "F", "e", "a", "t", "u", "r", "e", ""], ["", "G", "b", "3", " ", "+", ""], ["", "G", "b", "3", " ", "-", ""], ["", "P", "a", "t", "i", "e", "n", "t", "s", ""], ["6", "2"], ["4", "5"]] +]} ``` -json 中,key为图片名,value为对于的gt,gt是一个由四个item组成的list,每个item分别为 +json 中,key为图片名,value为对应的gt,gt是一个由四个item组成的list,每个item分别为 1. 表格结构的html字符串list 2. 每个cell的坐标 (不包括cell里文字为空的) 3. 每个cell里的文字信息 (不包括cell里文字为空的) -4. 每个cell里的文字信息 (包括cell里文字为空的) 准备完成后使用如下命令进行评估,评估完成后会输出teds指标。 ```python +cd PaddleOCR/ppstructure python3 table/eval_table.py --det_model_dir=path/to/det_model_dir --rec_model_dir=path/to/rec_model_dir --table_model_dir=path/to/table_model_dir --image_dir=../doc/table/1.png --rec_char_dict_path=../ppocr/utils/dict/table_dict.txt --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt --rec_char_type=EN --det_limit_side_len=736 --det_limit_type=min --gt_path=path/to/gt.json ``` ### 2.3 预测 -先cd到PaddleOCR/ppstructure目录下 - ```python +cd PaddleOCR/ppstructure python3 table/predict_table.py --det_model_dir=path/to/det_model_dir --rec_model_dir=path/to/rec_model_dir --table_model_dir=path/to/table_model_dir --image_dir=../doc/table/1.png --rec_char_dict_path=../ppocr/utils/dict/table_dict.txt --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt --rec_char_type=EN --det_limit_side_len=736 --det_limit_type=min --output ../output/table ``` 运行完成后,每张图片的excel表格会保存到output字段指定的目录下 diff --git a/ppstructure/table/eval_table.py b/ppstructure/table/eval_table.py index 15f54937..87b44d3d 100755 --- a/ppstructure/table/eval_table.py +++ b/ppstructure/table/eval_table.py @@ -46,20 +46,20 @@ def main(gt_path, img_root, args): pred_html = text_sys(img) pred_htmls.append(pred_html) - gt_structures, gt_bboxes, gt_contents, contents_with_block = jsons_gt[img_name] - gt_html, gt = get_gt_html(gt_structures, contents_with_block) + gt_structures, 
gt_bboxes, gt_contents = jsons_gt[img_name] + gt_html, gt = get_gt_html(gt_structures, gt_contents) gt_htmls.append(gt_html) scores = teds.batch_evaluate_html(gt_htmls, pred_htmls) logger.info('teds:', sum(scores) / len(scores)) -def get_gt_html(gt_structures, contents_with_block): +def get_gt_html(gt_structures, gt_contents): end_html = [] td_index = 0 for tag in gt_structures: if '' in tag: - if contents_with_block[td_index] != []: - end_html.extend(contents_with_block[td_index]) + if gt_contents[td_index] != []: + end_html.extend(gt_contents[td_index]) end_html.append(tag) td_index += 1 else: -- GitLab