diff --git a/paddleocr.py b/paddleocr.py index d07082f0ddc1133b3e9b3a7a7703d87f7cfeeedb..cb2c34f69f68d289b317d4737bd23385c77c3d95 100644 --- a/paddleocr.py +++ b/paddleocr.py @@ -47,7 +47,7 @@ __all__ = [ ] SUPPORT_DET_MODEL = ['DB'] -VERSION = '2.4.0.4' +VERSION = '2.5' SUPPORT_REC_MODEL = ['CRNN'] BASE_DIR = os.path.expanduser("~/.paddleocr/") @@ -442,7 +442,7 @@ class PPStructure(StructureSystem): logger.debug(params) super().__init__(params) - def __call__(self, img): + def __call__(self, img, return_ocr_result_in_table=False): if isinstance(img, str): # download net image if img.startswith('http'): @@ -460,7 +460,7 @@ class PPStructure(StructureSystem): if isinstance(img, np.ndarray) and len(img.shape) == 2: img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) - res = super().__call__(img) + res = super().__call__(img, return_ocr_result_in_table) return res diff --git a/ppocr/postprocess/rec_postprocess.py b/ppocr/postprocess/rec_postprocess.py index 3bc7bcdf9b388bb8da6c656682e2e06a18a0f4fb..47825dc7d43dc5fb68f7ec9c45c7d4d91c1144a3 100644 --- a/ppocr/postprocess/rec_postprocess.py +++ b/ppocr/postprocess/rec_postprocess.py @@ -73,7 +73,7 @@ class BaseRecLabelDecode(object): conf_list = [0] text = ''.join(char_list) - result_list.append((text, np.mean(conf_list))) + result_list.append((text, np.mean(conf_list).tolist())) return result_list def get_ignored_tokens(self): @@ -196,7 +196,7 @@ class NRTRLabelDecode(BaseRecLabelDecode): else: conf_list.append(1) text = ''.join(char_list) - result_list.append((text.lower(), np.mean(conf_list))) + result_list.append((text.lower(), np.mean(conf_list).tolist())) return result_list @@ -241,7 +241,7 @@ class AttnLabelDecode(BaseRecLabelDecode): else: conf_list.append(1) text = ''.join(char_list) - result_list.append((text, np.mean(conf_list))) + result_list.append((text, np.mean(conf_list).tolist())) return result_list def __call__(self, preds, label=None, *args, **kwargs): @@ -333,7 +333,7 @@ class SEEDLabelDecode(BaseRecLabelDecode): else: conf_list.append(1) text = ''.join(char_list) - result_list.append((text, np.mean(conf_list))) + result_list.append((text, np.mean(conf_list).tolist())) return result_list def __call__(self, preds, label=None, *args, **kwargs): @@ -417,7 +417,7 @@ class SRNLabelDecode(BaseRecLabelDecode): conf_list.append(1) text = ''.join(char_list) - result_list.append((text, np.mean(conf_list))) + result_list.append((text, np.mean(conf_list).tolist())) return result_list def add_special_char(self, dict_character): @@ -636,7 +636,7 @@ class SARLabelDecode(BaseRecLabelDecode): comp = re.compile('[^A-Z^a-z^0-9^\u4e00-\u9fa5]') text = text.lower() text = comp.sub('', text) - result_list.append((text, np.mean(conf_list))) + result_list.append((text, np.mean(conf_list).tolist())) return result_list def __call__(self, preds, label=None, *args, **kwargs): @@ -699,7 +699,7 @@ class PRENLabelDecode(BaseRecLabelDecode): text = ''.join(char_list) if len(text) > 0: - result_list.append((text, np.mean(conf_list))) + result_list.append((text, np.mean(conf_list).tolist())) else: # here confidence of empty recog result is 1 result_list.append(('', 1)) diff --git a/ppstructure/docs/inference.md b/ppstructure/docs/inference.md index bfcdbd0c07da6e3a9168c3b7464183ac5dfba536..7604246da5a79b0ee2939c9fb4c91602531ec7de 100644 --- a/ppstructure/docs/inference.md +++ b/ppstructure/docs/inference.md @@ -1,15 +1,20 @@ # 基于Python预测引擎推理 -- [版面分析+表格识别](#1) -- [DocVQA](#2) +- [1. 
Structure](#1) + - [1.1 版面分析+表格识别](#1.1) + - [1.2 版面分析](#1.2) + - [1.3 表格识别](#1.3) +- [2. DocVQA](#2) -## 1. 版面分析+表格识别 +## 1. Structure +进入`ppstructure`目录 ```bash cd ppstructure - -# 下载模型 +```` +下载模型 +```bash mkdir inference && cd inference # 下载PP-OCRv2文本检测模型并解压 wget https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_slim_quant_infer.tar && tar xf ch_PP-OCRv2_det_slim_quant_infer.tar @@ -18,17 +23,42 @@ wget https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant # 下载超轻量级英文表格预测模型并解压 wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar && tar xf en_ppocr_mobile_v2.0_table_structure_infer.tar cd .. - +``` + +### 1.1 版面分析+表格识别 +```bash python3 predict_system.py --det_model_dir=inference/ch_PP-OCRv2_det_slim_quant_infer \ --rec_model_dir=inference/ch_PP-OCRv2_rec_slim_quant_infer \ --table_model_dir=inference/en_ppocr_mobile_v2.0_table_structure_infer \ - --image_dir=../doc/table/1.png \ + --image_dir=./docs/table/1.png \ --rec_char_dict_path=../ppocr/utils/ppocr_keys_v1.txt \ --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt \ - --output=../output/table \ + --output=../output \ --vis_font_path=../doc/fonts/simfang.ttf ``` -运行完成后,每张图片会在`output`字段指定的目录下的`talbe`目录下有一个同名目录,图片里的每个表格会存储为一个excel,图片区域会被裁剪之后保存下来,excel文件和图片名名为表格在图片里的坐标。 +运行完成后,每张图片会在`output`字段指定的目录下的`structure`目录下有一个同名目录,图片里的每个表格会存储为一个excel,图片区域会被裁剪之后保存下来,excel文件和图片名为表格在图片里的坐标。详细的结果会存储在`res.txt`文件中。 + + +### 1.2 版面分析 +```bash +python3 predict_system.py --image_dir=./docs/table/1.png --table=false --ocr=false --output=../output/ +``` +运行完成后,每张图片会在`output`字段指定的目录下的`structure`目录下有一个同名目录,图片区域会被裁剪之后保存下来,图片名为表格在图片里的坐标。版面分析结果会存储在`res.txt`文件中。 + + +### 1.3 表格识别 +```bash +python3 predict_system.py --det_model_dir=inference/ch_PP-OCRv2_det_slim_quant_infer \ + --rec_model_dir=inference/ch_PP-OCRv2_rec_slim_quant_infer \ + --table_model_dir=inference/en_ppocr_mobile_v2.0_table_structure_infer \ + --image_dir=./docs/table/table.jpg \ + --rec_char_dict_path=../ppocr/utils/ppocr_keys_v1.txt \ + --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt \ + --output=../output \ + --vis_font_path=../doc/fonts/simfang.ttf \ + --layout=false +``` +运行完成后,每张图片会在`output`字段指定的目录下的`structure`目录下有一个同名目录,表格会存储为一个excel,excel文件名为`[0,0,img_h,img_w]`。 ## 2. DocVQA @@ -47,4 +77,4 @@ python3 predict_system.py --model_name_or_path=vqa/PP-Layout_v1.0_ser_pretrained --image_dir=vqa/images/input/zh_val_0.jpg \ --vis_font_path=../doc/fonts/simfang.ttf ``` -运行完成后,每张图片会在`output`字段指定的目录下的`vqa`目录下存放可视化之后的图片,图片名和输入图片名一致。 \ No newline at end of file +运行完成后,每张图片会在`output`字段指定的目录下的`vqa`目录下存放可视化之后的图片,图片名和输入图片名一致。 diff --git a/ppstructure/docs/inference_en.md b/ppstructure/docs/inference_en.md index bfcdbd0c07da6e3a9168c3b7464183ac5dfba536..2a0fb30543eaa06c4ede5f82a443135c959db37d 100644 --- a/ppstructure/docs/inference_en.md +++ b/ppstructure/docs/inference_en.md @@ -1,34 +1,66 @@ -# 基于Python预测引擎推理 +# Python Inference -- [版面分析+表格识别](#1) -- [DocVQA](#2) +- [1. Structure](#1) + - [1.1 layout analysis + table recognition](#1.1) + - [1.2 layout analysis](#1.2) + - [1.3 table recognition](#1.3) +- [2. DocVQA](#2) -## 1. 版面分析+表格识别 +## 1. 
Structure
+Go to the `ppstructure` directory
 ```bash
 cd ppstructure
+```
 
-# 下载模型
+Download models
+
+```bash
 mkdir inference && cd inference
-# 下载PP-OCRv2文本检测模型并解压
+# Download the PP-OCRv2 text detection model and unzip it
 wget https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_slim_quant_infer.tar && tar xf ch_PP-OCRv2_det_slim_quant_infer.tar
-# 下载PP-OCRv2文本识别模型并解压
+# Download the PP-OCRv2 text recognition model and unzip it
 wget https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_infer.tar && tar xf ch_PP-OCRv2_rec_slim_quant_infer.tar
-# 下载超轻量级英文表格预测模型并解压
+# Download the ultra-lightweight English table structure model and unzip it
 wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar && tar xf en_ppocr_mobile_v2.0_table_structure_infer.tar
 cd ..
-
+```
+
+### 1.1 layout analysis + table recognition
+```bash
 python3 predict_system.py --det_model_dir=inference/ch_PP-OCRv2_det_slim_quant_infer \
                           --rec_model_dir=inference/ch_PP-OCRv2_rec_slim_quant_infer \
                           --table_model_dir=inference/en_ppocr_mobile_v2.0_table_structure_infer \
-                          --image_dir=../doc/table/1.png \
+                          --image_dir=./docs/table/1.png \
                           --rec_char_dict_path=../ppocr/utils/ppocr_keys_v1.txt \
                           --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt \
-                          --output=../output/table \
+                          --output=../output \
                           --vis_font_path=../doc/fonts/simfang.ttf
 ```
-运行完成后,每张图片会在`output`字段指定的目录下的`talbe`目录下有一个同名目录,图片里的每个表格会存储为一个excel,图片区域会被裁剪之后保存下来,excel文件和图片名名为表格在图片里的坐标。
+After the operation is completed, each image will have a directory with the same name in the `structure` directory under the directory specified by the `output` field. Each table in the image is stored as an excel file, and each picture area is cropped and saved; both are named after their coordinates in the image. Detailed results are stored in the `res.txt` file.
+
+
+### 1.2 layout analysis
+```bash
+python3 predict_system.py --image_dir=./docs/table/1.png --table=false --ocr=false --output=../output/
+```
+After the operation is completed, each image will have a directory with the same name in the `structure` directory under the directory specified by the `output` field. Each picture area in the image is cropped and saved, named after its coordinates in the image. The layout analysis results are stored in the `res.txt` file.
+
+
+### 1.3 table recognition
+```bash
+python3 predict_system.py --det_model_dir=inference/ch_PP-OCRv2_det_slim_quant_infer \
+                          --rec_model_dir=inference/ch_PP-OCRv2_rec_slim_quant_infer \
+                          --table_model_dir=inference/en_ppocr_mobile_v2.0_table_structure_infer \
+                          --image_dir=./docs/table/table.jpg \
+                          --rec_char_dict_path=../ppocr/utils/ppocr_keys_v1.txt \
+                          --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt \
+                          --output=../output \
+                          --vis_font_path=../doc/fonts/simfang.ttf \
+                          --layout=false
+```
+After the operation is completed, each image will have a directory with the same name in the `structure` directory under the directory specified by the `output` field. Each table is stored as an excel file, named after its coordinates in the image.
 
 ## 2. DocVQA
 
@@ -36,9 +68,8 @@
 
 ```bash
 cd ppstructure
-# 下载模型
+# download model
 mkdir inference && cd inference
-# 下载SER xfun 模型并解压
 wget https://paddleocr.bj.bcebos.com/pplayout/PP-Layout_v1.0_ser_pretrained.tar && tar xf PP-Layout_v1.0_ser_pretrained.tar
 cd ..
 
@@ -47,4 +78,4 @@ python3 predict_system.py --model_name_or_path=vqa/PP-Layout_v1.0_ser_pretrained
                           --image_dir=vqa/images/input/zh_val_0.jpg \
                           --vis_font_path=../doc/fonts/simfang.ttf
 ```
-运行完成后,每张图片会在`output`字段指定的目录下的`vqa`目录下存放可视化之后的图片,图片名和输入图片名一致。
\ No newline at end of file
+After the operation is completed, the visualized image of each input is saved in the `vqa` directory under the directory specified by the `output` field, with the same name as the input image.
diff --git a/ppstructure/docs/models_list_en.md b/ppstructure/docs/models_list_en.md
index c7dab999ff6e370c56c5495e22e91f117b3d1275..b92c10c241df72c85649b64f915b4266cd3fe410 100644
--- a/ppstructure/docs/models_list_en.md
+++ b/ppstructure/docs/models_list_en.md
@@ -1,56 +1,56 @@
-# PP-Structure 系列模型列表
+# PP-Structure Model List
 
-- [1. 版面分析模型](#1)
-- [2. OCR和表格识别模型](#2)
+- [1. Layout Analysis](#1)
+- [2. OCR and Table Recognition](#2)
   - [2.1 OCR](#21)
-  - [2.2 表格识别模型](#22)
-- [3. VQA模型](#3)
-- [4. KIE模型](#4)
+  - [2.2 Table Recognition](#22)
+- [3. VQA](#3)
+- [4. KIE](#4)
 
-## 1. 版面分析模型
+## 1. Layout Analysis
 
-|模型名称|模型简介|下载地址|label_map|
-| --- | --- | --- | --- |
-| ppyolov2_r50vd_dcn_365e_publaynet | PubLayNet 数据集训练的版面分析模型,可以划分**文字、标题、表格、图片以及列表**5类区域 | [推理模型](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_publaynet.tar) / [训练模型](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_publaynet_pretrained.pdparams) |{0: "Text", 1: "Title", 2: "List", 3:"Table", 4:"Figure"}|
-| ppyolov2_r50vd_dcn_365e_tableBank_word | TableBank Word 数据集训练的版面分析模型,只能检测表格 | [推理模型](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_tableBank_word.tar) | {0:"Table"}|
-| ppyolov2_r50vd_dcn_365e_tableBank_latex | TableBank Latex 数据集训练的版面分析模型,只能检测表格 | [推理模型](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_tableBank_latex.tar) | {0:"Table"}|
+|model name| description |download|label_map|
+| --- | --- | --- | --- |
+| ppyolov2_r50vd_dcn_365e_publaynet | Layout analysis model trained on the PubLayNet dataset; it can recognize 5 types of regions: **text, title, table, picture and list** | [inference model](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_publaynet.tar) / [trained model](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_publaynet_pretrained.pdparams) |{0: "Text", 1: "Title", 2: "List", 3:"Table", 4:"Figure"}|
+| ppyolov2_r50vd_dcn_365e_tableBank_word | Layout analysis model trained on the TableBank Word dataset; it can only detect tables | [inference model](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_tableBank_word.tar) | {0:"Table"}|
+| ppyolov2_r50vd_dcn_365e_tableBank_latex | Layout analysis model trained on the TableBank Latex dataset; it can only detect tables | [inference model](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_tableBank_latex.tar) | {0:"Table"}|
 
-## 2. OCR和表格识别模型
+## 2. OCR and Table Recognition
 
 ### 2.1 OCR
 
-|模型名称|模型简介|推理模型大小|下载地址|
-| --- | --- | --- | --- |
-|en_ppocr_mobile_v2.0_table_det|PubLayNet数据集训练的英文表格场景的文字检测|4.7M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/table/en_ppocr_mobile_v2.0_table_det_train.tar) |
-|en_ppocr_mobile_v2.0_table_rec|PubLayNet数据集训练的英文表格场景的文字识别|6.9M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/table/en_ppocr_mobile_v2.0_table_rec_train.tar) |
+|model name| description | inference model size |download|
+| --- | --- | --- | --- |
+|en_ppocr_mobile_v2.0_table_det| Text detection model for English table scenes, trained on the PubTabNet dataset | 4.7M |[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/table/en_ppocr_mobile_v2.0_table_det_train.tar) |
+|en_ppocr_mobile_v2.0_table_rec| Text recognition model for English table scenes, trained on the PubTabNet dataset | 6.9M |[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/table/en_ppocr_mobile_v2.0_table_rec_train.tar) |
 
-如需要使用其他OCR模型,可以在 [PP-OCR model_list](../../doc/doc_ch/models_list.md) 下载模型或者使用自己训练好的模型配置到 `det_model_dir`, `rec_model_dir`两个字段即可。
+To use other OCR models, download them from the [PP-OCR model_list](../../doc/doc_ch/models_list.md), or point the `det_model_dir` and `rec_model_dir` fields to your own trained models.
 
-### 2.2 表格识别模型
+### 2.2 Table Recognition
 
-|模型名称|模型简介|推理模型大小|下载地址|
-| --- | --- | --- | --- |
-|en_ppocr_mobile_v2.0_table_structure|PubLayNet数据集训练的英文表格场景的表格结构预测|18.6M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/table/en_ppocr_mobile_v2.0_table_structure_train.tar) |
+|model| description |inference model size|download|
+| --- | --- | --- | --- |
+|en_ppocr_mobile_v2.0_table_structure| Table structure model for English table scenes, trained on the PubTabNet dataset |18.6M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/table/en_ppocr_mobile_v2.0_table_structure_train.tar) |
 
-## 3. VQA模型
+## 3. VQA
 
-|模型名称|模型简介|推理模型大小|下载地址|
-| --- | --- | --- | --- |
-|ser_LayoutXLM_xfun_zh|基于LayoutXLM在xfun中文数据集上训练的SER模型|1.4G|[推理模型 coming soon]() / [训练模型](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutXLM_xfun_zh.tar) |
-|re_LayoutXLM_xfun_zh|基于LayoutXLM在xfun中文数据集上训练的RE模型|1.4G|[推理模型 coming soon]() / [训练模型](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh.tar) |
-|ser_LayoutLMv2_xfun_zh|基于LayoutLMv2在xfun中文数据集上训练的SER模型|778M|[推理模型 coming soon]() / [训练模型](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLMv2_xfun_zh.tar) |
-|re_LayoutLMv2_xfun_zh|基于LayoutLMv2在xfun中文数据集上训练的RE模型|765M|[推理模型 coming soon]() / [训练模型](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutLMv2_xfun_zh.tar) |
-|ser_LayoutLM_xfun_zh|基于LayoutLM在xfun中文数据集上训练的SER模型|430M|[推理模型 coming soon]() / [训练模型](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLM_xfun_zh.tar) |
+|model| description |inference model size|download|
+| --- | --- | --- | --- |
+|ser_LayoutXLM_xfun_zh| SER model trained on the xfun Chinese dataset based on LayoutXLM |1.4G|[inference model coming soon]() / [trained model](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutXLM_xfun_zh.tar) |
+|re_LayoutXLM_xfun_zh| RE model trained on the xfun Chinese dataset based on LayoutXLM |1.4G|[inference model coming soon]() / [trained model](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh.tar) |
+|ser_LayoutLMv2_xfun_zh| SER model trained on the xfun Chinese dataset based on LayoutLMv2 |778M|[inference model coming soon]() / [trained model](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLMv2_xfun_zh.tar) |
+|re_LayoutLMv2_xfun_zh| RE model trained on the xfun Chinese dataset based on LayoutLMv2 |765M|[inference model coming soon]() / [trained model](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutLMv2_xfun_zh.tar) |
+|ser_LayoutLM_xfun_zh| SER model trained on the xfun Chinese dataset based on LayoutLM |430M|[inference model coming soon]() / [trained model](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLM_xfun_zh.tar) |
 
-## 4. KIE模型
+## 4. KIE
 
-|模型名称|模型简介|模型大小|下载地址|
+|model|description|model size|download|
 | --- | --- | --- | --- |
-|SDMGR|关键信息提取模型|78M|[推理模型 coming soon]() / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/kie/kie_vgg16.tar)|
+|SDMGR|Key information extraction model|78M|[inference model coming soon]() / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/kie/kie_vgg16.tar)|
diff --git a/ppstructure/docs/quickstart.md b/ppstructure/docs/quickstart.md
index 45643de003c3bdf9c22d43dd9c1118026f8ae34f..6610035d1442f988ac69763724ce78f6db35ae20 100644
--- a/ppstructure/docs/quickstart.md
+++ b/ppstructure/docs/quickstart.md
@@ -4,10 +4,14 @@
 - [2. 便捷使用](#2)
   - [2.1 命令行使用](#21)
     - [2.1.1 版面分析+表格识别](#211)
-    - [2.1.2 DocVQA](#212)
-  - [2.2 Python脚本使用](#22)
+    - [2.1.2 版面分析](#212)
+    - [2.1.3 表格识别](#213)
+    - [2.1.4 DocVQA](#214)
+  - [2.2 代码使用](#22)
     - [2.2.1 版面分析+表格识别](#221)
-    - [2.2.2 DocVQA](#222)
+    - [2.2.2 版面分析](#222)
+    - [2.2.3 表格识别](#223)
+    - [2.2.4 DocVQA](#224)
   - [2.3 返回结果说明](#23)
     - [2.3.1 版面分析+表格识别](#231)
     - [2.3.2 DocVQA](#232)
@@ -18,10 +22,10 @@
 
 ## 1. 
安装依赖包 ```bash -# 安装 paddleocr,推荐使用2.3.0.2+版本 -pip3 install "paddleocr>=2.3.0.2" +# 安装 paddleocr,推荐使用2.5+版本 +pip3 install "paddleocr>=2.5" # 安装 版面分析依赖包layoutparser(如不需要版面分析功能,可跳过) -pip3 install -U https://paddleocr.bj.bcebos.com/whl/layoutparser-0.0.0-py3-none-any.whl +pip3 install -U https://paddleocr.bj.bcebos.com/whl/layoutparser-0.0.0-py3-none-any.whl # 安装 DocVQA依赖包paddlenlp(如不需要DocVQA功能,可跳过) pip install paddlenlp @@ -32,20 +36,32 @@ pip install paddlenlp ### 2.1 命令行使用 - + #### 2.1.1 版面分析+表格识别 ```bash -paddleocr --image_dir=../doc/table/1.png --type=structure +paddleocr --image_dir=PaddleOCR/ppstructure/docs/table/1.png --type=structure ``` -#### 2.1.2 DocVQA +#### 2.1.2 版面分析 +```bash +paddleocr --image_dir=PaddleOCR/ppstructure/docs/table/1.png --type=structure --table=false --ocr=false +``` + + +#### 2.1.3 表格识别 +```bash +paddleocr --image_dir=PaddleOCR/ppstructure/docs/table/table.jpg --type=structure --layout=false +``` + + +#### 2.1.4 DocVQA 请参考:[文档视觉问答](../vqa/README.md)。 -### 2.2 Python脚本使用 +### 2.2 代码使用 #### 2.2.1 版面分析+表格识别 @@ -57,8 +73,8 @@ from paddleocr import PPStructure,draw_structure_result,save_structure_res table_engine = PPStructure(show_log=True) -save_folder = './output/table' -img_path = '../doc/table/1.png' +save_folder = './output' +img_path = 'PaddleOCR/ppstructure/docs/table/1.png' img = cv2.imread(img_path) result = table_engine(img) save_structure_res(result, save_folder,os.path.basename(img_path).split('.')[0]) @@ -69,7 +85,7 @@ for line in result: from PIL import Image -font_path = '../doc/fonts/simfang.ttf' # PaddleOCR下提供字体包 +font_path = 'PaddleOCR/doc/fonts/simfang.ttf' # PaddleOCR下提供字体包 image = Image.open(img_path).convert('RGB') im_show = draw_structure_result(image, result,font_path=font_path) im_show = Image.fromarray(im_show) @@ -77,7 +93,49 @@ im_show.save('result.jpg') ``` -#### 2.2.2 DocVQA +#### 2.2.2 版面分析 + +```python +import os +import cv2 +from paddleocr import PPStructure,save_structure_res + +table_engine = PPStructure(table=False, ocr=False, show_log=True) + +save_folder = './output' +img_path = 'PaddleOCR/ppstructure/docs/table/1.png' +img = cv2.imread(img_path) +result = table_engine(img) +save_structure_res(result, save_folder, os.path.basename(img_path).split('.')[0]) + +for line in result: + line.pop('img') + print(line) +``` + + +#### 2.2.3 表格识别 + +```python +import os +import cv2 +from paddleocr import PPStructure,save_structure_res + +table_engine = PPStructure(layout=False, show_log=True) + +save_folder = './output' +img_path = 'PaddleOCR/ppstructure/docs/table/table.jpg' +img = cv2.imread(img_path) +result = table_engine(img) +save_structure_res(result, save_folder, os.path.basename(img_path).split('.')[0]) + +for line in result: + line.pop('img') + print(line) +``` + + +#### 2.2.4 DocVQA 请参考:[文档视觉问答](../vqa/README.md)。 @@ -98,11 +156,11 @@ PP-Structure的返回结果为一个dict组成的list,示例如下 ``` dict 里各个字段说明如下 -| 字段 | 说明 | -| --------------- | -------------| -|type|图片区域的类型| -|bbox|图片区域的在原图的坐标,分别[左上角x,左上角y,右下角x,右下角y]| -|res|图片区域的OCR或表格识别结果。
表格: 表格的HTML字符串;
OCR: 一个包含各个单行文字的检测坐标和识别结果的元组| +| 字段 | 说明 | +| --------------- |-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +|type| 图片区域的类型 | +|bbox| 图片区域的在原图的坐标,分别[左上角x,左上角y,右下角x,右下角y] | +|res| 图片区域的OCR或表格识别结果。
表格: 一个dict,字段说明如下
        `html`: 表格的HTML字符串
        在代码使用模式下,前向传入return_ocr_result_in_table=True可以拿到表格中每个文本的检测识别结果,对应为如下字段:
        `boxes`: 文本检测坐标
        `rec_res`: 文本识别结果。
OCR: 一个包含各个单行文字的检测坐标和识别结果的元组 | 运行完成后,每张图片会在`output`字段指定的目录下有一个同名目录,图片里的每个表格会存储为一个excel,图片区域会被裁剪之后保存下来,excel文件和图片名为表格在图片里的坐标。 @@ -110,8 +168,8 @@ dict 里各个字段说明如下 /output/table/1/ └─ res.txt └─ [454, 360, 824, 658].xlsx 表格识别结果 - └─ [16, 2, 828, 305].jpg 被裁剪出的图片区域 - └─ [17, 361, 404, 711].xlsx 表格识别结果 + └─ [16, 2, 828, 305].jpg 被裁剪出的图片区域 + └─ [17, 361, 404, 711].xlsx 表格识别结果 ``` @@ -122,17 +180,19 @@ dict 里各个字段说明如下 ### 2.4 参数说明 -| 字段 | 说明 | 默认值 | -| --------------- | ---------------------------------------- | ------------------------------------------- | -| output | excel和识别结果保存的地址 | ./output/table | -| table_max_len | 表格结构模型预测时,图像的长边resize尺度 | 488 | -| table_model_dir | 表格结构模型 inference 模型地址 | None | -| table_char_dict_path | 表格结构模型所用字典地址 | ../ppocr/utils/dict/table_structure_dict.txt | -| layout_path_model | 版面分析模型模型地址,可以为在线地址或者本地地址,当为本地地址时,需要指定 layout_label_map, 命令行模式下可通过--layout_label_map='{0: "Text", 1: "Title", 2: "List", 3:"Table", 4:"Figure"}' 指定 | lp://PubLayNet/ppyolov2_r50vd_dcn_365e_publaynet/config | -| layout_label_map | 版面分析模型模型label映射字典 | None | -| model_name_or_path | VQA SER模型地址 | None | -| max_seq_length | VQA SER模型最大支持token长度 | 512 | -| label_map_path | VQA SER 标签文件地址 | ./vqa/labels/labels_ser.txt | -| mode | pipeline预测模式,structure: 版面分析+表格识别; VQA: SER文档信息抽取 | structure | +| 字段 | 说明 | 默认值 | +|----------------------|----------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------| +| output | excel和识别结果保存的地址 | ./output/table | +| table_max_len | 表格结构模型预测时,图像的长边resize尺度 | 488 | +| table_model_dir | 表格结构模型 inference 模型地址 | None | +| table_char_dict_path | 表格结构模型所用字典地址 | ../ppocr/utils/dict/table_structure_dict.txt | +| layout_path_model | 版面分析模型模型地址,可以为在线地址或者本地地址,当为本地地址时,需要指定 layout_label_map, 命令行模式下可通过--layout_label_map='{0: "Text", 1: "Title", 2: "List", 3:"Table", 4:"Figure"}' 指定 | lp://PubLayNet/ppyolov2_r50vd_dcn_365e_publaynet/config | +| layout_label_map | 版面分析模型模型label映射字典 | None | +| model_name_or_path | VQA SER模型地址 | None | +| max_seq_length | VQA SER模型最大支持token长度 | 512 | +| label_map_path | VQA SER 标签文件地址 | ./vqa/labels/labels_ser.txt | +| layout | 前向中是否执行版面分析 | True | +| table | 前向中是否执行表格识别 | True | +| ocr | 对于版面分析中的非表格区域,是否执行ocr。当layout为False时会被自动设置为False | True | 大部分参数和PaddleOCR whl包保持一致,见 [whl包文档](../../doc/doc_ch/whl.md) diff --git a/ppstructure/docs/quickstart_en.md b/ppstructure/docs/quickstart_en.md index 45643de003c3bdf9c22d43dd9c1118026f8ae34f..853436ff07e665fb140a749e8ccbde4392ea5c13 100644 --- a/ppstructure/docs/quickstart_en.md +++ b/ppstructure/docs/quickstart_en.md @@ -1,54 +1,70 @@ -# PP-Structure 快速开始 - -- [1. 安装依赖包](#1) -- [2. 便捷使用](#2) - - [2.1 命令行使用](#21) - - [2.1.1 版面分析+表格识别](#211) - - [2.1.2 DocVQA](#212) - - [2.2 Python脚本使用](#22) - - [2.2.1 版面分析+表格识别](#221) - - [2.2.2 DocVQA](#222) - - [2.3 返回结果说明](#23) - - [2.3.1 版面分析+表格识别](#231) +# PP-Structure Quick Start + +- [1. Install package](#1) +- [2. 
Use](#2) + - [2.1 Use by command line](#21) + - [2.1.1 layout analysis + table recognition](#211) + - [2.1.2 layout analysis](#212) + - [2.1.3 table recognition](#213) + - [2.1.4 DocVQA](#214) + - [2.2 Use by code](#22) + - [2.2.1 layout analysis + table recognition](#221) + - [2.2.2 layout analysis](#222) + - [2.2.3 table recognition](#223) + - [2.2.4 DocVQA](#224) + - [2.3 Result description](#23) + - [2.3.1 layout analysis + table recognition](#231) - [2.3.2 DocVQA](#232) - - [2.4 参数说明](#24) + - [2.4 Parameter Description](#24) -## 1. 安装依赖包 +## 1. Install package ```bash -# 安装 paddleocr,推荐使用2.3.0.2+版本 -pip3 install "paddleocr>=2.3.0.2" -# 安装 版面分析依赖包layoutparser(如不需要版面分析功能,可跳过) -pip3 install -U https://paddleocr.bj.bcebos.com/whl/layoutparser-0.0.0-py3-none-any.whl -# 安装 DocVQA依赖包paddlenlp(如不需要DocVQA功能,可跳过) +# Install paddleocr, version 2.5+ is recommended +pip3 install "paddleocr>=2.5" +# Install layoutparser (if you do not use the layout analysis, you can skip it) +pip3 install -U https://paddleocr.bj.bcebos.com/whl/layoutparser-0.0.0-py3-none-any.whl +# Install the DocVQA dependency package paddlenlp (if you do not use the DocVQA, you can skip it) pip install paddlenlp ``` -## 2. 便捷使用 +## 2. Use -### 2.1 命令行使用 - +### 2.1 Use by command line + -#### 2.1.1 版面分析+表格识别 +#### 2.1.1 layout analysis + table recognition ```bash -paddleocr --image_dir=../doc/table/1.png --type=structure +paddleocr --image_dir=PaddleOCR/ppstructure/docs/table/1.png --type=structure ``` -#### 2.1.2 DocVQA +#### 2.1.2 layout analysis +```bash +paddleocr --image_dir=PaddleOCR/ppstructure/docs/table/1.png --type=structure --table=false --ocr=false +``` + + +#### 2.1.3 table recognition +```bash +paddleocr --image_dir=PaddleOCR/ppstructure/docs/table/table.jpg --type=structure --layout=false +``` + + +#### 2.1.4 DocVQA -请参考:[文档视觉问答](../vqa/README.md)。 +Please refer to: [Documentation Visual Q&A](../vqa/README.md) . 
-### 2.2 Python脚本使用 +### 2.2 Use by code -#### 2.2.1 版面分析+表格识别 +#### 2.2.1 layout analysis + table recognition ```python import os @@ -57,8 +73,8 @@ from paddleocr import PPStructure,draw_structure_result,save_structure_res table_engine = PPStructure(show_log=True) -save_folder = './output/table' -img_path = '../doc/table/1.png' +save_folder = './output' +img_path = 'PaddleOCR/ppstructure/docs/table/1.png' img = cv2.imread(img_path) result = table_engine(img) save_structure_res(result, save_folder,os.path.basename(img_path).split('.')[0]) @@ -69,7 +85,7 @@ for line in result: from PIL import Image -font_path = '../doc/fonts/simfang.ttf' # PaddleOCR下提供字体包 +font_path = 'PaddleOCR/doc/fonts/simfang.ttf' # PaddleOCR下提供字体包 image = Image.open(img_path).convert('RGB') im_show = draw_structure_result(image, result,font_path=font_path) im_show = Image.fromarray(im_show) @@ -77,16 +93,59 @@ im_show.save('result.jpg') ``` -#### 2.2.2 DocVQA +#### 2.2.2 layout analysis -请参考:[文档视觉问答](../vqa/README.md)。 +```python +import os +import cv2 +from paddleocr import PPStructure,save_structure_res + +table_engine = PPStructure(table=False, ocr=False, show_log=True) + +save_folder = './output' +img_path = 'PaddleOCR/ppstructure/docs/table/1.png' +img = cv2.imread(img_path) +result = table_engine(img) +save_structure_res(result, save_folder, os.path.basename(img_path).split('.')[0]) + +for line in result: + line.pop('img') + print(line) +``` + + +#### 2.2.3 table recognition + +```python +import os +import cv2 +from paddleocr import PPStructure,save_structure_res + +table_engine = PPStructure(layout=False, show_log=True) + +save_folder = './output' +img_path = 'PaddleOCR/ppstructure/docs/table/table.jpg' +img = cv2.imread(img_path) +result = table_engine(img) +save_structure_res(result, save_folder, os.path.basename(img_path).split('.')[0]) + +for line in result: + line.pop('img') + print(line) +``` + + +#### 2.2.4 DocVQA + +Please refer to: [Documentation Visual Q&A](../vqa/README.md) . -### 2.3 返回结果说明 -PP-Structure的返回结果为一个dict组成的list,示例如下 +### 2.3 Result description + +The return of PP-Structure is a list of dicts, the example is as follows: -#### 2.3.1 版面分析+表格识别 +#### 2.3.1 layout analysis + table recognition ```shell [ { 'type': 'Text', @@ -96,43 +155,44 @@ PP-Structure的返回结果为一个dict组成的list,示例如下 } ] ``` -dict 里各个字段说明如下 - -| 字段 | 说明 | -| --------------- | -------------| -|type|图片区域的类型| -|bbox|图片区域的在原图的坐标,分别[左上角x,左上角y,右下角x,右下角y]| -|res|图片区域的OCR或表格识别结果。
表格: 表格的HTML字符串;
OCR: 一个包含各个单行文字的检测坐标和识别结果的元组| +Each field in dict is described as follows: -运行完成后,每张图片会在`output`字段指定的目录下有一个同名目录,图片里的每个表格会存储为一个excel,图片区域会被裁剪之后保存下来,excel文件和图片名为表格在图片里的坐标。 +| field | description | +| --------------- |--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +|type| Type of image area. | +|bbox| The coordinates of the image area in the original image, respectively [upper left corner x, upper left corner y, lower right corner x, lower right corner y]. | +|res| OCR or table recognition result of the image area.
table: a dict with field descriptions as follows:
        `html`: the HTML string of the table.
        In the code usage mode, set return_ocr_result_in_table=True when calling to get the detection and recognition results of each text in the table area, corresponding to the following fields:
        `boxes`: text detection boxes.
        `rec_res`: text recognition results.
OCR: A tuple containing the detection boxes and recognition results of each single text. |
 
+After the recognition is completed, each image will have a directory with the same name under the directory specified by the `output` field. Each table in the image is stored as an excel file, and each picture area is cropped and saved; both are named after their coordinates in the image.
 ```
 /output/table/1/
   └─ res.txt
-  └─ [454, 360, 824, 658].xlsx 表格识别结果
-  └─ [16, 2, 828, 305].jpg 被裁剪出的图片区域
-  └─ [17, 361, 404, 711].xlsx  表格识别结果
+  └─ [454, 360, 824, 658].xlsx  table recognition result
+  └─ [16, 2, 828, 305].jpg  cropped picture area
+  └─ [17, 361, 404, 711].xlsx  table recognition result
 ```
 
 #### 2.3.2 DocVQA
 
-请参考:[文档视觉问答](../vqa/README.md)。
+Please refer to: [Documentation Visual Q&A](../vqa/README.md).
 
-### 2.4 参数说明
-
-| 字段 | 说明 | 默认值 |
-| --------------- | ---------------------------------------- | ------------------------------------------- |
-| output | excel和识别结果保存的地址 | ./output/table |
-| table_max_len | 表格结构模型预测时,图像的长边resize尺度 | 488 |
-| table_model_dir | 表格结构模型 inference 模型地址 | None |
-| table_char_dict_path | 表格结构模型所用字典地址 | ../ppocr/utils/dict/table_structure_dict.txt |
-| layout_path_model | 版面分析模型模型地址,可以为在线地址或者本地地址,当为本地地址时,需要指定 layout_label_map, 命令行模式下可通过--layout_label_map='{0: "Text", 1: "Title", 2: "List", 3:"Table", 4:"Figure"}' 指定 | lp://PubLayNet/ppyolov2_r50vd_dcn_365e_publaynet/config |
-| layout_label_map | 版面分析模型模型label映射字典 | None |
-| model_name_or_path | VQA SER模型地址 | None |
-| max_seq_length | VQA SER模型最大支持token长度 | 512 |
-| label_map_path | VQA SER 标签文件地址 | ./vqa/labels/labels_ser.txt |
-| mode | pipeline预测模式,structure: 版面分析+表格识别; VQA: SER文档信息抽取 | structure |
-
-大部分参数和PaddleOCR whl包保持一致,见 [whl包文档](../../doc/doc_ch/whl.md)
+### 2.4 Parameter Description
+
+| field | description | default |
+| --- | --- | --- |
+| output | The save path of the results | ./output/table |
+| table_max_len | The size to which the long side of the image is resized when the table structure model predicts | 488 |
+| table_model_dir | The path of the table structure model | None |
+| table_char_dict_path | The dictionary path of the table structure model | ../ppocr/utils/dict/table_structure_dict.txt |
+| layout_path_model | The model path of the layout analysis model, which can be an online address or a local path. When it is a local path, layout_label_map needs to be set. In command line mode, use --layout_label_map='{0: "Text", 1: "Title", 2: "List", 3:"Table", 4:"Figure"}' | lp://PubLayNet/ppyolov2_r50vd_dcn_365e_publaynet/config |
+| layout_label_map | The label map dictionary of the layout analysis model | None |
+| model_name_or_path | The model path of the VQA SER model | None |
+| max_seq_length | The maximum token length supported by the VQA SER model | 512 |
+| label_map_path | The label file path of the VQA SER model | ./vqa/labels/labels_ser.txt |
+| layout | Whether to perform layout analysis in the forward pass | True |
+| table | Whether to perform table recognition in the forward pass | True |
+| ocr | Whether to perform OCR for non-table areas in layout analysis. 
When layout is False, it will be automatically set to False | True | + +Most of the parameters are consistent with the PaddleOCR whl package, see [whl package documentation](../../doc/doc_en/whl.md) diff --git a/ppstructure/predict_system.py b/ppstructure/predict_system.py index 96227aabbbf38904417f3e3a6fd6c49031c4bc58..7f18fcdf8e6b57be6e129f3271f5bb583f4da616 100644 --- a/ppstructure/predict_system.py +++ b/ppstructure/predict_system.py @@ -23,9 +23,10 @@ sys.path.append(os.path.abspath(os.path.join(__dir__, '..'))) os.environ["FLAGS_allocator_strategy"] = 'auto_growth' import cv2 import json -import numpy as np import time import logging +from copy import deepcopy +from attrdict import AttrDict from ppocr.utils.utility import get_image_file_list, check_and_read_gif from ppocr.utils.logging import get_logger @@ -40,97 +41,122 @@ class StructureSystem(object): def __init__(self, args): self.mode = args.mode if self.mode == 'structure': - import layoutparser as lp - # args.det_limit_type = 'resize_long' - args.drop_score = 0 if not args.show_log: logger.setLevel(logging.INFO) - self.text_system = TextSystem(args) - self.table_system = TableSystem(args, - self.text_system.text_detector, - self.text_system.text_recognizer) - - config_path = None - model_path = None - if os.path.isdir(args.layout_path_model): - model_path = args.layout_path_model + if args.layout == False and args.ocr == True: + args.ocr = False + logger.warning( + "When args.layout is false, args.ocr is automatically set to false" + ) + args.drop_score = 0 + # init layout and ocr model + self.text_system = None + if args.layout: + import layoutparser as lp + config_path = None + model_path = None + if os.path.isdir(args.layout_path_model): + model_path = args.layout_path_model + else: + config_path = args.layout_path_model + self.table_layout = lp.PaddleDetectionLayoutModel( + config_path=config_path, + model_path=model_path, + label_map=args.layout_label_map, + threshold=0.5, + enable_mkldnn=args.enable_mkldnn, + enforce_cpu=not args.use_gpu, + thread_num=args.cpu_threads) + if args.ocr: + self.text_system = TextSystem(args) + else: + self.table_layout = None + if args.table: + if self.text_system is not None: + self.table_system = TableSystem( + args, self.text_system.text_detector, + self.text_system.text_recognizer) + else: + self.table_system = TableSystem(args) else: - config_path = args.layout_path_model - self.table_layout = lp.PaddleDetectionLayoutModel( - config_path=config_path, - model_path=model_path, - label_map=args.layout_label_map, - threshold=0.5, - enable_mkldnn=args.enable_mkldnn, - enforce_cpu=not args.use_gpu, - thread_num=args.cpu_threads) - self.use_angle_cls = args.use_angle_cls - self.drop_score = args.drop_score + self.table_system = None + elif self.mode == 'vqa': raise NotImplementedError - def __call__(self, img): + def __call__(self, img, return_ocr_result_in_table=False): if self.mode == 'structure': ori_im = img.copy() - layout_res = self.table_layout.detect(img[..., ::-1]) + if self.table_layout is not None: + layout_res = self.table_layout.detect(img[..., ::-1]) + else: + h, w = ori_im.shape[:2] + layout_res = [AttrDict(coordinates=[0, 0, w, h], type='Table')] res_list = [] for region in layout_res: + res = '' x1, y1, x2, y2 = region.coordinates x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2) roi_img = ori_im[y1:y2, x1:x2, :] if region.type == 'Table': - res = self.table_system(roi_img) + if self.table_system is not None: + res = self.table_system(roi_img, + return_ocr_result_in_table) 
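+                    # `res` is a dict with the table HTML under 'html'; when
+                    # return_ocr_result_in_table=True, TableSystem also fills
+                    # 'boxes' and 'rec_res' for the texts inside the table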
else: - filter_boxes, filter_rec_res = self.text_system(roi_img) - # remove style char - style_token = [ - '', '', '', '', '', - '', '', '', '', '', - '', '', '', '' - ] - res = [] - for box, rec_res in zip(filter_boxes, filter_rec_res): - rec_str, rec_conf = rec_res - for token in style_token: - if token in rec_str: - rec_str = rec_str.replace(token, '') - box += [x1, y1] - res.append({ - 'text': rec_str, - 'confidence': float(rec_conf), - 'text_region': box.tolist() - }) + if self.text_system is not None: + filter_boxes, filter_rec_res = self.text_system(roi_img) + # remove style char + style_token = [ + '', '', '', '', '', + '', '', '', '', + '', '', '', '', + '' + ] + res = [] + for box, rec_res in zip(filter_boxes, filter_rec_res): + rec_str, rec_conf = rec_res + for token in style_token: + if token in rec_str: + rec_str = rec_str.replace(token, '') + box += [x1, y1] + res.append({ + 'text': rec_str, + 'confidence': float(rec_conf), + 'text_region': box.tolist() + }) res_list.append({ 'type': region.type, 'bbox': [x1, y1, x2, y2], 'img': roi_img, 'res': res }) + return res_list elif self.mode == 'vqa': raise NotImplementedError - return res_list + return None def save_structure_res(res, save_folder, img_name): excel_save_folder = os.path.join(save_folder, img_name) os.makedirs(excel_save_folder, exist_ok=True) + res_cp = deepcopy(res) # save res with open( os.path.join(excel_save_folder, 'res.txt'), 'w', encoding='utf8') as f: - for region in res: - if region['type'] == 'Table': + for region in res_cp: + roi_img = region.pop('img') + f.write('{}\n'.format(json.dumps(region))) + + if region['type'] == 'Table' and len(region[ + 'res']) > 0 and 'html' in region['res']: excel_path = os.path.join(excel_save_folder, '{}.xlsx'.format(region['bbox'])) - to_excel(region['res'], excel_path) + to_excel(region['res']['html'], excel_path) elif region['type'] == 'Figure': - roi_img = region['img'] img_path = os.path.join(excel_save_folder, '{}.jpg'.format(region['bbox'])) cv2.imwrite(img_path, roi_img) - else: - for text_result in region['res']: - f.write('{}\n'.format(json.dumps(text_result))) def main(args): diff --git a/ppstructure/table/README.md b/ppstructure/table/README.md index 65d2cd22b6f18d06fe538ffe1fd243c0c0bfaa3c..d21ef4aa3813b4ff49dc0580be35c5e2e0483c8f 100644 --- a/ppstructure/table/README.md +++ b/ppstructure/table/README.md @@ -51,7 +51,7 @@ wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_tab wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar && tar xf en_ppocr_mobile_v2.0_table_structure_infer.tar cd .. 
# run -python3 table/predict_table.py --det_model_dir=inference/en_ppocr_mobile_v2.0_table_det_infer --rec_model_dir=inference/en_ppocr_mobile_v2.0_table_rec_infer --table_model_dir=inference/en_ppocr_mobile_v2.0_table_structure_infer --image_dir=../doc/table/table.jpg --rec_char_dict_path=../ppocr/utils/dict/table_dict.txt --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt --det_limit_side_len=736 --det_limit_type=min --output ../output/table +python3 table/predict_table.py --det_model_dir=inference/en_ppocr_mobile_v2.0_table_det_infer --rec_model_dir=inference/en_ppocr_mobile_v2.0_table_rec_infer --table_model_dir=inference/en_ppocr_mobile_v2.0_table_structure_infer --image_dir=./docs/table/table.jpg --rec_char_dict_path=../ppocr/utils/dict/table_dict.txt --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt --det_limit_side_len=736 --det_limit_type=min --output ./output/table ``` Note: The above model is trained on the PubLayNet dataset and only supports English scanning scenarios. If you need to identify other scenarios, you need to train the model yourself and replace the three fields `det_model_dir`, `rec_model_dir`, `table_model_dir`. diff --git a/ppstructure/table/README_ch.md b/ppstructure/table/README_ch.md index c68b8c88b92bdca3f3a6c777f5f4681093fa89f5..a0a64d6b7ebcb272e4b607975170a679abd036ab 100644 --- a/ppstructure/table/README_ch.md +++ b/ppstructure/table/README_ch.md @@ -61,7 +61,7 @@ wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_tab wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar && tar xf en_ppocr_mobile_v2.0_table_structure_infer.tar cd .. # 执行预测 -python3 table/predict_table.py --det_model_dir=inference/en_ppocr_mobile_v2.0_table_det_infer --rec_model_dir=inference/en_ppocr_mobile_v2.0_table_rec_infer --table_model_dir=inference/en_ppocr_mobile_v2.0_table_structure_infer --image_dir=../doc/table/table.jpg --rec_char_dict_path=../ppocr/utils/dict/table_dict.txt --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt --det_limit_side_len=736 --det_limit_type=min --output ../output/table +python3 table/predict_table.py --det_model_dir=inference/en_ppocr_mobile_v2.0_table_det_infer --rec_model_dir=inference/en_ppocr_mobile_v2.0_table_rec_infer --table_model_dir=inference/en_ppocr_mobile_v2.0_table_structure_infer --image_dir=./docs/table/table.jpg --rec_char_dict_path=../ppocr/utils/dict/table_dict.txt --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt --det_limit_side_len=736 --det_limit_type=min --output ./output/table ``` 运行完成后,每张图片的excel表格会保存到output字段指定的目录下 diff --git a/ppstructure/table/predict_table.py b/ppstructure/table/predict_table.py index 352ae84de1f435f91258cf0ced4dce9345de1220..402d6c24189d044e2ee6d359edef8624d4aae145 100644 --- a/ppstructure/table/predict_table.py +++ b/ppstructure/table/predict_table.py @@ -54,16 +54,20 @@ def expand(pix, det_box, shape): class TableSystem(object): def __init__(self, args, text_detector=None, text_recognizer=None): - self.text_detector = predict_det.TextDetector(args) if text_detector is None else text_detector - self.text_recognizer = predict_rec.TextRecognizer(args) if text_recognizer is None else text_recognizer + self.text_detector = predict_det.TextDetector( + args) if text_detector is None else text_detector + self.text_recognizer = predict_rec.TextRecognizer( + args) if text_recognizer is None else text_recognizer self.table_structurer = predict_strture.TableStructurer(args) 
- def __call__(self, img): + def __call__(self, img, return_ocr_result_in_table=False): + result = dict() ori_im = img.copy() structure_res, elapse = self.table_structurer(copy.deepcopy(img)) dt_boxes, elapse = self.text_detector(copy.deepcopy(img)) dt_boxes = sorted_boxes(dt_boxes) - + if return_ocr_result_in_table: + result['boxes'] = [x.tolist() for x in dt_boxes] r_boxes = [] for box in dt_boxes: x_min = box[:, 0].min() - 1 @@ -88,14 +92,17 @@ class TableSystem(object): rec_res, elapse = self.text_recognizer(img_crop_list) logger.debug("rec_res num : {}, elapse : {}".format( len(rec_res), elapse)) - + if return_ocr_result_in_table: + result['rec_res'] = rec_res pred_html, pred = self.rebuild_table(structure_res, dt_boxes, rec_res) - return pred_html + result['html'] = pred_html + return result def rebuild_table(self, structure_res, dt_boxes, rec_res): pred_structures, pred_bboxes = structure_res matched_index = self.match_result(dt_boxes, pred_bboxes) - pred_html, pred = self.get_pred_html(pred_structures, matched_index, rec_res) + pred_html, pred = self.get_pred_html(pred_structures, matched_index, + rec_res) return pred_html, pred def match_result(self, dt_boxes, pred_bboxes): @@ -104,11 +111,13 @@ class TableSystem(object): # gt_box = [np.min(gt_box[:, 0]), np.min(gt_box[:, 1]), np.max(gt_box[:, 0]), np.max(gt_box[:, 1])] distances = [] for j, pred_box in enumerate(pred_bboxes): - distances.append( - (distance(gt_box, pred_box), 1. - compute_iou(gt_box, pred_box))) # 获取两两cell之间的L1距离和 1- IOU + distances.append((distance(gt_box, pred_box), + 1. - compute_iou(gt_box, pred_box) + )) # 获取两两cell之间的L1距离和 1- IOU sorted_distances = distances.copy() # 根据距离和IOU挑选最"近"的cell - sorted_distances = sorted(sorted_distances, key=lambda item: (item[1], item[0])) + sorted_distances = sorted( + sorted_distances, key=lambda item: (item[1], item[0])) if distances.index(sorted_distances[0]) not in matched.keys(): matched[distances.index(sorted_distances[0])] = [i] else: @@ -122,7 +131,8 @@ class TableSystem(object): if '' in tag: if td_index in matched_index.keys(): b_with = False - if '' in ocr_contents[matched_index[td_index][0]] and len(matched_index[td_index]) > 1: + if '' in ocr_contents[matched_index[td_index][ + 0]] and len(matched_index[td_index]) > 1: b_with = True end_html.extend('') for i, td_index_index in enumerate(matched_index[td_index]): @@ -138,7 +148,8 @@ class TableSystem(object): content = content[:-4] if len(content) == 0: continue - if i != len(matched_index[td_index]) - 1 and ' ' != content[-1]: + if i != len(matched_index[ + td_index]) - 1 and ' ' != content[-1]: content += ' ' end_html.extend(content) if b_with: @@ -187,18 +198,19 @@ def main(args): for i, image_file in enumerate(image_file_list): logger.info("[{}/{}] {}".format(i, img_num, image_file)) img, flag = check_and_read_gif(image_file) - excel_path = os.path.join(args.output, os.path.basename(image_file).split('.')[0] + '.xlsx') + excel_path = os.path.join( + args.output, os.path.basename(image_file).split('.')[0] + '.xlsx') if not flag: img = cv2.imread(image_file) if img is None: logger.error("error in loading image:{}".format(image_file)) continue starttime = time.time() - pred_html = text_sys(img) - + pred_res = text_sys(img) + pred_html = pred_res['html'] + logger.info(pred_html) to_excel(pred_html, excel_path) logger.info('excel saved to {}'.format(excel_path)) - logger.info(pred_html) elapse = time.time() - starttime logger.info("Predict time : {:.3f}s".format(elapse)) diff --git a/ppstructure/utility.py 
b/ppstructure/utility.py
index 081a5f6ae3cd4a01bc2d1ba4812f39086e16cfe9..938c12f951730ed1b81186608dd10efb383e8cfc 100644
--- a/ppstructure/utility.py
+++ b/ppstructure/utility.py
@@ -15,7 +15,7 @@
 import ast
 from PIL import Image
 import numpy as np
-from tools.infer.utility import draw_ocr_box_txt, init_args as infer_args
+from tools.infer.utility import draw_ocr_box_txt, str2bool, init_args as infer_args
 
 
 def init_args():
@@ -30,6 +30,7 @@ def init_args():
         "--table_char_dict_path",
         type=str,
         default="../ppocr/utils/dict/table_structure_dict.txt")
+    # params for layout
     parser.add_argument(
         "--layout_path_model",
         type=str,
@@ -39,11 +40,27 @@ def init_args():
         type=ast.literal_eval,
         default=None,
         help='label map according to ppstructure/layout/README_ch.md')
+    # params for inference
     parser.add_argument(
         "--mode",
         type=str,
         default='structure',
         help='structure and vqa is supported')
+    parser.add_argument(
+        "--layout",
+        type=str2bool,
+        default=True,
+        help='Whether to enable layout analysis')
+    parser.add_argument(
+        "--table",
+        type=str2bool,
+        default=True,
+        help='In the forward, whether the table area uses table recognition')
+    parser.add_argument(
+        "--ocr",
+        type=str2bool,
+        default=True,
+        help='In the forward, whether the non-table area is recognized by ocr')
     return parser
diff --git a/requirements.txt b/requirements.txt
index b60d48371337e38bde6e51171aa6ecfb9573fb4d..b15176db3eb42c381c1612f404fd15c6b020b3dc 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -12,3 +12,4 @@ cython
 lxml
 premailer
 openpyxl
+attrdict
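Taken together, the changes above let a caller opt into per-cell OCR details for table regions. A minimal usage sketch of the new `return_ocr_result_in_table` flag, assuming the paddleocr whl built from this change set (the input image path is illustrative):

```python
import cv2
from paddleocr import PPStructure, save_structure_res

# layout/table/ocr all default to True (see the new flags in ppstructure/utility.py)
table_engine = PPStructure(show_log=True)

img = cv2.imread('ppstructure/docs/table/table.jpg')  # illustrative input image
# New optional argument threaded from PPStructure.__call__ into TableSystem.__call__
result = table_engine(img, return_ocr_result_in_table=True)

for region in result:
    if region['type'] == 'Table':
        table_res = region['res']      # now a dict instead of a bare HTML string
        print(table_res['html'])       # reconstructed table structure as HTML
        print(table_res['boxes'])      # per-text detection boxes (plain lists)
        print(table_res['rec_res'])    # per-text (text, confidence) pairs

# save_structure_res reads region['res']['html'] when exporting tables to excel
save_structure_res(result, './output', 'table')
```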