diff --git a/__init__.py b/__init__.py index 15a9aca4da19a981b9e678e7cc93e33cf40fc81c..11436094c163db1b91f5ac38f2936a53017016c1 100644 --- a/__init__.py +++ b/__init__.py @@ -16,5 +16,6 @@ from .paddleocr import * __version__ = paddleocr.VERSION __all__ = [ 'PaddleOCR', 'PPStructure', 'draw_ocr', 'draw_structure_result', - 'save_structure_res', 'download_with_progressbar' + 'save_structure_res', 'download_with_progressbar', 'sorted_layout_boxes', + 'convert_info_docx' ] diff --git a/deploy/hubserving/readme.md b/deploy/hubserving/readme.md index 8144c2e7cefaed6f64763e414101445b2d80b81a..c583cc96ede437a1f65f9b1bddb69e84b7c54852 100755 --- a/deploy/hubserving/readme.md +++ b/deploy/hubserving/readme.md @@ -20,13 +20,14 @@ PaddleOCR提供2种服务部署方式: # 基于PaddleHub Serving的服务部署 -hubserving服务部署目录下包括文本检测、文本方向分类,文本识别、文本检测+文本方向分类+文本识别3阶段串联,表格识别和PP-Structure六种服务包,请根据需求选择相应的服务包进行安装和启动。目录结构如下: +hubserving服务部署目录下包括文本检测、文本方向分类,文本识别、文本检测+文本方向分类+文本识别3阶段串联,版面分析、表格识别和PP-Structure七种服务包,请根据需求选择相应的服务包进行安装和启动。目录结构如下: ``` deploy/hubserving/ └─ ocr_cls 文本方向分类模块服务包 └─ ocr_det 文本检测模块服务包 └─ ocr_rec 文本识别模块服务包 └─ ocr_system 文本检测+文本方向分类+文本识别串联服务包 + └─ structure_layout 版面分析服务包 └─ structure_table 表格识别服务包 └─ structure_system PP-Structure服务包 ``` @@ -41,6 +42,7 @@ deploy/hubserving/ocr_system/ ``` ## 1. 
近期更新 +* 2022.08.23 新增版面分析服务。 * 2022.05.05 新增PP-OCRv3检测和识别模型。 * 2022.03.30 新增PP-Structure和表格识别两种服务。 @@ -59,9 +61,9 @@ pip3 install paddlehub==2.1.0 --upgrade -i https://mirror.baidu.com/pypi/simple 检测模型:./inference/ch_PP-OCRv3_det_infer/ 识别模型:./inference/ch_PP-OCRv3_rec_infer/ 方向分类器:./inference/ch_ppocr_mobile_v2.0_cls_infer/ -版面分析模型:./inference/layout_infer/ +版面分析模型:./inference/picodet_lcnet_x1_0_fgd_layout_infer/ 表格结构识别模型:./inference/ch_ppstructure_mobile_v2.0_SLANet_infer/ -``` +``` **模型路径可在`params.py`中查看和修改。** 更多模型可以从PaddleOCR提供的模型库[PP-OCR](../../doc/doc_ch/models_list.md)和[PP-Structure](../../ppstructure/docs/models_list.md)下载,也可以替换成自己训练转换好的模型。 @@ -87,6 +89,9 @@ hub install deploy/hubserving/structure_table/ # 或,安装PP-Structure服务模块: hub install deploy/hubserving/structure_system/ + +# 或,安装版面分析服务模块: +hub install deploy/hubserving/structure_layout/ ``` * 在Windows环境下(文件夹的分隔符为`\`),安装示例如下: @@ -108,6 +113,9 @@ hub install deploy\hubserving\structure_table\ # 或,安装PP-Structure服务模块: hub install deploy\hubserving\structure_system\ + +# 或,安装版面分析服务模块: +hub install deploy\hubserving\structure_layout\ ``` ### 2.4 启动服务 @@ -118,7 +126,7 @@ $ hub serving start --modules [Module1==Version1, Module2==Version2, ...] \ --port XXXX \ --use_multiprocess \ --workers \ -``` +``` **参数:** @@ -168,7 +176,7 @@ $ hub serving start --modules [Module1==Version1, Module2==Version2, ...] \ ```shell export CUDA_VISIBLE_DEVICES=3 hub serving start -c deploy/hubserving/ocr_system/config.json -``` +``` ## 3. 
发送预测请求 配置好服务端,可使用以下命令发送预测请求,获取预测结果: @@ -185,6 +193,7 @@ hub serving start -c deploy/hubserving/ocr_system/config.json `http://127.0.0.1:8868/predict/ocr_system` `http://127.0.0.1:8869/predict/structure_table` `http://127.0.0.1:8870/predict/structure_system` +`http://127.0.0.1:8871/predict/structure_layout` - **image_dir**:测试图像路径,可以是单张图片路径,也可以是图像集合目录路径 - **visualize**:是否可视化结果,默认为False - **output**:可视化结果保存路径,默认为`./hubserving_result` @@ -203,17 +212,19 @@ hub serving start -c deploy/hubserving/ocr_system/config.json |text_region|list|文本位置坐标| |html|str|表格的html字符串| |regions|list|版面分析+表格识别+OCR的结果,每一项为一个list,包含表示区域坐标的`bbox`,区域类型的`type`和区域结果的`res`三个字段| +|layout|list|版面分析的结果,每一项为一个dict,包含版面区域坐标的`bbox`,区域类型的`label`| 不同模块返回的字段不同,如,文本识别服务模块返回结果不含`text_region`字段,具体信息如下: -| 字段名/模块名 | ocr_det | ocr_cls | ocr_rec | ocr_system | structure_table | structure_system | -| --- | --- | --- | --- | --- | --- |--- | -|angle| | ✔ | | ✔ | || -|text| | |✔|✔| | ✔ | -|confidence| |✔ |✔| | | ✔| -|text_region| ✔| | |✔ | | ✔| -|html| | | | |✔ |✔| -|regions| | | | |✔ |✔ | +| 字段名/模块名 | ocr_det | ocr_cls | ocr_rec | ocr_system | structure_table | structure_system | structure_layout | +| --- | --- | --- | --- | --- | --- | --- | --- | +|angle| | ✔ | | ✔ | ||| +|text| | |✔|✔| | ✔ | | +|confidence| |✔ |✔| | | ✔| | +|text_region| ✔| | |✔ | | ✔| | +|html| | | | |✔ |✔|| +|regions| | | | |✔ |✔ | | +|layout| | | | | | | ✔ | **说明:** 如果需要增加、删除、修改返回字段,可在相应模块的`module.py`文件中进行修改,完整流程参考下一节自定义修改服务模块。 diff --git a/deploy/hubserving/readme_en.md b/deploy/hubserving/readme_en.md index 06eaaebacb51744844473c0ffe8b189dc545492c..f09fe46417c7567305e5ce05a14be74d33450c31 100755 --- a/deploy/hubserving/readme_en.md +++ b/deploy/hubserving/readme_en.md @@ -20,13 +20,14 @@ PaddleOCR provides 2 service deployment methods: # Service deployment based on PaddleHub Serving -The hubserving service deployment directory includes six service packages: text detection, text angle class, text recognition, text detection+text angle 
class+text recognition three-stage series connection, table recognition and PP-Structure. Please select the corresponding service package to install and start service according to your needs. The directory is as follows: +The hubserving service deployment directory includes seven service packages: text detection, text angle class, text recognition, text detection+text angle class+text recognition three-stage series connection, layout analysis, table recognition and PP-Structure. Please select the corresponding service package to install and start service according to your needs. The directory is as follows: ``` deploy/hubserving/ └─ ocr_det text detection module service package └─ ocr_cls text angle class module service package └─ ocr_rec text recognition module service package └─ ocr_system text detection+text angle class+text recognition three-stage series connection service package + └─ structure_layout layout analysis service package └─ structure_table table recognition service package └─ structure_system PP-Structure service package ``` @@ -43,6 +44,7 @@ deploy/hubserving/ocr_system/ * 2022.05.05 add PP-OCRv3 text detection and recognition models. * 2022.03.30 add PP-Structure and table recognition services。 +* 2022.08.23 add layout analysis services。 ## 2. 
Quick start service @@ -61,7 +63,7 @@ Before installing the service module, you need to prepare the inference model an text detection model: ./inference/ch_PP-OCRv3_det_infer/ text recognition model: ./inference/ch_PP-OCRv3_rec_infer/ text angle classifier: ./inference/ch_ppocr_mobile_v2.0_cls_infer/ -layout parse model: ./inference/layout_infer/ +layout parse model: ./inference/picodet_lcnet_x1_0_fgd_layout_infer/ tanle recognition: ./inference/ch_ppstructure_mobile_v2.0_SLANet_infer/ ``` @@ -89,6 +91,9 @@ hub install deploy/hubserving/structure_table/ # Or install PP-Structure service module hub install deploy/hubserving/structure_system/ + +# Or install layout analysis service module +hub install deploy/hubserving/structure_layout/ ``` * On Windows platform, the examples are as follows. @@ -110,6 +115,9 @@ hub install deploy/hubserving/structure_table/ # Or install PP-Structure service module hub install deploy\hubserving\structure_system\ + +# Or install layout analysis service module +hub install deploy\hubserving\structure_layout\ ``` ### 2.4 Start service @@ -190,8 +198,9 @@ For example, if using the configuration file to start the text angle classificat `http://127.0.0.1:8866/predict/ocr_cls` `http://127.0.0.1:8867/predict/ocr_rec` `http://127.0.0.1:8868/predict/ocr_system` -`http://127.0.0.1:8869/predict/structure_table` +`http://127.0.0.1:8869/predict/structure_table` `http://127.0.0.1:8870/predict/structure_system` +`http://127.0.0.1:8871/predict/structure_layout` - **image_dir**:Test image path, can be a single image path or an image directory path - **visualize**:Whether to visualize the results, the default value is False - **output**:The floder to save Visualization result, default value is `./hubserving_result` @@ -212,17 +221,19 @@ The returned result is a list. Each item in the list is a dict. 
The dict may con |text_region|list|text location coordinates| |html|str|table html str| |regions|list|The result of layout analysis + table recognition + OCR, each item is a list, including `bbox` indicating area coordinates, `type` of area type and `res` of area results| +|layout|list|The result of layout analysis, each item is a dict, including `bbox` indicating area coordinates, `label` of area type| The fields returned by different modules are different. For example, the results returned by the text recognition service module do not contain `text_region`. The details are as follows: -| field name/module name | ocr_det | ocr_cls | ocr_rec | ocr_system | structure_table | structure_system | -| --- | --- | --- | --- | --- | --- |--- | -|angle| | ✔ | | ✔ | || -|text| | |✔|✔| | ✔ | -|confidence| |✔ |✔| | | ✔| -|text_region| ✔| | |✔ | | ✔| -|html| | | | |✔ |✔| -|regions| | | | |✔ |✔ | +| field name/module name | ocr_det | ocr_cls | ocr_rec | ocr_system | structure_table | structure_system | structure_layout | +| --- | --- | --- | --- | --- | --- |--- |--- | +|angle| | ✔ | | ✔ | || | +|text| | |✔|✔| | ✔ | | +|confidence| |✔ |✔| | | ✔| | +|text_region| ✔| | |✔ | | ✔| | +|html| | | | |✔ |✔| | +|regions| | | | |✔ |✔ | | +|layout| | | | | | |✔ | **Note:** If you need to add, delete or modify the returned fields, you can modify the file `module.py` of the corresponding module. For the complete process, refer to the user-defined modification service module in the next section. diff --git a/deploy/hubserving/structure_layout/__init__.py b/deploy/hubserving/structure_layout/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c747d3e7aeca842933e083dffc01ef1fba3f4e85 --- /dev/null +++ b/deploy/hubserving/structure_layout/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. \ No newline at end of file diff --git a/deploy/hubserving/structure_layout/config.json b/deploy/hubserving/structure_layout/config.json new file mode 100644 index 0000000000000000000000000000000000000000..bc52c1ab603d5659f90a5ed8a72cdb06638fb9e5 --- /dev/null +++ b/deploy/hubserving/structure_layout/config.json @@ -0,0 +1,16 @@ +{ + "modules_info": { + "structure_layout": { + "init_args": { + "version": "1.0.0", + "use_gpu": true + }, + "predict_args": { + } + } + }, + "port": 8871, + "use_multiprocess": false, + "workers": 2 +} + diff --git a/deploy/hubserving/structure_layout/module.py b/deploy/hubserving/structure_layout/module.py new file mode 100644 index 0000000000000000000000000000000000000000..7091f123fc0039e4886d8763096952d7c445184c --- /dev/null +++ b/deploy/hubserving/structure_layout/module.py @@ -0,0 +1,143 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
@moduleinfo(
    name="structure_layout",
    version="1.0.0",
    summary="PP-Structure layout service",
    author="paddle-dev",
    author_email="paddle-dev@baidu.com",
    type="cv/structure_layout")
class LayoutPredictor(hub.Module):
    """PaddleHub serving module that runs PP-Structure layout analysis."""

    def _initialize(self, use_gpu=False, enable_mkldnn=False):
        """
        Build the underlying layout predictor.

        Args:
            use_gpu (bool): run inference on GPU. Requires the environment
                variable CUDA_VISIBLE_DEVICES to be set and to start with a
                digit.
            enable_mkldnn (bool): forwarded to the predictor configuration.
        Raises:
            RuntimeError: if use_gpu is True and CUDA_VISIBLE_DEVICES is
                missing, empty, or does not start with a digit.
        """
        cfg = self.merge_configs()
        cfg.use_gpu = use_gpu
        if use_gpu:
            # Only the environment lookup and the digit check can fail here,
            # so only those are guarded; a bare `except:` would also swallow
            # KeyboardInterrupt / SystemExit.
            try:
                _places = os.environ["CUDA_VISIBLE_DEVICES"]
                int(_places[0])
            except (KeyError, IndexError, ValueError):
                raise RuntimeError(
                    "Environment Variable CUDA_VISIBLE_DEVICES is not set correctly. If you wanna use gpu, please set CUDA_VISIBLE_DEVICES via export CUDA_VISIBLE_DEVICES=cuda_device_id."
                )
            print("use gpu: ", use_gpu)
            print("CUDA_VISIBLE_DEVICES: ", _places)
            cfg.gpu_mem = 8000
        cfg.ir_optim = True
        cfg.enable_mkldnn = enable_mkldnn

        self.layout_predictor = _LayoutPredictor(cfg)

    def merge_configs(self):
        """
        Return the default command-line config overridden by params.py.

        parse_args() reads sys.argv, so the serving process's own arguments
        are hidden while it runs and restored afterwards, even on failure.
        """
        # default cfg
        backup_argv = copy.deepcopy(sys.argv)
        sys.argv = sys.argv[:1]
        try:
            cfg = parse_args()
            for key, value in vars(read_params()).items():
                setattr(cfg, key, value)
        finally:
            sys.argv = backup_argv
        return cfg

    def read_images(self, paths=None):
        """
        Load images from disk with cv2.imread.

        Args:
            paths (list[str]): image file paths. Every path must be an
                existing file.
        Returns:
            images (list): the decoded images; files that cv2 cannot decode
                are logged and skipped.
        """
        images = []
        for img_path in (paths or []):
            assert os.path.isfile(
                img_path), "The {} isn't a valid file.".format(img_path)
            img = cv2.imread(img_path)
            if img is None:
                logger.info("error in loading image:{}".format(img_path))
                continue
            images.append(img)
        return images

    def predict(self, images=None, paths=None):
        """
        Get the layout regions of the given images.
        Args:
            images (list(numpy.ndarray)): images data, shape of each is [H, W, C]. Use either images or paths, not both.
            paths (list[str]): The paths of images. Use either paths or images, not both.
        Returns:
            res (list): one entry per input image. Each entry is a dict
                {'layout': regions} whose region `bbox` values have been
                converted with `.tolist()`; an input that is None yields an
                empty list instead.
        Raises:
            TypeError: if neither or both of images / paths are supplied.
        """
        # None stands in for the former `[]` defaults so that no list object
        # is shared between calls.
        images = [] if images is None else images
        paths = [] if paths is None else paths

        if images != [] and isinstance(images, list) and paths == []:
            predicted_data = images
        elif images == [] and isinstance(paths, list) and paths != []:
            predicted_data = self.read_images(paths)
        else:
            raise TypeError("The input data is inconsistent with expectations.")

        assert predicted_data != [], "There is not any image to be predicted. Please check the input data."

        all_results = []
        for img in predicted_data:
            if img is None:
                logger.info("error in loading image")
                all_results.append([])
                continue
            starttime = time.time()
            res, _ = self.layout_predictor(img)
            elapse = time.time() - starttime
            logger.info("Predict time: {}".format(elapse))

            # bbox is presumably a numpy array; .tolist() makes the result
            # JSON-serializable for the HTTP response -- TODO confirm.
            for item in res:
                item['bbox'] = item['bbox'].tolist()
            all_results.append({'layout': res})
        return all_results

    @serving
    def serving_method(self, images, **kwargs):
        """
        Run as a service: decode the base64 images and call predict().
        """
        images_decode = [base64_to_cv2(image) for image in images]
        results = self.predict(images_decode, **kwargs)
        return results


if __name__ == '__main__':
    layout = LayoutPredictor()
    layout._initialize()
    image_path = ['./ppstructure/docs/table/1.png']
    res = layout.predict(paths=image_path)
    print(res)
class Config(object):
    """Plain attribute container for the layout service parameters."""


def read_params():
    """
    Return a Config carrying the layout-analysis settings of this service:
    the inference model directory, the label dictionary path, and the score
    and NMS thresholds.
    """
    # params for layout analysis
    layout_settings = (
        ('layout_model_dir',
         './inference/picodet_lcnet_x1_0_fgd_layout_infer/'),
        ('layout_dict_path',
         './ppocr/utils/dict/layout_dict/layout_publaynet_dict.txt'),
        ('layout_score_threshold', 0.5),
        ('layout_nms_threshold', 0.5),
    )

    params = Config()
    for attr_name, attr_value in layout_settings:
        setattr(params, attr_name, attr_value)
    return params
PPStructure(StructureSystem): if isinstance(img, np.ndarray) and len(img.shape) == 2: img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) - res, _ = super().__call__(img, return_ocr_result_in_table) + res, _ = super().__call__( + img, return_ocr_result_in_table, img_idx=img_idx) return res @@ -631,10 +638,54 @@ def main(): for line in result: logger.info(line) elif args.type == 'structure': - result = engine(img_path) - save_structure_res(result, args.output, img_name) - - for item in result: + img, flag_gif, flag_pdf = check_and_read(img_path) + if not flag_gif and not flag_pdf: + img = cv2.imread(img_path) + + if not flag_pdf: + if img is None: + logger.error("error in loading image:{}".format(image_file)) + continue + img_paths = [[img_path, img]] + else: + img_paths = [] + for index, pdf_img in enumerate(img): + os.makedirs( + os.path.join(args.output, img_name), exist_ok=True) + pdf_img_path = os.path.join( + args.output, img_name, + img_name + '_' + str(index) + '.jpg') + cv2.imwrite(pdf_img_path, pdf_img) + img_paths.append([pdf_img_path, pdf_img]) + + all_res = [] + for index, (new_img_path, img) in enumerate(img_paths): + logger.info('processing {}/{} page:'.format(index + 1, + len(img_paths))) + new_img_name = os.path.basename(new_img_path).split('.')[0] + result = engine(new_img_path, img_idx=index) + save_structure_res(result, args.output, img_name, index) + + if args.recovery and result != []: + from copy import deepcopy + from ppstructure.recovery.recovery_to_doc import sorted_layout_boxes + h, w, _ = img.shape + result_cp = deepcopy(result) + result_sorted = sorted_layout_boxes(result_cp, w) + all_res += result_sorted + + if args.recovery and all_res != []: + try: + from ppstructure.recovery.recovery_to_doc import convert_info_docx + convert_info_docx(img, all_res, args.output, img_name, + args.save_pdf) + except Exception as ex: + logger.error( + "error in layout recovery image:{}, err msg: {}".format( + img_name, ex)) + continue + + for item in all_res: 
item.pop('img') item.pop('res') logger.info(item) diff --git a/ppstructure/docs/quickstart.md b/ppstructure/docs/quickstart.md index f19ee2591aba955ff09b2404d3ca85c80b75d781..b9367cab327a2f6232e34431c12532db03c75389 100644 --- a/ppstructure/docs/quickstart.md +++ b/ppstructure/docs/quickstart.md @@ -51,10 +51,14 @@ pip3 install "paddleocr>=2.6" pip3 install paddleclas # 安装 关键信息抽取 依赖包(如不需要KIE功能,可跳过) -pip3 install -r kie/requirements.txt +pip3 install -r ppstructure/kie/requirements.txt + +# 安装 版面恢复 依赖包(如不需要版面恢复功能,可跳过) +pip3 install -r ppstructure/recovery/requirements.txt ``` + ## 2. 便捷使用 @@ -94,7 +98,12 @@ paddleocr --image_dir=ppstructure/docs/table/table.jpg --type=structure --layout #### 2.1.6 版面恢复 ```bash +# 中文测试图 paddleocr --image_dir=ppstructure/docs/table/1.png --type=structure --recovery=true +# 英文测试图 +paddleocr --image_dir=ppstructure/docs/table/1.png --type=structure --recovery=true --lang='en' +# pdf测试文件 +paddleocr --image_dir=ppstructure/docs/recovery/UnrealText.pdf --type=structure --recovery=true --lang='en' ``` @@ -215,9 +224,12 @@ for line in result: import os import cv2 from paddleocr import PPStructure,save_structure_res -from paddelocr.ppstructure.recovery.recovery_to_doc import sorted_layout_boxes, convert_info_docx +from paddleocr.ppstructure.recovery.recovery_to_doc import sorted_layout_boxes, convert_info_docx -table_engine = PPStructure(layout=False, show_log=True) +# 中文测试图 +table_engine = PPStructure(recovery=True) +# 英文测试图 +# table_engine = PPStructure(recovery=True, lang='en') save_folder = './output' img_path = 'ppstructure/docs/table/1.png' @@ -230,8 +242,8 @@ for line in result: print(line) h, w, _ = img.shape -res = sorted_layout_boxes(res, w) -convert_info_docx(img, result, save_folder, os.path.basename(img_path).split('.')[0]) +res = sorted_layout_boxes(result, w) +convert_info_docx(img, res, save_folder, os.path.basename(img_path).split('.')[0]) ``` @@ -303,4 +315,4 @@ dict 里各个字段说明如下: ## 3. 
小结 -通过本节内容,相信您已经熟练掌握通过PaddleOCR whl包调用PP-Structure相关功能的使用方法,您可以参考[文档教程](../../README_ch.md#文档教程),获取包括模型训练、推理部署等更详细的使用教程。 \ No newline at end of file +通过本节内容,相信您已经熟练掌握通过PaddleOCR whl包调用PP-Structure相关功能的使用方法,您可以参考[文档教程](../../README_ch.md#文档教程),获取包括模型训练、推理部署等更详细的使用教程。 diff --git a/ppstructure/docs/quickstart_en.md b/ppstructure/docs/quickstart_en.md index f0fbc86394dab00f1715f8f8fda30f3116c4fd07..b1df40b267a82fd48853edf607acd43f3a5431c9 100644 --- a/ppstructure/docs/quickstart_en.md +++ b/ppstructure/docs/quickstart_en.md @@ -54,6 +54,9 @@ pip3 install paddleclas # Install the KIE dependency packages (if you do not use the KIE, you can skip it) pip3 install -r kie/requirements.txt + +# Install the layout recovery dependency packages (if you do not use the layout recovery, you can skip it) +pip3 install -r recovery/requirements.txt ``` @@ -88,14 +91,15 @@ paddleocr --image_dir=ppstructure/docs/table/table.jpg --type=structure --layout ``` + #### 2.1.5 Key Information Extraction Key information extraction does not currently support use by the whl package. For detailed usage tutorials, please refer to: [Key Information Extraction](../kie/README.md). #### 2.1.6 layout recovery -```bash -paddleocr --image_dir=ppstructure/docs/table/1.png --type=structure --recovery=true +``` +paddleocr --image_dir=ppstructure/docs/table/1.png --type=structure --recovery=true --lang='en' ``` @@ -213,9 +217,12 @@ Key information extraction does not currently support use by the whl package. 
Fo import os import cv2 from paddleocr import PPStructure,save_structure_res -from paddelocr.ppstructure.recovery.recovery_to_doc import sorted_layout_boxes, convert_info_docx +from paddleocr.ppstructure.recovery.recovery_to_doc import sorted_layout_boxes, convert_info_docx -table_engine = PPStructure(layout=False, show_log=True) +# Chinese image +table_engine = PPStructure(recovery=True) +# English image +# table_engine = PPStructure(recovery=True, lang='en') save_folder = './output' img_path = 'ppstructure/docs/table/1.png' @@ -228,8 +235,8 @@ for line in result: print(line) h, w, _ = img.shape -res = sorted_layout_boxes(res, w) -convert_info_docx(img, result, save_folder, os.path.basename(img_path).split('.')[0]) +res = sorted_layout_boxes(result, w) +convert_info_docx(img, res, save_folder, os.path.basename(img_path).split('.')[0]) ``` @@ -301,4 +308,4 @@ Most of the parameters are consistent with the PaddleOCR whl package, see [whl p ## 3. Summary -Through the content in this section, you can master the use of PP-Structure related functions through PaddleOCR whl package. Please refer to [documentation tutorial](../../README.md) for more detailed usage tutorials including model training, inference and deployment, etc. \ No newline at end of file +Through the content in this section, you can master the use of PP-Structure related functions through PaddleOCR whl package. Please refer to [documentation tutorial](../../README.md) for more detailed usage tutorials including model training, inference and deployment, etc. 
diff --git a/ppstructure/docs/recovery/UnrealText.pdf b/ppstructure/docs/recovery/UnrealText.pdf new file mode 100644 index 0000000000000000000000000000000000000000..0b5cf961af4ebf09cb96fc3f09fb9c19abec68f1 Binary files /dev/null and b/ppstructure/docs/recovery/UnrealText.pdf differ diff --git a/ppstructure/docs/recovery/recovery_ch.jpg b/ppstructure/docs/recovery/recovery_ch.jpg new file mode 100644 index 0000000000000000000000000000000000000000..df5a5063f036053673041b92a01f288b3e1d246b Binary files /dev/null and b/ppstructure/docs/recovery/recovery_ch.jpg differ diff --git a/ppstructure/layout/README_ch.md b/ppstructure/layout/README_ch.md index d5598fc1a896ea4cfcc94619e1744b9b7ec288b3..f8d1978e25d7fb17cfd3fcb363b4ce981e19c8dc 100644 --- a/ppstructure/layout/README_ch.md +++ b/ppstructure/layout/README_ch.md @@ -160,11 +160,13 @@ json文件包含所有图像的标注,数据以字典嵌套的方式存放, ``` mkdir pretrained_model cd pretrained_model -# 下载PubLayNet预训练模型 -wget https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_layout.pdparams +# 下载PubLayNet预训练模型(直接体验模型评估、预测、动转静) +wget https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout.pdparams +# 下载PubLayNet推理模型(直接体验模型推理) +wget https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_infer.tar ``` -下载更多[版面分析模型](../docs/models_list.md)(中文CDLA数据集预训练模型、表格预训练模型) +如果测试图片为中文,可以下载中文CDLA数据集的预训练模型,识别10类文档区域:Text、Title、Figure、Figure caption、Table、Table caption、Header、Footer、Reference、Equation,在[版面分析模型](../docs/models_list.md)中下载`picodet_lcnet_x1_0_fgd_layout_cdla`模型的训练模型和推理模型。如果只检测图片中的表格区域,可以下载表格数据集的预训练模型,在[版面分析模型](../docs/models_list.md)中下载`picodet_lcnet_x1_0_fgd_layout_table`模型的训练模型和推理模型。 ### 4.1. 
启动训练 @@ -216,14 +218,14 @@ TestDataset: # 单卡训练 export CUDA_VISIBLE_DEVICES=0 python3 tools/train.py \ - -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ - --eval + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + --eval # 多卡训练,通过--gpus参数指定卡号 export CUDA_VISIBLE_DEVICES=0,1,2,3 python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py \ - -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ - --eval + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + --eval ``` **注意:**如果训练时显存out memory,将TrainReader中batch_size调小,同时LearningRate中base_lr等比例减小。发布的config均由8卡训练得到,如果改变GPU卡数为1,那么base_lr需要减小8倍。 @@ -252,9 +254,9 @@ PaddleDetection支持了基于FGD([Focal and Global Knowledge Distillation for D # 单卡训练 export CUDA_VISIBLE_DEVICES=0 python3 tools/train.py \ - -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ - --slim_config configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x2_5_layout.yml \ - --eval + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + --slim_config configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x2_5_layout.yml \ + --eval ``` - `-c`: 指定模型配置文件。 @@ -269,8 +271,8 @@ python3 tools/train.py \ ```bash # GPU 评估, weights 为待测权重 python3 tools/eval.py \ - -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ - -o weights=./output/picodet_lcnet_x1_0_layout/best_model + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + -o weights=./output/picodet_lcnet_x1_0_layout/best_model ``` 会输出以下信息,打印出mAP、AP0.5等信息。 @@ -292,13 +294,13 @@ python3 tools/eval.py \ [08/15 07:07:09] ppdet.engine INFO: Best test bbox ap is 0.935. 
``` -使用FGD蒸馏模型进行评估: +若使用**提供的预训练模型进行评估**,或使用**FGD蒸馏训练的模型**,更换`weights`模型路径,执行如下命令进行评估: ``` python3 tools/eval.py \ - -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ - --slim_config configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x2_5_layout.yml \ - -o weights=output/picodet_lcnet_x2_5_layout/best_model + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + --slim_config configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x2_5_layout.yml \ + -o weights=output/picodet_lcnet_x2_5_layout/best_model ``` - `-c`: 指定模型配置文件。 @@ -325,18 +327,16 @@ python3 tools/infer.py \ - `--output_dir`: 指定可视化结果保存路径。 - `--draw_threshold`:指定绘制结果框的NMS阈值。 -预测图片如下所示,图片会存储在`output_dir`路径中。 - -使用FGD蒸馏模型进行测试: +若使用**提供的预训练模型进行预测**,或使用**FGD蒸馏训练的模型**,更换`weights`模型路径,执行如下命令进行预测: ``` python3 tools/infer.py \ - -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ - --slim_config configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x2_5_layout.yml \ - -o weights='output/picodet_lcnet_x2_5_layout/best_model.pdparams' \ - --infer_img='docs/images/layout.jpg' \ - --output_dir=output_dir/ \ - --draw_threshold=0.5 + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + --slim_config configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x2_5_layout.yml \ + -o weights='output/picodet_lcnet_x2_5_layout/best_model.pdparams' \ + --infer_img='docs/images/layout.jpg' \ + --output_dir=output_dir/ \ + --draw_threshold=0.5 ``` @@ -351,9 +351,9 @@ inference 模型(`paddle.jit.save`保存的模型) 一般是模型训练, ```bash python3 tools/export_model.py \ - -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ - -o weights=output/picodet_lcnet_x1_0_layout/best_model \ - --output_dir=output_inference/ + -c 
configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + -o weights=output/picodet_lcnet_x1_0_layout/best_model \ + --output_dir=output_inference/ ``` * 如无需导出后处理,请指定:`-o export.benchmark=True`(如果-o已出现过,此处删掉-o) @@ -368,27 +368,27 @@ output_inference/picodet_lcnet_x1_0_layout/ └── model.pdmodel # inference模型的模型结构文件 ``` -FGD蒸馏模型转inference模型步骤如下: +若使用**提供的预训练模型转Inference模型**,或使用**FGD蒸馏训练的模型**,更换`weights`模型路径,模型转inference模型步骤如下: ```bash python3 tools/export_model.py \ - -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ - --slim_config configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x2_5_layout.yml \ - -o weights=./output/picodet_lcnet_x2_5_layout/best_model \ - --output_dir=output_inference/ + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + --slim_config configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x2_5_layout.yml \ + -o weights=./output/picodet_lcnet_x2_5_layout/best_model \ + --output_dir=output_inference/ ``` ### 6.2 模型推理 -版面恢复任务进行推理,可以执行如下命令: +若使用**提供的推理训练模型推理**,或使用**FGD蒸馏训练的模型**,更换`model_dir`推理模型路径,执行如下命令进行推理: ```bash python3 deploy/python/infer.py \ - --model_dir=output_inference/picodet_lcnet_x1_0_layout/ \ - --image_file=docs/images/layout.jpg \ - --device=CPU + --model_dir=output_inference/picodet_lcnet_x1_0_layout/ \ + --image_file=docs/images/layout.jpg \ + --device=CPU ``` - --device:指定GPU、CPU设备 diff --git a/ppstructure/predict_system.py b/ppstructure/predict_system.py index d63ab3b3daf018af7d0872e42bd14b8823d193ae..71147d3af8ec666d368234270dcb0d16aaf91938 100644 --- a/ppstructure/predict_system.py +++ b/ppstructure/predict_system.py @@ -77,7 +77,7 @@ class StructureSystem(object): elif self.mode == 'kie': raise NotImplementedError - def __call__(self, img, img_idx=0, return_ocr_result_in_table=False): + def __call__(self, img, return_ocr_result_in_table=False, img_idx=0): time_dict = { 
'image_orientation': 0, 'layout': 0, @@ -227,65 +227,39 @@ def main(args): if img is None: logger.error("error in loading image:{}".format(image_file)) continue - res, time_dict = structure_sys(img) + imgs = [img] + else: + imgs = img - if structure_sys.mode == 'structure': - save_structure_res(res, save_folder, img_name) + all_res = [] + for index, img in enumerate(imgs): + res, time_dict = structure_sys(img, img_idx=index) + if structure_sys.mode == 'structure' and res != []: + save_structure_res(res, save_folder, img_name, index) draw_img = draw_structure_result(img, res, args.vis_font_path) - img_save_path = os.path.join(save_folder, img_name, 'show.jpg') + img_save_path = os.path.join(save_folder, img_name, + 'show_{}.jpg'.format(index)) elif structure_sys.mode == 'kie': raise NotImplementedError # draw_img = draw_ser_results(img, res, args.vis_font_path) # img_save_path = os.path.join(save_folder, img_name + '.jpg') - cv2.imwrite(img_save_path, draw_img) - logger.info('result save to {}'.format(img_save_path)) - if args.recovery: - try: - from ppstructure.recovery.recovery_to_doc import sorted_layout_boxes, convert_info_docx - h, w, _ = img.shape - res = sorted_layout_boxes(res, w) - convert_info_docx(img, res, save_folder, img_name, - args.save_pdf) - except Exception as ex: - logger.error( - "error in layout recovery image:{}, err msg: {}".format( - image_file, ex)) - continue - else: - pdf_imgs = img - all_res = [] - for index, img in enumerate(pdf_imgs): - - res, time_dict = structure_sys(img, index) - if structure_sys.mode == 'structure' and res != []: - save_structure_res(res, save_folder, img_name, index) - draw_img = draw_structure_result(img, res, - args.vis_font_path) - img_save_path = os.path.join(save_folder, img_name, - 'show_{}.jpg'.format(index)) - elif structure_sys.mode == 'kie': - raise NotImplementedError - # draw_img = draw_ser_results(img, res, args.vis_font_path) - # img_save_path = os.path.join(save_folder, img_name + '.jpg') - if res 
!= []: - cv2.imwrite(img_save_path, draw_img) - logger.info('result save to {}'.format(img_save_path)) - if args.recovery and res != []: - from ppstructure.recovery.recovery_to_doc import sorted_layout_boxes, convert_info_docx - h, w, _ = img.shape - res = sorted_layout_boxes(res, w) - all_res += res - - if args.recovery and all_res != []: - try: - convert_info_docx(img, all_res, save_folder, img_name, - args.save_pdf) - except Exception as ex: - logger.error( - "error in layout recovery image:{}, err msg: {}".format( - image_file, ex)) - continue + if res != []: + cv2.imwrite(img_save_path, draw_img) + logger.info('result save to {}'.format(img_save_path)) + if args.recovery and res != []: + from ppstructure.recovery.recovery_to_doc import sorted_layout_boxes, convert_info_docx + h, w, _ = img.shape + res = sorted_layout_boxes(res, w) + all_res += res + if args.recovery and all_res != []: + try: + convert_info_docx(img, all_res, save_folder, img_name, + args.save_pdf) + except Exception as ex: + logger.error("error in layout recovery image:{}, err msg: {}". + format(image_file, ex)) + continue logger.info("Predict time : {:.3f}s".format(time_dict['all'])) diff --git a/ppstructure/recovery/README.md b/ppstructure/recovery/README.md index 90a6a2c3c4189dc885d698e4cac2d1a24a49d1df..59aef707dd67799bb46dc18dc58f883c502c8b86 100644 --- a/ppstructure/recovery/README.md +++ b/ppstructure/recovery/README.md @@ -8,6 +8,7 @@ English | [简体中文](README_ch.md) - [3. Quick Start](#3) - [3.1 Download models](#3.1) - [3.2 Layout recovery](#3.2) + - [4. More](#4) @@ -15,13 +16,16 @@ English | [简体中文](README_ch.md) Layout recovery means that after OCR recognition, the content is still arranged like the original document pictures, and the paragraphs are output to word document in the same order. -Layout recovery combines [layout analysis](../layout/README.md)、[table recognition](../table/README.md) to better recover images, tables, titles, etc. 
-The following figure shows the result: +Layout recovery combines [layout analysis](../layout/README.md) and [table recognition](../table/README.md) to better recover images, tables, titles, etc., and supports input files in PDF and document image formats in Chinese and English. The following figure shows the effect of restoring the layout of English and Chinese documents: