diff --git a/README.md b/README.md index f57672e5055df042ede9ae03bbed590889c5941c..75828c3589a78e33a8c4feb15a771c115a33e5e7 100644 --- a/README.md +++ b/README.md @@ -26,17 +26,19 @@ PaddleOCR aims to create multilingual, awesome, leading, and practical OCR tools

## Recent updates
+- **🔥2022.8.24 Release PaddleOCR [release/2.6](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.6)**
+  - Release [PP-Structurev2](./ppstructure/), with functions and performance fully upgraded, adapted to Chinese scenes, and new support for [Layout Recovery](./ppstructure/recovery) and a **one-line command to convert PDF to Word**;
+  - [Layout Analysis](./ppstructure/layout) optimization: model storage reduced by 95% while speed increased by 11 times, and the average CPU time cost is only 41ms;
+  - [Table Recognition](./ppstructure/table) optimization: 3 optimization strategies are designed, and the model accuracy is improved by 6% under comparable time consumption;
+  - [Key Information Extraction](./ppstructure/kie) optimization: a visual-independent model structure is designed, the accuracy of semantic entity recognition is increased by 2.8%, and the accuracy of relation extraction is increased by 9.1%.
+
+- **🔥2022.7 Release [OCR scene application collection](./applications/README_en.md)**
+  - Release **9 vertical models** such as digital tube, LCD screen, license plate, handwriting recognition model, high-precision SVTR model, etc., covering the main OCR vertical applications in general, manufacturing, finance, and transportation industries.
+
- **🔥2022.5.9 Release PaddleOCR [release/2.5](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.5)**
  - Release [PP-OCRv3](./doc/doc_en/ppocr_introduction_en.md#pp-ocrv3): With comparable speed, the effect of Chinese scene is further improved by 5% compared with PP-OCRv2, the effect of English scene is improved by 11%, and the average recognition accuracy of 80 language multilingual models is improved by more than 5%.
  - Release [PPOCRLabelv2](./PPOCRLabel): Add the annotation function for table recognition task, key information extraction task and irregular text image.
  - Release interactive e-book [*"Dive into OCR"*](./doc/doc_en/ocr_book_en.md), covers the cutting-edge theory and code practice of OCR full stack technology.
-- 2021.12.21 Release PaddleOCR [release/2.4](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.4)
-  - Release 1 text detection algorithm (PSENet), 3 text recognition algorithms (NRTR、SEED、SAR).
-  - Release 1 key information extraction algorithm (SDMGR, [tutorial](./ppstructure/docs/kie_en.md)) and 3 [DocVQA](./ppstructure/vqa) algorithms (LayoutLM, LayoutLMv2, LayoutXLM).
-- 2021.9.7 Release PaddleOCR [release/2.3](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.3)
-  - Release [PP-OCRv2](./doc/doc_en/ppocr_introduction_en.md#pp-ocrv2). The inference speed of PP-OCRv2 is 220% higher than that of PP-OCR server in CPU device. The F-score of PP-OCRv2 is 7% higher than that of PP-OCR mobile.
-- 2021.8.3 Release PaddleOCR [release/2.2](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.2)
-  - Release a new structured documents analysis toolkit, i.e., [PP-Structure](./ppstructure/README.md), support layout analysis and table recognition (One-key to export chart images to Excel files).
- [more](./doc/doc_en/update_en.md) @@ -45,7 +47,9 @@ PaddleOCR aims to create multilingual, awesome, leading, and practical OCR tools PaddleOCR support a variety of cutting-edge algorithms related to OCR, and developed industrial featured models/solution [PP-OCR](./doc/doc_en/ppocr_introduction_en.md) and [PP-Structure](./ppstructure/README.md) on this basis, and get through the whole process of data production, model training, compression, inference and deployment. -![](./doc/features_en.png) +
+ +
> It is recommended to start with the “quick experience” in the document tutorial @@ -113,18 +117,19 @@ PaddleOCR support a variety of cutting-edge algorithms related to OCR, and devel - [Quick Start](./ppstructure/docs/quickstart_en.md) - [Model Zoo](./ppstructure/docs/models_list_en.md) - [Model training](./doc/doc_en/training_en.md) - - [Layout Parser](./ppstructure/layout/README.md) + - [Layout Analysis](./ppstructure/layout/README.md) - [Table Recognition](./ppstructure/table/README.md) - - [DocVQA](./ppstructure/vqa/README.md) - - [Key Information Extraction](./ppstructure/docs/kie_en.md) + - [Key Information Extraction](./ppstructure/kie/README.md) - [Inference and Deployment](./deploy/README.md) - [Python Inference](./ppstructure/docs/inference_en.md) - - [C++ Inference]() + - [C++ Inference](./deploy/cpp_infer/readme.md) - [Serving](./deploy/pdserving/README.md) -- [Academic algorithms](./doc/doc_en/algorithms_en.md) +- [Academic Algorithms](./doc/doc_en/algorithm_overview_en.md) - [Text detection](./doc/doc_en/algorithm_overview_en.md) - [Text recognition](./doc/doc_en/algorithm_overview_en.md) - - [End-to-end](./doc/doc_en/algorithm_overview_en.md) + - [End-to-end OCR](./doc/doc_en/algorithm_overview_en.md) + - [Table Recognition](./doc/doc_en/algorithm_overview_en.md) + - [Key Information Extraction](./doc/doc_en/algorithm_overview_en.md) - [Add New Algorithms to PaddleOCR](./doc/doc_en/add_new_algorithm_en.md) - Data Annotation and Synthesis - [Semi-automatic Annotation Tool: PPOCRLabel](./PPOCRLabel/README.md) @@ -135,9 +140,9 @@ PaddleOCR support a variety of cutting-edge algorithms related to OCR, and devel - [General OCR Datasets(Chinese/English)](doc/doc_en/dataset/datasets_en.md) - [HandWritten_OCR_Datasets(Chinese)](doc/doc_en/dataset/handwritten_datasets_en.md) - [Various OCR Datasets(multilingual)](doc/doc_en/dataset/vertical_and_multilingual_datasets_en.md) - - [layout analysis](doc/doc_en/dataset/layout_datasets_en.md) - - [table recognition](doc/doc_en/dataset/table_datasets_en.md) - - [DocVQA](doc/doc_en/dataset/docvqa_datasets_en.md) + - [Layout Analysis](doc/doc_en/dataset/layout_datasets_en.md) + - [Table Recognition](doc/doc_en/dataset/table_datasets_en.md) + - [Key Information Extraction](doc/doc_en/dataset/kie_datasets_en.md) - [Code Structure](./doc/doc_en/tree_en.md) - [Visualization](#Visualization) - [Community](#Community) @@ -176,7 +181,7 @@ PaddleOCR support a variety of cutting-edge algorithms related to OCR, and devel
-PP-Structure +PP-Structurev2 - layout analysis + table recognition
@@ -185,12 +190,28 @@ PaddleOCR support a variety of cutting-edge algorithms related to OCR, and devel - SER (Semantic entity recognition)
- + +
+ +
+ +
+ +
+
- RE (Relation Extraction)
- + +
+ +
+ +
+ +
+
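The README hunks above advertise PP-Structurev2's new layout-recovery path, and the `__init__.py` change later in this diff starts exporting `sorted_layout_boxes` and `convert_info_docx` for exactly that flow. As a cross-check, here is a minimal sketch of the whl-package API this PR wires together; the image path is hypothetical, the output folder assumes the default, and the call signatures are taken from the `paddleocr.py` hunk below:

```python
import os
from copy import deepcopy

import cv2
from paddleocr import (PPStructure, save_structure_res,
                       sorted_layout_boxes, convert_info_docx)

engine = PPStructure(show_log=True)

img_path = 'ppstructure/docs/table/1.png'   # hypothetical test image
save_folder = './output'
img_name = os.path.basename(img_path).split('.')[0]

img = cv2.imread(img_path)
result = engine(img)                        # layout analysis + table recognition
save_structure_res(result, save_folder, img_name)

# The two newly exported helpers: sort regions into reading order, then
# recover the layout into a .docx file (mirrors the calls in paddleocr.py).
w = img.shape[1]
res = sorted_layout_boxes(deepcopy(result), w)
convert_info_docx(img, res, save_folder, img_name, False)  # False: do not also save a PDF
```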
diff --git a/README_ch.md b/README_ch.md index c52d5f3dd17839254c3f58794e016f08dc0b21bc..8ffa7a3755970374e1559d3c771bd82c02010a61 100755 --- a/README_ch.md +++ b/README_ch.md @@ -27,21 +27,20 @@ PaddleOCR旨在打造一套丰富、领先、且实用的OCR工具库,助力 ## 近期更新 -- **🔥2022.7 发布[OCR场景应用集合](./applications)** - - 发布OCR场景应用集合,包含数码管、液晶屏、车牌、高精度SVTR模型等**7个垂类模型**,覆盖通用,制造、金融、交通行业的主要OCR垂类应用。 +- **🔥2022.8.24 发布 PaddleOCR [release/2.6](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.6)** + - 发布[PP-Structurev2](./ppstructure/),系统功能性能全面升级,适配中文场景,新增支持[版面复原](./ppstructure/recovery),支持**一行命令完成PDF转Word**; + - [版面分析](./ppstructure/layout)模型优化:模型存储减少95%,速度提升11倍,平均CPU耗时仅需41ms; + - [表格识别](./ppstructure/table)模型优化:设计3大优化策略,预测耗时不变情况下,模型精度提升6%; + - [关键信息抽取](./ppstructure/kie)模型优化:设计视觉无关模型结构,语义实体识别精度提升2.8%,关系抽取精度提升9.1%。 -- **🔥2022.5.9 发布PaddleOCR [release/2.5](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.5)** +- **🔥2022.8 发布 [OCR场景应用集合](./applications)** + - 包含数码管、液晶屏、车牌、高精度SVTR模型、手写体识别等**9个垂类模型**,覆盖通用,制造、金融、交通行业的主要OCR垂类应用。 + +- **2022.5.9 发布 PaddleOCR [release/2.5](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.5)** - 发布[PP-OCRv3](./doc/doc_ch/ppocr_introduction.md#pp-ocrv3),速度可比情况下,中文场景效果相比于PP-OCRv2再提升5%,英文场景提升11%,80语种多语言模型平均识别准确率提升5%以上; - 发布半自动标注工具[PPOCRLabelv2](./PPOCRLabel):新增表格文字图像、图像关键信息抽取任务和不规则文字图像的标注功能; - 发布OCR产业落地工具集:打通22种训练部署软硬件环境与方式,覆盖企业90%的训练部署环境需求; - 发布交互式OCR开源电子书[《动手学OCR》](./doc/doc_ch/ocr_book.md),覆盖OCR全栈技术的前沿理论与代码实践,并配套教学视频。 -- 2021.12.21 发布PaddleOCR [release/2.4](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.4) - - OCR算法新增1种文本检测算法([PSENet](./doc/doc_ch/algorithm_det_psenet.md)),3种文本识别算法([NRTR](./doc/doc_ch/algorithm_rec_nrtr.md)、[SEED](./doc/doc_ch/algorithm_rec_seed.md)、[SAR](./doc/doc_ch/algorithm_rec_sar.md)); - - 文档结构化算法新增1种关键信息提取算法([SDMGR](./ppstructure/docs/kie.md)),3种[DocVQA](./ppstructure/vqa)算法(LayoutLM、LayoutLMv2,LayoutXLM)。 -- 2021.9.7 发布PaddleOCR [release/2.3](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.3) - - 发布[PP-OCRv2](./doc/doc_ch/ppocr_introduction.md#pp-ocrv2),CPU推理速度相比于PP-OCR server提升220%;效果相比于PP-OCR mobile 提升7%。 -- 2021.8.3 发布PaddleOCR [release/2.2](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.2) - - 发布文档结构分析[PP-Structure](./ppstructure/README_ch.md)工具包,支持版面分析与表格识别(含Excel导出)。 > [更多](./doc/doc_ch/update.md) @@ -49,7 +48,9 @@ PaddleOCR旨在打造一套丰富、领先、且实用的OCR工具库,助力 支持多种OCR相关前沿算法,在此基础上打造产业级特色模型[PP-OCR](./doc/doc_ch/ppocr_introduction.md)和[PP-Structure](./ppstructure/README_ch.md),并打通数据生产、模型训练、压缩、预测部署全流程。 -![](./doc/features.png) +
+ +
> 上述内容的使用方法建议从文档教程中的快速开始体验 @@ -213,14 +214,30 @@ PaddleOCR旨在打造一套丰富、领先、且实用的OCR工具库,助力 - SER(语义实体识别)
- +
+
+ +
+ +
+ +
+ - RE(关系提取)
- + +
+ +
+
+
+ +
+ diff --git a/__init__.py b/__init__.py index 15a9aca4da19a981b9e678e7cc93e33cf40fc81c..11436094c163db1b91f5ac38f2936a53017016c1 100644 --- a/__init__.py +++ b/__init__.py @@ -16,5 +16,6 @@ from .paddleocr import * __version__ = paddleocr.VERSION __all__ = [ 'PaddleOCR', 'PPStructure', 'draw_ocr', 'draw_structure_result', - 'save_structure_res', 'download_with_progressbar' + 'save_structure_res', 'download_with_progressbar', 'sorted_layout_boxes', + 'convert_info_docx' ] diff --git a/applications/README.md b/applications/README.md index 017c2a9f6f696904e9bf2f1180104e66c90ee712..2637cd6eaf0c3c59d56673c5e2d294ee7fca2b8b 100644 --- a/applications/README.md +++ b/applications/README.md @@ -20,10 +20,10 @@ PaddleOCR场景应用覆盖通用,制造、金融、交通行业的主要OCR ### 通用 -| 类别 | 亮点 | 模型下载 | 教程 | -| ---------------------- | ------------ | -------------- | --------------------------------------- | -| 高精度中文识别模型SVTR | 比PP-OCRv3识别模型精度高3%,可用于数据挖掘或对预测效率要求不高的场景。| [模型下载](#2) | [中文](./高精度中文识别模型.md)/English | -| 手写体识别 | 新增字形支持 | | | +| 类别 | 亮点 | 模型下载 | 教程 | 示例图 | +| ---------------------- | ------------------------------------------------------------ | -------------- | --------------------------------------- | ------------------------------------------------------------ | +| 高精度中文识别模型SVTR | 比PP-OCRv3识别模型精度高3%,
可用于数据挖掘或对预测效率要求不高的场景。 | [模型下载](#2) | [中文](./高精度中文识别模型.md)/English | |
+| 手写体识别 | 新增字形支持 | [模型下载](#2) | [中文](./手写文字识别.md)/English | |

@@ -42,14 +42,14 @@ PaddleOCR场景应用覆盖通用,制造、金融、交通行业的主要OCR

### 金融

-| 类别 | 亮点 | 模型下载 | 教程 | 示例图 |
-| -------------- | ------------------------ | -------------- | ----------------------------------- | ------------------------------------------------------------ |
-| 表单VQA | 多模态通用表单结构化提取 | [模型下载](#2) | [中文](./多模态表单识别.md)/English | |
-| 增值税发票 | 尽请期待 | | | |
-| 印章检测与识别 | 端到端弯曲文本识别 | | | |
-| 通用卡证识别 | 通用结构化提取 | | | |
-| 身份证识别 | 结构化提取、图像阴影 | | | |
-| 合同比对 | 密集文本检测、NLP串联 | | | |
+| 类别 | 亮点 | 模型下载 | 教程 | 示例图 |
+| -------------- | ----------------------------- | -------------- | ------------------------------------- | ------------------------------------------------------------ |
+| 表单VQA | 多模态通用表单结构化提取 | [模型下载](#2) | [中文](./多模态表单识别.md)/English | |
+| 增值税发票 | 关键信息抽取,SER、RE任务训练 | [模型下载](#2) | [中文](./发票关键信息抽取.md)/English | |
+| 印章检测与识别 | 端到端弯曲文本识别 | | | |
+| 通用卡证识别 | 通用结构化提取 | | | |
+| 身份证识别 | 结构化提取、图像阴影 | | | |
+| 合同比对 | 密集文本检测、NLP串联 | | | |

diff --git a/applications/README_en.md b/applications/README_en.md new file mode 100644 index 0000000000000000000000000000000000000000..95c56a1f740faa95e1fe3adeaeb90bfe902f8ed8 --- /dev/null +++ b/applications/README_en.md @@ -0,0 +1,79 @@
+English | [简体中文](README.md)
+
+# Application
+
+PaddleOCR scene applications cover the main OCR vertical applications of the general, manufacturing, finance, and transportation industries. Building on the general capabilities of PP-OCR and PP-Structure, they demonstrate in notebook form how to fine-tune with scene data, optimize models, and augment data, providing demonstrations and inspiration for developers who want to land OCR applications quickly.
+
+- [Tutorial](#1)
+    - [General](#11)
+    - [Manufacturing](#12)
+    - [Finance](#13)
+    - [Transportation](#14)
+
+- [Model Download](#2)
+
+
+
+## Tutorial
+
+
+
+### General
+
+| Case | Feature | Model Download | Tutorial | Example |
+| ---------------------------------------------- | ---------------- | -------------------- | --------------------------------------- | ------------------------------------------------------------ |
+| High-precision Chinese recognition model SVTR | New model | [Model Download](#2) | [中文](./高精度中文识别模型.md)/English | |
+| Chinese handwriting recognition | New font support | [Model Download](#2) | [中文](./手写文字识别.md)/English | |
+
+
+
+### Manufacturing
+
+| Case | Feature | Model Download | Tutorial | Example |
+| ------------------------------ | ------------------------------------------------------------ | -------------------- | ------------------------------------------------------------ | ------------------------------------------------------------ |
+| Digital tube | Digital tube data synthesis, recognition model fine-tuning | [Model Download](#2) | [中文](./光功率计数码管字符识别/光功率计数码管字符识别.md)/English | |
+| LCD screen | Detection model distillation, serving deployment | [Model Download](#2) | [中文](./液晶屏读数识别.md)/English | |
+| Packaging production date | Dot matrix character synthesis, overexposed and overly dark text recognition | [Model Download](#2) | [中文](./包装生产日期识别.md)/English | |
+| PCB text recognition | Small-size text detection and recognition | [Model Download](#2) | [中文](./PCB字符识别/PCB字符识别.md)/English | |
+| Meter text recognition | High-resolution image detection fine-tuning | [Model Download](#2) | | |
+| LCD character defect detection | Non-text character recognition | | | |
+
+
+
+### Finance
+
+| Case | Feature | Model Download | Tutorial | Example |
+| ----------------------------------- | -------------------------------------------------- | -------------------- | ------------------------------------- | ------------------------------------------------------------ |
+| Form visual question answering | Multimodal general form structured extraction | [Model Download](#2) | [中文](./多模态表单识别.md)/English | |
+| VAT invoice | Key information extraction, SER and RE task fine-tuning | [Model Download](#2) | [中文](./发票关键信息抽取.md)/English | |
+| Seal detection and recognition | End-to-end curved text recognition | | | |
+| Universal card recognition | Universal structured extraction | | | |
+| ID card recognition | Structured extraction, image shading | | | |
+| Contract key information extraction | Dense text detection, NLP concatenation | | | |
+
+
+
+### Transportation
+
+| Case | Feature | Model Download | Tutorial | Example |
+| ----------------------------------------------- | ------------------------------------------------------------ | -------------------- | ----------------------------------- | ------------------------------------------------------------ |
+| License plate recognition | Multi-angle images, lightweight models, edge-side deployment | [Model Download](#2) | [中文](./轻量级车牌识别.md)/English | |
+| Driver's license/driving license identification | coming soon | | | |
+| Express text recognition | coming soon | | | |
+
+
+
+## Model Download
+
+- For international developers: We're building a way to download these trained models. Since the current tutorials are in Chinese, if you are good at both Chinese and English, or are willing to polish the English documents, please let us know in [discussion](https://github.com/PaddlePaddle/PaddleOCR/discussions).
+- For Chinese developers: If you want to download the trained application models for the above scenarios, scan the QR code below with WeChat, follow the PaddlePaddle official account to fill in the questionnaire, and join the PaddleOCR official group to get the 20G OCR learning materials (including the "Dive into OCR" e-book, course videos, application models, and other materials).
+
+<div align="center">
+ +
+
+   If you are an enterprise developer and have not found a suitable solution in the above scenarios, you can fill in the [OCR Application Cooperation Survey Questionnaire](https://paddle.wjx.cn/vj/QwF7GKw.aspx) to carry out different levels of cooperation with the official team **for free**, including but not limited to problem abstraction, technical solution determination, project Q&A, and joint research and development. If you have already used PaddleOCR in your project, you can also fill out this questionnaire to promote it jointly with PaddlePaddle and enhance your company's technical visibility. Looking forward to your submission!
+
+trackgit-views
+
 diff --git "a/applications/\345\217\221\347\245\250\345\205\263\351\224\256\344\277\241\346\201\257\346\212\275\345\217\226.md" "b/applications/\345\217\221\347\245\250\345\205\263\351\224\256\344\277\241\346\201\257\346\212\275\345\217\226.md" index cd7fa1a0b3c988b21b33fe8f123e7d7c3e851ca5..14a6a1c8f1dd2350767afa162063b06791e79dd4 100644 --- "a/applications/\345\217\221\347\245\250\345\205\263\351\224\256\344\277\241\346\201\257\346\212\275\345\217\226.md" +++ "b/applications/\345\217\221\347\245\250\345\205\263\351\224\256\344\277\241\346\201\257\346\212\275\345\217\226.md" @@ -279,6 +279,12 @@ LayoutXLM与VI-LayoutXLM针对该场景的训练结果如下所示。

可以看出,对于VI-LayoutXLM相比LayoutXLM的Hmean高了1.3%。

+如需获取已训练模型,请扫码填写问卷,加入PaddleOCR官方交流群获取全部OCR垂类模型下载链接、《动手学OCR》电子书等全套OCR学习资料🎁
+
+
+ +
+ #### 4.4.3 模型评估 diff --git a/deploy/hubserving/readme.md b/deploy/hubserving/readme.md index 8144c2e7cefaed6f64763e414101445b2d80b81a..c583cc96ede437a1f65f9b1bddb69e84b7c54852 100755 --- a/deploy/hubserving/readme.md +++ b/deploy/hubserving/readme.md @@ -20,13 +20,14 @@ PaddleOCR提供2种服务部署方式: # 基于PaddleHub Serving的服务部署 -hubserving服务部署目录下包括文本检测、文本方向分类,文本识别、文本检测+文本方向分类+文本识别3阶段串联,表格识别和PP-Structure六种服务包,请根据需求选择相应的服务包进行安装和启动。目录结构如下: +hubserving服务部署目录下包括文本检测、文本方向分类,文本识别、文本检测+文本方向分类+文本识别3阶段串联,版面分析、表格识别和PP-Structure七种服务包,请根据需求选择相应的服务包进行安装和启动。目录结构如下: ``` deploy/hubserving/ └─ ocr_cls 文本方向分类模块服务包 └─ ocr_det 文本检测模块服务包 └─ ocr_rec 文本识别模块服务包 └─ ocr_system 文本检测+文本方向分类+文本识别串联服务包 + └─ structure_layout 版面分析服务包 └─ structure_table 表格识别服务包 └─ structure_system PP-Structure服务包 ``` @@ -41,6 +42,7 @@ deploy/hubserving/ocr_system/ ``` ## 1. 近期更新 +* 2022.08.23 新增版面分析服务。 * 2022.05.05 新增PP-OCRv3检测和识别模型。 * 2022.03.30 新增PP-Structure和表格识别两种服务。 @@ -59,9 +61,9 @@ pip3 install paddlehub==2.1.0 --upgrade -i https://mirror.baidu.com/pypi/simple 检测模型:./inference/ch_PP-OCRv3_det_infer/ 识别模型:./inference/ch_PP-OCRv3_rec_infer/ 方向分类器:./inference/ch_ppocr_mobile_v2.0_cls_infer/ -版面分析模型:./inference/layout_infer/ +版面分析模型:./inference/picodet_lcnet_x1_0_fgd_layout_infer/ 表格结构识别模型:./inference/ch_ppstructure_mobile_v2.0_SLANet_infer/ -``` +``` **模型路径可在`params.py`中查看和修改。** 更多模型可以从PaddleOCR提供的模型库[PP-OCR](../../doc/doc_ch/models_list.md)和[PP-Structure](../../ppstructure/docs/models_list.md)下载,也可以替换成自己训练转换好的模型。 @@ -87,6 +89,9 @@ hub install deploy/hubserving/structure_table/ # 或,安装PP-Structure服务模块: hub install deploy/hubserving/structure_system/ + +# 或,安装版面分析服务模块: +hub install deploy/hubserving/structure_layout/ ``` * 在Windows环境下(文件夹的分隔符为`\`),安装示例如下: @@ -108,6 +113,9 @@ hub install deploy\hubserving\structure_table\ # 或,安装PP-Structure服务模块: hub install deploy\hubserving\structure_system\ + +# 或,安装版面分析服务模块: +hub install deploy\hubserving\structure_layout\ ``` ### 2.4 启动服务 @@ -118,7 +126,7 @@ $ hub serving start --modules [Module1==Version1, Module2==Version2, ...] \ --port XXXX \ --use_multiprocess \ --workers \ -``` +``` **参数:** @@ -168,7 +176,7 @@ $ hub serving start --modules [Module1==Version1, Module2==Version2, ...] \ ```shell export CUDA_VISIBLE_DEVICES=3 hub serving start -c deploy/hubserving/ocr_system/config.json -``` +``` ## 3. 
发送预测请求

配置好服务端,可使用以下命令发送预测请求,获取预测结果:

@@ -185,6 +193,7 @@ hub serving start -c deploy/hubserving/ocr_system/config.json
`http://127.0.0.1:8868/predict/ocr_system`
`http://127.0.0.1:8869/predict/structure_table`
`http://127.0.0.1:8870/predict/structure_system`
+`http://127.0.0.1:8871/predict/structure_layout`
- **image_dir**:测试图像路径,可以是单张图片路径,也可以是图像集合目录路径
- **visualize**:是否可视化结果,默认为False
- **output**:可视化结果保存路径,默认为`./hubserving_result`

@@ -203,17 +212,19 @@ hub serving start -c deploy/hubserving/ocr_system/config.json
|text_region|list|文本位置坐标|
|html|str|表格的html字符串|
|regions|list|版面分析+表格识别+OCR的结果,每一项为一个list,包含表示区域坐标的`bbox`,区域类型的`type`和区域结果的`res`三个字段|
+|layout|list|版面分析的结果,每一项为一个dict,包含版面区域坐标的`bbox`,区域类型的`label`|

不同模块返回的字段不同,如,文本识别服务模块返回结果不含`text_region`字段,具体信息如下:

-| 字段名/模块名 | ocr_det | ocr_cls | ocr_rec | ocr_system | structure_table | structure_system |
-| --- | --- | --- | --- | --- | --- |--- |
-|angle| | ✔ | | ✔ | ||
-|text| | |✔|✔| | ✔ |
-|confidence| |✔ |✔| | | ✔|
-|text_region| ✔| | |✔ | | ✔|
-|html| | | | |✔ |✔|
-|regions| | | | |✔ |✔ |
+| 字段名/模块名 | ocr_det | ocr_cls | ocr_rec | ocr_system | structure_table | structure_system | structure_layout |
+| --- | --- | --- | --- | --- | --- | --- | --- |
+|angle| | ✔ | | ✔ | |||
+|text| | |✔|✔| | ✔ | |
+|confidence| |✔ |✔| | | ✔| |
+|text_region| ✔| | |✔ | | ✔| |
+|html| | | | |✔ |✔||
+|regions| | | | |✔ |✔ | |
+|layout| | | | | | | ✔ |

**说明:** 如果需要增加、删除、修改返回字段,可在相应模块的`module.py`文件中进行修改,完整流程参考下一节自定义修改服务模块。

diff --git a/deploy/hubserving/readme_en.md b/deploy/hubserving/readme_en.md index 06eaaebacb51744844473c0ffe8b189dc545492c..f09fe46417c7567305e5ce05a14be74d33450c31 100755 --- a/deploy/hubserving/readme_en.md +++ b/deploy/hubserving/readme_en.md @@ -20,13 +20,14 @@ PaddleOCR provides 2 service deployment methods:

# Service deployment based on PaddleHub Serving

-The hubserving service deployment directory includes six service packages: text detection, text angle class, text recognition, text detection+text angle class+text recognition three-stage series connection, table recognition and PP-Structure. Please select the corresponding service package to install and start service according to your needs. The directory is as follows:
+The hubserving service deployment directory includes seven service packages: text detection, text angle class, text recognition, text detection+text angle class+text recognition three-stage series connection, layout analysis, table recognition and PP-Structure. Please select the corresponding service package to install and start service according to your needs. The directory is as follows:
```
deploy/hubserving/
  └─  ocr_det     text detection module service package
  └─  ocr_cls     text angle class module service package
  └─  ocr_rec     text recognition module service package
  └─  ocr_system  text detection+text angle class+text recognition three-stage series connection service package
+  └─  structure_layout  layout analysis service package
  └─  structure_table  table recognition service package
  └─  structure_system  PP-Structure service package
```
@@ -43,6 +44,7 @@ deploy/hubserving/ocr_system/

* 2022.05.05 add PP-OCRv3 text detection and recognition models.
* 2022.03.30 add PP-Structure and table recognition services。
+* 2022.08.23 add layout analysis services.

## 2.
Quick start service

@@ -61,7 +63,7 @@ Before installing the service module, you need to prepare the inference model an
text detection model: ./inference/ch_PP-OCRv3_det_infer/
text recognition model: ./inference/ch_PP-OCRv3_rec_infer/
text angle classifier: ./inference/ch_ppocr_mobile_v2.0_cls_infer/
-layout parse model: ./inference/layout_infer/
+layout analysis model: ./inference/picodet_lcnet_x1_0_fgd_layout_infer/
table recognition: ./inference/ch_ppstructure_mobile_v2.0_SLANet_infer/
```

@@ -89,6 +91,9 @@ hub install deploy/hubserving/structure_table/
# Or install PP-Structure service module
hub install deploy/hubserving/structure_system/
+
+# Or install layout analysis service module
+hub install deploy/hubserving/structure_layout/
```

* On Windows platform, the examples are as follows.
@@ -110,6 +115,9 @@ hub install deploy/hubserving/structure_table/
# Or install PP-Structure service module
hub install deploy\hubserving\structure_system\
+
+# Or install layout analysis service module
+hub install deploy\hubserving\structure_layout\
```

### 2.4 Start service
@@ -190,8 +198,9 @@ For example, if using the configuration file to start the text angle classificat
`http://127.0.0.1:8866/predict/ocr_cls`
`http://127.0.0.1:8867/predict/ocr_rec`
`http://127.0.0.1:8868/predict/ocr_system`
-`http://127.0.0.1:8869/predict/structure_table`
+`http://127.0.0.1:8869/predict/structure_table`
`http://127.0.0.1:8870/predict/structure_system`
+`http://127.0.0.1:8871/predict/structure_layout`
- **image_dir**:Test image path, can be a single image path or an image directory path
- **visualize**:Whether to visualize the results, the default value is False
- **output**:The folder to save visualization results, default value is `./hubserving_result`

@@ -212,17 +221,19 @@ The returned result is a list. Each item in the list is a dict. The dict may con
|text_region|list|text location coordinates|
|html|str|table html str|
|regions|list|The result of layout analysis + table recognition + OCR, each item is a list, including `bbox` indicating area coordinates, `type` of area type and `res` of area results|
+|layout|list|The result of layout analysis, each item is a dict, including `bbox` indicating area coordinates, `label` of area type|

The fields returned by different modules are different. For example, the results returned by the text recognition service module do not contain `text_region`. The details are as follows:

-| field name/module name | ocr_det | ocr_cls | ocr_rec | ocr_system | structure_table | structure_system |
-| --- | --- | --- | --- | --- | --- |--- |
-|angle| | ✔ | | ✔ | ||
-|text| | |✔|✔| | ✔ |
-|confidence| |✔ |✔| | | ✔|
-|text_region| ✔| | |✔ | | ✔|
-|html| | | | |✔ |✔|
-|regions| | | | |✔ |✔ |
+| field name/module name | ocr_det | ocr_cls | ocr_rec | ocr_system | structure_table | structure_system | structure_layout |
+| --- | --- | --- | --- | --- | --- |--- |--- |
+|angle| | ✔ | | ✔ | || |
+|text| | |✔|✔| | ✔ | |
+|confidence| |✔ |✔| | | ✔| |
+|text_region| ✔| | |✔ | | ✔| |
+|html| | | | |✔ |✔| |
+|regions| | | | |✔ |✔ | |
+|layout| | | | | | |✔ |

**Note:** If you need to add, delete or modify the returned fields, you can modify the file `module.py` of the corresponding module. For the complete process, refer to the user-defined modification service module in the next section.
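To make the request/response contract documented above concrete, here is a minimal client sketch for the new layout service. It assumes the service was started with the shipped `deploy/hubserving/structure_layout/config.json` (hence port 8871) and the standard PaddleHub serving JSON envelope used by the other OCR modules; the test image path is hypothetical:

```python
import base64
import json

import requests  # assumes requests is installed

def file_to_base64(path):
    with open(path, 'rb') as f:
        return base64.b64encode(f.read()).decode('utf8')

# Port 8871 comes from deploy/hubserving/structure_layout/config.json in this PR.
url = 'http://127.0.0.1:8871/predict/structure_layout'
data = {'images': [file_to_base64('./ppstructure/docs/table/1.png')]}
headers = {'Content-type': 'application/json'}

r = requests.post(url=url, headers=headers, data=json.dumps(data))
# Each result carries a 'layout' list of {'bbox': [...], 'label': ...} dicts,
# matching the new 'layout' field documented in the tables above.
print(r.json())
```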
diff --git a/deploy/hubserving/structure_layout/__init__.py b/deploy/hubserving/structure_layout/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c747d3e7aeca842933e083dffc01ef1fba3f4e85 --- /dev/null +++ b/deploy/hubserving/structure_layout/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. \ No newline at end of file diff --git a/deploy/hubserving/structure_layout/config.json b/deploy/hubserving/structure_layout/config.json new file mode 100644 index 0000000000000000000000000000000000000000..bc52c1ab603d5659f90a5ed8a72cdb06638fb9e5 --- /dev/null +++ b/deploy/hubserving/structure_layout/config.json @@ -0,0 +1,16 @@ +{ + "modules_info": { + "structure_layout": { + "init_args": { + "version": "1.0.0", + "use_gpu": true + }, + "predict_args": { + } + } + }, + "port": 8871, + "use_multiprocess": false, + "workers": 2 +} + diff --git a/deploy/hubserving/structure_layout/module.py b/deploy/hubserving/structure_layout/module.py new file mode 100644 index 0000000000000000000000000000000000000000..7091f123fc0039e4886d8763096952d7c445184c --- /dev/null +++ b/deploy/hubserving/structure_layout/module.py @@ -0,0 +1,143 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import sys
+sys.path.insert(0, ".")
+import copy
+
+import time
+import paddlehub
+from paddlehub.common.logger import logger
+from paddlehub.module.module import moduleinfo, runnable, serving
+import cv2
+import paddlehub as hub
+
+from tools.infer.utility import base64_to_cv2
+from ppstructure.layout.predict_layout import LayoutPredictor as _LayoutPredictor
+from ppstructure.utility import parse_args
+from deploy.hubserving.structure_layout.params import read_params
+
+
+@moduleinfo(
+    name="structure_layout",
+    version="1.0.0",
+    summary="PP-Structure layout service",
+    author="paddle-dev",
+    author_email="paddle-dev@baidu.com",
+    type="cv/structure_layout")
+class LayoutPredictor(hub.Module):
+    def _initialize(self, use_gpu=False, enable_mkldnn=False):
+        """
+        initialize with the necessary elements
+        """
+        cfg = self.merge_configs()
+        cfg.use_gpu = use_gpu
+        if use_gpu:
+            try:
+                _places = os.environ["CUDA_VISIBLE_DEVICES"]
+                int(_places[0])
+                print("use gpu: ", use_gpu)
+                print("CUDA_VISIBLE_DEVICES: ", _places)
+                cfg.gpu_mem = 8000
+            except Exception:
+                raise RuntimeError(
+                    "Environment Variable CUDA_VISIBLE_DEVICES is not set correctly. If you wanna use gpu, please set CUDA_VISIBLE_DEVICES via export CUDA_VISIBLE_DEVICES=cuda_device_id."
+                )
+        cfg.ir_optim = True
+        cfg.enable_mkldnn = enable_mkldnn
+
+        self.layout_predictor = _LayoutPredictor(cfg)
+
+    def merge_configs(self):
+        # default cfg
+        backup_argv = copy.deepcopy(sys.argv)
+        sys.argv = sys.argv[:1]
+        cfg = parse_args()
+
+        update_cfg_map = vars(read_params())
+
+        for key in update_cfg_map:
+            cfg.__setattr__(key, update_cfg_map[key])
+
+        sys.argv = copy.deepcopy(backup_argv)
+        return cfg
+
+    def read_images(self, paths=[]):
+        images = []
+        for img_path in paths:
+            assert os.path.isfile(
+                img_path), "The {} isn't a valid file.".format(img_path)
+            img = cv2.imread(img_path)
+            if img is None:
+                logger.info("error in loading image:{}".format(img_path))
+                continue
+            images.append(img)
+        return images
+
+    def predict(self, images=[], paths=[]):
+        """
+        Get the layout analysis results of the input images.
+        Args:
+            images (list(numpy.ndarray)): images data, shape of each is [H, W, C]. Used when paths is not set.
+            paths (list[str]): The paths of images. Used when images is not set.
+        Returns:
+            res (list): The layout results of images.
+        """
+
+        if images != [] and isinstance(images, list) and paths == []:
+            predicted_data = images
+        elif images == [] and isinstance(paths, list) and paths != []:
+            predicted_data = self.read_images(paths)
+        else:
+            raise TypeError("The input data is inconsistent with expectations.")
+
+        assert predicted_data != [], "There is not any image to be predicted. Please check the input data."
+
+        all_results = []
+        for img in predicted_data:
+            if img is None:
+                logger.info("error in loading image")
+                all_results.append([])
+                continue
+            starttime = time.time()
+            res, _ = self.layout_predictor(img)
+            elapse = time.time() - starttime
+            logger.info("Predict time: {}".format(elapse))
+
+            for item in res:
+                item['bbox'] = item['bbox'].tolist()
+            all_results.append({'layout': res})
+        return all_results
+
+    @serving
+    def serving_method(self, images, **kwargs):
+        """
+        Run as a service.
+ """ + images_decode = [base64_to_cv2(image) for image in images] + results = self.predict(images_decode, **kwargs) + return results + + +if __name__ == '__main__': + layout = LayoutPredictor() + layout._initialize() + image_path = ['./ppstructure/docs/table/1.png'] + res = layout.predict(paths=image_path) + print(res) diff --git a/deploy/hubserving/structure_layout/params.py b/deploy/hubserving/structure_layout/params.py new file mode 100755 index 0000000000000000000000000000000000000000..448b66ac42dac555f084299f525ee9e91ad481d8 --- /dev/null +++ b/deploy/hubserving/structure_layout/params.py @@ -0,0 +1,32 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + + +class Config(object): + pass + + +def read_params(): + cfg = Config() + + # params for layout analysis + cfg.layout_model_dir = './inference/picodet_lcnet_x1_0_fgd_layout_infer/' + cfg.layout_dict_path = './ppocr/utils/dict/layout_dict/layout_publaynet_dict.txt' + cfg.layout_score_threshold = 0.5 + cfg.layout_nms_threshold = 0.5 + return cfg diff --git a/doc/doc_ch/algorithm_rec_srn.md b/doc/doc_ch/algorithm_rec_srn.md index ca7961359eb902fafee959b26d02f324aece233a..dd61a388c7024fabdadec1c120bd3341ed0197cc 100644 --- a/doc/doc_ch/algorithm_rec_srn.md +++ b/doc/doc_ch/algorithm_rec_srn.md @@ -78,7 +78,7 @@ python3 tools/export_model.py -c configs/rec/rec_r50_fpn_srn.yml -o Global.pretr SRN文本识别模型推理,可以执行如下命令: ``` -python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words/en/word_1.png" --rec_model_dir="./inference/rec_srn/" --rec_image_shape="1,64,256" --rec_char_type="ch" --rec_algorithm="SRN" --rec_char_dict_path=./ppocr/utils/ic15_dict.txt --use_space_char=False +python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words/en/word_1.png" --rec_model_dir="./inference/rec_srn/" --rec_image_shape="1,64,256" --rec_algorithm="SRN" --rec_char_dict_path=./ppocr/utils/ic15_dict.txt --use_space_char=False ``` diff --git a/doc/doc_ch/dataset/kie_datasets.md b/doc/doc_ch/dataset/kie_datasets.md index 7f8d14cbc4ad724621f28c7d6ca1f8c2ac79f097..be5624dbf257150745a79db25f0367ccee339559 100644 --- a/doc/doc_ch/dataset/kie_datasets.md +++ b/doc/doc_ch/dataset/kie_datasets.md @@ -1,6 +1,6 @@ # 关键信息抽取数据集 -这里整理了常见的DocVQA数据集,持续更新中,欢迎各位小伙伴贡献数据集~ +这里整理了常见的关键信息抽取数据集,持续更新中,欢迎各位小伙伴贡献数据集~ - [FUNSD数据集](#funsd) - [XFUND数据集](#xfund) diff --git a/doc/doc_ch/inference_args.md b/doc/doc_ch/inference_args.md index fa188ab7c800eaabae8a4ff54413af162dd60e43..36efc6fbf7a6ec62bc700964dc13261fecdb9bd5 100644 --- a/doc/doc_ch/inference_args.md +++ b/doc/doc_ch/inference_args.md @@ -15,7 +15,7 @@ | save_crop_res | bool | False | 是否保存OCR的识别文本图像 | | crop_res_save_dir | str | "./output" | 保存OCR识别出来的文本图像路径 | | use_mp | bool | False | 是否开启多进程预测 | -| total_process_num | int | 6 | 开启的进城数,`use_mp`为`True`时生效 | +| total_process_num | int | 6 | 开启的进程数,`use_mp`为`True`时生效 | | process_id | int | 0 | 
当前进程的id号,无需自己修改 |
| benchmark | bool | False | 是否开启benchmark,对预测速度、显存占用等进行统计 |
| save_log_path | str | "./log_output/" | 开启`benchmark`时,日志结果的保存文件夹 |

@@ -39,10 +39,10 @@

| 参数名称 | 类型 | 默认值 | 含义 |
| :--: | :--: | :--: | :--: |
-| det_algorithm | str | "DB" | 文本检测算法名称,目前支持`DB`, `EAST`, `SAST`, `PSE` |
+| det_algorithm | str | "DB" | 文本检测算法名称,目前支持`DB`, `EAST`, `SAST`, `PSE`, `DB++`, `FCE` |
| det_model_dir | str | xx | 检测inference模型路径 |
| det_limit_side_len | int | 960 | 检测的图像边长限制 |
-| det_limit_type | str | "max" | 检测的变成限制类型,目前支持`min`, `max`,`min`表示保证图像最短边不小于`det_limit_side_len`,`max`表示保证图像最长边不大于`det_limit_side_len` |
+| det_limit_type | str | "max" | 检测的边长限制类型,目前支持`min`和`max`,`min`表示保证图像最短边不小于`det_limit_side_len`,`max`表示保证图像最长边不大于`det_limit_side_len` |

其中,DB算法相关参数如下

@@ -85,9 +85,9 @@ PSE算法相关参数如下

| 参数名称 | 类型 | 默认值 | 含义 |
| :--: | :--: | :--: | :--: |
-| rec_algorithm | str | "CRNN" | 文本识别算法名称,目前支持`CRNN`, `SRN`, `RARE`, `NETR`, `SAR` |
+| rec_algorithm | str | "CRNN" | 文本识别算法名称,目前支持`CRNN`, `SRN`, `RARE`, `NRTR`, `SAR`, `ViTSTR`, `ABINet`, `VisionLAN`, `SPIN`, `RobustScanner`, `SVTR`, `SVTR_LCNet` |
| rec_model_dir | str | 无,如果使用识别模型,该项是必填项 | 识别inference模型路径 |
-| rec_image_shape | list | [3, 32, 320] | 识别时的图像尺寸, |
+| rec_image_shape | list | [3, 48, 320] | 识别时的图像尺寸 |
| rec_batch_num | int | 6 | 识别的batch size |
| max_text_length | int | 25 | 识别结果最大长度,在`SRN`中有效 |
| rec_char_dict_path | str | "./ppocr/utils/ppocr_keys_v1.txt" | 识别的字符字典文件 |
diff --git a/doc/doc_ch/inference_ppocr.md b/doc/doc_ch/inference_ppocr.md index 622ac995d37ce290ee51af06164b0c2aef8b5a14..514f905393984e2189b4c9c920ca4aeb91ac6da1 100644 --- a/doc/doc_ch/inference_ppocr.md +++ b/doc/doc_ch/inference_ppocr.md @@ -158,3 +158,5 @@ python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --de

执行命令后,识别结果图像如下:

![](../imgs_results/system_res_00018069_v3.jpg)
+
+更多关于推理超参数的配置与解释,请参考:[模型推理超参数解释教程](./inference_args.md)。
diff --git a/doc/doc_en/algorithm_en.md b/doc/doc_en/algorithm_en.md deleted file mode 100644 index c880336b4ad528eab2cce479edf11fce0b43f435..0000000000000000000000000000000000000000 --- a/doc/doc_en/algorithm_en.md +++ /dev/null @@ -1,11 +0,0 @@
-# Academic Algorithms and Models
-
-PaddleOCR will add cutting-edge OCR algorithms and models continuously. Check out the supported models and tutorials by clicking the following list:
-
-
-- [text detection algorithms](./algorithm_overview_en.md#11)
-- [text recognition algorithms](./algorithm_overview_en.md#12)
-- [end-to-end algorithms](./algorithm_overview_en.md#2)
-- [table recognition algorithms](./algorithm_overview_en.md#3)
-
-Developers are welcome to contribute more algorithms! Please refer to [add new algorithm](./add_new_algorithm_en.md) guideline.
diff --git a/doc/doc_en/algorithm_overview_en.md b/doc/doc_en/algorithm_overview_en.md index 3f59bf9c829920fb43fa7f89858b4586ceaac26f..5bf569e3e1649cfabbe196be7e1a55d1caa3bf61 100755 --- a/doc/doc_en/algorithm_overview_en.md +++ b/doc/doc_en/algorithm_overview_en.md @@ -7,7 +7,11 @@
- [3. Table Recognition Algorithms](#3)
- [4. Key Information Extraction Algorithms](#4)

-This tutorial lists the OCR algorithms supported by PaddleOCR, as well as the models and metrics of each algorithm on **English public datasets**. It is mainly used for algorithm introduction and algorithm performance comparison. For more models on other datasets including Chinese, please refer to [PP-OCR v2.0 models list](./models_list_en.md).
+This tutorial lists the OCR algorithms supported by PaddleOCR, as well as the models and metrics of each algorithm on **English public datasets**. It is mainly used for algorithm introduction and algorithm performance comparison. For more models on other datasets including Chinese, please refer to [PP-OCRv3 models list](./models_list_en.md).
+
+>>
+Developers are welcome to contribute more algorithms! Please refer to the [add new algorithm](./add_new_algorithm_en.md) guideline.
+
diff --git a/doc/doc_en/dataset/kie_datasets_en.md b/doc/doc_en/dataset/kie_datasets_en.md index 3a8b744fc0b2653aab5c1435996a2ef73dd336e4..7b476f77d0380496d026c448937e59b23ee24c87 100644 --- a/doc/doc_en/dataset/kie_datasets_en.md +++ b/doc/doc_en/dataset/kie_datasets_en.md @@ -1,9 +1,10 @@
-## Key Imnformation Extraction dataset
+## Key Information Extraction dataset
+
+Here are the common key information extraction datasets, which are being updated continuously. Welcome to contribute datasets.

-Here are the common DocVQA datasets, which are being updated continuously. Welcome to contribute datasets.
- [FUNSD dataset](#funsd)
- [XFUND dataset](#xfund)
-- [wildreceipt dataset](#wildreceipt数据集)
+- [wildreceipt dataset](#wildreceipt-dataset)

#### 1. FUNSD dataset
@@ -20,7 +21,8 @@ Here are the common DocVQA datasets, which are being updated continuously. Welco

#### 2. XFUND dataset

- **Data source**: https://github.com/doc-analysis/XFUND
-- **Data introduction**: XFUND is a multilingual form comprehension dataset, which contains form data in 7 different languages, and all are manually annotated in the form of key-value pairs. The data for each language contains 199 form data, which are divided into 149 training sets and 50 test sets. Part of the image and the annotation box visualization are shown below:
+- **Data introduction**: XFUND is a multilingual form comprehension dataset, which contains form data in 7 different languages, and all are manually annotated in the form of key-value pairs. The data for each language contains 199 form data, which are divided into 149 training sets and 50 test sets. Part of the image and the annotation box visualization are shown below.
+
diff --git a/doc/doc_en/dataset/layout_datasets_en.md b/doc/doc_en/dataset/layout_datasets_en.md new file mode 100644 index 0000000000000000000000000000000000000000..54c88609d0f25f65b4878fac96a43de5f1cc3164 --- /dev/null +++ b/doc/doc_en/dataset/layout_datasets_en.md @@ -0,0 +1,55 @@
+## Layout Analysis Dataset
+
+Here are the common datasets of layout analysis, which are being updated continuously. Welcome to contribute datasets.
+
+- [PubLayNet dataset](#publaynet)
+- [CDLA dataset](#CDLA)
+- [TableBank dataset](#TableBank)
+
+
+Most of the layout analysis datasets are object detection datasets. In addition to open source datasets, you can also label or synthesize datasets using tools such as [labelme](https://github.com/wkentaro/labelme).
+
+
+
+#### 1. PubLayNet dataset
+
+- **Data source**: https://github.com/ibm-aur-nlp/PubLayNet
+- **Data introduction**: The PubLayNet dataset contains 350000 training images and 11000 validation images. There are 5 categories in total, namely: `text, title, list, table, figure`. Some images and their annotations are shown below.
+
+ + +
+
+- **Download address**: https://developer.ibm.com/exchanges/data/all/publaynet/
+- **Note**: When using this dataset, you need to follow the [CDLA-Permissive](https://cdla.io/permissive-1-0/) license.
+
+
+
+#### 2. CDLA dataset
+- **Data source**: https://github.com/buptlihang/CDLA
+- **Data introduction**: The CDLA dataset contains 5000 training images and 1000 validation images with 10 categories, which are `Text, Title, Figure, Figure caption, Table, Table caption, Header, Footer, Reference, Equation`. Some images and their annotations are shown below.
+
+ + +
+
+- **Download address**: https://github.com/buptlihang/CDLA
+- **Note**: When you train a detection model on the CDLA dataset using [PaddleDetection](https://github.com/PaddlePaddle/PaddleDetection/tree/develop), you need to remove the labels `__ignore__` and `_background_`.
+
+
+#### 3. TableBank dataset
+- **Data source**: https://doc-analysis.github.io/tablebank-page/index.html
+- **Data introduction**: The TableBank dataset contains two types of documents: Latex (187199 training images, 7265 validation images and 5719 testing images) and Word (73383 training images, 2735 validation images and 2281 testing images). Some images and their annotations are shown below.
+
+ + +
+
+- **Download address**: https://doc-analysis.github.io/tablebank-page/index.html
+- **Note**: When using this dataset, you need to follow the [Apache-2.0](https://github.com/doc-analysis/TableBank/blob/master/LICENSE) license.
diff --git a/doc/doc_en/inference_args_en.md b/doc/doc_en/inference_args_en.md new file mode 100644 index 0000000000000000000000000000000000000000..f2c99fc8297d47f27a219bf7d8e7f2ea518257f0 --- /dev/null +++ b/doc/doc_en/inference_args_en.md @@ -0,0 +1,120 @@
+# PaddleOCR Model Inference Parameter Explanation
+
+When using PaddleOCR for model inference, you can customize the parameters to modify the model, data, preprocessing, and postprocessing (parameter file: [utility.py](../../tools/infer/utility.py)). The detailed parameter explanations are as follows:
+
+* Global parameters
+
+| parameters | type | default | implication |
+| :--: | :--: | :--: | :--: |
+| image_dir | str | None, must be specified explicitly | Image or folder path |
+| vis_font_path | str | "./doc/fonts/simfang.ttf" | font path for visualization |
+| drop_score | float | 0.5 | Results with a recognition score less than this value will be discarded and will not be returned as results |
+| use_pdserving | bool | False | Whether to use Paddle Serving for prediction |
+| warmup | bool | False | Whether to enable warmup, which can be used when measuring prediction time |
+| draw_img_save_dir | str | "./inference_results" | The folder for saving the OCR results of the end-to-end system prediction |
+| save_crop_res | bool | False | Whether to save the text images recognized by OCR |
+| crop_res_save_dir | str | "./output" | The path for saving the text images recognized by OCR |
+| use_mp | bool | False | Whether to enable multi-process prediction |
+| total_process_num | int | 6 | The number of processes, which takes effect when `use_mp` is `True` |
+| process_id | int | 0 | The id number of the current process, no need to modify it yourself |
+| benchmark | bool | False | Whether to enable benchmark, and make statistics on prediction speed, memory usage, etc. |
+| save_log_path | str | "./log_output/" | Folder where log results are saved when `benchmark` is enabled |
+| show_log | bool | True | Whether to show the log information in the inference |
+| use_onnx | bool | False | Whether to enable onnx prediction |
+
+
+* Prediction engine related parameters
+
+| parameters | type | default | implication |
+| :--: | :--: | :--: | :--: |
+| use_gpu | bool | True | Whether to use GPU for prediction |
+| ir_optim | bool | True | Whether to analyze and optimize the calculation graph. The prediction process can be accelerated when `ir_optim` is enabled |
+| use_tensorrt | bool | False | Whether to enable tensorrt |
+| min_subgraph_size | int | 15 | The minimum subgraph size in tensorrt. When the size of the subgraph is greater than this value, it will try to use the trt engine to calculate the subgraph.
|
+| precision | str | fp32 | The precision of prediction, supports `fp32`, `fp16`, `int8` |
+| enable_mkldnn | bool | True | Whether to enable mkldnn |
+| cpu_threads | int | 10 | The number of CPU threads used for prediction when mkldnn is enabled |
+
+* Text detection model related parameters
+
+| parameters | type | default | implication |
+| :--: | :--: | :--: | :--: |
+| det_algorithm | str | "DB" | Text detection algorithm name, currently supports `DB`, `EAST`, `SAST`, `PSE`, `DB++`, `FCE` |
+| det_model_dir | str | xx | Detection inference model paths |
+| det_limit_side_len | int | 960 | image side length limit |
+| det_limit_type | str | "max" | The side length limit type, currently supports `min` and `max`. `min` means to ensure that the shortest side of the image is not less than `det_limit_side_len`, `max` means to ensure that the longest side of the image is not greater than `det_limit_side_len` |
+
+The relevant parameters of the DB algorithm are as follows
+
+| parameters | type | default | implication |
+| :--: | :--: | :--: | :--: |
+| det_db_thresh | float | 0.3 | In the probability map output by DB, only pixels with a score greater than this threshold will be considered as text pixels |
+| det_db_box_thresh | float | 0.6 | Within the detection box, when the average score of all pixels is greater than the threshold, the result will be considered as a text area |
+| det_db_unclip_ratio | float | 1.5 | The expansion factor of the `Vatti clipping` algorithm, which is used to expand the text area |
+| max_batch_size | int | 10 | max batch size |
+| use_dilation | bool | False | Whether to dilate the segmentation results to obtain better detection results |
+| det_db_score_mode | str | "fast" | DB detection result score calculation method, supports `fast` and `slow`. `fast` calculates the average score over all pixels within the bounding rectangle of the polygon, while `slow` calculates the average score over all pixels within the original polygon; the latter is slower but more accurate.
|
+
+The relevant parameters of the EAST algorithm are as follows
+
+| parameters | type | default | implication |
+| :--: | :--: | :--: | :--: |
+| det_east_score_thresh | float | 0.8 | Threshold for the score map in EAST postprocess |
+| det_east_cover_thresh | float | 0.1 | Average score threshold for text boxes in EAST postprocess |
+| det_east_nms_thresh | float | 0.2 | Threshold of nms in EAST postprocess |
+
+The relevant parameters of the SAST algorithm are as follows
+
+| parameters | type | default | implication |
+| :--: | :--: | :--: | :--: |
+| det_sast_score_thresh | float | 0.5 | Score threshold in SAST postprocess |
+| det_sast_nms_thresh | float | 0.5 | Threshold of nms in SAST postprocess |
+| det_sast_polygon | bool | False | Whether polygon detection, set to True for curved text scenes (such as Total-Text) |
+
+The relevant parameters of the PSE algorithm are as follows
+
+| parameters | type | default | implication |
+| :--: | :--: | :--: | :--: |
+| det_pse_thresh | float | 0.0 | Threshold for binarizing the output image |
+| det_pse_box_thresh | float | 0.85 | Threshold for filtering boxes; boxes below this threshold are discarded |
+| det_pse_min_area | float | 16 | The minimum area of a box; boxes below this threshold are discarded |
+| det_pse_box_type | str | "box" | The type of the returned box, box: four point coordinates, poly: all point coordinates of the curved text |
+| det_pse_scale | int | 1 | The ratio of the input image relative to the post-processed image, such as an image of `640*640`, the network output is `160*160`, and when the scale is 2, the shape of the post-processed image is `320*320`. Increasing this value can speed up the post-processing, but it will bring about a decrease in accuracy |
+
+* Text recognition model related parameters
+
+| parameters | type | default | implication |
+| :--: | :--: | :--: | :--: |
+| rec_algorithm | str | "CRNN" | Text recognition algorithm name, currently supports `CRNN`, `SRN`, `RARE`, `NRTR`, `SAR`, `ViTSTR`, `ABINet`, `VisionLAN`, `SPIN`, `RobustScanner`, `SVTR`, `SVTR_LCNet` |
+| rec_model_dir | str | None, it is required if using the recognition model | recognition inference model paths |
+| rec_image_shape | list | [3, 48, 320] | Image size at the time of recognition |
+| rec_batch_num | int | 6 | batch size |
+| max_text_length | int | 25 | The maximum length of the recognition result, valid in `SRN` |
+| rec_char_dict_path | str | "./ppocr/utils/ppocr_keys_v1.txt" | character dictionary file |
+| use_space_char | bool | True | Whether to include spaces, if `True`, the `space` character will be added at the end of the character dictionary |
+
+
+* End-to-end text detection and recognition model related parameters
+
+| parameters | type | default | implication |
+| :--: | :--: | :--: | :--: |
+| e2e_algorithm | str | "PGNet" | End-to-end algorithm name, currently supports `PGNet` |
+| e2e_model_dir | str | None, it is required if using the end-to-end model | end-to-end model inference model path |
+| e2e_limit_side_len | int | 768 | End-to-end input image side length limit |
+| e2e_limit_type | str | "max" | End-to-end side length limit type, currently supports `min` and `max`.
`min` means to ensure that the shortest side of the image is not less than `e2e_limit_side_len`, `max` means to ensure that the longest side of the image is not greater than `e2e_limit_side_len` |
+| e2e_pgnet_score_thresh | float | 0.5 | End-to-end score threshold, results below this threshold are discarded |
+| e2e_char_dict_path | str | "./ppocr/utils/ic15_dict.txt" | Recognition dictionary file path |
+| e2e_pgnet_valid_set | str | "totaltext" | The name of the validation set, currently supports `totaltext`, `partvgg`; the post-processing methods corresponding to different datasets are different, and it can be consistent with the training process |
+| e2e_pgnet_mode | str | "fast" | PGNet's detection result score calculation method, supports `fast` and `slow`. `fast` calculates the average score over all pixels within the bounding rectangle of the polygon, while `slow` calculates the average score over all pixels within the original polygon; the latter is slower but more accurate. |
+
+
+* Angle classifier model related parameters
+
+| parameters | type | default | implication |
+| :--: | :--: | :--: | :--: |
+| use_angle_cls | bool | False | whether to use an angle classifier |
+| cls_model_dir | str | None, if you need to use, you must specify the path explicitly | angle classifier inference model path |
+| cls_image_shape | list | [3, 48, 192] | prediction shape |
+| label_list | list | ['0', '180'] | The angle value corresponding to the class id |
+| cls_batch_num | int | 6 | batch size |
+| cls_thresh | float | 0.9 | Prediction threshold, when the model prediction result is 180 degrees, and the score is greater than the threshold, the final prediction result is considered to be 180 degrees and needs to be flipped |
diff --git a/doc/doc_en/inference_ppocr_en.md b/doc/doc_en/inference_ppocr_en.md index 0f57b0ba6b226c19ecb1e0b60afdfa34302b8e78..4c9db51e1d23e5ac05cfcb3ec43748df75c0b36c 100755 --- a/doc/doc_en/inference_ppocr_en.md +++ b/doc/doc_en/inference_ppocr_en.md @@ -160,3 +160,5 @@ python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --de

After executing the command, the recognition result image is as follows:

![](../imgs_results/system_res_00018069_v3.jpg)
+
+For more configuration and explanation of inference parameters, please refer to: [Model Inference Parameters Explanation Tutorial](./inference_args_en.md).
diff --git a/doc/features.png b/doc/features.png deleted file mode 100644 index 273e4beb74771b723ab732f703863fa2a3a4c21c..0000000000000000000000000000000000000000 Binary files a/doc/features.png and /dev/null differ
diff --git a/doc/features_en.png b/doc/features_en.png deleted file mode 100644 index 310a1b7e50920304521a5fa68c5c2e2a881d3917..0000000000000000000000000000000000000000 Binary files a/doc/features_en.png and /dev/null differ
diff --git a/paddleocr.py b/paddleocr.py index f6fb095af34a58cc91b9fd0f22b2e95bf833e010..1a236f2474cf3d5ef1fc6ab61955157bb1837db2 100644 --- a/paddleocr.py +++ b/paddleocr.py @@ -286,11 +286,17 @@ MODEL_URLS = {
            }
        },
        'layout': {
-            'ch': {
+            'en': {
                'url':
-                'https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_layout_infer.tar',
+                'https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_infer.tar',
                'dict_path':
                'ppocr/utils/dict/layout_dict/layout_publaynet_dict.txt'
+            },
+            'ch': {
+                'url':
+                'https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_cdla_infer.tar',
+                'dict_path':
+
'ppocr/utils/dict/layout_dict/layout_cdla_dict.txt'
            }
        }
    }
@@ -556,7 +562,7 @@ class PPStructure(StructureSystem):
            params.table_model_dir, os.path.join(BASE_DIR, 'whl', 'table'),
            table_model_config['url'])
        layout_model_config = get_model_config(
-            'STRUCTURE', params.structure_version, 'layout', 'ch')
+            'STRUCTURE', params.structure_version, 'layout', lang)
        params.layout_model_dir, layout_url = confirm_model_dir_url(
            params.layout_model_dir, os.path.join(BASE_DIR, 'whl', 'layout'),
            layout_model_config['url'])
@@ -578,7 +584,7 @@ class PPStructure(StructureSystem):
        logger.debug(params)
        super().__init__(params)

-    def __call__(self, img, return_ocr_result_in_table=False):
+    def __call__(self, img, return_ocr_result_in_table=False, img_idx=0):
        if isinstance(img, str):
            # download net image
            if img.startswith('http'):
@@ -596,7 +602,8 @@ class PPStructure(StructureSystem):
        if isinstance(img, np.ndarray) and len(img.shape) == 2:
            img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)

-        res, _ = super().__call__(img, return_ocr_result_in_table)
+        res, _ = super().__call__(
+            img, return_ocr_result_in_table, img_idx=img_idx)
        return res

@@ -631,10 +638,54 @@ def main():
            for line in result:
                logger.info(line)
    elif args.type == 'structure':
-        result = engine(img_path)
-        save_structure_res(result, args.output, img_name)
-
-        for item in result:
+        img, flag_gif, flag_pdf = check_and_read(img_path)
+        if not flag_gif and not flag_pdf:
+            img = cv2.imread(img_path)
+
+        if not flag_pdf:
+            if img is None:
+                logger.error("error in loading image:{}".format(img_path))
+                continue
+            img_paths = [[img_path, img]]
+        else:
+            img_paths = []
+            for index, pdf_img in enumerate(img):
+                os.makedirs(
+                    os.path.join(args.output, img_name), exist_ok=True)
+                pdf_img_path = os.path.join(
+                    args.output, img_name,
+                    img_name + '_' + str(index) + '.jpg')
+                cv2.imwrite(pdf_img_path, pdf_img)
+                img_paths.append([pdf_img_path, pdf_img])
+
+        all_res = []
+        for index, (new_img_path, img) in enumerate(img_paths):
+            logger.info('processing {}/{} page:'.format(index + 1,
+                                                        len(img_paths)))
+            new_img_name = os.path.basename(new_img_path).split('.')[0]
+            result = engine(new_img_path, img_idx=index)
+            save_structure_res(result, args.output, img_name, index)
+
+            if args.recovery and result != []:
+                from copy import deepcopy
+                from ppstructure.recovery.recovery_to_doc import sorted_layout_boxes
+                h, w, _ = img.shape
+                result_cp = deepcopy(result)
+                result_sorted = sorted_layout_boxes(result_cp, w)
+                all_res += result_sorted
+
+        if args.recovery and all_res != []:
+            try:
+                from ppstructure.recovery.recovery_to_doc import convert_info_docx
+                convert_info_docx(img, all_res, args.output, img_name,
+                                  args.save_pdf)
+            except Exception as ex:
+                logger.error(
+                    "error in layout recovery image:{}, err msg: {}".format(
+                        img_name, ex))
+                continue
+
+        for item in all_res:
            item.pop('img')
            item.pop('res')
            logger.info(item)
diff --git a/ppocr/utils/save_load.py b/ppocr/utils/save_load.py index f86125521d19342f63a9fcb3bdcaed02cc4c6463..aa65f290c0a5f4f13b3103fb4404815e2ae74a88 100644 --- a/ppocr/utils/save_load.py +++ b/ppocr/utils/save_load.py @@ -104,8 +104,9 @@ def load_model(config, model, optimizer=None, model_type='det'):
                continue
            pre_value = params[key]
            if pre_value.dtype == paddle.float16:
-                pre_value = pre_value.astype(paddle.float32)
                is_float16 = True
+            if pre_value.dtype != value.dtype:
+                pre_value = pre_value.astype(value.dtype)
            if list(value.shape) == list(pre_value.shape):
                new_state_dict[key] = pre_value
            else:
@@ -162,8 +163,9 @@ def
load_pretrained_params(model, path): logger.warning("The pretrained params {} not in model".format(k1)) else: if params[k1].dtype == paddle.float16: - params[k1] = params[k1].astype(paddle.float32) is_float16 = True + if params[k1].dtype != state_dict[k1].dtype: + params[k1] = params[k1].astype(state_dict[k1].dtype) if list(state_dict[k1].shape) == list(params[k1].shape): new_state_dict[k1] = params[k1] else: diff --git a/ppstructure/README.md b/ppstructure/README.md index 66df10b2ec4d52fb743c40893d5fc5aa7d6ab5be..fb3697bc1066262833ee20bcbb8f79833f264f14 100644 --- a/ppstructure/README.md +++ b/ppstructure/README.md @@ -1,120 +1,115 @@ English | [简体中文](README_ch.md) - [1. Introduction](#1-introduction) -- [2. Update log](#2-update-log) -- [3. Features](#3-features) -- [4. Results](#4-results) - - [4.1 Layout analysis and table recognition](#41-layout-analysis-and-table-recognition) - - [4.2 KIE](#42-kie) -- [5. Quick start](#5-quick-start) -- [6. PP-Structure System](#6-pp-structure-system) - - [6.1 Layout analysis and table recognition](#61-layout-analysis-and-table-recognition) - - [6.1.1 Layout analysis](#611-layout-analysis) - - [6.1.2 Table recognition](#612-table-recognition) - - [6.2 KIE](#62-kie) -- [7. Model List](#7-model-list) - - [7.1 Layout analysis model](#71-layout-analysis-model) - - [7.2 OCR and table recognition model](#72-ocr-and-table-recognition-model) - - [7.3 KIE model](#73-kie-model) +- [2. Features](#2-features) +- [3. Results](#3-results) + - [3.1 Layout analysis and table recognition](#31-layout-analysis-and-table-recognition) + - [3.2 Layout Recovery](#32-layout-recovery) + - [3.3 KIE](#33-kie) +- [4. Quick start](#4-quick-start) +- [5. Model List](#5-model-list) ## 1. Introduction -PP-Structure is an OCR toolkit that can be used for document analysis and processing with complex structures, designed to help developers better complete document understanding tasks +PP-Structure is an intelligent document analysis system developed by the PaddleOCR team, which aims to help developers better complete tasks related to document understanding such as layout analysis and table recognition. -## 2. Update log -* 2022.02.12 KIE add LayoutLMv2 model。 -* 2021.12.07 add [KIE SER and RE tasks](kie/README.md)。 +The pipeline of PP-Structurev2 system is shown below. The document image first passes through the image direction correction module to identify the direction of the entire image and complete the direction correction. Then, two tasks of layout information analysis and key information extraction can be completed. -## 3. Features +- In the layout analysis task, the image first goes through the layout analysis model to divide the image into different areas such as text, table, and figure, and then analyze these areas separately. For example, the table area is sent to the form recognition module for structured recognition, and the text area is sent to the OCR engine for text recognition. Finally, the layout recovery module restores it to a word or pdf file with the same layout as the original image; +- In the key information extraction task, the OCR engine is first used to extract the text content, and then the SER(semantic entity recognition) module obtains the semantic entities in the image, and finally the RE(relationship extraction) module obtains the correspondence between the semantic entities, thereby extracting the required key information. 
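+
+A minimal sketch of the layout-information branch described above, using the whl-package quick-start API that this patch documents elsewhere (the image path follows the repo's sample data):
+
+```python
+import os
+import cv2
+from paddleocr import PPStructure, save_structure_res
+
+# Layout analysis + table recognition + OCR on a single document image.
+table_engine = PPStructure(show_log=True)
+
+img_path = 'ppstructure/docs/table/1.png'
+img = cv2.imread(img_path)
+result = table_engine(img)
+
+# Each table region is exported as an Excel file under ./output/<image name>/.
+save_structure_res(result, './output', os.path.basename(img_path).split('.')[0])
+for region in result:
+    print(region['type'], region['bbox'])
+```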
+ -The main features of PP-Structure are as follows: +More technical details: 👉 [PP-Structurev2 Technical Report](docs/PP-Structurev2_introduction.md) -- Support the layout analysis of documents, divide the documents into 5 types of areas **text, title, table, image and list** (conjunction with Layout-Parser) -- Support to extract the texts from the text, title, picture and list areas (used in conjunction with PP-OCR) -- Support to extract excel files from the table areas -- Support python whl package and command line usage, easy to use -- Support custom training for layout analysis and table structure tasks -- Support Document Key Information Extraction (KIE) tasks: Semantic Entity Recognition (SER) and Relation Extraction (RE) +PP-Structurev2 supports independent use or flexible collocation of each module. For example, you can use layout analysis alone or table recognition alone. Click the corresponding link below to get the tutorial for each independent module: -## 4. Results +- [Layout Analysis](layout/README.md) +- [Table Recognition](table/README.md) +- [Key Information Extraction](kie/README.md) +- [Layout Recovery](recovery/README.md) -### 4.1 Layout analysis and table recognition +## 2. Features - - -The figure shows the pipeline of layout analysis + table recognition. The image is first divided into four areas of image, text, title and table by layout analysis, and then OCR detection and recognition is performed on the three areas of image, text and title, and the table is performed table recognition, where the image will also be stored for use. - -### 4.2 KIE - -* SER -* -![](docs/kie/result_ser/zh_val_0_ser.jpg) | ![](docs/kie/result_ser/zh_val_42_ser.jpg) ----|--- - -Different colored boxes in the figure represent different categories. For xfun dataset, there are three categories: query, answer and header: +The main features of PP-Structurev2 are as follows: +- Support layout analysis of documents in the form of images/pdfs, which can be divided into areas such as **text, titles, tables, figures, formulas, etc.**; +- Support common Chinese and English **table detection** tasks; +- Support structured table recognition, and output the final result to **Excel file**; +- Support multimodal-based Key Information Extraction (KIE) tasks - **Semantic Entity Recognition** (SER) and **Relation Extraction (RE); +- Support **layout recovery**, that is, restore the document in word or pdf format with the same layout as the original image; +- Support customized training and multiple inference deployment methods such as python whl package quick start; +- Connect with the semi-automatic data labeling tool PPOCRLabel, which supports the labeling of layout analysis, table recognition, and SER. -* Dark purple: header -* Light purple: query -* Army green: answer +## 3. Results -The corresponding category and OCR recognition results are also marked at the top left of the OCR detection box. +PP-Structurev2 supports the independent use or flexible collocation of each module. For example, layout analysis can be used alone, or table recognition can be used alone. Only the visualization effects of several representative usage methods are shown here. +### 3.1 Layout analysis and table recognition -* RE - -![](docs/kie/result_re/zh_val_21_re.jpg) | ![](docs/kie/result_re/zh_val_40_re.jpg) ----|--- +The figure shows the pipeline of layout analysis + table recognition. 
The image is first divided into four areas of image, text, title and table by layout analysis, and then OCR detection and recognition is performed on the three areas of image, text and title, and the table is performed table recognition, where the image will also be stored for use. + +### 3.2 Layout recovery -In the figure, the red box represents the question, the blue box represents the answer, and the question and answer are connected by green lines. The corresponding category and OCR recognition results are also marked at the top left of the OCR detection box. +The following figure shows the effect of layout recovery based on the results of layout analysis and table recognition in the previous section. + -## 5. Quick start +### 3.3 KIE -Start from [Quick Installation](./docs/quickstart.md) +* SER -## 6. PP-Structure System +Different colored boxes in the figure represent different categories. -### 6.1 Layout analysis and table recognition +
+ +
-![pipeline](docs/table/pipeline.jpg) +
+ +
-In PP-Structure, the image will be divided into 5 types of areas **text, title, image list and table**. For the first 4 types of areas, directly use PP-OCR system to complete the text detection and recognition. For the table area, after the table structuring process, the table in image is converted into an Excel file with the same table style. +
+ +
-#### 6.1.1 Layout analysis +
+ +
-Layout analysis classifies image by region, including the use of Python scripts of layout analysis tools, extraction of designated category detection boxes, performance indicators, and custom training layout analysis models. For details, please refer to [document](layout/README.md). +
+ +
-#### 6.1.2 Table recognition +* RE -Table recognition converts table images into excel documents, which include the detection and recognition of table text and the prediction of table structure and cell coordinates. For detailed instructions, please refer to [document](table/README.md) +In the figure, the red box represents `Question`, the blue box represents `Answer`, and `Question` and `Answer` are connected by green lines. -### 6.2 KIE +
+ +
-Multi-modal based Key Information Extraction (KIE) methods include Semantic Entity Recognition (SER) and Relation Extraction (RE) tasks. Based on SER task, text recognition and classification in images can be completed. Based on THE RE task, we can extract the relation of the text content in the image, such as judge the problem pair. For details, please refer to [document](kie/README.md) +
+ +
-## 7. Model List +
+ +
-PP-Structure Series Model List (Updating) +
+ +
-### 7.1 Layout analysis model +## 4. Quick start -|model name|description|download|label_map| -| --- | --- | --- |--- | -| ppyolov2_r50vd_dcn_365e_publaynet | The layout analysis model trained on the PubLayNet dataset can divide image into 5 types of areas **text, title, table, picture, and list** | [PubLayNet](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_publaynet.tar) | {0: "Text", 1: "Title", 2: "List", 3:"Table", 4:"Figure"}| +Start from [Quick Start](./docs/quickstart_en.md). -### 7.2 OCR and table recognition model +## 5. Model List -|model name|description|model size|download| -| --- | --- | --- | --- | -|ch_PP-OCRv3_det| [New] Lightweight model, supporting Chinese, English, multilingual text detection | 3.8M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar)| -|ch_PP-OCRv3_rec| [New] Lightweight model, supporting Chinese, English, multilingual text recognition | 12.4M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_train.tar) | -|ch_ppstructure_mobile_v2.0_SLANet|Chinese table recognition model based on SLANet|9.3M|[inference model](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_train.tar) | +Some tasks need to use both the structured analysis models and the OCR models. For example, the table recognition task needs to use the table recognition model for structured analysis, and the OCR model to recognize the text in the table. Please select the appropriate models according to your specific needs. -### 7.3 KIE model +For structural analysis related model downloads, please refer to: +- [PP-Structure Model Zoo](./docs/models_list_en.md) -|model name|description|model size|download| -| --- | --- | --- | --- | -|ser_LayoutXLM_xfun_zhd|SER model trained on xfun Chinese dataset based on LayoutXLM|1.4G|[inference model coming soon]() / [trained model](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh.tar) | -|re_LayoutXLM_xfun_zh|RE model trained on xfun Chinese dataset based on LayoutXLM|1.4G|[inference model coming soon]() / [trained model](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutXLM_xfun_zh.tar) | +For OCR related model downloads, please refer to: +- [PP-OCR Model Zoo](../doc/doc_en/models_list_en.md) -If you need to use other models, you can download the model in [PPOCR model_list](../doc/doc_en/models_list_en.md) and [PPStructure model_list](./docs/models_list.md) diff --git a/ppstructure/README_ch.md b/ppstructure/README_ch.md index 6539002bfe1497853dfa11eb774cf3c453567988..87a9c625b32c32e9c7fffb8ebc9b9fdf3b2130db 100644 --- a/ppstructure/README_ch.md +++ b/ppstructure/README_ch.md @@ -21,7 +21,7 @@ PP-Structurev2系统流程图如下所示,文档图像首先经过图像矫正 - 关键信息抽取任务中,首先使用OCR引擎提取文本内容,然后由语义实体识别模块获取图像中的语义实体,最后经关系抽取模块获取语义实体之间的对应关系,从而提取需要的关键信息。 -更多技术细节:👉 [PP-Structurev2技术报告]() +更多技术细节:👉 [PP-Structurev2技术报告](docs/PP-Structurev2_introduction.md) PP-Structurev2支持各个模块独立使用或灵活搭配,如,可以单独使用版面分析,或单独使用表格识别,点击下面相应链接获取各个独立模块的使用教程: @@ -76,6 +76,14 @@ PP-Structurev2支持各个模块独立使用或灵活搭配,如,可以单独
+
+ +
+ +
+ +
+ * RE 图中红色框表示`问题`,蓝色框表示`答案`,`问题`和`答案`之间使用绿色线连接。 @@ -88,6 +96,14 @@ PP-Structurev2支持各个模块独立使用或灵活搭配,如,可以单独 +
+ +
+ +
+ +
+ ## 4. 快速体验 diff --git a/ppstructure/docs/PP-Structurev2_introduction.md b/ppstructure/docs/PP-Structurev2_introduction.md new file mode 100644 index 0000000000000000000000000000000000000000..e337b563efea5b3fccbe81b14abcd50f1d36d70b --- /dev/null +++ b/ppstructure/docs/PP-Structurev2_introduction.md @@ -0,0 +1,426 @@ +# PP-Structurev2 + +## 目录 + +- [1. 背景](#1-背景) +- [2. 简介](#3-简介) +- [3. 整图方向矫正](#3-整图方向矫正) +- [4. 版面信息结构化](#4-版面信息结构化) + - [4.1 版面分析](#41-版面分析) + - [4.2 表格识别](#42-表格识别) + - [4.3 版面恢复](#43-版面恢复) +- [5. 关键信息抽取](#5-关键信息抽取) +- [6. Reference](#6-Reference) + +## 1. 背景 + +现实场景中包含大量的文档图像,它们以图片等非结构化形式存储。基于文档图像的结构化分析与信息抽取对于数据的数字化存储以及产业的数字化转型至关重要。基于该考虑,PaddleOCR自研并发布了PP-Structure智能文档分析系统,旨在帮助开发者更好的完成版面分析、表格识别、关键信息抽取等文档理解相关任务。 + +近期,PaddleOCR团队针对PP-Structurev1的版面分析、表格识别、关键信息抽取模块,进行了共计8个方面的升级,同时新增整图方向矫正、文档复原等功能,打造出一个全新的、效果更优的文档分析系统:PP-Structurev2。 + +## 2. 简介 + +PP-Structurev2在PP-Structurev1的基础上进一步改进,主要有以下3个方面升级: + + * **系统功能升级** :新增图像矫正和版面复原模块,图像转word/pdf、关键信息抽取能力全覆盖! + * **系统性能优化** : + * 版面分析:发布轻量级版面分析模型,速度提升**11倍**,平均CPU耗时仅需**41ms**! + * 表格识别:设计3大优化策略,预测耗时不变情况下,模型精度提升**6%**。 + * 关键信息抽取:设计视觉无关模型结构,语义实体识别精度提升**2.8%**,关系抽取精度提升**9.1%**。 + * **中文场景适配** :完成对版面分析与表格识别的中文场景适配,开源**开箱即用**的中文场景版面结构化模型! + +PP-Structurev2系统流程图如下所示,文档图像首先经过图像矫正模块,判断整图方向并完成转正,随后可以完成版面信息分析与关键信息抽取2类任务。版面分析任务中,图像首先经过版面分析模型,将图像划分为文本、表格、图像等不同区域,随后对这些区域分别进行识别,如,将表格区域送入表格识别模块进行结构化识别,将文本区域送入OCR引擎进行文字识别,最后使用版面恢复模块将其恢复为与原始图像布局一致的word或者pdf格式的文件;关键信息抽取任务中,首先使用OCR引擎提取文本内容,然后由语义实体识别模块获取图像中的语义实体,最后经关系抽取模块获取语义实体之间的对应关系,从而提取需要的关键信息。 + +
+ +
+ + +从算法改进思路来看,对系统中的3个关键子模块,共进行了8个方面的改进。 + +* 版面分析 + * PP-PicoDet:轻量级版面分析模型 + * FGD:兼顾全局与局部特征的模型蒸馏算法 + +* 表格识别 + * PP-LCNet: CPU友好型轻量级骨干网络 + * CSP-PAN:轻量级高低层特征融合模块 + * SLAHead:结构与位置信息对齐的特征解码模块 + +* 关键信息抽取 + * VI-LayoutXLM:视觉特征无关的多模态预训练模型结构 + * TB-YX:考虑阅读顺序的文本行排序逻辑 + * UDML:联合互学习知识蒸馏策略 + +最终,与PP-Structurev1相比: + +- 版面分析模型参数量减少95.6%,推理速度提升11倍,精度提升0.4%; +- 表格识别预测耗时不变,模型精度提升6%,端到端TEDS提升2%; +- 关键信息抽取模型速度提升2.8倍,语义实体识别模型精度提升2.8%;关系抽取模型精度提升9.1%。 + +下面对各个模块进行详细介绍。 + +## 3. 整图方向矫正 + +由于训练集一般以正方向图像为主,旋转过的文档图像直接输入模型会增加识别难度,影响识别效果。PP-Structurev2引入了整图方向矫正模块来判断含文字图像的方向,并将其进行方向调整。 + +我们直接调用PaddleClas中提供的文字图像方向分类模型-[PULC_text_image_orientation](https://github.com/PaddlePaddle/PaddleClas/blob/develop/docs/zh_CN/PULC/PULC_text_image_orientation.md),该模型部分数据集图像如下所示。不同于文本行方向分类器,文字图像方向分类模型针对整图进行方向判别。文字图像方向分类模型在验证集上精度高达99%,单张图像CPU预测耗时仅为`2.16ms`。 + +
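+
+For reference, this orientation classifier can be called directly through the PaddleClas whl package. A minimal sketch, assuming `paddleclas` is installed; the image path is a placeholder:
+
+```python
+import paddleclas
+
+# PULC text_image_orientation model: classifies the whole image as 0/90/180/270 degrees.
+model = paddleclas.PaddleClas(model_name="text_image_orientation")
+result = model.predict(input_data="doc_image.jpg")  # placeholder path
+print(next(result))  # predict() yields results batch by batch
+```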
+ +
+
+## 4. 版面信息结构化
+
+### 4.1 版面分析
+
+版面分析指的是对图片形式的文档进行区域划分,定位其中的关键区域,如文字、标题、表格、图片等。PP-Structurev1使用了PaddleDetection中开源的高效检测算法PP-YOLOv2完成版面分析的任务。
+
+在PP-Structurev2中,我们发布基于PP-PicoDet的轻量级版面分析模型,并针对版面分析场景定制图像尺度,同时使用FGD知识蒸馏算法,进一步提升模型精度。最终CPU上`41ms`即可完成版面分析过程(仅包含模型推理时间,数据预处理耗时大约50ms左右)。在公开数据集PubLayNet上,消融实验如下:
+
+| 实验序号 | 策略 | 模型存储(M) | mAP | CPU预测耗时(ms) |
+|:------:|:------:|:------:|:------:|:------:|
+| 1 | PP-YOLOv2(640*640) | 221 | 93.6% | 512 |
+| 2 | PP-PicoDet-LCNet2.5x(640*640) | 29.7 | 92.5% | 53.2 |
+| 3 | PP-PicoDet-LCNet2.5x(800*608) | 29.7 | 94.2% | 83.1 |
+| 4 | PP-PicoDet-LCNet1.0x(800*608) | 9.7 | 93.5% | 41.2 |
+| 5 | PP-PicoDet-LCNet1.0x(800*608) + FGD | 9.7 | 94% | 41.2 |
+
+* 测试条件
+  * paddle版本:2.3.0
+  * CPU:Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz,开启mkldnn,线程数为10
+
+在PubLayNet数据集上,与其他方法的性能对比如下表所示。可以看到,和基于Detectron2的版面分析工具layoutparser相比,我们的模型精度高出大约5%,预测速度快约69倍。
+
+| 模型 | mAP | CPU预测耗时 |
+|-------------------|-----------|------------|
+| layoutparser (Detectron2) | 88.98% | 2.9s |
+| PP-Structurev2 (PP-PicoDet) | **94%** | 41.2ms |
+
+[PubLayNet](https://github.com/ibm-aur-nlp/PubLayNet)数据集是一个大型的文档图像数据集,包含Text、Title、Table、Figure、List共5个类别。数据集中包含335,703张训练图像、11,245张验证图像和11,405张测试图像。训练数据与标注示例图如下所示:
+
+ +
+
+#### 4.1.1 优化策略
+
+**(1)轻量级版面分析模型PP-PicoDet**
+
+`PP-PicoDet`是PaddleDetection中提出的轻量级目标检测模型,通过使用PP-LCNet骨干网络、CSP-PAN特征融合模块、SimOTA标签分配方法等优化策略,最终在CPU与移动端具有卓越的性能。我们将PP-Structurev1中采用的PP-YOLOv2模型替换为`PP-PicoDet`,同时针对版面分析场景优化预测尺度,从针对目标检测设计的`640*640`调整为更适配文档图像的`800*608`,在`1.0x`配置下,模型精度与PP-YOLOv2相当,CPU平均预测速度可提升11倍。
+
+**(2)FGD知识蒸馏**
+
+FGD(Focal and Global Knowledge Distillation for Detectors)是一种兼顾局部与全局特征信息的模型蒸馏方法,分为Focal蒸馏和Global蒸馏2个部分。Focal蒸馏分离图像的前景和背景,让学生模型分别关注教师模型的前景和背景部分特征的关键像素;Global蒸馏部分重建不同像素之间的关系并将其从教师转移到学生,以补偿Focal蒸馏中丢失的全局信息。我们基于FGD蒸馏策略,使用教师模型PP-PicoDet-LCNet2.5x(mAP=94.2%)蒸馏学生模型PP-PicoDet-LCNet1.0x(mAP=93.5%),可将学生模型精度提升0.5%,和教师模型仅差0.2%,而预测速度比教师模型快1倍。
+
+#### 4.1.2 场景适配
+
+**(1)中文版面分析**
+
+除了英文公开数据集PubLayNet,我们也在中文场景进行了场景适配与方法验证。[CDLA](https://github.com/buptlihang/CDLA)是一个中文文档版面分析数据集,面向中文文献类(论文)场景,包含正文、标题等10个label。数据集中包含5,000张训练图像和1,000张验证图像。训练数据与标注示例图如下所示:
+
+ +
+ + +在CDLA 数据集上,消融实验如下: + +| 实验序号 | 策略 | mAP | +|:------:|:------:|:------:| +| 1 | PP-YOLOv2 | 84.7% | +| 2 | PP-PicoDet-LCNet2.5x(800*608) | 87.8% | +| 3 | PP-PicoDet-LCNet1.0x(800*608) | 84.5% | +| 4 | PP-PicoDet-LCNet1.0x(800*608) + FGD | 86.8% | + + +**(2)表格版面分析** + +在实际应用中,很多场景并不关注图像中的图片、文本等版面区域,而仅需要提取文档图像中的表格,此时版面分析任务退化为一个表格检测任务,表格检测往往也是表格识别的前序任务。面向中英文文档场景,我们整理了开源领域含表格的版面分析数据集,包括TableBank、DocBank等。融合后的数据集中包含496,405张训练集与9,495张验证集图像。 + +在表格数据集上,消融实验如下: + +| 实验序号 | 策略 | mAP | +|:------:|:------:|:------:| +| 1 | PP-YOLOv2 |91.3% | +| 2 | PP-PicoDet-LCNet2.5x(800*608) | 95.9% | +| 3 | PP-PicoDet-LCNet1.0x(800*608) | 95.2% | +| 4 | PP-PicoDet-LCNet1.0x(800*608) + FGD | 95.7% | + +表格检测效果示意图如下: + +
+ +
+ +### 4.2 表格识别 + +基于深度学习的表格识别算法种类丰富,PP-Structurev1中,我们基于文本识别算法RARE研发了端到端表格识别算法TableRec-RARE,模型输出为表格结构的HTML表示,进而可以方便地转化为Excel文件。PP-Structurev2中,我们对模型结构和损失函数等5个方面进行升级,提出了 SLANet (Structure Location Alignment Network) ,模型结构如下图所示: + +
+ +
+ +在PubTabNet英文表格识别数据集上的消融实验如下: + +|策略|Acc|TEDS|推理速度(CPU+MKLDNN)|模型大小| +|---|---|---|---|---| +|TableRec-RARE| 71.73% | 93.88% |779ms |6.8M| +|+PP-LCNet| 74.71% |94.37% |778ms| 8.7M| +|+CSP-PAN| 75.68%| 94.72% |708ms| 9.3M| +|+SLAHead| 77.7%|94.85%| 766ms| 9.2M| +|+MergeToken| 76.31%| 95.89%|766ms| 9.2M| + +* 测试环境 + * paddle版本:2.3.1 + * CPU:Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz,开启mkldnn,线程数为10 + +在PubtabNet英文表格识别数据集上,和其他方法对比如下: + +|策略|Acc|TEDS|推理速度(CPU+MKLDNN)|模型大小| +|---|---|---|---|---| +|TableMaster|77.9%|96.12%|2144ms|253M| +|TableRec-RARE| 71.73% | 93.88% |779ms |6.8M| +|SLANet|76.31%| 95.89%|766ms|9.2M| + +#### 4.2.1 优化策略 + +**(1) CPU友好型轻量级骨干网络PP-LCNet** + +PP-LCNet是结合Intel-CPU端侧推理特性而设计的轻量高性能骨干网络,该方案在图像分类任务上取得了比ShuffleNetV2、MobileNetV3、GhostNet等轻量级模型更优的“精度-速度”均衡。PP-Structurev2中,我们采用PP-LCNet作为骨干网络,表格识别模型精度从71.73%提升至72.98%;同时加载通过SSLD知识蒸馏方案训练得到的图像分类模型权重作为表格识别的预训练模型,最终精度进一步提升2.95%至74.71%。 + +**(2)轻量级高低层特征融合模块CSP-PAN** + +对骨干网络提取的特征进行融合,可以有效解决尺度变化较大等复杂场景中的模型预测问题。早期,FPN模块被提出并用于特征融合,但是它的特征融合过程仅包含单向(高->低),融合不够充分。CSP-PAN基于PAN进行改进,在保证特征融合更为充分的同时,使用CSP block、深度可分离卷积等策略减小了计算量。在表格识别场景中,我们进一步将CSP-PAN的通道数从128降低至96以降低模型大小。最终表格识别模型精度提升0.97%至75.68%,预测速度提升10%。 + +**(3)结构与位置信息对齐的特征解码模块SLAHead** + +TableRec-RARE的TableAttentionHead如下图a所示,TableAttentionHead在执行完全部step的计算后拿到最终隐藏层状态表征(hiddens),随后hiddens经由SDM(Structure Decode Module)和CLDM(Cell Location Decode Module)模块生成全部的表格结构token和单元格坐标。但是这种设计忽略了单元格token和坐标之间一一对应的关系。 + +PP-Structurev2中,我们设计SLAHead模块,对单元格token和坐标之间做了对齐操作,如下图b所示。在SLAHead中,每一个step的隐藏层状态表征会分别送入SDM和CLDM来得到当前step的token和坐标,每个step的token和坐标输出分别进行concat得到表格的html表达和全部单元格的坐标。此外,考虑到表格识别模型的单元格准确率依赖于表格结构的识别准确,我们将损失函数中表格结构分支与单元格定位分支的权重比从1:1提升到8:1,并使用收敛更稳定的Smoothl1 Loss替换定位分支中的MSE Loss。最终模型精度从75.68%提高至77.7%。 + + +
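+
+The re-weighted objective described in (3) is straightforward to write down. Below is a minimal sketch of such a loss, assuming already-aligned, unmasked tensors; the exact reduction and masking used in the released training configs may differ:
+
+```python
+import paddle.nn.functional as F
+
+def slahead_loss(structure_logits, structure_targets, loc_preds, loc_targets):
+    # Structure branch: token-level cross entropy over the table structure tokens.
+    structure_loss = F.cross_entropy(structure_logits, structure_targets)
+    # Location branch: Smooth L1 converges more stably than MSE for cell coordinates.
+    loc_loss = F.smooth_l1_loss(loc_preds, loc_targets)
+    # The structure branch is weighted 8:1 against the cell location branch.
+    return 8.0 * structure_loss + 1.0 * loc_loss
+```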
+ +
+ + +**(4)其他** + +TableRec-RARE算法中,我们使用``和``两个单独的token来表示一个非跨行列单元格,这种表示方式限制了网络对于单元格数量较多表格的处理能力。 + +PP-Structurev2中,我们参考TableMaster中的token处理方法,将``和``合并为一个token-``。合并token后,验证集中token长度大于500的图片也参与模型评估,最终模型精度降低为76.31%,但是端到端TEDS提升1.04%。 + +#### 4.2.2 中文场景适配 + +除了上述模型策略的升级外,本次升级还开源了中文表格识别模型。在实际应用场景中,表格图像存在着各种各样的倾斜角度(PubTabNet数据集不存在该问题),因此在中文模型中,我们将单元格坐标回归的点数从2个(左上,右下)增加到4个(左上,右上,右下,左下)。在内部测试集上,模型升级前后指标如下: +|模型|acc| +|---|---| +|TableRec-RARE|44.3%| +|SLANet|59.35%| + +可视化结果如下,左为输入图像,右为识别的html表格 + + +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ + + + +### 4.3 版面恢复 + +版面恢复指的是文档图像经过OCR识别、版面分析、表格识别等方法处理后的内容可以与原始文档保持相同的排版方式,并输出到word等文档中。PP-Structurev2中,我们版面恢复系统,包含版面分析、表格识别、OCR文本检测与识别等子模块。 +下图展示了版面恢复的结果: + +
+ +
+ +## 5. 关键信息抽取 + +关键信息抽取指的是针对文档图像的文字内容,提取出用户关注的关键信息,如身份证中的姓名、住址等字段。PP-Structure中支持了基于多模态LayoutLM系列模型的语义实体识别 (Semantic Entity Recognition, SER) 以及关系抽取 (Relation Extraction, RE) 任务。PP-Structurev2中,我们对模型结构以及下游任务训练方法进行升级,提出了VI-LayoutXLM(Visual-feature Independent LayoutXLM),具体流程图如下所示。 + + +
+ +
+ + +具体优化策略包括: + +* VI-LayoutXLM:视觉特征无关的多模态预训练模型结构 +* TB-YX:考虑人类阅读顺序的文本行排序逻辑 +* UDML:联合互学习知识蒸馏策略 + +XFUND-zh数据集上,SER任务的消融实验如下所示。 + +| 实验序号 | 策略 | 模型大小(G) | 精度 | GPU预测耗时(ms) | CPU预测耗时(ms) | +|:------:|:------:|:------:|:------:|:------:|:------:| +| 1 | LayoutXLM | 1.4 | 89.50% | 59.35 | 766.16 | +| 2 | VI-LayoutXLM | 1.1 | 90.46% | 23.71 | 675.58 | +| 3 | 实验2 + TB-YX文本行排序 | 1.1 | 92.50% | 23.71 | 675.58 | +| 4 | 实验3 + UDML蒸馏 | 1.1 | 93.19% | 23.71 | 675.58 | +| 5 | 实验3 + UDML蒸馏 | 1.1 | **93.19%** | **15.49** | **675.58** | + +* 测试条件 + * paddle版本:2.3.0 + * GPU:V100,实验5的GPU预测耗时使用`trt+fp16`测试得到,环境为cuda10.2+ cudnn8.1.1 + trt7.2.3.4,其他实验的预测耗时统计中没有使用TRT。 + * CPU:Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz,开启mkldnn,线程数为10 + +在XFUND数据集上,与其他方法的效果对比如下所示。 + +| 模型 | SER Hmean | RE Hmean | +|-------------------|-----------|------------| +| LayoutLMv2-base | 85.44% | 67.77% | +| LayoutXLM-base | 89.24% | 70.73% | +| StrucTexT-large | 92.29% | **86.81%** | +| VI-LayoutXLM-base (ours) | **93.19%** | 83.92% | + + +### 5.1 优化策略 + +**(1) VI-LayoutXLM(Visual-feature Independent LayoutXLM)** + +LayoutLMv2以及LayoutXLM中引入视觉骨干网络,用于提取视觉特征,并与后续的text embedding进行联合,作为多模态的输入embedding。但是该模块为基于`ResNet_x101_64x4d`的特征提取网络,特征抽取阶段耗时严重,因此我们将其去除,同时仍然保留文本、位置以及布局等信息,最终发现针对LayoutXLM进行改进,下游SER任务精度无损,针对LayoutLMv2进行改进,下游SER任务精度仅降低`2.1%`,而模型大小减小了约`340M`。具体消融实验如下所示。 + +| 模型 | 模型大小 (G) | F-score | 精度收益 | +|-----------------|----------|---------|--------| +| LayoutLMv2 | 0.76 | 84.20% | - | +| VI-LayoutLMv2 | 0.42 | 82.10% | -2.10% | +| LayoutXLM | 1.4 | 89.50% | - | +| VI-LayouXLM | 1.1 | 90.46% | +0.96% | + +同时,基于XFUND数据集,VI-LayoutXLM在RE任务上的精度也进一步提升了`1.06%`。 + +**(2) TB-YX排序方法(Threshold-Based YX sorting algorithm)** + +文本阅读顺序对于信息抽取与文本理解等任务至关重要,传统多模态模型中,没有考虑不同OCR工具可能产生的不正确阅读顺序,而模型输入中包含位置编码,阅读顺序会直接影响预测结果,在预处理中,我们对文本行按照从上到下,从左到右(YX)的顺序进行排序,为防止文本行位置轻微干扰带来的排序结果不稳定问题,在排序的过程中,引入位置偏移阈值Th,对于Y方向距离小于Th的2个文本内容,使用x方向的位置从左到右进行排序。TB-YX排序方法伪代码如下所示。 + +```py +def order_by_tbyx(ocr_info, th=20): + """ + ocr_info: a list of dict, which contains bbox information([x1, y1, x2, y2]) + th: threshold of the position threshold + """ + res = sorted(ocr_info, key=lambda r: (r["bbox"][1], r["bbox"][0])) # sort using y1 first and then x1 + for i in range(len(res) - 1): + for j in range(i, 0, -1): + # restore the order using the + if abs(res[j + 1]["bbox"][1] - res[j]["bbox"][1]) < th and \ + (res[j + 1]["bbox"][0] < res[j]["bbox"][0]): + tmp = deepcopy(res[j]) + res[j] = deepcopy(res[j + 1]) + res[j + 1] = deepcopy(tmp) + else: + break + return res +``` + +不同排序方法的结果对比如下所示,可以看出引入偏离阈值之后,排序结果更加符合人类的阅读顺序。 + +
+ +
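+
+A self-contained version of the pseudocode above, with the missing `deepcopy` import added and the truncated comment completed, can be sanity-checked as follows. Note that the inner loop here runs down to index 0 so that a box can also bubble to the front of the list; the toy boxes are made-up coordinates:
+
+```python
+from copy import deepcopy
+
+def order_by_tbyx(ocr_info, th=20):
+    """Sort boxes top-to-bottom, then left-to-right within a visual line.
+
+    ocr_info: list of dicts carrying "bbox" as [x1, y1, x2, y2].
+    th: y-offset threshold under which two boxes count as the same line.
+    """
+    res = sorted(ocr_info, key=lambda r: (r["bbox"][1], r["bbox"][0]))
+    for i in range(len(res) - 1):
+        for j in range(i, -1, -1):
+            # Restore left-to-right order when two boxes sit on (almost) the
+            # same horizontal line but the right one was sorted first.
+            if abs(res[j + 1]["bbox"][1] - res[j]["bbox"][1]) < th and \
+                    res[j + 1]["bbox"][0] < res[j]["bbox"][0]:
+                res[j], res[j + 1] = deepcopy(res[j + 1]), deepcopy(res[j])
+            else:
+                break
+    return res
+
+boxes = [{"bbox": [120, 101, 200, 120]},  # first visual line, right side
+         {"bbox": [10, 105, 90, 124]},    # first visual line, left side
+         {"bbox": [10, 140, 90, 160]}]    # second visual line
+print([b["bbox"][0] for b in order_by_tbyx(boxes)])  # -> [10, 120, 10]
+```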
+ + +使用该策略,最终XFUND数据集上,SER任务F1指标提升`2.06%`,RE任务F1指标提升`7.04%`。 + +**(3) 互学习蒸馏策略** + +UDML(Unified-Deep Mutual Learning)联合互学习是PP-OCRv2与PP-OCRv3中采用的对于文本识别非常有效的提升模型效果的策略。在训练时,引入2个完全相同的模型进行互学习,计算2个模型之间的互蒸馏损失函数(DML loss),同时对transformer中间层的输出结果计算距离损失函数(L2 loss)。使用该策略,最终XFUND数据集上,SER任务F1指标提升`0.6%`,RE任务F1指标提升`5.01%`。 + +最终优化后模型基于SER任务的可视化结果如下所示。 + +
+ +
+ +
+ +
+ + +RE任务的可视化结果如下所示。 + + +
+ +
+ +
+ +
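+
+The UDML strategy in (3) above pairs a DML term on the two peers' predictions with an L2 term on their intermediate transformer features. A minimal sketch of these two terms, assuming two identical peer models; the supervised task loss and the weighting between terms are omitted here and should be checked against the released configs:
+
+```python
+import paddle.nn.functional as F
+
+def udml_terms(logits_a, logits_b, feat_a, feat_b):
+    # DML loss: symmetric KL divergence between the two peers' predictions.
+    dml = 0.5 * (
+        F.kl_div(F.log_softmax(logits_a, axis=-1), F.softmax(logits_b, axis=-1)) +
+        F.kl_div(F.log_softmax(logits_b, axis=-1), F.softmax(logits_a, axis=-1)))
+    # L2 loss on the intermediate transformer-layer outputs of the two peers.
+    l2 = F.mse_loss(feat_a, feat_b)
+    return dml, l2
+```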
+
+### 5.2 更多场景消融实验
+
+我们基于XFUND_zh与FUNSD数据集,对本次升级策略做了进一步验证。具体实验结果如下所示,可以看出该方案针对不同任务,在不同数据集上均有非常明显的精度收益。
+
+#### 5.2.1 XFUND_zh数据集
+
+**RE任务结果**
+
+| 实验序号 | 策略 | 模型大小(G) | F1-score |
+|:------:|:------------:|:---------:|:----------:|
+| 1 | LayoutXLM | 1.4 | 70.81% |
+| 2 | VI-LayoutXLM | 1.1 | 71.87% |
+| 3 | 实验2 + TB-YX文本行排序 | 1.1 | 78.91% |
+| 4 | 实验3 + UDML蒸馏 | 1.1 | **83.92%** |
+
+
+#### 5.2.2 FUNSD数据集
+
+**SER任务结果**
+
+| 实验序号 | 策略 | F1-score |
+|:------:|:------:|:------:|
+| 1 | LayoutXLM | 82.28% |
+| 2 | PP-Structurev2 SER | **87.79%** |
+
+
+**RE任务结果**
+
+| 实验序号 | 策略 | F1-score |
+|:------:|:------:|:------:|
+| 1 | LayoutXLM | 53.13% |
+| 2 | PP-Structurev2 RE | **74.87%** |
+
+
+## 6. Reference
+* [1] Zhong X, ShafieiBavani E, Jimeno Yepes A. Image-based table recognition: data, model, and evaluation[C]//European Conference on Computer Vision. Springer, Cham, 2020: 564-580.
+* [2] Cui C, Gao T, Wei S, et al. PP-LCNet: A Lightweight CPU Convolutional Neural Network[J]. arXiv preprint arXiv:2109.15099, 2021.
+* [3] Lin T Y, Dollár P, Girshick R, et al. Feature pyramid networks for object detection[C]//Proceedings of the IEEE conference on computer vision and pattern recognition. 2017: 2117-2125.
+* [4] Yu G, Chang Q, Lv W, et al. PP-PicoDet: A Better Real-Time Object Detector on Mobile Devices[J]. arXiv preprint arXiv:2111.00902, 2021.
+* [5] Bochkovskiy A, Wang C Y, Liao H Y M. Yolov4: Optimal speed and accuracy of object detection[J]. arXiv preprint arXiv:2004.10934, 2020.
+* [6] Ye J, Qi X, He Y, et al. PingAn-VCGroup's Solution for ICDAR 2021 Competition on Scientific Literature Parsing Task B: Table Recognition to HTML[J]. arXiv preprint arXiv:2105.01848, 2021.
+* [7] Zhong X, Tang J, Yepes A J. Publaynet: largest dataset ever for document layout analysis[C]//2019 International Conference on Document Analysis and Recognition (ICDAR). IEEE, 2019: 1015-1022.
+* [8] CDLA: https://github.com/buptlihang/CDLA
+* [9] Gao L, Huang Y, Déjean H, et al. ICDAR 2019 competition on table detection and recognition (cTDaR)[C]//2019 International Conference on Document Analysis and Recognition (ICDAR). IEEE, 2019: 1510-1515.
+* [10] Mondal A, Lipps P, Jawahar C V. IIIT-AR-13K: a new dataset for graphical object detection in documents[C]//International Workshop on Document Analysis Systems. Springer, Cham, 2020: 216-230.
+* [11] Tal ocr_table: https://ai.100tal.com/dataset
+* [12] Li M, Cui L, Huang S, et al. Tablebank: A benchmark dataset for table detection and recognition[J]. arXiv preprint arXiv:1903.01949, 2019.
+* [13] Li M, Xu Y, Cui L, et al. DocBank: A benchmark dataset for document layout analysis[J]. arXiv preprint arXiv:2006.01038, 2020.
+* [14] Xu Y, Li M, Cui L, et al. Layoutlm: Pre-training of text and layout for document image understanding[C]//Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining. 2020: 1192-1200.
+* [15] Xu Y, Xu Y, Lv T, et al. LayoutLMv2: Multi-modal pre-training for visually-rich document understanding[J]. arXiv preprint arXiv:2012.14740, 2020.
+* [16] Xu Y, Lv T, Cui L, et al. Layoutxlm: Multimodal pre-training for multilingual visually-rich document understanding[J]. arXiv preprint arXiv:2104.08836, 2021.
+* [17] Xu Y, Lv T, Cui L, et al. XFUND: A Benchmark Dataset for Multilingual Visually Rich Form Understanding[C]//Findings of the Association for Computational Linguistics: ACL 2022. 2022: 3214-3224.
+* [18] Jaume G, Ekenel H K, Thiran J P. Funsd: A dataset for form understanding in noisy scanned documents[C]//2019 International Conference on Document Analysis and Recognition Workshops (ICDARW). IEEE, 2019, 2: 1-6. diff --git a/ppstructure/docs/inference_en.md b/ppstructure/docs/inference_en.md index 357e26a11f7e86a342bb3dbf24ea3c721705ae98..71019ec70f80e44bc16d2b0d07b0bb93b475b7e7 100644 --- a/ppstructure/docs/inference_en.md +++ b/ppstructure/docs/inference_en.md @@ -1,13 +1,13 @@ # Python Inference -- [1. Structure](#1) +- [1. Layout Structured Analysis](#1) - [1.1 layout analysis + table recognition](#1.1) - [1.2 layout analysis](#1.2) - [1.3 table recognition](#1.3) -- [2. KIE](#2) +- [2. Key Information Extraction](#2) -## 1. Structure +## 1. Layout Structured Analysis Go to the `ppstructure` directory ```bash @@ -70,7 +70,7 @@ python3 predict_system.py --det_model_dir=inference/ch_PP-OCRv3_det_infer \ After the operation is completed, each image will have a directory with the same name in the `structure` directory under the directory specified by the `output` field. Each table in the image will be stored as an excel. The filename of excel is their coordinates in the image. -## 2. KIE +## 2. Key Information Extraction ```bash cd ppstructure diff --git a/ppstructure/docs/quickstart.md b/ppstructure/docs/quickstart.md index f19ee2591aba955ff09b2404d3ca85c80b75d781..b9367cab327a2f6232e34431c12532db03c75389 100644 --- a/ppstructure/docs/quickstart.md +++ b/ppstructure/docs/quickstart.md @@ -51,10 +51,14 @@ pip3 install "paddleocr>=2.6" pip3 install paddleclas # 安装 关键信息抽取 依赖包(如不需要KIE功能,可跳过) -pip3 install -r kie/requirements.txt +pip3 install -r ppstructure/kie/requirements.txt + +# 安装 版面恢复 依赖包(如不需要版面恢复功能,可跳过) +pip3 install -r ppstructure/recovery/requirements.txt ``` + ## 2. 便捷使用 @@ -94,7 +98,12 @@ paddleocr --image_dir=ppstructure/docs/table/table.jpg --type=structure --layout #### 2.1.6 版面恢复 ```bash +# 中文测试图 paddleocr --image_dir=ppstructure/docs/table/1.png --type=structure --recovery=true +# 英文测试图 +paddleocr --image_dir=ppstructure/docs/table/1.png --type=structure --recovery=true --lang='en' +# pdf测试文件 +paddleocr --image_dir=ppstructure/recovery/UnrealText.pdf --type=structure --recovery=true --lang='en' ``` @@ -215,9 +224,12 @@ for line in result: import os import cv2 from paddleocr import PPStructure,save_structure_res -from paddelocr.ppstructure.recovery.recovery_to_doc import sorted_layout_boxes, convert_info_docx +from paddleocr.ppstructure.recovery.recovery_to_doc import sorted_layout_boxes, convert_info_docx -table_engine = PPStructure(layout=False, show_log=True) +# 中文测试图 +table_engine = PPStructure(recovery=True) +# 英文测试图 +# table_engine = PPStructure(recovery=True, lang='en') save_folder = './output' img_path = 'ppstructure/docs/table/1.png' @@ -230,8 +242,8 @@ for line in result: print(line) h, w, _ = img.shape -res = sorted_layout_boxes(res, w) -convert_info_docx(img, result, save_folder, os.path.basename(img_path).split('.')[0]) +res = sorted_layout_boxes(result, w) +convert_info_docx(img, res, save_folder, os.path.basename(img_path).split('.')[0]) ``` @@ -303,4 +315,4 @@ dict 里各个字段说明如下: ## 3. 
小结 -通过本节内容,相信您已经熟练掌握通过PaddleOCR whl包调用PP-Structure相关功能的使用方法,您可以参考[文档教程](../../README_ch.md#文档教程),获取包括模型训练、推理部署等更详细的使用教程。 \ No newline at end of file +通过本节内容,相信您已经熟练掌握通过PaddleOCR whl包调用PP-Structure相关功能的使用方法,您可以参考[文档教程](../../README_ch.md#文档教程),获取包括模型训练、推理部署等更详细的使用教程。 diff --git a/ppstructure/docs/quickstart_en.md b/ppstructure/docs/quickstart_en.md index dbfbf43b01c94bd6f9c729f2f6edcd1dd6aee056..b1df40b267a82fd48853edf607acd43f3a5431c9 100644 --- a/ppstructure/docs/quickstart_en.md +++ b/ppstructure/docs/quickstart_en.md @@ -1,7 +1,7 @@ # PP-Structure Quick Start -- [1. Install package](#1-install-package) -- [2. Use](#2-use) +- [1. Environment Preparation](#1-environment-preparation) +- [2. Quick Use](#2-quick-use) - [2.1 Use by command line](#21-use-by-command-line) - [2.1.1 image orientation + layout analysis + table recognition](#211-image-orientation--layout-analysis--table-recognition) - [2.1.2 layout analysis + table recognition](#212-layout-analysis--table-recognition) @@ -9,35 +9,59 @@ - [2.1.4 table recognition](#214-table-recognition) - [2.1.5 Key Information Extraction](#215-Key-Information-Extraction) - [2.1.6 layout recovery](#216-layout-recovery) - - [2.2 Use by code](#22-use-by-code) + - [2.2 Use by python script](#22-use-by-python-script) - [2.2.1 image orientation + layout analysis + table recognition](#221-image-orientation--layout-analysis--table-recognition) - [2.2.2 layout analysis + table recognition](#222-layout-analysis--table-recognition) - [2.2.3 layout analysis](#223-layout-analysis) - [2.2.4 table recognition](#224-table-recognition) - - [2.2.5 DocVQA](#225-dockie) - [2.2.5 Key Information Extraction](#225-Key-Information-Extraction) - [2.2.6 layout recovery](#226-layout-recovery) - [2.3 Result description](#23-result-description) - [2.3.1 layout analysis + table recognition](#231-layout-analysis--table-recognition) - [2.3.2 Key Information Extraction](#232-Key-Information-Extraction) - [2.4 Parameter Description](#24-parameter-description) +- [3. Summary](#3-summary) -## 1. Install package +## 1. Environment Preparation +### 1.1 Install PaddlePaddle + +> If you do not have a Python environment, please refer to [Environment Preparation](./environment_en.md). + +- If you have CUDA 9 or CUDA 10 installed on your machine, please run the following command to install + + ```bash + python3 -m pip install paddlepaddle-gpu -i https://mirror.baidu.com/pypi/simple + ``` + +- If you have no available GPU on your machine, please run the following command to install the CPU version + + ```bash + python3 -m pip install paddlepaddle -i https://mirror.baidu.com/pypi/simple + ``` + +For more software version requirements, please refer to the instructions in [Installation Document](https://www.paddlepaddle.org.cn/install/quick) for operation. + +### 1.2 Install PaddleOCR Whl Package ```bash # Install paddleocr, version 2.6 is recommended pip3 install "paddleocr>=2.6" -# Install the KIE dependency packages (if you do not use the KIE, you can skip it) -pip install -r kie/requirements.txt + # Install the image direction classification dependency package paddleclas (if you do not use the image direction classification, you can skip it) pip3 install paddleclas + +# Install the KIE dependency packages (if you do not use the KIE, you can skip it) +pip3 install -r kie/requirements.txt + +# Install the layout recovery dependency packages (if you do not use the layout recovery, you can skip it) +pip3 install -r recovery/requirements.txt ``` -## 2. Use +## 2. 
Quick Use ### 2.1 Use by command line @@ -45,40 +69,41 @@ pip3 install paddleclas #### 2.1.1 image orientation + layout analysis + table recognition ```bash -paddleocr --image_dir=PaddleOCR/ppstructure/docs/table/1.png --type=structure --image_orientation=true +paddleocr --image_dir=ppstructure/docs/table/1.png --type=structure --image_orientation=true ``` #### 2.1.2 layout analysis + table recognition ```bash -paddleocr --image_dir=PaddleOCR/ppstructure/docs/table/1.png --type=structure +paddleocr --image_dir=ppstructure/docs/table/1.png --type=structure ``` #### 2.1.3 layout analysis ```bash -paddleocr --image_dir=PaddleOCR/ppstructure/docs/table/1.png --type=structure --table=false --ocr=false +paddleocr --image_dir=ppstructure/docs/table/1.png --type=structure --table=false --ocr=false ``` #### 2.1.4 table recognition ```bash -paddleocr --image_dir=PaddleOCR/ppstructure/docs/table/table.jpg --type=structure --layout=false +paddleocr --image_dir=ppstructure/docs/table/table.jpg --type=structure --layout=false ``` + #### 2.1.5 Key Information Extraction -Please refer to: [Key Information Extraction](../kie/README.md) . +Key information extraction does not currently support use by the whl package. For detailed usage tutorials, please refer to: [Key Information Extraction](../kie/README.md). #### 2.1.6 layout recovery -```bash -paddleocr --image_dir=PaddleOCR/ppstructure/docs/table/1.png --type=structure --recovery=true +``` +paddleocr --image_dir=ppstructure/docs/table/1.png --type=structure --recovery=true --lang='en' ``` -### 2.2 Use by code +### 2.2 Use by python script #### 2.2.1 image orientation + layout analysis + table recognition @@ -91,7 +116,7 @@ from paddleocr import PPStructure,draw_structure_result,save_structure_res table_engine = PPStructure(show_log=True, image_orientation=True) save_folder = './output' -img_path = 'PaddleOCR/ppstructure/docs/table/1.png' +img_path = 'ppstructure/docs/table/1.png' img = cv2.imread(img_path) result = table_engine(img) save_structure_res(result, save_folder,os.path.basename(img_path).split('.')[0]) @@ -102,7 +127,7 @@ for line in result: from PIL import Image -font_path = 'PaddleOCR/doc/fonts/simfang.ttf' # PaddleOCR下提供字体包 +font_path = 'doc/fonts/simfang.ttf' # PaddleOCR下提供字体包 image = Image.open(img_path).convert('RGB') im_show = draw_structure_result(image, result,font_path=font_path) im_show = Image.fromarray(im_show) @@ -120,7 +145,7 @@ from paddleocr import PPStructure,draw_structure_result,save_structure_res table_engine = PPStructure(show_log=True) save_folder = './output' -img_path = 'PaddleOCR/ppstructure/docs/table/1.png' +img_path = 'ppstructure/docs/table/1.png' img = cv2.imread(img_path) result = table_engine(img) save_structure_res(result, save_folder,os.path.basename(img_path).split('.')[0]) @@ -131,7 +156,7 @@ for line in result: from PIL import Image -font_path = 'PaddleOCR/doc/fonts/simfang.ttf' # font provieded in PaddleOCR +font_path = 'doc/fonts/simfang.ttf' # font provieded in PaddleOCR image = Image.open(img_path).convert('RGB') im_show = draw_structure_result(image, result,font_path=font_path) im_show = Image.fromarray(im_show) @@ -149,7 +174,7 @@ from paddleocr import PPStructure,save_structure_res table_engine = PPStructure(table=False, ocr=False, show_log=True) save_folder = './output' -img_path = 'PaddleOCR/ppstructure/docs/table/1.png' +img_path = 'ppstructure/docs/table/1.png' img = cv2.imread(img_path) result = table_engine(img) save_structure_res(result, save_folder, os.path.basename(img_path).split('.')[0]) 
@@ -170,7 +195,7 @@ from paddleocr import PPStructure,save_structure_res table_engine = PPStructure(layout=False, show_log=True) save_folder = './output' -img_path = 'PaddleOCR/ppstructure/docs/table/table.jpg' +img_path = 'ppstructure/docs/table/table.jpg' img = cv2.imread(img_path) result = table_engine(img) save_structure_res(result, save_folder, os.path.basename(img_path).split('.')[0]) @@ -183,7 +208,7 @@ for line in result: #### 2.2.5 Key Information Extraction -Please refer to: [Key Information Extraction](../kie/README.md) . +Key information extraction does not currently support use by the whl package. For detailed usage tutorials, please refer to: [Key Information Extraction](../kie/README.md). #### 2.2.6 layout recovery @@ -192,12 +217,15 @@ Please refer to: [Key Information Extraction](../kie/README.md) . import os import cv2 from paddleocr import PPStructure,save_structure_res -from paddelocr.ppstructure.recovery.recovery_to_doc import sorted_layout_boxes, convert_info_docx +from paddleocr.ppstructure.recovery.recovery_to_doc import sorted_layout_boxes, convert_info_docx -table_engine = PPStructure(layout=False, show_log=True) +# Chinese image +table_engine = PPStructure(recovery=True) +# English image +# table_engine = PPStructure(recovery=True, lang='en') save_folder = './output' -img_path = 'PaddleOCR/ppstructure/docs/table/1.png' +img_path = 'ppstructure/docs/table/1.png' img = cv2.imread(img_path) result = table_engine(img) save_structure_res(result, save_folder, os.path.basename(img_path).split('.')[0]) @@ -207,8 +235,8 @@ for line in result: print(line) h, w, _ = img.shape -res = sorted_layout_boxes(res, w) -convert_info_docx(img, result, save_folder, os.path.basename(img_path).split('.')[0]) +res = sorted_layout_boxes(result, w) +convert_info_docx(img, res, save_folder, os.path.basename(img_path).split('.')[0]) ``` @@ -231,8 +259,8 @@ Each field in dict is described as follows: | field | description | | --- |---| -|type| Type of image area. | -|bbox| The coordinates of the image area in the original image, respectively [upper left corner x, upper left corner y, lower right corner x, lower right corner y]. | +|type| Type of image area. | +|bbox| The coordinates of the image area in the original image, respectively [upper left corner x, upper left corner y, lower right corner x, lower right corner y]. | |res| OCR or table recognition result of the image area.
table: a dict with field descriptions as follows:
        `html`: html str of table.
        In the code usage mode, set `return_ocr_result_in_table=True` when calling, and the detection and recognition results of each text in the table area can be obtained, corresponding to the following fields:
        `boxes`: text detection boxes.
        `rec_res`: text recognition results.
OCR: A tuple containing the detection boxes and recognition results of each single text. | After the recognition is completed, each image will have a directory with the same name under the directory specified by the `output` field. Each table in the image will be stored as an excel, and the picture area will be cropped and saved. The filename of excel and picture is their coordinates in the image. @@ -276,3 +304,8 @@ Please refer to: [Key Information Extraction](../kie/README.md) . | structure_version | Structure version, optional PP-structure and PP-structurev2 | PP-structure | Most of the parameters are consistent with the PaddleOCR whl package, see [whl package documentation](../../doc/doc_en/whl.md) + + +## 3. Summary + +Through the content in this section, you can master the use of PP-Structure related functions through PaddleOCR whl package. Please refer to [documentation tutorial](../../README.md) for more detailed usage tutorials including model training, inference and deployment, etc. diff --git a/ppstructure/docs/recovery/UnrealText.pdf b/ppstructure/docs/recovery/UnrealText.pdf new file mode 100644 index 0000000000000000000000000000000000000000..0b5cf961af4ebf09cb96fc3f09fb9c19abec68f1 Binary files /dev/null and b/ppstructure/docs/recovery/UnrealText.pdf differ diff --git a/ppstructure/docs/recovery/recovery_ch.jpg b/ppstructure/docs/recovery/recovery_ch.jpg new file mode 100644 index 0000000000000000000000000000000000000000..df5a5063f036053673041b92a01f288b3e1d246b Binary files /dev/null and b/ppstructure/docs/recovery/recovery_ch.jpg differ diff --git a/ppstructure/kie/requirements.txt b/ppstructure/kie/requirements.txt index 53a7315d051704640b9a692ffaa52ce05fd16274..11fa98da1bff7a1863d8a077ca73435d15072523 100644 --- a/ppstructure/kie/requirements.txt +++ b/ppstructure/kie/requirements.txt @@ -1,7 +1,7 @@ sentencepiece yacs seqeval -git+https://github.com/PaddlePaddle/PaddleNLP pypandoc attrdict python_docx +https://paddleocr.bj.bcebos.com/ppstructure/whl/paddlenlp-2.3.0.dev0-py3-none-any.whl diff --git a/ppstructure/layout/README_ch.md b/ppstructure/layout/README_ch.md index d5598fc1a896ea4cfcc94619e1744b9b7ec288b3..f8d1978e25d7fb17cfd3fcb363b4ce981e19c8dc 100644 --- a/ppstructure/layout/README_ch.md +++ b/ppstructure/layout/README_ch.md @@ -160,11 +160,13 @@ json文件包含所有图像的标注,数据以字典嵌套的方式存放, ``` mkdir pretrained_model cd pretrained_model -# 下载PubLayNet预训练模型 -wget https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_layout.pdparams +# 下载PubLayNet预训练模型(直接体验模型评估、预测、动转静) +wget https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout.pdparams +# 下载PubLaynet推理模型(直接体验模型推理) +wget https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_infer.tar ``` -下载更多[版面分析模型](../docs/models_list.md)(中文CDLA数据集预训练模型、表格预训练模型) +如果测试图片为中文,可以下载中文CDLA数据集的预训练模型,识别10类文档区域:Table、Figure、Figure caption、Table、Table caption、Header、Footer、Reference、Equation,在[版面分析模型](../docs/models_list.md)中下载`picodet_lcnet_x1_0_fgd_layout_cdla`模型的训练模型和推理模型。如果只检测图片中的表格区域,可以下载表格数据集的预训练模型,在[版面分析模型](../docs/models_list.md)中下载`picodet_lcnet_x1_0_fgd_layout_table`模型的训练模型和推理模型。 ### 4.1. 
启动训练 @@ -216,14 +218,14 @@ TestDataset: # 单卡训练 export CUDA_VISIBLE_DEVICES=0 python3 tools/train.py \ - -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ - --eval + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + --eval # 多卡训练,通过--gpus参数指定卡号 export CUDA_VISIBLE_DEVICES=0,1,2,3 python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py \ - -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ - --eval + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + --eval ``` **注意:**如果训练时显存out memory,将TrainReader中batch_size调小,同时LearningRate中base_lr等比例减小。发布的config均由8卡训练得到,如果改变GPU卡数为1,那么base_lr需要减小8倍。 @@ -252,9 +254,9 @@ PaddleDetection支持了基于FGD([Focal and Global Knowledge Distillation for D # 单卡训练 export CUDA_VISIBLE_DEVICES=0 python3 tools/train.py \ - -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ - --slim_config configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x2_5_layout.yml \ - --eval + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + --slim_config configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x2_5_layout.yml \ + --eval ``` - `-c`: 指定模型配置文件。 @@ -269,8 +271,8 @@ python3 tools/train.py \ ```bash # GPU 评估, weights 为待测权重 python3 tools/eval.py \ - -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ - -o weights=./output/picodet_lcnet_x1_0_layout/best_model + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + -o weights=./output/picodet_lcnet_x1_0_layout/best_model ``` 会输出以下信息,打印出mAP、AP0.5等信息。 @@ -292,13 +294,13 @@ python3 tools/eval.py \ [08/15 07:07:09] ppdet.engine INFO: Best test bbox ap is 0.935. 
``` -使用FGD蒸馏模型进行评估: +若使用**提供的预训练模型进行评估**,或使用**FGD蒸馏训练的模型**,更换`weights`模型路径,执行如下命令进行评估: ``` python3 tools/eval.py \ - -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ - --slim_config configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x2_5_layout.yml \ - -o weights=output/picodet_lcnet_x2_5_layout/best_model + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + --slim_config configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x2_5_layout.yml \ + -o weights=output/picodet_lcnet_x2_5_layout/best_model ``` - `-c`: 指定模型配置文件。 @@ -325,18 +327,16 @@ python3 tools/infer.py \ - `--output_dir`: 指定可视化结果保存路径。 - `--draw_threshold`:指定绘制结果框的NMS阈值。 -预测图片如下所示,图片会存储在`output_dir`路径中。 - -使用FGD蒸馏模型进行测试: +若使用**提供的预训练模型进行预测**,或使用**FGD蒸馏训练的模型**,更换`weights`模型路径,执行如下命令进行预测: ``` python3 tools/infer.py \ - -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ - --slim_config configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x2_5_layout.yml \ - -o weights='output/picodet_lcnet_x2_5_layout/best_model.pdparams' \ - --infer_img='docs/images/layout.jpg' \ - --output_dir=output_dir/ \ - --draw_threshold=0.5 + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + --slim_config configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x2_5_layout.yml \ + -o weights='output/picodet_lcnet_x2_5_layout/best_model.pdparams' \ + --infer_img='docs/images/layout.jpg' \ + --output_dir=output_dir/ \ + --draw_threshold=0.5 ``` @@ -351,9 +351,9 @@ inference 模型(`paddle.jit.save`保存的模型) 一般是模型训练, ```bash python3 tools/export_model.py \ - -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ - -o weights=output/picodet_lcnet_x1_0_layout/best_model \ - --output_dir=output_inference/ + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + -o weights=output/picodet_lcnet_x1_0_layout/best_model \ + --output_dir=output_inference/ ``` * 如无需导出后处理,请指定:`-o export.benchmark=True`(如果-o已出现过,此处删掉-o) @@ -368,27 +368,27 @@ output_inference/picodet_lcnet_x1_0_layout/ └── model.pdmodel # inference模型的模型结构文件 ``` -FGD蒸馏模型转inference模型步骤如下: +若使用**提供的预训练模型转Inference模型**,或使用**FGD蒸馏训练的模型**,更换`weights`模型路径,模型转inference模型步骤如下: ```bash python3 tools/export_model.py \ - -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ - --slim_config configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x2_5_layout.yml \ - -o weights=./output/picodet_lcnet_x2_5_layout/best_model \ - --output_dir=output_inference/ + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + --slim_config configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x2_5_layout.yml \ + -o weights=./output/picodet_lcnet_x2_5_layout/best_model \ + --output_dir=output_inference/ ``` ### 6.2 模型推理 -版面恢复任务进行推理,可以执行如下命令: +若使用**提供的推理训练模型推理**,或使用**FGD蒸馏训练的模型**,更换`model_dir`推理模型路径,执行如下命令进行推理: ```bash python3 deploy/python/infer.py \ - --model_dir=output_inference/picodet_lcnet_x1_0_layout/ \ - --image_file=docs/images/layout.jpg \ - --device=CPU + --model_dir=output_inference/picodet_lcnet_x1_0_layout/ \ + --image_file=docs/images/layout.jpg \ + --device=CPU ``` - --device:指定GPU、CPU设备 diff --git a/ppstructure/predict_system.py b/ppstructure/predict_system.py index 
d63ab3b3daf018af7d0872e42bd14b8823d193ae..71147d3af8ec666d368234270dcb0d16aaf91938 100644 --- a/ppstructure/predict_system.py +++ b/ppstructure/predict_system.py @@ -77,7 +77,7 @@ class StructureSystem(object): elif self.mode == 'kie': raise NotImplementedError - def __call__(self, img, img_idx=0, return_ocr_result_in_table=False): + def __call__(self, img, return_ocr_result_in_table=False, img_idx=0): time_dict = { 'image_orientation': 0, 'layout': 0, @@ -227,65 +227,39 @@ def main(args): if img is None: logger.error("error in loading image:{}".format(image_file)) continue - res, time_dict = structure_sys(img) + imgs = [img] + else: + imgs = img - if structure_sys.mode == 'structure': - save_structure_res(res, save_folder, img_name) + all_res = [] + for index, img in enumerate(imgs): + res, time_dict = structure_sys(img, img_idx=index) + if structure_sys.mode == 'structure' and res != []: + save_structure_res(res, save_folder, img_name, index) draw_img = draw_structure_result(img, res, args.vis_font_path) - img_save_path = os.path.join(save_folder, img_name, 'show.jpg') + img_save_path = os.path.join(save_folder, img_name, + 'show_{}.jpg'.format(index)) elif structure_sys.mode == 'kie': raise NotImplementedError # draw_img = draw_ser_results(img, res, args.vis_font_path) # img_save_path = os.path.join(save_folder, img_name + '.jpg') - cv2.imwrite(img_save_path, draw_img) - logger.info('result save to {}'.format(img_save_path)) - if args.recovery: - try: - from ppstructure.recovery.recovery_to_doc import sorted_layout_boxes, convert_info_docx - h, w, _ = img.shape - res = sorted_layout_boxes(res, w) - convert_info_docx(img, res, save_folder, img_name, - args.save_pdf) - except Exception as ex: - logger.error( - "error in layout recovery image:{}, err msg: {}".format( - image_file, ex)) - continue - else: - pdf_imgs = img - all_res = [] - for index, img in enumerate(pdf_imgs): - - res, time_dict = structure_sys(img, index) - if structure_sys.mode == 'structure' and res != []: - save_structure_res(res, save_folder, img_name, index) - draw_img = draw_structure_result(img, res, - args.vis_font_path) - img_save_path = os.path.join(save_folder, img_name, - 'show_{}.jpg'.format(index)) - elif structure_sys.mode == 'kie': - raise NotImplementedError - # draw_img = draw_ser_results(img, res, args.vis_font_path) - # img_save_path = os.path.join(save_folder, img_name + '.jpg') - if res != []: - cv2.imwrite(img_save_path, draw_img) - logger.info('result save to {}'.format(img_save_path)) - if args.recovery and res != []: - from ppstructure.recovery.recovery_to_doc import sorted_layout_boxes, convert_info_docx - h, w, _ = img.shape - res = sorted_layout_boxes(res, w) - all_res += res - - if args.recovery and all_res != []: - try: - convert_info_docx(img, all_res, save_folder, img_name, - args.save_pdf) - except Exception as ex: - logger.error( - "error in layout recovery image:{}, err msg: {}".format( - image_file, ex)) - continue + if res != []: + cv2.imwrite(img_save_path, draw_img) + logger.info('result save to {}'.format(img_save_path)) + if args.recovery and res != []: + from ppstructure.recovery.recovery_to_doc import sorted_layout_boxes, convert_info_docx + h, w, _ = img.shape + res = sorted_layout_boxes(res, w) + all_res += res + if args.recovery and all_res != []: + try: + convert_info_docx(img, all_res, save_folder, img_name, + args.save_pdf) + except Exception as ex: + logger.error("error in layout recovery image:{}, err msg: {}". 
+ format(image_file, ex)) + continue logger.info("Predict time : {:.3f}s".format(time_dict['all'])) diff --git a/ppstructure/recovery/README.md b/ppstructure/recovery/README.md index 90a6a2c3c4189dc885d698e4cac2d1a24a49d1df..59aef707dd67799bb46dc18dc58f883c502c8b86 100644 --- a/ppstructure/recovery/README.md +++ b/ppstructure/recovery/README.md @@ -8,6 +8,7 @@ English | [简体中文](README_ch.md) - [3. Quick Start](#3) - [3.1 Download models](#3.1) - [3.2 Layout recovery](#3.2) + - [4. More](#4) @@ -15,13 +16,16 @@ English | [简体中文](README_ch.md) Layout recovery means that after OCR recognition, the content is still arranged like the original document pictures, and the paragraphs are output to word document in the same order. -Layout recovery combines [layout analysis](../layout/README.md)、[table recognition](../table/README.md) to better recover images, tables, titles, etc. -The following figure shows the result: +Layout recovery combines [layout analysis](../layout/README.md)、[table recognition](../table/README.md) to better recover images, tables, titles, etc. supports input files in PDF and document image formats in Chinese and English. The following figure shows the effect of restoring the layout of English and Chinese documents:
+
+ +
+ ## 2. Install @@ -35,7 +39,7 @@ The following figure shows the result: ```bash python3 -m pip install --upgrade pip -# GPU installation +# If you have cuda9 or cuda10 installed on your machine, please run the following command to install python3 -m pip install "paddlepaddle-gpu" -i https://mirror.baidu.com/pypi/simple # CPU installation @@ -62,6 +66,8 @@ git clone https://gitee.com/paddlepaddle/PaddleOCR - **(2) Install recovery's `requirements`** +The layout restoration is exported as docx and PDF files, so python-docx and docx2pdf API need to be installed, and fitz and PyMuPDF apis need to be installed to process the input files in pdf format. + ```bash python3 -m pip install -r ppstructure/recovery/requirements.txt ```` @@ -70,6 +76,16 @@ python3 -m pip install -r ppstructure/recovery/requirements.txt ## 3. Quick Start +Through layout analysis, we divided the image/PDF documents into regions, located the key regions, such as text, table, picture, etc., and recorded the location, category, and regional pixel value information of each region. Different regions are processed separately, where: + +- OCR detection and recognition is performed in the text area, and the coordinates of the OCR detection box and the text content information are added on the basis of the previous information + +- The table area identifies tables and records html and text information of tables +- Save the image directly + +We can restore the test picture through the layout information, OCR detection and recognition structure, table information, and saved pictures. + + ### 3.1 Download models @@ -85,9 +101,11 @@ https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar && ta # Download the recognition model of the ultra-lightweight English PP-OCRv3 model and unzip it wget https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar && tar xf en_PP-OCRv3_rec_infer.tar # Download the ultra-lightweight English table inch model and unzip it -wget https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/en_ppstructure_mobile_v2.0_SLANet_infer.tar && tar xf en_ppstructure_mobile_v2.0_SLANet_infer.tar +wget https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/en_ppstructure_mobile_v2.0_SLANet_infer.tar +tar xf en_ppstructure_mobile_v2.0_SLANet_infer.tar # Download the layout model of publaynet dataset and unzip it -wget https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_infer.tar && tar xf picodet_lcnet_x1_0_fgd_layout_infer.tar +wget https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_infer.tar +tar xf picodet_lcnet_x1_0_fgd_layout_infer.tar cd .. ``` If input is Chinese document,download Chinese models: @@ -128,3 +146,15 @@ Field: - recovery:whether to enable layout of recovery, default False - save_pdf:when recovery file, whether to save pdf file, default False - output:save the recovery result path + + + +## 4. More + +For training, evaluation and inference tutorial for text detection models, please refer to [text detection doc](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/doc/doc_ch/detection.md). + +For training, evaluation and inference tutorial for text recognition models, please refer to [text recognition doc](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/doc/doc_ch/recognition.md). 
+
+For the training, evaluation and inference tutorial for layout analysis models, please refer to the [layout analysis doc](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/ppstructure/layout/README_ch.md).
+
+For the training, evaluation and inference tutorial for table recognition models, please refer to the [table recognition doc](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/ppstructure/table/README_ch.md).
diff --git a/ppstructure/recovery/README_ch.md b/ppstructure/recovery/README_ch.md
index 9215976d37e89c7f02a61a5dfcf2127ff98c998e..ae3b7ed82464f513af585542ef8e92d66f2c8756 100644
--- a/ppstructure/recovery/README_ch.md
+++ b/ppstructure/recovery/README_ch.md
@@ -10,6 +10,7 @@
 - [3. Usage](#3)
   - [3.1 Download models](#3.1)
   - [3.2 Layout recovery](#3.2)
+- [4. More](#4)
@@ -18,11 +19,14 @@
 Layout recovery means that after OCR recognition, the content is still arranged as in the original document image, and is output to a Word document with its paragraphs and their order unchanged.
-Layout recovery combines [layout analysis](../layout/README_ch.md) and [table recognition](../table/README_ch.md) to better recover pictures, tables, titles, etc.; it supports PDF documents and document images as input. The following figure shows the layout recovery result:
+Layout recovery combines [layout analysis](../layout/README_ch.md) and [table recognition](../table/README_ch.md) to better recover pictures, tables, titles, etc.; it supports Chinese and English PDF documents and document images as input. The following figures show the layout recovery effect on English and Chinese documents; a sketch of the PDF-to-image step follows the figures:
+
+ +
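+
+The following is a minimal sketch of the PDF-to-image step mentioned above, assuming the PyMuPDF version pinned in this patch (1.16.14, whose API uses camelCase names); the zoom factor and the returned array format are illustrative choices, not the exact `predict_system.py` code:
+
+```python
+# Illustrative sketch: rasterize each page of an input PDF to an image
+# array so the recovery pipeline can process pages like document photos.
+import fitz  # fitz is the import name of PyMuPDF
+import numpy as np
+
+def pdf_to_images(pdf_path, zoom=2.0):
+    imgs = []
+    with fitz.open(pdf_path) as pdf:
+        for pg in range(pdf.pageCount):          # pageCount: pre-1.18 camelCase API
+            mat = fitz.Matrix(zoom, zoom)        # upscale for better OCR quality
+            pm = pdf[pg].getPixmap(matrix=mat, alpha=False)
+            img = np.frombuffer(pm.samples, dtype=np.uint8)
+            # pages come back as RGB; reverse channels for OpenCV-style BGR code
+            imgs.append(img.reshape(pm.height, pm.width, pm.n)[:, :, ::-1])
+    return imgs
+```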
@@ -37,10 +41,10 @@
 ```bash
 python3 -m pip install --upgrade pip
-# GPU installation
+# If CUDA 9 or CUDA 10 is installed on your machine, run the following command to install the GPU version
 python3 -m pip install "paddlepaddle-gpu" -i https://mirror.baidu.com/pypi/simple
-# CPU installation
+# If your machine is CPU-only, run the following command to install the CPU version
 python3 -m pip install "paddlepaddle" -i https://mirror.baidu.com/pypi/simple
 ```
@@ -64,6 +68,8 @@ git clone https://gitee.com/paddlepaddle/PaddleOCR
 - **(2) Install recovery's `requirements`**
+Layout recovery exports docx and PDF files, so the python-docx and docx2pdf packages need to be installed, and PyMuPDF (imported as fitz) is needed to process PDF input files.
+
 ```bash
 python3 -m pip install -r ppstructure/recovery/requirements.txt
 ```
@@ -72,11 +78,20 @@ python3 -m pip install -r ppstructure/recovery/requirements.txt
 ## 3. Usage
+Through layout analysis, the image/PDF document is divided into regions; the key regions such as text, tables, and pictures are located, and each region's position, category, and pixel values are recorded. The different region types are then processed separately:
+
+- Text regions go through OCR detection and recognition, and the OCR box coordinates and text content are added to the region information
+
+- Table regions go through table recognition, and the table's HTML and text content are recorded
+- Picture regions are saved directly
+
+With the layout information, the OCR detection and recognition results, the table information, and the saved pictures, the test image can then be restored.
+
 ### 3.1 Download models
-If the input is an English document, download the English models
+If the input is an English document, download the English models for OCR detection and recognition, layout analysis, and table recognition
 ```bash
 cd PaddleOCR/ppstructure
@@ -88,9 +103,11 @@ wget https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar
 # Download the ultra-lightweight English PP-OCRv3 recognition model and unzip it
 wget https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar && tar xf en_PP-OCRv3_rec_infer.tar
 # Download the English table recognition model and unzip it
-wget https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/en_ppstructure_mobile_v2.0_SLANet_infer.tar && tar xf en_ppstructure_mobile_v2.0_SLANet_infer.tar
+wget https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/en_ppstructure_mobile_v2.0_SLANet_infer.tar
+tar xf en_ppstructure_mobile_v2.0_SLANet_infer.tar
 # Download the English layout analysis model
-wget https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_infer.tar && tar xf picodet_lcnet_x1_0_fgd_layout_infer.tar
+wget https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_infer.tar
+tar xf picodet_lcnet_x1_0_fgd_layout_infer.tar
 cd ..
 ```
@@ -135,3 +152,15 @@ python3 predict_system.py \
 - recovery: whether to perform layout recovery, default False
 - save_pdf: whether to also save a pdf file when exporting the recovered docx document, default False
 - output: the path where the layout recovery results are saved
+
+
+
+## 4. More
+
+For the training, evaluation and inference of OCR detection models, please refer to the [text detection tutorial](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/doc/doc_ch/detection.md)
+
+For the training, evaluation and inference of OCR recognition models, please refer to the [text recognition tutorial](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/doc/doc_ch/recognition.md)
+
+For the training, evaluation and inference of layout analysis models, please refer to the [layout analysis tutorial](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/ppstructure/layout/README_ch.md)
+
+For the training, evaluation and inference of table recognition models, please refer to the [table recognition tutorial](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/ppstructure/table/README_ch.md)
diff --git a/ppstructure/recovery/recovery_to_doc.py b/ppstructure/recovery/recovery_to_doc.py
index 0a556093d17050f65440f8e962015a86de696107..73b497d49d0961b253738eddad49c88c12c13601 100644
--- a/ppstructure/recovery/recovery_to_doc.py
+++ b/ppstructure/recovery/recovery_to_doc.py
@@ -12,9 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import cv2
 import os
-import pypandoc
 from copy import deepcopy
 from docx import Document
@@ -30,7 +28,7 @@ from ppocr.utils.logging import get_logger
 logger = get_logger()
-def convert_info_docx(img, res, save_folder, img_name, save_pdf):
+def convert_info_docx(img, res, save_folder, img_name, save_pdf=False):
     doc = Document()
     doc.styles['Normal'].font.name = 'Times New Roman'
     doc.styles['Normal']._element.rPr.rFonts.set(qn('w:eastAsia'), u'宋体')
diff --git a/ppstructure/recovery/requirements.txt b/ppstructure/recovery/requirements.txt
index 5ba3099d64574954c65ac8169798759dd7c053ac..b118a41e516ec20e5807030649943e5f7d848107 100644
--- a/ppstructure/recovery/requirements.txt
+++ b/ppstructure/recovery/requirements.txt
@@ -1,5 +1,5 @@
-pypandoc
 python-docx
 docx2pdf
 fitz
-PyMuPDF
\ No newline at end of file
+PyMuPDF==1.16.14
+beautifulsoup4
\ No newline at end of file
diff --git a/ppstructure/table/README.md b/ppstructure/table/README.md
index e5c85eb9619ea92cd8b31041907d518eeceaf6a5..c606d641975556fe578a7e1cff8a575ccb4bff21 100644
--- a/ppstructure/table/README.md
+++ b/ppstructure/table/README.md
@@ -7,7 +7,7 @@ English | [简体中文](README_ch.md)
 - [3. Result](#3-result)
 - [4. How to use](#4-how-to-use)
   - [4.1 Quick start](#41-quick-start)
-  - [4.2 Train](#42-train)
+  - [4.2 Training, Evaluation and Inference](#42-training-evaluation-and-inference)
   - [4.3 Calculate TEDS](#43-calculate-teds)
 - [5. Reference](#5-reference)
@@ -51,6 +51,8 @@ The performance indicators are explained as follows:
 ### 4.1 Quick start
+PP-Structure currently provides table recognition models in both Chinese and English. For the model links, see [models_list](../docs/models_list.md). The following takes the Chinese table recognition model as an example to introduce how to recognize a table.
+
 Use the following commands to quickly complete the recognition of a table.
 ```bash
 cd PaddleOCR/ppstructure
@@ -79,7 +81,11 @@ python3.7 table/predict_table.py \
 After the operation is completed, the excel table of each image will be saved to the directory specified by the output field, and an html file will be produced in the directory for visually viewing the cell coordinates and the recognized table.
-### 4.2 Train
+**NOTE**
+1. If you want to use the English table recognition model, you need to download the English text detection and recognition models and the English table recognition model from [models_list](../docs/models_list_en.md), and replace `table_structure_dict_ch.txt` with `table_structure_dict.txt`.
+2. To use the TableRec-RARE model, you need to replace `table_structure_dict_ch.txt` with `table_structure_dict.txt`, and add the parameter `--merge_no_span_structure=False`
+
+### 4.2 Training, Evaluation and Inference
 The training, evaluation and inference process of the text detection model can be referred to [detection](../../doc/doc_en/detection_en.md)
@@ -114,9 +120,35 @@ python3 table/eval_table.py \
   --gt_path=path/to/gt.txt
 ```
-If the PubLatNet eval dataset is used, it will be output
+Evaluate on the PubTabNet dataset using the English model:
+
+```bash
+cd PaddleOCR/ppstructure
+# Download the models
+mkdir inference && cd inference
+# Download the text detection model trained on the PubTabNet dataset and unzip it
+wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_det_infer.tar && tar xf en_ppocr_mobile_v2.0_table_det_infer.tar
+# Download the text recognition model trained on the PubTabNet dataset and unzip it
+wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_rec_infer.tar && tar xf en_ppocr_mobile_v2.0_table_rec_infer.tar
+# Download the table recognition model trained on the PubTabNet dataset and unzip it
+wget https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/en_ppstructure_mobile_v2.0_SLANet_infer.tar && tar xf en_ppstructure_mobile_v2.0_SLANet_infer.tar
+cd ..
+
+python3 table/eval_table.py \
+    --det_model_dir=inference/en_ppocr_mobile_v2.0_table_det_infer \
+    --rec_model_dir=inference/en_ppocr_mobile_v2.0_table_rec_infer \
+    --table_model_dir=inference/en_ppstructure_mobile_v2.0_SLANet_infer \
+    --image_dir=train_data/table/pubtabnet/val/ \
+    --rec_char_dict_path=../ppocr/utils/dict/table_dict.txt \
+    --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt \
+    --det_limit_side_len=736 \
+    --det_limit_type=min \
+    --gt_path=path/to/gt.txt
+```
+
+The output is
 ```bash
-teds: 94.98
+teds: 95.89
 ```
 ## 5. Reference
diff --git a/ppstructure/table/README_ch.md b/ppstructure/table/README_ch.md
index 086e39348e96abe4320debef1cc11487694ccd49..8aa0dc8653223f9b84a283d8be2329f3c9d12b47 100644
--- a/ppstructure/table/README_ch.md
+++ b/ppstructure/table/README_ch.md
@@ -7,7 +7,7 @@
 - [3. Result demo](#3-效果演示)
 - [4. Usage](#4-使用)
   - [4.1 Quick start](#41-快速开始)
-  - [4.2 Training](#42-训练)
+  - [4.2 Model Training, Evaluation and Inference](#42-模型训练评估与推理)
   - [4.3 Calculate TEDS](#43-计算teds)
 - [5. Reference](#5-reference)
@@ -57,6 +57,8 @@
 ### 4.1 Quick start
+PP-Structure currently provides table recognition models for both Chinese and English; see [models_list](../docs/models_list.md) for the model links. The following takes the Chinese table recognition model as an example to introduce how to recognize a table.
+
 Use the following command to quickly complete the recognition of a table.
 ```bash
 cd PaddleOCR/ppstructure
@@ -67,7 +69,7 @@ mkdir inference && cd inference
 wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar && tar xf ch_PP-OCRv3_det_infer.tar
 # Download the PP-OCRv3 text recognition model and unzip it
 wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar && tar xf ch_PP-OCRv3_rec_infer.tar
-# Download the PP-Structurev2 table recognition model and unzip it
+# Download the PP-Structurev2 Chinese table recognition model and unzip it
 wget https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_infer.tar && tar xf ch_ppstructure_mobile_v2.0_SLANet_infer.tar
 cd ..
 # Run table recognition
 python table/predict_table.py \
 ```
 After the operation is completed, the excel table of each image will be saved to the directory specified by the output field, and an html file will also be generated in that directory for visually viewing the cell coordinates and the recognized table.
-### 4.2 Training
+**NOTE**
+1. To use the English models, download the English text detection and recognition models and the English table recognition model from [models_list](../docs/models_list.md), and replace `table_structure_dict_ch.txt` with `table_structure_dict.txt`.
+2. To use the TableRec-RARE model, replace `table_structure_dict_ch.txt` with `table_structure_dict.txt` and add the parameter `--merge_no_span_structure=False`
+
+### 4.2 Model Training, Evaluation and Inference
 The training, evaluation and inference process of the text detection model can be referred to [detection](../../doc/doc_ch/detection.md)
@@ -117,9 +123,36 @@ python3 table/eval_table.py \
   --det_limit_type=min \
   --gt_path=path/to/gt.txt
 ```
-If the PubLatNet eval dataset is used, the output will be
+
+To evaluate the English table recognition model on the PubTabNet dataset:
+
+```bash
+cd PaddleOCR/ppstructure
+# Download the models
+mkdir inference && cd inference
+# Download the text detection model trained on the PubTabNet dataset and unzip it
+wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_det_infer.tar && tar xf en_ppocr_mobile_v2.0_table_det_infer.tar
+# Download the text recognition model trained on the PubTabNet dataset and unzip it
+wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_rec_infer.tar && tar xf en_ppocr_mobile_v2.0_table_rec_infer.tar
+# Download the table recognition model trained on the PubTabNet dataset and unzip it
+wget https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/en_ppstructure_mobile_v2.0_SLANet_infer.tar && tar xf en_ppstructure_mobile_v2.0_SLANet_infer.tar
+cd ..
+
+python3 table/eval_table.py \
+    --det_model_dir=inference/en_ppocr_mobile_v2.0_table_det_infer \
+    --rec_model_dir=inference/en_ppocr_mobile_v2.0_table_rec_infer \
+    --table_model_dir=inference/en_ppstructure_mobile_v2.0_SLANet_infer \
+    --image_dir=train_data/table/pubtabnet/val/ \
+    --rec_char_dict_path=../ppocr/utils/dict/table_dict.txt \
+    --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt \
+    --det_limit_side_len=736 \
+    --det_limit_type=min \
+    --gt_path=path/to/gt.txt
+```
+
+The output will be
 ```bash
-teds: 94.98
+teds: 95.89
 ```
 ## 5. Reference
diff --git a/requirements.txt b/requirements.txt
index 976d29192abbbf89b8ee6064c0b4ec48d43ad268..cf80775f73b421f96875d48b4659f2b7adf852c9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,6 @@
 shapely
 scikit-image
-imgaug==0.4.0
+imgaug
 pyclipper
 lmdb
 tqdm
diff --git a/test_tipc/configs/ch_PP-OCRv2_rec/train_infer_python.txt b/test_tipc/configs/ch_PP-OCRv2_rec/train_infer_python.txt
index a96b87dede1e1b4c7b3ed59c4bd9c0470402e7e2..6d20b2df7420371ce964cf8fd5cb29726c000d1d 100644
--- a/test_tipc/configs/ch_PP-OCRv2_rec/train_infer_python.txt
+++ b/test_tipc/configs/ch_PP-OCRv2_rec/train_infer_python.txt
@@ -41,7 +41,7 @@ inference:tools/infer/predict_rec.py
 --use_gpu:True|False
 --enable_mkldnn:False
 --cpu_threads:6
---rec_batch_num:1|6
+--rec_batch_num:1
 --use_tensorrt:False
 --precision:fp32
 --rec_model_dir:
diff --git a/test_tipc/configs/ch_PP-OCRv3_rec/train_infer_python.txt b/test_tipc/configs/ch_PP-OCRv3_rec/train_infer_python.txt
index 59fc1bd4160ec77edb0b781c8ffa9845c6a3d5c7..fee08b08ede0f61ae4f57fd42dba303301798a3e 100644
--- a/test_tipc/configs/ch_PP-OCRv3_rec/train_infer_python.txt
+++ b/test_tipc/configs/ch_PP-OCRv3_rec/train_infer_python.txt
@@ -41,7 +41,7 @@ inference:tools/infer/predict_rec.py --rec_image_shape="3,48,320"
 --use_gpu:True|False
 --enable_mkldnn:False
 --cpu_threads:6
---rec_batch_num:1|6
+--rec_batch_num:1
 --use_tensorrt:False
 --precision:fp32
 --rec_model_dir:
diff --git a/test_tipc/configs/ch_ppocr_mobile_v2_0_rec/train_infer_python.txt b/test_tipc/configs/ch_ppocr_mobile_v2_0_rec/train_infer_python.txt
index 40f397948936beba0a3a4bdce9aa4a9953ec9d0f..dc490cdc60c2c012549e6fd00c13ec18676ede20 100644
--- a/test_tipc/configs/ch_ppocr_mobile_v2_0_rec/train_infer_python.txt
+++ b/test_tipc/configs/ch_ppocr_mobile_v2_0_rec/train_infer_python.txt
@@ -41,7 +41,7 @@ inference:tools/infer/predict_rec.py
 --use_gpu:True|False
 --enable_mkldnn:False
 --cpu_threads:6
---rec_batch_num:1|6
+--rec_batch_num:1
 --use_tensorrt:False
 --precision:fp32
 --rec_model_dir:
diff --git a/test_tipc/configs/ch_ppocr_server_v2_0_rec/train_infer_python.txt b/test_tipc/configs/ch_ppocr_server_v2_0_rec/train_infer_python.txt
index b9a1ae4984c30a08d75b73b884ceb97658eb11c7..85741f98c3fd645a64d8820a046030f1bb7e03c7 100644
--- a/test_tipc/configs/ch_ppocr_server_v2_0_rec/train_infer_python.txt
+++ b/test_tipc/configs/ch_ppocr_server_v2_0_rec/train_infer_python.txt
@@ -41,7 +41,7 @@ inference:tools/infer/predict_rec.py
 --use_gpu:True|False
 --enable_mkldnn:False
 --cpu_threads:6
---rec_batch_num:1|6
+--rec_batch_num:1
 --use_tensorrt:False
 --precision:fp32
 --rec_model_dir:
diff --git a/test_tipc/configs/en_table_structure/table_mv3.yml b/test_tipc/configs/en_table_structure/table_mv3.yml
index 6ff31fc262b4380b4cc5258a7b2e098ada39dba0..edcbe2c3b00e8d8a56ad8dd9f208e283b511b86e 100755
--- a/test_tipc/configs/en_table_structure/table_mv3.yml
+++ b/test_tipc/configs/en_table_structure/table_mv3.yml
@@ -4,7 +4,7 @@ Global:
   log_smooth_window: 20
   print_batch_step: 5
   save_model_dir: ./output/table_mv3/
-  save_epoch_step: 3
+  save_epoch_step: 400
  # evaluation is run every 40000 iterations after the 0th iteration
  eval_batch_step: [0, 40000]
  cal_metric_during_train: True
@@ -17,7 +17,8 @@ Global:
  # for data or label process
  character_dict_path: ppocr/utils/dict/table_structure_dict.txt
  character_type: en
-  max_text_length: 800
+  max_text_length: &max_text_length 500
+  box_format: &box_format 'xyxy' # 'xywh', 'xyxy', 'xyxyxyxy'
  infer_mode: False
 Optimizer:
@@ -37,12 +38,14 @@ Architecture:
   Backbone:
     name: MobileNetV3
     scale: 1.0
-    model_name: large
+    model_name: small
+    disable_se: true
   Head:
     name: TableAttentionHead
     hidden_size: 256
     loc_type: 2
-    max_text_length: 800
+    max_text_length: *max_text_length
+    loc_reg_num: &loc_reg_num 4
 Loss:
   name: TableAttentionLoss
@@ -70,6 +73,8 @@ Train:
       learn_empty_box: False
       merge_no_span_structure: False
       replace_empty_cell_token: False
+      loc_reg_num: *loc_reg_num
+      max_text_length: *max_text_length
   - TableBoxEncode:
   - ResizeTableImage:
       max_len: 488
@@ -102,6 +107,8 @@ Eval:
       learn_empty_box: False
       merge_no_span_structure: False
       replace_empty_cell_token: False
+      loc_reg_num: *loc_reg_num
+      max_text_length: *max_text_length
   - TableBoxEncode:
   - ResizeTableImage:
       max_len: 488
diff --git a/test_tipc/configs/rec_mtb_nrtr/train_infer_python.txt b/test_tipc/configs/rec_mtb_nrtr/train_infer_python.txt
index fed8ba26753bb770e062f751a9ba1e8e35fc6843..4a8fda0fea76da41a0a13b61f35d96a4d230d488 100644
--- a/test_tipc/configs/rec_mtb_nrtr/train_infer_python.txt
+++ b/test_tipc/configs/rec_mtb_nrtr/train_infer_python.txt
@@ -41,7 +41,7 @@ inference:tools/infer/predict_rec.py --rec_char_dict_path=./ppocr/utils/EN_symbo
 --use_gpu:True|False
 --enable_mkldnn:False
 --cpu_threads:6
---rec_batch_num:1|6
+--rec_batch_num:1
 --use_tensorrt:False
 --precision:fp32
 --rec_model_dir:
diff --git a/test_tipc/configs/rec_mv3_none_bilstm_ctc_v2_0/train_infer_python.txt b/test_tipc/configs/rec_mv3_none_bilstm_ctc_v2_0/train_infer_python.txt
index db89b4c78d72d1853096d6b44b73a7ca61792dfe..22c29c9b233ac908741accd7eb85fb3832fb0c0f 100644
--- a/test_tipc/configs/rec_mv3_none_bilstm_ctc_v2_0/train_infer_python.txt
+++ b/test_tipc/configs/rec_mv3_none_bilstm_ctc_v2_0/train_infer_python.txt
@@ -41,7 +41,7 @@ inference:tools/infer/predict_rec.py --rec_char_dict_path=./ppocr/utils/ic15_dic
 --use_gpu:True|False
 --enable_mkldnn:False
 --cpu_threads:6
---rec_batch_num:1|6
+--rec_batch_num:1
 --use_tensorrt:False
 --precision:fp32
 --rec_model_dir:
diff --git
a/test_tipc/configs/rec_mv3_none_none_ctc_v2_0/train_infer_python.txt b/test_tipc/configs/rec_mv3_none_none_ctc_v2_0/train_infer_python.txt index 003e91ff3d95e62d4353d7c4545e780ecd2f9708..d91c55e8852eee2cc7913235308f6d1f31e1f2e9 100644 --- a/test_tipc/configs/rec_mv3_none_none_ctc_v2_0/train_infer_python.txt +++ b/test_tipc/configs/rec_mv3_none_none_ctc_v2_0/train_infer_python.txt @@ -41,7 +41,7 @@ inference:tools/infer/predict_rec.py --rec_char_dict_path=./ppocr/utils/ic15_dic --use_gpu:True|False --enable_mkldnn:False --cpu_threads:6 ---rec_batch_num:1|6 +--rec_batch_num:1 --use_tensorrt:False --precision:fp32 --rec_model_dir: diff --git a/test_tipc/configs/rec_mv3_tps_bilstm_att_v2_0/train_infer_python.txt b/test_tipc/configs/rec_mv3_tps_bilstm_att_v2_0/train_infer_python.txt index c7b416c83323863a905929a2effcb1d3ad856422..77dc79cdae8bf4843ad17282885b46a33e64ce53 100644 --- a/test_tipc/configs/rec_mv3_tps_bilstm_att_v2_0/train_infer_python.txt +++ b/test_tipc/configs/rec_mv3_tps_bilstm_att_v2_0/train_infer_python.txt @@ -41,7 +41,7 @@ inference:tools/infer/predict_rec.py --rec_char_dict_path=./ppocr/utils/ic15_dic --use_gpu:True|False --enable_mkldnn:False --cpu_threads:6 ---rec_batch_num:1|6 +--rec_batch_num:1 --use_tensorrt:False --precision:fp32 --rec_model_dir: diff --git a/test_tipc/configs/rec_mv3_tps_bilstm_ctc_v2_0/train_infer_python.txt b/test_tipc/configs/rec_mv3_tps_bilstm_ctc_v2_0/train_infer_python.txt index 0c6e2d1da7f163521e8859bd8c96436b2a6bac64..f38c8d8d67bae84232749e60952a5c73871f9a88 100644 --- a/test_tipc/configs/rec_mv3_tps_bilstm_ctc_v2_0/train_infer_python.txt +++ b/test_tipc/configs/rec_mv3_tps_bilstm_ctc_v2_0/train_infer_python.txt @@ -41,7 +41,7 @@ inference:tools/infer/predict_rec.py --rec_char_dict_path=./ppocr/utils/ic15_dic --use_gpu:True|False --enable_mkldnn:False --cpu_threads:6 ---rec_batch_num:1|6 +--rec_batch_num:1 --use_tensorrt:False --precision:fp32 --rec_model_dir: diff --git a/test_tipc/configs/rec_r31_robustscanner/train_infer_python.txt b/test_tipc/configs/rec_r31_robustscanner/train_infer_python.txt index 07498c9e81ada9652343b8d8fff0f102d4684380..1bf8dc0b6c5ba707d572bc0ad44818d5a51c8800 100644 --- a/test_tipc/configs/rec_r31_robustscanner/train_infer_python.txt +++ b/test_tipc/configs/rec_r31_robustscanner/train_infer_python.txt @@ -1,6 +1,6 @@ ===========================train_params=========================== model_name:rec_r31_robustscanner -python:python +python:python3.7 gpu_list:0|0,1 Global.use_gpu:True|True Global.auto_cast:null @@ -39,11 +39,11 @@ infer_export:tools/export_model.py -c test_tipc/configs/rec_r31_robustscanner/re infer_quant:False inference:tools/infer/predict_rec.py --rec_char_dict_path=./ppocr/utils/dict90.txt --rec_image_shape="3,48,48,160" --use_space_char=False --rec_algorithm="RobustScanner" --use_gpu:True|False ---enable_mkldnn:True|False ---cpu_threads:1|6 ---rec_batch_num:1|6 ---use_tensorrt:False|False ---precision:fp32|int8 +--enable_mkldnn:False +--cpu_threads:6 +--rec_batch_num:1 +--use_tensorrt:False +--precision:fp32 --rec_model_dir: --image_dir:./inference/rec_inference --save_log_path:./test/output/ diff --git a/test_tipc/configs/rec_r31_sar/train_infer_python.txt b/test_tipc/configs/rec_r31_sar/train_infer_python.txt index 03ec54abb65ac41d3b5ad4f6e2fdcf7abb34c344..4acc6223e3b65211d62f2f128150e1c76f286674 100644 --- a/test_tipc/configs/rec_r31_sar/train_infer_python.txt +++ b/test_tipc/configs/rec_r31_sar/train_infer_python.txt @@ -41,7 +41,7 @@ inference:tools/infer/predict_rec.py 
--rec_char_dict_path=./ppocr/utils/dict90.t --use_gpu:True --enable_mkldnn:False --cpu_threads:6 ---rec_batch_num:1|6 +--rec_batch_num:1 --use_tensorrt:False --precision:fp32 --rec_model_dir: diff --git a/test_tipc/configs/rec_r32_gaspin_bilstm_att/train_infer_python.txt b/test_tipc/configs/rec_r32_gaspin_bilstm_att/train_infer_python.txt index 115dfd661abc64db9e14c629f79099be7b6ff0e0..ac378b36046d532a887056183de9c7788f628b76 100644 --- a/test_tipc/configs/rec_r32_gaspin_bilstm_att/train_infer_python.txt +++ b/test_tipc/configs/rec_r32_gaspin_bilstm_att/train_infer_python.txt @@ -41,7 +41,7 @@ inference:tools/infer/predict_rec.py --rec_char_dict_path=./ppocr/utils/dict/spi --use_gpu:True|False --enable_mkldnn:False --cpu_threads:6 ---rec_batch_num:1|6 +--rec_batch_num:1 --use_tensorrt:False --precision:fp32 --rec_model_dir: diff --git a/test_tipc/configs/rec_r34_vd_none_bilstm_ctc_v2_0/train_infer_python.txt b/test_tipc/configs/rec_r34_vd_none_bilstm_ctc_v2_0/train_infer_python.txt index 07a6190b0ef09da5cd20b9dd8ea922544c578710..b53efbd6ba5db36813733f6682bde1cfd614c6ee 100644 --- a/test_tipc/configs/rec_r34_vd_none_bilstm_ctc_v2_0/train_infer_python.txt +++ b/test_tipc/configs/rec_r34_vd_none_bilstm_ctc_v2_0/train_infer_python.txt @@ -41,7 +41,7 @@ inference:tools/infer/predict_rec.py --rec_char_dict_path=./ppocr/utils/ic15_dic --use_gpu:True|False --enable_mkldnn:False --cpu_threads:6 ---rec_batch_num:1|6 +--rec_batch_num:1 --use_tensorrt:False --precision:fp32 --rec_model_dir: diff --git a/test_tipc/configs/rec_r34_vd_none_none_ctc_v2_0/train_infer_python.txt b/test_tipc/configs/rec_r34_vd_none_none_ctc_v2_0/train_infer_python.txt index 145793aa472d8330daf9321f44692a03e7ef6354..7d953968b8a9d3f62f7c6fb48ed65bd9743d5ba3 100644 --- a/test_tipc/configs/rec_r34_vd_none_none_ctc_v2_0/train_infer_python.txt +++ b/test_tipc/configs/rec_r34_vd_none_none_ctc_v2_0/train_infer_python.txt @@ -41,7 +41,7 @@ inference:tools/infer/predict_rec.py --rec_char_dict_path=./ppocr/utils/ic15_dic --use_gpu:True|False --enable_mkldnn:False --cpu_threads:6 ---rec_batch_num:1|6 +--rec_batch_num:1 --use_tensorrt:False --precision:fp32 --rec_model_dir: diff --git a/test_tipc/configs/rec_r34_vd_tps_bilstm_att_v2_0/train_infer_python.txt b/test_tipc/configs/rec_r34_vd_tps_bilstm_att_v2_0/train_infer_python.txt index 759518a4a11a17e076401bb8dd193617c9f10530..0910ff840e350333a26de9b959229b6f8d39c19e 100644 --- a/test_tipc/configs/rec_r34_vd_tps_bilstm_att_v2_0/train_infer_python.txt +++ b/test_tipc/configs/rec_r34_vd_tps_bilstm_att_v2_0/train_infer_python.txt @@ -41,7 +41,7 @@ inference:tools/infer/predict_rec.py --rec_char_dict_path=./ppocr/utils/ic15_dic --use_gpu:True|False --enable_mkldnn:False --cpu_threads:6 ---rec_batch_num:1|6 +--rec_batch_num:1 --use_tensorrt:False --precision:fp32 --rec_model_dir: diff --git a/test_tipc/configs/rec_r34_vd_tps_bilstm_ctc_v2_0/train_infer_python.txt b/test_tipc/configs/rec_r34_vd_tps_bilstm_ctc_v2_0/train_infer_python.txt index ecc898341ce14dfed0de4290b798dd70078ae2da..33144e622e5fbb399e6dd274196812e2d44dc0fd 100644 --- a/test_tipc/configs/rec_r34_vd_tps_bilstm_ctc_v2_0/train_infer_python.txt +++ b/test_tipc/configs/rec_r34_vd_tps_bilstm_ctc_v2_0/train_infer_python.txt @@ -41,7 +41,7 @@ inference:tools/infer/predict_rec.py --rec_char_dict_path=./ppocr/utils/ic15_dic --use_gpu:True|False --enable_mkldnn:False --cpu_threads:6 ---rec_batch_num:1|6 +--rec_batch_num:1 --use_tensorrt:False --precision:fp32 --rec_model_dir: diff --git 
a/test_tipc/configs/rec_r45_abinet/train_infer_python.txt b/test_tipc/configs/rec_r45_abinet/train_infer_python.txt index ecab1bcbbde11fc6d14357b6715033704c2c3316..04fc188649c77c62b43307cb2fff2249f28bddae 100644 --- a/test_tipc/configs/rec_r45_abinet/train_infer_python.txt +++ b/test_tipc/configs/rec_r45_abinet/train_infer_python.txt @@ -41,7 +41,7 @@ inference:tools/infer/predict_rec.py --rec_char_dict_path=./ppocr/utils/ic15_dic --use_gpu:True|False --enable_mkldnn:False --cpu_threads:6 ---rec_batch_num:1|6 +--rec_batch_num:1 --use_tensorrt:False --precision:fp32 --rec_model_dir: diff --git a/test_tipc/configs/rec_r45_visionlan/train_infer_python.txt b/test_tipc/configs/rec_r45_visionlan/train_infer_python.txt index c08ae7beb6c867bf36283e60dc1e70cfd9ee06a7..79618edafa794a683e085fb1b8050358342e1f77 100644 --- a/test_tipc/configs/rec_r45_visionlan/train_infer_python.txt +++ b/test_tipc/configs/rec_r45_visionlan/train_infer_python.txt @@ -41,7 +41,7 @@ inference:tools/infer/predict_rec.py --rec_char_dict_path=./ppocr/utils/ic15_dic --use_gpu:True|False --enable_mkldnn:False --cpu_threads:6 ---rec_batch_num:1|6 +--rec_batch_num:1 --use_tensorrt:False --precision:fp32 --rec_model_dir: diff --git a/test_tipc/configs/rec_r50_fpn_vd_none_srn/train_infer_python.txt b/test_tipc/configs/rec_r50_fpn_vd_none_srn/train_infer_python.txt index b5a5286010a5830dc23031b3e0885247fb6ae53f..c1cfd1fcd930c6992982feeb3c118dbc5a56f226 100644 --- a/test_tipc/configs/rec_r50_fpn_vd_none_srn/train_infer_python.txt +++ b/test_tipc/configs/rec_r50_fpn_vd_none_srn/train_infer_python.txt @@ -41,7 +41,7 @@ inference:tools/infer/predict_rec.py --rec_char_dict_path=./ppocr/utils/ic15_dic --use_gpu:True|False --enable_mkldnn:False --cpu_threads:6 ---rec_batch_num:1|6 +--rec_batch_num:1 --use_tensorrt:False --precision:fp32 --rec_model_dir: diff --git a/test_tipc/configs/rec_svtrnet/train_infer_python.txt b/test_tipc/configs/rec_svtrnet/train_infer_python.txt index a7e4a24063b2e248f2ab92d5efd257a2837c0a34..5508c0411cfdc7102ccec7a00c59c2a5e1a54998 100644 --- a/test_tipc/configs/rec_svtrnet/train_infer_python.txt +++ b/test_tipc/configs/rec_svtrnet/train_infer_python.txt @@ -41,7 +41,7 @@ inference:tools/infer/predict_rec.py --rec_char_dict_path=./ppocr/utils/ic15_dic --use_gpu:True|False --enable_mkldnn:False --cpu_threads:6 ---rec_batch_num:1|6 +--rec_batch_num:1 --use_tensorrt:False --precision:fp32 --rec_model_dir: diff --git a/test_tipc/configs/rec_vitstr_none_ce/train_infer_python.txt b/test_tipc/configs/rec_vitstr_none_ce/train_infer_python.txt index 04c5742ea2ddaf01e782d8b39c21bcbcfa0a7ce7..187c11544998626af556e3eeef5f958fbe42fea0 100644 --- a/test_tipc/configs/rec_vitstr_none_ce/train_infer_python.txt +++ b/test_tipc/configs/rec_vitstr_none_ce/train_infer_python.txt @@ -41,7 +41,7 @@ inference:tools/infer/predict_rec.py --rec_char_dict_path=./ppocr/utils/EN_symbo --use_gpu:True|False --enable_mkldnn:False --cpu_threads:6 ---rec_batch_num:1|6 +--rec_batch_num:1 --use_tensorrt:False --precision:fp32 --rec_model_dir: diff --git a/test_tipc/prepare.sh b/test_tipc/prepare.sh index 6aea98a734e0fce8df00293b5362851144a7b119..5b5740113fc319accc8150f71c865a3f0465876d 100644 --- a/test_tipc/prepare.sh +++ b/test_tipc/prepare.sh @@ -160,6 +160,8 @@ if [ ${MODE} = "lite_train_lite_infer" ];then ln -s ./icdar2015_lite ./icdar2015 wget -nc -P ./ic15_data/ https://paddleocr.bj.bcebos.com/dataset/rec_gt_train_lite.txt --no-check-certificate wget -nc -P ./ic15_data/ https://paddleocr.bj.bcebos.com/dataset/rec_gt_test_lite.txt 
--no-check-certificate + mv ic15_data/rec_gt_train_lite.txt ic15_data/rec_gt_train.txt + mv ic15_data/rec_gt_test_lite.txt ic15_data/rec_gt_test.txt cd ../ cd ./inference && tar xf rec_inference.tar && cd ../ if [ ${model_name} == "ch_PP-OCRv2_det" ] || [ ${model_name} == "ch_PP-OCRv2_det_PACT" ]; then @@ -221,7 +223,6 @@ if [ ${MODE} = "lite_train_lite_infer" ];then fi if [ ${model_name} == "layoutxlm_ser" ] || [ ${model_name} == "vi_layoutxlm_ser" ]; then pip install -r ppstructure/kie/requirements.txt - pip install paddlenlp\>=2.3.5 --force-reinstall -i https://mirrors.aliyun.com/pypi/simple/ wget -nc -P ./train_data/ https://paddleocr.bj.bcebos.com/ppstructure/dataset/XFUND.tar --no-check-certificate cd ./train_data/ && tar xf XFUND.tar cd ../ diff --git a/tools/eval.py b/tools/eval.py index 38d72d178db45a4787ddc09c865afba9222f385a..3d1d3813d33e251ec83a9729383fe772bc4cc225 100755 --- a/tools/eval.py +++ b/tools/eval.py @@ -23,6 +23,7 @@ __dir__ = os.path.dirname(os.path.abspath(__file__)) sys.path.insert(0, __dir__) sys.path.insert(0, os.path.abspath(os.path.join(__dir__, '..'))) +import paddle from ppocr.data import build_dataloader from ppocr.modeling.architectures import build_model from ppocr.postprocess import build_post_process @@ -86,6 +87,30 @@ def main(): else: model_type = None + # build metric + eval_class = build_metric(config['Metric']) + # amp + use_amp = config["Global"].get("use_amp", False) + amp_level = config["Global"].get("amp_level", 'O2') + amp_custom_black_list = config['Global'].get('amp_custom_black_list',[]) + if use_amp: + AMP_RELATED_FLAGS_SETTING = { + 'FLAGS_cudnn_batchnorm_spatial_persistent': 1, + 'FLAGS_max_inplace_grad_add': 8, + } + paddle.fluid.set_flags(AMP_RELATED_FLAGS_SETTING) + scale_loss = config["Global"].get("scale_loss", 1.0) + use_dynamic_loss_scaling = config["Global"].get( + "use_dynamic_loss_scaling", False) + scaler = paddle.amp.GradScaler( + init_loss_scaling=scale_loss, + use_dynamic_loss_scaling=use_dynamic_loss_scaling) + if amp_level == "O2": + model = paddle.amp.decorate( + models=model, level=amp_level, master_weight=True) + else: + scaler = None + best_model_dict = load_model( config, model, model_type=config['Architecture']["model_type"]) if len(best_model_dict): @@ -93,11 +118,9 @@ def main(): for k, v in best_model_dict.items(): logger.info('{}:{}'.format(k, v)) - # build metric - eval_class = build_metric(config['Metric']) # start eval metric = program.eval(model, valid_dataloader, post_process_class, - eval_class, model_type, extra_input) + eval_class, model_type, extra_input, scaler, amp_level, amp_custom_black_list) logger.info('metric eval ***************') for k, v in metric.items(): logger.info('{}:{}'.format(k, v)) diff --git a/tools/infer/predict_rec.py b/tools/infer/predict_rec.py index 7c46e17bacdf1fff464322d284e4549bd8edacf2..176e2c68e2c9b2e08f9b56378c45a57733faf8cd 100755 --- a/tools/infer/predict_rec.py +++ b/tools/infer/predict_rec.py @@ -349,6 +349,13 @@ class TextRecognizer(object): for beg_img_no in range(0, img_num, batch_num): end_img_no = min(img_num, beg_img_no + batch_num) norm_img_batch = [] + if self.rec_algorithm == "SRN": + encoder_word_pos_list = [] + gsrm_word_pos_list = [] + gsrm_slf_attn_bias1_list = [] + gsrm_slf_attn_bias2_list = [] + if self.rec_algorithm == "SAR": + valid_ratios = [] imgC, imgH, imgW = self.rec_image_shape[:3] max_wh_ratio = imgW / imgH # max_wh_ratio = 0 @@ -357,22 +364,16 @@ class TextRecognizer(object): wh_ratio = w * 1.0 / h max_wh_ratio = max(max_wh_ratio, wh_ratio) for 
ino in range(beg_img_no, end_img_no): - if self.rec_algorithm == "SAR": norm_img, _, _, valid_ratio = self.resize_norm_img_sar( img_list[indices[ino]], self.rec_image_shape) norm_img = norm_img[np.newaxis, :] valid_ratio = np.expand_dims(valid_ratio, axis=0) - valid_ratios = [] valid_ratios.append(valid_ratio) norm_img_batch.append(norm_img) elif self.rec_algorithm == "SRN": norm_img = self.process_image_srn( img_list[indices[ino]], self.rec_image_shape, 8, 25) - encoder_word_pos_list = [] - gsrm_word_pos_list = [] - gsrm_slf_attn_bias1_list = [] - gsrm_slf_attn_bias2_list = [] encoder_word_pos_list.append(norm_img[1]) gsrm_word_pos_list.append(norm_img[2]) gsrm_slf_attn_bias1_list.append(norm_img[3]) diff --git a/tools/infer/utility.py b/tools/infer/utility.py index 8d3e93992d9d8cbd19fdd2c071565c940d011883..9baf66d7f469a3bf6c9a140e034aee3a635a5c8e 100644 --- a/tools/infer/utility.py +++ b/tools/infer/utility.py @@ -248,17 +248,17 @@ def create_predictor(args, mode, logger): config.enable_xpu(10 * 1024 * 1024) else: config.disable_gpu() - if hasattr(args, "cpu_threads"): - config.set_cpu_math_library_num_threads(args.cpu_threads) - else: - # default cpu threads as 10 - config.set_cpu_math_library_num_threads(10) if args.enable_mkldnn: # cache 10 different shapes for mkldnn to avoid memory leak config.set_mkldnn_cache_capacity(10) config.enable_mkldnn() if args.precision == "fp16": config.enable_mkldnn_bfloat16() + if hasattr(args, "cpu_threads"): + config.set_cpu_math_library_num_threads(args.cpu_threads) + else: + # default cpu threads as 10 + config.set_cpu_math_library_num_threads(10) # enable memory optim config.enable_memory_optim() config.disable_glog_info() diff --git a/tools/program.py b/tools/program.py index 7af1fe7354106f06b4384abb56de7675e4dbe053..16d3d4035af933cda01b422ea56e9e2895ec2b88 100755 --- a/tools/program.py +++ b/tools/program.py @@ -191,7 +191,8 @@ def train(config, logger, log_writer=None, scaler=None, - amp_level='O2'): + amp_level='O2', + amp_custom_black_list=[]): cal_metric_during_train = config['Global'].get('cal_metric_during_train', False) calc_epoch_interval = config['Global'].get('calc_epoch_interval', 1) @@ -278,10 +279,7 @@ def train(config, model_average = True # use amp if scaler: - custom_black_list = config['Global'].get( - 'amp_custom_black_list', []) - with paddle.amp.auto_cast( - level=amp_level, custom_black_list=custom_black_list): + with paddle.amp.auto_cast(level=amp_level, custom_black_list=amp_custom_black_list): if model_type == 'table' or extra_input: preds = model(images, data=batch[1:]) elif model_type in ["kie"]: @@ -386,7 +384,9 @@ def train(config, eval_class, model_type, extra_input=extra_input, - scaler=scaler) + scaler=scaler, + amp_level=amp_level, + amp_custom_black_list=amp_custom_black_list) cur_metric_str = 'cur metric, {}'.format(', '.join( ['{}: {}'.format(k, v) for k, v in cur_metric.items()])) logger.info(cur_metric_str) @@ -477,7 +477,9 @@ def eval(model, eval_class, model_type=None, extra_input=False, - scaler=None): + scaler=None, + amp_level='O2', + amp_custom_black_list = []): model.eval() with paddle.no_grad(): total_frame = 0.0 @@ -498,7 +500,7 @@ def eval(model, # use amp if scaler: - with paddle.amp.auto_cast(level='O2'): + with paddle.amp.auto_cast(level=amp_level, custom_black_list=amp_custom_black_list): if model_type == 'table' or extra_input: preds = model(images, data=batch[1:]) elif model_type in ["kie"]: diff --git a/tools/train.py b/tools/train.py index 
5f310938f3ae3488281b47ccdb436697595b5578..d0f200189e34265b3c080ac9e25eb80d29c705b7 100755
--- a/tools/train.py
+++ b/tools/train.py
@@ -138,9 +138,7 @@ def main(config, device, logger, vdl_writer):
     # build metric
     eval_class = build_metric(config['Metric'])
-    # load pretrain model
-    pre_best_model_dict = load_model(config, model, optimizer,
-                                     config['Architecture']["model_type"])
+
     logger.info('train dataloader has {} iters'.format(len(train_dataloader)))
     if valid_dataloader is not None:
         logger.info('valid dataloader has {} iters'.format(
@@ -148,6 +146,7 @@ def main(config, device, logger, vdl_writer):
     use_amp = config["Global"].get("use_amp", False)
     amp_level = config["Global"].get("amp_level", 'O2')
+    amp_custom_black_list = config['Global'].get('amp_custom_black_list', [])
     if use_amp:
         AMP_RELATED_FLAGS_SETTING = {
             'FLAGS_cudnn_batchnorm_spatial_persistent': 1,
@@ -166,12 +165,16 @@ def main(config, device, logger, vdl_writer):
     else:
         scaler = None
+    # load pretrain model
+    pre_best_model_dict = load_model(config, model, optimizer,
+                                     config['Architecture']["model_type"])
+
     if config['Global']['distributed']:
         model = paddle.DataParallel(model)
     # start train
     program.train(config, train_dataloader, valid_dataloader, device, model,
                   loss_class, optimizer, lr_scheduler, post_process_class,
-                  eval_class, pre_best_model_dict, logger, vdl_writer, scaler,amp_level)
+                  eval_class, pre_best_model_dict, logger, vdl_writer, scaler,
+                  amp_level, amp_custom_black_list)
 def test_reader(config, device, logger):
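
The AMP plumbing threaded through `tools/eval.py`, `tools/program.py`, and `tools/train.py` above follows one pattern: optionally decorate the model once, then wrap forward passes in `paddle.amp.auto_cast` with the configured level and black list. The following is a self-contained sketch of that pattern; the tiny `nn.Linear` model and random batch are placeholders, not PaddleOCR code:

```python
# Minimal sketch of the evaluation-side AMP pattern wired up above.
import paddle
import paddle.nn as nn

model = nn.Linear(16, 4)
amp_level = 'O2'                       # mirrors Global.amp_level
amp_custom_black_list = ['softmax']    # mirrors Global.amp_custom_black_list

if amp_level == 'O2':
    # O2 casts most ops to fp16 while keeping master weights in fp32
    model = paddle.amp.decorate(models=model, level=amp_level, master_weight=True)

model.eval()
x = paddle.randn([8, 16])
with paddle.no_grad():
    # ops on the black list stay in fp32 inside this context
    with paddle.amp.auto_cast(level=amp_level, custom_black_list=amp_custom_black_list):
        preds = model(x)
print(preds.dtype)  # float16 under O2 on a GPU with fp16 support
```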