diff --git a/.gitignore b/.gitignore index 1a2dd675e961f1804fa58e2e2e49118536b84ce9..9eecb4f1056fc040d4c9579d593bee2cc4013837 100644 --- a/.gitignore +++ b/.gitignore @@ -21,3 +21,7 @@ output/ *.log .clang-format .clang_format.hook + +build/ +dist/ +paddleocr.egg-info/ \ No newline at end of file diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000000000000000000000000000000000000..388882df0c3701780dd6371bc91887356a7bca40 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,8 @@ +include LICENSE.txt +include README.md + +recursive-include ppocr/utils *.txt utility.py character.py check.py +recursive-include ppocr/data/det *.py +recursive-include ppocr/postprocess *.py +recursive-include ppocr/postprocess/lanms *.* +recursive-include tools/infer *.py diff --git a/README.md b/README.md index 32124165e392a103852dea86ea8595f996b3f31a..3243f3ce24fada59a0b6f509172b3277e080f7aa 100644 --- a/README.md +++ b/README.md @@ -1,227 +1,209 @@ -English | [简体中文](README_cn.md) +[English](README_en.md) | 简体中文 -## Introduction -PaddleOCR aims to create rich, leading, and practical OCR tools that help users train better models and apply them into practice. +## 简介 +PaddleOCR旨在打造一套丰富、领先、且实用的OCR工具库,助力使用者训练出更好的模型,并应用落地。 -**Recent updates** -- 2020.8.16, Release text detection algorithm [SAST](https://arxiv.org/abs/1908.05498) and text recognition algorithm [SRN](https://arxiv.org/abs/2003.12294) -- 2020.7.23, Release the playback and PPT of live class on BiliBili station, PaddleOCR Introduction, [address](https://aistudio.baidu.com/aistudio/course/introduce/1519) -- 2020.7.15, Add mobile App demo , support both iOS and Android ( based on easyedge and Paddle Lite) -- 2020.7.15, Improve the deployment ability, add the C + + inference , serving deployment. In addtion, the benchmarks of the ultra-lightweight OCR model are provided. -- 2020.7.15, Add several related datasets, data annotation and synthesis tools. 
-- [more](./doc/doc_en/update_en.md) +**近期更新** +- 2020.8.26 更新OCR相关的84个常见问题及解答,具体参考[FAQ](./doc/doc_ch/FAQ.md) +- 2020.8.24 支持通过whl包安装使用PaddleOCR,具体参考[Paddleocr Package使用说明](./doc/doc_ch/whl.md) +- 2020.8.21 更新8月18日B站直播课回放和PPT,课节2,易学易用的OCR工具大礼包,[获取地址](https://aistudio.baidu.com/aistudio/education/group/info/1519) +- 2020.8.16 开源文本检测算法[SAST](https://arxiv.org/abs/1908.05498)和文本识别算法[SRN](https://arxiv.org/abs/2003.12294) +- 2020.7.23 发布7月21日B站直播课回放和PPT,课节1,PaddleOCR开源大礼包全面解读,[获取地址](https://aistudio.baidu.com/aistudio/course/introduce/1519) +- 2020.7.15 添加基于EasyEdge和Paddle-Lite的移动端DEMO,支持iOS和Android系统 +- [more](./doc/doc_ch/update.md) -## Features -- Ultra-lightweight OCR model, total model size is only 8.6M - - Single model supports Chinese/English numbers combination recognition, vertical text recognition, long text recognition - - Detection model DB (4.1M) + recognition model CRNN (4.5M) -- Various text detection algorithms: EAST, DB -- Various text recognition algorithms: Rosetta, CRNN, STAR-Net, RARE -- Support Linux, Windows, MacOS and other systems. -## Visualization +## 特性 +- 超轻量级中文OCR模型,总模型仅8.6M + - 单模型支持中英文数字组合识别、竖排文本识别、长文本识别 + - 检测模型DB(4.1M)+识别模型CRNN(4.5M) +- 实用通用中文OCR模型 +- 多种预测推理部署方案,包括服务部署和端侧部署 +- 多种文本检测训练算法,EAST、DB、SAST +- 多种文本识别训练算法,Rosetta、CRNN、STAR-Net、RARE、SRN +- 可运行于Linux、Windows、MacOS等多种系统 -![](doc/imgs_results/11.jpg) +## 快速体验 -![](doc/imgs_results/img_10.jpg) - -[More visualization](./doc/doc_en/visualization_en.md) +
+ +
-You can also quickly experience the ultra-lightweight OCR : [Online Experience](https://www.paddlepaddle.org.cn/hub/scene/ocr) +上图是超轻量级中文OCR模型效果展示,更多效果图请见[效果展示页面](./doc/doc_ch/visualization.md)。 -Mobile DEMO experience (based on EasyEdge and Paddle-Lite, supports iOS and Android systems): [Sign in the website to obtain the QR code for installing the App](https://ai.baidu.com/easyedge/app/openSource?from=paddlelite) +- 超轻量级中文OCR在线体验地址:https://www.paddlepaddle.org.cn/hub/scene/ocr +- 移动端DEMO体验(基于EasyEdge和Paddle-Lite, 支持iOS和Android系统):[安装包二维码获取地址](https://ai.baidu.com/easyedge/app/openSource?from=paddlelite) - Also, you can scan the QR code blow to install the App (**Android support only**) + Android手机也可以扫描下面二维码安装体验。
-- [**OCR Quick Start**](./doc/doc_en/quickstart_en.md) - +## 中文OCR模型列表 -### Supported Models: - -|Model Name|Description |Detection Model link|Recognition Model link| Support for space Recognition Model link| +|模型名称|模型简介|检测模型地址|识别模型地址|支持空格的识别模型地址| |-|-|-|-|-| -|db_crnn_mobile|ultra-lightweight OCR model|[inference model](https://paddleocr.bj.bcebos.com/ch_models/ch_det_mv3_db_infer.tar) / [pre-trained model](https://paddleocr.bj.bcebos.com/ch_models/ch_det_mv3_db.tar)|[inference model](https://paddleocr.bj.bcebos.com/ch_models/ch_rec_mv3_crnn_infer.tar) / [pre-trained model](https://paddleocr.bj.bcebos.com/ch_models/ch_rec_mv3_crnn.tar)|[inference model](https://paddleocr.bj.bcebos.com/ch_models/ch_rec_mv3_crnn_enhance_infer.tar) / [pre-train model](https://paddleocr.bj.bcebos.com/ch_models/ch_rec_mv3_crnn_enhance.tar) -|db_crnn_server|General OCR model|[inference model](https://paddleocr.bj.bcebos.com/ch_models/ch_det_r50_vd_db_infer.tar) / [pre-trained model](https://paddleocr.bj.bcebos.com/ch_models/ch_det_r50_vd_db.tar)|[inference model](https://paddleocr.bj.bcebos.com/ch_models/ch_rec_r34_vd_crnn_infer.tar) / [pre-trained model](https://paddleocr.bj.bcebos.com/ch_models/ch_rec_r34_vd_crnn.tar)|[inference model](https://paddleocr.bj.bcebos.com/ch_models/ch_rec_r34_vd_crnn_enhance_infer.tar) / [pre-train model](https://paddleocr.bj.bcebos.com/ch_models/ch_rec_r34_vd_crnn_enhance.tar) - - -## Tutorials -- [Installation](./doc/doc_en/installation_en.md) -- [Quick Start](./doc/doc_en/quickstart_en.md) -- Algorithm introduction - - [Text Detection Algorithm](#TEXTDETECTIONALGORITHM) - - [Text Recognition Algorithm](#TEXTRECOGNITIONALGORITHM) - - [END-TO-END OCR Algorithm](#ENDENDOCRALGORITHM) -- Model training/evaluation - - [Text Detection](./doc/doc_en/detection_en.md) - - [Text Recognition](./doc/doc_en/recognition_en.md) - - [Yml Configuration](./doc/doc_en/config_en.md) - - [Tricks](./doc/doc_en/tricks_en.md) -- Deployment - - [Python 
Inference](./doc/doc_en/inference_en.md) - - [C++ Inference](./deploy/cpp_infer/readme_en.md) - - [Serving](./doc/doc_en/serving_en.md) - - [Mobile](./deploy/lite/readme_en.md) - - Model Quantization and Compression (coming soon) - - [Benchmark](./doc/doc_en/benchmark_en.md) -- Datasets - - [General OCR Datasets(Chinese/English)](./doc/doc_en/datasets_en.md) - - [HandWritten_OCR_Datasets(Chinese)](./doc/doc_en/handwritten_datasets_en.md) - - [Various OCR Datasets(multilingual)](./doc/doc_en/vertical_and_multilingual_datasets_en.md) - - [Data Annotation Tools](./doc/doc_en/data_annotation_en.md) - - [Data Synthesis Tools](./doc/doc_en/data_synthesis_en.md) -- [FAQ](#FAQ) -- Visualization - - [Ultra-lightweight Chinese/English OCR Visualization](#UCOCRVIS) - - [General Chinese/English OCR Visualization](#GeOCRVIS) - - [Chinese/English OCR Visualization (Support Space Recognization )](#SpaceOCRVIS) -- [Community](#Community) -- [References](./doc/doc_en/reference_en.md) -- [License](#LICENSE) -- [Contribution](#CONTRIBUTION) - - -## Text Detection Algorithm - -PaddleOCR open source text detection algorithms list: +|chinese_db_crnn_mobile|超轻量级中文OCR模型|[inference模型](https://paddleocr.bj.bcebos.com/ch_models/ch_det_mv3_db_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/ch_models/ch_det_mv3_db.tar)|[inference模型](https://paddleocr.bj.bcebos.com/ch_models/ch_rec_mv3_crnn_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/ch_models/ch_rec_mv3_crnn.tar)|[inference模型](https://paddleocr.bj.bcebos.com/ch_models/ch_rec_mv3_crnn_enhance_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/ch_models/ch_rec_mv3_crnn_enhance.tar) +|chinese_db_crnn_server|通用中文OCR模型|[inference模型](https://paddleocr.bj.bcebos.com/ch_models/ch_det_r50_vd_db_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/ch_models/ch_det_r50_vd_db.tar)|[inference模型](https://paddleocr.bj.bcebos.com/ch_models/ch_rec_r34_vd_crnn_infer.tar) / 
[预训练模型](https://paddleocr.bj.bcebos.com/ch_models/ch_rec_r34_vd_crnn.tar)|[inference模型](https://paddleocr.bj.bcebos.com/ch_models/ch_rec_r34_vd_crnn_enhance_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/ch_models/ch_rec_r34_vd_crnn_enhance.tar) + +## 文档教程 +- [快速安装](./doc/doc_ch/installation.md) +- [中文OCR模型快速使用](./doc/doc_ch/quickstart.md) +- 算法介绍 + - [文本检测](#文本检测算法) + - [文本识别](#文本识别算法) +- 模型训练/评估 + - [文本检测](./doc/doc_ch/detection.md) + - [文本识别](./doc/doc_ch/recognition.md) + - [yml参数配置文件介绍](./doc/doc_ch/config.md) + - [中文OCR训练预测技巧](./doc/doc_ch/tricks.md) +- 预测部署 + - [基于Python预测引擎推理](./doc/doc_ch/inference.md) + - [基于C++预测引擎推理](./deploy/cpp_infer/readme.md) + - [服务化部署](./doc/doc_ch/serving.md) + - [端侧部署](./deploy/lite/readme.md) + - 模型量化压缩(coming soon) + - [Benchmark](./doc/doc_ch/benchmark.md) +- 数据集 + - [通用中英文OCR数据集](./doc/doc_ch/datasets.md) + - [手写中文OCR数据集](./doc/doc_ch/handwritten_datasets.md) + - [垂类多语言OCR数据集](./doc/doc_ch/vertical_and_multilingual_datasets.md) + - [常用数据标注工具](./doc/doc_ch/data_annotation.md) + - [常用数据合成工具](./doc/doc_ch/data_synthesis.md) +- 效果展示 + - [超轻量级中文OCR效果展示](#超轻量级中文OCR效果展示) + - [通用中文OCR效果展示](#通用中文OCR效果展示) + - [支持空格的中文OCR效果展示](#支持空格的中文OCR效果展示) +- FAQ + - [【精选】OCR精选10个问题](./doc/doc_ch/FAQ.md) + - [【理论篇】OCR通用21个问题](./doc/doc_ch/FAQ.md) + - [【实战篇】PaddleOCR实战53个问题](./doc/doc_ch/FAQ.md) +- [技术交流群](#欢迎加入PaddleOCR技术交流群) +- [参考文献](./doc/doc_ch/reference.md) +- [许可证书](#许可证书) +- [贡献代码](#贡献代码) + + +## 算法介绍 + +### 1.文本检测算法 + +PaddleOCR开源的文本检测算法列表: - [x] EAST([paper](https://arxiv.org/abs/1704.03155)) - [x] DB([paper](https://arxiv.org/abs/1911.08947)) -- [x] SAST([paper](https://arxiv.org/abs/1908.05498))(Baidu Self-Research) +- [x] SAST([paper](https://arxiv.org/abs/1908.05498))(百度自研) -On the ICDAR2015 dataset, the text detection result is as follows: +在ICDAR2015文本检测公开数据集上,算法效果如下: -|Model|Backbone|precision|recall|Hmean|Download link| +|模型|骨干网络|precision|recall|Hmean|下载链接| |-|-|-|-|-|-| -|EAST|ResNet50_vd|88.18%|85.51%|86.82%|[Download 
link](https://paddleocr.bj.bcebos.com/det_r50_vd_east.tar)| -|EAST|MobileNetV3|81.67%|79.83%|80.74%|[Download link](https://paddleocr.bj.bcebos.com/det_mv3_east.tar)| -|DB|ResNet50_vd|83.79%|80.65%|82.19%|[Download link](https://paddleocr.bj.bcebos.com/det_r50_vd_db.tar)| -|DB|MobileNetV3|75.92%|73.18%|74.53%|[Download link](https://paddleocr.bj.bcebos.com/det_mv3_db.tar)| -|SAST|ResNet50_vd|92.18%|82.96%|87.33%|[Download link](https://paddleocr.bj.bcebos.com/SAST/sast_r50_vd_icdar2015.tar)| +|EAST|ResNet50_vd|88.18%|85.51%|86.82%|[下载链接](https://paddleocr.bj.bcebos.com/det_r50_vd_east.tar)| +|EAST|MobileNetV3|81.67%|79.83%|80.74%|[下载链接](https://paddleocr.bj.bcebos.com/det_mv3_east.tar)| +|DB|ResNet50_vd|83.79%|80.65%|82.19%|[下载链接](https://paddleocr.bj.bcebos.com/det_r50_vd_db.tar)| +|DB|MobileNetV3|75.92%|73.18%|74.53%|[下载链接](https://paddleocr.bj.bcebos.com/det_mv3_db.tar)| +|SAST|ResNet50_vd|92.18%|82.96%|87.33%|[下载链接](https://paddleocr.bj.bcebos.com/SAST/sast_r50_vd_icdar2015.tar)| -On Total-Text dataset, the text detection result is as follows: +在Total-text文本检测公开数据集上,算法效果如下: -|Model|Backbone|precision|recall|Hmean|Download link| +|模型|骨干网络|precision|recall|Hmean|下载链接| |-|-|-|-|-|-| -|SAST|ResNet50_vd|88.74%|79.80%|84.03%|[Download link](https://paddleocr.bj.bcebos.com/SAST/sast_r50_vd_total_text.tar)| +|SAST|ResNet50_vd|88.74%|79.80%|84.03%|[下载链接](https://paddleocr.bj.bcebos.com/SAST/sast_r50_vd_total_text.tar)| + +**说明:** SAST模型训练额外加入了icdar2013、icdar2017、COCO-Text、ArT等公开数据集进行调优。PaddleOCR用到的经过整理格式的英文公开数据集下载:[百度云地址](https://pan.baidu.com/s/12cPnZcVuV1zn5DOd4mqjVw) (提取码: 2bpi) + -For use of [LSVT](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/doc/doc_en/datasets_en.md#1-icdar2019-lsvt) street view dataset with a total of 3w training data,the related configuration and pre-trained models for text detection task are as follows: -|Model|Backbone|Configuration file|Pre-trained model| 
+使用[LSVT](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/doc/doc_ch/datasets.md#1icdar2019-lsvt)街景数据集共3w张数据,训练中文检测模型的相关配置和预训练文件如下: + +|模型|骨干网络|配置文件|预训练模型| |-|-|-|-| -|ultra-lightweight OCR model|MobileNetV3|det_mv3_db.yml|[Download link](https://paddleocr.bj.bcebos.com/ch_models/ch_det_mv3_db.tar)| -|General OCR model|ResNet50_vd|det_r50_vd_db.yml|[Download link](https://paddleocr.bj.bcebos.com/ch_models/ch_det_r50_vd_db.tar)| +|超轻量中文模型|MobileNetV3|det_mv3_db.yml|[下载链接](https://paddleocr.bj.bcebos.com/ch_models/ch_det_mv3_db.tar)| +|通用中文OCR模型|ResNet50_vd|det_r50_vd_db.yml|[下载链接](https://paddleocr.bj.bcebos.com/ch_models/ch_det_r50_vd_db.tar)| -* Note: For the training and evaluation of the above DB model, post-processing parameters box_thresh=0.6 and unclip_ratio=1.5 need to be set. If using different datasets and different models for training, these two parameters can be adjusted for better result. +* 注: 上述DB模型的训练和评估,需设置后处理参数box_thresh=0.6,unclip_ratio=1.5,使用不同数据集、不同模型训练,可调整这两个参数进行优化 -For the training guide and use of PaddleOCR text detection algorithms, please refer to the document [Text detection model training/evaluation/prediction](./doc/doc_en/detection_en.md) +PaddleOCR文本检测算法的训练和使用请参考文档教程中[模型训练/评估中的文本检测部分](./doc/doc_ch/detection.md)。 - -## Text Recognition Algorithm + +### 2.文本识别算法 -PaddleOCR open-source text recognition algorithms list: +PaddleOCR开源的文本识别算法列表: - [x] CRNN([paper](https://arxiv.org/abs/1507.05717)) - [x] Rosetta([paper](https://arxiv.org/abs/1910.05085)) - [x] STAR-Net([paper](http://www.bmva.org/bmvc/2016/papers/paper043/index.html)) - [x] RARE([paper](https://arxiv.org/abs/1603.03915v1)) -- [x] SRN([paper](https://arxiv.org/abs/2003.12294))(Baidu Self-Research) +- [x] SRN([paper](https://arxiv.org/abs/2003.12294))(百度自研) -Refer to [DTRB](https://arxiv.org/abs/1904.01906), the training and evaluation result of these above text recognition (using MJSynth and SynthText for training, evaluate on IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE) is 
as follow: +参考[DTRB](https://arxiv.org/abs/1904.01906)文字识别训练和评估流程,使用MJSynth和SynthText两个文字识别数据集训练,在IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE数据集上进行评估,算法效果如下: -|Model|Backbone|Avg Accuracy|Module combination|Download link| +|模型|骨干网络|Avg Accuracy|模型存储命名|下载链接| |-|-|-|-|-| -|Rosetta|Resnet34_vd|80.24%|rec_r34_vd_none_none_ctc|[Download link](https://paddleocr.bj.bcebos.com/rec_r34_vd_none_none_ctc.tar)| -|Rosetta|MobileNetV3|78.16%|rec_mv3_none_none_ctc|[Download link](https://paddleocr.bj.bcebos.com/rec_mv3_none_none_ctc.tar)| -|CRNN|Resnet34_vd|82.20%|rec_r34_vd_none_bilstm_ctc|[Download link](https://paddleocr.bj.bcebos.com/rec_r34_vd_none_bilstm_ctc.tar)| -|CRNN|MobileNetV3|79.37%|rec_mv3_none_bilstm_ctc|[Download link](https://paddleocr.bj.bcebos.com/rec_mv3_none_bilstm_ctc.tar)| -|STAR-Net|Resnet34_vd|83.93%|rec_r34_vd_tps_bilstm_ctc|[Download link](https://paddleocr.bj.bcebos.com/rec_r34_vd_tps_bilstm_ctc.tar)| -|STAR-Net|MobileNetV3|81.56%|rec_mv3_tps_bilstm_ctc|[Download link](https://paddleocr.bj.bcebos.com/rec_mv3_tps_bilstm_ctc.tar)| -|RARE|Resnet34_vd|84.90%|rec_r34_vd_tps_bilstm_attn|[Download link](https://paddleocr.bj.bcebos.com/rec_r34_vd_tps_bilstm_attn.tar)| -|RARE|MobileNetV3|83.32%|rec_mv3_tps_bilstm_attn|[Download link](https://paddleocr.bj.bcebos.com/rec_mv3_tps_bilstm_attn.tar)| -|SRN|Resnet50_vd_fpn|88.33%|rec_r50fpn_vd_none_srn|[Download link](https://paddleocr.bj.bcebos.com/SRN/rec_r50fpn_vd_none_srn.tar)| - -**Note:** SRN model uses data expansion method to expand the two training sets mentioned above, and the expanded data can be downloaded from [Baidu Drive](todo). - -The average accuracy of the two-stage training in the original paper is 89.74%, and that of one stage training in paddleocr is 88.33%. Both pre-trained weights can be downloaded [here](https://paddleocr.bj.bcebos.com/SRN/rec_r50fpn_vd_none_srn.tar). 
- -We use [LSVT](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/doc/doc_en/datasets_en.md#1-icdar2019-lsvt) dataset and cropout 30w traning data from original photos by using position groundtruth and make some calibration needed. In addition, based on the LSVT corpus, 500w synthetic data is generated to train the model. The related configuration and pre-trained models are as follows: - -|Model|Backbone|Configuration file|Pre-trained model| +|Rosetta|Resnet34_vd|80.24%|rec_r34_vd_none_none_ctc|[下载链接](https://paddleocr.bj.bcebos.com/rec_r34_vd_none_none_ctc.tar)| +|Rosetta|MobileNetV3|78.16%|rec_mv3_none_none_ctc|[下载链接](https://paddleocr.bj.bcebos.com/rec_mv3_none_none_ctc.tar)| +|CRNN|Resnet34_vd|82.20%|rec_r34_vd_none_bilstm_ctc|[下载链接](https://paddleocr.bj.bcebos.com/rec_r34_vd_none_bilstm_ctc.tar)| +|CRNN|MobileNetV3|79.37%|rec_mv3_none_bilstm_ctc|[下载链接](https://paddleocr.bj.bcebos.com/rec_mv3_none_bilstm_ctc.tar)| +|STAR-Net|Resnet34_vd|83.93%|rec_r34_vd_tps_bilstm_ctc|[下载链接](https://paddleocr.bj.bcebos.com/rec_r34_vd_tps_bilstm_ctc.tar)| +|STAR-Net|MobileNetV3|81.56%|rec_mv3_tps_bilstm_ctc|[下载链接](https://paddleocr.bj.bcebos.com/rec_mv3_tps_bilstm_ctc.tar)| +|RARE|Resnet34_vd|84.90%|rec_r34_vd_tps_bilstm_attn|[下载链接](https://paddleocr.bj.bcebos.com/rec_r34_vd_tps_bilstm_attn.tar)| +|RARE|MobileNetV3|83.32%|rec_mv3_tps_bilstm_attn|[下载链接](https://paddleocr.bj.bcebos.com/rec_mv3_tps_bilstm_attn.tar)| +|SRN|Resnet50_vd_fpn|88.33%|rec_r50fpn_vd_none_srn|[下载链接](https://paddleocr.bj.bcebos.com/SRN/rec_r50fpn_vd_none_srn.tar)| + +**说明:** SRN模型使用了数据扰动方法对上述提到的两个训练集进行增广,增广后的数据可以在[百度网盘](https://pan.baidu.com/s/1-HSZ-ZVdqBF2HaBZ5pRAKA)上下载,提取码: y3ry。 +原始论文使用两阶段训练平均精度为89.74%,PaddleOCR中使用one-stage训练,平均精度为88.33%。两种预训练权重均在[下载链接](https://paddleocr.bj.bcebos.com/SRN/rec_r50fpn_vd_none_srn.tar)中。 + +使用[LSVT](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/doc/doc_ch/datasets.md#1icdar2019-lsvt)街景数据集根据真值将图crop出来30w数据,进行位置校准。此外基于LSVT语料生成500w合成数据训练中文模型,相关配置和预训练文件如下: + 
+|模型|骨干网络|配置文件|预训练模型| |-|-|-|-| -|ultra-lightweight OCR model|MobileNetV3|rec_chinese_lite_train.yml|[Download link](https://paddleocr.bj.bcebos.com/ch_models/ch_rec_mv3_crnn.tar)|[inference model](https://paddleocr.bj.bcebos.com/ch_models/ch_rec_mv3_crnn_enhance_infer.tar) & [pre-trained model](https://paddleocr.bj.bcebos.com/ch_models/ch_rec_mv3_crnn_enhance.tar)| -|General OCR model|Resnet34_vd|rec_chinese_common_train.yml|[Download link](https://paddleocr.bj.bcebos.com/ch_models/ch_rec_r34_vd_crnn.tar)|[inference model](https://paddleocr.bj.bcebos.com/ch_models/ch_rec_r34_vd_crnn_enhance_infer.tar) & [pre-trained model](https://paddleocr.bj.bcebos.com/ch_models/ch_rec_r34_vd_crnn_enhance.tar)| - -Please refer to the document for training guide and use of PaddleOCR text recognition algorithms [Text recognition model training/evaluation/prediction](./doc/doc_en/recognition_en.md) +|超轻量中文模型|MobileNetV3|rec_chinese_lite_train.yml|[下载链接](https://paddleocr.bj.bcebos.com/ch_models/ch_rec_mv3_crnn.tar)| +|通用中文OCR模型|Resnet34_vd|rec_chinese_common_train.yml|[下载链接](https://paddleocr.bj.bcebos.com/ch_models/ch_rec_r34_vd_crnn.tar)| - -## END-TO-END OCR Algorithm -- [ ] [End2End-PSL](https://arxiv.org/abs/1909.07808)(Baidu Self-Research, comming soon) +PaddleOCR文本识别算法的训练和使用请参考文档教程中[模型训练/评估中的文本识别部分](./doc/doc_ch/recognition.md)。 -## Visualization +## 效果展示 - -### 1.Ultra-lightweight Chinese/English OCR Visualization [more](./doc/doc_en/visualization_en.md) + +### 1.超轻量级中文OCR效果展示 [more](./doc/doc_ch/visualization.md)
- -### 2. General Chinese/English OCR Visualization [more](./doc/doc_en/visualization_en.md) + +### 2.通用中文OCR效果展示 [more](./doc/doc_ch/visualization.md)
- -### 3.Chinese/English OCR Visualization (Space_support) [more](./doc/doc_en/visualization_en.md) + +### 3.支持空格的中文OCR效果展示 [more](./doc/doc_ch/visualization.md)
- - -## FAQ -1. Error when using attention-based recognition model: KeyError: 'predict' - - The inference of recognition model based on attention loss is still being debugged. For Chinese text recognition, it is recommended to choose the recognition model based on CTC loss first. In practice, it is also found that the recognition model based on attention loss is not as effective as the one based on CTC loss. - -2. About inference speed - - When there are a lot of texts in the picture, the prediction time will increase. You can use `--rec_batch_num` to set a smaller prediction batch size. The default value is 30, which can be changed to 10 or other values. - -3. Service deployment and mobile deployment - - It is expected that the service deployment based on Serving and the mobile deployment based on Paddle Lite will be released successively in mid-to-late June. Stay tuned for more updates. - -4. Release time of self-developed algorithm - - Baidu Self-developed algorithms such as SAST, SRN and end2end PSL will be released in June or July. Please be patient. - -[more](./doc/doc_en/FAQ_en.md) - - -## Community -Scan the QR code below with your wechat and completing the questionnaire, you can access to offical technical exchange group. + +## 欢迎加入PaddleOCR技术交流群 +请扫描下面二维码,完成问卷填写,获取加群二维码和OCR方向的炼丹秘籍
- -## License -This project is released under Apache 2.0 license - - -## Contribution -We welcome all the contributions to PaddleOCR and appreciate for your feedback very much. - -- Many thanks to [Khanh Tran](https://github.com/xxxpsyduck) for contributing the English documentation. -- Many thanks to [zhangxin](https://github.com/ZhangXinNan) for contributing the new visualize function、add .gitgnore and discard set PYTHONPATH manually. -- Many thanks to [lyl120117](https://github.com/lyl120117) for contributing the code for printing the network structure. -- Thanks [xiangyubo](https://github.com/xiangyubo) for contributing the handwritten Chinese OCR datasets. -- Thanks [authorfu](https://github.com/authorfu) for contributing Android demo and [xiadeye](https://github.com/xiadeye) contributing iOS demo, respectively. -- Thanks [BeyondYourself](https://github.com/BeyondYourself) for contributing many great suggestions and simplifying part of the code style. + +## 许可证书 +本项目的发布受Apache 2.0 license许可认证。 + + +## 贡献代码 +我们非常欢迎你为PaddleOCR贡献代码,也十分感谢你的反馈。 + +- 非常感谢 [Khanh Tran](https://github.com/xxxpsyduck) 和 [Karl Horky](https://github.com/karlhorky) 贡献修改英文文档 +- 非常感谢 [zhangxin](https://github.com/ZhangXinNan)([Blog](https://blog.csdn.net/sdlypyzq)) 贡献新的可视化方式、添加.gitgnore、处理手动设置PYTHONPATH环境变量的问题 +- 非常感谢 [lyl120117](https://github.com/lyl120117) 贡献打印网络结构的代码 +- 非常感谢 [xiangyubo](https://github.com/xiangyubo) 贡献手写中文OCR数据集 +- 非常感谢 [authorfu](https://github.com/authorfu) 贡献Android和[xiadeye](https://github.com/xiadeye) 贡献IOS的demo代码 +- 非常感谢 [BeyondYourself](https://github.com/BeyondYourself) 给PaddleOCR提了很多非常棒的建议,并简化了PaddleOCR的部分代码风格。 +- 非常感谢 [tangmq](https://gitee.com/tangmq) 给PaddleOCR增加Docker化部署服务,支持快速发布可调用的Restful API服务。 diff --git a/README_cn.md b/README_cn.md deleted file mode 100644 index c797097c78064e66deb07bf32dc1bcd9a0093d5f..0000000000000000000000000000000000000000 --- a/README_cn.md +++ /dev/null @@ -1,224 +0,0 @@ -[English](README.md) | 简体中文 - -## 简介 
-PaddleOCR旨在打造一套丰富、领先、且实用的OCR工具库,助力使用者训练出更好的模型,并应用落地。 - -**近期更新** -- 2020.8.16 开源文本检测算法[SAST](https://arxiv.org/abs/1908.05498)和文本识别算法[SRN](https://arxiv.org/abs/2003.12294) -- 2020.7.23 发布7月21日B站直播课回放和PPT,PaddleOCR开源大礼包全面解读,[获取地址](https://aistudio.baidu.com/aistudio/course/introduce/1519) -- 2020.7.15 添加基于EasyEdge和Paddle-Lite的移动端DEMO,支持iOS和Android系统 -- 2020.7.15 完善预测部署,添加基于C++预测引擎推理、服务化部署和端侧部署方案,以及超轻量级中文OCR模型预测耗时Benchmark -- 2020.7.15 整理OCR相关数据集、常用数据标注以及合成工具 -- [more](./doc/doc_ch/update.md) - - -## 特性 -- 超轻量级中文OCR模型,总模型仅8.6M - - 单模型支持中英文数字组合识别、竖排文本识别、长文本识别 - - 检测模型DB(4.1M)+识别模型CRNN(4.5M) -- 实用通用中文OCR模型 -- 多种预测推理部署方案,包括服务部署和端侧部署 -- 多种文本检测训练算法,EAST、DB -- 多种文本识别训练算法,Rosetta、CRNN、STAR-Net、RARE -- 可运行于Linux、Windows、MacOS等多种系统 - -## 快速体验 - -
- -
- -上图是超轻量级中文OCR模型效果展示,更多效果图请见[效果展示页面](./doc/doc_ch/visualization.md)。 - -- 超轻量级中文OCR在线体验地址:https://www.paddlepaddle.org.cn/hub/scene/ocr -- 移动端DEMO体验(基于EasyEdge和Paddle-Lite, 支持iOS和Android系统):[安装包二维码获取地址](https://ai.baidu.com/easyedge/app/openSource?from=paddlelite) - - Android手机也可以扫描下面二维码安装体验。 - -
- -
- -- [**中文OCR模型快速使用**](./doc/doc_ch/quickstart.md) - - -## 中文OCR模型列表 - -|模型名称|模型简介|检测模型地址|识别模型地址|支持空格的识别模型地址| -|-|-|-|-|-| -|chinese_db_crnn_mobile|超轻量级中文OCR模型|[inference模型](https://paddleocr.bj.bcebos.com/ch_models/ch_det_mv3_db_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/ch_models/ch_det_mv3_db.tar)|[inference模型](https://paddleocr.bj.bcebos.com/ch_models/ch_rec_mv3_crnn_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/ch_models/ch_rec_mv3_crnn.tar)|[inference模型](https://paddleocr.bj.bcebos.com/ch_models/ch_rec_mv3_crnn_enhance_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/ch_models/ch_rec_mv3_crnn_enhance.tar) -|chinese_db_crnn_server|通用中文OCR模型|[inference模型](https://paddleocr.bj.bcebos.com/ch_models/ch_det_r50_vd_db_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/ch_models/ch_det_r50_vd_db.tar)|[inference模型](https://paddleocr.bj.bcebos.com/ch_models/ch_rec_r34_vd_crnn_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/ch_models/ch_rec_r34_vd_crnn.tar)|[inference模型](https://paddleocr.bj.bcebos.com/ch_models/ch_rec_r34_vd_crnn_enhance_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/ch_models/ch_rec_r34_vd_crnn_enhance.tar) - -## 文档教程 -- [快速安装](./doc/doc_ch/installation.md) -- [中文OCR模型快速使用](./doc/doc_ch/quickstart.md) -- 算法介绍 - - [文本检测](#文本检测算法) - - [文本识别](#文本识别算法) - - [端到端OCR](#端到端OCR算法) -- 模型训练/评估 - - [文本检测](./doc/doc_ch/detection.md) - - [文本识别](./doc/doc_ch/recognition.md) - - [yml参数配置文件介绍](./doc/doc_ch/config.md) - - [中文OCR训练预测技巧](./doc/doc_ch/tricks.md) -- 预测部署 - - [基于Python预测引擎推理](./doc/doc_ch/inference.md) - - [基于C++预测引擎推理](./deploy/cpp_infer/readme.md) - - [服务化部署](./doc/doc_ch/serving.md) - - [端侧部署](./deploy/lite/readme.md) - - 模型量化压缩(coming soon) - - [Benchmark](./doc/doc_ch/benchmark.md) -- 数据集 - - [通用中英文OCR数据集](./doc/doc_ch/datasets.md) - - [手写中文OCR数据集](./doc/doc_ch/handwritten_datasets.md) - - [垂类多语言OCR数据集](./doc/doc_ch/vertical_and_multilingual_datasets.md) - - [常用数据标注工具](./doc/doc_ch/data_annotation.md) - - 
[常用数据合成工具](./doc/doc_ch/data_synthesis.md) -- [FAQ](#FAQ) -- 效果展示 - - [超轻量级中文OCR效果展示](#超轻量级中文OCR效果展示) - - [通用中文OCR效果展示](#通用中文OCR效果展示) - - [支持空格的中文OCR效果展示](#支持空格的中文OCR效果展示) -- [技术交流群](#欢迎加入PaddleOCR技术交流群) -- [参考文献](./doc/doc_ch/reference.md) -- [许可证书](#许可证书) -- [贡献代码](#贡献代码) - - -## 算法介绍 - -### 1.文本检测算法 - -PaddleOCR开源的文本检测算法列表: -- [x] EAST([paper](https://arxiv.org/abs/1704.03155)) -- [x] DB([paper](https://arxiv.org/abs/1911.08947)) -- [x] SAST([paper](https://arxiv.org/abs/1908.05498))(百度自研) - -在ICDAR2015文本检测公开数据集上,算法效果如下: - -|模型|骨干网络|precision|recall|Hmean|下载链接| -|-|-|-|-|-|-| -|EAST|ResNet50_vd|88.18%|85.51%|86.82%|[下载链接](https://paddleocr.bj.bcebos.com/det_r50_vd_east.tar)| -|EAST|MobileNetV3|81.67%|79.83%|80.74%|[下载链接](https://paddleocr.bj.bcebos.com/det_mv3_east.tar)| -|DB|ResNet50_vd|83.79%|80.65%|82.19%|[下载链接](https://paddleocr.bj.bcebos.com/det_r50_vd_db.tar)| -|DB|MobileNetV3|75.92%|73.18%|74.53%|[下载链接](https://paddleocr.bj.bcebos.com/det_mv3_db.tar)| -|SAST|ResNet50_vd|92.18%|82.96%|87.33%|[下载链接](https://paddleocr.bj.bcebos.com/SAST/sast_r50_vd_icdar2015.tar)| - -在Total-text文本检测公开数据集上,算法效果如下: - -|模型|骨干网络|precision|recall|Hmean|下载链接| -|-|-|-|-|-|-| -|SAST|ResNet50_vd|88.74%|79.80%|84.03%|[下载链接](https://paddleocr.bj.bcebos.com/SAST/sast_r50_vd_total_text.tar)| - -使用[LSVT](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/doc/doc_ch/datasets.md#1icdar2019-lsvt)街景数据集共3w张数据,训练中文检测模型的相关配置和预训练文件如下: - -|模型|骨干网络|配置文件|预训练模型| -|-|-|-|-| -|超轻量中文模型|MobileNetV3|det_mv3_db.yml|[下载链接](https://paddleocr.bj.bcebos.com/ch_models/ch_det_mv3_db.tar)| -|通用中文OCR模型|ResNet50_vd|det_r50_vd_db.yml|[下载链接](https://paddleocr.bj.bcebos.com/ch_models/ch_det_r50_vd_db.tar)| - -* 注: 上述DB模型的训练和评估,需设置后处理参数box_thresh=0.6,unclip_ratio=1.5,使用不同数据集、不同模型训练,可调整这两个参数进行优化 - -PaddleOCR文本检测算法的训练和使用请参考文档教程中[模型训练/评估中的文本检测部分](./doc/doc_ch/detection.md)。 - - -### 2.文本识别算法 - -PaddleOCR开源的文本识别算法列表: -- [x] CRNN([paper](https://arxiv.org/abs/1507.05717)) -- [x] 
Rosetta([paper](https://arxiv.org/abs/1910.05085)) -- [x] STAR-Net([paper](http://www.bmva.org/bmvc/2016/papers/paper043/index.html)) -- [x] RARE([paper](https://arxiv.org/abs/1603.03915v1)) -- [x] SRN([paper](https://arxiv.org/abs/2003.12294))(百度自研) - -参考[DTRB](https://arxiv.org/abs/1904.01906)文字识别训练和评估流程,使用MJSynth和SynthText两个文字识别数据集训练,在IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE数据集上进行评估,算法效果如下: - -|模型|骨干网络|Avg Accuracy|模型存储命名|下载链接| -|-|-|-|-|-| -|Rosetta|Resnet34_vd|80.24%|rec_r34_vd_none_none_ctc|[下载链接](https://paddleocr.bj.bcebos.com/rec_r34_vd_none_none_ctc.tar)| -|Rosetta|MobileNetV3|78.16%|rec_mv3_none_none_ctc|[下载链接](https://paddleocr.bj.bcebos.com/rec_mv3_none_none_ctc.tar)| -|CRNN|Resnet34_vd|82.20%|rec_r34_vd_none_bilstm_ctc|[下载链接](https://paddleocr.bj.bcebos.com/rec_r34_vd_none_bilstm_ctc.tar)| -|CRNN|MobileNetV3|79.37%|rec_mv3_none_bilstm_ctc|[下载链接](https://paddleocr.bj.bcebos.com/rec_mv3_none_bilstm_ctc.tar)| -|STAR-Net|Resnet34_vd|83.93%|rec_r34_vd_tps_bilstm_ctc|[下载链接](https://paddleocr.bj.bcebos.com/rec_r34_vd_tps_bilstm_ctc.tar)| -|STAR-Net|MobileNetV3|81.56%|rec_mv3_tps_bilstm_ctc|[下载链接](https://paddleocr.bj.bcebos.com/rec_mv3_tps_bilstm_ctc.tar)| -|RARE|Resnet34_vd|84.90%|rec_r34_vd_tps_bilstm_attn|[下载链接](https://paddleocr.bj.bcebos.com/rec_r34_vd_tps_bilstm_attn.tar)| -|RARE|MobileNetV3|83.32%|rec_mv3_tps_bilstm_attn|[下载链接](https://paddleocr.bj.bcebos.com/rec_mv3_tps_bilstm_attn.tar)| -|SRN|Resnet50_vd_fpn|88.33%|rec_r50fpn_vd_none_srn|[下载链接](https://paddleocr.bj.bcebos.com/SRN/rec_r50fpn_vd_none_srn.tar)| - -**说明:** SRN模型使用了数据扰动方法对上述提到对两个训练集进行增广,增广后的数据可以在[百度网盘](todo)上下载。 -原始论文使用两阶段训练平均精度为89.74%,PaddleOCR中使用one-stage训练,平均精度为88.33%。两种预训练权重均在[下载链接](https://paddleocr.bj.bcebos.com/SRN/rec_r50fpn_vd_none_srn.tar)中。 - -使用[LSVT](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/doc/doc_ch/datasets.md#1icdar2019-lsvt)街景数据集根据真值将图crop出来30w数据,进行位置校准。此外基于LSVT语料生成500w合成数据训练中文模型,相关配置和预训练文件如下: - -|模型|骨干网络|配置文件|预训练模型| -|-|-|-|-| 
-|超轻量中文模型|MobileNetV3|rec_chinese_lite_train.yml|[下载链接](https://paddleocr.bj.bcebos.com/ch_models/ch_rec_mv3_crnn.tar)| -|通用中文OCR模型|Resnet34_vd|rec_chinese_common_train.yml|[下载链接](https://paddleocr.bj.bcebos.com/ch_models/ch_rec_r34_vd_crnn.tar)| - -PaddleOCR文本识别算法的训练和使用请参考文档教程中[模型训练/评估中的文本识别部分](./doc/doc_ch/recognition.md)。 - - -### 3.端到端OCR算法 -- [ ] [End2End-PSL](https://arxiv.org/abs/1909.07808)(百度自研, coming soon) - -## 效果展示 - - -### 1.超轻量级中文OCR效果展示 [more](./doc/doc_ch/visualization.md) - -
- -
- - -### 2.通用中文OCR效果展示 [more](./doc/doc_ch/visualization.md) - -
- -
- - -### 3.支持空格的中文OCR效果展示 [more](./doc/doc_ch/visualization.md) - -
- -
- - -## FAQ -1. **转换attention识别模型时报错:KeyError: 'predict'** -问题已解,请更新到最新代码。 - -2. **关于推理速度** -图片中的文字较多时,预测时间会增,可以使用--rec_batch_num设置更小预测batch num,默认值为30,可以改为10或其他数值。 - -3. **服务部署与移动端部署** -预计6月中下旬会先后发布基于Serving的服务部署方案和基于Paddle Lite的移动端部署方案,欢迎持续关注。 - -4. **自研算法发布时间** -自研算法SAST、SRN、End2End-PSL都将在7-8月陆续发布,敬请期待。 - -[more](./doc/doc_ch/FAQ.md) - - -## 欢迎加入PaddleOCR技术交流群 -请扫描下面二维码,完成问卷填写,获取加群二维码和OCR方向的炼丹秘籍 - -
- -
- - -## 许可证书 -本项目的发布受Apache 2.0 license许可认证。 - - -## 贡献代码 -我们非常欢迎你为PaddleOCR贡献代码,也十分感谢你的反馈。 - -- 非常感谢 [Khanh Tran](https://github.com/xxxpsyduck) 贡献了英文文档 -- 非常感谢 [zhangxin](https://github.com/ZhangXinNan)([Blog](https://blog.csdn.net/sdlypyzq)) 贡献新的可视化方式、添加.gitgnore、处理手动设置PYTHONPATH环境变量的问题 -- 非常感谢 [lyl120117](https://github.com/lyl120117) 贡献打印网络结构的代码 -- 非常感谢 [xiangyubo](https://github.com/xiangyubo) 贡献手写中文OCR数据集 -- 非常感谢 [authorfu](https://github.com/authorfu) 贡献Android和[xiadeye](https://github.com/xiadeye) 贡献IOS的demo代码 -- 非常感谢 [BeyondYourself](https://github.com/BeyondYourself) 给PaddleOCR提了很多非常棒的建议,并简化了PaddleOCR的部分代码风格。 diff --git a/README_en.md b/README_en.md new file mode 100644 index 0000000000000000000000000000000000000000..37250da2cd3f6ccee76b522bf10745ecb8cd649e --- /dev/null +++ b/README_en.md @@ -0,0 +1,231 @@ +English | [简体中文](README.md) + +## Introduction +PaddleOCR aims to create rich, leading, and practical OCR tools that help users train better models and apply them into practice. + +**Recent updates** +- 2020.8.24 Support the use of PaddleOCR through whl package installation; please refer to [PaddleOCR Package](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/doc/doc_en/whl_en.md) +- 2020.8.16, Release text detection algorithm [SAST](https://arxiv.org/abs/1908.05498) and text recognition algorithm [SRN](https://arxiv.org/abs/2003.12294) +- 2020.7.23, Release the playback and PPT of live class on BiliBili station, PaddleOCR Introduction, [address](https://aistudio.baidu.com/aistudio/course/introduce/1519) +- 2020.7.15, Add mobile App demo, support both iOS and Android (based on EasyEdge and Paddle Lite) +- 2020.7.15, Improve the deployment ability, add the C++ inference and serving deployment. In addition, the benchmarks of the ultra-lightweight OCR model are provided. +- 2020.7.15, Add several related datasets, data annotation and synthesis tools. 
+- [more](./doc/doc_en/update_en.md) + +## Features +- Ultra-lightweight OCR model, total model size is only 8.6M + - Single model supports Chinese/English numbers combination recognition, vertical text recognition, long text recognition + - Detection model DB (4.1M) + recognition model CRNN (4.5M) +- Various text detection algorithms: EAST, DB +- Various text recognition algorithms: Rosetta, CRNN, STAR-Net, RARE +- Support Linux, Windows, macOS and other systems. + +## Visualization + +![](doc/imgs_results/11.jpg) + +![](doc/imgs_results/img_10.jpg) + +[More visualization](./doc/doc_en/visualization_en.md) + +You can also quickly experience the ultra-lightweight OCR : [Online Experience](https://www.paddlepaddle.org.cn/hub/scene/ocr) + +Mobile DEMO experience (based on EasyEdge and Paddle-Lite, supports iOS and Android systems): [Sign in to the website to obtain the QR code for installing the App](https://ai.baidu.com/easyedge/app/openSource?from=paddlelite) + + Also, you can scan the QR code below to install the App (**Android support only**) + +
+ +
+ +- [**OCR Quick Start**](./doc/doc_en/quickstart_en.md) + + + +### Supported Models: + +|Model Name|Description |Detection Model link|Recognition Model link| Support for space Recognition Model link| +|-|-|-|-|-| +|db_crnn_mobile|ultra-lightweight OCR model|[inference model](https://paddleocr.bj.bcebos.com/ch_models/ch_det_mv3_db_infer.tar) / [pre-trained model](https://paddleocr.bj.bcebos.com/ch_models/ch_det_mv3_db.tar)|[inference model](https://paddleocr.bj.bcebos.com/ch_models/ch_rec_mv3_crnn_infer.tar) / [pre-trained model](https://paddleocr.bj.bcebos.com/ch_models/ch_rec_mv3_crnn.tar)|[inference model](https://paddleocr.bj.bcebos.com/ch_models/ch_rec_mv3_crnn_enhance_infer.tar) / [pre-train model](https://paddleocr.bj.bcebos.com/ch_models/ch_rec_mv3_crnn_enhance.tar) +|db_crnn_server|General OCR model|[inference model](https://paddleocr.bj.bcebos.com/ch_models/ch_det_r50_vd_db_infer.tar) / [pre-trained model](https://paddleocr.bj.bcebos.com/ch_models/ch_det_r50_vd_db.tar)|[inference model](https://paddleocr.bj.bcebos.com/ch_models/ch_rec_r34_vd_crnn_infer.tar) / [pre-trained model](https://paddleocr.bj.bcebos.com/ch_models/ch_rec_r34_vd_crnn.tar)|[inference model](https://paddleocr.bj.bcebos.com/ch_models/ch_rec_r34_vd_crnn_enhance_infer.tar) / [pre-train model](https://paddleocr.bj.bcebos.com/ch_models/ch_rec_r34_vd_crnn_enhance.tar) + + +## Tutorials +- [Installation](./doc/doc_en/installation_en.md) +- [Quick Start](./doc/doc_en/quickstart_en.md) +- Algorithm introduction + - [Text Detection Algorithm](#TEXTDETECTIONALGORITHM) + - [Text Recognition Algorithm](#TEXTRECOGNITIONALGORITHM) + - [END-TO-END OCR Algorithm](#ENDENDOCRALGORITHM) +- Model training/evaluation + - [Text Detection](./doc/doc_en/detection_en.md) + - [Text Recognition](./doc/doc_en/recognition_en.md) + - [Yml Configuration](./doc/doc_en/config_en.md) + - [Tricks](./doc/doc_en/tricks_en.md) +- Deployment + - [Python Inference](./doc/doc_en/inference_en.md) + - [C++ 
Inference](./deploy/cpp_infer/readme_en.md) + - [Serving](./doc/doc_en/serving_en.md) + - [Mobile](./deploy/lite/readme_en.md) + - Model Quantization and Compression (coming soon) + - [Benchmark](./doc/doc_en/benchmark_en.md) +- Datasets + - [General OCR Datasets(Chinese/English)](./doc/doc_en/datasets_en.md) + - [HandWritten_OCR_Datasets(Chinese)](./doc/doc_en/handwritten_datasets_en.md) + - [Various OCR Datasets(multilingual)](./doc/doc_en/vertical_and_multilingual_datasets_en.md) + - [Data Annotation Tools](./doc/doc_en/data_annotation_en.md) + - [Data Synthesis Tools](./doc/doc_en/data_synthesis_en.md) +- [FAQ](#FAQ) +- Visualization + - [Ultra-lightweight Chinese/English OCR Visualization](#UCOCRVIS) + - [General Chinese/English OCR Visualization](#GeOCRVIS) + - [Chinese/English OCR Visualization (Support Space Recognition )](#SpaceOCRVIS) +- [Community](#Community) +- [References](./doc/doc_en/reference_en.md) +- [License](#LICENSE) +- [Contribution](#CONTRIBUTION) + + +## Text Detection Algorithm + +PaddleOCR open source text detection algorithms list: +- [x] EAST([paper](https://arxiv.org/abs/1704.03155)) +- [x] DB([paper](https://arxiv.org/abs/1911.08947)) +- [x] SAST([paper](https://arxiv.org/abs/1908.05498))(Baidu Self-Research) + +On the ICDAR2015 dataset, the text detection result is as follows: + +|Model|Backbone|precision|recall|Hmean|Download link| +|-|-|-|-|-|-| +|EAST|ResNet50_vd|88.18%|85.51%|86.82%|[Download link](https://paddleocr.bj.bcebos.com/det_r50_vd_east.tar)| +|EAST|MobileNetV3|81.67%|79.83%|80.74%|[Download link](https://paddleocr.bj.bcebos.com/det_mv3_east.tar)| +|DB|ResNet50_vd|83.79%|80.65%|82.19%|[Download link](https://paddleocr.bj.bcebos.com/det_r50_vd_db.tar)| +|DB|MobileNetV3|75.92%|73.18%|74.53%|[Download link](https://paddleocr.bj.bcebos.com/det_mv3_db.tar)| +|SAST|ResNet50_vd|92.18%|82.96%|87.33%|[Download link](https://paddleocr.bj.bcebos.com/SAST/sast_r50_vd_icdar2015.tar)| + +On Total-Text dataset, the text detection 
result is as follows: + +|Model|Backbone|precision|recall|Hmean|Download link| +|-|-|-|-|-|-| +|SAST|ResNet50_vd|88.74%|79.80%|84.03%|[Download link](https://paddleocr.bj.bcebos.com/SAST/sast_r50_vd_total_text.tar)| + +**Note:** Additional data, like icdar2013, icdar2017, COCO-Text, ArT, was added to the model training of SAST. Download English public dataset in organized format used by PaddleOCR from [Baidu Drive](https://pan.baidu.com/s/12cPnZcVuV1zn5DOd4mqjVw) (download code: 2bpi). + +For use of [LSVT](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/doc/doc_en/datasets_en.md#1-icdar2019-lsvt) street view dataset with a total of 3w training data,the related configuration and pre-trained models for text detection task are as follows: +|Model|Backbone|Configuration file|Pre-trained model| +|-|-|-|-| +|ultra-lightweight OCR model|MobileNetV3|det_mv3_db.yml|[Download link](https://paddleocr.bj.bcebos.com/ch_models/ch_det_mv3_db.tar)| +|General OCR model|ResNet50_vd|det_r50_vd_db.yml|[Download link](https://paddleocr.bj.bcebos.com/ch_models/ch_det_r50_vd_db.tar)| + +* Note: For the training and evaluation of the above DB model, post-processing parameters box_thresh=0.6 and unclip_ratio=1.5 need to be set. If using different datasets and different models for training, these two parameters can be adjusted for better result. 
+ +For the training guide and use of PaddleOCR text detection algorithms, please refer to the document [Text detection model training/evaluation/prediction](./doc/doc_en/detection_en.md) + + +## Text Recognition Algorithm + +PaddleOCR open-source text recognition algorithms list: +- [x] CRNN([paper](https://arxiv.org/abs/1507.05717)) +- [x] Rosetta([paper](https://arxiv.org/abs/1910.05085)) +- [x] STAR-Net([paper](http://www.bmva.org/bmvc/2016/papers/paper043/index.html)) +- [x] RARE([paper](https://arxiv.org/abs/1603.03915v1)) +- [x] SRN([paper](https://arxiv.org/abs/2003.12294))(Baidu Self-Research) + +Refer to [DTRB](https://arxiv.org/abs/1904.01906), the training and evaluation result of these above text recognition (using MJSynth and SynthText for training, evaluate on IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE) is as follow: + +|Model|Backbone|Avg Accuracy|Module combination|Download link| +|-|-|-|-|-| +|Rosetta|Resnet34_vd|80.24%|rec_r34_vd_none_none_ctc|[Download link](https://paddleocr.bj.bcebos.com/rec_r34_vd_none_none_ctc.tar)| +|Rosetta|MobileNetV3|78.16%|rec_mv3_none_none_ctc|[Download link](https://paddleocr.bj.bcebos.com/rec_mv3_none_none_ctc.tar)| +|CRNN|Resnet34_vd|82.20%|rec_r34_vd_none_bilstm_ctc|[Download link](https://paddleocr.bj.bcebos.com/rec_r34_vd_none_bilstm_ctc.tar)| +|CRNN|MobileNetV3|79.37%|rec_mv3_none_bilstm_ctc|[Download link](https://paddleocr.bj.bcebos.com/rec_mv3_none_bilstm_ctc.tar)| +|STAR-Net|Resnet34_vd|83.93%|rec_r34_vd_tps_bilstm_ctc|[Download link](https://paddleocr.bj.bcebos.com/rec_r34_vd_tps_bilstm_ctc.tar)| +|STAR-Net|MobileNetV3|81.56%|rec_mv3_tps_bilstm_ctc|[Download link](https://paddleocr.bj.bcebos.com/rec_mv3_tps_bilstm_ctc.tar)| +|RARE|Resnet34_vd|84.90%|rec_r34_vd_tps_bilstm_attn|[Download link](https://paddleocr.bj.bcebos.com/rec_r34_vd_tps_bilstm_attn.tar)| +|RARE|MobileNetV3|83.32%|rec_mv3_tps_bilstm_attn|[Download link](https://paddleocr.bj.bcebos.com/rec_mv3_tps_bilstm_attn.tar)| 
+|SRN|Resnet50_vd_fpn|88.33%|rec_r50fpn_vd_none_srn|[Download link](https://paddleocr.bj.bcebos.com/SRN/rec_r50fpn_vd_none_srn.tar)| + +**Note:** SRN model uses data expansion method to expand the two training sets mentioned above, and the expanded data can be downloaded from [Baidu Drive](https://pan.baidu.com/s/1-HSZ-ZVdqBF2HaBZ5pRAKA) (download code: y3ry). + +The average accuracy of the two-stage training in the original paper is 89.74%, and that of one stage training in paddleocr is 88.33%. Both pre-trained weights can be downloaded [here](https://paddleocr.bj.bcebos.com/SRN/rec_r50fpn_vd_none_srn.tar). + +We use [LSVT](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/doc/doc_en/datasets_en.md#1-icdar2019-lsvt) dataset and cropout 30w training data from original photos by using position groundtruth and make some calibration needed. In addition, based on the LSVT corpus, 500w synthetic data is generated to train the model. The related configuration and pre-trained models are as follows: + +|Model|Backbone|Configuration file|Pre-trained model| +|-|-|-|-| +|ultra-lightweight OCR model|MobileNetV3|rec_chinese_lite_train.yml|[Download link](https://paddleocr.bj.bcebos.com/ch_models/ch_rec_mv3_crnn.tar)|[inference model](https://paddleocr.bj.bcebos.com/ch_models/ch_rec_mv3_crnn_enhance_infer.tar) & [pre-trained model](https://paddleocr.bj.bcebos.com/ch_models/ch_rec_mv3_crnn_enhance.tar)| +|General OCR model|Resnet34_vd|rec_chinese_common_train.yml|[Download link](https://paddleocr.bj.bcebos.com/ch_models/ch_rec_r34_vd_crnn.tar)|[inference model](https://paddleocr.bj.bcebos.com/ch_models/ch_rec_r34_vd_crnn_enhance_infer.tar) & [pre-trained model](https://paddleocr.bj.bcebos.com/ch_models/ch_rec_r34_vd_crnn_enhance.tar)| + +Please refer to the document for training guide and use of PaddleOCR text recognition algorithms [Text recognition model training/evaluation/prediction](./doc/doc_en/recognition_en.md) + + +## END-TO-END OCR Algorithm +- [ ] 
[End2End-PSL](https://arxiv.org/abs/1909.07808)(Baidu Self-Research, coming soon) + +## Visualization + + +### 1.Ultra-lightweight Chinese/English OCR Visualization [more](./doc/doc_en/visualization_en.md) + +
+ +
+ + +### 2. General Chinese/English OCR Visualization [more](./doc/doc_en/visualization_en.md) + +
+ +
+ + +### 3.Chinese/English OCR Visualization (Space_support) [more](./doc/doc_en/visualization_en.md) + +
+ +
+ + + +## FAQ +1. Error when using attention-based recognition model: KeyError: 'predict' + + The inference of recognition model based on attention loss is still being debugged. For Chinese text recognition, it is recommended to choose the recognition model based on CTC loss first. In practice, it is also found that the recognition model based on attention loss is not as effective as the one based on CTC loss. + +2. About inference speed + + When there are a lot of texts in the picture, the prediction time will increase. You can use `--rec_batch_num` to set a smaller prediction batch size. The default value is 30, which can be changed to 10 or other values. + +3. Service deployment and mobile deployment + + It is expected that the service deployment based on Serving and the mobile deployment based on Paddle Lite will be released successively in mid-to-late June. Stay tuned for more updates. + +4. Release time of self-developed algorithm + + Baidu Self-developed algorithms such as SAST, SRN and end2end PSL will be released in June or July. Please be patient. + +[more](./doc/doc_en/FAQ_en.md) + + +## Community +Scan the QR code below with WeChat and complete the questionnaire to get access to the official technical exchange group. + +
+ +
+ + +## License +This project is released under the Apache 2.0 license. + + +## Contribution +We welcome all the contributions to PaddleOCR and appreciate your feedback very much. + +- Many thanks to [Khanh Tran](https://github.com/xxxpsyduck) and [Karl Horky](https://github.com/karlhorky) for contributing and revising the English documentation. +- Many thanks to [zhangxin](https://github.com/ZhangXinNan) for contributing the new visualization function, adding .gitignore, and removing the need to set PYTHONPATH manually. +- Many thanks to [lyl120117](https://github.com/lyl120117) for contributing the code for printing the network structure. +- Thanks [xiangyubo](https://github.com/xiangyubo) for contributing the handwritten Chinese OCR datasets. +- Thanks [authorfu](https://github.com/authorfu) for contributing the Android demo and [xiadeye](https://github.com/xiadeye) for contributing the iOS demo. +- Thanks [BeyondYourself](https://github.com/BeyondYourself) for contributing many great suggestions and simplifying part of the code style. +- Thanks [tangmq](https://gitee.com/tangmq) for contributing Dockerized deployment services to PaddleOCR and supporting the rapid release of callable RESTful API services. diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7d94f66be072067172d56da13d8bb27d9aeac431 --- /dev/null +++ b/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +__all__ = ['PaddleOCR', 'draw_ocr'] +from .paddleocr import PaddleOCR +from .tools.infer.utility import draw_ocr diff --git a/configs/det/det_sast_icdar15_reader.yml b/configs/det/det_sast_icdar15_reader.yml index 1fdea8756c3d80b291aba4afbb8c7d340654b6cb..ee45a85da7452e2069b0d7467b1ccfc44dd656b7 100644 --- a/configs/det/det_sast_icdar15_reader.yml +++ b/configs/det/det_sast_icdar15_reader.yml @@ -3,7 +3,7 @@ TrainReader: process_function: ppocr.data.det.sast_process,SASTProcessTrain num_workers: 8 img_set_dir: ./train_data/ - label_file_path: [./train_data/icdar13/train_label_json.txt, ./train_data/icdar15/train_label_json.txt, ./train_data/icdar17_mlt_latin/train_label_json.txt, ./train_data/coco_text_icdar_4pts/train_label_json.txt] + label_file_path: [./train_data/icdar2013/train_label_json.txt, ./train_data/icdar2015/train_label_json.txt, ./train_data/icdar17_mlt_latin/train_label_json.txt, ./train_data/coco_text_icdar_4pts/train_label_json.txt] data_ratio_list: [0.1, 0.45, 0.3, 0.15] min_crop_side_ratio: 0.3 min_crop_size: 24 @@ -20,7 +20,5 @@ EvalReader: TestReader: reader_function: ppocr.data.det.dataset_traversal,EvalTestReader process_function: ppocr.data.det.sast_process,SASTProcessTest - infer_img: - img_set_dir: ./train_data/icdar2015/text_localization/ - label_file_path: ./train_data/icdar2015/text_localization/test_icdar2015_label.txt - do_eval: True + infer_img: ./train_data/icdar2015/text_localization/ch4_test_images/img_11.jpg + max_side_len: 1536 diff --git a/configs/det/det_sast_totaltext_reader.yml b/configs/det/det_sast_totaltext_reader.yml index 74c320fcc5e1c8340d0439e350e2a4e88824b16d..92503d9f0e2b57f0d22b15591c5400185daf2afa 100644 --- a/configs/det/det_sast_totaltext_reader.yml +++ b/configs/det/det_sast_totaltext_reader.yml @@ -3,7 +3,7 @@ TrainReader: process_function: ppocr.data.det.sast_process,SASTProcessTrain num_workers: 8 
img_set_dir: ./train_data/ - label_file_path: [./train_data/art_latin_icdar_14pt/train_no_tt_test/train_label_json.txt, ./train_data/total_text_icdar_14pt/train/train_label_json.txt] + label_file_path: [./train_data/art_latin_icdar_14pt/train_no_tt_test/train_label_json.txt, ./train_data/total_text_icdar_14pt/train_label_json.txt] data_ratio_list: [0.5, 0.5] min_crop_side_ratio: 0.3 min_crop_size: 24 @@ -13,12 +13,12 @@ TrainReader: EvalReader: reader_function: ppocr.data.det.dataset_traversal,EvalTestReader process_function: ppocr.data.det.sast_process,SASTProcessTest - img_set_dir: ./train_data/afs/ - label_file_path: ./train_data/afs/total_text/test_label_json.txt + img_set_dir: ./train_data/ + label_file_path: ./train_data/total_text_icdar_14pt/test_label_json.txt max_side_len: 768 TestReader: reader_function: ppocr.data.det.dataset_traversal,EvalTestReader process_function: ppocr.data.det.sast_process,SASTProcessTest - infer_img: + infer_img: ./train_data/afs/total_text/Images/Test/img623.jpg max_side_len: 768 diff --git a/configs/rec/rec_r50fpn_vd_none_srn.yml b/configs/rec/rec_r50fpn_vd_none_srn.yml index 7a0f136c28dd967aeb422d843a49cf65b934d7ca..30709e479f8da56b6bd7fe9ebf817a27bff9cc38 100755 --- a/configs/rec/rec_r50fpn_vd_none_srn.yml +++ b/configs/rec/rec_r50fpn_vd_none_srn.yml @@ -27,7 +27,7 @@ Architecture: function: ppocr.modeling.architectures.rec_model,RecModel Backbone: - function: ppocr.modeling.backbones.rec_resnet50_fpn,ResNet + function: ppocr.modeling.backbones.rec_resnet_fpn,ResNet layers: 50 Head: diff --git a/deploy/android_demo/app/src/main/assets/images/180.jpg b/deploy/android_demo/app/src/main/assets/images/180.jpg new file mode 100644 index 0000000000000000000000000000000000000000..84cf4c79ef14769d01b0b0e9667387bd16b3e6e7 Binary files /dev/null and b/deploy/android_demo/app/src/main/assets/images/180.jpg differ diff --git a/deploy/android_demo/app/src/main/assets/images/270.jpg b/deploy/android_demo/app/src/main/assets/images/270.jpg 
new file mode 100644 index 0000000000000000000000000000000000000000..568739043b7779425b0abeb4459dbb485caed847 Binary files /dev/null and b/deploy/android_demo/app/src/main/assets/images/270.jpg differ diff --git a/deploy/android_demo/app/src/main/assets/images/90.jpg b/deploy/android_demo/app/src/main/assets/images/90.jpg new file mode 100644 index 0000000000000000000000000000000000000000..49e949aa9cc14e3afc507c5806c87d9894c2dcb9 Binary files /dev/null and b/deploy/android_demo/app/src/main/assets/images/90.jpg differ diff --git a/deploy/android_demo/app/src/main/cpp/native.cpp b/deploy/android_demo/app/src/main/cpp/native.cpp index 390c594deb02a8f82693f2c83741a4750fe7cb25..963c5246d5b7b50720f92705d288526ae2cc6a73 100644 --- a/deploy/android_demo/app/src/main/cpp/native.cpp +++ b/deploy/android_demo/app/src/main/cpp/native.cpp @@ -4,29 +4,29 @@ #include "native.h" #include "ocr_ppredictor.h" -#include #include #include +#include static paddle::lite_api::PowerMode str_to_cpu_mode(const std::string &cpu_mode); -extern "C" -JNIEXPORT jlong JNICALL -Java_com_baidu_paddle_lite_demo_ocr_OCRPredictorNative_init(JNIEnv *env, jobject thiz, - jstring j_det_model_path, - jstring j_rec_model_path, - jint j_thread_num, - jstring j_cpu_mode) { - std::string det_model_path = jstring_to_cpp_string(env, j_det_model_path); - std::string rec_model_path = jstring_to_cpp_string(env, j_rec_model_path); - int thread_num = j_thread_num; - std::string cpu_mode = jstring_to_cpp_string(env, j_cpu_mode); - ppredictor::OCR_Config conf; - conf.thread_num = thread_num; - conf.mode = str_to_cpu_mode(cpu_mode); - ppredictor::OCR_PPredictor *orc_predictor = new ppredictor::OCR_PPredictor{conf}; - orc_predictor->init_from_file(det_model_path, rec_model_path); - return reinterpret_cast(orc_predictor); +extern "C" JNIEXPORT jlong JNICALL +Java_com_baidu_paddle_lite_demo_ocr_OCRPredictorNative_init( + JNIEnv *env, jobject thiz, jstring j_det_model_path, + jstring j_rec_model_path, jstring 
j_cls_model_path, jint j_thread_num, + jstring j_cpu_mode) { + std::string det_model_path = jstring_to_cpp_string(env, j_det_model_path); + std::string rec_model_path = jstring_to_cpp_string(env, j_rec_model_path); + std::string cls_model_path = jstring_to_cpp_string(env, j_cls_model_path); + int thread_num = j_thread_num; + std::string cpu_mode = jstring_to_cpp_string(env, j_cpu_mode); + ppredictor::OCR_Config conf; + conf.thread_num = thread_num; + conf.mode = str_to_cpu_mode(cpu_mode); + ppredictor::OCR_PPredictor *orc_predictor = + new ppredictor::OCR_PPredictor{conf}; + orc_predictor->init_from_file(det_model_path, rec_model_path, cls_model_path); + return reinterpret_cast(orc_predictor); } /** @@ -34,82 +34,81 @@ Java_com_baidu_paddle_lite_demo_ocr_OCRPredictorNative_init(JNIEnv *env, jobject * @param cpu_mode * @return */ -static paddle::lite_api::PowerMode str_to_cpu_mode(const std::string &cpu_mode) { - static std::map cpu_mode_map{ - {"LITE_POWER_HIGH", paddle::lite_api::LITE_POWER_HIGH}, - {"LITE_POWER_LOW", paddle::lite_api::LITE_POWER_HIGH}, - {"LITE_POWER_FULL", paddle::lite_api::LITE_POWER_FULL}, - {"LITE_POWER_NO_BIND", paddle::lite_api::LITE_POWER_NO_BIND}, - {"LITE_POWER_RAND_HIGH", paddle::lite_api::LITE_POWER_RAND_HIGH}, - {"LITE_POWER_RAND_LOW", paddle::lite_api::LITE_POWER_RAND_LOW} - }; - std::string upper_key; - std::transform(cpu_mode.cbegin(), cpu_mode.cend(), upper_key.begin(), ::toupper); - auto index = cpu_mode_map.find(upper_key); - if (index == cpu_mode_map.end()) { - LOGE("cpu_mode not found %s", upper_key.c_str()); - return paddle::lite_api::LITE_POWER_HIGH; - } else { - return index->second; - } - +static paddle::lite_api::PowerMode +str_to_cpu_mode(const std::string &cpu_mode) { + static std::map cpu_mode_map{ + {"LITE_POWER_HIGH", paddle::lite_api::LITE_POWER_HIGH}, + {"LITE_POWER_LOW", paddle::lite_api::LITE_POWER_HIGH}, + {"LITE_POWER_FULL", paddle::lite_api::LITE_POWER_FULL}, + {"LITE_POWER_NO_BIND", 
paddle::lite_api::LITE_POWER_NO_BIND}, + {"LITE_POWER_RAND_HIGH", paddle::lite_api::LITE_POWER_RAND_HIGH}, + {"LITE_POWER_RAND_LOW", paddle::lite_api::LITE_POWER_RAND_LOW}}; + std::string upper_key; + std::transform(cpu_mode.cbegin(), cpu_mode.cend(), upper_key.begin(), + ::toupper); + auto index = cpu_mode_map.find(upper_key); + if (index == cpu_mode_map.end()) { + LOGE("cpu_mode not found %s", upper_key.c_str()); + return paddle::lite_api::LITE_POWER_HIGH; + } else { + return index->second; + } } -extern "C" -JNIEXPORT jfloatArray JNICALL -Java_com_baidu_paddle_lite_demo_ocr_OCRPredictorNative_forward(JNIEnv *env, jobject thiz, - jlong java_pointer, jfloatArray buf, - jfloatArray ddims, - jobject original_image) { - LOGI("begin to run native forward"); - if (java_pointer == 0) { - LOGE("JAVA pointer is NULL"); - return cpp_array_to_jfloatarray(env, nullptr, 0); - } - cv::Mat origin = bitmap_to_cv_mat(env, original_image); - if (origin.size == 0) { - LOGE("origin bitmap cannot convert to CV Mat"); - return cpp_array_to_jfloatarray(env, nullptr, 0); - } - ppredictor::OCR_PPredictor *ppredictor = (ppredictor::OCR_PPredictor *) java_pointer; - std::vector dims_float_arr = jfloatarray_to_float_vector(env, ddims); - std::vector dims_arr; - dims_arr.resize(dims_float_arr.size()); - std::copy(dims_float_arr.cbegin(), dims_float_arr.cend(), dims_arr.begin()); +extern "C" JNIEXPORT jfloatArray JNICALL +Java_com_baidu_paddle_lite_demo_ocr_OCRPredictorNative_forward( + JNIEnv *env, jobject thiz, jlong java_pointer, jfloatArray buf, + jfloatArray ddims, jobject original_image) { + LOGI("begin to run native forward"); + if (java_pointer == 0) { + LOGE("JAVA pointer is NULL"); + return cpp_array_to_jfloatarray(env, nullptr, 0); + } + cv::Mat origin = bitmap_to_cv_mat(env, original_image); + if (origin.size == 0) { + LOGE("origin bitmap cannot convert to CV Mat"); + return cpp_array_to_jfloatarray(env, nullptr, 0); + } + ppredictor::OCR_PPredictor *ppredictor = + 
(ppredictor::OCR_PPredictor *)java_pointer; + std::vector dims_float_arr = jfloatarray_to_float_vector(env, ddims); + std::vector dims_arr; + dims_arr.resize(dims_float_arr.size()); + std::copy(dims_float_arr.cbegin(), dims_float_arr.cend(), dims_arr.begin()); - // 这里值有点大,就不调用jfloatarray_to_float_vector了 - int64_t buf_len = (int64_t) env->GetArrayLength(buf); - jfloat *buf_data = env->GetFloatArrayElements(buf, JNI_FALSE); - float *data = (jfloat *) buf_data; - std::vector results = ppredictor->infer_ocr(dims_arr, data, - buf_len, - NET_OCR, origin); - LOGI("infer_ocr finished with boxes %ld", results.size()); - // 这里将std::vector 序列化成 float数组,传输到java层再反序列化 - std::vector float_arr; - for (const ppredictor::OCRPredictResult &r :results) { - float_arr.push_back(r.points.size()); - float_arr.push_back(r.word_index.size()); - float_arr.push_back(r.score); - for (const std::vector &point : r.points) { - float_arr.push_back(point.at(0)); - float_arr.push_back(point.at(1)); - } - for (int index: r.word_index) { - float_arr.push_back(index); - } + // 这里值有点大,就不调用jfloatarray_to_float_vector了 + int64_t buf_len = (int64_t)env->GetArrayLength(buf); + jfloat *buf_data = env->GetFloatArrayElements(buf, JNI_FALSE); + float *data = (jfloat *)buf_data; + std::vector results = + ppredictor->infer_ocr(dims_arr, data, buf_len, NET_OCR, origin); + LOGI("infer_ocr finished with boxes %ld", results.size()); + // 这里将std::vector 序列化成 + // float数组,传输到java层再反序列化 + std::vector float_arr; + for (const ppredictor::OCRPredictResult &r : results) { + float_arr.push_back(r.points.size()); + float_arr.push_back(r.word_index.size()); + float_arr.push_back(r.score); + for (const std::vector &point : r.points) { + float_arr.push_back(point.at(0)); + float_arr.push_back(point.at(1)); } - return cpp_array_to_jfloatarray(env, float_arr.data(), float_arr.size()); + for (int index : r.word_index) { + float_arr.push_back(index); + } + } + return cpp_array_to_jfloatarray(env, float_arr.data(), 
float_arr.size()); } -extern "C" -JNIEXPORT void JNICALL -Java_com_baidu_paddle_lite_demo_ocr_OCRPredictorNative_release(JNIEnv *env, jobject thiz, - jlong java_pointer){ - if (java_pointer == 0) { - LOGE("JAVA pointer is NULL"); - return; - } - ppredictor::OCR_PPredictor *ppredictor = (ppredictor::OCR_PPredictor *) java_pointer; - delete ppredictor; +extern "C" JNIEXPORT void JNICALL +Java_com_baidu_paddle_lite_demo_ocr_OCRPredictorNative_release( + JNIEnv *env, jobject thiz, jlong java_pointer) { + if (java_pointer == 0) { + LOGE("JAVA pointer is NULL"); + return; + } + ppredictor::OCR_PPredictor *ppredictor = + (ppredictor::OCR_PPredictor *)java_pointer; + delete ppredictor; } \ No newline at end of file diff --git a/deploy/android_demo/app/src/main/cpp/ocr_cls_process.cpp b/deploy/android_demo/app/src/main/cpp/ocr_cls_process.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d720066667b60ee87bc1a1227ad720074254074e --- /dev/null +++ b/deploy/android_demo/app/src/main/cpp/ocr_cls_process.cpp @@ -0,0 +1,46 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "ocr_cls_process.h" +#include +#include +#include +#include +#include +#include + +const std::vector CLS_IMAGE_SHAPE = {3, 32, 100}; + +cv::Mat cls_resize_img(const cv::Mat &img) { + int imgC = CLS_IMAGE_SHAPE[0]; + int imgW = CLS_IMAGE_SHAPE[2]; + int imgH = CLS_IMAGE_SHAPE[1]; + + float ratio = float(img.cols) / float(img.rows); + int resize_w = 0; + if (ceilf(imgH * ratio) > imgW) + resize_w = imgW; + else + resize_w = int(ceilf(imgH * ratio)); + + cv::Mat resize_img; + cv::resize(img, resize_img, cv::Size(resize_w, imgH), 0.f, 0.f, + cv::INTER_CUBIC); + + if (resize_w < imgW) { + cv::copyMakeBorder(resize_img, resize_img, 0, 0, 0, int(imgW - resize_w), + cv::BORDER_CONSTANT, {0, 0, 0}); + } + return resize_img; +} \ No newline at end of file diff --git a/deploy/android_demo/app/src/main/cpp/ocr_cls_process.h b/deploy/android_demo/app/src/main/cpp/ocr_cls_process.h new file mode 100644 index 0000000000000000000000000000000000000000..8e37c3031ba72ac3fe43b467c6986c55a8e73e2c --- /dev/null +++ b/deploy/android_demo/app/src/main/cpp/ocr_cls_process.h @@ -0,0 +1,12 @@ +// +// Created by fujiayi on 2020/7/3. 
+// +#pragma once + +#include "common.h" +#include +#include + +extern const std::vector CLS_IMAGE_SHAPE; + +cv::Mat cls_resize_img(const cv::Mat &img); \ No newline at end of file diff --git a/deploy/android_demo/app/src/main/cpp/ocr_ppredictor.cpp b/deploy/android_demo/app/src/main/cpp/ocr_ppredictor.cpp index 3d0147715519c195fd48f7f84b7a28a5a82f5363..f0d855e83f010ef762cb4b01086e41a0f64fb4cb 100644 --- a/deploy/android_demo/app/src/main/cpp/ocr_ppredictor.cpp +++ b/deploy/android_demo/app/src/main/cpp/ocr_ppredictor.cpp @@ -3,38 +3,48 @@ // #include "ocr_ppredictor.h" -#include "preprocess.h" #include "common.h" -#include "ocr_db_post_process.h" +#include "ocr_cls_process.h" #include "ocr_crnn_process.h" +#include "ocr_db_post_process.h" +#include "preprocess.h" namespace ppredictor { -OCR_PPredictor::OCR_PPredictor(const OCR_Config &config) : _config(config) { +OCR_PPredictor::OCR_PPredictor(const OCR_Config &config) : _config(config) {} -} +int OCR_PPredictor::init(const std::string &det_model_content, + const std::string &rec_model_content, + const std::string &cls_model_content) { + _det_predictor = std::unique_ptr( + new PPredictor{_config.thread_num, NET_OCR, _config.mode}); + _det_predictor->init_nb(det_model_content); -int -OCR_PPredictor::init(const std::string &det_model_content, const std::string &rec_model_content) { - _det_predictor = std::unique_ptr( - new PPredictor{_config.thread_num, NET_OCR, _config.mode}); - _det_predictor->init_nb(det_model_content); + _rec_predictor = std::unique_ptr( + new PPredictor{_config.thread_num, NET_OCR_INTERNAL, _config.mode}); + _rec_predictor->init_nb(rec_model_content); - _rec_predictor = std::unique_ptr( - new PPredictor{_config.thread_num, NET_OCR_INTERNAL, _config.mode}); - _rec_predictor->init_nb(rec_model_content); - return RETURN_OK; + _cls_predictor = std::unique_ptr( + new PPredictor{_config.thread_num, NET_OCR_INTERNAL, _config.mode}); + _cls_predictor->init_nb(cls_model_content); + return RETURN_OK; } 
-int OCR_PPredictor::init_from_file(const std::string &det_model_path, const std::string &rec_model_path){ - _det_predictor = std::unique_ptr( - new PPredictor{_config.thread_num, NET_OCR, _config.mode}); - _det_predictor->init_from_file(det_model_path); - - _rec_predictor = std::unique_ptr( - new PPredictor{_config.thread_num, NET_OCR_INTERNAL, _config.mode}); - _rec_predictor->init_from_file(rec_model_path); - return RETURN_OK; +int OCR_PPredictor::init_from_file(const std::string &det_model_path, + const std::string &rec_model_path, + const std::string &cls_model_path) { + _det_predictor = std::unique_ptr( + new PPredictor{_config.thread_num, NET_OCR, _config.mode}); + _det_predictor->init_from_file(det_model_path); + + _rec_predictor = std::unique_ptr( + new PPredictor{_config.thread_num, NET_OCR_INTERNAL, _config.mode}); + _rec_predictor->init_from_file(rec_model_path); + + _cls_predictor = std::unique_ptr( + new PPredictor{_config.thread_num, NET_OCR_INTERNAL, _config.mode}); + _cls_predictor->init_from_file(cls_model_path); + return RETURN_OK; } /** * for debug use, show result of First Step @@ -42,145 +52,188 @@ int OCR_PPredictor::init_from_file(const std::string &det_model_path, const std: * @param boxes * @param srcimg */ -static void visual_img(const std::vector>> &filter_boxes, - const std::vector>> &boxes, - const cv::Mat &srcimg) { - // visualization - cv::Point rook_points[filter_boxes.size()][4]; - for (int n = 0; n < filter_boxes.size(); n++) { - for (int m = 0; m < filter_boxes[0].size(); m++) { - rook_points[n][m] = cv::Point(int(filter_boxes[n][m][0]), int(filter_boxes[n][m][1])); - } +static void +visual_img(const std::vector>> &filter_boxes, + const std::vector>> &boxes, + const cv::Mat &srcimg) { + // visualization + cv::Point rook_points[filter_boxes.size()][4]; + for (int n = 0; n < filter_boxes.size(); n++) { + for (int m = 0; m < filter_boxes[0].size(); m++) { + rook_points[n][m] = + cv::Point(int(filter_boxes[n][m][0]), 
int(filter_boxes[n][m][1])); } - - cv::Mat img_vis; - srcimg.copyTo(img_vis); - for (int n = 0; n < boxes.size(); n++) { - const cv::Point *ppt[1] = {rook_points[n]}; - int npt[] = {4}; - cv::polylines(img_vis, ppt, npt, 1, 1, CV_RGB(0, 255, 0), 2, 8, 0); - } - // 调试用,自行替换需要修改的路径 - cv::imwrite("/sdcard/1/vis.png", img_vis); + } + + cv::Mat img_vis; + srcimg.copyTo(img_vis); + for (int n = 0; n < boxes.size(); n++) { + const cv::Point *ppt[1] = {rook_points[n]}; + int npt[] = {4}; + cv::polylines(img_vis, ppt, npt, 1, 1, CV_RGB(0, 255, 0), 2, 8, 0); + } + // 调试用,自行替换需要修改的路径 + cv::imwrite("/sdcard/1/vis.png", img_vis); } std::vector -OCR_PPredictor::infer_ocr(const std::vector &dims, const float *input_data, int input_len, - int net_flag, cv::Mat &origin) { +OCR_PPredictor::infer_ocr(const std::vector &dims, + const float *input_data, int input_len, int net_flag, + cv::Mat &origin) { + PredictorInput input = _det_predictor->get_first_input(); + input.set_dims(dims); + input.set_data(input_data, input_len); + std::vector results = _det_predictor->infer(); + PredictorOutput &res = results.at(0); + std::vector>> filtered_box = calc_filtered_boxes( + res.get_float_data(), res.get_size(), (int)dims[2], (int)dims[3], origin); + LOGI("Filter_box size %ld", filtered_box.size()); + return infer_rec(filtered_box, origin); +} - PredictorInput input = _det_predictor->get_first_input(); +std::vector OCR_PPredictor::infer_rec( + const std::vector>> &boxes, + const cv::Mat &origin_img) { + std::vector mean = {0.5f, 0.5f, 0.5f}; + std::vector scale = {1 / 0.5f, 1 / 0.5f, 1 / 0.5f}; + std::vector dims = {1, 3, 0, 0}; + std::vector ocr_results; + + PredictorInput input = _rec_predictor->get_first_input(); + for (auto bp = boxes.crbegin(); bp != boxes.crend(); ++bp) { + const std::vector> &box = *bp; + cv::Mat crop_img = get_rotate_crop_image(origin_img, box); + crop_img = infer_cls(crop_img); + + float wh_ratio = float(crop_img.cols) / float(crop_img.rows); + cv::Mat input_image = 
crnn_resize_img(crop_img, wh_ratio); + input_image.convertTo(input_image, CV_32FC3, 1 / 255.0f); + const float *dimg = reinterpret_cast(input_image.data); + int input_size = input_image.rows * input_image.cols; + + dims[2] = input_image.rows; + dims[3] = input_image.cols; input.set_dims(dims); - input.set_data(input_data, input_len); - std::vector results = _det_predictor->infer(); - PredictorOutput &res = results.at(0); - std::vector>> filtered_box - = calc_filtered_boxes(res.get_float_data(), res.get_size(), (int) dims[2], (int) dims[3], - origin); - LOGI("Filter_box size %ld", filtered_box.size()); - return infer_rec(filtered_box, origin); -} -std::vector -OCR_PPredictor::infer_rec(const std::vector>> &boxes, - const cv::Mat &origin_img) { - std::vector mean = {0.5f, 0.5f, 0.5f}; - std::vector scale = {1 / 0.5f, 1 / 0.5f, 1 / 0.5f}; - std::vector dims = {1, 3, 0, 0}; - std::vector ocr_results; - - PredictorInput input = _rec_predictor->get_first_input(); - for (auto bp = boxes.crbegin(); bp != boxes.crend(); ++bp) { - const std::vector> &box = *bp; - cv::Mat crop_img = get_rotate_crop_image(origin_img, box); - float wh_ratio = float(crop_img.cols) / float(crop_img.rows); - cv::Mat input_image = crnn_resize_img(crop_img, wh_ratio); - input_image.convertTo(input_image, CV_32FC3, 1 / 255.0f); - const float *dimg = reinterpret_cast(input_image.data); - int input_size = input_image.rows * input_image.cols; - - dims[2] = input_image.rows; - dims[3] = input_image.cols; - input.set_dims(dims); - - neon_mean_scale(dimg, input.get_mutable_float_data(), input_size, mean, scale); - - std::vector results = _rec_predictor->infer(); - - OCRPredictResult res; - res.word_index = postprocess_rec_word_index(results.at(0)); - if (res.word_index.empty()) { - continue; - } - res.score = postprocess_rec_score(results.at(1)); - res.points = box; - ocr_results.emplace_back(std::move(res)); + neon_mean_scale(dimg, input.get_mutable_float_data(), input_size, mean, + scale); + + 
std::vector results = _rec_predictor->infer(); + + OCRPredictResult res; + res.word_index = postprocess_rec_word_index(results.at(0)); + if (res.word_index.empty()) { + continue; } - LOGI("ocr_results finished %lu", ocr_results.size()); - return ocr_results; + res.score = postprocess_rec_score(results.at(1)); + res.points = box; + ocr_results.emplace_back(std::move(res)); + } + LOGI("ocr_results finished %lu", ocr_results.size()); + return ocr_results; +} + +cv::Mat OCR_PPredictor::infer_cls(const cv::Mat &img, float thresh) { + std::vector mean = {0.5f, 0.5f, 0.5f}; + std::vector scale = {1 / 0.5f, 1 / 0.5f, 1 / 0.5f}; + std::vector dims = {1, 3, 0, 0}; + std::vector ocr_results; + + PredictorInput input = _cls_predictor->get_first_input(); + + cv::Mat input_image = cls_resize_img(img); + input_image.convertTo(input_image, CV_32FC3, 1 / 255.0f); + const float *dimg = reinterpret_cast(input_image.data); + int input_size = input_image.rows * input_image.cols; + + dims[2] = input_image.rows; + dims[3] = input_image.cols; + input.set_dims(dims); + + neon_mean_scale(dimg, input.get_mutable_float_data(), input_size, mean, + scale); + + std::vector results = _cls_predictor->infer(); + + const float *scores = results.at(0).get_float_data(); + const int *labels = results.at(1).get_int_data(); + for (int64_t i = 0; i < results.at(0).get_size(); i++) { + LOGI("output scores [%f]", scores[i]); + } + for (int64_t i = 0; i < results.at(1).get_size(); i++) { + LOGI("output label [%d]", labels[i]); + } + int label_idx = labels[0]; + float score = scores[label_idx]; + + cv::Mat srcimg; + img.copyTo(srcimg); + if (label_idx % 2 == 1 && score > thresh) { + cv::rotate(srcimg, srcimg, 1); + } + return srcimg; } std::vector>> -OCR_PPredictor::calc_filtered_boxes(const float *pred, int pred_size, int output_height, - int output_width, const cv::Mat &origin) { - const double threshold = 0.3; - const double maxvalue = 1; - - cv::Mat pred_map = cv::Mat::zeros(output_height, output_width, 
CV_32F); - memcpy(pred_map.data, pred, pred_size * sizeof(float)); - cv::Mat cbuf_map; - pred_map.convertTo(cbuf_map, CV_8UC1); - - cv::Mat bit_map; - cv::threshold(cbuf_map, bit_map, threshold, maxvalue, cv::THRESH_BINARY); - - std::vector>> boxes = boxes_from_bitmap(pred_map, bit_map); - float ratio_h = output_height * 1.0f / origin.rows; - float ratio_w = output_width * 1.0f / origin.cols; - std::vector>> filter_boxes = filter_tag_det_res(boxes, ratio_h, - ratio_w, origin); - return filter_boxes; +OCR_PPredictor::calc_filtered_boxes(const float *pred, int pred_size, + int output_height, int output_width, + const cv::Mat &origin) { + const double threshold = 0.3; + const double maxvalue = 1; + + cv::Mat pred_map = cv::Mat::zeros(output_height, output_width, CV_32F); + memcpy(pred_map.data, pred, pred_size * sizeof(float)); + cv::Mat cbuf_map; + pred_map.convertTo(cbuf_map, CV_8UC1); + + cv::Mat bit_map; + cv::threshold(cbuf_map, bit_map, threshold, maxvalue, cv::THRESH_BINARY); + + std::vector>> boxes = + boxes_from_bitmap(pred_map, bit_map); + float ratio_h = output_height * 1.0f / origin.rows; + float ratio_w = output_width * 1.0f / origin.cols; + std::vector>> filter_boxes = + filter_tag_det_res(boxes, ratio_h, ratio_w, origin); + return filter_boxes; } -std::vector OCR_PPredictor::postprocess_rec_word_index(const PredictorOutput &res) { - const int *rec_idx = res.get_int_data(); - const std::vector> rec_idx_lod = res.get_lod(); +std::vector +OCR_PPredictor::postprocess_rec_word_index(const PredictorOutput &res) { + const int *rec_idx = res.get_int_data(); + const std::vector> rec_idx_lod = res.get_lod(); - std::vector pred_idx; - for (int n = int(rec_idx_lod[0][0]); n < int(rec_idx_lod[0][1] * 2); n += 2) { - pred_idx.emplace_back(rec_idx[n]); - } - return pred_idx; + std::vector pred_idx; + for (int n = int(rec_idx_lod[0][0]); n < int(rec_idx_lod[0][1] * 2); n += 2) { + pred_idx.emplace_back(rec_idx[n]); + } + return pred_idx; } float 
OCR_PPredictor::postprocess_rec_score(const PredictorOutput &res) { - const float *predict_batch = res.get_float_data(); - const std::vector predict_shape = res.get_shape(); - const std::vector> predict_lod = res.get_lod(); - int blank = predict_shape[1]; - float score = 0.f; - int count = 0; - for (int n = predict_lod[0][0]; n < predict_lod[0][1] - 1; n++) { - int argmax_idx = argmax(predict_batch + n * predict_shape[1], - predict_batch + (n + 1) * predict_shape[1]); - float max_value = predict_batch[n * predict_shape[1] + argmax_idx]; - if (blank - 1 - argmax_idx > 1e-5) { - score += max_value; - count += 1; - } - - } - if (count == 0) { - LOGE("calc score count 0"); - } else { - score /= count; + const float *predict_batch = res.get_float_data(); + const std::vector predict_shape = res.get_shape(); + const std::vector> predict_lod = res.get_lod(); + int blank = predict_shape[1]; + float score = 0.f; + int count = 0; + for (int n = predict_lod[0][0]; n < predict_lod[0][1] - 1; n++) { + int argmax_idx = argmax(predict_batch + n * predict_shape[1], + predict_batch + (n + 1) * predict_shape[1]); + float max_value = predict_batch[n * predict_shape[1] + argmax_idx]; + if (blank - 1 - argmax_idx > 1e-5) { + score += max_value; + count += 1; } - LOGI("calc score: %f", score); - return score; - + } + if (count == 0) { + LOGE("calc score count 0"); + } else { + score /= count; + } + LOGI("calc score: %f", score); + return score; } - -NET_TYPE OCR_PPredictor::get_net_flag() const { - return NET_OCR; -} +NET_TYPE OCR_PPredictor::get_net_flag() const { return NET_OCR; } } \ No newline at end of file diff --git a/deploy/android_demo/app/src/main/cpp/ocr_ppredictor.h b/deploy/android_demo/app/src/main/cpp/ocr_ppredictor.h index eb2bc3bc989c5dd9a2c5a8aae3508ca733602bd7..0ec458a4952cbc605e9979ce7850bdeab36c4629 100644 --- a/deploy/android_demo/app/src/main/cpp/ocr_ppredictor.h +++ b/deploy/android_demo/app/src/main/cpp/ocr_ppredictor.h @@ -4,10 +4,10 @@ #pragma once -#include 
+#include "ppredictor.h" #include #include -#include "ppredictor.h" +#include namespace ppredictor { @@ -15,17 +15,18 @@ namespace ppredictor { * Config */ struct OCR_Config { - int thread_num = 4; // Thread num - paddle::lite_api::PowerMode mode = paddle::lite_api::LITE_POWER_HIGH; // PaddleLite Mode + int thread_num = 4; // Thread num + paddle::lite_api::PowerMode mode = + paddle::lite_api::LITE_POWER_HIGH; // PaddleLite Mode }; /** * PolyGone Result */ struct OCRPredictResult { - std::vector word_index; - std::vector> points; - float score; + std::vector word_index; + std::vector> points; + float score; }; /** @@ -35,78 +36,87 @@ struct OCRPredictResult { */ class OCR_PPredictor : public PPredictor_Interface { public: - OCR_PPredictor(const OCR_Config &config); - - virtual ~OCR_PPredictor() { - - } - - /** - * 初始化二个模型的Predictor - * @param det_model_content - * @param rec_model_content - * @return - */ - int init(const std::string &det_model_content, const std::string &rec_model_content); - int init_from_file(const std::string &det_model_path, const std::string &rec_model_path); - /** - * Return OCR result - * @param dims - * @param input_data - * @param input_len - * @param net_flag - * @param origin - * @return - */ - virtual std::vector - infer_ocr(const std::vector &dims, const float *input_data, int input_len, - int net_flag, cv::Mat &origin); - - - virtual NET_TYPE get_net_flag() const; - + OCR_PPredictor(const OCR_Config &config); + + virtual ~OCR_PPredictor() {} + + /** + * 初始化二个模型的Predictor + * @param det_model_content + * @param rec_model_content + * @return + */ + int init(const std::string &det_model_content, + const std::string &rec_model_content, + const std::string &cls_model_content); + int init_from_file(const std::string &det_model_path, + const std::string &rec_model_path, + const std::string &cls_model_path); + /** + * Return OCR result + * @param dims + * @param input_data + * @param input_len + * @param net_flag + * @param origin + * @return 
+ */ + virtual std::vector + infer_ocr(const std::vector &dims, const float *input_data, + int input_len, int net_flag, cv::Mat &origin); + + virtual NET_TYPE get_net_flag() const; private: - - /** - * calcul Polygone from the result image of first model - * @param pred - * @param output_height - * @param output_width - * @param origin - * @return - */ - std::vector>> - calc_filtered_boxes(const float *pred, int pred_size, int output_height, int output_width, - const cv::Mat &origin); - - /** - * infer for second model - * - * @param boxes - * @param origin - * @return - */ - std::vector - infer_rec(const std::vector>> &boxes, const cv::Mat &origin); - - /** - * Postprocess or sencod model to extract text - * @param res - * @return - */ - std::vector postprocess_rec_word_index(const PredictorOutput &res); - - /** - * calculate confidence of second model text result - * @param res - * @return - */ - float postprocess_rec_score(const PredictorOutput &res); - - std::unique_ptr _det_predictor; - std::unique_ptr _rec_predictor; - OCR_Config _config; - + /** + * calcul Polygone from the result image of first model + * @param pred + * @param output_height + * @param output_width + * @param origin + * @return + */ + std::vector>> + calc_filtered_boxes(const float *pred, int pred_size, int output_height, + int output_width, const cv::Mat &origin); + + /** + * infer for second model + * + * @param boxes + * @param origin + * @return + */ + std::vector + infer_rec(const std::vector>> &boxes, + const cv::Mat &origin); + + /** + * infer for cls model + * + * @param boxes + * @param origin + * @return + */ + cv::Mat infer_cls(const cv::Mat &origin, float thresh = 0.5); + + /** + * Postprocess or sencod model to extract text + * @param res + * @return + */ + std::vector postprocess_rec_word_index(const PredictorOutput &res); + + /** + * calculate confidence of second model text result + * @param res + * @return + */ + float postprocess_rec_score(const PredictorOutput &res); + + 
std::unique_ptr _det_predictor; + std::unique_ptr _rec_predictor; + std::unique_ptr _cls_predictor; + OCR_Config _config; }; } diff --git a/deploy/android_demo/app/src/main/java/com/baidu/paddle/lite/demo/ocr/OCRPredictorNative.java b/deploy/android_demo/app/src/main/java/com/baidu/paddle/lite/demo/ocr/OCRPredictorNative.java index 2e78a3ece96bb5e37bebcdda7ebc77060686b710..7499d4b92689645c0b1009256884733d392ff68d 100644 --- a/deploy/android_demo/app/src/main/java/com/baidu/paddle/lite/demo/ocr/OCRPredictorNative.java +++ b/deploy/android_demo/app/src/main/java/com/baidu/paddle/lite/demo/ocr/OCRPredictorNative.java @@ -29,7 +29,7 @@ public class OCRPredictorNative { public OCRPredictorNative(Config config) { this.config = config; loadLibrary(); - nativePointer = init(config.detModelFilename, config.recModelFilename, + nativePointer = init(config.detModelFilename, config.recModelFilename,config.clsModelFilename, config.cpuThreadNum, config.cpuPower); Log.i("OCRPredictorNative", "load success " + nativePointer); @@ -38,7 +38,7 @@ public class OCRPredictorNative { public void release() { if (nativePointer != 0) { nativePointer = 0; - destory(nativePointer); +// destory(nativePointer); } } @@ -55,10 +55,11 @@ public class OCRPredictorNative { public String cpuPower; public String detModelFilename; public String recModelFilename; + public String clsModelFilename; } - protected native long init(String detModelPath, String recModelPath, int threadNum, String cpuMode); + protected native long init(String detModelPath, String recModelPath,String clsModelPath, int threadNum, String cpuMode); protected native float[] forward(long pointer, float[] buf, float[] ddims, Bitmap originalImage); diff --git a/deploy/android_demo/app/src/main/java/com/baidu/paddle/lite/demo/ocr/Predictor.java b/deploy/android_demo/app/src/main/java/com/baidu/paddle/lite/demo/ocr/Predictor.java index 078bba286cc9cd5f9904e0594b5608c755a2b131..ddf69ab481618696189a7d0d45264791267e5631 100644 --- 
a/deploy/android_demo/app/src/main/java/com/baidu/paddle/lite/demo/ocr/Predictor.java +++ b/deploy/android_demo/app/src/main/java/com/baidu/paddle/lite/demo/ocr/Predictor.java @@ -121,7 +121,8 @@ public class Predictor { config.cpuThreadNum = cpuThreadNum; config.detModelFilename = realPath + File.separator + "ch_det_mv3_db_opt.nb"; config.recModelFilename = realPath + File.separator + "ch_rec_mv3_crnn_opt.nb"; - Log.e("Predictor", "model path" + config.detModelFilename + " ; " + config.recModelFilename); + config.clsModelFilename = realPath + File.separator + "cls_opt_arm.nb"; + Log.e("Predictor", "model path" + config.detModelFilename + " ; " + config.recModelFilename + ";" + config.clsModelFilename); config.cpuPower = cpuPowerMode; paddlePredictor = new OCRPredictorNative(config); diff --git a/deploy/cpp_infer/include/config.h b/deploy/cpp_infer/include/config.h index 9dc95eb8200ff203b7498bb41387f99e975345bf..a5f19c32839a3b3995e690c14ce5bb4c79db161b 100644 --- a/deploy/cpp_infer/include/config.h +++ b/deploy/cpp_infer/include/config.h @@ -41,13 +41,15 @@ public: this->use_mkldnn = bool(stoi(config_map_["use_mkldnn"])); + this->use_zero_copy_run = bool(stoi(config_map_["use_zero_copy_run"])); + this->max_side_len = stoi(config_map_["max_side_len"]); this->det_db_thresh = stod(config_map_["det_db_thresh"]); this->det_db_box_thresh = stod(config_map_["det_db_box_thresh"]); - this->det_db_box_thresh = stod(config_map_["det_db_box_thresh"]); + this->det_db_unclip_ratio = stod(config_map_["det_db_unclip_ratio"]); this->det_model_dir.assign(config_map_["det_model_dir"]); @@ -72,6 +74,8 @@ public: bool use_mkldnn = false; + bool use_zero_copy_run = false; + int max_side_len = 960; double det_db_thresh = 0.3; diff --git a/deploy/cpp_infer/include/ocr_cls.h b/deploy/cpp_infer/include/ocr_cls.h index 4d8f2a13791abd93df293ba2896e5cd610b3d939..5dbfbf5a4a541c9542e4abac17d04c94a7a34444 100644 --- a/deploy/cpp_infer/include/ocr_cls.h +++ b/deploy/cpp_infer/include/ocr_cls.h @@ 
-37,13 +37,14 @@ public: explicit Classifier(const std::string &model_dir, const bool &use_gpu, const int &gpu_id, const int &gpu_mem, const int &cpu_math_library_num_threads, - const bool &use_mkldnn, const double &cls_thresh) { + const bool &use_mkldnn, const bool &use_zero_copy_run, + const double &cls_thresh) { this->use_gpu_ = use_gpu; this->gpu_id_ = gpu_id; this->gpu_mem_ = gpu_mem; this->cpu_math_library_num_threads_ = cpu_math_library_num_threads; this->use_mkldnn_ = use_mkldnn; - + this->use_zero_copy_run_ = use_zero_copy_run; this->cls_thresh = cls_thresh; LoadModel(model_dir); diff --git a/deploy/cpp_infer/include/ocr_det.h b/deploy/cpp_infer/include/ocr_det.h index ed2667eecfea9a09d7da77df37f43a7b9e9bb349..0308d07f3bac67a275452500184e0959b16e8003 100644 --- a/deploy/cpp_infer/include/ocr_det.h +++ b/deploy/cpp_infer/include/ocr_det.h @@ -39,8 +39,8 @@ public: explicit DBDetector(const std::string &model_dir, const bool &use_gpu, const int &gpu_id, const int &gpu_mem, const int &cpu_math_library_num_threads, - const bool &use_mkldnn, const int &max_side_len, - const double &det_db_thresh, + const bool &use_mkldnn, const bool &use_zero_copy_run, + const int &max_side_len, const double &det_db_thresh, const double &det_db_box_thresh, const double &det_db_unclip_ratio, const bool &visualize) { @@ -49,6 +49,7 @@ public: this->gpu_mem_ = gpu_mem; this->cpu_math_library_num_threads_ = cpu_math_library_num_threads; this->use_mkldnn_ = use_mkldnn; + this->use_zero_copy_run_ = use_zero_copy_run; this->max_side_len_ = max_side_len; @@ -75,6 +76,7 @@ private: int gpu_mem_ = 4000; int cpu_math_library_num_threads_ = 4; bool use_mkldnn_ = false; + bool use_zero_copy_run_ = false; int max_side_len_ = 960; diff --git a/deploy/cpp_infer/include/ocr_rec.h b/deploy/cpp_infer/include/ocr_rec.h index d2180b33f23eba2013c6b65b4c9a16eb8a48f0cc..68237170beabb1ecd386821d97c2eefb16435345 100644 --- a/deploy/cpp_infer/include/ocr_rec.h +++ b/deploy/cpp_infer/include/ocr_rec.h @@ 
-39,12 +39,14 @@ public: explicit CRNNRecognizer(const std::string &model_dir, const bool &use_gpu, const int &gpu_id, const int &gpu_mem, const int &cpu_math_library_num_threads, - const bool &use_mkldnn, const string &label_path) { + const bool &use_mkldnn, const bool &use_zero_copy_run, + const string &label_path) { this->use_gpu_ = use_gpu; this->gpu_id_ = gpu_id; this->gpu_mem_ = gpu_mem; this->cpu_math_library_num_threads_ = cpu_math_library_num_threads; this->use_mkldnn_ = use_mkldnn; + this->use_zero_copy_run_ = use_zero_copy_run; this->label_list_ = Utility::ReadDict(label_path); this->label_list_.push_back(" "); @@ -66,6 +68,7 @@ private: int gpu_mem_ = 4000; int cpu_math_library_num_threads_ = 4; bool use_mkldnn_ = false; + bool use_zero_copy_run_ = false; std::vector label_list_; diff --git a/deploy/cpp_infer/src/main.cpp b/deploy/cpp_infer/src/main.cpp index d5c399fa27112b348607da16786f40b91a8e1eac..989424d0b58bbf6c307dc07d3e461e93ce0ecc10 100644 --- a/deploy/cpp_infer/src/main.cpp +++ b/deploy/cpp_infer/src/main.cpp @@ -48,17 +48,19 @@ int main(int argc, char **argv) { cv::Mat srcimg = cv::imread(img_path, cv::IMREAD_COLOR); - DBDetector det(config.det_model_dir, config.use_gpu, config.gpu_id, - config.gpu_mem, config.cpu_math_library_num_threads, - config.use_mkldnn, config.max_side_len, config.det_db_thresh, - config.det_db_box_thresh, config.det_db_unclip_ratio, - config.visualize); + DBDetector det( + config.det_model_dir, config.use_gpu, config.gpu_id, config.gpu_mem, + config.cpu_math_library_num_threads, config.use_mkldnn, + config.use_zero_copy_run, config.max_side_len, config.det_db_thresh, + config.det_db_box_thresh, config.det_db_unclip_ratio, config.visualize); Classifier cls(config.cls_model_dir, config.use_gpu, config.gpu_id, config.gpu_mem, config.cpu_math_library_num_threads, - config.use_mkldnn, config.cls_thresh); + config.use_mkldnn, config.use_zero_copy_run, + config.cls_thresh); CRNNRecognizer rec(config.rec_model_dir, 
config.use_gpu, config.gpu_id, config.gpu_mem, config.cpu_math_library_num_threads, - config.use_mkldnn, config.char_list_file); + config.use_mkldnn, config.use_zero_copy_run, + config.char_list_file); auto start = std::chrono::system_clock::now(); std::vector>> boxes; diff --git a/deploy/cpp_infer/src/ocr_cls.cpp b/deploy/cpp_infer/src/ocr_cls.cpp index 15604fe2df22aed9d8237bf92d8f776c0dfa15b2..23a1c79c0eb15fdf52266f9fd7aff0889c992da1 100644 --- a/deploy/cpp_infer/src/ocr_cls.cpp +++ b/deploy/cpp_infer/src/ocr_cls.cpp @@ -34,12 +34,22 @@ cv::Mat Classifier::Run(cv::Mat &img) { this->permute_op_.Run(&resize_img, input.data()); - auto input_names = this->predictor_->GetInputNames(); - auto input_t = this->predictor_->GetInputTensor(input_names[0]); - input_t->Reshape({1, 3, resize_img.rows, resize_img.cols}); - input_t->copy_from_cpu(input.data()); - - this->predictor_->ZeroCopyRun(); + // Inference. + if (this->use_zero_copy_run_) { + auto input_names = this->predictor_->GetInputNames(); + auto input_t = this->predictor_->GetInputTensor(input_names[0]); + input_t->Reshape({1, 3, resize_img.rows, resize_img.cols}); + input_t->copy_from_cpu(input.data()); + this->predictor_->ZeroCopyRun(); + } else { + paddle::PaddleTensor input_t; + input_t.shape = {1, 3, resize_img.rows, resize_img.cols}; + input_t.data = + paddle::PaddleBuf(input.data(), input.size() * sizeof(float)); + input_t.dtype = PaddleDType::FLOAT32; + std::vector outputs; + this->predictor_->Run({input_t}, &outputs, 1); + } std::vector softmax_out; std::vector label_out; diff --git a/deploy/cpp_infer/src/ocr_det.cpp b/deploy/cpp_infer/src/ocr_det.cpp index c87b653ceab011ef0593e7fb87358325deaf882b..56fbace8cc6fa27f8172bed248573f15d0c98dac 100644 --- a/deploy/cpp_infer/src/ocr_det.cpp +++ b/deploy/cpp_infer/src/ocr_det.cpp @@ -31,7 +31,8 @@ void DBDetector::LoadModel(const std::string &model_dir) { } // false for zero copy tensor - config.SwitchUseFeedFetchOps(false); + // true for commom tensor + 
config.SwitchUseFeedFetchOps(!this->use_zero_copy_run_); // true for multiple input config.SwitchSpecifyInputNames(true); @@ -59,12 +60,22 @@ void DBDetector::Run(cv::Mat &img, std::vector input(1 * 3 * resize_img.rows * resize_img.cols, 0.0f); this->permute_op_.Run(&resize_img, input.data()); - auto input_names = this->predictor_->GetInputNames(); - auto input_t = this->predictor_->GetInputTensor(input_names[0]); - input_t->Reshape({1, 3, resize_img.rows, resize_img.cols}); - input_t->copy_from_cpu(input.data()); - - this->predictor_->ZeroCopyRun(); + // Inference. + if (this->use_zero_copy_run_) { + auto input_names = this->predictor_->GetInputNames(); + auto input_t = this->predictor_->GetInputTensor(input_names[0]); + input_t->Reshape({1, 3, resize_img.rows, resize_img.cols}); + input_t->copy_from_cpu(input.data()); + this->predictor_->ZeroCopyRun(); + } else { + paddle::PaddleTensor input_t; + input_t.shape = {1, 3, resize_img.rows, resize_img.cols}; + input_t.data = + paddle::PaddleBuf(input.data(), input.size() * sizeof(float)); + input_t.dtype = PaddleDType::FLOAT32; + std::vector outputs; + this->predictor_->Run({input_t}, &outputs, 1); + } std::vector out_data; auto output_names = this->predictor_->GetOutputNames(); diff --git a/deploy/cpp_infer/src/ocr_rec.cpp b/deploy/cpp_infer/src/ocr_rec.cpp index 8b5eaf9c59cc63f25880a8a9c3c503e248a2bf9b..0e06b8b37ae3e5937a80fc138945296c29acdfe5 100644 --- a/deploy/cpp_infer/src/ocr_rec.cpp +++ b/deploy/cpp_infer/src/ocr_rec.cpp @@ -41,18 +41,29 @@ void CRNNRecognizer::Run(std::vector>> boxes, this->permute_op_.Run(&resize_img, input.data()); - auto input_names = this->predictor_->GetInputNames(); - auto input_t = this->predictor_->GetInputTensor(input_names[0]); - input_t->Reshape({1, 3, resize_img.rows, resize_img.cols}); - input_t->copy_from_cpu(input.data()); - - this->predictor_->ZeroCopyRun(); + // Inference. 
+ if (this->use_zero_copy_run_) { + auto input_names = this->predictor_->GetInputNames(); + auto input_t = this->predictor_->GetInputTensor(input_names[0]); + input_t->Reshape({1, 3, resize_img.rows, resize_img.cols}); + input_t->copy_from_cpu(input.data()); + this->predictor_->ZeroCopyRun(); + } else { + paddle::PaddleTensor input_t; + input_t.shape = {1, 3, resize_img.rows, resize_img.cols}; + input_t.data = + paddle::PaddleBuf(input.data(), input.size() * sizeof(float)); + input_t.dtype = PaddleDType::FLOAT32; + std::vector outputs; + this->predictor_->Run({input_t}, &outputs, 1); + } std::vector rec_idx; auto output_names = this->predictor_->GetOutputNames(); auto output_t = this->predictor_->GetOutputTensor(output_names[0]); auto rec_idx_lod = output_t->lod(); auto shape_out = output_t->shape(); + int out_num = std::accumulate(shape_out.begin(), shape_out.end(), 1, std::multiplies()); @@ -122,7 +133,8 @@ void CRNNRecognizer::LoadModel(const std::string &model_dir) { } // false for zero copy tensor - config.SwitchUseFeedFetchOps(false); + // true for commom tensor + config.SwitchUseFeedFetchOps(!this->use_zero_copy_run_); // true for multiple input config.SwitchSpecifyInputNames(true); diff --git a/deploy/cpp_infer/tools/config.txt b/deploy/cpp_infer/tools/config.txt index a049fc7d9dfaac88e69581b7c0aad8af8a9efaab..40beea3a2e6f0260a42202d6411ffb10907bf871 100644 --- a/deploy/cpp_infer/tools/config.txt +++ b/deploy/cpp_infer/tools/config.txt @@ -4,6 +4,7 @@ gpu_id 0 gpu_mem 4000 cpu_math_library_num_threads 10 use_mkldnn 0 +use_zero_copy_run 1 # det config max_side_len 960 diff --git a/deploy/hubserving/ocr_det/params.py b/deploy/hubserving/ocr_det/params.py index 0b950114f82d88f20d2ce521628ea9dda7740ab4..e88ab45c7bb548ef971465d4aaefb30d247ab17f 100644 --- a/deploy/hubserving/ocr_det/params.py +++ b/deploy/hubserving/ocr_det/params.py @@ -36,4 +36,6 @@ def read_params(): # cfg.rec_char_dict_path = "./ppocr/utils/ppocr_keys_v1.txt" # cfg.use_space_char = True - 
return cfg \ No newline at end of file + cfg.use_zero_copy_run = False + + return cfg diff --git a/deploy/hubserving/ocr_rec/params.py b/deploy/hubserving/ocr_rec/params.py index fe93fc0870a1bb1d050285ca858de4cddb6b3a61..59772e2163d1d5f8279dee85432b5bf93502914e 100644 --- a/deploy/hubserving/ocr_rec/params.py +++ b/deploy/hubserving/ocr_rec/params.py @@ -33,7 +33,11 @@ def read_params(): cfg.rec_image_shape = "3, 32, 320" cfg.rec_char_type = 'ch' cfg.rec_batch_num = 30 + cfg.max_text_length = 25 + cfg.rec_char_dict_path = "./ppocr/utils/ppocr_keys_v1.txt" cfg.use_space_char = True - return cfg \ No newline at end of file + cfg.use_zero_copy_run = False + + return cfg diff --git a/deploy/hubserving/ocr_system/params.py b/deploy/hubserving/ocr_system/params.py index 5b3bb1ea44b6cd262283797807c2c77646202fe8..0ff56d37d50b30b09bb13b529a48a260dfe8f84a 100644 --- a/deploy/hubserving/ocr_system/params.py +++ b/deploy/hubserving/ocr_system/params.py @@ -33,7 +33,11 @@ def read_params(): cfg.rec_image_shape = "3, 32, 320" cfg.rec_char_type = 'ch' cfg.rec_batch_num = 30 + cfg.max_text_length = 25 + cfg.rec_char_dict_path = "./ppocr/utils/ppocr_keys_v1.txt" cfg.use_space_char = True - return cfg \ No newline at end of file + cfg.use_zero_copy_run = False + + return cfg diff --git a/deploy/ios_demo/download_dependencies.sh b/deploy/ios_demo/download_dependencies.sh index 40a8b3d259c83614c9bd210815fbe2f3bf49a607..1d09dac9cced12d63e44e4d6ea4870b542f7516b 100755 --- a/deploy/ios_demo/download_dependencies.sh +++ b/deploy/ios_demo/download_dependencies.sh @@ -26,7 +26,7 @@ download_and_extract() { } echo -e "[Download ios ocr demo denpendancy]\n" -download_and_extract "${OCR_MODEL_URL}" "./ios-demo/ocr_demo/models" -download_and_extract "${PADDLE_LITE_LIB_URL}" "./ios-demo/ocr_demo" -download_and_extract "${OPENCV3_FRAMEWORK_URL}" "./ios-demo/ocr_demo" +download_and_extract "${OCR_MODEL_URL}" "./ocr_demo/models" +download_and_extract "${PADDLE_LITE_LIB_URL}" "./ocr_demo" 
+download_and_extract "${OPENCV3_FRAMEWORK_URL}" "./ocr_demo" echo -e "[done]\n" diff --git a/deploy/lite/readme_en.md b/deploy/lite/readme_en.md index 00bb5e587de59e7d7001a365a52606323c61541e..bf2f4b2212bf4d362fe2127da66127ab05766668 100644 --- a/deploy/lite/readme_en.md +++ b/deploy/lite/readme_en.md @@ -13,7 +13,7 @@ deployment solutions for end-side deployment issues. - Computer (for Compiling Paddle Lite) - Mobile phone (arm7 or arm8) -## 2. Build ncnn library +## 2. Build PaddleLite library [build for Docker](https://paddle-lite.readthedocs.io/zh/latest/user_guides/source_compile.html#docker) [build for Linux](https://paddle-lite.readthedocs.io/zh/latest/user_guides/source_compile.html#android) [build for MAC OS](https://paddle-lite.readthedocs.io/zh/latest/user_guides/source_compile.html#id13) diff --git a/deploy/pdserving/det_local_server.py b/deploy/pdserving/det_local_server.py index 78970af4d1a8a89f976f48f2c29ec97732afa0ce..eb7948daadd018810997bba78367e86aa3398e31 100644 --- a/deploy/pdserving/det_local_server.py +++ b/deploy/pdserving/det_local_server.py @@ -23,7 +23,7 @@ from paddle_serving_app.reader import Div, Normalize, Transpose from paddle_serving_app.reader import DBPostProcess, FilterBoxes if sys.argv[1] == 'gpu': from paddle_serving_server_gpu.web_service import WebService -elif sys.argv[1] == 'cpu' +elif sys.argv[1] == 'cpu': from paddle_serving_server.web_service import WebService import time import re @@ -67,11 +67,13 @@ class OCRService(WebService): ocr_service = OCRService(name="ocr") ocr_service.load_model_config("ocr_det_model") +ocr_service.init_det() if sys.argv[1] == 'gpu': ocr_service.set_gpus("0") ocr_service.prepare_server(workdir="workdir", port=9292, device="gpu", gpuid=0) + ocr_service.run_debugger_service(gpu=True) elif sys.argv[1] == 'cpu': ocr_service.prepare_server(workdir="workdir", port=9292) + ocr_service.run_debugger_service() ocr_service.init_det() -ocr_service.run_debugger_service() ocr_service.run_web_service() diff 
--git a/deploy/pdserving/ocr_local_server.py b/deploy/pdserving/ocr_local_server.py index f7458c3036734e4bb6e554097029270e11912a3a..de5b3d13f12afd4a84c5d46625682c42f418d6bb 100644 --- a/deploy/pdserving/ocr_local_server.py +++ b/deploy/pdserving/ocr_local_server.py @@ -104,10 +104,11 @@ class OCRService(WebService): ocr_service = OCRService(name="ocr") ocr_service.load_model_config("ocr_rec_model") -ocr_service.prepare_server(workdir="workdir", port=9292) ocr_service.init_det_debugger(det_model_config="ocr_det_model") if sys.argv[1] == 'gpu': + ocr_service.prepare_server(workdir="workdir", port=9292, device="gpu", gpuid=0) ocr_service.run_debugger_service(gpu=True) elif sys.argv[1] == 'cpu': + ocr_service.prepare_server(workdir="workdir", port=9292, device="cpu") ocr_service.run_debugger_service() ocr_service.run_web_service() diff --git a/deploy/pdserving/readme.md b/deploy/pdserving/readme.md index 9472e94cffcd483a85850f6e7ea9c8bc172aaf3b..f9ad80b896be0be29e3a7bb17e4aa119af81d5c4 100644 --- a/deploy/pdserving/readme.md +++ b/deploy/pdserving/readme.md @@ -55,6 +55,23 @@ tar -xzvf ocr_det.tar.gz ``` 执行上述命令会下载`db_crnn_mobile`的模型,如果想要下载规模更大的`db_crnn_server`模型,可以在下载预测模型并解压之后。参考[如何从Paddle保存的预测模型转为Paddle Serving格式可部署的模型](https://github.com/PaddlePaddle/Serving/blob/develop/doc/INFERENCE_TO_SERVING_CN.md)。 +我们以`ch_rec_r34_vd_crnn`模型作为例子,下载链接在: + +``` +wget --no-check-certificate https://paddleocr.bj.bcebos.com/ch_models/ch_rec_r34_vd_crnn_infer.tar +tar xf ch_rec_r34_vd_crnn_infer.tar +``` +因此我们按照Serving模型转换教程,运行下列python文件。 +``` +from paddle_serving_client.io import inference_model_to_serving +inference_model_dir = "ch_rec_r34_vd_crnn" +serving_client_dir = "serving_client_dir" +serving_server_dir = "serving_server_dir" +feed_var_names, fetch_var_names = inference_model_to_serving( + inference_model_dir, serving_client_dir, serving_server_dir, model_filename="model", params_filename="params") +``` +最终会在`serving_client_dir`和`serving_server_dir`生成客户端和服务端的模型配置。 + ### 3. 
启动服务 启动服务可以根据实际需求选择启动`标准版`或者`快速版`,两种方式的对比如下表: diff --git a/deploy/pdserving/rec_local_server.py b/deploy/pdserving/rec_local_server.py index fbe67aafee5c8dcae269cd4ad6f6100ed514f0b7..ba021c1cd5054071eb115b3e6e9c64cb572ff871 100644 --- a/deploy/pdserving/rec_local_server.py +++ b/deploy/pdserving/rec_local_server.py @@ -22,7 +22,10 @@ from paddle_serving_client import Client from paddle_serving_app.reader import Sequential, URL2Image, ResizeByFactor from paddle_serving_app.reader import Div, Normalize, Transpose from paddle_serving_app.reader import DBPostProcess, FilterBoxes, GetRotateCropImage, SortedBoxes -from paddle_serving_server_gpu.web_service import WebService +if sys.argv[1] == 'gpu': + from paddle_serving_server_gpu.web_service import WebService +elif sys.argv[1] == 'cpu': + from paddle_serving_server.web_service import WebService import time import re import base64 @@ -65,8 +68,12 @@ class OCRService(WebService): ocr_service = OCRService(name="ocr") ocr_service.load_model_config("ocr_rec_model") -ocr_service.set_gpus("0") ocr_service.init_rec() -ocr_service.prepare_server(workdir="workdir", port=9292, device="gpu", gpuid=0) -ocr_service.run_debugger_service() +if sys.argv[1] == 'gpu': + ocr_service.set_gpus("0") + ocr_service.prepare_server(workdir="workdir", port=9292, device="gpu", gpuid=0) + ocr_service.run_debugger_service(gpu=True) +elif sys.argv[1] == 'cpu': + ocr_service.prepare_server(workdir="workdir", port=9292, device="cpu") + ocr_service.run_debugger_service() ocr_service.run_web_service() diff --git a/doc/doc_ch/FAQ.md b/doc/doc_ch/FAQ.md index 38b8baeed585fbd58a56eca310cf2dbe4392b5f7..f782a1643c4f75cb82745000a24a8a17fb0ff4b8 100644 --- a/doc/doc_ch/FAQ.md +++ b/doc/doc_ch/FAQ.md @@ -1,51 +1,495 @@ -## FAQ +# FAQ -1. **预测报错:got an unexpected keyword argument 'gradient_clip'** -安装的paddle版本不对,目前本项目仅支持paddle1.7,近期会适配到1.8。 +## 写在前面 -2. 
**转换attention识别模型时报错:KeyError: 'predict'** -问题已解决,请更新到最新代码。 +- 我们收集整理了开源以来在issues和用户群中的常见问题并且给出了简要解答,旨在为OCR的开发者提供一些参考,也希望帮助大家少走一些弯路。 -3. **关于推理速度** -图片中的文字较多时,预测时间会增,可以使用--rec_batch_num设置更小预测batch num,默认值为30,可以改为10或其他数值。 +- OCR领域大佬众多,本文档回答主要依赖有限的项目实践,难免挂一漏万,如有遗漏和不足,也**希望有识之士帮忙补充和修正**,万分感谢。 -4. **服务部署与移动端部署** -预计6月中下旬会先后发布基于Serving的服务部署方案和基于Paddle Lite的移动端部署方案,欢迎持续关注。 -5. **自研算法发布时间** -自研算法SAST、SRN、End2End-PSL都将在7-8月陆续发布,敬请期待。 +## PaddleOCR常见问题汇总(持续更新) -6. **如何在Windows或Mac系统上运行** -PaddleOCR已完成Windows和Mac系统适配,运行时注意两点:1、在[快速安装](./installation.md)时,如果不想安装docker,可跳过第一步,直接从第二步安装paddle开始。2、inference模型下载时,如果没有安装wget,可直接点击模型链接或将链接地址复制到浏览器进行下载,并解压放置到相应目录。 +* [【精选】OCR精选10个问题](#OCR精选10个问题) +* [【理论篇】OCR通用21个问题](#OCR通用问题) + * [基础知识3题](#基础知识) + * [数据集4题](#数据集) + * [模型训练调优6题](#模型训练调优) + * [预测部署8题](#预测部署) +* [【实战篇】PaddleOCR实战53个问题](#PaddleOCR实战问题) + * [使用咨询17题](#使用咨询) + * [数据集9题](#数据集) + * [模型训练调优13题](#模型训练调优) + * [预测部署14题](#预测部署) -7. **超轻量模型和通用OCR模型的区别** -目前PaddleOCR开源了2个中文模型,分别是8.6M超轻量中文模型和通用中文OCR模型。两者对比信息如下: - - 相同点:两者使用相同的**算法**和**训练数据**; - - 不同点:不同之处在于**骨干网络**和**通道参数**,超轻量模型使用MobileNetV3作为骨干网络,通用模型使用Resnet50_vd作为检测模型backbone,Resnet34_vd作为识别模型backbone,具体参数差异可对比两种模型训练的配置文件. + + + +## 【精选】OCR精选10个问题 + +#### Q1.1.1:基于深度学习的文字检测方法有哪几种?各有什么优缺点? + +**A**:常用的基于深度学习的文字检测方法一般可以分为基于回归的、基于分割的两大类,当然还有一些将两者进行结合的方法。 + +(1)基于回归的方法分为box回归和像素值回归。a. 采用box回归的方法主要有CTPN、Textbox系列和EAST,这类算法对规则形状文本检测效果较好,但无法准确检测不规则形状文本。 b. 像素值回归的方法主要有CRAFT和SA-Text,这类算法能够检测弯曲文本且对小文本效果优秀但是实时性能不够。 + +(2)基于分割的算法,如PSENet,这类算法不受文本形状的限制,对各种形状的文本都能取得较好的效果,但是往往后处理比较复杂,导致耗时严重。目前也有一些算法专门针对这个问题进行改进,如DB,将二值化进行近似,使其可导,融入训练,从而获取更准确的边界,大大降低了后处理的耗时。 + +#### Q1.1.2:对于中文行文本识别,CTC和Attention哪种更优? + +**A**:(1)从效果上来看,通用OCR场景CTC的识别效果优于Attention,因为带识别的字典中的字符比较多,常用中文汉字三千字以上,如果训练样本不足的情况下,对于这些字符的序列关系挖掘比较困难。中文场景下Attention模型的优势无法体现。而且Attention适合短语句识别,对长句子识别比较差。 + +(2)从训练和预测速度上,Attention的串行解码结构限制了预测速度,而CTC网络结构更高效,预测速度上更有优势。 + +#### Q1.1.3:弯曲形变的文字识别需要怎么处理?TPS应用场景是什么,是否好用? 
+ +**A**:(1)在大多数情况下,如果遇到的场景弯曲形变不是太严重,检测4个顶点,然后直接通过仿射变换转正识别就足够了。 + +(2)如果不能满足需求,可以尝试使用TPS(Thin Plate Spline),即薄板样条插值。TPS是一种插值算法,经常用于图像变形等,通过少量的控制点就可以驱动图像进行变化。一般用在有弯曲形变的文本识别中,当检测到不规则的/弯曲的(如,使用基于分割的方法检测算法)文本区域,往往先使用TPS算法对文本区域矫正成矩形再进行识别,如,STAR-Net、RARE等识别算法中引入了TPS模块。 +**Warning**:TPS看起来美好,在实际应用时经常发现并不够鲁棒,并且会增加耗时,需要谨慎使用。 + +#### Q1.1.4:简单的对于精度要求不高的OCR任务,数据集需要准备多少张呢? + +**A**:(1)训练数据的数量和需要解决问题的复杂度有关系。难度越大,精度要求越高,则数据集需求越大,而且一般情况实际中的训练数据越多效果越好。 + +(2)对于精度要求不高的场景,检测任务和识别任务需要的数据量是不一样的。对于检测任务,500张图像可以保证基本的检测效果。对于识别任务,需要保证识别字典中每个字符出现在不同场景的行文本图像数目需要大于200张(举例,如果有字典中有5个字,每个字都需要出现在200张图片以上,那么最少要求的图像数量应该在200-1000张之间),这样可以保证基本的识别效果。 + +#### Q1.1.5:背景干扰的文字(如印章盖到落款上,需要识别落款或者印章中的文字),如何识别? + +**A**:(1)在人眼确认可识别的条件下,对于背景有干扰的文字,首先要保证检测框足够准确,如果检测框不准确,需要考虑是否可以通过过滤颜色等方式对图像预处理并且增加更多相关的训练数据;在识别的部分,注意在训练数据中加入背景干扰类的扩增图像。 + +(2)如果MobileNet模型不能满足需求,可以尝试ResNet系列大模型来获得更好的效果 +。 + +#### Q1.1.6:OCR领域常用的评估指标是什么? + +**A**:对于两阶段的可以分开来看,分别是检测和识别阶段 + +(1)检测阶段:先按照检测框和标注框的IOU评估,IOU大于某个阈值判断为检测准确。这里检测框和标注框不同于一般的通用目标检测框,是采用多边形进行表示。检测准确率:正确的检测框个数在全部检测框的占比,主要是判断检测指标。检测召回率:正确的检测框个数在全部标注框的占比,主要是判断漏检的指标。 + + +(2)识别阶段: +字符识别准确率,即正确识别的文本行占标注的文本行数量的比例,只有整行文本识别对才算正确识别。 + +(3)端到端统计: +端对端召回率:准确检测并正确识别文本行在全部标注文本行的占比; +端到端准确率:准确检测并正确识别文本行在 检测到的文本行数量 的占比; +准确检测的标准是检测框与标注框的IOU大于某个阈值,正确识别的的检测框中的文本与标注的文本相同。 + + +#### Q1.1.7:单张图上多语种并存识别(如单张图印刷体和手写文字并存),应该如何处理? + +**A**:单张图像中存在多种类型文本的情况很常见,典型的以学生的试卷为代表,一张图像同时存在手写体和印刷体两种文本,这类情况下,可以尝试”1个检测模型+1个N分类模型+N个识别模型”的解决方案。 +其中不同类型文本共用同一个检测模型,N分类模型指额外训练一个分类器,将检测到的文本进行分类,如手写+印刷的情况就是二分类,N种语言就是N分类,在识别的部分,针对每个类型的文本单独训练一个识别模型,如手写+印刷的场景,就需要训练一个手写体识别模型,一个印刷体识别模型,如果一个文本框的分类结果是手写体,那么就传给手写体识别模型进行识别,其他情况同理。 + +#### Q1.1.8:请问PaddleOCR项目中的中文超轻量和通用模型用了哪些数据集?训练多少样本,gpu什么配置,跑了多少个epoch,大概跑了多久? 
+ +**A**: +(1)检测的话,LSVT街景数据集共3W张图像,超轻量模型,150epoch左右,2卡V100 跑了不到2天;通用模型:2卡V100 150epoch 不到4天。 +(2) +识别的话,520W左右的数据集(真实数据26W+合成数据500W)训练,超轻量模型:4卡V100,总共训练了5天左右。通用模型:4卡V100,共训练6天。 + +超轻量模型训练分为2个阶段: +(1)全量数据训练50epoch,耗时3天 +(2)合成数据+真实数据按照1:1数据采样,进行finetune训练200epoch,耗时2天 + +通用模型训练: +真实数据+合成数据,动态采样(1:1)训练,200epoch,耗时 6天左右。 + + +#### Q1.1.9:PaddleOCR模型推理方式有几种?各自的优缺点是什么 + +**A**:目前推理方式支持基于训练引擎推理和基于预测引擎推理。 + +(1)基于训练引擎推理不需要转换模型,但是需要先组网再load参数,语言只支持python,不适合系统集成。 + +(2)基于预测引擎的推理需要先转换模型为inference格式,然后可以进行不需要组网的推理,语言支持c++和python,适合系统集成。 + +#### Q1.1.10:PaddleOCR中,对于模型预测加速,CPU加速的途径有哪些?基于TensorRT加速GPU对输入有什么要求? + +**A**:(1)CPU可以使用mkldnn进行加速;对于python inference的话,可以把enable_mkldnn改为true,[参考代码](https://github.com/PaddlePaddle/PaddleOCR/blob/549108fe0aa0d87c0a3b2d471f1c653e89daab80/tools/infer/utility.py#L73),对于cpp inference的话,在配置文件里面配置use_mkldnn 1即可,[参考代码](https://github.com/PaddlePaddle/PaddleOCR/blob/549108fe0aa0d87c0a3b2d471f1c653e89daab80/deploy/cpp_infer/tools/config.txt#L6) + +(2)GPU需要注意变长输入问题等,TRT6 之后才支持变长输入 + + +## 【理论篇】OCR通用问题 +### 基础知识 + +#### Q2.1.1:CRNN能否识别两行的文字?还是说必须一行? + +**A**:CRNN是一种基于1D-CTC的算法,其原理决定无法识别2行或多行的文字,只能单行识别。 + +#### Q2.1.2:怎么判断行文本图像是否是颠倒的? + +**A**:有两种方案:(1)原始图像和颠倒图像都进行识别预测,取得分较高的为识别结果。 +(2)训练一个正常图像和颠倒图像的方向分类器进行判断。 + +#### Q2.1.3:目前OCR普遍是二阶段,端到端的方案在业界落地情况如何? + +**A**:端到端在文字分布密集的业务场景,效率会比较有保证,精度的话看自己业务数据积累情况,如果行级别的识别数据积累比较多的话two-stage会比较好。百度的落地场景,比如工业仪表识别、车牌识别都用到端到端解决方案。 + + +### 数据集 + +#### Q2.2.1:支持空格的模型,标注数据的时候是不是要标注空格?中间几个空格都要标注出来么? + +**A**:如果需要检测和识别模型,就需要在标注的时候把空格标注出来,而且在字典中增加空格对应的字符。标注过程中,如果中间有多个连续空格,标注一个就行。 + +#### Q2.2.2:如果考虑支持竖排文字识别,相关的数据集如何合成? + +**A**:竖排文字与横排文字合成方式相同,只是选择了垂直字体。合成工具推荐:[text_renderer](https://github.com/Sanster/text_renderer) + +#### Q2.2.3:训练文字识别模型,真实数据有30w,合成数据有500w,需要做样本均衡吗? + +**A**:需要,一般需要保证一个batch中真实数据样本和合成数据样本的比例是1:1~1:3左右效果比较理想。如果合成数据过大,会过拟合到合成数据,预测效果往往不佳。还有一种**启发性**的尝试是可以先用大量合成数据训练一个base模型,然后再用真实数据微调,在一些简单场景效果也是会有提升的。 + +#### Q2.2.4:请问一下,竖排文字识别时候,字的特征已经变了,这种情况在数据集和字典标注是新增一个类别还是多个角度的字共享一个类别? 
+ +**A**:可以根据实际场景做不同的尝试,共享一个类别是可以收敛,效果也还不错。但是如果分开训练,同类样本之间一致性更好,更容易收敛,识别效果会更优。 + +### 模型训练调优 + +#### Q2.3.1:如何更换文本检测/识别的backbone? +**A**:无论是文字检测,还是文字识别,骨干网络的选择是预测效果和预测效率的权衡。一般,选择更大规模的骨干网络,例如ResNet101_vd,则检测或识别更准确,但预测耗时相应也会增加。而选择更小规模的骨干网络,例如MobileNetV3_small_x0_35,则预测更快,但检测或识别的准确率会大打折扣。幸运的是不同骨干网络的检测或识别效果与在ImageNet数据集图像1000分类任务效果正相关。[**飞桨图像分类套件PaddleClas**](https://github.com/PaddlePaddle/PaddleClas)汇总了ResNet_vd、Res2Net、HRNet、MobileNetV3、GhostNet等23种系列的分类网络结构,在上述图像分类任务的top1识别准确率,GPU(V100和T4)和CPU(骁龙855)的预测耗时以及相应的[**117个预训练模型下载地址**](https://paddleclas.readthedocs.io/zh_CN/latest/models/models_intro.html)。 + + (1)文字检测骨干网络的替换,主要是确定类似于ResNet的4个stages,以方便集成后续的类似FPN的检测头。此外,对于文字检测问题,使用ImageNet训练的分类预训练模型,可以加速收敛和效果提升。 + + (2)文字识别的骨干网络的替换,需要注意网络宽高stride的下降位置。由于文本识别一般宽高比例很大,因此高度下降频率少一些,宽度下降频率多一些。可以参考PaddleOCR中[MobileNetV3骨干网络](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/ppocr/modeling/backbones/rec_mobilenet_v3.py)的改动。 + +#### Q2.3.2:文本识别训练不加LSTM是否可以收敛? + +**A**:理论上是可以收敛的,加上LSTM模块主要是为了挖掘文字之间的序列关系,提升识别效果。对于有明显上下文语义的场景效果会比较明显。 + +#### Q2.3.3:文本识别中LSTM和GRU如何选择? + +**A**:从项目实践经验来看,序列模块采用LSTM的识别效果优于GRU,但是LSTM的计算量比GRU大一些,可以根据自己实际情况选择。 + +#### Q2.3.4:对于CRNN模型,backbone采用DenseNet和ResNet_vd,哪种网络结构更好? + +**A**:Backbone的识别效果在CRNN模型上的效果,与Imagenet 1000 图像分类任务上识别效果和效率一致。在图像分类任务上ResNet_vd(79%+)的识别精度明显优于DenseNet(77%+),此外对于GPU,Nvidia针对ResNet系列模型做了优化,预测效率更高,所以相对而言,ResNet_vd是较好选择。如果是移动端,可以优先考虑MobileNetV3系列。 + +#### Q2.3.5:训练识别时,如何选择合适的网络输入shape? + +**A**:一般高度采用32,最长宽度的选择,有两种方法: + +(1)统计训练样本图像的宽高比分布。最大宽高比的选取考虑满足80%的训练样本。 + +(2)统计训练样本文字数目。最长字符数目的选取考虑满足80%的训练样本。然后中文字符长宽比近似认为是1,英文认为3:1,预估一个最长宽度。 + +#### Q2.3.6:如何识别文字比较长的文本? + +**A**:在中文识别模型训练时,并不是采用直接将训练样本缩放到[3,32,320]进行训练,而是先等比例缩放图像,保证图像高度为32,宽度不足320的部分补0,宽高比大于10的样本直接丢弃。预测时,如果是单张图像预测,则按上述操作直接对图像缩放,不做宽度320的限制。如果是多张图预测,则采用batch方式预测,每个batch的宽度动态变换,采用这个batch中最长宽度。 + +### 预测部署 + +#### Q2.4.1:请问对于图片中的密集文字,有什么好的处理办法吗? 
+ +**A**:可以先试用预训练模型测试一下,例如DB+CRNN,判断下密集文字图片中是检测还是识别的问题,然后针对性的改善。还有一种是如果图像中密集文字较小,可以尝试增大图像分辨率,对图像进行一定范围内的拉伸,将文字稀疏化,提高识别效果。 + +#### Q2.4.2:对于一些在识别时稍微模糊的文本,有没有一些图像增强的方式? + +**A**:在人类肉眼可以识别的前提下,可以考虑图像处理中的均值滤波、中值滤波或者高斯滤波等模糊算子尝试。也可以尝试从数据扩增扰动来强化模型鲁棒性,另外新的思路有对抗性训练和超分SR思路,可以尝试借鉴。但目前业界尚无普遍认可的最优方案,建议优先在数据采集阶段增加一些限制提升图片质量。 + +#### Q2.4.3:对于特定文字检测,例如身份证只检测姓名,检测指定区域文字更好,还是检测全部区域再筛选更好? + +**A**:两个角度来说明一般检测全部区域再筛选更好。 + +(1)由于特定文字和非特定文字之间的视觉特征并没有很强的区分性,只检测指定区域,容易造成特定文字漏检。 + +(2)产品的需求可能是变化的,不排除后续对于模型需求变化的可能性(比如又需要增加一个字段),相比于训练模型,后处理的逻辑会更容易调整。 + +#### Q2.4.4:对于小白如何快速入门中文OCR项目实践? + +**A**:建议可以先了解OCR方向的基础知识,大概了解基础的检测和识别模型算法。然后在Github上可以查看OCR方向相关的repo。目前来看,从内容的完备性来看,PaddleOCR的中英文双语教程文档是有明显优势的,在数据集、模型训练、预测部署文档详实,可以快速入手。而且还有微信用户群答疑,非常适合学习实践。项目地址:[PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR) + +#### Q2.4.5:如何识别带空格的英文行文本图像? + +**A**:空格识别可以考虑以下两种方案: + +(1)优化文本检测算法。检测结果在空格处将文本断开。这种方案在检测数据标注时,需要将含有空格的文本行分成好多段。 + +(2)优化文本识别算法。在识别字典里面引入空格字符,然后在识别的训练数据中,如果有空格,进行标注。此外,合成数据时,通过拼接训练数据,生成含有空格的文本。 + +#### Q2.4.6:中英文一起识别时也可以加空格字符来训练吗 + +**A**:中文识别可以加空格当做分隔符训练,具体的效果如何没法给出直接评判,根据实际业务数据训练来判断。 + +#### Q2.4.7:低像素文字或者字号比较小的文字有什么超分辨率方法吗 + +**A**:超分辨率方法分为传统方法和基于深度学习的方法。基于深度学习的方法中,比较经典的有SRCNN,另外CVPR2020也有一篇超分辨率的工作可以参考文章:Unpaired Image Super-Resolution using Pseudo-Supervision,但是没有充分的实践验证过,需要看实际场景下的效果。 + +#### Q2.4.8:表格识别有什么好的模型 或者论文推荐么 + +**A**:表格目前学术界比较成熟的解决方案不多 ,可以尝试下分割的论文方案。 + + + +## 【实战篇】PaddleOCR实战问题 + +### 使用咨询 + +#### Q3.1.1:OSError: [WinError 126] 找不到指定的模块。mac pro python 3.4 shapely import 问题 + +**A**:这个问题是因为shapely库安装有误,可以参考 [#212](https://github.com/PaddlePaddle/PaddleOCR/issues/212) 这个issue重新安装一下 + +#### Q3.1.2:安装了paddle-gpu,运行时提示没有安装gpu版本的paddle,可能是什么原因? + +**A**:用户同时安装了paddle cpu和gpu版本,都删掉之后,重新安装gpu版本的paddle就好了 + +#### Q3.1.3:试用报错:Cannot load cudnn shared library,是什么原因呢? 
+ +**A**:需要把cudnn lib添加到LD_LIBRARY_PATH中去。 + +#### Q3.1.4:PaddlePaddle怎么指定GPU运行 os.environ["CUDA_VISIBLE_DEVICES"]这种不生效 + +**A**:通过设置 export CUDA_VISIBLE_DEVICES='0'环境变量 + +#### Q3.1.5:windows下训练没有问题,aistudio中提示数据路径有问题 + +**A**:需要把`\`改为`/`(windows和linux的文件夹分隔符不一样,windows下的是`\`,linux下是`/`) + +#### Q3.1.6:gpu版的paddle虽然能在cpu上运行,但是必须要有gpu设备 + +**A**:export CUDA_VISIBLE_DEVICES='',CPU是可以正常跑的 + +#### Q3.1.7:预测报错ImportError: dlopen: cannot load any more object with static TLS + +**A**:glibc的版本问题,运行需要glibc的版本号大于2.23。 + +#### Q3.1.8:提供的inference model和预训练模型的区别 + +**A**:inference model为固化模型,文件中包含网络结构和网络参数,多用于预测部署。预训练模型是训练过程中保存好的模型,多用于fine-tune训练或者断点训练。 + +#### Q3.1.9:模型的解码部分有后处理? + +**A**:有的,检测的后处理在ppocr/postprocess路径下,识别的后处理均在ppocr/utils/character.py文件内 + +#### Q3.1.10:PaddleOCR中文模型是否支持数字识别? + +**A**:支持的,可以看下ppocr/utils/ppocr_keys_v1.txt 这个文件,是支持的识别字符列表,其中包含了数字识别。 + +#### Q3.1.11:PaddleOCR如何做到横排和竖排同时支持的? + +**A**:合成了一批竖排文字,逆时针旋转90度后加入训练集与横排一起训练。预测时根据图片长宽比判断是否为竖排,若为竖排则将crop出的文本逆时针旋转90度后送入识别网络。 + +#### Q3.1.12:如何获取检测文本框的坐标? + +**A**:文本检测的结果有box和文本信息, 具体 [参考代码](https://github.com/PaddlePaddle/PaddleOCR/blob/9d33e36df550762b204d5fbfd7977a25e31b2c44/tools/infer/predict_system.py#L13) + +#### Q3.1.13:识别模型框出来的位置太紧凑,会丢失边缘的文字信息,导致识别错误 + +**A**: 可以在命令中加入 --det_db_unclip_ratio ,参数[定义位置](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/tools/infer/utility.py#L49),这个参数是检测后处理时控制文本框大小的,默认2.0,可以尝试改成2.5或者更大,反之,如果觉得文本框不够紧凑,也可以把该参数调小。 + +#### Q3.1.14:英文手写体识别有计划提供的预训练模型吗? + +**A**:近期也在开展需求调研,如果企业用户需求较多,我们会考虑增加相应的研发投入,后续提供对应的预训练模型,如果有需求欢迎通过issue或者加入微信群联系我们。 + +#### Q3.1.15:PaddleOCR的算法可以用于手写文字检测识别吗?后续有计划推出手写预训练模型么? +**A**:理论上只要有相应的数据集,都是可以的。当然手写识别毕竟和印刷体有区别,对应训练调优策略可能需要适配性优化。 + + +#### Q3.1.16:PaddleOCR是否支持在Windows或Mac系统上运行? + +**A**:PaddleOCR已完成Windows和Mac系统适配,运行时注意两点: + +(1)在[快速安装](./installation.md)时,如果不想安装docker,可跳过第一步,直接从第二步安装paddle开始。 + +(2)inference模型下载时,如果没有安装wget,可直接点击模型链接或将链接地址复制到浏览器进行下载,并解压放置到相应目录。 + +#### Q3.1.17:PaddleOCR开源的超轻量模型和通用OCR模型的区别? 
+**A**:目前PaddleOCR开源了2个中文模型,分别是8.6M超轻量中文模型和通用中文OCR模型。两者对比信息如下: +- 相同点:两者使用相同的**算法**和**训练数据**; +- 不同点:不同之处在于**骨干网络**和**通道参数**,超轻量模型使用MobileNetV3作为骨干网络,通用模型使用Resnet50_vd作为检测模型backbone,Resnet34_vd作为识别模型backbone,具体参数差异可对比两种模型训练的配置文件. |模型|骨干网络|检测训练配置|识别训练配置| |-|-|-|-| |8.6M超轻量中文OCR模型|MobileNetV3+MobileNetV3|det_mv3_db.yml|rec_chinese_lite_train.yml| |通用中文OCR模型|Resnet50_vd+Resnet34_vd|det_r50_vd_db.yml|rec_chinese_common_train.yml| -8. **是否有计划开源仅识别数字或仅识别英文+数字的模型** -暂不计划开源仅数字、仅数字+英文、或其他小垂类专用模型。PaddleOCR开源了多种检测、识别算法供用户自定义训练,两种中文模型也是基于开源的算法库训练产出,有小垂类需求的小伙伴,可以按照教程准备好数据,选择合适的配置文件,自行训练,相信能有不错的效果。训练有任何问题欢迎提issue或在交流群提问,我们会及时解答。 -9. **开源模型使用的训练数据是什么,能否开源** -目前开源的模型,数据集和量级如下: - - 检测: - 英文数据集,ICDAR2015 - 中文数据集,LSVT街景数据集训练数据3w张图片 - - 识别: - 英文数据集,MJSynth和SynthText合成数据,数据量上千万。 - 中文数据集,LSVT街景数据集根据真值将图crop出来,并进行位置校准,总共30w张图像。此外基于LSVT的语料,合成数据500w。 - - 其中,公开数据集都是开源的,用户可自行搜索下载,也可参考[中文数据集](./datasets.md),合成数据暂不开源,用户可使用开源合成工具自行合成,可参考的合成工具包括[text_renderer](https://github.com/Sanster/text_renderer)、[SynthText](https://github.com/ankush-me/SynthText)、[TextRecognitionDataGenerator](https://github.com/Belval/TextRecognitionDataGenerator)等。 +### 数据集 + +#### Q3.2.1:如何制作PaddleOCR支持的数据格式 + +**A**:可以参考检测与识别训练文档,里面有数据格式详细介绍。[检测文档](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/doc/doc_ch/detection.md),[识别文档](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/doc/doc_ch/recognition.md) + +#### Q3.2.2:请问一下,如果想用预训练模型,但是我的数据里面又出现了预训练模型字符集中没有的字符,新的字符是在字符集前面添加还是在后面添加? + +**A**:在后面添加,修改dict之后,就改变了模型最后一层fc的结构,之前训练到的参数没有用到,相当于从头训练,因此acc是0。 + +#### Q3.2.3:如何调试数据读取程序? + +**A**:tools/train.py中有一个test_reader()函数用于调试数据读取。 + +#### Q3.2.4:开源模型使用的训练数据是什么,能否开源? 
+ +**A**:目前开源的模型,数据集和量级如下: + +- 检测: + - 英文数据集,ICDAR2015 + - 中文数据集,LSVT街景数据集训练数据3w张图片 + +- 识别: + - 英文数据集,MJSynth和SynthText合成数据,数据量上千万。 + - 中文数据集,LSVT街景数据集根据真值将图crop出来,并进行位置校准,总共30w张图像。此外基于LSVT的语料,合成数据500w。 + +其中,公开数据集都是开源的,用户可自行搜索下载,也可参考[中文数据集](./datasets.md),合成数据暂不开源,用户可使用开源合成工具自行合成,可参考的合成工具包括[text_renderer](https://github.com/Sanster/text_renderer)、[SynthText](https://github.com/ankush-me/SynthText)、[TextRecognitionDataGenerator](https://github.com/Belval/TextRecognitionDataGenerator)等。 + +#### Q3.2.5:请问中文字符集多大呢?支持生僻字识别吗? + +**A**:中文字符集是6623, 支持生僻字识别。训练样本中有部分生僻字,但样本不多,如果有特殊需求建议使用自己的数据集做fine-tune。 + +#### Q3.2.6:中文文本检测、文本识别构建训练集的话,大概需要多少数据量 + +**A**:检测需要的数据相对较少,在PaddleOCR模型的基础上进行Fine-tune,一般需要500张可达到不错的效果。 +识别分英文和中文,一般英文场景需要几十万数据可达到不错的效果,中文则需要几百万甚至更多。 + +#### Q3.2.7:中文识别模型如何选择? + +**A**:中文模型共有2大类:通用模型和超轻量模型。他们各自的优势如下: +超轻量模型具有更小的模型大小,更快的预测速度。适合用于端侧使用。 +通用模型具有更高的模型精度,适合对模型大小不敏感的场景。 +此外基于以上模型,PaddleOCR还提供了支持空格识别的模型,主要针对中文场景中的英文句子。 +您可以根据实际使用需求进行选择。 + +#### Q3.2.8:图像旋转90° 文本检测可以正常检测到具体文本位置,但是识别准确度大幅降低,是否会考虑增加相应的旋转预处理? + +**A**:目前模型只支持两种方向的文字:水平和垂直。 为了降低模型大小,加快模型预测速度,PaddleOCR暂时没有加入图片的方向判断。建议用户在识别前自行转正,后期也会考虑添加选择角度判断。 + +#### Q3.2.9:同一张图通用检测出21个条目,轻量级检测出26个 ,难道不是轻量级的好吗? + +**A**:可以主要参考可视化效果,通用模型更倾向于检测一整行文字,轻量级可能会有一行文字被分成两段检测的情况,不是数量越多,效果就越好。 + +### 模型训练调优 + +#### Q3.3.1:文本长度超过25,应该怎么处理? + +**A**:默认训练时的文本可识别的最大长度为25,超过25的文本会被忽略不参与训练。如果您训练样本中的长文本较多,可以修改配置文件中的 max\_text\_length 字段,设置为更大的最长文本长度,具体位置在[这里](https://github.com/PaddlePaddle/PaddleOCR/blob/fb9e47b262529386983edc21b33abfa16bbf06ac/configs/rec/rec_chinese_lite_train.yml#L13)。 + +#### Q3.3.2:配置文件里面检测的阈值设置么? + +**A**:有的,检测相关的参数主要有以下几个: +``max_side_len:预测时图像resize的长边尺寸 +thresh: 用于二值化输出图的阈值 +box_thresh:用于过滤文本框的阈值,低于此阈值的文本框不要 +unclip_ratio: 文本框扩张的系数,关系到文本框的大小`` + +这些参数的默认值见[代码](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/tools/infer/utility.py#L40),可以通过从命令行传递参数进行修改。 + +#### Q3.3.3:我想请教一下,你们在训练识别时候,lsvt里的非矩形框文字,你们是怎么做处理的呢。忽略掉还是去最小旋转框? 
+ +**A**:现在是忽略处理的 + +#### Q3.3.4:训练过程中,如何恰当的停止训练(直接kill,经常还有显存占用的问题) + +**A**:可以通过下面的脚本终止所有包含train.py字段的进程, + +``` +ps -axu | grep train.py | awk '{print $2}' | xargs kill -9 +``` + +#### Q3.3.5:读数据进程数设置4~8时训练一会进程接连defunct后gpu利用率一直为0卡死 + +**A**:修改多进程的队列数后解决, 将[代码段]( https://github.com/PaddlePaddle/PaddleOCR/blob/549108fe0aa0d87c0a3b2d471f1c653e89daab80/ppocr/data/reader_main.py#L75 ) 修改为: + +``` +return paddle.reader.multiprocess_reader(readers, False, queue_size=320) + +``` + +#### Q3.3.6:可不可以将pretrain_weights设置为空呢?想从零开始训练一个model + +**A**:这个是可以的,在训练通用识别模型的时候,pretrain_weights就设置为空,但是这样可能需要更长的迭代轮数才能达到相同的精度。 + +#### Q3.3.7:PaddleOCR默认不是200个step保存一次模型吗?为啥文件夹下面都没有生成 + +**A**:因为默认保存的起始点不是0,而是4000,将eval_batch_step [4000, 5000]改为[0, 2000] 就是从第0次迭代开始,每2000迭代保存一次模型 + +#### Q3.3.8:如何进行模型微调? + +**A**:注意配置好合适的数据集,然后在finetune训练时,可以加载我们提供的预训练模型,设置配置文件中Global.pretrain_weights 参数为要加载的预训练模型路径。 + +#### Q3.3.9:文本检测换成自己的数据没法训练,有一些”###”是什么意思? + +**A**:数据格式有问题,”###” 表示要被忽略的文本区域,所以你的数据都被跳过了,可以换成其他任意字符或者就写个空的。 + +#### Q3.3.10:copy_from_cpu这个地方,这块input不变(t_data的size不变)连续调用两次copy_from_cpu()时,这里面的gpu_place会重新malloc GPU内存吗?还是只有当ele_size变化时才会重新在GPU上malloc呢? + +**A**:小于等于的时候都不会重新分配,只有大于的时候才会重新分配 + +#### Q3.3.11:自己训练出来的未inference转换的模型 可以当作预训练模型吗? + +**A**:可以的,但是如果训练数据量少的话,可能会过拟合到少量数据上,泛化性能不佳。 + +#### Q3.3.12:如何更换文本检测/识别的backbone? + +**A**:直接更换配置文件里的Backbone.function即可,格式为:网络文件路径,网络Class名称。如果所需的backbone在PaddleOCR里没有提供,可以参照PaddleClas里面的网络结构,进行修改尝试。具体修改原则可以参考OCR通用问题中 "如何更换文本检测/识别的backbone" 的回答。 + +#### Q3.3.13:使用带TPS的识别模型预测报错,报错信息:``Input(X) dims[3] and Input(Grid) dims[2] should be equal, but received X dimension[3](320) != Grid dimension[2](100) `` + +**A**:TPS模块暂时无法支持变长的输入,请设置 ``--rec_image_shape='3,32,100' --rec_char_type='en'`` 固定输入shape + +### 预测部署 + +#### Q3.4.1:如何pip安装opt模型转换工具? 
+ +**A**:由于OCR端侧部署需要某些算子的支持,这些算子仅在Paddle-Lite 最新develop分支中,所以需要自己编译opt模型转换工具。opt工具可以通过编译PaddleLite获得,编译步骤参考[lite部署文档](https://github.com/PaddlePaddle/PaddleOCR/blob/0791714b91/deploy/lite/readme.md) 中2.1 模型优化部分。 + +#### Q3.4.2:如何将PaddleOCR预测模型封装成SDK + +**A**:如果是Python的话,可以使用tools/infer/predict_system.py中的TextSystem进行sdk封装,如果是c++的话,可以使用deploy/cpp_infer/src下面的DBDetector和CRNNRecognizer完成封装 + +#### Q3.4.3:服务部署可以只发布文本识别,而不带文本检测模型么? + +**A**:可以的。默认的服务部署是检测和识别串联预测的。也支持单独发布文本检测或文本识别模型,比如使用PaddleHUBPaddleOCR 模型时,deploy下有三个文件夹,分别是 + +- ocr_det:检测预测 +- ocr_rec: 识别预测 +- ocr_system: 检测识别串联预测 + +每个模块是单独分开的,所以可以选择只发布文本识别模型。使用PaddleServing部署时同理。 + + +#### Q3.4.4:为什么PaddleOCR检测预测是只支持一张图片测试?即test_batch_size_per_card=1 + +**A**:测试的时候,对图像等比例缩放,最长边960,不同图像等比例缩放后长宽不一致,无法组成batch,所以设置test_batch_size为1。 + +#### Q3.4.5:为什么使用c++ inference和python inference结果不一致? + +**A**:可能是因为导出的inference model版本与预测库版本不一致,比如在Windows下,Paddle官网提供的预测库版本是1.8,而PaddleOCR提供的inference model 版本是1.7,因此最终预测结果会有差别。可以在Paddle1.8环境下导出模型,再基于该模型进行预测。 +此外也需要保证两者的预测参数配置完全一致。 + +#### Q3.4.6:为什么第一张图预测时间很长,第二张之后预测时间会降低? + +**A**:第一张图需要显存资源初始化,耗时较多。完成模型加载后,之后的预测时间会明显缩短。 + +#### Q3.4.7:请问opt工具可以直接转int8量化后的模型为.nb文件吗 + +**A**:可以的,PaddleLite提供完善的opt工具,可以参考[文档](https://paddle-lite.readthedocs.io/zh/latest/user_guides/post_quant_with_data.html) + +#### Q3.4.8:请问在安卓端怎么设置这个参数 --det_db_unclip_ratio=3 + +**A**:在安卓APK上无法设置,没有暴露这个接口,如果使用的是PaddleOCR/deploy/lite/的demo,可以修改config.txt中的对应参数来设置 + +#### Q3.4.9:PaddleOCR模型是否可以转换成ONNX模型? + +**A**:目前暂不支持转ONNX,相关工作在研发中。 + +#### Q3.4.10:使用opt工具对检测模型转换时报错 can not found op arguments for node conv2_b_attr + +**A**:这个问题大概率是编译opt工具的Paddle-Lite不是develop分支,建议使用Paddle-Lite 的develop分支编译opt工具。 + +#### Q3.4.11:libopenblas.so找不到是什么意思? 
+ +**A**:目前包括mkl和openblas两种版本的预测库,推荐使用mkl的预测库,如果下载的预测库是mkl的,编译的时候也需要勾选`with_mkl`选项 +,以Linux下编译为例,需要在设置这里为ON,`-DWITH_MKL=ON`,[参考链接](https://github.com/PaddlePaddle/PaddleOCR/blob/8a78af26df0dd8f15b734cc8db13e25d2a3656a2/deploy/cpp_infer/tools/build.sh#L12)。此外,使用预测库时,推荐在Linux或者Windows上进行开发,不推荐在MacOS上开发。 + +#### Q3.4.12:使用自定义字典训练,inference时如何修改 + +**A**:使用了自定义字典的话,用inference预测时,需要通过 --rec_char_dict_path 修改字典路径。详细操作可参考[文档](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/doc/doc_ch/inference.md#%E8%87%AA%E5%AE%9A%E4%B9%89%E6%96%87%E6%9C%AC%E8%AF%86%E5%88%AB%E5%AD%97%E5%85%B8%E7%9A%84%E6%8E%A8%E7%90%86) -10. **使用带TPS的识别模型预测报错** -报错信息:Input(X) dims[3] and Input(Grid) dims[2] should be equal, but received X dimension[3](320) != Grid dimension[2](100) -原因:TPS模块暂时无法支持变长的输入,请设置 --rec_image_shape='3,32,100' --rec_char_type='en' 固定输入shape +#### Q3.4.13:能否返回单字字符的位置? -11. **自定义字典训练的模型,识别结果出现字典里没出现的字** -预测时没有设置采用的自定义字典路径。设置方法是在预测时,通过增加输入参数rec_char_dict_path来设置。 +**A**:训练的时候标注是整个文本行的标注,所以预测的也是文本行位置,如果要获取单字符位置信息,可以根据预测的文本,计算字符数量,再去根据整个文本行的位置信息,估计文本块中每个字符的位置。 +#### Q3.4.14:PaddleOCR模型部署方式有哪几种? 
+**A**:目前有Inference部署,serving部署和手机端Paddle Lite部署,可根据不同场景做灵活的选择:Inference部署适用于本地离线部署,serving部署适用于云端部署,Paddle Lite部署适用于手机端集成。 diff --git a/doc/doc_ch/config.md b/doc/doc_ch/config.md index 03fe1b3280881472c830cf5ac57dee183a94b373..fe8db9c893cf0e6190111de5fe7627d2fe52a4fd 100644 --- a/doc/doc_ch/config.md +++ b/doc/doc_ch/config.md @@ -63,8 +63,9 @@ | beta1 | 设置一阶矩估计的指数衰减率 | 0.9 | \ | | beta2 | 设置二阶矩估计的指数衰减率 | 0.999 | \ | | decay | 是否使用decay | \ | \ | -| function(decay) | 设置decay方式 | - | 目前支持cosine_decay与piecewise_decay | -| step_each_epoch | 每个epoch包含多少次迭代, cosine_decay时有效 | 20 | 计算方式:total_image_num / (batch_size_per_card * card_size) | -| total_epoch | 总共迭代多少个epoch, cosine_decay时有效 | 1000 | 与Global.epoch_num 一致 | +| function(decay) | 设置decay方式 | - | 目前支持cosine_decay, cosine_decay_warmup与piecewise_decay | +| step_each_epoch | 每个epoch包含多少次迭代, cosine_decay/cosine_decay_warmup时有效 | 20 | 计算方式:total_image_num / (batch_size_per_card * card_size) | +| total_epoch | 总共迭代多少个epoch, cosine_decay/cosine_decay_warmup时有效 | 1000 | 与Global.epoch_num 一致 | +| warmup_minibatch | 线性warmup的迭代次数, cosine_decay_warmup时有效 | 1000 | \ | | boundaries | 学习率下降时的迭代次数间隔, piecewise_decay时有效 | - | 参数为列表形式 | | decay_rate | 学习率衰减系数, piecewise_decay时有效 | - | \ | diff --git a/doc/doc_ch/detection.md b/doc/doc_ch/detection.md index 850f67ca8ac0a476136e80d2e7413b328743c548..84c90d18a4ac5e1133a8202d574b789848060855 100644 --- a/doc/doc_ch/detection.md +++ b/doc/doc_ch/detection.md @@ -1,13 +1,13 @@ # 文字检测 -本节以icdar15数据集为例,介绍PaddleOCR中检测模型的训练、评估与测试。 +本节以icdar2015数据集为例,介绍PaddleOCR中检测模型的训练、评估与测试。 ## 数据准备 icdar2015数据集可以从[官网](https://rrc.cvc.uab.es/?ch=4&com=downloads)下载到,首次下载需注册。 将下载到的数据集解压到工作目录下,假设解压在 PaddleOCR/train_data/ 下。另外,PaddleOCR将零散的标注文件整理成单独的标注文件 ,您可以通过wget的方式进行下载。 -``` +```shell # 在PaddleOCR路径下 cd PaddleOCR/ wget -P ./train_data/ https://paddleocr.bj.bcebos.com/dataset/train_icdar2015_label.txt @@ -23,21 +23,21 @@ wget -P ./train_data/ https://paddleocr.bj.bcebos.com/dataset/test_icdar2015_la └─ 
test_icdar2015_label.txt icdar数据集的测试标注 ``` -提供的标注文件格式为,其中中间是"\t"分隔: +提供的标注文件格式如下,中间用"\t"分隔: ``` " 图像文件名 json.dumps编码的图像标注信息" ch4_test_images/img_61.jpg [{"transcription": "MASA", "points": [[310, 104], [416, 141], [418, 216], [312, 179]]}, {...}] ``` json.dumps编码前的图像标注信息是包含多个字典的list,字典中的 `points` 表示文本框的四个点的坐标(x, y),从左上角的点开始顺时针排列。 -`transcription` 表示当前文本框的文字,在文本检测任务中并不需要这个信息。 -如果您想在其他数据集上训练PaddleOCR,可以按照上述形式构建标注文件。 +`transcription` 表示当前文本框的文字,**当其内容为“###”时,表示该文本框无效,在训练时会跳过。** +如果您想在其他数据集上训练,可以按照上述形式构建标注文件。 ## 快速启动训练 首先下载模型backbone的pretrain model,PaddleOCR的检测模型目前支持两种backbone,分别是MobileNetV3、ResNet50_vd, 您可以根据需求使用[PaddleClas](https://github.com/PaddlePaddle/PaddleClas/tree/master/ppcls/modeling/architectures)中的模型更换backbone。 -``` +```shell cd PaddleOCR/ # 下载MobileNetV3的预训练模型 wget -P ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/MobileNetV3_large_x0_5_pretrained.tar @@ -45,7 +45,7 @@ wget -P ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/Mob wget -P ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_vd_ssld_pretrained.tar # 解压预训练模型文件,以MobileNetV3为例 -tar xf ./pretrain_models/MobileNetV3_large_x0_5_pretrained.tar ./pretrain_models/ +tar -xf ./pretrain_models/MobileNetV3_large_x0_5_pretrained.tar ./pretrain_models/ # 注:正确解压backbone预训练权重文件后,文件夹下包含众多以网络层命名的权重文件,格式如下: ./pretrain_models/MobileNetV3_large_x0_5_pretrained/ @@ -57,11 +57,11 @@ tar xf ./pretrain_models/MobileNetV3_large_x0_5_pretrained.tar ./pretrain_models ``` -**启动训练** +#### 启动训练 *如果您安装的是cpu版本,请将配置文件中的 `use_gpu` 字段修改为false* -``` +```shell python3 tools/train.py -c configs/det/det_mv3_db.yml -o Global.pretrain_weights=./pretrain_models/MobileNetV3_large_x0_5_pretrained/ ``` @@ -69,52 +69,52 @@ python3 tools/train.py -c configs/det/det_mv3_db.yml -o Global.pretrain_weights= 有关配置文件的详细解释,请参考[链接](./config.md)。 您也可以通过-o参数在不需要修改yml文件的情况下,改变训练的参数,比如,调整训练的学习率为0.0001 -``` +```shell python3 tools/train.py -c configs/det/det_mv3_db.yml -o 
Optimizer.base_lr=0.0001 ``` -**断点训练** +#### 断点训练 如果训练程序中断,如果希望加载训练中断的模型从而恢复训练,可以通过指定Global.checkpoints指定要加载的模型路径: -``` +```shell python3 tools/train.py -c configs/det/det_mv3_db.yml -o Global.checkpoints=./your/trained/model ``` -**注意**:Global.checkpoints的优先级高于Global.pretrain_weights的优先级,即同时指定两个参数时,优先加载Global.checkpoints指定的模型,如果Global.checkpoints指定的模型路径有误,会加载Global.pretrain_weights指定的模型。 +**注意**:`Global.checkpoints`的优先级高于`Global.pretrain_weights`的优先级,即同时指定两个参数时,优先加载`Global.checkpoints`指定的模型,如果`Global.checkpoints`指定的模型路径有误,会加载`Global.pretrain_weights`指定的模型。 ## 指标评估 PaddleOCR计算三个OCR检测相关的指标,分别是:Precision、Recall、Hmean。 -运行如下代码,根据配置文件det_db_mv3.yml中save_res_path指定的测试集检测结果文件,计算评估指标。 +运行如下代码,根据配置文件`det_db_mv3.yml`中`save_res_path`指定的测试集检测结果文件,计算评估指标。 -评估时设置后处理参数box_thresh=0.6,unclip_ratio=1.5,使用不同数据集、不同模型训练,可调整这两个参数进行优化 -``` +评估时设置后处理参数`box_thresh=0.6`,`unclip_ratio=1.5`,使用不同数据集、不同模型训练,可调整这两个参数进行优化 +```shell python3 tools/eval.py -c configs/det/det_mv3_db.yml -o Global.checkpoints="{path/to/weights}/best_accuracy" PostProcess.box_thresh=0.6 PostProcess.unclip_ratio=1.5 ``` -训练中模型参数默认保存在Global.save_model_dir目录下。在评估指标时,需要设置Global.checkpoints指向保存的参数文件。 +训练中模型参数默认保存在`Global.save_model_dir`目录下。在评估指标时,需要设置`Global.checkpoints`指向保存的参数文件。 比如: -``` +```shell python3 tools/eval.py -c configs/det/det_mv3_db.yml -o Global.checkpoints="./output/det_db/best_accuracy" PostProcess.box_thresh=0.6 PostProcess.unclip_ratio=1.5 ``` -* 注:box_thresh、unclip_ratio是DB后处理所需要的参数,在评估EAST模型时不需要设置 +* 注:`box_thresh`、`unclip_ratio`是DB后处理所需要的参数,在评估EAST模型时不需要设置 ## 测试检测效果 测试单张图像的检测效果 -``` +```shell python3 tools/infer_det.py -c configs/det/det_mv3_db.yml -o TestReader.infer_img="./doc/imgs_en/img_10.jpg" Global.checkpoints="./output/det_db/best_accuracy" ``` 测试DB模型时,调整后处理阈值, -``` +```shell python3 tools/infer_det.py -c configs/det/det_mv3_db.yml -o TestReader.infer_img="./doc/imgs_en/img_10.jpg" Global.checkpoints="./output/det_db/best_accuracy" PostProcess.box_thresh=0.6 PostProcess.unclip_ratio=1.5 ``` 
测试文件夹下所有图像的检测效果 -``` +```shell python3 tools/infer_det.py -c configs/det/det_mv3_db.yml -o TestReader.infer_img="./doc/imgs_en/" Global.checkpoints="./output/det_db/best_accuracy" ``` diff --git a/doc/doc_ch/inference.md b/doc/doc_ch/inference.md index 64ee8f4247c8e57e3211309d32cb93a883dc7b7d..293fee2f4291a8400661de1ed08f0c6807eef977 100644 --- a/doc/doc_ch/inference.md +++ b/doc/doc_ch/inference.md @@ -1,14 +1,37 @@ # 基于Python预测引擎推理 -inference 模型(fluid.io.save_inference_model保存的模型) -一般是模型训练完成后保存的固化模型,多用于预测部署。 -训练过程中保存的模型是checkpoints模型,保存的是模型的参数,多用于恢复训练等。 +inference 模型(`fluid.io.save_inference_model`保存的模型) +一般是模型训练完成后保存的固化模型,多用于预测部署。训练过程中保存的模型是checkpoints模型,保存的是模型的参数,多用于恢复训练等。 与checkpoints模型相比,inference 模型会额外保存模型的结构信息,在预测部署、加速推理上性能优越,灵活方便,适合与实际系统集成。更详细的介绍请参考文档[分类预测框架](https://paddleclas.readthedocs.io/zh_CN/latest/extension/paddle_inference.html). 接下来首先介绍如何将训练的模型转换成inference模型,然后将依次介绍文本检测、文本识别以及两者串联基于预测引擎推理。 + +- [一、训练模型转inference模型](#训练模型转inference模型) + - [检测模型转inference模型](#检测模型转inference模型) + - [识别模型转inference模型](#识别模型转inference模型) + +- [二、文本检测模型推理](#文本检测模型推理) + - [1. 超轻量中文检测模型推理](#超轻量中文检测模型推理) + - [2. DB文本检测模型推理](#DB文本检测模型推理) + - [3. EAST文本检测模型推理](#EAST文本检测模型推理) + - [4. SAST文本检测模型推理](#SAST文本检测模型推理) + +- [三、文本识别模型推理](#文本识别模型推理) + - [1. 超轻量中文识别模型推理](#超轻量中文识别模型推理) + - [2. 基于CTC损失的识别模型推理](#基于CTC损失的识别模型推理) + - [3. 基于Attention损失的识别模型推理](#基于Attention损失的识别模型推理) + - [4. 自定义文本识别字典的推理](#自定义文本识别字典的推理) + +- [四、文本检测、识别串联推理](#文本检测、识别串联推理) + - [1. 超轻量中文OCR模型推理](#超轻量中文OCR模型推理) + - [2. 
其他模型推理](#其他模型推理) + + + ## 一、训练模型转inference模型 + ### 检测模型转inference模型 下载超轻量级中文检测模型: @@ -24,15 +47,16 @@ wget -P ./ch_lite/ https://paddleocr.bj.bcebos.com/ch_models/ch_det_mv3_db.tar & python3 tools/export_model.py -c configs/det/det_mv3_db.yml -o Global.checkpoints=./ch_lite/det_mv3_db/best_accuracy Global.save_inference_dir=./inference/det_db/ ``` -转inference模型时,使用的配置文件和训练时使用的配置文件相同。另外,还需要设置配置文件中的Global.checkpoints、Global.save_inference_dir参数。 -其中Global.checkpoints指向训练中保存的模型参数文件,Global.save_inference_dir是生成的inference模型要保存的目录。 -转换成功后,在save_inference_dir 目录下有两个文件: +转inference模型时,使用的配置文件和训练时使用的配置文件相同。另外,还需要设置配置文件中的`Global.checkpoints`、`Global.save_inference_dir`参数。 +其中`Global.checkpoints`指向训练中保存的模型参数文件,`Global.save_inference_dir`是生成的inference模型要保存的目录。 +转换成功后,在`save_inference_dir`目录下有两个文件: ``` inference/det_db/ └─ model 检测inference模型的program文件 └─ params 检测inference模型的参数文件 ``` + ### 识别模型转inference模型 下载超轻量中文识别模型: @@ -51,7 +75,7 @@ python3 tools/export_model.py -c configs/rec/rec_chinese_lite_train.yml -o Globa Global.save_inference_dir=./inference/rec_crnn/ ``` -如果您是在自己的数据集上训练的模型,并且调整了中文字符的字典文件,请注意修改配置文件中的character_dict_path是否是所需要的字典文件。 +**注意:**如果您是在自己的数据集上训练的模型,并且调整了中文字符的字典文件,请注意修改配置文件中的`character_dict_path`是否是所需要的字典文件。 转换成功后,在目录下有两个文件: ``` @@ -60,11 +84,13 @@ python3 tools/export_model.py -c configs/rec/rec_chinese_lite_train.yml -o Globa └─ params 识别inference模型的参数文件 ``` + ## 二、文本检测模型推理 -下面将介绍超轻量中文检测模型推理、DB文本检测模型推理和EAST文本检测模型推理。默认配置是根据DB文本检测模型推理设置的。由于EAST和DB算法差别很大,在推理时,需要通过传入相应的参数适配EAST文本检测算法。 +文本检测模型推理,默认使用DB模型的配置参数。当不使用DB模型时,在推理时,需要通过传入相应的参数进行算法适配,细节参考下文。 -### 1.超轻量中文检测模型推理 + +### 1. 
超轻量中文检测模型推理 超轻量中文检测模型推理,可以执行如下命令: @@ -72,11 +98,11 @@ python3 tools/export_model.py -c configs/rec/rec_chinese_lite_train.yml -o Globa python3 tools/infer/predict_det.py --image_dir="./doc/imgs/2.jpg" --det_model_dir="./inference/det_db/" ``` -可视化文本检测结果默认保存到 ./inference_results 文件夹里面,结果文件的名称前缀为'det_res'。结果示例如下: +可视化文本检测结果默认保存到`./inference_results`文件夹里面,结果文件的名称前缀为'det_res'。结果示例如下: ![](../imgs_results/det_res_2.jpg) -通过设置参数det_max_side_len的大小,改变检测算法中图片规范化的最大值。当图片的长宽都小于det_max_side_len,则使用原图预测,否则将图片等比例缩放到最大值,进行预测。该参数默认设置为det_max_side_len=960. 如果输入图片的分辨率比较大,而且想使用更大的分辨率预测,可以执行如下命令: +通过设置参数`det_max_side_len`的大小,改变检测算法中图片规范化的最大值。当图片的长宽都小于`det_max_side_len`,则使用原图预测,否则将图片等比例缩放到最大值,进行预测。该参数默认设置为`det_max_side_len=960`。 如果输入图片的分辨率比较大,而且想使用更大的分辨率预测,可以执行如下命令: ``` python3 tools/infer/predict_det.py --image_dir="./doc/imgs/2.jpg" --det_model_dir="./inference/det_db/" --det_max_side_len=1200 @@ -87,7 +113,8 @@ python3 tools/infer/predict_det.py --image_dir="./doc/imgs/2.jpg" --det_model_di python3 tools/infer/predict_det.py --image_dir="./doc/imgs/2.jpg" --det_model_dir="./inference/det_db/" --use_gpu=False ``` -### 2.DB文本检测模型推理 + +### 2. DB文本检测模型推理 首先将DB文本检测训练过程中保存的模型,转换成inference model。以基于Resnet50_vd骨干网络,在ICDAR2015英文数据集训练的模型为例([模型下载地址](https://paddleocr.bj.bcebos.com/det_r50_vd_db.tar)),可以使用如下命令进行转换: @@ -105,13 +132,14 @@ DB文本检测模型推理,可以执行如下命令: python3 tools/infer/predict_det.py --image_dir="./doc/imgs_en/img_10.jpg" --det_model_dir="./inference/det_db/" ``` -可视化文本检测结果默认保存到 ./inference_results 文件夹里面,结果文件的名称前缀为'det_res'。结果示例如下: +可视化文本检测结果默认保存到`./inference_results`文件夹里面,结果文件的名称前缀为'det_res'。结果示例如下: ![](../imgs_results/det_res_img_10_db.jpg) -**注意**:由于ICDAR2015数据集只有1000张训练图像,主要针对英文场景,所以上述模型对中文文本图像检测效果非常差。 +**注意**:由于ICDAR2015数据集只有1000张训练图像,且主要针对英文场景,所以上述模型对中文文本图像检测效果会比较差。 -### 3.EAST文本检测模型推理 + +### 3. 
EAST文本检测模型推理 首先将EAST文本检测训练过程中保存的模型,转换成inference model。以基于Resnet50_vd骨干网络,在ICDAR2015英文数据集训练的模型为例([模型下载地址](https://paddleocr.bj.bcebos.com/det_r50_vd_east.tar)),可以使用如下命令进行转换: @@ -123,24 +151,59 @@ python3 tools/infer/predict_det.py --image_dir="./doc/imgs_en/img_10.jpg" --det_ python3 tools/export_model.py -c configs/det/det_r50_vd_east.yml -o Global.checkpoints="./models/det_r50_vd_east/best_accuracy" Global.save_inference_dir="./inference/det_east" ``` -EAST文本检测模型推理,需要设置参数det_algorithm,指定检测算法类型为EAST,可以执行如下命令: +**EAST文本检测模型推理,需要设置参数`--det_algorithm="EAST"`**,可以执行如下命令: ``` python3 tools/infer/predict_det.py --det_algorithm="EAST" --image_dir="./doc/imgs_en/img_10.jpg" --det_model_dir="./inference/det_east/" ``` -可视化文本检测结果默认保存到 ./inference_results 文件夹里面,结果文件的名称前缀为'det_res'。结果示例如下: +可视化文本检测结果默认保存到`./inference_results`文件夹里面,结果文件的名称前缀为'det_res'。结果示例如下: ![](../imgs_results/det_res_img_10_east.jpg) -**注意**:本代码库中EAST后处理中NMS采用的Python版本,所以预测速度比较耗时。如果采用C++版本,会有明显加速。 +**注意**:本代码库中,EAST后处理Locality-Aware NMS有python和c++两种版本,c++版速度明显快于python版。由于c++版本nms编译版本问题,只有python3.5环境下会调用c++版nms,其他情况将调用python版nms。 + + + +### 4. SAST文本检测模型推理 +#### (1). 四边形文本检测模型(ICDAR2015) +首先将SAST文本检测训练过程中保存的模型,转换成inference model。以基于Resnet50_vd骨干网络,在ICDAR2015英文数据集训练的模型为例([模型下载地址](https://paddleocr.bj.bcebos.com/SAST/sast_r50_vd_icdar2015.tar)),可以使用如下命令进行转换: +``` +python3 tools/export_model.py -c configs/det/det_r50_vd_sast_icdar15.yml -o Global.checkpoints="./models/sast_r50_vd_icdar2015/best_accuracy" Global.save_inference_dir="./inference/det_sast_ic15" +``` +**SAST文本检测模型推理,需要设置参数`--det_algorithm="SAST"`**,可以执行如下命令: +``` +python3 tools/infer/predict_det.py --det_algorithm="SAST" --image_dir="./doc/imgs_en/img_10.jpg" --det_model_dir="./inference/det_sast_ic15/" +``` +可视化文本检测结果默认保存到`./inference_results`文件夹里面,结果文件的名称前缀为'det_res'。结果示例如下: + +![](../imgs_results/det_res_img_10_sast.jpg) + +#### (2). 
弯曲文本检测模型(Total-Text) +首先将SAST文本检测训练过程中保存的模型,转换成inference model。以基于Resnet50_vd骨干网络,在Total-Text英文数据集训练的模型为例([模型下载地址](https://paddleocr.bj.bcebos.com/SAST/sast_r50_vd_total_text.tar)),可以使用如下命令进行转换: + +``` +python3 tools/export_model.py -c configs/det/det_r50_vd_sast_totaltext.yml -o Global.checkpoints="./models/sast_r50_vd_total_text/best_accuracy" Global.save_inference_dir="./inference/det_sast_tt" +``` + +**SAST文本检测模型推理,需要设置参数`--det_algorithm="SAST"`,同时,还需要增加参数`--det_sast_polygon=True`,**可以执行如下命令: +``` +python3 tools/infer/predict_det.py --det_algorithm="SAST" --image_dir="./doc/imgs_en/img623.jpg" --det_model_dir="./inference/det_sast_tt/" --det_sast_polygon=True +``` +可视化文本检测结果默认保存到`./inference_results`文件夹里面,结果文件的名称前缀为'det_res'。结果示例如下: +![](../imgs_results/det_res_img623_sast.jpg) +**注意**:本代码库中,SAST后处理Locality-Aware NMS有python和c++两种版本,c++版速度明显快于python版。由于c++版本nms编译版本问题,只有python3.5环境下会调用c++版nms,其他情况将调用python版nms。 + + + ## 三、文本识别模型推理 下面将介绍超轻量中文识别模型推理、基于CTC损失的识别模型推理和基于Attention损失的识别模型推理。对于中文文本识别,建议优先选择基于CTC损失的识别模型,实践中也发现基于Attention损失的效果不如基于CTC损失的识别模型。此外,如果训练时修改了文本的字典,请参考下面的自定义文本识别字典的推理。 -### 1.超轻量中文识别模型推理 + +### 1. 超轻量中文识别模型推理 超轻量中文识别模型推理,可以执行如下命令: @@ -155,7 +218,8 @@ python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words/ch/word_4.jpg" Predicts of ./doc/imgs_words/ch/word_4.jpg:['实力活力', 0.89552695] -### 2.基于CTC损失的识别模型推理 + +### 2. 基于CTC损失的识别模型推理 我们以STAR-Net为例,介绍基于CTC损失的识别模型推理。 CRNN和Rosetta使用方式类似,不用设置识别算法参数rec_algorithm。 @@ -176,7 +240,8 @@ STAR-Net文本识别模型推理,可以执行如下命令: python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words_en/word_336.png" --rec_model_dir="./inference/starnet/" --rec_image_shape="3, 32, 100" --rec_char_type="en" ``` -### 3.基于Attention损失的识别模型推理 + +### 3. 基于Attention损失的识别模型推理 基于Attention损失的识别模型与ctc不同,需要额外设置识别算法参数 --rec_algorithm="RARE" @@ -202,16 +267,18 @@ self.character_str = "0123456789abcdefghijklmnopqrstuvwxyz" dict_character = list(self.character_str) ``` -### 4.自定义文本识别字典的推理 + +### 4. 
自定义文本识别字典的推理 如果训练时修改了文本的字典,在使用inference模型预测时,需要通过`--rec_char_dict_path`指定使用的字典路径 ``` python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words_en/word_336.png" --rec_model_dir="./your inference model" --rec_image_shape="3, 32, 100" --rec_char_type="en" --rec_char_dict_path="your text dict path" ``` + ## 四、文本检测、识别串联推理 - -### 1.超轻量中文OCR模型推理 + +### 1. 超轻量中文OCR模型推理 在执行预测时,需要通过参数image_dir指定单张图像或者图像集合的路径、参数det_model_dir指定检测inference模型的路径和参数rec_model_dir指定识别inference模型的路径。可视化识别结果默认保存到 ./inference_results 文件夹里面。 @@ -223,9 +290,14 @@ python3 tools/infer/predict_system.py --image_dir="./doc/imgs/2.jpg" --det_model ![](../imgs_results/2.jpg) -### 2.其他模型推理 + +### 2. 其他模型推理 + +如果想尝试使用其他检测算法或者识别算法,请参考上述文本检测模型推理和文本识别模型推理,更新相应配置和模型。 + +**注意:由于检测框矫正逻辑的局限性,暂不支持使用SAST弯曲文本检测模型(即,使用参数`--det_sast_polygon=True`时)进行模型串联。** -如果想尝试使用其他检测算法或者识别算法,请参考上述文本检测模型推理和文本识别模型推理,更新相应配置和模型,下面给出基于EAST文本检测和STAR-Net文本识别执行命令: +下面给出基于EAST文本检测和STAR-Net文本识别执行命令: ``` python3 tools/infer/predict_system.py --image_dir="./doc/imgs_en/img_10.jpg" --det_model_dir="./inference/det_east/" --det_algorithm="EAST" --rec_model_dir="./inference/starnet/" --rec_image_shape="3, 32, 100" --rec_char_type="en" diff --git a/doc/doc_ch/quickstart.md b/doc/doc_ch/quickstart.md index fead57f3d12395c6b4a2417fe8a23b1e00a4579b..701b50ed36fc69a6285550e6f53f6f3a09a1a63d 100644 --- a/doc/doc_ch/quickstart.md +++ b/doc/doc_ch/quickstart.md @@ -5,6 +5,8 @@ 请先参考[快速安装](./installation.md)配置PaddleOCR运行环境。 +*注意:也可以通过 whl 包安装使用PaddleOCR,具体参考[Paddleocr Package使用说明](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/doc/doc_ch/whl.md)。* + ## 2.inference模型下载 |模型名称|模型简介|检测模型地址|识别模型地址|支持空格的识别模型地址| diff --git a/doc/doc_ch/recognition.md b/doc/doc_ch/recognition.md index eda456c5c5d7573fd89de2a8ac0c1042a5b3a59b..c554b9f11c96744ae928aaf9992606a364680557 100644 --- a/doc/doc_ch/recognition.md +++ b/doc/doc_ch/recognition.md @@ -18,6 +18,8 @@ ln -sf /train_data/dataset 若您本地没有数据集,可以在官网下载 [icdar2015](http://rrc.cvc.uab.es/?ch=4&com=downloads) 
数据,用于快速验证。也可以参考[DTRB](https://github.com/clovaai/deep-text-recognition-benchmark#download-lmdb-dataset-for-traininig-and-evaluation-from-here),下载 benchmark 所需的lmdb格式数据集。 +如果希望复现SRN的论文指标,需要下载离线[增广数据](https://pan.baidu.com/s/1-HSZ-ZVdqBF2HaBZ5pRAKA),提取码: y3ry。增广数据是由MJSynth和SynthText做旋转和扰动得到的。数据下载完成后请解压到 {your_path}/PaddleOCR/train_data/data_lmdb_release/training/ 路径下。 + * 使用自己数据集: 若您希望使用自己的数据进行训练,请参考下文组织您的数据。 @@ -161,6 +163,7 @@ PaddleOCR支持训练和评估交替进行, 可以在 `configs/rec/rec_icdar15_t | rec_r34_vd_none_none_ctc.yml | Rosetta | Resnet34_vd | None | None | ctc | | rec_r34_vd_tps_bilstm_attn.yml | RARE | Resnet34_vd | tps | BiLSTM | attention | | rec_r34_vd_tps_bilstm_ctc.yml | STARNet | Resnet34_vd | tps | BiLSTM | ctc | +| rec_r50fpn_vd_none_srn.yml | SRN | Resnet50_fpn_vd | None | rnn | srn | 训练中文数据,推荐使用`rec_chinese_lite_train.yml`,如您希望尝试其他算法在中文数据集上的效果,请参考下列说明修改配置文件: diff --git a/doc/doc_ch/update.md b/doc/doc_ch/update.md index 1cd7788511c29df8934efe2c1462aaca68c9b92b..23a47df580da065af0ab62aca2c50e507f564f05 100644 --- a/doc/doc_ch/update.md +++ b/doc/doc_ch/update.md @@ -1,6 +1,8 @@ # 更新 +- 2020.8.24 支持通过whl包安装使用PaddleOCR,具体参考[Paddleocr Package使用说明](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/doc/doc_ch/whl.md) +- 2020.8.21 更新8月18日B站直播课回放和PPT,课节2,易学易用的OCR工具大礼包,[获取地址](https://aistudio.baidu.com/aistudio/education/group/info/1519) - 2020.8.16 开源文本检测算法[SAST](https://arxiv.org/abs/1908.05498)和文本识别算法[SRN](https://arxiv.org/abs/2003.12294) -- 2020.7.23 发布7月21日B站直播课回放和PPT,PaddleOCR开源大礼包全面解读,[获取地址](https://aistudio.baidu.com/aistudio/course/introduce/1519) +- 2020.7.23 发布7月21日B站直播课回放和PPT,课节1,PaddleOCR开源大礼包全面解读,[获取地址](https://aistudio.baidu.com/aistudio/course/introduce/1519) - 2020.7.15 添加基于EasyEdge和Paddle-Lite的移动端DEMO,支持iOS和Android系统 - 2020.7.15 完善预测部署,添加基于C++预测引擎推理、服务化部署和端侧部署方案,以及超轻量级中文OCR模型预测耗时Benchmark - 2020.7.15 整理OCR相关数据集、常用数据标注以及合成工具 diff --git a/doc/doc_ch/whl.md b/doc/doc_ch/whl.md new file mode 100644 index 
0000000000000000000000000000000000000000..280cc2f62ec40ec2228128c9ddd95088904f647b --- /dev/null +++ b/doc/doc_ch/whl.md @@ -0,0 +1,194 @@ +# paddleocr package使用说明 + +## 快速上手 + +### 安装whl包 + +pip安装 +```bash +pip install paddleocr +``` + +本地构建并安装 +```bash +python setup.py bdist_wheel +pip install dist/paddleocr-0.0.3-py3-none-any.whl +``` +### 1. 代码使用 + +* 检测+识别全流程 +```python +from paddleocr import PaddleOCR, draw_ocr +ocr = PaddleOCR() # need to run only once to download and load model into memory +img_path = 'PaddleOCR/doc/imgs/11.jpg' +result = ocr.ocr(img_path) +for line in result: + print(line) + +# 显示结果 +from PIL import Image +image = Image.open(img_path).convert('RGB') +boxes = [line[0] for line in result] +txts = [line[1][0] for line in result] +scores = [line[1][1] for line in result] +im_show = draw_ocr(image, boxes, txts, scores, font_path='/path/to/PaddleOCR/doc/simfang.ttf') +im_show = Image.fromarray(im_show) +im_show.save('result.jpg') +``` +结果是一个list,每个item包含了文本框,文字和识别置信度 +```bash +[[[24.0, 36.0], [304.0, 34.0], [304.0, 72.0], [24.0, 74.0]], ['纯臻营养护发素', 0.964739]] +[[[24.0, 80.0], [172.0, 80.0], [172.0, 104.0], [24.0, 104.0]], ['产品信息/参数', 0.98069626]] +[[[24.0, 109.0], [333.0, 109.0], [333.0, 136.0], [24.0, 136.0]], ['(45元/每公斤,100公斤起订)', 0.9676722]] +...... +``` +结果可视化 + +
+ +
+ +* 单独执行检测 +```python +from paddleocr import PaddleOCR, draw_ocr +ocr = PaddleOCR() # need to run only once to download and load model into memory +img_path = 'PaddleOCR/doc/imgs/11.jpg' +result = ocr.ocr(img_path,rec=False) +for line in result: + print(line) + +# 显示结果 +from PIL import Image + +image = Image.open(img_path).convert('RGB') +im_show = draw_ocr(image, result, txts=None, scores=None, font_path='/path/to/PaddleOCR/doc/simfang.ttf') +im_show = Image.fromarray(im_show) +im_show.save('result.jpg') +``` +结果是一个list,每个item只包含文本框 +```bash +[[26.0, 457.0], [137.0, 457.0], [137.0, 477.0], [26.0, 477.0]] +[[25.0, 425.0], [372.0, 425.0], [372.0, 448.0], [25.0, 448.0]] +[[128.0, 397.0], [273.0, 397.0], [273.0, 414.0], [128.0, 414.0]] +...... +``` +结果可视化 + + +
+ +
+ +* 单独执行识别 +```python +from paddleocr import PaddleOCR +ocr = PaddleOCR() # need to run only once to download and load model into memory +img_path = 'PaddleOCR/doc/imgs_words/ch/word_1.jpg' +result = ocr.ocr(img_path,det=False) +for line in result: + print(line) +``` +结果是一个list,每个item只包含识别结果和识别置信度 +```bash +['韩国小馆', 0.9907421] +``` + +### 通过命令行使用 + +查看帮助信息 +```bash +paddleocr -h +``` + +* 检测+识别全流程 +```bash +paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg +``` +结果是一个list,每个item包含了文本框,文字和识别置信度 +```bash +[[[24.0, 36.0], [304.0, 34.0], [304.0, 72.0], [24.0, 74.0]], ['纯臻营养护发素', 0.964739]] +[[[24.0, 80.0], [172.0, 80.0], [172.0, 104.0], [24.0, 104.0]], ['产品信息/参数', 0.98069626]] +[[[24.0, 109.0], [333.0, 109.0], [333.0, 136.0], [24.0, 136.0]], ['(45元/每公斤,100公斤起订)', 0.9676722]] +...... +``` + +* 单独执行检测 +```bash +paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --rec false +``` +结果是一个list,每个item只包含文本框 +```bash +[[26.0, 457.0], [137.0, 457.0], [137.0, 477.0], [26.0, 477.0]] +[[25.0, 425.0], [372.0, 425.0], [372.0, 448.0], [25.0, 448.0]] +[[128.0, 397.0], [273.0, 397.0], [273.0, 414.0], [128.0, 414.0]] +...... 
+``` + +* 单独执行识别 +```bash +paddleocr --image_dir PaddleOCR/doc/imgs_words/ch/word_1.jpg --det false +``` + +结果是一个list,每个item只包含识别结果和识别置信度 +```bash +['韩国小馆', 0.9907421] +``` + +## 自定义模型 +当内置模型无法满足需求时,需要使用到自己训练的模型。 +首先,参照[inference.md](./inference.md) 第一节转换将检测和识别模型转换为inference模型,然后按照如下方式使用 + +### 代码使用 +```python +from paddleocr import PaddleOCR, draw_ocr +# 检测模型和识别模型路径下必须含有model和params文件 +ocr = PaddleOCR(det_model_dir='{your_det_model_dir}',rec_model_dir='{your_rec_model_dir}') +img_path = 'PaddleOCR/doc/imgs/11.jpg' +result = ocr.ocr(img_path) +for line in result: + print(line) + +# 显示结果 +from PIL import Image +image = Image.open(img_path).convert('RGB') +boxes = [line[0] for line in result] +txts = [line[1][0] for line in result] +scores = [line[1][1] for line in result] +im_show = draw_ocr(image, boxes, txts, scores, font_path='/path/to/PaddleOCR/doc/simfang.ttf') +im_show = Image.fromarray(im_show) +im_show.save('result.jpg') +``` + +### 通过命令行使用 + +```bash +paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --det_model_dir {your_det_model_dir} --rec_model_dir {your_rec_model_dir} +``` + +## 参数说明 + +| 字段 | 说明 | 默认值 | +|-------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------| +| use_gpu | 是否使用GPU | TRUE | +| gpu_mem | 初始化占用的GPU内存大小 | 8000M | +| image_dir | 通过命令行调用时执行预测的图片或文件夹路径 | | +| det_algorithm | 使用的检测算法类型 | DB | +| det_model_dir | 检测模型所在文件夹。传参方式有两种,1. 
None: 自动下载内置模型到 `~/.paddleocr/det`;2.自己转换好的inference模型路径,模型路径下必须包含model和params文件 | None | +| det_max_side_len | 检测算法前向时图片长边的最大尺寸,当长边超出这个值时会将长边resize到这个大小,短边等比例缩放 | 960 | +| det_db_thresh | DB模型输出预测图的二值化阈值 | 0.3 | +| det_db_box_thresh | DB模型输出框的阈值,低于此值的预测框会被丢弃 | 0.5 | +| det_db_unclip_ratio | DB模型输出框扩大的比例 | 2 | +| det_east_score_thresh | EAST模型输出预测图的二值化阈值 | 0.8 | +| det_east_cover_thresh | EAST模型输出框的阈值,低于此值的预测框会被丢弃 | 0.1 | +| det_east_nms_thresh | EAST模型输出框NMS的阈值 | 0.2 | +| rec_algorithm | 使用的识别算法类型 | CRNN | +| rec_model_dir | 识别模型所在文件夹。传参方式有两种,1. None: 自动下载内置模型到 `~/.paddleocr/rec`;2.自己转换好的inference模型路径,模型路径下必须包含model和params文件 | None | +| rec_image_shape | 识别算法的输入图片尺寸 | "3,32,320" | +| rec_char_type | 识别算法的字符类型,中文(ch)或英文(en) | ch | +| rec_batch_num | 进行识别时,同时前向的图片数 | 30 | +| max_text_length | 识别算法能识别的最大文字长度 | 25 | +| rec_char_dict_path | 识别模型字典路径,当rec_model_dir使用方式2传参时需要修改为自己的字典路径 | ./ppocr/utils/ppocr_keys_v1.txt | +| use_space_char | 是否识别空格 | TRUE | +| enable_mkldnn | 是否启用mkldnn | FALSE | +| det | 前向时是否启动检测 | TRUE | +| rec | 前向时是否启动识别 | TRUE | diff --git a/doc/doc_en/FAQ_en.md b/doc/doc_en/FAQ_en.md index 04feb363777801088efa0425195afd9e065a5b1e..25777be77b6393c09c38e3c319ca1bd50cc3b1e8 100644 --- a/doc/doc_en/FAQ_en.md +++ b/doc/doc_en/FAQ_en.md @@ -45,9 +45,12 @@ At present, the open source model, dataset and magnitude are as follows: Among them, the public datasets are opensourced, users can search and download by themselves, or refer to [Chinese data set](./datasets_en.md), synthetic data is not opensourced, users can use open-source synthesis tools to synthesize data themselves. Current available synthesis tools include [text_renderer](https://github.com/Sanster/text_renderer), [SynthText](https://github.com/ankush-me/SynthText), [TextRecognitionDataGenerator](https://github.com/Belval/TextRecognitionDataGenerator), etc. 10. 
**Error in using the model with TPS module for prediction** -Error message: Input(X) dims[3] and Input(Grid) dims[2] should be equal, but received X dimension[3](108) != Grid dimension[2](100) +Error message: Input(X) dims[3] and Input(Grid) dims[2] should be equal, but received X dimension[3]\(108) != Grid dimension[2]\(100) Solution:TPS does not support variable shape. Please set --rec_image_shape='3,32,100' and --rec_char_type='en' -11. **Custom dictionary used during training, the recognition results show that words do not appear in the dictionary** +11. **Custom dictionary used during training, the recognition results show that words do not appear in the dictionary** +The used custom dictionary path is not set when making prediction. The solution is setting parameter `rec_char_dict_path` to the corresponding dictionary file. -The used custom dictionary path is not set when making prediction. The solution is setting parameter `rec_char_dict_path` to the corresponding dictionary file. \ No newline at end of file + +12. **Results of cpp_infer and python_inference are very different** +The versions of the exported inference model and the inference library should be the same. For example, on Windows platform, the version of the inference library that PaddlePaddle provides is 1.8, but the version of the inference model that PaddleOCR provides is 1.7, so you should export the model yourself (`tools/export_model.py`) on PaddlePaddle1.8 and then use the exported model for inference. 
diff --git a/doc/doc_en/config_en.md b/doc/doc_en/config_en.md index 66578424a60488a986eaff6fe937e4ffbc1bf59e..b54def895f0758df7cdbd089253d6acd712d2b8e 100644 --- a/doc/doc_en/config_en.md +++ b/doc/doc_en/config_en.md @@ -60,8 +60,9 @@ Take `rec_icdar15_train.yml` as an example: | beta1 | Set the exponential decay rate for the 1st moment estimates | 0.9 | \ | | beta2 | Set the exponential decay rate for the 2nd moment estimates | 0.999 | \ | | decay | Whether to use decay | \ | \ | -| function(decay) | Set the decay function | cosine_decay | Support cosine_decay and piecewise_decay | -| step_each_epoch | The number of steps in an epoch. Used in cosine_decay | 20 | Calculation :total_image_num / (batch_size_per_card * card_size) | -| total_epoch | The number of epochs. Used in cosine_decay | 1000 | Consistent with Global.epoch_num | +| function(decay) | Set the decay function | cosine_decay | Support cosine_decay, cosine_decay_warmup and piecewise_decay | +| step_each_epoch | The number of steps in an epoch. Used in cosine_decay/cosine_decay_warmup | 20 | Calculation: total_image_num / (batch_size_per_card * card_size) | +| total_epoch | The number of epochs. Used in cosine_decay/cosine_decay_warmup | 1000 | Consistent with Global.epoch_num | +| warmup_minibatch | Number of steps for linear warmup. Used in cosine_decay_warmup | 1000 | \ | | boundaries | The step intervals to reduce learning rate. Used in piecewise_decay | - | The format is list | | decay_rate | Learning rate decay rate. Used in piecewise_decay | - | \ | diff --git a/doc/doc_en/detection_en.md b/doc/doc_en/detection_en.md index f22c574b9413cc703f5802edb1a2837d5fad7ef8..08e6b63bb77ad1cb5ec4c741d6cad1d099f6c070 100644 --- a/doc/doc_en/detection_en.md +++ b/doc/doc_en/detection_en.md @@ -1,12 +1,12 @@ # TEXT DETECTION -This section uses the icdar15 dataset as an example to introduce the training, evaluation, and testing of the detection model in PaddleOCR. 
+This section uses the icdar2015 dataset as an example to introduce the training, evaluation, and testing of the detection model in PaddleOCR. ## DATA PREPARATION The icdar2015 dataset can be obtained from [official website](https://rrc.cvc.uab.es/?ch=4&com=downloads). Registration is required for downloading. Decompress the downloaded dataset to the working directory, assuming it is decompressed under PaddleOCR/train_data/. In addition, PaddleOCR organizes many scattered annotation files into two separate annotation files for train and test respectively, which can be downloaded by wget: -``` +```shell # Under the PaddleOCR path cd PaddleOCR/ wget -P ./train_data/ https://paddleocr.bj.bcebos.com/dataset/train_icdar2015_label.txt @@ -27,16 +27,19 @@ The provided annotation file format is as follow, seperated by "\t": " Image file name Image annotation information encoded by json.dumps" ch4_test_images/img_61.jpg [{"transcription": "MASA", "points": [[310, 104], [416, 141], [418, 216], [312, 179]]}, {...}] ``` -The image annotation after json.dumps() encoding is a list containing multiple dictionaries. The `points` in the dictionary represent the coordinates (x, y) of the four points of the text box, arranged clockwise from the point at the upper left corner. +The image annotation after **json.dumps()** encoding is a list containing multiple dictionaries. + +The `points` in the dictionary represent the coordinates (x, y) of the four points of the text box, arranged clockwise from the point at the upper left corner. + +`transcription` represents the text of the current text box. **When its content is "###" it means that the text box is invalid and will be skipped during training.** -`transcription` represents the text of the current text box, and this information is not needed in the text detection task. -If you want to train PaddleOCR on other datasets, you can build the annotation file according to the above format. 
+If you want to train PaddleOCR on other datasets, please build the annotation file according to the above format. ## TRAINING First download the pretrained model. The detection model of PaddleOCR currently supports two backbones, namely MobileNetV3 and ResNet50_vd. You can use the model in [PaddleClas](https://github.com/PaddlePaddle/PaddleClas/tree/master/ppcls/modeling/architectures) to replace backbone according to your needs. -``` +```shell cd PaddleOCR/ # Download the pre-trained model of MobileNetV3 wget -P ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/MobileNetV3_large_x0_5_pretrained.tar @@ -44,7 +47,7 @@ wget -P ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/Mob wget -P ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_vd_ssld_pretrained.tar # decompressing the pre-training model file, take MobileNetV3 as an example -tar xf ./pretrain_models/MobileNetV3_large_x0_5_pretrained.tar ./pretrain_models/ +tar -xf ./pretrain_models/MobileNetV3_large_x0_5_pretrained.tar ./pretrain_models/ # Note: After decompressing the backbone pre-training weight file correctly, the file list in the folder is as follows: ./pretrain_models/MobileNetV3_large_x0_5_pretrained/ @@ -56,9 +59,9 @@ tar xf ./pretrain_models/MobileNetV3_large_x0_5_pretrained.tar ./pretrain_models ``` -**START TRAINING** +#### START TRAINING *If CPU version installed, please set the parameter `use_gpu` to `false` in the configuration.* -``` +```shell python3 tools/train.py -c configs/det/det_mv3_db.yml ``` @@ -66,19 +69,19 @@ In the above instruction, use `-c` to select the training to use the `configs/de For a detailed explanation of the configuration file, please refer to [config](./config_en.md). You can also use `-o` to change the training parameters without modifying the yml file. 
For example, adjust the training learning rate to 0.0001 -``` +```shell python3 tools/train.py -c configs/det/det_mv3_db.yml -o Optimizer.base_lr=0.0001 ``` -**load trained model and conntinue training** +#### load trained model and continue training If you expect to load trained model and continue the training again, you can specify the parameter `Global.checkpoints` as the model path to be loaded. For example: -``` +```shell python3 tools/train.py -c configs/det/det_mv3_db.yml -o Global.checkpoints=./your/trained/model ``` -**Note**:The priority of `Global.checkpoints` is higher than that of `Global.pretrain_weights`, that is, when two parameters are specified at the same time, the model specified by Global.checkpoints will be loaded first. If the model path specified by `Global.checkpoints` is wrong, the one specified by `Global.pretrain_weights` will be loaded. +**Note**: The priority of `Global.checkpoints` is higher than that of `Global.pretrain_weights`, that is, when two parameters are specified at the same time, the model specified by `Global.checkpoints` will be loaded first. If the model path specified by `Global.checkpoints` is wrong, the one specified by `Global.pretrain_weights` will be loaded. ## EVALUATION @@ -89,7 +92,7 @@ Run the following code to calculate the evaluation indicators. The result will b When evaluating, set post-processing parameters `box_thresh=0.6`, `unclip_ratio=1.5`. If you use different datasets, different models for training, these two parameters should be adjusted for better result. -``` +```shell python3 tools/eval.py -c configs/det/det_mv3_db.yml -o Global.checkpoints="{path/to/weights}/best_accuracy" PostProcess.box_thresh=0.6 PostProcess.unclip_ratio=1.5 ``` The model parameters during training are saved in the `Global.save_model_dir` directory by default. When evaluating indicators, you need to set `Global.checkpoints` to point to the saved parameter file. 
diff --git a/doc/doc_en/inference_en.md b/doc/doc_en/inference_en.md index 58f2e3fb312af862f1eff4969772c8937d5cb767..83ec2a90c45a320815e10e8572d894068c0b5130 100644 --- a/doc/doc_en/inference_en.md +++ b/doc/doc_en/inference_en.md @@ -1,7 +1,7 @@ # Reasoning based on Python prediction engine -The inference model (the model saved by fluid.io.save_inference_model) is generally a solidified model saved after the model training is completed, and is mostly used to give prediction in deployment. +The inference model (the model saved by `fluid.io.save_inference_model`) is generally a solidified model saved after the model training is completed, and is mostly used to give prediction in deployment. The model saved during the training process is the checkpoints model, which saves the parameters of the model and is mostly used to resume training. @@ -9,7 +9,31 @@ Compared with the checkpoints model, the inference model will additionally save Next, we first introduce how to convert a trained model into an inference model, and then we will introduce text detection, text recognition, and the concatenation of them based on inference model. +- [CONVERT TRAINING MODEL TO INFERENCE MODEL](#CONVERT) + - [Convert detection model to inference model](#Convert_detection_model) + - [Convert recognition model to inference model](#Convert_recognition_model) + + +- [TEXT DETECTION MODEL INFERENCE](#DETECTION_MODEL_INFERENCE) + - [1. LIGHTWEIGHT CHINESE DETECTION MODEL INFERENCE](#LIGHTWEIGHT_DETECTION) + - [2. DB TEXT DETECTION MODEL INFERENCE](#DB_DETECTION) + - [3. EAST TEXT DETECTION MODEL INFERENCE](#EAST_DETECTION) + - [4. SAST TEXT DETECTION MODEL INFERENCE](#SAST_DETECTION) + +- [TEXT RECOGNITION MODEL INFERENCE](#RECOGNITION_MODEL_INFERENCE) + - [1. LIGHTWEIGHT CHINESE MODEL](#LIGHTWEIGHT_RECOGNITION) + - [2. CTC-BASED TEXT RECOGNITION MODEL INFERENCE](#CTC-BASED_RECOGNITION) + - [3. ATTENTION-BASED TEXT RECOGNITION MODEL INFERENCE](#ATTENTION-BASED_RECOGNITION) + - [4. 
TEXT RECOGNITION MODEL INFERENCE USING CUSTOM CHARACTERS DICTIONARY](#USING_CUSTOM_CHARACTERS) + + +- [TEXT DETECTION AND RECOGNITION INFERENCE CONCATENATION](#CONCATENATION) + - [1. LIGHTWEIGHT CHINESE MODEL](#LIGHTWEIGHT_CHINESE_MODEL) + - [2. OTHER MODELS](#OTHER_MODELS) + + ## CONVERT TRAINING MODEL TO INFERENCE MODEL + ### Convert detection model to inference model Download the lightweight Chinese detection model: @@ -35,6 +59,7 @@ inference/det_db/ └─ params Check the parameter file of the inference model ``` + ### Convert recognition model to inference model Download the lightweight Chinese recognition model: @@ -62,11 +87,13 @@ After the conversion is successful, there are two files in the directory: └─ params Identify the parameter files of the inference model ``` + ## TEXT DETECTION MODEL INFERENCE The following will introduce the lightweight Chinese detection model inference, DB text detection model inference and EAST text detection model inference. The default configuration is based on the inference setting of the DB text detection model. Because EAST and DB algorithms are very different, when inference, it is necessary to **adapt the EAST text detection algorithm by passing in corresponding parameters**. + ### 1. LIGHTWEIGHT CHINESE DETECTION MODEL INFERENCE For lightweight Chinese detection model inference, you can execute the following commands: @@ -90,6 +117,7 @@ If you want to use the CPU for prediction, execute the command as follows python3 tools/infer/predict_det.py --image_dir="./doc/imgs/2.jpg" --det_model_dir="./inference/det_db/" --use_gpu=False ``` + ### 2. DB TEXT DETECTION MODEL INFERENCE First, convert the model saved in the DB text detection training process into an inference model. 
Taking the model based on the Resnet50_vd backbone network and trained on the ICDAR2015 English dataset as an example ([model download link](https://paddleocr.bj.bcebos.com/det_r50_vd_db.tar)), you can use the following command to convert: @@ -114,6 +142,7 @@ The visualized text detection results are saved to the `./inference_results` fol **Note**: Since the ICDAR2015 dataset has only 1,000 training images, mainly for English scenes, the above model has very poor detection result on Chinese text images. + ### 3. EAST TEXT DETECTION MODEL INFERENCE First, convert the model saved in the EAST text detection training process into an inference model. Taking the model based on the Resnet50_vd backbone network and trained on the ICDAR2015 English dataset as an example ([model download link](https://paddleocr.bj.bcebos.com/det_r50_vd_east.tar)), you can use the following command to convert: @@ -126,23 +155,64 @@ First, convert the model saved in the EAST text detection training process into python3 tools/export_model.py -c configs/det/det_r50_vd_east.yml -o Global.checkpoints="./models/det_r50_vd_east/best_accuracy" Global.save_inference_dir="./inference/det_east" ``` -For EAST text detection model inference, you need to set the parameter det_algorithm, specify the detection algorithm type to EAST, run the following command: +**For EAST text detection model inference, you need to set the parameter ``--det_algorithm="EAST"``**, run the following command: ``` python3 tools/infer/predict_det.py --image_dir="./doc/imgs_en/img_10.jpg" --det_model_dir="./inference/det_east/" --det_algorithm="EAST" ``` + The visualized text detection results are saved to the `./inference_results` folder by default, and the name of the result file is prefixed with 'det_res'. Examples of results are as follows: ![](../imgs_results/det_res_img_10_east.jpg) -**Note**: The Python version of NMS in EAST post-processing used in this codebase so the prediction speed is quite slow. 
If you use the C++ version, there will be a significant speedup. +**Note**: EAST post-processing locality aware NMS has two versions: Python and C++. The speed of C++ version is obviously faster than that of Python version. Due to the compilation version problem of NMS of C++ version, C++ version NMS will be called only in Python 3.5 environment, and python version NMS will be called in other cases. + + + +### 4. SAST TEXT DETECTION MODEL INFERENCE +#### (1). Quadrangle text detection model (ICDAR2015) +First, convert the model saved in the SAST text detection training process into an inference model. Taking the model based on the Resnet50_vd backbone network and trained on the ICDAR2015 English dataset as an example ([model download link](https://paddleocr.bj.bcebos.com/SAST/sast_r50_vd_icdar2015.tar)), you can use the following command to convert: + +``` +python3 tools/export_model.py -c configs/det/det_r50_vd_sast_icdar15.yml -o Global.checkpoints="./models/sast_r50_vd_icdar2015/best_accuracy" Global.save_inference_dir="./inference/det_sast_ic15" +``` + +**For SAST quadrangle text detection model inference, you need to set the parameter `--det_algorithm="SAST"`**, run the following command: + +``` +python3 tools/infer/predict_det.py --det_algorithm="SAST" --image_dir="./doc/imgs_en/img_10.jpg" --det_model_dir="./inference/det_sast_ic15/" +``` + +The visualized text detection results are saved to the `./inference_results` folder by default, and the name of the result file is prefixed with 'det_res'. Examples of results are as follows: + +![](../imgs_results/det_res_img_10_sast.jpg) +#### (2). Curved text detection model (Total-Text) +First, convert the model saved in the SAST text detection training process into an inference model. 
Taking the model based on the Resnet50_vd backbone network and trained on the Total-Text English dataset as an example ([model download link](https://paddleocr.bj.bcebos.com/SAST/sast_r50_vd_total_text.tar)), you can use the following command to convert: +``` +python3 tools/export_model.py -c configs/det/det_r50_vd_sast_totaltext.yml -o Global.checkpoints="./models/sast_r50_vd_total_text/best_accuracy" Global.save_inference_dir="./inference/det_sast_tt" +``` + +**For SAST curved text detection model inference, you need to set the parameter `--det_algorithm="SAST"` and `--det_sast_polygon=True`**, run the following command: + +``` +python3 tools/infer/predict_det.py --det_algorithm="SAST" --image_dir="./doc/imgs_en/img623.jpg" --det_model_dir="./inference/det_sast_tt/" --det_sast_polygon=True +``` + +The visualized text detection results are saved to the `./inference_results` folder by default, and the name of the result file is prefixed with 'det_res'. Examples of results are as follows: + +![](../imgs_results/det_res_img623_sast.jpg) + +**Note**: SAST post-processing locality aware NMS has two versions: Python and C++. The speed of C++ version is obviously faster than that of Python version. Due to the compilation version problem of NMS of C++ version, C++ version NMS will be called only in Python 3.5 environment, and python version NMS will be called in other cases. + + ## TEXT RECOGNITION MODEL INFERENCE The following will introduce the lightweight Chinese recognition model inference, other CTC-based and Attention-based text recognition models inference. For Chinese text recognition, it is recommended to choose the recognition model based on CTC loss. In practice, it is also found that the result of the model based on Attention loss is not as good as the one based on CTC loss. In addition, if the characters dictionary is modified during training, make sure that you use the same characters set during inferencing. Please check below for details. + ### 1. 
LIGHTWEIGHT CHINESE TEXT RECOGNITION MODEL REFERENCE For lightweight Chinese recognition model inference, you can execute the following commands: @@ -158,6 +228,7 @@ After executing the command, the prediction results (recognized text and score) Predicts of ./doc/imgs_words/ch/word_4.jpg:['实力活力', 0.89552695] + ### 2. CTC-BASED TEXT RECOGNITION MODEL INFERENCE Taking STAR-Net as an example, we introduce the recognition model inference based on CTC loss. CRNN and Rosetta are used in a similar way, by setting the recognition algorithm parameter `rec_algorithm`. @@ -178,6 +249,7 @@ For STAR-Net text recognition model inference, execute the following commands: python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words_en/word_336.png" --rec_model_dir="./inference/starnet/" --rec_image_shape="3, 32, 100" --rec_char_type="en" ``` + ### 3. ATTENTION-BASED TEXT RECOGNITION MODEL INFERENCE ![](../imgs_words_en/word_336.png) @@ -196,6 +268,7 @@ self.character_str = "0123456789abcdefghijklmnopqrstuvwxyz" dict_character = list(self.character_str) ``` + ### 4. TEXT RECOGNITION MODEL INFERENCE USING CUSTOM CHARACTERS DICTIONARY If the chars dictionary is modified during training, you need to specify the new dictionary path by setting the parameter `rec_char_dict_path` when using your inference model to predict. @@ -203,8 +276,10 @@ If the chars dictionary is modified during training, you need to specify the new python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words_en/word_336.png" --rec_model_dir="./your inference model" --rec_image_shape="3, 32, 100" --rec_char_type="en" --rec_char_dict_path="your text dict path" ``` + ## TEXT DETECTION AND RECOGNITION INFERENCE CONCATENATION + ### 1. 
LIGHTWEIGHT CHINESE MODEL When performing prediction, you need to specify the path of a single image or a folder of images through the parameter `image_dir`, the parameter `det_model_dir` specifies the path to detect the inference model, and the parameter `rec_model_dir` specifies the path to identify the inference model. The visualized recognition results are saved to the `./inference_results` folder by default. @@ -217,9 +292,14 @@ After executing the command, the recognition result image is as follows: ![](../imgs_results/2.jpg) + ### 2. OTHER MODELS -If you want to try other detection algorithms or recognition algorithms, please refer to the above text detection model inference and text recognition model inference, update the corresponding configuration and model, the following command uses the combination of the EAST text detection and STAR-Net text recognition: +If you want to try other detection algorithms or recognition algorithms, please refer to the above text detection model inference and text recognition model inference, update the corresponding configuration and model. + +**Note: due to the limitation of rotation logic of detected box, SAST curved text detection model (using the parameter `det_sast_polygon=True`) is not supported for model combination yet.** + +The following command uses the combination of the EAST text detection and STAR-Net text recognition: ``` python3 tools/infer/predict_system.py --image_dir="./doc/imgs_en/img_10.jpg" --det_model_dir="./inference/det_east/" --det_algorithm="EAST" --rec_model_dir="./inference/starnet/" --rec_image_shape="3, 32, 100" --rec_char_type="en" diff --git a/doc/doc_en/quickstart_en.md b/doc/doc_en/quickstart_en.md index bf22f22fee75a028e5f5effd6f7e36b08c194222..d1fa1683fcfea14be477c910fb2a8dc7709c5d36 100644 --- a/doc/doc_en/quickstart_en.md +++ b/doc/doc_en/quickstart_en.md @@ -5,6 +5,7 @@ Please refer to [quick installation](./installation_en.md) to configure the PaddleOCR operating environment. 
+*Note: Support the use of PaddleOCR through whl package installation, please refer to [PaddleOCR Package](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/doc/doc_en/whl_en.md).* ## 2.inference models diff --git a/doc/doc_en/recognition_en.md b/doc/doc_en/recognition_en.md index 9b34c4309ff22b57d3a0258b742d081fcc14f716..b9c42afada05950539d4dbfb45e133af946b1aae 100644 --- a/doc/doc_en/recognition_en.md +++ b/doc/doc_en/recognition_en.md @@ -18,6 +18,8 @@ ln -sf /train_data/dataset If you do not have a dataset locally, you can download it on the official website [icdar2015](http://rrc.cvc.uab.es/?ch=4&com=downloads). Also refer to [DTRB](https://github.com/clovaai/deep-text-recognition-benchmark#download-lmdb-dataset-for-traininig-and-evaluation-from-here),download the lmdb format dataset required for benchmark +If you want to reproduce the results reported in the SRN paper, you need to download the offline [augmented data](https://pan.baidu.com/s/1-HSZ-ZVdqBF2HaBZ5pRAKA), extraction code: y3ry. The augmented data is obtained by rotation and perturbation of mjsynth and synthtext. Please unzip the data to {your_path}/PaddleOCR/train_data/data_lmdb_Release/training/path. + * Use your own dataset: If you want to use your own data for training, please refer to the following to organize your data. 
diff --git a/doc/doc_en/update_en.md b/doc/doc_en/update_en.md index dc839d8955afcfa2d1efbee5e02d35f384d6c627..ca050370989ba3cded8c7211b7ab297ebe239c5f 100644 --- a/doc/doc_en/update_en.md +++ b/doc/doc_en/update_en.md @@ -1,4 +1,5 @@ # RECENT UPDATES +- 2020.8.24 Support the use of PaddleOCR through whl package installation, please refer to [PaddleOCR Package](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/doc/doc_en/whl_en.md) - 2020.8.16 Release text detection algorithm [SAST](https://arxiv.org/abs/1908.05498) and text recognition algorithm [SRN](https://arxiv.org/abs/2003.12294) - 2020.7.23, Release the playback and PPT of live class on BiliBili station, PaddleOCR Introduction, [address](https://aistudio.baidu.com/aistudio/course/introduce/1519) - 2020.7.15, Add mobile App demo , support both iOS and Android ( based on easyedge and Paddle Lite) diff --git a/doc/doc_en/whl_en.md b/doc/doc_en/whl_en.md new file mode 100644 index 0000000000000000000000000000000000000000..73ab78c111fd4c59a7866ba061877cc91100fb93 --- /dev/null +++ b/doc/doc_en/whl_en.md @@ -0,0 +1,199 @@ +# paddleocr package + +## Get started quickly +### install package +install by pypi +```bash +pip install paddleocr +``` + +build own whl package and install +```bash +python setup.py bdist_wheel +pip install dist/paddleocr-0.0.3-py3-none-any.whl +``` +### 1. 
Use by code + +* detection and recognition +```python +from paddleocr import PaddleOCR,draw_ocr +ocr = PaddleOCR() # need to run only once to download and load model into memory +img_path = 'PaddleOCR/doc/imgs_en/img_12.jpg' +result = ocr.ocr(img_path) +for line in result: + print(line) + +# draw result +from PIL import Image +image = Image.open(img_path).convert('RGB') +boxes = [line[0] for line in result] +txts = [line[1][0] for line in result] +scores = [line[1][1] for line in result] +im_show = draw_ocr(image, boxes, txts, scores, font_path='/path/to/PaddleOCR/doc/simfang.ttf') +im_show = Image.fromarray(im_show) +im_show.save('result.jpg') +``` + +Output will be a list, each item contains bounding box, text and recognition confidence +```bash +[[[442.0, 173.0], [1169.0, 173.0], [1169.0, 225.0], [442.0, 225.0]], ['ACKNOWLEDGEMENTS', 0.99283075]] +[[[393.0, 340.0], [1207.0, 342.0], [1207.0, 389.0], [393.0, 387.0]], ['We would like to thank all the designers and', 0.9357758]] +[[[399.0, 398.0], [1204.0, 398.0], [1204.0, 433.0], [399.0, 433.0]], ['contributors whohave been involved in the', 0.9592447]] +...... +``` + +Visualization of results + +
+ +
+ +* only detection +```python +from paddleocr import PaddleOCR,draw_ocr +ocr = PaddleOCR() # need to run only once to download and load model into memory +img_path = 'PaddleOCR/doc/imgs_en/img_12.jpg' +result = ocr.ocr(img_path,rec=False) +for line in result: + print(line) + +# draw result +from PIL import Image + +image = Image.open(img_path).convert('RGB') +im_show = draw_ocr(image, result, txts=None, scores=None, font_path='/path/to/PaddleOCR/doc/simfang.ttf') +im_show = Image.fromarray(im_show) +im_show.save('result.jpg') +``` + +Output will be a list, each item only contains bounding box +```bash +[[756.0, 812.0], [805.0, 812.0], [805.0, 830.0], [756.0, 830.0]] +[[820.0, 803.0], [1085.0, 801.0], [1085.0, 836.0], [820.0, 838.0]] +[[393.0, 801.0], [715.0, 805.0], [715.0, 839.0], [393.0, 836.0]] +...... +``` + +Visualization of results + +
+ +
+ +* only recognition +```python +from paddleocr import PaddleOCR +ocr = PaddleOCR() # need to run only once to load model into memory +img_path = 'PaddleOCR/doc/imgs_words_en/word_10.png' +result = ocr.ocr(img_path,det=False) +for line in result: + print(line) +``` + +Output will be a list, each item contains text and recognition confidence +```bash +['PAIN', 0.990372] +``` + +### Use by command line + +show help information +```bash +paddleocr -h +``` + +* detection and recognition +```bash +paddleocr --image_dir PaddleOCR/doc/imgs_en/img_12.jpg +``` + +Output will be a list, each item contains bounding box, text and recognition confidence +```bash +[[[442.0, 173.0], [1169.0, 173.0], [1169.0, 225.0], [442.0, 225.0]], ['ACKNOWLEDGEMENTS', 0.99283075]] +[[[393.0, 340.0], [1207.0, 342.0], [1207.0, 389.0], [393.0, 387.0]], ['We would like to thank all the designers and', 0.9357758]] +[[[399.0, 398.0], [1204.0, 398.0], [1204.0, 433.0], [399.0, 433.0]], ['contributors whohave been involved in the', 0.9592447]] +...... +``` + +* only detection +```bash +paddleocr --image_dir PaddleOCR/doc/imgs_en/img_12.jpg --rec false +``` + +Output will be a list, each item only contains bounding box +```bash +[[756.0, 812.0], [805.0, 812.0], [805.0, 830.0], [756.0, 830.0]] +[[820.0, 803.0], [1085.0, 801.0], [1085.0, 836.0], [820.0, 838.0]] +[[393.0, 801.0], [715.0, 805.0], [715.0, 839.0], [393.0, 836.0]] +...... +``` + +* only recognition +```bash +paddleocr --image_dir PaddleOCR/doc/imgs_words_en/word_10.png --det false +``` + +Output will be a list, each item contains text and recognition confidence +```bash +['PAIN', 0.990372] +``` + +## Use custom model +When the built-in model cannot meet the needs, you need to use your own trained model. +First, refer to the first section of [inference_en.md](./inference_en.md) to convert your det and rec model to inference model, and then use it as follows + +### 1. 
Use by code + +```python +from paddleocr import PaddleOCR,draw_ocr +# The path of detection and recognition model must contain model and params files +ocr = PaddleOCR(det_model_dir='{your_det_model_dir}',rec_model_dir='{your_rec_model_dir}') +img_path = 'PaddleOCR/doc/imgs_en/img_12.jpg' +result = ocr.ocr(img_path) +for line in result: + print(line) + +# draw result +from PIL import Image +image = Image.open(img_path).convert('RGB') +boxes = [line[0] for line in result] +txts = [line[1][0] for line in result] +scores = [line[1][1] for line in result] +im_show = draw_ocr(image, boxes, txts, scores, font_path='/path/to/PaddleOCR/doc/simfang.ttf') +im_show = Image.fromarray(im_show) +im_show.save('result.jpg') +``` + +### Use by command line + +```bash +paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --det_model_dir {your_det_model_dir} --rec_model_dir {your_rec_model_dir} +``` + +## Parameter Description + +| Parameter | Description | Default value | +|-------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------| +| use_gpu | use GPU or not | TRUE | +| gpu_mem | GPU memory size used for initialization | 8000M | +| image_dir | The images path or folder path for predicting when used by the command line | | +| det_algorithm | Type of detection algorithm selected | DB | +| det_model_dir | the text detection inference model folder. There are two ways to transfer parameters, 1. None: Automatically download the built-in model to `~/.paddleocr/det`; 2. The path of the inference model converted by yourself, the model and params files must be included in the model path | None | +| det_max_side_len | The maximum size of the long side of the image. 
When the long side exceeds this value, the long side will be resized to this size, and the short side will be scaled proportionally | 960 | +| det_db_thresh | Binarization threshold value of DB output map | 0.3 | +| det_db_box_thresh | The threshold value of the DB output box. Boxes score lower than this value will be discarded | 0.5 | +| det_db_unclip_ratio | The expanded ratio of DB output box | 2 | +| det_east_score_thresh | Binarization threshold value of EAST output map | 0.8 | +| det_east_cover_thresh | The threshold value of the EAST output box. Boxes score lower than this value will be discarded | 0.1 | +| det_east_nms_thresh | The NMS threshold value of EAST model output box | 0.2 | +| rec_algorithm | Type of recognition algorithm selected | CRNN | +| rec_model_dir | the text recognition inference model folder. There are two ways to transfer parameters, 1. None: Automatically download the built-in model to `~/.paddleocr/rec`; 2. The path of the inference model converted by yourself, the model and params files must be included in the model path | None | +| rec_image_shape | image shape of recognition algorithm | "3,32,320" | +| rec_char_type | Character type of recognition algorithm, Chinese (ch) or English (en) | ch | +| rec_batch_num | When performing recognition, the batchsize of forward images | 30 | +| max_text_length | The maximum text length that the recognition algorithm can recognize | 25 | +| rec_char_dict_path | the alphabet path which needs to be modified to your own path when `rec_model_dir` uses mode 2 | ./ppocr/utils/ppocr_keys_v1.txt | +| use_space_char | Whether to recognize spaces | TRUE | +| enable_mkldnn | Whether to enable mkldnn | FALSE | +| det | Enable detection when `ppocr.ocr` func exec | TRUE | +| rec | Enable recognition when `ppocr.ocr` func exec | TRUE | diff --git a/doc/imgs_en/img623.jpg b/doc/imgs_en/img623.jpg new file mode 100755 index 0000000000000000000000000000000000000000..2fae1b5b1ca1d557355933e93bbe268d8bba6778 Binary 
files /dev/null and b/doc/imgs_en/img623.jpg differ diff --git a/doc/imgs_results/det_res_img623_sast.jpg b/doc/imgs_results/det_res_img623_sast.jpg new file mode 100644 index 0000000000000000000000000000000000000000..b2dd538f7729724a33516091d11c081c7c2c1bd7 Binary files /dev/null and b/doc/imgs_results/det_res_img623_sast.jpg differ diff --git a/doc/imgs_results/det_res_img_10_sast.jpg b/doc/imgs_results/det_res_img_10_sast.jpg new file mode 100644 index 0000000000000000000000000000000000000000..c63faf1354601f25cedb57a3b87f4467999f5457 Binary files /dev/null and b/doc/imgs_results/det_res_img_10_sast.jpg differ diff --git a/doc/imgs_results/whl/11_det.jpg b/doc/imgs_results/whl/11_det.jpg new file mode 100644 index 0000000000000000000000000000000000000000..fe0cd23cc24457f5d7084fff0c63c239d09c9969 Binary files /dev/null and b/doc/imgs_results/whl/11_det.jpg differ diff --git a/doc/imgs_results/whl/11_det_rec.jpg b/doc/imgs_results/whl/11_det_rec.jpg new file mode 100644 index 0000000000000000000000000000000000000000..31c566478fd874d10a61dcd54635453e34c20e4c Binary files /dev/null and b/doc/imgs_results/whl/11_det_rec.jpg differ diff --git a/doc/imgs_results/whl/12_det.jpg b/doc/imgs_results/whl/12_det.jpg new file mode 100644 index 0000000000000000000000000000000000000000..1d5ccf2a6b5d3fa9516560e0cb2646ad6b917da6 Binary files /dev/null and b/doc/imgs_results/whl/12_det.jpg differ diff --git a/doc/imgs_results/whl/12_det_rec.jpg b/doc/imgs_results/whl/12_det_rec.jpg new file mode 100644 index 0000000000000000000000000000000000000000..9db8b57e1279362db2c9f3d6a3ba36b77bf13775 Binary files /dev/null and b/doc/imgs_results/whl/12_det_rec.jpg differ diff --git a/docker/hubserving/README.md b/docker/hubserving/README.md new file mode 100644 index 0000000000000000000000000000000000000000..71e2377dcc4f7524384752b95c53f02471353f34 --- /dev/null +++ b/docker/hubserving/README.md @@ -0,0 +1,58 @@ +English | [简体中文](README_cn.md) + +## Introduction +Many user hopes package the 
PaddleOCR service into a Docker image, so that it can be quickly released and used in the docker or k8s environment. + +This page provides some standardized code to achieve this goal. You can quickly publish the PaddleOCR project into a callable Restful API service through the following steps. (At present, the deployment based on the HubServing mode is implemented first, and the author plans to add the deployment of the PaddleServing mode in the future) + +## 1. Prerequisites + +You need to install the following basic components first: +a. Docker +b. Graphics driver and CUDA 10.0+(GPU) +c. NVIDIA Container Toolkit(GPU,Docker 19.03+ can skip this) +d. cuDNN 7.6+(GPU) + +## 2. Build Image +a. Download PaddleOCR source code +``` +git clone https://github.com/PaddlePaddle/PaddleOCR.git +``` +b. Go to the Dockerfile directory(ps:Need to distinguish between cpu and gpu version, the following takes cpu as an example, gpu version needs to replace the keyword) +``` +cd docker/cpu +``` +c. Build image +``` +docker build -t paddleocr:cpu . +``` + +## 3. Start container +a. CPU version +``` +sudo docker run -dp 8866:8866 --name paddle_ocr paddleocr:cpu +``` +b. GPU version (based on NVIDIA Container Toolkit) +``` +sudo nvidia-docker run -dp 8866:8866 --name paddle_ocr paddleocr:gpu +``` +c. GPU version (Docker 19.03+) +``` +sudo docker run -dp 8866:8866 --gpus all --name paddle_ocr paddleocr:gpu +``` +d. Check service status(If you can see the following statement then it means completed:Successfully installed ocr_system && Running on http://0.0.0.0:8866/) +``` +docker logs -f paddle_ocr +``` + +## 4. Test +a. Calculate the Base64 encoding of the picture to be recognized (if you just test, you can use a free online tool, like:https://freeonlinetools24.com/base64-image/) +b. 
Post a service request(sample request in sample_request.txt) + +``` +curl -H "Content-Type:application/json" -X POST --data "{\"images\": [\"Input image Base64 encode(need to delete the code 'data:image/jpg;base64,')\"]}" http://localhost:8866/predict/ocr_system +``` +c. Get response(If the call is successful, the following result will be returned) +``` +{"msg":"","results":[[{"confidence":0.8403433561325073,"text":"约定","text_region":[[345,377],[641,390],[634,540],[339,528]]},{"confidence":0.8131805658340454,"text":"最终相遇","text_region":[[356,532],[624,530],[624,596],[356,598]]}]],"status":"0"} +``` diff --git a/docker/hubserving/README_cn.md b/docker/hubserving/README_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..9b9e5f50f5b22f3a2125a656112a20542010ac68 --- /dev/null +++ b/docker/hubserving/README_cn.md @@ -0,0 +1,57 @@ +[English](README.md) | 简体中文 + +## Docker化部署服务 +在日常项目应用中,相信大家一般都会希望能通过Docker技术,把PaddleOCR服务打包成一个镜像,以便在Docker或k8s环境里,快速发布上线使用。 + +本文将提供一些标准化的代码来实现这样的目标。大家通过如下步骤可以把PaddleOCR项目快速发布成可调用的Restful API服务。(目前暂时先实现了基于HubServing模式的部署,后续作者计划增加PaddleServing模式的部署) + +## 1.实施前提准备 + +需要先完成如下基本组件的安装: +a. Docker环境 +b. 显卡驱动和CUDA 10.0+(GPU) +c. NVIDIA Container Toolkit(GPU,Docker 19.03以上版本可以跳过此步) +d. cuDNN 7.6+(GPU) + +## 2.制作镜像 +a.下载PaddleOCR项目代码 +``` +git clone https://github.com/PaddlePaddle/PaddleOCR.git +``` +b.切换至Dockerfile目录(注:需要区分cpu或gpu版本,下文以cpu为例,gpu版本需要替换一下关键字即可) +``` +cd docker/cpu +``` +c.生成镜像 +``` +docker build -t paddleocr:cpu . +``` + +## 3.启动Docker容器 +a. CPU 版本 +``` +sudo docker run -dp 8866:8866 --name paddle_ocr paddleocr:cpu +``` +b. GPU 版本 (通过NVIDIA Container Toolkit) +``` +sudo nvidia-docker run -dp 8866:8866 --name paddle_ocr paddleocr:gpu +``` +c. GPU 版本 (Docker 19.03以上版本,可以直接用如下命令) +``` +sudo docker run -dp 8866:8866 --gpus all --name paddle_ocr paddleocr:gpu +``` +d. 检查服务运行情况(出现:Successfully installed ocr_system和Running on http://0.0.0.0:8866/等信息,表示运行成功) +``` +docker logs -f paddle_ocr +``` + +## 4.测试服务 +a. 
计算待识别图片的Base64编码(如果只是测试一下效果,可以通过免费的在线工具实现,如:http://tool.chinaz.com/tools/imgtobase/) +b. 发送服务请求(可参见sample_request.txt中的值) +``` +curl -H "Content-Type:application/json" -X POST --data "{\"images\": [\"填入图片Base64编码(需要删除'data:image/jpg;base64,')\"]}" http://localhost:8866/predict/ocr_system +``` +c. 返回结果(如果调用成功,会返回如下结果) +``` +{"msg":"","results":[[{"confidence":0.8403433561325073,"text":"约定","text_region":[[345,377],[641,390],[634,540],[339,528]]},{"confidence":0.8131805658340454,"text":"最终相遇","text_region":[[356,532],[624,530],[624,596],[356,598]]}]],"status":"0"} +``` diff --git a/docker/hubserving/readme.md b/docker/hubserving/readme.md index 109e6aa6a536c146095b8f46516b1c895dc08337..71e2377dcc4f7524384752b95c53f02471353f34 100644 --- a/docker/hubserving/readme.md +++ b/docker/hubserving/readme.md @@ -1,55 +1,58 @@ -# Docker化部署服务 -在日常项目应用中,相信大家一般都会希望能通过Docker技术,把PaddleOCR服务打包成一个镜像,以便在Docker或k8s环境里,快速发布上线使用。 +English | [简体中文](README_cn.md) -本文将提供一些标准化的代码来实现这样的目标。大家通过如下步骤可以把PaddleOCR项目快速发布成可调用的Restful API服务。(目前暂时先实现了基于HubServing模式的部署,后续作者计划增加PaddleServing模式的部署) +## Introduction +Many user hopes package the PaddleOCR service into an docker image, so that it can be quickly released and used in the docker or k8s environment. -## 1.实施前提准备 +This page provide some standardized code to achieve this goal. You can quickly publish the PaddleOCR project into a callable Restful API service through the following steps. (At present, the deployment based on the HubServing mode is implemented first, and author plans to increase the deployment of the PaddleServing mode in the futrue) -需要先完成如下基本组件的安装: -a. Docker环境 -b. 显卡驱动和CUDA 10.0+(GPU) -c. NVIDIA Container Toolkit(GPU,Docker 19.03以上版本可以跳过此步) +## 1. Prerequisites + +You need to install the following basic components first: +a. Docker +b. Graphics driver and CUDA 10.0+(GPU) +c. NVIDIA Container Toolkit(GPU,Docker 19.03+ can skip this) d. cuDNN 7.6+(GPU) -## 2.制作镜像 -a.下载PaddleOCR项目代码 +## 2. Build Image +a. 
Download PaddleOCR sourcecode ``` git clone https://github.com/PaddlePaddle/PaddleOCR.git ``` -b.切换至Dockerfile目录(注:需要区分cpu或gpu版本,下文以cpu为例,gpu版本需要替换一下关键字即可) +b. Goto Dockerfile directory(ps:Need to distinguish between cpu and gpu version, the following takes cpu as an example, gpu version needs to replace the keyword) ``` cd docker/cpu ``` -c.生成镜像 +c. Build image ``` docker build -t paddleocr:cpu . ``` -## 3.启动Docker容器 -a. CPU 版本 +## 3. Start container +a. CPU version ``` sudo docker run -dp 8866:8866 --name paddle_ocr paddleocr:cpu ``` -b. GPU 版本 (通过NVIDIA Container Toolkit) +b. GPU version (base on NVIDIA Container Toolkit) ``` sudo nvidia-docker run -dp 8866:8866 --name paddle_ocr paddleocr:gpu ``` -c. GPU 版本 (Docker 19.03以上版本,可以直接用如下命令) +c. GPU version (Docker 19.03++) ``` sudo docker run -dp 8866:8866 --gpus all --name paddle_ocr paddleocr:gpu ``` -d. 检查服务运行情况(出现:Successfully installed ocr_system和Running on http://0.0.0.0:8866/等信息,表示运行成功) +d. Check service status(If you can see the following statement then it means completed:Successfully installed ocr_system && Running on http://0.0.0.0:8866/) ``` docker logs -f paddle_ocr ``` -## 4.测试服务 -a. 计算待识别图片的Base64编码(如果只是测试一下效果,可以通过免费的在线工具实现,如:http://tool.chinaz.com/tools/imgtobase/) -b. 发送服务请求(可参见sample_request.txt中的值) +## 4. Test +a. Calculate the Base64 encoding of the picture to be recognized (if you just test, you can use a free online tool, like:https://freeonlinetools24.com/base64-image/) +b. Post a service request(sample request in sample_request.txt) + ``` -curl -H "Content-Type:application/json" -X POST --data "{\"images\": [\"填入图片Base64编码(需要删除'data:image/jpg;base64,')\"]}" http://localhost:8866/predict/ocr_system +curl -H "Content-Type:application/json" -X POST --data "{\"images\": [\"Input image Base64 encode(need to delete the code 'data:image/jpg;base64,')\"]}" http://localhost:8866/predict/ocr_system ``` -c. 返回结果(如果调用成功,会返回如下结果) +c. 
Get resposne(If the call is successful, the following result will be returned) ``` {"msg":"","results":[[{"confidence":0.8403433561325073,"text":"约定","text_region":[[345,377],[641,390],[634,540],[339,528]]},{"confidence":0.8131805658340454,"text":"最终相遇","text_region":[[356,532],[624,530],[624,596],[356,598]]}]],"status":"0"} ``` diff --git a/paddleocr.py b/paddleocr.py new file mode 100644 index 0000000000000000000000000000000000000000..d3d73cb1b92cb2228fafb4e0efa36ab13207a4b3 --- /dev/null +++ b/paddleocr.py @@ -0,0 +1,213 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import sys + +__dir__ = os.path.dirname(__file__) +sys.path.append(os.path.join(__dir__, '')) + +import cv2 +import numpy as np +from pathlib import Path +import tarfile +import requests +from tqdm import tqdm + +from tools.infer import predict_system +from ppocr.utils.utility import initial_logger + +logger = initial_logger() +from ppocr.utils.utility import check_and_read_gif, get_image_file_list + +__all__ = ['PaddleOCR'] + +model_params = { + 'det': 'https://paddleocr.bj.bcebos.com/ch_models/ch_det_mv3_db_infer.tar', + 'rec': + 'https://paddleocr.bj.bcebos.com/ch_models/ch_rec_mv3_crnn_enhance_infer.tar', +} + +SUPPORT_DET_MODEL = ['DB'] +SUPPORT_REC_MODEL = ['CRNN'] +BASE_DIR = os.path.expanduser("~/.paddleocr/") + + +def download_with_progressbar(url, save_path): + response = requests.get(url, stream=True) + total_size_in_bytes = int(response.headers.get('content-length', 0)) + block_size = 1024 # 1 Kibibyte + progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True) + with open(save_path, 'wb') as file: + for data in response.iter_content(block_size): + progress_bar.update(len(data)) + file.write(data) + progress_bar.close() + if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes: + logger.error("ERROR, something went wrong") + sys.exit(0) + + +def maybe_download(model_storage_directory, url): + # using custom model + if not os.path.exists(os.path.join( + model_storage_directory, 'model')) or not os.path.exists( + os.path.join(model_storage_directory, 'params')): + tmp_path = os.path.join(model_storage_directory, url.split('/')[-1]) + print('download {} to {}'.format(url, tmp_path)) + os.makedirs(model_storage_directory, exist_ok=True) + download_with_progressbar(url, tmp_path) + with tarfile.open(tmp_path, 'r') as tarObj: + for member in tarObj.getmembers(): + if "model" in member.name: + filename = 'model' + elif "params" in member.name: + filename = 'params' + else: + continue + file = 
tarObj.extractfile(member) + with open( + os.path.join(model_storage_directory, filename), + 'wb') as f: + f.write(file.read()) + os.remove(tmp_path) + + +def parse_args(): + import argparse + + def str2bool(v): + return v.lower() in ("true", "t", "1") + + parser = argparse.ArgumentParser() + # params for prediction engine + parser.add_argument("--use_gpu", type=str2bool, default=True) + parser.add_argument("--ir_optim", type=str2bool, default=True) + parser.add_argument("--use_tensorrt", type=str2bool, default=False) + parser.add_argument("--gpu_mem", type=int, default=8000) + + # params for text detector + parser.add_argument("--image_dir", type=str) + parser.add_argument("--det_algorithm", type=str, default='DB') + parser.add_argument("--det_model_dir", type=str, default=None) + parser.add_argument("--det_max_side_len", type=float, default=960) + + # DB parmas + parser.add_argument("--det_db_thresh", type=float, default=0.3) + parser.add_argument("--det_db_box_thresh", type=float, default=0.5) + parser.add_argument("--det_db_unclip_ratio", type=float, default=2.0) + + # EAST parmas + parser.add_argument("--det_east_score_thresh", type=float, default=0.8) + parser.add_argument("--det_east_cover_thresh", type=float, default=0.1) + parser.add_argument("--det_east_nms_thresh", type=float, default=0.2) + + # params for text recognizer + parser.add_argument("--rec_algorithm", type=str, default='CRNN') + parser.add_argument("--rec_model_dir", type=str, default=None) + parser.add_argument("--rec_image_shape", type=str, default="3, 32, 320") + parser.add_argument("--rec_char_type", type=str, default='ch') + parser.add_argument("--rec_batch_num", type=int, default=30) + parser.add_argument("--max_text_length", type=int, default=25) + parser.add_argument( + "--rec_char_dict_path", + type=str, + default="./ppocr/utils/ppocr_keys_v1.txt") + parser.add_argument("--use_space_char", type=bool, default=True) + parser.add_argument("--enable_mkldnn", type=bool, default=False) + + 
parser.add_argument("--det", type=str2bool, default=True) + parser.add_argument("--rec", type=str2bool, default=True) + parser.add_argument("--use_zero_copy_run", type=bool, default=False) + return parser.parse_args() + + +class PaddleOCR(predict_system.TextSystem): + def __init__(self, **kwargs): + """ + paddleocr package + args: + **kwargs: other params show in paddleocr --help + """ + postprocess_params = parse_args() + postprocess_params.__dict__.update(**kwargs) + + # init model dir + if postprocess_params.det_model_dir is None: + postprocess_params.det_model_dir = os.path.join(BASE_DIR, 'det') + if postprocess_params.rec_model_dir is None: + postprocess_params.rec_model_dir = os.path.join(BASE_DIR, 'rec') + print(postprocess_params) + # download model + maybe_download(postprocess_params.det_model_dir, model_params['det']) + maybe_download(postprocess_params.rec_model_dir, model_params['rec']) + + if postprocess_params.det_algorithm not in SUPPORT_DET_MODEL: + logger.error('det_algorithm must in {}'.format(SUPPORT_DET_MODEL)) + sys.exit(0) + if postprocess_params.rec_algorithm not in SUPPORT_REC_MODEL: + logger.error('rec_algorithm must in {}'.format(SUPPORT_REC_MODEL)) + sys.exit(0) + + postprocess_params.rec_char_dict_path = Path( + __file__).parent / postprocess_params.rec_char_dict_path + + # init det_model and rec_model + super().__init__(postprocess_params) + + def ocr(self, img, det=True, rec=True): + """ + ocr with paddleocr + args: + img: img for ocr, support ndarray, img_path and list or ndarray + det: use text detection or not, if false, only rec will be exec. default is True + rec: use text recognition or not, if false, only det will be exec. 
default is True + """ + assert isinstance(img, (np.ndarray, list, str)) + if isinstance(img, str): + image_file = img + img, flag = check_and_read_gif(image_file) + if not flag: + img = cv2.imread(image_file) + if img is None: + logger.error("error in loading image:{}".format(image_file)) + return None + if det and rec: + dt_boxes, rec_res = self.__call__(img) + return [[box.tolist(), res] for box, res in zip(dt_boxes, rec_res)] + elif det and not rec: + dt_boxes, elapse = self.text_detector(img) + if dt_boxes is None: + return None + return [box.tolist() for box in dt_boxes] + else: + if not isinstance(img, list): + img = [img] + rec_res, elapse = self.text_recognizer(img) + return rec_res + + +def main(): + # for com + args = parse_args() + image_file_list = get_image_file_list(args.image_dir) + if len(image_file_list) == 0: + logger.error('no images find in {}'.format(args.image_dir)) + return + ocr_engine = PaddleOCR() + for img_path in image_file_list: + print(img_path) + result = ocr_engine.ocr(img_path, det=args.det, rec=args.rec) + for line in result: + print(line) \ No newline at end of file diff --git a/ppocr/data/rec/dataset_traversal.py b/ppocr/data/rec/dataset_traversal.py index 5efba512c0e22dda1be17b121c0f12f42b74f2ee..84f325b9b880d6289a4d60f7ebff39d962fdb5a1 100755 --- a/ppocr/data/rec/dataset_traversal.py +++ b/ppocr/data/rec/dataset_traversal.py @@ -214,6 +214,8 @@ class SimpleReader(object): self.mode = params['mode'] self.infer_img = params['infer_img'] self.use_tps = False + if "num_heads" in params: + self.num_heads = params['num_heads'] if "tps" in params: self.use_tps = True self.use_distort = False @@ -237,7 +239,7 @@ class SimpleReader(object): def get_device_num(): if self.use_gpu: - gpus = os.environ.get("CUDA_VISIBLE_DEVICES", 1) + gpus = os.environ.get("CUDA_VISIBLE_DEVICES", '1') gpu_num = len(gpus.split(',')) return gpu_num else: @@ -251,12 +253,20 @@ class SimpleReader(object): img = cv2.imread(single_img) if img.shape[-1] == 1 or 
len(list(img.shape)) == 2: img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) - norm_img = process_image( - img=img, - image_shape=self.image_shape, - char_ops=self.char_ops, - tps=self.use_tps, - infer_mode=True) + if self.loss_type == 'srn': + norm_img = process_image_srn( + img=img, + image_shape=self.image_shape, + char_ops=self.char_ops, + num_heads=self.num_heads, + max_text_length=self.max_text_length) + else: + norm_img = process_image( + img=img, + image_shape=self.image_shape, + char_ops=self.char_ops, + tps=self.use_tps, + infer_mode=True) yield norm_img else: with open(self.label_file_path, "rb") as fin: @@ -286,14 +296,25 @@ class SimpleReader(object): img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) label = substr[1] - outs = process_image( - img=img, - image_shape=self.image_shape, - label=label, - char_ops=self.char_ops, - loss_type=self.loss_type, - max_text_length=self.max_text_length, - distort=self.use_distort) + if self.loss_type == "srn": + outs = process_image_srn( + img=img, + image_shape=self.image_shape, + num_heads=self.num_heads, + max_text_length=self.max_text_length, + label=label, + char_ops=self.char_ops, + loss_type=self.loss_type) + + else: + outs = process_image( + img=img, + image_shape=self.image_shape, + label=label, + char_ops=self.char_ops, + loss_type=self.loss_type, + max_text_length=self.max_text_length, + distort=self.use_distort) if outs is None: continue yield outs diff --git a/ppocr/data/rec/img_tools.py b/ppocr/data/rec/img_tools.py index 527e0266ee33ac81e29b5610ed05f401860078a4..8b497e6b803ba0fffaefc3e12c366130504b9ce0 100755 --- a/ppocr/data/rec/img_tools.py +++ b/ppocr/data/rec/img_tools.py @@ -410,7 +410,8 @@ def resize_norm_img_srn(img, image_shape): def srn_other_inputs(image_shape, num_heads, - max_text_length): + max_text_length, + char_num): imgC, imgH, imgW = image_shape feature_dim = int((imgH / 8) * (imgW / 8)) @@ -418,7 +419,7 @@ def srn_other_inputs(image_shape, encoder_word_pos = np.array(range(0, 
feature_dim)).reshape((feature_dim, 1)).astype('int64') gsrm_word_pos = np.array(range(0, max_text_length)).reshape((max_text_length, 1)).astype('int64') - lbl_weight = np.array([37] * max_text_length).reshape((-1,1)).astype('int64') + lbl_weight = np.array([int(char_num-1)] * max_text_length).reshape((-1,1)).astype('int64') gsrm_attn_bias_data = np.ones((1, max_text_length, max_text_length)) gsrm_slf_attn_bias1 = np.triu(gsrm_attn_bias_data, 1).reshape([-1, 1, max_text_length, max_text_length]) @@ -441,17 +442,18 @@ def process_image_srn(img, loss_type=None): norm_img = resize_norm_img_srn(img, image_shape) norm_img = norm_img[np.newaxis, :] + char_num = char_ops.get_char_num() + [lbl_weight, encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1, gsrm_slf_attn_bias2] = \ - srn_other_inputs(image_shape, num_heads, max_text_length) + srn_other_inputs(image_shape, num_heads, max_text_length,char_num) if label is not None: - char_num = char_ops.get_char_num() text = char_ops.encode(label) if len(text) == 0 or len(text) > max_text_length: return None else: if loss_type == "srn": - text_padded = [37] * max_text_length + text_padded = [int(char_num-1)] * max_text_length for i in range(len(text)): text_padded[i] = text[i] lbl_weight[i] = [1.0] diff --git a/ppocr/modeling/backbones/rec_resnet50_fpn.py b/ppocr/modeling/backbones/rec_resnet_fpn.py similarity index 51% rename from ppocr/modeling/backbones/rec_resnet50_fpn.py rename to ppocr/modeling/backbones/rec_resnet_fpn.py index f6d72377fe4e2d3355a4510f070178ad48dd2a27..0a05b5def8b79943f045d9cc941835cddc82bfdb 100755 --- a/ppocr/modeling/backbones/rec_resnet50_fpn.py +++ b/ppocr/modeling/backbones/rec_resnet_fpn.py @@ -22,12 +22,12 @@ import paddle import paddle.fluid as fluid from paddle.fluid.param_attr import ParamAttr - -__all__ = ["ResNet", "ResNet18", "ResNet34", "ResNet50", "ResNet101", "ResNet152"] +__all__ = [ + "ResNet", "ResNet18", "ResNet34", "ResNet50", "ResNet101", "ResNet152" +] Trainable = True -w_nolr = 
fluid.ParamAttr( - trainable = Trainable) +w_nolr = fluid.ParamAttr(trainable=Trainable) train_parameters = { "input_size": [3, 224, 224], "input_mean": [0.485, 0.456, 0.406], @@ -40,12 +40,12 @@ train_parameters = { } } + class ResNet(): def __init__(self, params): self.layers = params['layers'] self.params = train_parameters - def __call__(self, input): layers = self.layers supported_layers = [18, 34, 50, 101, 152] @@ -60,12 +60,17 @@ class ResNet(): depth = [3, 4, 23, 3] elif layers == 152: depth = [3, 8, 36, 3] - stride_list = [(2,2),(2,2),(1,1),(1,1)] + stride_list = [(2, 2), (2, 2), (1, 1), (1, 1)] num_filters = [64, 128, 256, 512] conv = self.conv_bn_layer( - input=input, num_filters=64, filter_size=7, stride=2, act='relu', name="conv1") - F = [] + input=input, + num_filters=64, + filter_size=7, + stride=2, + act='relu', + name="conv1") + F = [] if layers >= 50: for block in range(len(depth)): for i in range(depth[block]): @@ -79,26 +84,67 @@ class ResNet(): conv = self.bottleneck_block( input=conv, num_filters=num_filters[block], - stride=stride_list[block] if i == 0 else 1, name=conv_name) + stride=stride_list[block] if i == 0 else 1, + name=conv_name) + F.append(conv) + else: + for block in range(len(depth)): + for i in range(depth[block]): + conv_name = "res" + str(block + 2) + chr(97 + i) + + if i == 0 and block != 0: + stride = (2, 1) + else: + stride = (1, 1) + + conv = self.basic_block( + input=conv, + num_filters=num_filters[block], + stride=stride, + if_first=block == i == 0, + name=conv_name) F.append(conv) base = F[-1] - for i in [-2, -3]: + for i in [-2, -3]: b, c, w, h = F[i].shape - if (w,h) == base.shape[2:]: + if (w, h) == base.shape[2:]: base = base else: - base = fluid.layers.conv2d_transpose( input=base, num_filters=c,filter_size=4, stride=2, - padding=1,act=None, + base = fluid.layers.conv2d_transpose( + input=base, + num_filters=c, + filter_size=4, + stride=2, + padding=1, + act=None, param_attr=w_nolr, bias_attr=w_nolr) - base = 
fluid.layers.batch_norm(base, act = "relu", param_attr=w_nolr, bias_attr=w_nolr) + base = fluid.layers.batch_norm( + base, act="relu", param_attr=w_nolr, bias_attr=w_nolr) base = fluid.layers.concat([base, F[i]], axis=1) - base = fluid.layers.conv2d(base, num_filters=c, filter_size=1, param_attr=w_nolr, bias_attr=w_nolr) - base = fluid.layers.conv2d(base, num_filters=c, filter_size=3,padding = 1, param_attr=w_nolr, bias_attr=w_nolr) - base = fluid.layers.batch_norm(base, act = "relu", param_attr=w_nolr, bias_attr=w_nolr) - - base = fluid.layers.conv2d(base, num_filters=512, filter_size=1,bias_attr=w_nolr,param_attr=w_nolr) + base = fluid.layers.conv2d( + base, + num_filters=c, + filter_size=1, + param_attr=w_nolr, + bias_attr=w_nolr) + base = fluid.layers.conv2d( + base, + num_filters=c, + filter_size=3, + padding=1, + param_attr=w_nolr, + bias_attr=w_nolr) + base = fluid.layers.batch_norm( + base, act="relu", param_attr=w_nolr, bias_attr=w_nolr) + + base = fluid.layers.conv2d( + base, + num_filters=512, + filter_size=1, + bias_attr=w_nolr, + param_attr=w_nolr) return base @@ -113,13 +159,14 @@ class ResNet(): conv = fluid.layers.conv2d( input=input, num_filters=num_filters, - filter_size= 2 if stride==(1,1) else filter_size, - dilation = 2 if stride==(1,1) else 1, + filter_size=2 if stride == (1, 1) else filter_size, + dilation=2 if stride == (1, 1) else 1, stride=stride, padding=(filter_size - 1) // 2, groups=groups, act=None, - param_attr=ParamAttr(name=name + "_weights",trainable = Trainable), + param_attr=ParamAttr( + name=name + "_weights", trainable=Trainable), bias_attr=False, name=name + '.conv2d.output.1') @@ -127,28 +174,35 @@ class ResNet(): bn_name = "bn_" + name else: bn_name = "bn" + name[3:] - return fluid.layers.batch_norm(input=conv, - act=act, - name=bn_name + '.output.1', - param_attr=ParamAttr(name=bn_name + '_scale',trainable = Trainable), - bias_attr=ParamAttr(bn_name + '_offset',trainable = Trainable), - moving_mean_name=bn_name + '_mean', - 
moving_variance_name=bn_name + '_variance', ) + return fluid.layers.batch_norm( + input=conv, + act=act, + name=bn_name + '.output.1', + param_attr=ParamAttr( + name=bn_name + '_scale', trainable=Trainable), + bias_attr=ParamAttr( + bn_name + '_offset', trainable=Trainable), + moving_mean_name=bn_name + '_mean', + moving_variance_name=bn_name + '_variance', ) def shortcut(self, input, ch_out, stride, is_first, name): ch_in = input.shape[1] if ch_in != ch_out or stride != 1 or is_first == True: - if stride == (1,1): + if stride == (1, 1): return self.conv_bn_layer(input, ch_out, 1, 1, name=name) - else: #stride == (2,2) + else: #stride == (2,2) return self.conv_bn_layer(input, ch_out, 1, stride, name=name) - + else: return input def bottleneck_block(self, input, num_filters, stride, name): conv0 = self.conv_bn_layer( - input=input, num_filters=num_filters, filter_size=1, act='relu', name=name + "_branch2a") + input=input, + num_filters=num_filters, + filter_size=1, + act='relu', + name=name + "_branch2a") conv1 = self.conv_bn_layer( input=conv0, num_filters=num_filters, @@ -157,16 +211,36 @@ class ResNet(): act='relu', name=name + "_branch2b") conv2 = self.conv_bn_layer( - input=conv1, num_filters=num_filters * 4, filter_size=1, act=None, name=name + "_branch2c") + input=conv1, + num_filters=num_filters * 4, + filter_size=1, + act=None, + name=name + "_branch2c") - short = self.shortcut(input, num_filters * 4, stride, is_first=False, name=name + "_branch1") + short = self.shortcut( + input, + num_filters * 4, + stride, + is_first=False, + name=name + "_branch1") - return fluid.layers.elementwise_add(x=short, y=conv2, act='relu', name=name + ".add.output.5") + return fluid.layers.elementwise_add( + x=short, y=conv2, act='relu', name=name + ".add.output.5") def basic_block(self, input, num_filters, stride, is_first, name): - conv0 = self.conv_bn_layer(input=input, num_filters=num_filters, filter_size=3, act='relu', stride=stride, - name=name + "_branch2a") - conv1 = 
self.conv_bn_layer(input=conv0, num_filters=num_filters, filter_size=3, act=None, - name=name + "_branch2b") - short = self.shortcut(input, num_filters, stride, is_first, name=name + "_branch1") + conv0 = self.conv_bn_layer( + input=input, + num_filters=num_filters, + filter_size=3, + act='relu', + stride=stride, + name=name + "_branch2a") + conv1 = self.conv_bn_layer( + input=conv0, + num_filters=num_filters, + filter_size=3, + act=None, + name=name + "_branch2b") + short = self.shortcut( + input, num_filters, stride, is_first, name=name + "_branch1") return fluid.layers.elementwise_add(x=short, y=conv1, act='relu') diff --git a/ppocr/modeling/heads/det_sast_head.py b/ppocr/modeling/heads/det_sast_head.py index b5e19b844abda003d65a1f95026685cfe0cfffd6..0097913dd7e08c76c45064940416e7c9ffc32f26 100644 --- a/ppocr/modeling/heads/det_sast_head.py +++ b/ppocr/modeling/heads/det_sast_head.py @@ -49,7 +49,7 @@ class SASTHead(object): for i in range(4): if i == 0: g[i] = deconv_bn_layer(input=h[i], num_filters=num_outputs[i + 1], act=None, name='fpn_up_g0') - print("g[{}] shape: {}".format(i, g[i].shape)) + #print("g[{}] shape: {}".format(i, g[i].shape)) else: g[i] = fluid.layers.elementwise_add(x=g[i - 1], y=h[i]) g[i] = fluid.layers.relu(g[i]) @@ -58,7 +58,7 @@ class SASTHead(object): g[i] = conv_bn_layer(input=g[i], num_filters=num_outputs[i], filter_size=3, stride=1, act='relu', name='fpn_up_g%d_1'%i) g[i] = deconv_bn_layer(input=g[i], num_filters=num_outputs[i + 1], act=None, name='fpn_up_g%d_2'%i) - print("g[{}] shape: {}".format(i, g[i].shape)) + #print("g[{}] shape: {}".format(i, g[i].shape)) g[4] = fluid.layers.elementwise_add(x=g[3], y=h[4]) g[4] = fluid.layers.relu(g[4]) diff --git a/ppocr/modeling/heads/self_attention/model.py b/ppocr/modeling/heads/self_attention/model.py index 8ac1458b7dca3dedc368a16fa00f52a9aa4f4f93..8bf34e4ac6a2c3c33d2a46b1f4f9dbfaf8db8f57 100644 --- a/ppocr/modeling/heads/self_attention/model.py +++ 
b/ppocr/modeling/heads/self_attention/model.py @@ -4,8 +4,10 @@ import numpy as np import paddle.fluid as fluid import paddle.fluid.layers as layers -# Set seed for CE -dropout_seed = None +encoder_data_input_fields = ( + "src_word", + "src_pos", + "src_slf_attn_bias", ) def wrap_layer_with_block(layer, block_idx): @@ -45,25 +47,6 @@ def wrap_layer_with_block(layer, block_idx): return layer_wrapper -def position_encoding_init(n_position, d_pos_vec): - """ - Generate the initial values for the sinusoid position encoding table. - """ - channels = d_pos_vec - position = np.arange(n_position) - num_timescales = channels // 2 - log_timescale_increment = (np.log(float(1e4) / float(1)) / - (num_timescales - 1)) - inv_timescales = np.exp(np.arange( - num_timescales)) * -log_timescale_increment - scaled_time = np.expand_dims(position, 1) * np.expand_dims(inv_timescales, - 0) - signal = np.concatenate([np.sin(scaled_time), np.cos(scaled_time)], axis=1) - signal = np.pad(signal, [[0, 0], [0, np.mod(channels, 2)]], 'constant') - position_enc = signal - return position_enc.astype("float32") - - def multi_head_attention(queries, keys, values, @@ -200,10 +183,7 @@ def multi_head_attention(queries, weights = layers.softmax(product) if dropout_rate: weights = layers.dropout( - weights, - dropout_prob=dropout_rate, - seed=dropout_seed, - is_test=False) + weights, dropout_prob=dropout_rate, seed=None, is_test=False) out = layers.matmul(weights, v) return out @@ -235,7 +215,7 @@ def positionwise_feed_forward(x, d_inner_hid, d_hid, dropout_rate): act="relu") if dropout_rate: hidden = layers.dropout( - hidden, dropout_prob=dropout_rate, seed=dropout_seed, is_test=False) + hidden, dropout_prob=dropout_rate, seed=None, is_test=False) out = layers.fc(input=hidden, size=d_hid, num_flatten_dims=2) return out @@ -259,10 +239,7 @@ def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0.): elif cmd == "d": # add dropout if dropout_rate: out = layers.dropout( - out, - 
dropout_prob=dropout_rate, - seed=dropout_seed, - is_test=False) + out, dropout_prob=dropout_rate, seed=None, is_test=False) return out @@ -271,7 +248,7 @@ post_process_layer = pre_post_process_layer def prepare_encoder( - src_word, #[b,t,c] + src_word, # [b,t,c] src_pos, src_vocab_size, src_emb_dim, @@ -286,9 +263,8 @@ def prepare_encoder( This module is used at the bottom of the encoder stacks. """ - src_word_emb = src_word #layers.concat(res,axis=1) + src_word_emb = src_word src_word_emb = layers.cast(src_word_emb, 'float32') - # print("src_word_emb",src_word_emb) src_word_emb = layers.scale(x=src_word_emb, scale=src_emb_dim**0.5) src_pos_enc = layers.embedding( @@ -299,7 +275,7 @@ def prepare_encoder( src_pos_enc.stop_gradient = True enc_input = src_word_emb + src_pos_enc return layers.dropout( - enc_input, dropout_prob=dropout_rate, seed=dropout_seed, + enc_input, dropout_prob=dropout_rate, seed=None, is_test=False) if dropout_rate else enc_input @@ -324,7 +300,7 @@ def prepare_decoder(src_word, param_attr=fluid.ParamAttr( name=word_emb_param_name, initializer=fluid.initializer.Normal(0., src_emb_dim**-0.5))) - # print("target_word_emb",src_word_emb) + src_word_emb = layers.scale(x=src_word_emb, scale=src_emb_dim**0.5) src_pos_enc = layers.embedding( src_pos, @@ -334,16 +310,10 @@ def prepare_decoder(src_word, src_pos_enc.stop_gradient = True enc_input = src_word_emb + src_pos_enc return layers.dropout( - enc_input, dropout_prob=dropout_rate, seed=dropout_seed, + enc_input, dropout_prob=dropout_rate, seed=None, is_test=False) if dropout_rate else enc_input -# prepare_encoder = partial( -# prepare_encoder_decoder, pos_enc_param_name=pos_enc_param_names[0]) -# prepare_decoder = partial( -# prepare_encoder_decoder, pos_enc_param_name=pos_enc_param_names[1]) - - def encoder_layer(enc_input, attn_bias, n_head, @@ -412,234 +382,6 @@ def encoder(enc_input, return enc_output -def decoder_layer(dec_input, - enc_output, - slf_attn_bias, - dec_enc_attn_bias, - n_head, - 
d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd, - postprocess_cmd, - cache=None, - gather_idx=None): - """ The layer to be stacked in decoder part. - The structure of this module is similar to that in the encoder part except - a multi-head attention is added to implement encoder-decoder attention. - """ - slf_attn_output = multi_head_attention( - pre_process_layer(dec_input, preprocess_cmd, prepostprocess_dropout), - None, - None, - slf_attn_bias, - d_key, - d_value, - d_model, - n_head, - attention_dropout, - cache=cache, - gather_idx=gather_idx) - slf_attn_output = post_process_layer( - dec_input, - slf_attn_output, - postprocess_cmd, - prepostprocess_dropout, ) - enc_attn_output = multi_head_attention( - pre_process_layer(slf_attn_output, preprocess_cmd, - prepostprocess_dropout), - enc_output, - enc_output, - dec_enc_attn_bias, - d_key, - d_value, - d_model, - n_head, - attention_dropout, - cache=cache, - gather_idx=gather_idx, - static_kv=True) - enc_attn_output = post_process_layer( - slf_attn_output, - enc_attn_output, - postprocess_cmd, - prepostprocess_dropout, ) - ffd_output = positionwise_feed_forward( - pre_process_layer(enc_attn_output, preprocess_cmd, - prepostprocess_dropout), - d_inner_hid, - d_model, - relu_dropout, ) - dec_output = post_process_layer( - enc_attn_output, - ffd_output, - postprocess_cmd, - prepostprocess_dropout, ) - return dec_output - - -def decoder(dec_input, - enc_output, - dec_slf_attn_bias, - dec_enc_attn_bias, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd, - postprocess_cmd, - caches=None, - gather_idx=None): - """ - The decoder is composed of a stack of identical decoder_layer layers. 
- """ - for i in range(n_layer): - dec_output = decoder_layer( - dec_input, - enc_output, - dec_slf_attn_bias, - dec_enc_attn_bias, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd, - postprocess_cmd, - cache=None if caches is None else caches[i], - gather_idx=gather_idx) - dec_input = dec_output - dec_output = pre_process_layer(dec_output, preprocess_cmd, - prepostprocess_dropout) - return dec_output - - -def make_all_inputs(input_fields): - """ - Define the input data layers for the transformer model. - """ - inputs = [] - for input_field in input_fields: - input_var = layers.data( - name=input_field, - shape=input_descs[input_field][0], - dtype=input_descs[input_field][1], - lod_level=input_descs[input_field][2] - if len(input_descs[input_field]) == 3 else 0, - append_batch_size=False) - inputs.append(input_var) - return inputs - - -def make_all_py_reader_inputs(input_fields, is_test=False): - reader = layers.py_reader( - capacity=20, - name="test_reader" if is_test else "train_reader", - shapes=[input_descs[input_field][0] for input_field in input_fields], - dtypes=[input_descs[input_field][1] for input_field in input_fields], - lod_levels=[ - input_descs[input_field][2] - if len(input_descs[input_field]) == 3 else 0 - for input_field in input_fields - ]) - return layers.read_file(reader), reader - - -def transformer(src_vocab_size, - trg_vocab_size, - max_length, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd, - postprocess_cmd, - weight_sharing, - label_smooth_eps, - bos_idx=0, - use_py_reader=False, - is_test=False): - if weight_sharing: - assert src_vocab_size == trg_vocab_size, ( - "Vocabularies in source and target should be same for weight sharing." 
- ) - - data_input_names = encoder_data_input_fields + \ - decoder_data_input_fields[:-1] + label_data_input_fields - - if use_py_reader: - all_inputs, reader = make_all_py_reader_inputs(data_input_names, - is_test) - else: - all_inputs = make_all_inputs(data_input_names) - # print("all inputs",all_inputs) - enc_inputs_len = len(encoder_data_input_fields) - dec_inputs_len = len(decoder_data_input_fields[:-1]) - enc_inputs = all_inputs[0:enc_inputs_len] - dec_inputs = all_inputs[enc_inputs_len:enc_inputs_len + dec_inputs_len] - label = all_inputs[-2] - weights = all_inputs[-1] - - enc_output = wrap_encoder( - src_vocab_size, 64, n_layer, n_head, d_key, d_value, d_model, - d_inner_hid, prepostprocess_dropout, attention_dropout, relu_dropout, - preprocess_cmd, postprocess_cmd, weight_sharing, enc_inputs) - - predict = wrap_decoder( - trg_vocab_size, - max_length, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd, - postprocess_cmd, - weight_sharing, - dec_inputs, - enc_output, ) - - # Padding index do not contribute to the total loss. The weights is used to - # cancel padding index in calculating the loss. - if label_smooth_eps: - label = layers.label_smooth( - label=layers.one_hot( - input=label, depth=trg_vocab_size), - epsilon=label_smooth_eps) - - cost = layers.softmax_with_cross_entropy( - logits=predict, - label=label, - soft_label=True if label_smooth_eps else False) - weighted_cost = cost * weights - sum_cost = layers.reduce_sum(weighted_cost) - token_num = layers.reduce_sum(weights) - token_num.stop_gradient = True - avg_cost = sum_cost / token_num - return sum_cost, avg_cost, predict, token_num, reader if use_py_reader else None - - def wrap_encoder_forFeature(src_vocab_size, max_length, n_layer, @@ -662,44 +404,8 @@ def wrap_encoder_forFeature(src_vocab_size, img """ - if enc_inputs is None: - # This is used to implement independent encoder program in inference. 
- conv_features, src_pos, src_slf_attn_bias = make_all_inputs( - encoder_data_input_fields) - else: - conv_features, src_pos, src_slf_attn_bias = enc_inputs # - b, t, c = conv_features.shape - #""" - # insert cnn - #""" - #import basemodel - # feat = basemodel.resnet_50(img) - - # mycrnn = basemodel.CRNN() - # feat = mycrnn.ocr_convs(img,use_cudnn=TrainTaskConfig.use_gpu) - # b, c, w, h = feat.shape - # src_word = layers.reshape(feat, shape=[-1, c, w * h]) - - #myconv8 = basemodel.conv8() - #feat = myconv8.net(img ) - #b , c, h, w = feat.shape#h=6 - #print(feat) - #layers.Print(feat,message="conv_feat",summarize=10) - - #feat =layers.conv2d(feat,c,filter_size =[4 , 1],act="relu") - #feat = layers.pool2d(feat,pool_stride=(3,1),pool_size=(3,1)) - #src_word = layers.squeeze(feat,axes=[2]) #src_word [-1,c,ww] - - #feat = layers.transpose(feat, [0,3,1,2]) - #src_word = layers.reshape(feat,[-1,w, c*h]) - #src_word = layers.im2sequence( - # input=feat, - # stride=[1, 1], - # filter_size=[feat.shape[2], 1]) - #layers.Print(src_word,message="src_word",summarize=10) - - # print('feat',feat) - #print("src_word",src_word) + conv_features, src_pos, src_slf_attn_bias = enc_inputs # + b, t, c = conv_features.shape enc_input = prepare_encoder( conv_features, @@ -749,43 +455,9 @@ def wrap_encoder(src_vocab_size, img, src_pos, src_slf_attn_bias = enc_inputs img """ - if enc_inputs is None: - # This is used to implement independent encoder program in inference. 
- src_word, src_pos, src_slf_attn_bias = make_all_inputs( - encoder_data_input_fields) - else: - src_word, src_pos, src_slf_attn_bias = enc_inputs # - #""" - # insert cnn - #""" - #import basemodel - # feat = basemodel.resnet_50(img) - - # mycrnn = basemodel.CRNN() - # feat = mycrnn.ocr_convs(img,use_cudnn=TrainTaskConfig.use_gpu) - # b, c, w, h = feat.shape - # src_word = layers.reshape(feat, shape=[-1, c, w * h]) - #myconv8 = basemodel.conv8() - #feat = myconv8.net(img ) - #b , c, h, w = feat.shape#h=6 - #print(feat) - #layers.Print(feat,message="conv_feat",summarize=10) + src_word, src_pos, src_slf_attn_bias = enc_inputs # - #feat =layers.conv2d(feat,c,filter_size =[4 , 1],act="relu") - #feat = layers.pool2d(feat,pool_stride=(3,1),pool_size=(3,1)) - #src_word = layers.squeeze(feat,axes=[2]) #src_word [-1,c,ww] - - #feat = layers.transpose(feat, [0,3,1,2]) - #src_word = layers.reshape(feat,[-1,w, c*h]) - #src_word = layers.im2sequence( - # input=feat, - # stride=[1, 1], - # filter_size=[feat.shape[2], 1]) - #layers.Print(src_word,message="src_word",summarize=10) - - # print('feat',feat) - #print("src_word",src_word) enc_input = prepare_decoder( src_word, src_pos, @@ -811,248 +483,3 @@ def wrap_encoder(src_vocab_size, preprocess_cmd, postprocess_cmd, ) return enc_output - - -def wrap_decoder(trg_vocab_size, - max_length, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd, - postprocess_cmd, - weight_sharing, - dec_inputs=None, - enc_output=None, - caches=None, - gather_idx=None, - bos_idx=0): - """ - The wrapper assembles together all needed layers for the decoder. - """ - if dec_inputs is None: - # This is used to implement independent decoder program in inference. 
- trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, enc_output = \ - make_all_inputs(decoder_data_input_fields) - else: - trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias = dec_inputs - - dec_input = prepare_decoder( - trg_word, - trg_pos, - trg_vocab_size, - d_model, - max_length, - prepostprocess_dropout, - bos_idx=bos_idx, - word_emb_param_name="src_word_emb_table" - if weight_sharing else "trg_word_emb_table") - dec_output = decoder( - dec_input, - enc_output, - trg_slf_attn_bias, - trg_src_attn_bias, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd, - postprocess_cmd, - caches=caches, - gather_idx=gather_idx) - return dec_output - # Reshape to 2D tensor to use GEMM instead of BatchedGEMM - dec_output = layers.reshape( - dec_output, shape=[-1, dec_output.shape[-1]], inplace=True) - if weight_sharing: - predict = layers.matmul( - x=dec_output, - y=fluid.default_main_program().global_block().var( - "trg_word_emb_table"), - transpose_y=True) - else: - predict = layers.fc(input=dec_output, - size=trg_vocab_size, - bias_attr=False) - if dec_inputs is None: - # Return probs for independent decoder program. - predict = layers.softmax(predict) - return predict - - -def fast_decode(src_vocab_size, - trg_vocab_size, - max_in_len, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd, - postprocess_cmd, - weight_sharing, - beam_size, - max_out_len, - bos_idx, - eos_idx, - use_py_reader=False): - """ - Use beam search to decode. Caches will be used to store states of history - steps which can make the decoding faster. 
- """ - data_input_names = encoder_data_input_fields + fast_decoder_data_input_fields - - if use_py_reader: - all_inputs, reader = make_all_py_reader_inputs(data_input_names) - else: - all_inputs = make_all_inputs(data_input_names) - - enc_inputs_len = len(encoder_data_input_fields) - dec_inputs_len = len(fast_decoder_data_input_fields) - enc_inputs = all_inputs[0:enc_inputs_len] #enc_inputs tensor - dec_inputs = all_inputs[enc_inputs_len:enc_inputs_len + - dec_inputs_len] #dec_inputs tensor - - enc_output = wrap_encoder( - src_vocab_size, - 64, ##to do !!!!!???? - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd, - postprocess_cmd, - weight_sharing, - enc_inputs, - bos_idx=bos_idx) - start_tokens, init_scores, parent_idx, trg_src_attn_bias = dec_inputs - - def beam_search(): - max_len = layers.fill_constant( - shape=[1], - dtype=start_tokens.dtype, - value=max_out_len, - force_cpu=True) - step_idx = layers.fill_constant( - shape=[1], dtype=start_tokens.dtype, value=0, force_cpu=True) - cond = layers.less_than(x=step_idx, y=max_len) # default force_cpu=True - while_op = layers.While(cond) - # array states will be stored for each step. - ids = layers.array_write( - layers.reshape(start_tokens, (-1, 1)), step_idx) - scores = layers.array_write(init_scores, step_idx) - # cell states will be overwrited at each step. - # caches contains states of history steps in decoder self-attention - # and static encoder output projections in encoder-decoder attention - # to reduce redundant computation. 
- caches = [ - { - "k": # for self attention - layers.fill_constant_batch_size_like( - input=start_tokens, - shape=[-1, n_head, 0, d_key], - dtype=enc_output.dtype, - value=0), - "v": # for self attention - layers.fill_constant_batch_size_like( - input=start_tokens, - shape=[-1, n_head, 0, d_value], - dtype=enc_output.dtype, - value=0), - "static_k": # for encoder-decoder attention - layers.create_tensor(dtype=enc_output.dtype), - "static_v": # for encoder-decoder attention - layers.create_tensor(dtype=enc_output.dtype) - } for i in range(n_layer) - ] - - with while_op.block(): - pre_ids = layers.array_read(array=ids, i=step_idx) - # Since beam_search_op dosen't enforce pre_ids' shape, we can do - # inplace reshape here which actually change the shape of pre_ids. - pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True) - pre_scores = layers.array_read(array=scores, i=step_idx) - # gather cell states corresponding to selected parent - pre_src_attn_bias = layers.gather( - trg_src_attn_bias, index=parent_idx) - pre_pos = layers.elementwise_mul( - x=layers.fill_constant_batch_size_like( - input=pre_src_attn_bias, # cann't use lod tensor here - value=1, - shape=[-1, 1, 1], - dtype=pre_ids.dtype), - y=step_idx, - axis=0) - logits = wrap_decoder( - trg_vocab_size, - max_in_len, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd, - postprocess_cmd, - weight_sharing, - dec_inputs=(pre_ids, pre_pos, None, pre_src_attn_bias), - enc_output=enc_output, - caches=caches, - gather_idx=parent_idx, - bos_idx=bos_idx) - # intra-beam topK - topk_scores, topk_indices = layers.topk( - input=layers.softmax(logits), k=beam_size) - accu_scores = layers.elementwise_add( - x=layers.log(topk_scores), y=pre_scores, axis=0) - # beam_search op uses lod to differentiate branches. 
- accu_scores = layers.lod_reset(accu_scores, pre_ids) - # topK reduction across beams, also contain special handle of - # end beams and end sentences(batch reduction) - selected_ids, selected_scores, gather_idx = layers.beam_search( - pre_ids=pre_ids, - pre_scores=pre_scores, - ids=topk_indices, - scores=accu_scores, - beam_size=beam_size, - end_id=eos_idx, - return_parent_idx=True) - layers.increment(x=step_idx, value=1.0, in_place=True) - # cell states(caches) have been updated in wrap_decoder, - # only need to update beam search states here. - layers.array_write(selected_ids, i=step_idx, array=ids) - layers.array_write(selected_scores, i=step_idx, array=scores) - layers.assign(gather_idx, parent_idx) - layers.assign(pre_src_attn_bias, trg_src_attn_bias) - length_cond = layers.less_than(x=step_idx, y=max_len) - finish_cond = layers.logical_not(layers.is_empty(x=selected_ids)) - layers.logical_and(x=length_cond, y=finish_cond, out=cond) - - finished_ids, finished_scores = layers.beam_search_decode( - ids, scores, beam_size=beam_size, end_id=eos_idx) - return finished_ids, finished_scores - - finished_ids, finished_scores = beam_search() - return finished_ids, finished_scores, reader if use_py_reader else None diff --git a/ppocr/optimizer.py b/ppocr/optimizer.py index 55f2eba14c4be738c0dbc686cd32afbcff62f874..fd315cd1319d4925e893705957a42f931a39076e 100644 --- a/ppocr/optimizer.py +++ b/ppocr/optimizer.py @@ -14,14 +14,50 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import math import paddle.fluid as fluid from paddle.fluid.regularizer import L2Decay +from paddle.fluid.layers.learning_rate_scheduler import _decay_step_counter +import paddle.fluid.layers.ops as ops from ppocr.utils.utility import initial_logger logger = initial_logger() +def cosine_decay_with_warmup(learning_rate, + step_each_epoch, + epochs=500, + warmup_minibatch=1000): + """Applies cosine decay to the learning rate. 
+ lr = 0.05 * (math.cos(epoch * (math.pi / 120)) + 1) + decrease lr for every mini-batch and start with warmup. + """ + global_step = _decay_step_counter() + lr = fluid.layers.tensor.create_global_var( + shape=[1], + value=0.0, + dtype='float32', + persistable=True, + name="learning_rate") + + warmup_minibatch = fluid.layers.fill_constant( + shape=[1], + dtype='float32', + value=float(warmup_minibatch), + force_cpu=True) + + with fluid.layers.control_flow.Switch() as switch: + with switch.case(global_step < warmup_minibatch): + decayed_lr = learning_rate * (1.0 * global_step / warmup_minibatch) + fluid.layers.tensor.assign(input=decayed_lr, output=lr) + with switch.default(): + decayed_lr = learning_rate * \ + (ops.cos((global_step - warmup_minibatch) * (math.pi / (epochs * step_each_epoch))) + 1)/2 + fluid.layers.tensor.assign(input=decayed_lr, output=lr) + return lr + + def AdamDecay(params, parameter_list=None): """ define optimizer function @@ -36,7 +72,9 @@ def AdamDecay(params, parameter_list=None): l2_decay = params.get("l2_decay", 0.0) if 'decay' in params: - supported_decay_mode = ["cosine_decay", "piecewise_decay"] + supported_decay_mode = [ + "cosine_decay", "cosine_decay_warmup", "piecewise_decay" + ] params = params['decay'] decay_mode = params['function'] assert decay_mode in supported_decay_mode, "Supported decay mode is {}, but got {}".format( @@ -49,6 +87,15 @@ def AdamDecay(params, parameter_list=None): learning_rate=base_lr, step_each_epoch=step_each_epoch, epochs=total_epoch) + elif decay_mode == "cosine_decay_warmup": + step_each_epoch = params['step_each_epoch'] + total_epoch = params['total_epoch'] + warmup_minibatch = params.get("warmup_minibatch", 1000) + base_lr = cosine_decay_with_warmup( + learning_rate=base_lr, + step_each_epoch=step_each_epoch, + epochs=total_epoch, + warmup_minibatch=warmup_minibatch) elif decay_mode == "piecewise_decay": boundaries = params["boundaries"] decay_rate = params["decay_rate"] @@ -104,5 +151,5 @@ def 
RMSProp(params, parameter_list=None): optimizer = fluid.optimizer.RMSProp( learning_rate=base_lr, regularization=fluid.regularizer.L2Decay(regularization_coeff=l2_decay)) - - return optimizer \ No newline at end of file + + return optimizer diff --git a/ppocr/postprocess/east_postprocess.py b/ppocr/postprocess/east_postprocess.py index 8200df3c281383fbc8c3f8df4f5090d923fb4d73..270cf6699bb7f77c730c6ff80b49f1798b9bb720 100755 --- a/ppocr/postprocess/east_postprocess.py +++ b/ppocr/postprocess/east_postprocess.py @@ -22,9 +22,9 @@ import cv2 import os import sys -__dir__ = os.path.dirname(__file__) +__dir__ = os.path.dirname(os.path.abspath(__file__)) sys.path.append(__dir__) -sys.path.append(os.path.join(__dir__, '..')) +sys.path.append(os.path.abspath(os.path.join(__dir__, '..'))) class EASTPostPocess(object): diff --git a/ppocr/postprocess/lanms/.ycm_extra_conf.py b/ppocr/postprocess/lanms/.ycm_extra_conf.py index 3c8673ddbbd92042660545d123c6bbba4f0d8273..cd1a74e920bad8d84b755b5dbfbf83e6884836d6 100644 --- a/ppocr/postprocess/lanms/.ycm_extra_conf.py +++ b/ppocr/postprocess/lanms/.ycm_extra_conf.py @@ -25,7 +25,7 @@ import ycm_core # These are the compilation flags that will be used in case there's no # compilation database set (by default, one is not set). # CHANGE THIS LIST OF FLAGS. YES, THIS IS THE DROID YOU HAVE BEEN LOOKING FOR. 
-sys.path.append(os.path.dirname(__file__)) +sys.path.append(os.path.dirname(os.path.abspath(__file__))) BASE_DIR = os.path.dirname(os.path.realpath(__file__)) diff --git a/ppocr/utils/character.py b/ppocr/utils/character.py index c7c93fc557604a32d12343d929c119fd787ee126..b4b2021e02c9905623fd9fad5c9673543569c1c2 100755 --- a/ppocr/utils/character.py +++ b/ppocr/utils/character.py @@ -26,8 +26,6 @@ class CharacterOps(object): self.character_type = config['character_type'] self.loss_type = config['loss_type'] self.max_text_len = config['max_text_length'] - if self.loss_type == "srn" and self.character_type != "en": - raise Exception("SRN can only support in character_type == en") if self.character_type == "en": self.character_str = "0123456789abcdefghijklmnopqrstuvwxyz" dict_character = list(self.character_str) @@ -160,13 +158,15 @@ def cal_predicts_accuracy_srn(char_ops, acc_num = 0 img_num = 0 + char_num = char_ops.get_char_num() + total_len = preds.shape[0] img_num = int(total_len / max_text_len) for i in range(img_num): cur_label = [] cur_pred = [] for j in range(max_text_len): - if labels[j + i * max_text_len] != 37: #0 + if labels[j + i * max_text_len] != int(char_num-1): #0 cur_label.append(labels[j + i * max_text_len][0]) else: break @@ -178,7 +178,7 @@ def cal_predicts_accuracy_srn(char_ops, elif j == len(cur_label) and j == max_text_len: acc_num += 1 break - elif j == len(cur_label) and preds[j + i * max_text_len][0] == 37: + elif j == len(cur_label) and preds[j + i * max_text_len][0] == int(char_num-1): acc_num += 1 break acc = acc_num * 1.0 / img_num diff --git a/requirments.txt b/requirments.txt index 94e8478ffad88a6e5cd69424c6aa485400cfae06..ec538138beaed70ec8f5285ea0c4114f22e3b0ef 100644 --- a/requirments.txt +++ b/requirments.txt @@ -1,4 +1,6 @@ shapely imgaug pyclipper -lmdb \ No newline at end of file +lmdb +tqdm +numpy \ No newline at end of file diff --git a/setup.py b/setup.py new file mode 100644 index 
0000000000000000000000000000000000000000..7141f170f3afa2be5217faff66a2aeb12dbefcbe --- /dev/null +++ b/setup.py @@ -0,0 +1,56 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from setuptools import setup +from io import open + +with open('requirments.txt', encoding="utf-8-sig") as f: + requirements = f.readlines() + requirements.append('tqdm') + + +def readme(): + with open('doc/doc_en/whl_en.md', encoding="utf-8-sig") as f: + README = f.read() + return README + + +setup( + name='paddleocr', + packages=['paddleocr'], + package_dir={'paddleocr': ''}, + include_package_data=True, + entry_points={"console_scripts": ["paddleocr= paddleocr.paddleocr:main"]}, + version='0.0.3', + install_requires=requirements, + license='Apache License 2.0', + description='Awesome OCR toolkits based on PaddlePaddle (8.6M ultra-lightweight pre-trained model, support training and deployment among server, mobile, embeded and IoT devices', + long_description=readme(), + long_description_content_type='text/markdown', + url='https://github.com/PaddlePaddle/PaddleOCR', + download_url='https://github.com/PaddlePaddle/PaddleOCR.git', + keywords=[ + 'ocr textdetection textrecognition paddleocr crnn east star-net rosetta ocrlite db chineseocr chinesetextdetection chinesetextrecognition' + ], + classifiers=[ + 'Intended Audience :: Developers', 'Operating System :: OS Independent', + 'Natural Language :: Chinese (Simplified)', + 
'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.2', + 'Programming Language :: Python :: 3.3', + 'Programming Language :: Python :: 3.4', + 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 3.7', 'Topic :: Utilities' + ], ) diff --git a/tools/eval.py b/tools/eval.py index 041e825e4f840cef22eb58f6ad6e1c8305b255ee..aff5fc7111a062c9b4346e9c2dcbc8f9225fe8da 100755 --- a/tools/eval.py +++ b/tools/eval.py @@ -18,9 +18,9 @@ from __future__ import print_function import os import sys -__dir__ = os.path.dirname(__file__) +__dir__ = os.path.dirname(os.path.abspath(__file__)) sys.path.append(__dir__) -sys.path.append(os.path.join(__dir__, '..')) +sys.path.append(os.path.abspath(os.path.join(__dir__, '..'))) def set_paddle_flags(**kwargs): diff --git a/tools/export_model.py b/tools/export_model.py index de4ba0e4c44fec1cd2427bfe7c9065639eef26e2..0bd06b98dcacc06893becbefacbea198c432bc39 100644 --- a/tools/export_model.py +++ b/tools/export_model.py @@ -18,9 +18,9 @@ from __future__ import print_function import os import sys -__dir__ = os.path.dirname(__file__) +__dir__ = os.path.dirname(os.path.abspath(__file__)) sys.path.append(__dir__) -sys.path.append(os.path.join(__dir__, '..')) +sys.path.append(os.path.abspath(os.path.join(__dir__, '..'))) def set_paddle_flags(**kwargs): diff --git a/tools/infer/predict_cls.py b/tools/infer/predict_cls.py index d4434445bbb9717033d72fe47c539f5f81efe002..54e2dbbba5481e803d29ff16b032fcb57f6446c5 100755 --- a/tools/infer/predict_cls.py +++ b/tools/infer/predict_cls.py @@ -28,6 +28,7 @@ import copy import numpy as np import math import time +from paddle import fluid class TextClassifier(object): @@ -37,6 +38,7 @@ class TextClassifier(object): self.cls_image_shape = [int(v) for v in args.cls_image_shape.split(",")] self.cls_batch_num = args.rec_batch_num self.label_list = args.label_list + self.use_zero_copy_run = args.use_zero_copy_run def 
resize_norm_img(self, img): imgC, imgH, imgW = self.cls_image_shape @@ -89,8 +91,12 @@ class TextClassifier(object): norm_img_batch = norm_img_batch.copy() starttime = time.time() - self.input_tensor.copy_from_cpu(norm_img_batch) - self.predictor.zero_copy_run() + if self.use_zero_copy_run: + self.input_tensor.copy_from_cpu(norm_img_batch) + self.predictor.zero_copy_run() + else: + norm_img_batch = fluid.core.PaddleTensor(norm_img_batch) + self.predictor.run([norm_img_batch]) prob_out = self.output_tensors[0].copy_to_cpu() label_out = self.output_tensors[1].copy_to_cpu() diff --git a/tools/infer/predict_det.py b/tools/infer/predict_det.py index 75644aeb990ab95edb51f2809bb8cc8fbdf3e2be..625f87abc39fc0e9d7683f72dafec1d53324873a 100755 --- a/tools/infer/predict_det.py +++ b/tools/infer/predict_det.py @@ -13,30 +13,36 @@ # limitations under the License. import os import sys -__dir__ = os.path.dirname(__file__) +__dir__ = os.path.dirname(os.path.abspath(__file__)) sys.path.append(__dir__) -sys.path.append(os.path.join(__dir__, '../..')) +sys.path.append(os.path.abspath(os.path.join(__dir__, '../..'))) + +import cv2 +import copy +import numpy as np +import math +import time +import sys + +import paddle.fluid as fluid import tools.infer.utility as utility from ppocr.utils.utility import initial_logger logger = initial_logger() from ppocr.utils.utility import get_image_file_list, check_and_read_gif -import cv2 +from ppocr.data.det.sast_process import SASTProcessTest from ppocr.data.det.east_process import EASTProcessTest from ppocr.data.det.db_process import DBProcessTest from ppocr.postprocess.db_postprocess import DBPostProcess from ppocr.postprocess.east_postprocess import EASTPostPocess -import copy -import numpy as np -import math -import time -import sys +from ppocr.postprocess.sast_postprocess import SASTPostProcess class TextDetector(object): def __init__(self, args): max_side_len = args.det_max_side_len self.det_algorithm = args.det_algorithm + 
self.use_zero_copy_run = args.use_zero_copy_run preprocess_params = {'max_side_len': max_side_len} postprocess_params = {} if self.det_algorithm == "DB": @@ -52,6 +58,20 @@ class TextDetector(object): postprocess_params["cover_thresh"] = args.det_east_cover_thresh postprocess_params["nms_thresh"] = args.det_east_nms_thresh self.postprocess_op = EASTPostPocess(postprocess_params) + elif self.det_algorithm == "SAST": + self.preprocess_op = SASTProcessTest(preprocess_params) + postprocess_params["score_thresh"] = args.det_sast_score_thresh + postprocess_params["nms_thresh"] = args.det_sast_nms_thresh + self.det_sast_polygon = args.det_sast_polygon + if self.det_sast_polygon: + postprocess_params["sample_pts_num"] = 6 + postprocess_params["expand_scale"] = 1.2 + postprocess_params["shrink_ratio_of_width"] = 0.2 + else: + postprocess_params["sample_pts_num"] = 2 + postprocess_params["expand_scale"] = 1.0 + postprocess_params["shrink_ratio_of_width"] = 0.3 + self.postprocess_op = SASTPostProcess(postprocess_params) else: logger.info("unknown det_algorithm:{}".format(self.det_algorithm)) sys.exit(0) @@ -84,7 +104,7 @@ class TextDetector(object): return rect def clip_det_res(self, points, img_height, img_width): - for pno in range(4): + for pno in range(points.shape[0]): points[pno, 0] = int(min(max(points[pno, 0], 0), img_width - 1)) points[pno, 1] = int(min(max(points[pno, 1], 0), img_height - 1)) return points @@ -103,6 +123,15 @@ class TextDetector(object): dt_boxes = np.array(dt_boxes_new) return dt_boxes + def filter_tag_det_res_only_clip(self, dt_boxes, image_shape): + img_height, img_width = image_shape[0:2] + dt_boxes_new = [] + for box in dt_boxes: + box = self.clip_det_res(box, img_height, img_width) + dt_boxes_new.append(box) + dt_boxes = np.array(dt_boxes_new) + return dt_boxes + def __call__(self, img): ori_im = img.copy() im, ratio_list = self.preprocess_op(img) @@ -110,8 +139,12 @@ class TextDetector(object): return None, 0 im = im.copy() starttime = 
time.time() - self.input_tensor.copy_from_cpu(im) - self.predictor.zero_copy_run() + if self.use_zero_copy_run: + self.input_tensor.copy_from_cpu(im) + self.predictor.zero_copy_run() + else: + im = fluid.core.PaddleTensor(im) + self.predictor.run([im]) outputs = [] for output_tensor in self.output_tensors: output = output_tensor.copy_to_cpu() @@ -120,11 +153,20 @@ class TextDetector(object): if self.det_algorithm == "EAST": outs_dict['f_geo'] = outputs[0] outs_dict['f_score'] = outputs[1] + elif self.det_algorithm == 'SAST': + outs_dict['f_border'] = outputs[0] + outs_dict['f_score'] = outputs[1] + outs_dict['f_tco'] = outputs[2] + outs_dict['f_tvo'] = outputs[3] else: outs_dict['maps'] = outputs[0] + dt_boxes_list = self.postprocess_op(outs_dict, [ratio_list]) dt_boxes = dt_boxes_list[0] - dt_boxes = self.filter_tag_det_res(dt_boxes, ori_im.shape) + if self.det_algorithm == "SAST" and self.det_sast_polygon: + dt_boxes = self.filter_tag_det_res_only_clip(dt_boxes, ori_im.shape) + else: + dt_boxes = self.filter_tag_det_res(dt_boxes, ori_im.shape) elapse = time.time() - starttime return dt_boxes, elapse diff --git a/tools/infer/predict_rec.py b/tools/infer/predict_rec.py index c81b4eb2560ee5ad66a85c96efe4de935a2beee1..6a379853a4a7d62cbffcbebbf09e2fb3e2207b27 100755 --- a/tools/infer/predict_rec.py +++ b/tools/infer/predict_rec.py @@ -17,15 +17,18 @@ __dir__ = os.path.dirname(os.path.abspath(__file__)) sys.path.append(__dir__) sys.path.append(os.path.abspath(os.path.join(__dir__, '../..'))) -import tools.infer.utility as utility -from ppocr.utils.utility import initial_logger -logger = initial_logger() -from ppocr.utils.utility import get_image_file_list, check_and_read_gif import cv2 import copy import numpy as np import math import time + +import paddle.fluid as fluid + +import tools.infer.utility as utility +from ppocr.utils.utility import initial_logger +logger = initial_logger() +from ppocr.utils.utility import get_image_file_list, check_and_read_gif from 
ppocr.utils.character import CharacterOps @@ -37,6 +40,7 @@ class TextRecognizer(object): self.character_type = args.rec_char_type self.rec_batch_num = args.rec_batch_num self.rec_algorithm = args.rec_algorithm + self.use_zero_copy_run = args.use_zero_copy_run char_ops_params = { "character_type": args.rec_char_type, "character_dict_path": args.rec_char_dict_path, @@ -102,8 +106,12 @@ class TextRecognizer(object): norm_img_batch = np.concatenate(norm_img_batch) norm_img_batch = norm_img_batch.copy() starttime = time.time() - self.input_tensor.copy_from_cpu(norm_img_batch) - self.predictor.zero_copy_run() + if self.use_zero_copy_run: + self.input_tensor.copy_from_cpu(norm_img_batch) + self.predictor.zero_copy_run() + else: + norm_img_batch = fluid.core.PaddleTensor(norm_img_batch) + self.predictor.run([norm_img_batch]) if self.loss_type == "ctc": rec_idx_batch = self.output_tensors[0].copy_to_cpu() diff --git a/tools/infer/predict_system.py b/tools/infer/predict_system.py index c34fb9635d4d689355d823fcc36bba19136da05c..555c12b1a929662f436e3a9a031b2e480a837622 100755 --- a/tools/infer/predict_system.py +++ b/tools/infer/predict_system.py @@ -160,12 +160,7 @@ def main(args): scores = [rec_res[i][1] for i in range(len(rec_res))] draw_img = draw_ocr( - image, - boxes, - txts, - scores, - draw_txt=True, - drop_score=drop_score) + image, boxes, txts, scores, drop_score=drop_score) draw_img_save = "./inference_results/" if not os.path.exists(draw_img_save): os.makedirs(draw_img_save) diff --git a/tools/infer/utility.py b/tools/infer/utility.py index bde7a41cc7265a97710e82c342687ce398a478d4..cbbda97b2a60aeba2a592a8d1b5aa1dc294d4067 100755 --- a/tools/infer/utility.py +++ b/tools/infer/utility.py @@ -53,6 +53,11 @@ def parse_args(): parser.add_argument("--det_east_cover_thresh", type=float, default=0.1) parser.add_argument("--det_east_nms_thresh", type=float, default=0.2) + #SAST parmas + parser.add_argument("--det_sast_score_thresh", type=float, default=0.5) + 
parser.add_argument("--det_sast_nms_thresh", type=float, default=0.2) + parser.add_argument("--det_sast_polygon", type=bool, default=False) + #params for text recognizer parser.add_argument("--rec_algorithm", type=str, default='CRNN') parser.add_argument("--rec_model_dir", type=str) @@ -73,6 +78,7 @@ def parse_args(): parser.add_argument("--cls_batch_num", type=int, default=30) parser.add_argument("--enable_mkldnn", type=bool, default=False) + parser.add_argument("--use_zero_copy_run", type=bool, default=False) return parser.parse_args() @@ -109,9 +115,12 @@ def create_predictor(args, mode): #config.enable_memory_optim() config.disable_glog_info() - # use zero copy - config.delete_pass("conv_transpose_eltwiseadd_bn_fuse_pass") - config.switch_use_feed_fetch_ops(False) + if args.use_zero_copy_run: + config.delete_pass("conv_transpose_eltwiseadd_bn_fuse_pass") + config.switch_use_feed_fetch_ops(False) + else: + config.switch_use_feed_fetch_ops(True) + predictor = create_paddle_predictor(config) input_names = predictor.get_input_names() input_tensor = predictor.get_input_tensor(input_names[0]) @@ -143,7 +152,12 @@ def resize_img(img, input_size=600): return im -def draw_ocr(image, boxes, txts, scores, draw_txt=True, drop_score=0.5): +def draw_ocr(image, + boxes, + txts=None, + scores=None, + drop_score=0.5, + font_path="./doc/simfang.ttf"): """ Visualize the results of OCR detection and recognition args: @@ -151,23 +165,29 @@ def draw_ocr(image, boxes, txts, scores, draw_txt=True, drop_score=0.5): boxes(list): boxes with shape(N, 4, 2) txts(list): the texts scores(list): txxs corresponding scores - draw_txt(bool): whether draw text or not drop_score(float): only scores greater than drop_threshold will be visualized + font_path: the path of font which is used to draw text return(array): the visualized img """ if scores is None: scores = [1] * len(boxes) - for (box, score) in zip(boxes, scores): - if score < drop_score or math.isnan(score): + box_num = len(boxes) + for 
i in range(box_num): + if scores is not None and (scores[i] < drop_score or + math.isnan(scores[i])): continue - box = np.reshape(np.array(box), [-1, 1, 2]).astype(np.int64) + box = np.reshape(np.array(boxes[i]), [-1, 1, 2]).astype(np.int64) image = cv2.polylines(np.array(image), [box], True, (255, 0, 0), 2) - - if draw_txt: + if txts is not None: img = np.array(resize_img(image, input_size=600)) txt_img = text_visual( - txts, scores, img_h=img.shape[0], img_w=600, threshold=drop_score) + txts, + scores, + img_h=img.shape[0], + img_w=600, + threshold=drop_score, + font_path=font_path) img = np.concatenate([np.array(img), np.array(txt_img)], axis=1) return img return image @@ -245,7 +265,12 @@ def str_count(s): return s_len - math.ceil(en_dg_count / 2) -def text_visual(texts, scores, img_h=400, img_w=600, threshold=0.): +def text_visual(texts, + scores, + img_h=400, + img_w=600, + threshold=0., + font_path="./doc/simfang.ttf"): """ create new blank img and draw txt on it args: @@ -253,6 +278,7 @@ def text_visual(texts, scores, img_h=400, img_w=600, threshold=0.): scores(list|None): corresponding score of each txt img_h(int): the height of blank img img_w(int): the width of blank img + font_path: the path of font which is used to draw text return(array): """ @@ -271,7 +297,7 @@ def text_visual(texts, scores, img_h=400, img_w=600, threshold=0.): font_size = 20 txt_color = (0, 0, 0) - font = ImageFont.truetype("./doc/simfang.ttf", font_size, encoding="utf-8") + font = ImageFont.truetype(font_path, font_size, encoding="utf-8") gap = font_size + 5 txt_img_list = [] @@ -352,6 +378,6 @@ if __name__ == '__main__': txts.append(dic['transcription']) scores.append(round(dic['scores'], 3)) - new_img = draw_ocr(image, boxes, txts, scores, draw_txt=True) + new_img = draw_ocr(image, boxes, txts, scores) cv2.imwrite(img_name, new_img) diff --git a/tools/infer_det.py b/tools/infer_det.py index a8b49b6b075ba509e17c37c8d1f05dee9822edec..1e7fdcc46a1d2f47a7928d6dc171ae393b15f901 100755 
--- a/tools/infer_det.py +++ b/tools/infer_det.py @@ -22,9 +22,9 @@ import json import os import sys -__dir__ = os.path.dirname(__file__) +__dir__ = os.path.dirname(os.path.abspath(__file__)) sys.path.append(__dir__) -sys.path.append(os.path.join(__dir__, '..')) +sys.path.append(os.path.abspath(os.path.join(__dir__, '..'))) def set_paddle_flags(**kwargs): @@ -134,8 +134,10 @@ def main(): dic = {'f_score': outs[0], 'f_geo': outs[1]} elif config['Global']['algorithm'] == 'DB': dic = {'maps': outs[0]} + elif config['Global']['algorithm'] == 'SAST': + dic = {'f_score': outs[0], 'f_border': outs[1], 'f_tvo': outs[2], 'f_tco': outs[3]} else: - raise Exception("only support algorithm: ['EAST', 'DB']") + raise Exception("only support algorithm: ['EAST', 'DB', 'SAST']") dt_boxes_list = postprocess(dic, ratio_list) for ino in range(img_num): dt_boxes = dt_boxes_list[ino] @@ -149,7 +151,7 @@ def main(): fout.write(otstr.encode()) src_img = cv2.imread(img_name) draw_det_res(dt_boxes, config, src_img, img_name) - + logger.info("success!") diff --git a/tools/infer_rec.py b/tools/infer_rec.py index 21b503cc7f342885094a03ef1f1ed0f05698ac70..fd70cd66dccc2cb755efbd10c4d16c9f7a97146d 100755 --- a/tools/infer_rec.py +++ b/tools/infer_rec.py @@ -19,9 +19,9 @@ from __future__ import print_function import numpy as np import os import sys -__dir__ = os.path.dirname(__file__) +__dir__ = os.path.dirname(os.path.abspath(__file__)) sys.path.append(__dir__) -sys.path.append(os.path.join(__dir__, '..')) +sys.path.append(os.path.abspath(os.path.join(__dir__, '..'))) def set_paddle_flags(**kwargs): @@ -140,12 +140,12 @@ def main(): preds = preds.reshape(-1) preds_text = char_ops.decode(preds) elif loss_type == "srn": - cur_pred = [] + char_num = char_ops.get_char_num() preds = np.array(predict[0]) preds = preds.reshape(-1) probs = np.array(predict[1]) ind = np.argmax(probs, axis=1) - valid_ind = np.where(preds != 37)[0] + valid_ind = np.where(preds != int(char_num-1))[0] if len(valid_ind) == 0: 
continue score = np.mean(probs[valid_ind, ind[valid_ind]]) diff --git a/tools/train.py b/tools/train.py index e477d9c3d1ef612e68293b8a1be151813ae469fd..531dd15933ebfd83527f091215c40b85253f7866 100755 --- a/tools/train.py +++ b/tools/train.py @@ -18,9 +18,9 @@ from __future__ import print_function import os import sys -__dir__ = os.path.dirname(__file__) +__dir__ = os.path.dirname(os.path.abspath(__file__)) sys.path.append(__dir__) -sys.path.append(os.path.join(__dir__, '..')) +sys.path.append(os.path.abspath(os.path.join(__dir__, '..'))) def set_paddle_flags(**kwargs):