diff --git a/PPOCRLabel/PPOCRLabel.py b/PPOCRLabel/PPOCRLabel.py index 8ba0c53275ed0450a38fcb1b1b514aede3575ec2..15b0b0b5a9628987ec58f4ee02a8d57fdd41471d 100644 --- a/PPOCRLabel/PPOCRLabel.py +++ b/PPOCRLabel/PPOCRLabel.py @@ -106,7 +106,7 @@ class MainWindow(QMainWindow, WindowMixin): getStr = lambda strId: self.stringBundle.getString(strId) self.defaultSaveDir = defaultSaveDir - self.ocr = PaddleOCR(use_pdserving=False, use_angle_cls=True, det=True, cls=True, use_gpu=True, lang=lang) + self.ocr = PaddleOCR(use_pdserving=False, use_angle_cls=True, det=True, cls=True, use_gpu=False, lang=lang) if os.path.exists('./data/paddle.png'): result = self.ocr.ocr('./data/paddle.png', cls=True, det=True) @@ -147,7 +147,7 @@ class MainWindow(QMainWindow, WindowMixin): self.prevLabelText = getStr('tempLabel') self.model = 'paddle' self.PPreader = None - self.autoSaveNum = 10 + self.autoSaveNum = 5 ################# file list ############### self.fileListWidget = QListWidget() diff --git a/PPOCRLabel/README.md b/PPOCRLabel/README.md index 4e1e230405c58a7bf38b7b5dc958643690fd3f7e..fda8725b46906884c6fd7e0556244e0103815d73 100644 --- a/PPOCRLabel/README.md +++ b/PPOCRLabel/README.md @@ -2,7 +2,7 @@ English | [简体中文](README_ch.md) # PPOCRLabel -PPOCRLabel is a semi-automatic graphic annotation tool suitable for OCR field. It is written in python3 and pyqt5, supporting rectangular box annotation and four-point annotation modes. Annotations can be directly used for the training of PPOCR detection and recognition models. +PPOCRLabel is a semi-automatic graphic annotation tool suitable for the OCR field, with a built-in PPOCR model that automatically detects and re-recognizes data. It is written in python3 and pyqt5, supporting rectangular box annotation and four-point annotation modes. Annotations can be directly used for the training of PPOCR detection and recognition models. @@ -10,11 +10,15 @@ PPOCRLabel is a semi-automatic graphic annotation tool suitable for OCR field. I - 2020.12.18: Support re-recognition of a single label box (by [ninetailskim](https://github.com/ninetailskim) ), perfect shortcut keys. +### TODO: +- Lock box mode: For the same scene data, the size and position of the locked detection box can be transferred between different pictures. +- Experience optimization: add undo and batch operations (move, copy, delete, etc.), and optimize the annotation process. + ## Installation ### 1. Install PaddleOCR -Refer to [PaddleOCR installation document](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/doc/doc_ch/installation.md) to prepare PaddleOCR +The PaddleOCR model is built into PPOCRLabel; please refer to the [PaddleOCR installation document](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/doc/doc_ch/installation.md) to prepare PaddleOCR and make sure it works. ### 2. Install PPOCRLabel @@ -60,7 +64,7 @@ python3 PPOCRLabel.py 4.1 Click 'Create RectBox' or press 'W' in English keyboard mode to draw a new rectangle detection box. Click and release left mouse to select a region to annotate the text area. - 4.2 Press 'P' to enter four-point labeling mode which enables you to create any four-point shape by clicking four points with the left mouse button in succession and DOUBLE CLICK the left mouse as the signal of labeling completion. + 4.2 Press 'Q' to enter four-point labeling mode which enables you to create any four-point shape by clicking four points with the left mouse button in succession and DOUBLE CLICK the left mouse as the signal of labeling completion. 5.
After the marking frame is drawn, the user clicks "OK", and the detection frame will be pre-assigned a "TEMPORARY" label. @@ -72,7 +76,7 @@ python3 PPOCRLabel.py 9. Click "Delete Image" and the image will be deleted to the recycle bin. -10. Labeling result: the user can save manually through the menu "File - Save Label", while the program will also save automatically after every 10 images confirmed by the user.the manually checked label will be stored in *Label.txt* under the opened picture folder. +10. Labeling result: the user can save manually through the menu "File - Save Label", while the program will also save automatically after every 5 images confirmed by the user. The manually checked labels will be stored in *Label.txt* under the opened picture folder. Click "PaddleOCR"-"Save Recognition Results" in the menu bar, the recognition training data of such pictures will be saved in the *crop_img* folder, and the recognition label will be saved in *rec_gt.txt*[4]. ### Note @@ -88,7 +92,7 @@ Therefore, if the recognition result has been manually changed before, it may ch | File name | Description | | :-----------: | :----------------------------------------------------------: | -| Label.txt | The detection label file can be directly used for PPOCR detection model training. After the user saves 10 label results, the file will be automatically saved. It will also be written when the user closes the application or changes the file folder. | +| Label.txt | The detection label file can be directly used for PPOCR detection model training. After the user saves 5 label results, the file will be automatically saved. It will also be written when the user closes the application or changes the file folder. | | fileState.txt | The picture status file save the image in the current folder that has been manually confirmed by the user. | | Cache.cach | Cache files to save the results of model recognition. | | rec_gt.txt | The recognition label file, which can be directly used for PPOCR identification model training, is generated after the user clicks on the menu bar "File"-"Save recognition result". | @@ -124,6 +128,15 @@ Therefore, if the recognition result has been manually changed before, it may ch - Custom model: The model trained by users can be replaced by modifying PPOCRLabel.py in [PaddleOCR class instantiation](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/PPOCRLabel/PPOCRLabel.py#L110) referring [Custom Model Code](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/doc/doc_en/whl_en.md#use-custom-model) +### Save + +PPOCRLabel supports three ways to save Label.txt: + +- Automatic save: when the program detects that the user has manually checked 5 pictures, it automatically writes the annotations into Label.txt. The user can change the value of ``self.autoSaveNum`` in ``PPOCRLabel.py`` to set the number of images to be confirmed before an automatic save. +- Manual save: Click "File-Save Marking Results" to manually save the label. +- Save on close: the annotations are also written to Label.txt when the application is closed. + + ### Export partial recognition results For some data that are difficult to recognize, the recognition results will not be exported by **unchecking** the corresponding tags in the recognition results checkbox.
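To make the custom-model note above concrete, here is a minimal sketch of how the PaddleOCR instantiation in PPOCRLabel.py could point at user-trained models; the model directories and dictionary path below are hypothetical placeholders, not files shipped with the repo:

```python
# Minimal sketch, assuming the user has exported their own inference models.
# All three paths are hypothetical placeholders.
from paddleocr import PaddleOCR

ocr = PaddleOCR(
    det_model_dir='./my_models/det_infer/',     # custom detection model (placeholder)
    rec_model_dir='./my_models/rec_infer/',     # custom recognition model (placeholder)
    rec_char_dict_path='./my_models/dict.txt',  # dictionary matching the rec model (placeholder)
    use_angle_cls=True,
    use_gpu=False,
    lang='ch')

# Same call pattern PPOCRLabel uses on its sample image:
result = ocr.ocr('./data/paddle.png', det=True, cls=True)
```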
diff --git a/PPOCRLabel/README_ch.md b/PPOCRLabel/README_ch.md index c4b4b07e3f9a7ca9faf9de11e8d0684cbe3efde9..b41ae200e0a784de2887f008a82d2c3ae645c976 100644 --- a/PPOCRLabel/README_ch.md +++ b/PPOCRLabel/README_ch.md @@ -2,18 +2,26 @@ # PPOCRLabel -PPOCRLabel是一款适用于OCR领域的半自动化图形标注工具,使用python3和pyqt5编写,支持矩形框标注和四点标注模式,导出格式可直接用于PPOCR检测和识别模型的训练。 +PPOCRLabel是一款适用于OCR领域的半自动化图形标注工具,内置PPOCR模型对数据自动标注和重新识别。使用python3和pyqt5编写,支持矩形框标注和四点标注模式,导出格式可直接用于PPOCR检测和识别模型的训练。 #### 近期更新 -- 2020.12.18: 支持对单个标记框进行重新识别(by [ninetailskim](https://github.com/ninetailskim) ),完善快捷键。 +- 2020.12.18: 支持对单个标记框进行重新识别(by [ninetailskim](https://github.com/ninetailskim)),完善快捷键。 + +#### 敬请期待 + +- 锁定框模式:针对同一场景数据,被锁定的检测框的大小与位置能在不同图片之间传递。 +- 体验优化:增加撤销操作,批量移动、复制、删除等功能。优化标注流程。 + +如果您对以上内容感兴趣或对完善工具有不一样的想法,欢迎加入我们的队伍,与我们共同开发。 + ## 安装 ### 1. 安装PaddleOCR -参考[PaddleOCR安装文档](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/doc/doc_ch/installation.md)准备好PaddleOCR +PPOCRLabel内置PaddleOCR模型,故请参考[PaddleOCR安装文档](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/doc/doc_ch/installation.md)准备好PaddleOCR,并确保PaddleOCR安装成功。 ### 2. 安装PPOCRLabel #### Windows + Anaconda @@ -49,13 +57,13 @@ python3 PPOCRLabel.py --lang ch 1. 安装与运行:使用上述命令安装与运行程序。 2. 打开文件夹:在菜单栏点击 “文件” - "打开目录" 选择待标记图片的文件夹[1]. 3. 自动标注:点击 ”自动标注“,使用PPOCR超轻量模型对图片文件名前图片状态[2]为 “X” 的图片进行自动标注。 -4. 手动标注:点击 “矩形标注”(推荐直接在英文模式下点击键盘中的 “W”),用户可对当前图片中模型未检出的部分进行手动绘制标记框。点击键盘P,则使用四点标注模式(或点击“编辑” - “四点标注”),用户依次点击4个点后,双击左键表示标注完成。 +4. 手动标注:点击 “矩形标注”(推荐直接在英文模式下点击键盘中的 “W”),用户可对当前图片中模型未检出的部分进行手动绘制标记框。点击键盘Q,则使用四点标注模式(或点击“编辑” - “四点标注”),用户依次点击4个点后,双击左键表示标注完成。 5. 标记框绘制完成后,用户点击 “确认”,检测框会先被预分配一个 “待识别” 标签。 6. 重新识别:将图片中的所有检测画绘制/调整完成后,点击 “重新识别”,PPOCR模型会对当前图片中的**所有检测框**重新识别[3]。 7. 内容更改:双击识别结果,对不准确的识别结果进行手动更改。 8. 确认标记:点击 “确认”,图片状态切换为 “√”,跳转至下一张(此时不会直接将结果写入文件)。 9. 删除:点击 “删除图像”,图片将会被删除至回收站。 -10. 保存结果:用户可以通过菜单中“文件-保存标记结果”手动保存,同时程序也会在用户每确认10张图片后自动保存一次。手动确认过的标记将会被存放在所打开图片文件夹下的*Label.txt*中。在菜单栏点击 “文件” - "保存识别结果"后,会将此类图片的识别训练数据保存在*crop_img*文件夹下,识别标签保存在*rec_gt.txt*中[4]。 +10.
保存结果:用户可以通过菜单中“文件-保存标记结果”手动保存,同时程序也会在用户每确认5张图片后自动保存一次。手动确认过的标记将会被存放在所打开图片文件夹下的*Label.txt*中。在菜单栏点击 “文件” - "保存识别结果"后,会将此类图片的识别训练数据保存在*crop_img*文件夹下,识别标签保存在*rec_gt.txt*中[4]。 ### 注意 @@ -69,7 +77,7 @@ python3 PPOCRLabel.py --lang ch | 文件名 | 说明 | | :-----------: | :----------------------------------------------------------: | -| Label.txt | 检测标签,可直接用于PPOCR检测模型训练。用户每保存10张检测结果后,程序会进行自动写入。当用户关闭应用程序或切换文件路径后同样会进行写入。 | +| Label.txt | 检测标签,可直接用于PPOCR检测模型训练。用户每保存5张检测结果后,程序会进行自动写入。当用户关闭应用程序或切换文件路径后同样会进行写入。 | | fileState.txt | 图片状态标记文件,保存当前文件夹下已经被用户手动确认过的图片名称。 | | Cache.cach | 缓存文件,保存模型自动识别的结果。 | | rec_gt.txt | 识别标签。可直接用于PPOCR识别模型训练。需用户手动点击菜单栏“文件” - "保存识别结果"后产生。 | @@ -104,6 +112,14 @@ python3 PPOCRLabel.py --lang ch - 自定义模型:用户可根据[自定义模型代码使用](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/doc/doc_ch/whl.md#%E8%87%AA%E5%AE%9A%E4%B9%89%E6%A8%A1%E5%9E%8B),通过修改PPOCRLabel.py中针对[PaddleOCR类的实例化](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/PPOCRLabel/PPOCRLabel.py#L110)替换成自己训练的模型。 +### 保存方式 + +PPOCRLabel支持三种保存方式: + +- 程序自动保存:当检测到用户手动确认过5张图片后,程序自动将标记结果写入Label.txt中。其中用户可通过更改```PPOCRLabel.py```中的```self.autoSaveNum```的数值设置确认几张图片后进行自动保存。 +- 手动保存:点击“文件 - 保存标记结果”手动保存标记。 +- 关闭应用程序保存 + ### 导出部分识别结果 针对部分难以识别的数据,通过在识别结果的复选框中**取消勾选**相应的标记,其识别结果不会被导出。 @@ -115,7 +131,7 @@ python3 PPOCRLabel.py --lang ch - PPOCRLabel**不支持对中文文件名**的图片进行自动标注。 -- 针对Linux用户::如果您在打开软件过程中出现**objc[XXXXX]**开头的错误,证明您的opencv版本太高,建议安装4.2版本: +- 针对Linux用户:如果您在打开软件过程中出现**objc[XXXXX]**开头的错误,证明您的opencv版本太高,建议安装4.2版本: ``` pip install opencv-python==4.2.0.32 ``` @@ -129,6 +145,7 @@ python3 PPOCRLabel.py --lang ch ``` pip install opencv-contrib-python-headless ``` + ### 参考资料 1.[Tzutalin. LabelImg. Git code (2015)](https://github.com/tzutalin/labelImg) diff --git a/PPOCRLabel/requirements/requirements-linux-python3.txt b/PPOCRLabel/requirements/requirements-linux-python3.txt deleted file mode 100644 index 787a7fda11ade96de811381d158f2742298b0632..0000000000000000000000000000000000000000 --- a/PPOCRLabel/requirements/requirements-linux-python3.txt +++ /dev/null @@ -1,2 +0,0 @@ -pyqt5==5.10.1 -lxml==4.2.4 diff --git a/README.md b/README.md index 3f6737f8343d4d03be98d76fe941482c5de8397f..5b6e4bd0b594d71edd3ab4f8da350475c3ac83b8 100644 --- a/README.md +++ b/README.md @@ -122,8 +122,7 @@ For a new language request, please refer to [Guideline for new language_requests -PP-OCR is a practical ultra-lightweight OCR system. It is mainly composed of three parts: DB text detection, detection frame correction and CRNN text recognition. The system adopts 19 effective strategies from 8 aspects including backbone network selection and adjustment, prediction head design, data augmentation, learning rate transformation strategy, regularization parameter selection, pre-training model use, and automatic model tailoring and quantization to optimize and slim down the models of each module. The final results are an ultra-lightweight Chinese and English OCR model with an overall size of 3.5M and a 2.8M English digital OCR model. For more details, please refer to the PP-OCR technical article (https://arxiv.org/abs/2009.09941). Besides, The implementation of the FPGM Pruner and PACT quantization is based on [PaddleSlim](https://github.com/PaddlePaddle/PaddleSlim). - +PP-OCR is a practical ultra-lightweight OCR system. It is mainly composed of three parts: DB text detection[2], detection frame correction and CRNN text recognition[7]. 
The system adopts 19 effective strategies from 8 aspects including backbone network selection and adjustment, prediction head design, data augmentation, learning rate transformation strategy, regularization parameter selection, pre-training model use, and automatic model tailoring and quantization to optimize and slim down the models of each module. The final results are an ultra-lightweight Chinese and English OCR model with an overall size of 3.5M and a 2.8M English digital OCR model. For more details, please refer to the PP-OCR technical article (https://arxiv.org/abs/2009.09941). Besides, the implementation of the FPGM Pruner [8] and PACT quantization [9] is based on [PaddleSlim](https://github.com/PaddlePaddle/PaddleSlim). ## Visualization [more](./doc/doc_en/visualization_en.md) @@ -174,7 +173,7 @@ This project is released under -PP-OCR是一个实用的超轻量OCR系统。主要由DB文本检测、检测框矫正和CRNN文本识别三部分组成。该系统从骨干网络选择和调整、预测头部的设计、数据增强、学习率变换策略、正则化参数选择、预训练模型使用以及模型自动裁剪量化8个方面,采用19个有效策略,对各个模块的模型进行效果调优和瘦身,最终得到整体大小为3.5M的超轻量中英文OCR和2.8M的英文数字OCR。更多细节请参考PP-OCR技术方案 https://arxiv.org/abs/2009.09941 。其中FPGM裁剪器和PACT量化的实现可以参考[PaddleSlim](https://github.com/PaddlePaddle/PaddleSlim)。 +PP-OCR是一个实用的超轻量OCR系统。主要由DB文本检测[2]、检测框矫正和CRNN文本识别[7]三部分组成。该系统从骨干网络选择和调整、预测头部的设计、数据增强、学习率变换策略、正则化参数选择、预训练模型使用以及模型自动裁剪量化8个方面,采用19个有效策略,对各个模块的模型进行效果调优和瘦身,最终得到整体大小为3.5M的超轻量中英文OCR和2.8M的英文数字OCR。更多细节请参考PP-OCR技术方案 https://arxiv.org/abs/2009.09941 。其中FPGM裁剪器[8]和PACT量化[9]的实现可以参考[PaddleSlim](https://github.com/PaddlePaddle/PaddleSlim)。 ## 效果展示 [more](./doc/doc_ch/visualization.md) @@ -149,7 +149,7 @@ PP-OCR是一个实用的超轻量OCR系统。主要由DB文本检测、检测框 - 非常感谢 [Khanh Tran](https://github.com/xxxpsyduck) 和 [Karl Horky](https://github.com/karlhorky) 贡献修改英文文档 -- 非常感谢 [zhangxin](https://github.com/ZhangXinNan)([Blog](https://blog.csdn.net/sdlypyzq)) 贡献新的可视化方式、添加.gitgnore、处理手动设置PYTHONPATH环境变量的问题 +- 非常感谢 [zhangxin](https://github.com/ZhangXinNan)([Blog](https://blog.csdn.net/sdlypyzq)) 贡献新的可视化方式、添加.gitignore、处理手动设置PYTHONPATH环境变量的问题 - 非常感谢 [lyl120117](https://github.com/lyl120117) 贡献打印网络结构的代码 - 非常感谢 [xiangyubo](https://github.com/xiangyubo) 贡献手写中文OCR数据集 - 非常感谢 [authorfu](https://github.com/authorfu) 贡献Android和[xiadeye](https://github.com/xiadeye) 贡献IOS的demo代码 diff --git a/StyleText/README.md b/StyleText/README.md index 60a9ee99a2d7273db2b07fc0dadc5cf4b8b84d75..3e14e320334c77723f0a493718d731efd354ea69 100644 --- a/StyleText/README.md +++ b/StyleText/README.md @@ -22,7 +22,7 @@ English | [简体中文](README_ch.md) -The Style-Text data synthesis tool is a tool based on Baidu's self-developed text editing algorithm "Editing Text in the Wild" [https://arxiv.org/abs/1908.03047](https://arxiv.org/abs/1908.03047). +The Style-Text data synthesis tool is based on the joint research work of Baidu and HUST, "Editing Text in the Wild" [https://arxiv.org/abs/1908.03047](https://arxiv.org/abs/1908.03047). Different from the commonly used GAN-based data synthesis tools, the main framework of Style-Text includes: * (1) Text foreground style transfer module. @@ -124,7 +124,7 @@ In actual application scenarios, it is often necessary to synthesize pictures in * `corpus_file`: Filepath of the corpus. Corpus file should be a text file which will be split by line-endings('\n'). Corpus generator samples one line each time.
-Example of corpus file: +Example of corpus file: ``` PaddleOCR 飞桨文字识别 diff --git a/StyleText/README_ch.md b/StyleText/README_ch.md index 5b8a3ee0fef321ed9ccee7733a74645234c44a12..fd259ca018efcdcf6bdd1040ee1642424c120ae7 100644 --- a/StyleText/README_ch.md +++ b/StyleText/README_ch.md @@ -21,7 +21,7 @@ -Style-Text数据合成工具是基于百度自研的文本编辑算法《Editing Text in the Wild》https://arxiv.org/abs/1908.03047 +Style-Text数据合成工具是基于百度和华科合作研发的文本编辑算法《Editing Text in the Wild》https://arxiv.org/abs/1908.03047 不同于常用的基于GAN的数据合成工具,Style-Text主要框架包括:1.文本前景风格迁移模块 2.背景抽取模块 3.融合模块。经过这样三步,就可以迅速实现图像文本风格迁移。下图是一些该数据合成工具效果图。 @@ -128,7 +128,7 @@ python3 tools/synth_image.py -c configs/config.yml --style_image examples/style_ 2. 运行`tools/synth_dataset`合成数据: ``` bash - python tools/synth_dataset.py -c configs/dataset_config.yml + python3 tools/synth_dataset.py -c configs/dataset_config.yml ``` 我们在examples目录下提供了样例图片和语料。
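As a toy illustration of the `corpus_file` contract described above (a plain text file split on '\n', with one line sampled per synthesized image), and not the actual Style-Text generator code:

```python
# Toy sketch of the documented corpus_file behavior; 'corpus.txt' is a
# placeholder path. The generator samples one corpus line at a time.
import random

with open('corpus.txt', encoding='utf-8') as f:
    corpus_lines = [line.strip() for line in f if line.strip()]

for _ in range(3):
    print(random.choice(corpus_lines))  # one line per synthesized image
```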
diff --git a/configs/det/det_mv3_db.yml b/configs/det/det_mv3_db.yml index 5c8a0923427bc96c10f0a1275c3639cea735f1f4..bdb4afc085f9a565defb3d34df86f9ecdb2492d5 100644 --- a/configs/det/det_mv3_db.yml +++ b/configs/det/det_mv3_db.yml @@ -67,7 +67,7 @@ Train: data_dir: ./train_data/icdar2015/text_localization/ label_file_list: - ./train_data/icdar2015/text_localization/train_icdar2015_label.txt - ratio_list: [0.5] + ratio_list: [1.0] transforms: - DecodeImage: # load image img_mode: BGR diff --git a/configs/det/det_r50_vd_db.yml b/configs/det/det_r50_vd_db.yml index f1188fe357ea5c02f8839239e788a629221bf118..19c059d6737f9e98f33e6fc3b074587b24361dfc 100644 --- a/configs/det/det_r50_vd_db.yml +++ b/configs/det/det_r50_vd_db.yml @@ -66,7 +66,7 @@ Train: data_dir: ./train_data/icdar2015/text_localization/ label_file_list: - ./train_data/icdar2015/text_localization/train_icdar2015_label.txt - ratio_list: [0.5] + ratio_list: [1.0] transforms: - DecodeImage: # load image img_mode: BGR diff --git a/configs/det/det_r50_vd_sast_totaltext.yml b/configs/det/det_r50_vd_sast_totaltext.yml index a92f1b6e539b9f78d2edc705cd9cda0fb6522c28..e040c4207e497a7bf237a84c9c8d1b7c33a2dde8 100755 --- a/configs/det/det_r50_vd_sast_totaltext.yml +++ b/configs/det/det_r50_vd_sast_totaltext.yml @@ -62,7 +62,7 @@ Train: name: SimpleDataSet data_dir: ./train_data/ label_file_list: [./train_data/art_latin_icdar_14pt/train_no_tt_test/train_label_json.txt, ./train_data/total_text_icdar_14pt/train_label_json.txt] - data_ratio_list: [0.5, 0.5] + ratio_list: [0.5, 0.5] transforms: - DecodeImage: # load image img_mode: BGR diff --git a/deploy/cpp_infer/CMakeLists.txt b/deploy/cpp_infer/CMakeLists.txt index 0f751a22ce7f848b2d7e268fb92059f9d5170cb5..1188336730ea9fbaac11a84250a3be3e418ec5f5 100644 --- a/deploy/cpp_infer/CMakeLists.txt +++ b/deploy/cpp_infer/CMakeLists.txt @@ -138,12 +138,22 @@ endif() # Note: libpaddle_inference_api.so/a must put before libpaddle_fluid.so/a if(WITH_STATIC_LIB) - set(DEPS - ${PADDLE_LIB}/paddle/lib/libpaddle_fluid${CMAKE_STATIC_LIBRARY_SUFFIX}) + if(WIN32) + set(DEPS + ${PADDLE_LIB}/paddle/lib/paddle_fluid${CMAKE_STATIC_LIBRARY_SUFFIX}) + else() + set(DEPS + ${PADDLE_LIB}/paddle/lib/libpaddle_fluid${CMAKE_STATIC_LIBRARY_SUFFIX}) + endif() else() - set(DEPS - ${PADDLE_LIB}/paddle/lib/libpaddle_fluid${CMAKE_SHARED_LIBRARY_SUFFIX}) -endif() + if(WIN32) + set(DEPS + ${PADDLE_LIB}/paddle/lib/paddle_fluid${CMAKE_SHARED_LIBRARY_SUFFIX}) + else() + set(DEPS + ${PADDLE_LIB}/paddle/lib/libpaddle_fluid${CMAKE_SHARED_LIBRARY_SUFFIX}) + endif() +endif(WITH_STATIC_LIB) if (NOT WIN32) set(DEPS ${DEPS} diff --git a/deploy/cpp_infer/include/config.h b/deploy/cpp_infer/include/config.h index 27539ea7934dc192e86bca3ea6bfd7999ee229a3..dbfbc2df141042f1065b380010e1ea3ff3ccedab 100644 --- a/deploy/cpp_infer/include/config.h +++ b/deploy/cpp_infer/include/config.h @@ -25,9 +25,9 @@ namespace PaddleOCR { -class Config { +class OCRConfig { public: - explicit Config(const std::string &config_file) { + explicit OCRConfig(const std::string &config_file) { config_map_ = LoadConfig(config_file); this->use_gpu = bool(stoi(config_map_["use_gpu"])); @@ -41,8 +41,6 @@ public: this->use_mkldnn = bool(stoi(config_map_["use_mkldnn"])); - this->use_zero_copy_run = bool(stoi(config_map_["use_zero_copy_run"])); - this->max_side_len = stoi(config_map_["max_side_len"]); this->det_db_thresh = stod(config_map_["det_db_thresh"]); @@ -64,6 +62,10 @@ public: this->cls_thresh = stod(config_map_["cls_thresh"]); this->visualize = 
bool(stoi(config_map_["visualize"])); + + this->use_tensorrt = bool(stoi(config_map_["use_tensorrt"])); + + this->use_fp16 = bool(stod(config_map_["use_fp16"])); } bool use_gpu = false; @@ -76,8 +78,6 @@ public: bool use_mkldnn = false; - bool use_zero_copy_run = false; - int max_side_len = 960; double det_db_thresh = 0.3; @@ -100,6 +100,10 @@ public: bool visualize = true; + bool use_tensorrt = false; + + bool use_fp16 = false; + void PrintConfigInfo(); private: diff --git a/deploy/cpp_infer/include/ocr_cls.h b/deploy/cpp_infer/include/ocr_cls.h index 38a37cff3c035eafe3617d83b2cc15ca47f30186..41494085a797c7a4490942741e6e888033c0be00 100644 --- a/deploy/cpp_infer/include/ocr_cls.h +++ b/deploy/cpp_infer/include/ocr_cls.h @@ -30,6 +30,8 @@ #include #include +using namespace paddle_infer; + namespace PaddleOCR { class Classifier { @@ -37,16 +39,17 @@ public: explicit Classifier(const std::string &model_dir, const bool &use_gpu, const int &gpu_id, const int &gpu_mem, const int &cpu_math_library_num_threads, - const bool &use_mkldnn, const bool &use_zero_copy_run, - const double &cls_thresh) { + const bool &use_mkldnn, const double &cls_thresh, + const bool &use_tensorrt, const bool &use_fp16) { this->use_gpu_ = use_gpu; this->gpu_id_ = gpu_id; this->gpu_mem_ = gpu_mem; this->cpu_math_library_num_threads_ = cpu_math_library_num_threads; this->use_mkldnn_ = use_mkldnn; - this->use_zero_copy_run_ = use_zero_copy_run; this->cls_thresh = cls_thresh; + this->use_tensorrt_ = use_tensorrt; + this->use_fp16_ = use_fp16; LoadModel(model_dir); } @@ -57,20 +60,20 @@ public: cv::Mat Run(cv::Mat &img); private: - std::shared_ptr predictor_; + std::shared_ptr predictor_; bool use_gpu_ = false; int gpu_id_ = 0; int gpu_mem_ = 4000; int cpu_math_library_num_threads_ = 4; bool use_mkldnn_ = false; - bool use_zero_copy_run_ = false; double cls_thresh = 0.5; std::vector mean_ = {0.5f, 0.5f, 0.5f}; std::vector scale_ = {1 / 0.5f, 1 / 0.5f, 1 / 0.5f}; bool is_scale_ = true; - + bool use_tensorrt_ = false; + bool use_fp16_ = false; // pre-process ClsResizeImg resize_op_; Normalize normalize_op_; diff --git a/deploy/cpp_infer/include/ocr_det.h b/deploy/cpp_infer/include/ocr_det.h index 0308d07f3bac67a275452500184e0959b16e8003..bab9c95fa4a3f1cb160ccbf9ca4587fa4c2ba16a 100644 --- a/deploy/cpp_infer/include/ocr_det.h +++ b/deploy/cpp_infer/include/ocr_det.h @@ -32,6 +32,8 @@ #include #include +using namespace paddle_infer; + namespace PaddleOCR { class DBDetector { @@ -39,17 +41,16 @@ public: explicit DBDetector(const std::string &model_dir, const bool &use_gpu, const int &gpu_id, const int &gpu_mem, const int &cpu_math_library_num_threads, - const bool &use_mkldnn, const bool &use_zero_copy_run, - const int &max_side_len, const double &det_db_thresh, + const bool &use_mkldnn, const int &max_side_len, + const double &det_db_thresh, const double &det_db_box_thresh, - const double &det_db_unclip_ratio, - const bool &visualize) { + const double &det_db_unclip_ratio, const bool &visualize, + const bool &use_tensorrt, const bool &use_fp16) { this->use_gpu_ = use_gpu; this->gpu_id_ = gpu_id; this->gpu_mem_ = gpu_mem; this->cpu_math_library_num_threads_ = cpu_math_library_num_threads; this->use_mkldnn_ = use_mkldnn; - this->use_zero_copy_run_ = use_zero_copy_run; this->max_side_len_ = max_side_len; @@ -58,6 +59,8 @@ public: this->det_db_unclip_ratio_ = det_db_unclip_ratio; this->visualize_ = visualize; + this->use_tensorrt_ = use_tensorrt; + this->use_fp16_ = use_fp16; LoadModel(model_dir); } @@ -69,14 +72,13 @@ public: void 
Run(cv::Mat &img, std::vector>> &boxes); private: - std::shared_ptr predictor_; + std::shared_ptr predictor_; bool use_gpu_ = false; int gpu_id_ = 0; int gpu_mem_ = 4000; int cpu_math_library_num_threads_ = 4; bool use_mkldnn_ = false; - bool use_zero_copy_run_ = false; int max_side_len_ = 960; @@ -85,6 +87,8 @@ private: double det_db_unclip_ratio_ = 2.0; bool visualize_ = true; + bool use_tensorrt_ = false; + bool use_fp16_ = false; std::vector mean_ = {0.485f, 0.456f, 0.406f}; std::vector scale_ = {1 / 0.229f, 1 / 0.224f, 1 / 0.225f}; diff --git a/deploy/cpp_infer/include/ocr_rec.h b/deploy/cpp_infer/include/ocr_rec.h index 89bcd82cb99a90ddd8e152a034769312d9791e7e..94d605a96e1f43423b15b0d81c7cd88f618ea4d3 100644 --- a/deploy/cpp_infer/include/ocr_rec.h +++ b/deploy/cpp_infer/include/ocr_rec.h @@ -32,6 +32,8 @@ #include #include +using namespace paddle_infer; + namespace PaddleOCR { class CRNNRecognizer { @@ -39,14 +41,15 @@ public: explicit CRNNRecognizer(const std::string &model_dir, const bool &use_gpu, const int &gpu_id, const int &gpu_mem, const int &cpu_math_library_num_threads, - const bool &use_mkldnn, const bool &use_zero_copy_run, - const string &label_path) { + const bool &use_mkldnn, const string &label_path, + const bool &use_tensorrt, const bool &use_fp16) { this->use_gpu_ = use_gpu; this->gpu_id_ = gpu_id; this->gpu_mem_ = gpu_mem; this->cpu_math_library_num_threads_ = cpu_math_library_num_threads; this->use_mkldnn_ = use_mkldnn; - this->use_zero_copy_run_ = use_zero_copy_run; + this->use_tensorrt_ = use_tensorrt; + this->use_fp16_ = use_fp16; this->label_list_ = Utility::ReadDict(label_path); this->label_list_.insert(this->label_list_.begin(), @@ -63,21 +66,21 @@ public: Classifier *cls); private: - std::shared_ptr predictor_; + std::shared_ptr predictor_; bool use_gpu_ = false; int gpu_id_ = 0; int gpu_mem_ = 4000; int cpu_math_library_num_threads_ = 4; bool use_mkldnn_ = false; - bool use_zero_copy_run_ = false; std::vector label_list_; std::vector mean_ = {0.5f, 0.5f, 0.5f}; std::vector scale_ = {1 / 0.5f, 1 / 0.5f, 1 / 0.5f}; bool is_scale_ = true; - + bool use_tensorrt_ = false; + bool use_fp16_ = false; // pre-process CrnnResizeImg resize_op_; Normalize normalize_op_; diff --git a/deploy/cpp_infer/readme.md b/deploy/cpp_infer/readme.md index 66302a0114186306fde0572fca23aabd27620f95..b563ecf48c2aba03e25a03ae0328c244bb900356 100644 --- a/deploy/cpp_infer/readme.md +++ b/deploy/cpp_infer/readme.md @@ -122,10 +122,10 @@ build/paddle_inference_install_dir/ * 下载之后使用下面的方法解压。 ``` -tar -xf fluid_inference.tgz +tar -xf paddle_inference.tgz ``` -最终会在当前的文件夹中生成`fluid_inference/`的子文件夹。 +最终会在当前的文件夹中生成`paddle_inference/`的子文件夹。 ## 2 开始运行 @@ -137,11 +137,11 @@ tar -xf fluid_inference.tgz ``` inference/ |-- det_db -| |--model -| |--params +| |--inference.pdiparams +| |--inference.pdmodel |-- rec_rcnn -| |--model -| |--params +| |--inference.pdiparams +| |--inference.pdmodel ``` @@ -180,7 +180,7 @@ cmake ..
\ make -j ``` -`OPENCV_DIR`为opencv编译安装的地址;`LIB_DIR`为下载(`fluid_inference`文件夹)或者编译生成的Paddle预测库地址(`build/fluid_inference_install_dir`文件夹);`CUDA_LIB_DIR`为cuda库文件地址,在docker中;为`/usr/local/cuda/lib64`;`CUDNN_LIB_DIR`为cudnn库文件地址,在docker中为`/usr/lib/x86_64-linux-gnu/`。 +`OPENCV_DIR`为opencv编译安装的地址;`LIB_DIR`为下载(`paddle_inference`文件夹)或者编译生成的Paddle预测库地址(`build/paddle_inference_install_dir`文件夹);`CUDA_LIB_DIR`为cuda库文件地址,在docker中为`/usr/local/cuda/lib64`;`CUDNN_LIB_DIR`为cudnn库文件地址,在docker中为`/usr/lib/x86_64-linux-gnu/`。 * 编译完成之后,会在`build`文件夹下生成一个名为`ocr_system`的可执行文件。 @@ -202,7 +202,6 @@ gpu_id 0 # GPU id,使用GPU时有效 gpu_mem 4000 # 申请的GPU内存 cpu_math_library_num_threads 10 # CPU预测时的线程数,在机器核数充足的情况下,该值越大,预测速度越快 use_mkldnn 1 # 是否使用mkldnn库 -use_zero_copy_run 1 # 是否使用use_zero_copy_run进行预测 # det config max_side_len 960 # 输入图像长宽大于960时,等比例缩放图像,使得图像最长边为960 diff --git a/deploy/cpp_infer/readme_en.md b/deploy/cpp_infer/readme_en.md index 8bd76c045b82513ea82a53af58b5805e1b34fc8d..41c764bc18a69965da6ad2ea521f438840c286e6 100644 --- a/deploy/cpp_infer/readme_en.md +++ b/deploy/cpp_infer/readme_en.md @@ -130,10 +130,10 @@ Among them, `paddle` is the Paddle library required for C++ prediction later, an * After downloading, use the following method to uncompress. ``` -tar -xf fluid_inference.tgz +tar -xf paddle_inference.tgz ``` -Finally you can see the following files in the folder of `fluid_inference/`. +Finally you can see the following files in the folder of `paddle_inference/`. ## 2. Compile and run the demo @@ -145,11 +145,11 @@ Finally you can see the following files in the folder of `fluid_inference/`. ``` inference/ |-- det_db -| |--model -| |--params +| |--inference.pdiparams +| |--inference.pdmodel |-- rec_rcnn -| |--model -| |--params +| |--inference.pdiparams +| |--inference.pdmodel ``` @@ -188,7 +188,9 @@ cmake .. \ make -j ``` -`OPENCV_DIR` is the opencv installation path; `LIB_DIR` is the download (`fluid_inference` folder) or the generated Paddle inference library path (`build/fluid_inference_install_dir` folder); `CUDA_LIB_DIR` is the cuda library file path, in docker; it is `/usr/local/cuda/lib64`; `CUDNN_LIB_DIR` is the cudnn library file path, in docker it is `/usr/lib/x86_64-linux-gnu/`. +`OPENCV_DIR` is the opencv installation path; `LIB_DIR` is the download (`paddle_inference` folder) +or the generated Paddle inference library path (`build/paddle_inference_install_dir` folder); +`CUDA_LIB_DIR` is the cuda library file path, in docker it is `/usr/local/cuda/lib64`; `CUDNN_LIB_DIR` is the cudnn library file path, in docker it is `/usr/lib/x86_64-linux-gnu/`. * After the compilation is completed, an executable file named `ocr_system` will be generated in the `build` folder. @@ -211,7 +213,6 @@ gpu_id 0 # GPU id when use_gpu is 1 gpu_mem 4000 # GPU memory requested cpu_math_library_num_threads 10 # Number of threads when using CPU inference. When machine cores is enough, the large the value, the faster the inference speed use_mkldnn 1 # Whether to use mkdlnn library -use_zero_copy_run 1 # Whether to use use_zero_copy_run for inference max_side_len 960 # Limit the maximum image height and width to 960 det_db_thresh 0.3 # Used to filter the binarized image of DB prediction, setting 0.-0.3 has no obvious effect on the result @@ -244,4 +245,4 @@ The detection results will be shown on the screen, which is as follows. ### 2.3 Notes -* Paddle2.0.0-beta0 inference model library is recommanded for this tuturial. +* Paddle2.0.0-beta0 inference model library is recommended for this tutorial.
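For reference, a rough sketch of reading the `key value` pairs used by the demo's `tools/config.txt` (including the new `use_tensorrt` and `use_fp16` switches); the C++ demo does this in `OCRConfig::LoadConfig`, and this sketch only assumes the documented one-pair-per-line, `#`-comment format:

```python
# Rough sketch of the cpp_infer config format: one "key value" pair per
# line; lines starting with '#' are section comments.
def load_config(path):
    conf = {}
    with open(path) as f:
        for raw in f:
            line = raw.strip()
            if not line or line.startswith('#'):
                continue
            key, value = line.split(None, 1)
            conf[key] = value.strip()
    return conf

conf = load_config('deploy/cpp_infer/tools/config.txt')
use_tensorrt = conf.get('use_tensorrt', '0') == '1'
use_fp16 = conf.get('use_fp16', '0') == '1'
```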
diff --git a/deploy/cpp_infer/src/config.cpp b/deploy/cpp_infer/src/config.cpp index 52dfa209b049c6d47285bcba40e41de846de610f..303c3c1259515ee8c67fa865bf485ae3338505d6 100644 --- a/deploy/cpp_infer/src/config.cpp +++ b/deploy/cpp_infer/src/config.cpp @@ -16,8 +16,8 @@ namespace PaddleOCR { -std::vector Config::split(const std::string &str, - const std::string &delim) { +std::vector OCRConfig::split(const std::string &str, + const std::string &delim) { std::vector res; if ("" == str) return res; @@ -38,7 +38,7 @@ std::vector Config::split(const std::string &str, } std::map -Config::LoadConfig(const std::string &config_path) { +OCRConfig::LoadConfig(const std::string &config_path) { auto config = Utility::ReadDict(config_path); std::map dict; @@ -53,7 +53,7 @@ Config::LoadConfig(const std::string &config_path) { return dict; } -void Config::PrintConfigInfo() { +void OCRConfig::PrintConfigInfo() { std::cout << "=======Paddle OCR inference config======" << std::endl; for (auto iter = config_map_.begin(); iter != config_map_.end(); iter++) { std::cout << iter->first << " : " << iter->second << std::endl; diff --git a/deploy/cpp_infer/src/main.cpp b/deploy/cpp_infer/src/main.cpp index 63da62c7d4e0e9592d62ac61ae1888dc35a71ec0..f40e5edfcc2c19e0a61894bed11aef636317e056 100644 --- a/deploy/cpp_infer/src/main.cpp +++ b/deploy/cpp_infer/src/main.cpp @@ -42,7 +42,7 @@ int main(int argc, char **argv) { exit(1); } - Config config(argv[1]); + OCRConfig config(argv[1]); config.PrintConfigInfo(); @@ -50,37 +50,24 @@ int main(int argc, char **argv) { cv::Mat srcimg = cv::imread(img_path, cv::IMREAD_COLOR); - DBDetector det( - config.det_model_dir, config.use_gpu, config.gpu_id, config.gpu_mem, - config.cpu_math_library_num_threads, config.use_mkldnn, - config.use_zero_copy_run, config.max_side_len, config.det_db_thresh, - config.det_db_box_thresh, config.det_db_unclip_ratio, config.visualize); + DBDetector det(config.det_model_dir, config.use_gpu, config.gpu_id, + config.gpu_mem, config.cpu_math_library_num_threads, + config.use_mkldnn, config.max_side_len, config.det_db_thresh, + config.det_db_box_thresh, config.det_db_unclip_ratio, + config.visualize, config.use_tensorrt, config.use_fp16); Classifier *cls = nullptr; if (config.use_angle_cls == true) { cls = new Classifier(config.cls_model_dir, config.use_gpu, config.gpu_id, config.gpu_mem, config.cpu_math_library_num_threads, - config.use_mkldnn, config.use_zero_copy_run, - config.cls_thresh); + config.use_mkldnn, config.cls_thresh, + config.use_tensorrt, config.use_fp16); } CRNNRecognizer rec(config.rec_model_dir, config.use_gpu, config.gpu_id, config.gpu_mem, config.cpu_math_library_num_threads, - config.use_mkldnn, config.use_zero_copy_run, - config.char_list_file); - -#ifdef USE_MKL -#pragma omp parallel - for (auto i = 0; i < 10; i++) { - LOG_IF(WARNING, - config.cpu_math_library_num_threads != omp_get_num_threads()) - << "WARNING! MKL is running on " << omp_get_num_threads() - << " threads while cpu_math_library_num_threads is set to " - << config.cpu_math_library_num_threads - << ". Possible reason could be 1. You have set omp_set_num_threads() " - "somewhere; 2. 
MKL is not linked properly"; - } -#endif + config.use_mkldnn, config.char_list_file, + config.use_tensorrt, config.use_fp16); auto start = std::chrono::system_clock::now(); std::vector>> boxes; @@ -90,11 +77,11 @@ int main(int argc, char **argv) { auto end = std::chrono::system_clock::now(); auto duration = std::chrono::duration_cast(end - start); - std::cout << "花费了" + std::cout << "Cost " << double(duration.count()) * std::chrono::microseconds::period::num / std::chrono::microseconds::period::den - << "秒" << std::endl; + << "s" << std::endl; return 0; } diff --git a/deploy/cpp_infer/src/ocr_cls.cpp b/deploy/cpp_infer/src/ocr_cls.cpp index fed2023f9f111294a07a9c841f4843404bbd9af2..3aeda2ed0c286d1ec5e816e15ac5500f53c9a3a2 100644 --- a/deploy/cpp_infer/src/ocr_cls.cpp +++ b/deploy/cpp_infer/src/ocr_cls.cpp @@ -35,26 +35,16 @@ cv::Mat Classifier::Run(cv::Mat &img) { this->permute_op_.Run(&resize_img, input.data()); // Inference. - if (this->use_zero_copy_run_) { - auto input_names = this->predictor_->GetInputNames(); - auto input_t = this->predictor_->GetInputTensor(input_names[0]); - input_t->Reshape({1, 3, resize_img.rows, resize_img.cols}); - input_t->copy_from_cpu(input.data()); - this->predictor_->ZeroCopyRun(); - } else { - paddle::PaddleTensor input_t; - input_t.shape = {1, 3, resize_img.rows, resize_img.cols}; - input_t.data = - paddle::PaddleBuf(input.data(), input.size() * sizeof(float)); - input_t.dtype = PaddleDType::FLOAT32; - std::vector outputs; - this->predictor_->Run({input_t}, &outputs, 1); - } + auto input_names = this->predictor_->GetInputNames(); + auto input_t = this->predictor_->GetInputHandle(input_names[0]); + input_t->Reshape({1, 3, resize_img.rows, resize_img.cols}); + input_t->CopyFromCpu(input.data()); + this->predictor_->Run(); std::vector softmax_out; std::vector label_out; auto output_names = this->predictor_->GetOutputNames(); - auto softmax_out_t = this->predictor_->GetOutputTensor(output_names[0]); + auto softmax_out_t = this->predictor_->GetOutputHandle(output_names[0]); auto softmax_shape_out = softmax_out_t->shape(); int softmax_out_num = @@ -63,7 +53,7 @@ cv::Mat Classifier::Run(cv::Mat &img) { softmax_out.resize(softmax_out_num); - softmax_out_t->copy_to_cpu(softmax_out.data()); + softmax_out_t->CopyToCpu(softmax_out.data()); float score = 0; int label = 0; @@ -86,6 +76,13 @@ void Classifier::LoadModel(const std::string &model_dir) { if (this->use_gpu_) { config.EnableUseGpu(this->gpu_mem_, this->gpu_id_); + if (this->use_tensorrt_) { + config.EnableTensorRtEngine( + 1 << 20, 10, 3, + this->use_fp16_ ?
paddle_infer::Config::Precision::kHalf + : paddle_infer::Config::Precision::kFloat32, + false, false); + } } else { config.DisableGpu(); if (this->use_mkldnn_) { @@ -95,7 +92,7 @@ void Classifier::LoadModel(const std::string &model_dir) { } // false for zero copy tensor - config.SwitchUseFeedFetchOps(!this->use_zero_copy_run_); + config.SwitchUseFeedFetchOps(false); // true for multiple input config.SwitchSpecifyInputNames(true); @@ -104,6 +101,6 @@ void Classifier::LoadModel(const std::string &model_dir) { config.EnableMemoryOptim(); config.DisableGlogInfo(); - this->predictor_ = CreatePaddlePredictor(config); + this->predictor_ = CreatePredictor(config); } } // namespace PaddleOCR diff --git a/deploy/cpp_infer/src/ocr_det.cpp b/deploy/cpp_infer/src/ocr_det.cpp index e253f9cc89810f4d1adfca5be5186220a873d1a2..3678f37dfb1c0c4aed392dd31830e732e2854899 100644 --- a/deploy/cpp_infer/src/ocr_det.cpp +++ b/deploy/cpp_infer/src/ocr_det.cpp @@ -17,12 +17,20 @@ namespace PaddleOCR { void DBDetector::LoadModel(const std::string &model_dir) { - AnalysisConfig config; + // AnalysisConfig config; + paddle_infer::Config config; config.SetModel(model_dir + "/inference.pdmodel", model_dir + "/inference.pdiparams"); if (this->use_gpu_) { config.EnableUseGpu(this->gpu_mem_, this->gpu_id_); + if (this->use_tensorrt_) { + config.EnableTensorRtEngine( + 1 << 20, 10, 3, + this->use_fp16_ ? paddle_infer::Config::Precision::kHalf + : paddle_infer::Config::Precision::kFloat32, + false, false); + } } else { config.DisableGpu(); if (this->use_mkldnn_) { @@ -32,10 +40,8 @@ void DBDetector::LoadModel(const std::string &model_dir) { } config.SetCpuMathLibraryNumThreads(this->cpu_math_library_num_threads_); } - - // false for zero copy tensor - // true for commom tensor - config.SwitchUseFeedFetchOps(!this->use_zero_copy_run_); + // use zero_copy_run as default + config.SwitchUseFeedFetchOps(false); // true for multiple input config.SwitchSpecifyInputNames(true); @@ -44,7 +50,7 @@ void DBDetector::LoadModel(const std::string &model_dir) { config.EnableMemoryOptim(); config.DisableGlogInfo(); - this->predictor_ = CreatePaddlePredictor(config); + this->predictor_ = CreatePredictor(config); } void DBDetector::Run(cv::Mat &img, @@ -64,31 +70,21 @@ void DBDetector::Run(cv::Mat &img, this->permute_op_.Run(&resize_img, input.data()); // Inference. 
- if (this->use_zero_copy_run_) { - auto input_names = this->predictor_->GetInputNames(); - auto input_t = this->predictor_->GetInputTensor(input_names[0]); - input_t->Reshape({1, 3, resize_img.rows, resize_img.cols}); - input_t->copy_from_cpu(input.data()); - this->predictor_->ZeroCopyRun(); - } else { - paddle::PaddleTensor input_t; - input_t.shape = {1, 3, resize_img.rows, resize_img.cols}; - input_t.data = - paddle::PaddleBuf(input.data(), input.size() * sizeof(float)); - input_t.dtype = PaddleDType::FLOAT32; - std::vector outputs; - this->predictor_->Run({input_t}, &outputs, 1); - } + auto input_names = this->predictor_->GetInputNames(); + auto input_t = this->predictor_->GetInputHandle(input_names[0]); + input_t->Reshape({1, 3, resize_img.rows, resize_img.cols}); + input_t->CopyFromCpu(input.data()); + this->predictor_->Run(); std::vector out_data; auto output_names = this->predictor_->GetOutputNames(); - auto output_t = this->predictor_->GetOutputTensor(output_names[0]); + auto output_t = this->predictor_->GetOutputHandle(output_names[0]); std::vector output_shape = output_t->shape(); int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1, std::multiplies()); out_data.resize(out_num); - output_t->copy_to_cpu(out_data.data()); + output_t->CopyToCpu(out_data.data()); int n2 = output_shape[2]; int n3 = output_shape[3]; diff --git a/deploy/cpp_infer/src/ocr_rec.cpp b/deploy/cpp_infer/src/ocr_rec.cpp index d4deb5a17fc47427eb92cda02c270d268cfcafc7..27cfe4c95009c6454514a43e304a23503fe5fa9a 100644 --- a/deploy/cpp_infer/src/ocr_rec.cpp +++ b/deploy/cpp_infer/src/ocr_rec.cpp @@ -43,32 +43,22 @@ void CRNNRecognizer::Run(std::vector>> boxes, this->permute_op_.Run(&resize_img, input.data()); // Inference. - if (this->use_zero_copy_run_) { - auto input_names = this->predictor_->GetInputNames(); - auto input_t = this->predictor_->GetInputTensor(input_names[0]); - input_t->Reshape({1, 3, resize_img.rows, resize_img.cols}); - input_t->copy_from_cpu(input.data()); - this->predictor_->ZeroCopyRun(); - } else { - paddle::PaddleTensor input_t; - input_t.shape = {1, 3, resize_img.rows, resize_img.cols}; - input_t.data = - paddle::PaddleBuf(input.data(), input.size() * sizeof(float)); - input_t.dtype = PaddleDType::FLOAT32; - std::vector outputs; - this->predictor_->Run({input_t}, &outputs, 1); - } + auto input_names = this->predictor_->GetInputNames(); + auto input_t = this->predictor_->GetInputHandle(input_names[0]); + input_t->Reshape({1, 3, resize_img.rows, resize_img.cols}); + input_t->CopyFromCpu(input.data()); + this->predictor_->Run(); std::vector predict_batch; auto output_names = this->predictor_->GetOutputNames(); - auto output_t = this->predictor_->GetOutputTensor(output_names[0]); + auto output_t = this->predictor_->GetOutputHandle(output_names[0]); auto predict_shape = output_t->shape(); int out_num = std::accumulate(predict_shape.begin(), predict_shape.end(), 1, std::multiplies()); predict_batch.resize(out_num); - output_t->copy_to_cpu(predict_batch.data()); + output_t->CopyToCpu(predict_batch.data()); // ctc decode std::vector str_res; @@ -86,7 +76,7 @@ void CRNNRecognizer::Run(std::vector>> boxes, float(*std::max_element(&predict_batch[n * predict_shape[2]], &predict_batch[(n + 1) * predict_shape[2]])); - if (argmax_idx > 0 && (not(i > 0 && argmax_idx == last_index))) { + if (argmax_idx > 0 && (!(i > 0 && argmax_idx == last_index))) { score += max_value; count += 1; str_res.push_back(label_list_[argmax_idx]); @@ -102,12 +92,20 @@ void CRNNRecognizer::Run(std::vector>> 
boxes, } void CRNNRecognizer::LoadModel(const std::string &model_dir) { - AnalysisConfig config; + // AnalysisConfig config; + paddle_infer::Config config; config.SetModel(model_dir + "/inference.pdmodel", model_dir + "/inference.pdiparams"); if (this->use_gpu_) { config.EnableUseGpu(this->gpu_mem_, this->gpu_id_); + if (this->use_tensorrt_) { + config.EnableTensorRtEngine( + 1 << 20, 10, 3, + this->use_fp16_ ? paddle_infer::Config::Precision::kHalf + : paddle_infer::Config::Precision::kFloat32, + false, false); + } } else { config.DisableGpu(); if (this->use_mkldnn_) { @@ -118,9 +116,7 @@ void CRNNRecognizer::LoadModel(const std::string &model_dir) { config.SetCpuMathLibraryNumThreads(this->cpu_math_library_num_threads_); } - // false for zero copy tensor - // true for commom tensor - config.SwitchUseFeedFetchOps(!this->use_zero_copy_run_); + config.SwitchUseFeedFetchOps(false); // true for multiple input config.SwitchSpecifyInputNames(true); @@ -129,7 +125,7 @@ void CRNNRecognizer::LoadModel(const std::string &model_dir) { config.EnableMemoryOptim(); config.DisableGlogInfo(); - this->predictor_ = CreatePaddlePredictor(config); + this->predictor_ = CreatePredictor(config); } cv::Mat CRNNRecognizer::GetRotateCropImage(const cv::Mat &srcimage, @@ -187,4 +183,4 @@ cv::Mat CRNNRecognizer::GetRotateCropImage(const cv::Mat &srcimage, } } -} // namespace PaddleOCR \ No newline at end of file +} // namespace PaddleOCR diff --git a/deploy/cpp_infer/tools/config.txt b/deploy/cpp_infer/tools/config.txt index f1ab0b1131ef5d55b098667612c019e0fc01c9dc..e185377e2f2c9cbd5c1d8ed09ba43df9c41c05d2 100644 --- a/deploy/cpp_infer/tools/config.txt +++ b/deploy/cpp_infer/tools/config.txt @@ -1,10 +1,9 @@ # model load config -use_gpu 0 +use_gpu 0 gpu_id 0 gpu_mem 4000 cpu_math_library_num_threads 10 use_mkldnn 0 -use_zero_copy_run 1 # det config max_side_len 960 @@ -25,3 +24,7 @@ char_list_file ../../ppocr/utils/ppocr_keys_v1.txt # show the detection results visualize 1 +# use_tensorrt +use_tensorrt 0 +use_fp16 0 + diff --git a/deploy/hubserving/ocr_cls/params.py b/deploy/hubserving/ocr_cls/params.py index 72a7a10249176d86f75b5d3c3adae7f1021a75a8..982f013647b69cdc47c13e6206177fe74849da41 100755 --- a/deploy/hubserving/ocr_cls/params.py +++ b/deploy/hubserving/ocr_cls/params.py @@ -18,7 +18,7 @@ def read_params(): cfg.cls_batch_num = 30 cfg.cls_thresh = 0.9 - cfg.use_zero_copy_run = False cfg.use_pdserving = False + cfg.use_tensorrt = False return cfg diff --git a/deploy/hubserving/ocr_det/params.py b/deploy/hubserving/ocr_det/params.py index e50decbbc8ee604863c5965aa95bf1f79fa71d0a..132158904d44a5a45600e6cfc9cd3e565ddcef0b 100755 --- a/deploy/hubserving/ocr_det/params.py +++ b/deploy/hubserving/ocr_det/params.py @@ -27,7 +27,7 @@ def read_params(): # cfg.det_east_cover_thresh = 0.1 # cfg.det_east_nms_thresh = 0.2 - cfg.use_zero_copy_run = False cfg.use_pdserving = False + cfg.use_tensorrt = False return cfg diff --git a/deploy/hubserving/ocr_rec/params.py b/deploy/hubserving/ocr_rec/params.py index 6f428ecb2686afa5ff66b84d963d1c2175b9cee2..f8d29114357946c9b6264079fca2eb4b19dbefba 100644 --- a/deploy/hubserving/ocr_rec/params.py +++ b/deploy/hubserving/ocr_rec/params.py @@ -13,7 +13,7 @@ def read_params(): #params for text recognizer cfg.rec_algorithm = "CRNN" - cfg.rec_model_dir = "./inference/ch_ppocr_mobile_v1.1_rec_infer/" + cfg.rec_model_dir = "./inference/ch_ppocr_mobile_v2.0_rec_infer/" cfg.rec_image_shape = "3, 32, 320" cfg.rec_char_type = 'ch' @@ -23,7 +23,7 @@ def read_params(): cfg.rec_char_dict_path = 
"./ppocr/utils/ppocr_keys_v1.txt" cfg.use_space_char = True - cfg.use_zero_copy_run = False cfg.use_pdserving = False + cfg.use_tensorrt = False return cfg diff --git a/deploy/hubserving/ocr_system/params.py b/deploy/hubserving/ocr_system/params.py index a0e1960b2857630780f6b34773d7760279f862a2..add466668eee0be1e1674fce5f5a07c24c0c5e3f 100755 --- a/deploy/hubserving/ocr_system/params.py +++ b/deploy/hubserving/ocr_system/params.py @@ -47,8 +47,8 @@ def read_params(): cfg.cls_batch_num = 30 cfg.cls_thresh = 0.9 - cfg.use_zero_copy_run = False cfg.use_pdserving = False + cfg.use_tensorrt = False cfg.drop_score = 0.5 return cfg diff --git a/deploy/slim/quantization/README.md b/deploy/slim/quantization/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ccd4d06b4f16165f968402751b63a8fe58773e0b --- /dev/null +++ b/deploy/slim/quantization/README.md @@ -0,0 +1,61 @@ + +## 介绍 +复杂的模型有利于提高模型的性能,但也导致模型中存在一定冗余,模型量化将全精度缩减到定点数减少这种冗余,达到减少模型计算复杂度,提高模型推理性能的目的。 +模型量化可以在基本不损失模型的精度的情况下,将FP32精度的模型参数转换为Int8精度,减小模型参数大小并加速计算,使用量化后的模型在移动端等部署时更具备速度优势。 + +本教程将介绍如何使用飞桨模型压缩库PaddleSlim做PaddleOCR模型的压缩。 +[PaddleSlim](https://github.com/PaddlePaddle/PaddleSlim) 集成了模型剪枝、量化(包括量化训练和离线量化)、蒸馏和神经网络搜索等多种业界常用且领先的模型压缩功能,如果您感兴趣,可以关注并了解。 + +在开始本教程之前,建议先了解[PaddleOCR模型的训练方法](../../../doc/doc_ch/quickstart.md)以及[PaddleSlim](https://paddleslim.readthedocs.io/zh_CN/latest/index.html) + + +## 快速开始 +量化多适用于轻量模型在移动端的部署,当训练出一个模型后,如果希望进一步的压缩模型大小并加速预测,可使用量化的方法压缩模型。 + +模型量化主要包括五个步骤: +1. 安装 PaddleSlim +2. 准备训练好的模型 +3. 量化训练 +4. 导出量化推理模型 +5. 量化模型预测部署 + +### 1. 安装PaddleSlim + +```bash +git clone https://github.com/PaddlePaddle/PaddleSlim.git +cd Paddleslim +python setup.py install +``` + +### 2. 准备训练好的模型 + +PaddleOCR提供了一系列训练好的[模型](../../../doc/doc_ch/models_list.md),如果待量化的模型不在列表中,需要按照[常规训练](../../../doc/doc_ch/quickstart.md)方法得到训练好的模型。 + +### 3. 量化训练 +量化训练包括离线量化训练和在线量化训练,在线量化训练效果更好,需加载预训练模型,在定义好量化策略后即可对模型进行量化。 + + +量化训练的代码位于slim/quantization/quant.py 中,比如训练检测模型,训练指令如下: +```bash +python deploy/slim/quantization/quant.py -c configs/det/det_mv3_db.yml -o Global.pretrain_weights='your trained model' Global.save_model_dir=./output/quant_model + +# 比如下载提供的训练模型 +wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_train.tar +tar -xf ch_ppocr_mobile_v2.0_det_train.tar +python deploy/slim/quantization/quant.py -c configs/det/det_mv3_db.yml -o Global.pretrain_weights=./ch_ppocr_mobile_v2.0_det_train/best_accuracy Global.save_model_dir=./output/quant_model + +``` +如果要训练识别模型的量化,修改配置文件和加载的模型参数即可。 + +### 4. 导出模型 + +在得到量化训练保存的模型后,我们可以将其导出为inference_model,用于预测部署: + +```bash +python deploy/slim/quantization/export_model.py -c configs/det/det_mv3_db.yml -o Global.checkpoints=output/quant_model/best_accuracy Global.save_model_dir=./output/quant_inference_model +``` + +### 5. 量化模型部署 + +上述步骤导出的量化模型,参数精度仍然是FP32,但是参数的数值范围是int8,导出的模型可以通过PaddleLite的opt模型转换工具完成模型转换。 +量化模型部署的可参考 [移动端模型部署](../../lite/readme.md) diff --git a/deploy/slim/quantization/README_en.md b/deploy/slim/quantization/README_en.md new file mode 100644 index 0000000000000000000000000000000000000000..7da0b3e7e7d5f72e45dc17864630b9725f6fc8ba --- /dev/null +++ b/deploy/slim/quantization/README_en.md @@ -0,0 +1,68 @@ + +## Introduction + +Generally, a more complex model would achive better performance in the task, but it also leads to some redundancy in the model. +Quantization is a technique that reduces this redundancy by reducing the full precision data to a fixed number, +so as to reduce model calculation complexity and improve model inference performance. 
+ +This example uses PaddleSlim provided [APIs of Quantization](https://paddlepaddle.github.io/PaddleSlim/api/quantization_api/) to compress the OCR model. + +It is recommended that you understand the following pages before reading this example: +- [The training strategy of OCR model](../../../doc/doc_en/quickstart_en.md) +- [PaddleSlim Document](https://paddlepaddle.github.io/PaddleSlim/api/quantization_api/) + +## Quick Start +Quantization is mostly suitable for the deployment of lightweight models on mobile terminals. +After training, if you want to further compress the model size and accelerate the prediction, you can use quantization methods to compress the model according to the following steps. + +1. Install PaddleSlim +2. Prepare trained model +3. Quantization-Aware Training +4. Export inference model +5. Deploy quantization inference model + + +### 1. Install PaddleSlim + +```bash +git clone https://github.com/PaddlePaddle/PaddleSlim.git +cd PaddleSlim +python setup.py install +``` + + +### 2. Download Pretrain Model +PaddleOCR provides a series of trained [models](../../../doc/doc_en/models_list_en.md). +If the model to be quantized is not in the list, you need to follow the [Regular Training](../../../doc/doc_en/quickstart_en.md) method to get the trained model. + + +### 3. Quant-Aware Training +Quantization training includes offline quantization training and online quantization training. +Online quantization training is more effective. It is necessary to load the pre-training model. +After the quantization strategy is defined, the model can be quantized. + +The code for quantization training is located in `deploy/slim/quantization/quant.py`. For example, to train a detection model, the training instructions are as follows: +```bash +python deploy/slim/quantization/quant.py -c configs/det/det_mv3_db.yml -o Global.pretrain_weights='your trained model' Global.save_model_dir=./output/quant_model + +# download provided model +wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_train.tar +tar -xf ch_ppocr_mobile_v2.0_det_train.tar +python deploy/slim/quantization/quant.py -c configs/det/det_mv3_db.yml -o Global.pretrain_weights=./ch_ppocr_mobile_v2.0_det_train/best_accuracy Global.save_model_dir=./output/quant_model + +``` + + +### 4. Export inference model + +After getting the model saved by quantization training, we can export it as an inference_model for predictive deployment: + +```bash +python deploy/slim/quantization/export_model.py -c configs/det/det_mv3_db.yml -o Global.checkpoints=output/quant_model/best_accuracy Global.save_model_dir=./output/quant_inference_model +``` + +### 5. Deploy +The parameters of the quantized model exported by the above steps are still stored as FP32, but their numerical range has been quantized to int8. +The derived model can be converted through the `opt tool` of PaddleLite. + +For quantized model deployment, please refer to [Mobile terminal model deployment](../../lite/readme_en.md) diff --git a/deploy/slim/quantization/export_model.py b/deploy/slim/quantization/export_model.py new file mode 100755 index 0000000000000000000000000000000000000000..100b107a1deb1ce9932c9cefa50659c060f5803e --- /dev/null +++ b/deploy/slim/quantization/export_model.py @@ -0,0 +1,118 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys + +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(__dir__) +sys.path.append(os.path.abspath(os.path.join(__dir__, '..', '..', '..'))) +sys.path.append( + os.path.abspath(os.path.join(__dir__, '..', '..', '..', 'tools'))) + +import argparse + +import paddle +from paddle.jit import to_static + +from ppocr.modeling.architectures import build_model +from ppocr.postprocess import build_post_process +from ppocr.utils.save_load import init_model +from ppocr.utils.logging import get_logger +from tools.program import load_config, merge_config, ArgsParser +from ppocr.metrics import build_metric +import tools.program as program +from paddleslim.dygraph.quant import QAT +from ppocr.data import build_dataloader + + +def main(): + ############################################################################################################ + # 1. quantization configs + ############################################################################################################ + quant_config = { + # weight preprocess type, default is None and no preprocessing is performed. + 'weight_preprocess_type': None, + # activation preprocess type, default is None and no preprocessing is performed. + 'activation_preprocess_type': None, + # weight quantize type, default is 'channel_wise_abs_max' + 'weight_quantize_type': 'channel_wise_abs_max', + # activation quantize type, default is 'moving_average_abs_max' + 'activation_quantize_type': 'moving_average_abs_max', + # weight quantize bit num, default is 8 + 'weight_bits': 8, + # activation quantize bit num, default is 8 + 'activation_bits': 8, + # data type after quantization, such as 'uint8', 'int8', etc. default is 'int8' + 'dtype': 'int8', + # window size for 'range_abs_max' quantization. 
default is 10000 + 'window_size': 10000, + # The decay coefficient of moving average, default is 0.9 + 'moving_rate': 0.9, + # for dygraph quantization, layers of type in quantizable_layer_type will be quantized + 'quantizable_layer_type': ['Conv2D', 'Linear'], + } + FLAGS = ArgsParser().parse_args() + config = load_config(FLAGS.config) + merge_config(FLAGS.opt) + logger = get_logger() + # build post process + + post_process_class = build_post_process(config['PostProcess'], + config['Global']) + + # build model + # for rec algorithm + if hasattr(post_process_class, 'character'): + char_num = len(getattr(post_process_class, 'character')) + config['Architecture']["Head"]['out_channels'] = char_num + model = build_model(config['Architecture']) + + # get QAT model + quanter = QAT(config=quant_config) + quanter.quantize(model) + + init_model(config, model, logger) + model.eval() + + # build metric + eval_class = build_metric(config['Metric']) + + # build dataloader + valid_dataloader = build_dataloader(config, 'Eval', device, logger) + + # start eval + metric = program.eval(model, valid_dataloader, post_process_class, + eval_class) + logger.info('metric eval ***************') + for k, v in metric.items(): + logger.info('{}:{}'.format(k, v)) + + save_path = '{}/inference'.format(config['Global']['save_inference_dir']) + infer_shape = [3, 32, 100] if config['Architecture'][ + 'model_type'] != "det" else [3, 640, 640] + + quanter.save_quantized_model( + model, + save_path, + input_spec=[ + paddle.static.InputSpec( + shape=[None] + infer_shape, dtype='float32') + ]) + logger.info('inference QAT model is saved to {}'.format(save_path)) + + +if __name__ == "__main__": + config, device, logger, vdl_writer = program.preprocess() + main() diff --git a/deploy/slim/quantization/quant.py b/deploy/slim/quantization/quant.py new file mode 100755 index 0000000000000000000000000000000000000000..7671e5f871ce6769fc51876d1fa2e5f0af63d904 --- /dev/null +++ b/deploy/slim/quantization/quant.py @@ -0,0 +1,166 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import sys + +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(__dir__) +sys.path.append(os.path.abspath(os.path.join(__dir__, '..', '..', '..'))) +sys.path.append( + os.path.abspath(os.path.join(__dir__, '..', '..', '..', 'tools'))) + +import yaml +import paddle +import paddle.distributed as dist + +paddle.seed(2) + +from ppocr.data import build_dataloader +from ppocr.modeling.architectures import build_model +from ppocr.losses import build_loss +from ppocr.optimizer import build_optimizer +from ppocr.postprocess import build_post_process +from ppocr.metrics import build_metric +from ppocr.utils.save_load import init_model +import tools.program as program +from paddleslim.dygraph.quant import QAT + +dist.get_world_size() + + +class PACT(paddle.nn.Layer): + def __init__(self): + super(PACT, self).__init__() + alpha_attr = paddle.ParamAttr( + name=self.full_name() + ".pact", + initializer=paddle.nn.initializer.Constant(value=20), + learning_rate=1.0, + regularizer=paddle.regularizer.L2Decay(2e-5)) + + self.alpha = self.create_parameter( + shape=[1], attr=alpha_attr, dtype='float32') + + def forward(self, x): + out_left = paddle.nn.functional.relu(x - self.alpha) + out_right = paddle.nn.functional.relu(-self.alpha - x) + x = x - out_left + out_right + return x + + +quant_config = { + # weight preprocess type, default is None and no preprocessing is performed. + 'weight_preprocess_type': None, + # activation preprocess type, default is None and no preprocessing is performed. + 'activation_preprocess_type': None, + # weight quantize type, default is 'channel_wise_abs_max' + 'weight_quantize_type': 'channel_wise_abs_max', + # activation quantize type, default is 'moving_average_abs_max' + 'activation_quantize_type': 'moving_average_abs_max', + # weight quantize bit num, default is 8 + 'weight_bits': 8, + # activation quantize bit num, default is 8 + 'activation_bits': 8, + # data type after quantization, such as 'uint8', 'int8', etc. default is 'int8' + 'dtype': 'int8', + # window size for 'range_abs_max' quantization. 
default is 10000
+    'window_size': 10000,
+    # The decay coefficient of moving average, default is 0.9
+    'moving_rate': 0.9,
+    # for dygraph quantization, layers of type in quantizable_layer_type will be quantized
+    'quantizable_layer_type': ['Conv2D', 'Linear'],
+}
+
+
+def main(config, device, logger, vdl_writer):
+    # init dist environment
+    if config['Global']['distributed']:
+        dist.init_parallel_env()
+
+    global_config = config['Global']
+
+    # build dataloader
+    train_dataloader = build_dataloader(config, 'Train', device, logger)
+    if config['Eval']:
+        valid_dataloader = build_dataloader(config, 'Eval', device, logger)
+    else:
+        valid_dataloader = None
+
+    # build post process
+    post_process_class = build_post_process(config['PostProcess'],
+                                            global_config)
+
+    # build model
+    # for rec algorithm
+    if hasattr(post_process_class, 'character'):
+        char_num = len(getattr(post_process_class, 'character'))
+        config['Architecture']["Head"]['out_channels'] = char_num
+    model = build_model(config['Architecture'])
+
+    # prepare to quant
+    quanter = QAT(config=quant_config, act_preprocess=PACT)
+    quanter.quantize(model)
+
+    if config['Global']['distributed']:
+        model = paddle.DataParallel(model)
+
+    # build loss
+    loss_class = build_loss(config['Loss'])
+
+    # build optim
+    optimizer, lr_scheduler = build_optimizer(
+        config['Optimizer'],
+        epochs=config['Global']['epoch_num'],
+        step_each_epoch=len(train_dataloader),
+        parameters=model.parameters())
+
+    # build metric
+    eval_class = build_metric(config['Metric'])
+    # load pretrain model
+    pre_best_model_dict = init_model(config, model, logger, optimizer)
+
+    logger.info('train dataloader has {} iters, valid dataloader has {} iters'.
+                format(len(train_dataloader), len(valid_dataloader)))
+    # start train
+    program.train(config, train_dataloader, valid_dataloader, device, model,
+                  loss_class, optimizer, lr_scheduler, post_process_class,
+                  eval_class, pre_best_model_dict, logger, vdl_writer)
+
+
+def test_reader(config, device, logger):
+    loader = build_dataloader(config, 'Train', device, logger)
+    import time
+    starttime = time.time()
+    count = 0
+    try:
+        for data in loader():
+            count += 1
+            if count % 1 == 0:
+                batch_time = time.time() - starttime
+                starttime = time.time()
+                logger.info("reader: {}, {}, {}".format(
+                    count, len(data[0]), batch_time))
+    except Exception as e:
+        logger.info(e)
+    logger.info("finish reader: {}, Success!".format(count))
+
+
+if __name__ == '__main__':
+    config, device, logger, vdl_writer = program.preprocess(is_train=True)
+    main(config, device, logger, vdl_writer)
+    # test_reader(config, device, logger)
diff --git a/doc/doc_ch/FAQ.md b/doc/doc_ch/FAQ.md
index 27b3126cd45d2fc7043d6b55f9192984cb08e3ec..da31d608bfdead2734d11d0ab0619cdff7aeeb78 100755
--- a/doc/doc_ch/FAQ.md
+++ b/doc/doc_ch/FAQ.md
@@ -9,42 +9,50 @@

## PaddleOCR FAQ (continuously updated)

-* [Recent updates (2020.12.14)](#近期更新)
+* [Recent updates (2021.1.4)](#近期更新)
* [[Selected] 10 selected OCR questions](#OCR精选10个问题)
-* [[Theory] 30 general OCR questions](#OCR通用问题)
+* [[Theory] 31 general OCR questions](#OCR通用问题)
  * [7 questions on basics](#基础知识)
  * [7 questions on datasets](#数据集2)
-  * [7 questions on model training/tuning](#模型训练调优2)
-  * [9 questions on inference & deployment](#预测部署2)
-* [[Practice] 87 questions on PaddleOCR in practice](#PaddleOCR实战问题)
-  * [21 questions on usage](#使用咨询)
+  * [17 questions on model training/tuning](#模型训练调优2)
+* [[Practice] 101 questions on PaddleOCR in practice](#PaddleOCR实战问题)
+  * [31 questions on usage](#使用咨询)
  * [17 questions on datasets](#数据集3)
-  * [25 questions on model training/tuning](#模型训练调优3)
-  * [24 questions on inference & deployment](#预测部署3)
+  * [26 questions on model training/tuning](#模型训练调优3)
+  * [27 questions on inference & deployment](#预测部署3)

-## Recent updates (2020.12.14)
+## Recent updates (2021.1.4)

-#### Q3.1.21: Does PaddleOCR support dynamic graph (dygraph)?
+#### Q3.1.29: When creating a box in PPOCRLabel, only a square can be dragged out; how do I annotate a rectangle?
-**A**: The dynamic graph version is under intensive development and will be released on December 16, 2020. Stay tuned.
+**A**: Uncheck "Edit" - "Square annotation".

-#### Q3.3.23: An elementwise_add error is raised during detection model training or prediction
+#### Q3.1.30: How can Style-Text skip the text style transfer and, like an ordinary text generator, render a default font directly onto the segmented background image?

-**A**: The configured input size must be a multiple of 32; otherwise, after repeated down-sampling and up-sampling in the network, the feature maps differ by one pixel, and elementwise_add reports a shape-mismatch error.
+**A**: The image_synth mode outputs fake_bg.jpg, which is exactly the background image. To extract backgrounds in batch, slightly modify the code so that fake_bg is saved. The place to modify:
+https://github.com/PaddlePaddle/PaddleOCR/blob/de3e2e7cd3b8b65ee02d7a41e570fa5b511a3c1d/StyleText/engine/synthesisers.py#L68

-#### Q3.3.24: The DB detection training input size is 640; can it be made larger?
+#### Q3.1.31: How do I print the network structure and the parameter information of each layer?

-**A**: Enlarging it is not recommended. The detection training input size is the size after the random crop in preprocessing, not a direct resize of the original image; in most scenarios this size is not small. Enlarging it may actually be unsuitable and slows down training. Moreover, some parameters in the code are adapted to the preset input size, so enlarging it carries hidden risks.
+**A**: You can use `paddle.summary`; see: https://www.paddlepaddle.org.cn/documentation/docs/zh/2.0-rc1/api/paddle/hapi/model_summary/summary_cn.html#summary.

-#### Q3.3.25: During recognition model training, the loss decreases normally but acc stays at 0
+#### Q3.4.26: Currently paddle hub serving only supports imgpath; where do I change it if I want to use an imgurl?

-**A**: An acc of 0 at the early stage of recognition training is normal; the metric comes up after training a while longer.
+**A**: The image is read here: https://github.com/PaddlePaddle/PaddleOCR/blob/67ef25d593c4eabfaaceb22daade4577f53bed81/deploy/hubserving/ocr_system/module.py#L55,
+you can follow the snippet below to convert a url path into an np array (https://cloud.tencent.com/developer/article/1467840)
+```
+from urllib import request
+import cv2 as cv
+import numpy as np
+
+response = request.urlopen('http://i1.whymtj.com/uploads/tu/201902/9999/52491ae4ba.jpg')
+img_array = np.array(bytearray(response.read()), dtype=np.uint8)
+img = cv.imdecode(img_array, -1)
+```

-#### Q3.4.24: The DB model infers and predicts correctly, but switching to the EAST or SAST model raises errors or gives incorrect results
+#### Q3.4.27: For C++ deployment, can only the OCR detection part be deployed?
+
+**A**: Yes, the recognition and detection modules are decoupled. To deploy detection only, modify the main function yourself and
+keep just the detection-related parts: https://github.com/PaddlePaddle/PaddleOCR/blob/de3e2e7cd3b8b65ee02d7a41e570fa5b511a3c1d/deploy/cpp_infer/src/main.cpp#L72

-**A**: When running inference with the EAST or SAST model, the parameter --det_algorithm="EAST" or --det_algorithm="SAST" must be specified on the command line. No parameter is needed for DB because "DB" is the default value: https://github.com/PaddlePaddle/PaddleOCR/blob/e7a708e9fdaf413ed7a14da8e4a7b4ac0b211e42/tools/infer/utility.py#L43


## [Selected] 10 selected OCR questions

@@ -238,18 +246,15 @@

(2) Increase the system's [l2 decay value](https://github.com/PaddlePaddle/PaddleOCR/blob/a501603d54ff5513fc4fc760319472e59da25424/configs/rec/ch_ppocr_v1.1/rec_chinese_lite_train_v1.1.yml#L47)

-
-### Inference & deployment
-
-#### Q2.4.1: Is there a good way to handle dense text in images?
+#### Q2.3.8: Is there a good way to handle dense text in images?

**A**: Try a pretrained model first, e.g. DB+CRNN, to judge whether the problem with the dense-text image lies in detection or recognition, then improve it accordingly. Alternatively, if the dense text in the image is small, try increasing the image resolution and stretching the image within a reasonable range to sparsify the text and improve recognition.

-#### Q2.4.2: For text that is slightly blurry at recognition time, are there image enhancement methods?
+#### Q2.3.9: For text that is slightly blurry at recognition time, are there image enhancement methods?

**A**: Provided a human eye can read the text, you can experiment with blur operators from image processing such as mean, median, or Gaussian filtering. You can also strengthen model robustness with data augmentation perturbations; newer ideas worth borrowing include adversarial training and super-resolution (SR). However, the industry has no universally accepted best solution yet, so it is recommended to first add constraints at the data collection stage to improve image quality.

-#### Q2.4.3: For targeted text detection, e.g. detecting only the name on an ID card, is it better to detect the specified region, or to detect all regions and then filter?
+#### Q2.3.10: For targeted text detection, e.g. detecting only the name on an ID card, is it better to detect the specified region, or to detect all regions and then filter?

**A**: From two angles, detecting all regions and then filtering is generally better.

@@ -257,11 +262,11 @@

(2) Product requirements may change, and later changes to what the model must do cannot be ruled out (e.g. another field needs to be added); compared with retraining a model, post-processing logic is easier to adjust.

-#### Q2.4.4: How can a beginner quickly get started with a practical Chinese OCR project?
+#### Q2.3.11: How can a beginner quickly get started with a practical Chinese OCR project?

**A**: It is recommended to first learn the basics of the OCR field and get a rough grasp of the basic detection and recognition algorithms, then browse OCR-related repos on GitHub. In terms of completeness of content, PaddleOCR's bilingual Chinese/English tutorials have a clear advantage, with thorough documentation on datasets, model training, and inference deployment, so you can get started quickly; there is also a WeChat user group for Q&A, which makes it well suited for learning and practice. Project address: [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR)

-#### Q2.4.5: How to recognize English line-text images that contain spaces?
+#### Q2.3.12: How to recognize English line-text images that contain spaces?
**A**: Two approaches can be considered for space recognition:

@@ -269,22 +274,26 @@

(2) Optimize the text recognition algorithm: introduce the space character into the recognition dictionary and annotate the spaces that occur in the recognition training data. In addition, when synthesizing data, generate text containing spaces by concatenating training samples.

-#### Q2.4.6: When recognizing Chinese and English together, can the space character also be added for training?
+#### Q2.3.13: When recognizing Chinese and English together, can the space character also be added for training?

**A**: For Chinese recognition, spaces can be added as separators during training. How well this works cannot be judged directly; decide by training on your actual business data.

-#### Q2.4.7: Are there super-resolution methods for low-pixel text or text with a small font size?
+#### Q2.3.14: Are there super-resolution methods for low-pixel text or text with a small font size?

**A**: Super-resolution methods fall into traditional methods and deep-learning-based methods. Among the deep-learning methods, SRCNN is a classic, and CVPR 2020 also has a super-resolution paper worth consulting: Unpaired Image Super-Resolution using Pseudo-Supervision. It has not been fully validated in practice, though, so check the effect in your actual scenario.

-#### Q2.4.8: Any good model or paper recommendations for table recognition?
+#### Q2.3.15: Any good model or paper recommendations for table recognition?

**A**: There are not many mature academic solutions for tables at present; segmentation-based paper approaches are worth trying.

-#### Q2.4.9: For curved text, has OpenCV's TPS been tried for rectification?
+#### Q2.3.16: For curved text, has OpenCV's TPS been tried for rectification?

**A**: OpenCV's TPS requires marking the corresponding points on the upper and lower boundaries, which are hard to obtain with either traditional or deep-learning methods. The TPS module in PaddleOCR's STAR-Net learns the points and rectifies automatically; you can try that directly.

+#### Q2.3.17: StyleText synthesized data gives poor results?
+**A**: Data generated by the StyleText model is mainly used to train OCR recognition models. PaddleOCR's current recognition models take 32 x N input, so the current version is mainly suitable for data with a height of 32.
+It is recommended to set the size of the data to be synthesized to 32 x N. Data of roughly similar sizes can also be generated, but very large or very small sizes indeed work poorly.
+

@@ -392,6 +401,63 @@

**A**: The dynamic graph version is under intensive development and will be released on December 16, 2020. Stay tuned.

+#### Q3.1.22: ModuleNotFoundError: No module named 'paddle.nn'
+**A**: paddle.nn is a feature specific to Paddle 2.0; install a version no lower than Paddle 2.0.0rc1:
+```
+python3 -m pip install paddlepaddle-gpu==2.0.0rc1 -i https://mirror.baidu.com/pypi/simple
+```
+
+#### Q3.1.23: ImportError: /usr/lib/x86_64_linux-gnu/libstdc++.so.6:version `CXXABI_1.3.11` not found (required by /usr/lib/python3.6/site-package/paddle/fluid/core+avx.so)
+**A**: This is caused by an insufficient glibc version. Paddle 2.0rc1 has higher requirements on the gcc and glibc versions; gcc 8.2 and glibc 2.12 or above are recommended.
+If your environment does not meet this requirement, or you use the docker image
+`hub.baidubce.com/paddlepaddle/paddle:latest-gpu-cuda9.0-cudnn7-dev`, installing Paddle 2.0rc may trigger the above error. For 2.0, the new docker image `paddlepaddle/paddle:latest-dev-cuda10.1-cudnn7-gcc82` is recommended,
+or visit [dockerhub](https://hub.docker.com/r/paddlepaddle/paddle/tags/) to get an image that matches your machine.
+
+
+#### Q3.1.24: What is the difference between the PaddleOCR develop and dygraph branches?
+**A**: PaddleOCR currently has four branches:
+
+- develop: developed on the Paddle static graph; paddle 1.8 or 2.0 is recommended. This branch has complete model training, prediction, inference deployment, and quantization/pruning features, and is ahead of release/1.1.
+- release/1.1: the first stable release of PaddleOCR, developed on the static graph, with complete training, prediction, inference deployment, and quantization/pruning features.
+- dygraph: developed on the Paddle dynamic graph and still under development; it will become the main development branch in the future and requires Paddle 2.0rc1 to run.
+- release/2.0-rc1-0: the second stable release of PaddleOCR, developed on the dynamic graph with paddle 2.0. Dynamic-graph projects are easier to debug; it currently supports model training and prediction, but not yet mobile deployment.
+
+If you are already familiar with PaddleOCR and want to deploy it in a variety of environments, the static-graph branches (develop or release/1.1) are currently recommended. If you are a beginner who wants to quickly train and debug the algorithms in PaddleOCR, try the PaddleOCR dygraph branch.
+
+**Note**: the develop and dygraph branches differ in the required Paddle version and local environment; mind the differences in the installation sections of the two branches.

+#### Q3.1.25: With the dygraph branch, when training PaddleOCR in docker, the data path is fine but the error `reader raised an exception` keeps occurring. Why?
+
+**A**: When a docker container is created, the default size of `/dev/shm` is 64M. If data is read with multiple processes, the shared memory may be insufficient, so allocate more space to `/dev/shm`: pass `--shm-size=8g` when creating the container to give `/dev/shm` 8g of space.
+
+#### Q3.1.26: I cannot find the Lite and PaddleServing deployment tutorials in the repo; where are they?
+
+**A**: The default branch of PaddleOCR is now dygraph, and dynamic-graph deployment for Lite and PaddleServing is still being adapted. To deploy on the Lite side or with PaddleServing, use the code of the develop (static graph) branch.
+
+#### Q3.1.27: How to visualize acc/loss curves, the model network structure, and so on?
+
+**A**: Set the `use_visualdl` parameter in the config file to True. For more usage, see the [VisualDL guide](https://www.paddlepaddle.org.cn/documentation/docs/zh/2.0-rc1/guides/03_VisualDL/visualdl.html).
+
+#### Q3.1.28: When using the StyleText data synthesis tool, the error `ModuleNotFoundError: No module named 'utils.config'` occurs. Why?
+
+**A**: There are two solutions:
+- set PYTHONPATH under the StyleText path: `export PYTHONPATH=./`
+- pull the latest code
+
+#### Q3.1.29: When creating a box in PPOCRLabel, only a square can be dragged out; how do I annotate a rectangle?
+
+**A**: Uncheck "Edit" - "Square annotation".
+
+#### Q3.1.30: How can Style-Text skip the text style transfer and, like an ordinary text generator, render a default font directly onto the segmented background image?
+
+**A**: The image_synth mode outputs fake_bg.jpg, which is exactly the background image. To extract backgrounds in batch, slightly modify the code so that fake_bg is saved. The place to modify:
+https://github.com/PaddlePaddle/PaddleOCR/blob/de3e2e7cd3b8b65ee02d7a41e570fa5b511a3c1d/StyleText/engine/synthesisers.py#L68
+
+#### Q3.1.31: How do I print the network structure and the parameter information of each layer?
+
+**A**: You can use `paddle.summary`; see: https://www.paddlepaddle.org.cn/documentation/docs/zh/2.0-rc1/api/paddle/hapi/model_summary/summary_cn.html#summary.
+

### Datasets

@@ -594,11 +660,11 @@ ps -axu | grep train.py | awk '{print $2}' | xargs kill -9

#### Q3.3.20: How to add blur data augmentation for text detection?

-**A**: Blur augmentation requires a code change. Taking DB as an example, refer to [Normalize](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/ppocr/data/imaug/operators.py#L60) and add the blur augmentation there.
+**A**: Blur augmentation requires a code change. Taking DB as an example, refer to [Normalize](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/ppocr/data/imaug/operators.py#L60) and add the blur augmentation there.

#### Q3.3.21: How to change the image rotation angle in text detection to allow arbitrary 360-degree rotation?

-**A**: Change the (-10,10) [here](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/ppocr/data/imaug/iaa_augment.py#L64) to (-180,180).
+**A**: Change the (-10,10) [here](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/ppocr/data/imaug/iaa_augment.py#L64) to (-180,180).

#### Q3.3.22: How to modify the shape when the aspect ratio of the training data is too large

@@ -617,6 +683,10 @@

**A**: An acc of 0 at the early stage of recognition training is normal; the metric comes up after training a while longer.

+#### Q3.3.26: Why does PaddleOCR keep using the cosine_decay learning rate schedule during training?
+
+**A**: With cosine_decay, the learning rate gradually decays to 0 along a cosine curve during training. Over longer training schedules it converges better than a constant learning rate strategy, so cosine_decay is used throughout actual training to obtain higher-accuracy models.
+

### Inference & deployment

@@ -728,4 +798,25 @@

#### Q3.4.24: The DB model infers and predicts correctly, but switching to the EAST or SAST model raises errors or gives incorrect results

-**A**: When running inference with the EAST or SAST model, the parameter --det_algorithm="EAST" or --det_algorithm="SAST" must be specified on the command line. No parameter is needed for DB because "DB" is the default value: https://github.com/PaddlePaddle/PaddleOCR/blob/e7a708e9fdaf413ed7a14da8e4a7b4ac0b211e42/tools/infer/utility.py#L43
\ No newline at end of file
+**A**: When running inference with the EAST or SAST model, the parameter --det_algorithm="EAST" or --det_algorithm="SAST" must be specified on the command line. No parameter is needed for DB because "DB" is the default value: https://github.com/PaddlePaddle/PaddleOCR/blob/e7a708e9fdaf413ed7a14da8e4a7b4ac0b211e42/tools/infer/utility.py#L43
+
+#### Q3.4.25: PaddleOCR Python-side and C++-side prediction results are inconsistent?
+**A**: Normally, the text predicted on the Python side and the C++ side is identical. If the prediction results differ noticeably,
+first check whether the diff appears in the detection model or the recognition model, or try another model to see whether a similar problem occurs.
+Then check whether the data processing on the Python side and the C++ side differs; it is recommended to keep your environment and retry with updated PaddleOCR code.
+If updating the code does not solve it, raise your question in the PaddleOCR WeChat group or in an issue.
+
+#### Q3.4.26: Currently paddle hub serving only supports imgpath; where do I change it if I want to use an imgurl?
+
+**A**: The image is read here: https://github.com/PaddlePaddle/PaddleOCR/blob/67ef25d593c4eabfaaceb22daade4577f53bed81/deploy/hubserving/ocr_system/module.py#L55,
+you can follow the snippet below to convert a url path into an np array (https://cloud.tencent.com/developer/article/1467840)
+```
+from urllib import request
+import cv2 as cv
+import numpy as np
+
+response = request.urlopen('http://i1.whymtj.com/uploads/tu/201902/9999/52491ae4ba.jpg')
+img_array = np.array(bytearray(response.read()), dtype=np.uint8)
+img = cv.imdecode(img_array, -1)
+```
+
+#### Q3.4.27: For C++ deployment, can only the OCR detection part be deployed?
+
+**A**: Yes, the recognition and detection modules are decoupled. To deploy detection only, modify the main function yourself and
+keep just the detection-related parts: https://github.com/PaddlePaddle/PaddleOCR/blob/de3e2e7cd3b8b65ee02d7a41e570fa5b511a3c1d/deploy/cpp_infer/src/main.cpp#L72
diff --git a/doc/doc_ch/algorithm_overview.md b/doc/doc_ch/algorithm_overview.md
index a23bfcb112d54719298709d5e253f609ec9dea74..8cebce3adf5c414674d2990c1b2a018ae52e57f6 100755
--- a/doc/doc_ch/algorithm_overview.md
+++ b/doc/doc_ch/algorithm_overview.md
@@ -9,9 +9,9 @@

### 1. Text detection algorithms

The list of text detection algorithms open-sourced by PaddleOCR:
-- [x] DB([paper]( https://arxiv.org/abs/1911.08947)) (recommended by ppocr)
-- [x] EAST([paper](https://arxiv.org/abs/1704.03155))
-- [x] SAST([paper](https://arxiv.org/abs/1908.05498))
+- [x] DB([paper]( https://arxiv.org/abs/1911.08947))[2] (recommended by ppocr)
+- [x] EAST([paper](https://arxiv.org/abs/1704.03155))[1]
+- [x] SAST([paper](https://arxiv.org/abs/1908.05498))[4]

On the public ICDAR2015 text detection dataset, the results are as follows:

@@ -21,13 +21,13 @@ The list of text detection algorithms open-sourced by PaddleOCR:
|EAST|MobileNetV3|78.24%|79.15%|78.69%|[Download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_mv3_east_v2.0_train.tar)|
|DB|ResNet50_vd|86.41%|78.72%|82.38%|[Download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_db_v2.0_train.tar)|
|DB|MobileNetV3|77.29%|73.08%|75.12%|[Download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_mv3_db_v2.0_train.tar)|
-|SAST|ResNet50_vd|91.83%|81.80%|86.52%|[Download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_sast_icdar15_v2.0_train.tar)|
+|SAST|ResNet50_vd|91.39%|83.77%|87.42%|[Download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_sast_icdar15_v2.0_train.tar)|

On the public Total-Text text detection dataset, the results are as follows:

|Model|Backbone|precision|recall|Hmean|Download link|
| --- | --- | --- | --- | --- | --- |
-|SAST|ResNet50_vd|89.05%|76.80%|82.47%|[Download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_sast_totaltext_v2.0_train.tar)|
+|SAST|ResNet50_vd|89.63%|78.44%|83.66%|[Download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_sast_totaltext_v2.0_train.tar)|

**Note:** SAST model training additionally used public datasets such as icdar2013, icdar2017, COCO-Text, and ArT for tuning. Download the English public datasets, organized into the format used by PaddleOCR, from [Baidu Cloud](https://pan.baidu.com/s/12cPnZcVuV1zn5DOd4mqjVw) (extraction code: 2bpi)

@@ -38,13 +38,13 @@ PaddleOCR文本检测算法的训练和使用请参考文档教程中[模型训

### 2. Text recognition algorithms

The list of dynamic-graph text recognition algorithms open-sourced by PaddleOCR:
-- [x] CRNN([paper](https://arxiv.org/abs/1507.05717)) (recommended by ppocr)
-- [x] Rosetta([paper](https://arxiv.org/abs/1910.05085))
-- [ ] STAR-Net([paper](http://www.bmva.org/bmvc/2016/papers/paper043/index.html)) coming soon
-- [ ] RARE([paper](https://arxiv.org/abs/1603.03915v1)) coming soon
-- [ ] SRN([paper](https://arxiv.org/abs/2003.12294)) coming soon
+- [x] CRNN([paper](https://arxiv.org/abs/1507.05717))[7] (recommended by ppocr)
+- [x] Rosetta([paper](https://arxiv.org/abs/1910.05085))[10]
+- [ ] STAR-Net([paper](http://www.bmva.org/bmvc/2016/papers/paper043/index.html))[11] coming soon
+- [ ] RARE([paper](https://arxiv.org/abs/1603.03915v1))[12] coming soon
+- [ ] SRN([paper](https://arxiv.org/abs/2003.12294))[5] coming soon

-Following the [DTRB](https://arxiv.org/abs/1904.01906) text recognition training and evaluation process, the models are trained on the MJSynth and SynthText datasets and evaluated on IIIT, SVT, IC03, IC13, IC15, SVTP, and CUTE, with the following results:
+Following the [DTRB](https://arxiv.org/abs/1904.01906)[3] text recognition training and evaluation process, the models are trained on the MJSynth and SynthText datasets and evaluated on IIIT, SVT, IC03, IC13, IC15, SVTP, and CUTE, with the following results:

|Model|Backbone|Avg Accuracy|Model storage name|Download link|
|-|-|-|-|-|
diff --git a/doc/doc_ch/angle_class.md b/doc/doc_ch/angle_class.md
index 846be15f834952587e8d3b2533ff147375db7c31..4d7ff0d7aa839591df6e359d4f7295ab2f0cc445 100644
--- a/doc/doc_ch/angle_class.md
+++ b/doc/doc_ch/angle_class.md
@@ -21,9 +21,8 @@ ln -sf /train_data/cls/dataset
```
" Image file name           Image annotation "
-train_data/cls/word_001.jpg 0
-train_data/cls/word_002.jpg 180
+train/word_001.jpg 0
+train/word_002.jpg 180
```

The final training set should have the following file structure:
@@ -55,6 +54,8 @@ train_data/cls/word_002.jpg 180

### Start training

+Write the prepared txt file and the image folder path into the `Train/Eval.dataset.label_file_list` and `Train/Eval.dataset.data_dir` fields of the configuration file. The absolute path of each image is formed by joining the path under `Train/Eval.dataset.data_dir` with the image name recorded in the txt file.
+
PaddleOCR provides training, evaluation, and prediction scripts.

To start training:
diff --git a/doc/doc_ch/quickstart.md b/doc/doc_ch/quickstart.md
index eabf1d91cfcc5afad3b9495f63cd6379562342b9..d9460989336118bfde6cafb5cc2a7f1d0b6b8691 100644
--- a/doc/doc_ch/quickstart.md
+++ b/doc/doc_ch/quickstart.md
@@ -96,5 +96,5 @@ python3 tools/infer/predict_system.py --image_dir="./doc/imgs/11.jpg" --det_mode

In addition, the tutorials also provide other inference deployment methods for the Chinese OCR model:
- [Inference with the C++ prediction engine](../../deploy/cpp_infer/readme.md)
-- [Service deployment](../../deploy/pdserving/readme.md)
-- [On-device deployment](../../deploy/lite/readme.md)
+- [Service deployment](../../deploy/hubserving)
+- [On-device deployment (currently static graph only)](https://github.com/PaddlePaddle/PaddleOCR/tree/develop/deploy/lite)
diff --git a/doc/doc_ch/reference.md b/doc/doc_ch/reference.md
index 9d9a6785b353ba8800ae0ff9db8cb40e9bf9caa9..f1337dedc96c685173cbcc8450a57c259d2c0029 100644
--- a/doc/doc_ch/reference.md
+++ b/doc/doc_ch/reference.md
@@ -11,11 +11,12 @@
}

2. DB:
-@article{liao2019real,
-  title={Real-time Scene Text Detection with Differentiable Binarization},
+@inproceedings{liao2020real,
+  title={Real-Time Scene Text Detection with Differentiable Binarization.},
   author={Liao, Minghui and Wan, Zhaoyi and Yao, Cong and Chen, Kai and Bai, Xiang},
-  journal={arXiv preprint arXiv:1911.08947},
-  year={2019}
+  booktitle={AAAI},
+  pages={11474--11481},
+  year={2020}
}

3. DTRB:
@@ -37,10 +38,11 @@
}

5. SRN:
-@article{yu2020towards,
-  title={Towards Accurate Scene Text Recognition with Semantic Reasoning Networks},
-  author={Yu, Deli and Li, Xuan and Zhang, Chengquan and Han, Junyu and Liu, Jingtuo and Ding, Errui},
-  journal={arXiv preprint arXiv:2003.12294},
+@inproceedings{yu2020towards,
+  title={Towards accurate scene text recognition with semantic reasoning networks},
+  author={Yu, Deli and Li, Xuan and Zhang, Chengquan and Liu, Tao and Han, Junyu and Liu, Jingtuo and Ding, Errui},
+  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
+  pages={12113--12122},
   year={2020}
}

@@ -52,4 +54,62 @@
  pages={9086--9095},
  year={2019}
}
-```
\ No newline at end of file
+
+7. CRNN:
+@article{shi2016end,
+  title={An end-to-end trainable neural network for image-based sequence recognition and its application to scene text recognition},
+  author={Shi, Baoguang and Bai, Xiang and Yao, Cong},
+  journal={IEEE transactions on pattern analysis and machine intelligence},
+  volume={39},
+  number={11},
+  pages={2298--2304},
+  year={2016},
+  publisher={IEEE}
+}
+
+8. FPGM:
+@inproceedings{he2019filter,
+  title={Filter pruning via geometric median for deep convolutional neural networks acceleration},
+  author={He, Yang and Liu, Ping and Wang, Ziwei and Hu, Zhilan and Yang, Yi},
+  booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
+  pages={4340--4349},
+  year={2019}
+}
+
+9. 
PACT:
+@article{choi2018pact,
+  title={Pact: Parameterized clipping activation for quantized neural networks},
+  author={Choi, Jungwook and Wang, Zhuo and Venkataramani, Swagath and Chuang, Pierce I-Jen and Srinivasan, Vijayalakshmi and Gopalakrishnan, Kailash},
+  journal={arXiv preprint arXiv:1805.06085},
+  year={2018}
+}
+
+10. Rosetta:
+@inproceedings{borisyuk2018rosetta,
+  title={Rosetta: Large scale system for text detection and recognition in images},
+  author={Borisyuk, Fedor and Gordo, Albert and Sivakumar, Viswanath},
+  booktitle={Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery \& Data Mining},
+  pages={71--79},
+  year={2018}
+}
+
+11. STAR-Net:
+@inproceedings{liu2016star,
+  title={STAR-Net: A SpaTial Attention Residue Network for Scene Text Recognition.},
+  author={Liu, Wei and Chen, Chaofeng and Wong, Kwan-Yee K and Su, Zhizhong and Han, Junyu},
+  booktitle={BMVC},
+  volume={2},
+  pages={7},
+  year={2016}
+}
+
+12. RARE:
+@inproceedings{shi2016robust,
+  title={Robust scene text recognition with automatic rectification},
+  author={Shi, Baoguang and Wang, Xinggang and Lyu, Pengyuan and Yao, Cong and Bai, Xiang},
+  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+  pages={4168--4176},
+  year={2016}
+}
+
+```
diff --git a/doc/doc_ch/tree.md b/doc/doc_ch/tree.md
index 5f048db022dbe422a78f87b0236d04e00ccc4d48..c222bcb447292fb3644c6d6fc6cf013a67b9dff3 100644
--- a/doc/doc_ch/tree.md
+++ b/doc/doc_ch/tree.md
@@ -211,6 +211,6 @@ PaddleOCR
├── README_ch.md // Chinese README
├── README_en.md // English README
├── README.md // home page README
-├── requirements.txt // dependencies
+├── requirements.txt // dependencies
├── setup.py // whl packaging script
├── train.sh // script to start training
diff --git a/doc/doc_en/algorithm_overview_en.md b/doc/doc_en/algorithm_overview_en.md
index 7f1afd027b9b56ad9a1f7a10f3f6b1fc34587252..f2349a1c3cb5096db23ff2a4465c51e0abfca36b 100755
--- a/doc/doc_en/algorithm_overview_en.md
+++ b/doc/doc_en/algorithm_overview_en.md
@@ -11,9 +11,9 @@ This tutorial lists the text detection algorithms and text recognition algorithm

### 1. 
Text Detection Algorithm

PaddleOCR open source text detection algorithms list:
-- [x] EAST([paper](https://arxiv.org/abs/1704.03155))
-- [x] DB([paper](https://arxiv.org/abs/1911.08947))
-- [x] SAST([paper](https://arxiv.org/abs/1908.05498) )(Baidu Self-Research)
+- [x] EAST([paper](https://arxiv.org/abs/1704.03155))[1]
+- [x] DB([paper](https://arxiv.org/abs/1911.08947))[2]
+- [x] SAST([paper](https://arxiv.org/abs/1908.05498))[4]

On the ICDAR2015 dataset, the text detection result is as follows:

@@ -23,13 +23,13 @@ On the ICDAR2015 dataset, the text detection result is as follows:
|EAST|MobileNetV3|78.24%|79.15%|78.69%|[Download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_mv3_east_v2.0_train.tar)|
|DB|ResNet50_vd|86.41%|78.72%|82.38%|[Download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_db_v2.0_train.tar)|
|DB|MobileNetV3|77.29%|73.08%|75.12%|[Download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_mv3_db_v2.0_train.tar)|
-|SAST|ResNet50_vd|91.83%|81.80%|86.52%|[Download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_sast_icdar15_v2.0_train.tar)|
+|SAST|ResNet50_vd|91.39%|83.77%|87.42%|[Download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_sast_icdar15_v2.0_train.tar)|

On Total-Text dataset, the text detection result is as follows:

|Model|Backbone|precision|recall|Hmean|Download link|
| --- | --- | --- | --- | --- | --- |
-|SAST|ResNet50_vd|89.05%|76.80%|82.47%|[Download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_sast_totaltext_v2.0_train.tar)|
+|SAST|ResNet50_vd|89.63%|78.44%|83.66%|[Download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_sast_totaltext_v2.0_train.tar)|

**Note:** Additional data, like icdar2013, icdar2017, COCO-Text, ArT, was added to the model training of SAST. Download English public dataset in organized format used by PaddleOCR from [Baidu Drive](https://pan.baidu.com/s/12cPnZcVuV1zn5DOd4mqjVw) (download code: 2bpi).

@@ -39,11 +39,11 @@ For the training guide and use of PaddleOCR text detection algorithms, please re

### 2. 
Text Recognition Algorithm

PaddleOCR open-source text recognition algorithms list:
-- [x] CRNN([paper](https://arxiv.org/abs/1507.05717))
-- [x] Rosetta([paper](https://arxiv.org/abs/1910.05085))
-- [ ] STAR-Net([paper](http://www.bmva.org/bmvc/2016/papers/paper043/index.html)) coming soon
-- [ ] RARE([paper](https://arxiv.org/abs/1603.03915v1)) coming soon
-- [ ] SRN([paper](https://arxiv.org/abs/2003.12294) )(Baidu Self-Research) coming soon
+- [x] CRNN([paper](https://arxiv.org/abs/1507.05717))[7]
+- [x] Rosetta([paper](https://arxiv.org/abs/1910.05085))[10]
+- [ ] STAR-Net([paper](http://www.bmva.org/bmvc/2016/papers/paper043/index.html))[11] coming soon
+- [ ] RARE([paper](https://arxiv.org/abs/1603.03915v1))[12] coming soon
+- [ ] SRN([paper](https://arxiv.org/abs/2003.12294))[5] coming soon

Refer to [DTRB](https://arxiv.org/abs/1904.01906): the training and evaluation results of the above text recognition algorithms (trained on MJSynth and SynthText, evaluated on IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE) are as follows:
diff --git a/doc/doc_en/angle_class_en.md b/doc/doc_en/angle_class_en.md
index e6157d1635431a45b3bc9392d1115dcdd917aeeb..8d9328700f3e638eb4576d132aa32fb93b3ad0c0 100644
--- a/doc/doc_en/angle_class_en.md
+++ b/doc/doc_en/angle_class_en.md
@@ -23,8 +23,8 @@ First put the training images in the same folder (train_images), and use a txt f
```
" Image file name           Image annotation "
-train_data/word_001.jpg 0
-train_data/word_002.jpg 180
+train/word_001.jpg 0
+train/word_002.jpg 180
```
The final training set should have the following file structure:
@@ -57,6 +57,7 @@ containing all images (test) and a cls_gt_test.txt. The structure of the test se
```
### TRAINING
+Write the prepared txt file and image folder path into the `Train/Eval.dataset.label_file_list` and `Train/Eval.dataset.data_dir` fields of the configuration file. The absolute path of each image is formed by joining `Train/Eval.dataset.data_dir` with the image name recorded in the txt file (see the sketch below).

PaddleOCR provides training scripts, evaluation scripts, and prediction scripts.
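For reference, a hedged sketch of pointing these fields at your own data from Python, using the `load_config`/`merge_config` helpers that the training entry points in this patch already import from `tools.program`; the config path and the data paths below are placeholders, not part of the patch:

```
# Minimal sketch: override the cls dataset fields programmatically,
# mirroring what `-o Train.dataset.data_dir=...` does on the command line.
from tools.program import load_config, merge_config

config = load_config('configs/cls/cls_mv3.yml')  # hypothetical config path
merge_config({
    'Train.dataset.data_dir': './train_data/cls/',  # image root directory
    'Train.dataset.label_file_list': ['./train_data/cls/train.txt'],
    'Eval.dataset.data_dir': './train_data/cls/',
    'Eval.dataset.label_file_list': ['./train_data/cls/eval.txt'],
})
# Each image is then resolved as data_dir + <name recorded in the txt file>.
```

In most cases editing the yml file directly achieves the same thing; the programmatic form is shown only to make the field layout explicit.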
diff --git a/doc/doc_en/quickstart_en.md b/doc/doc_en/quickstart_en.md index e351ecc650d621b1da5f34dd941eaf6fb3094402..a5c0881de30bfd4b76d30c7840b6585b5d7e2af9 100644 --- a/doc/doc_en/quickstart_en.md +++ b/doc/doc_en/quickstart_en.md @@ -99,5 +99,5 @@ For more text detection and recognition tandem reasoning, please refer to the do In addition, the tutorial also provides other deployment methods for the Chinese OCR model: - [Server-side C++ inference](../../deploy/cpp_infer/readme_en.md) -- [Service deployment](../../deploy/pdserving/readme_en.md) -- [End-to-end deployment](../../deploy/lite/readme_en.md) +- [Service deployment](../../deploy/hubserving) +- [End-to-end deployment](https://github.com/PaddlePaddle/PaddleOCR/tree/develop/deploy/lite) diff --git a/doc/joinus.PNG b/doc/joinus.PNG index 18edf81200c3f80be0670dabbf1d6c503fedca4a..f0f481929f207fbc452a7457df1a490947934a60 100644 Binary files a/doc/joinus.PNG and b/doc/joinus.PNG differ diff --git a/ppocr/data/imaug/operators.py b/ppocr/data/imaug/operators.py index 57cd3b4b172cc52d073999a007fe5a9a343c9186..8b9175cf0b201969fdfbd7fb082200e2241f4444 100644 --- a/ppocr/data/imaug/operators.py +++ b/ppocr/data/imaug/operators.py @@ -119,10 +119,10 @@ class DetResizeForTest(object): if 'image_shape' in kwargs: self.image_shape = kwargs['image_shape'] self.resize_type = 1 - if 'limit_side_len' in kwargs: + elif 'limit_side_len' in kwargs: self.limit_side_len = kwargs['limit_side_len'] self.limit_type = kwargs.get('limit_type', 'min') - if 'resize_long' in kwargs: + elif 'resize_long' in kwargs: self.resize_type = 2 self.resize_long = kwargs.get('resize_long', 960) else: diff --git a/ppocr/losses/det_basic_loss.py b/ppocr/losses/det_basic_loss.py index ef656e8c77de5d1bfa66d55ea2d7f68a9f7217ec..57b3667d9f32a871f748c40a65429551613991ca 100644 --- a/ppocr/losses/det_basic_loss.py +++ b/ppocr/losses/det_basic_loss.py @@ -45,7 +45,6 @@ class BalanceLoss(nn.Layer): self.balance_loss = balance_loss self.main_loss_type = main_loss_type self.negative_ratio = negative_ratio - self.main_loss_type = main_loss_type self.return_origin = return_origin self.eps = eps diff --git a/ppocr/losses/det_sast_loss.py b/ppocr/losses/det_sast_loss.py index a07af6a4598ffb97167094b40a7851ec1089e798..2e0c756bd4ebe4157ed397a6c2e9b7e94054b4e7 100644 --- a/ppocr/losses/det_sast_loss.py +++ b/ppocr/losses/det_sast_loss.py @@ -19,7 +19,6 @@ from __future__ import print_function import paddle from paddle import nn from .det_basic_loss import DiceLoss -import paddle.fluid as fluid import numpy as np @@ -27,9 +26,7 @@ class SASTLoss(nn.Layer): """ """ - def __init__(self, - eps=1e-6, - **kwargs): + def __init__(self, eps=1e-6, **kwargs): super(SASTLoss, self).__init__() self.dice_loss = DiceLoss(eps=eps) @@ -39,7 +36,7 @@ class SASTLoss(nn.Layer): tcl_mask: N x 128 x 1 tcl_label: N x X list or LoDTensor """ - + f_score = predicts['f_score'] f_border = predicts['f_border'] f_tvo = predicts['f_tvo'] @@ -53,15 +50,17 @@ class SASTLoss(nn.Layer): score_loss = 1.0 - 2 * intersection / (union + 1e-5) #border loss - l_border_split, l_border_norm = paddle.split(l_border, num_or_sections=[4, 1], axis=1) + l_border_split, l_border_norm = paddle.split( + l_border, num_or_sections=[4, 1], axis=1) f_border_split = f_border border_ex_shape = l_border_norm.shape * np.array([1, 4, 1, 1]) - l_border_norm_split = paddle.expand(x=l_border_norm, shape=border_ex_shape) - l_border_score = paddle.expand(x=l_score, shape=border_ex_shape) - l_border_mask = paddle.expand(x=l_mask, shape=border_ex_shape) 
+ l_border_norm_split = paddle.expand( + x=l_border_norm, shape=border_ex_shape) + l_border_score = paddle.expand(x=l_score, shape=border_ex_shape) + l_border_mask = paddle.expand(x=l_mask, shape=border_ex_shape) border_diff = l_border_split - f_border_split - abs_border_diff = paddle.abs(border_diff) + abs_border_diff = paddle.abs(border_diff) border_sign = abs_border_diff < 1.0 border_sign = paddle.cast(border_sign, dtype='float32') border_sign.stop_gradient = True @@ -72,15 +71,16 @@ class SASTLoss(nn.Layer): (paddle.sum(l_border_score * l_border_mask) + 1e-5) #tvo_loss - l_tvo_split, l_tvo_norm = paddle.split(l_tvo, num_or_sections=[8, 1], axis=1) + l_tvo_split, l_tvo_norm = paddle.split( + l_tvo, num_or_sections=[8, 1], axis=1) f_tvo_split = f_tvo tvo_ex_shape = l_tvo_norm.shape * np.array([1, 8, 1, 1]) l_tvo_norm_split = paddle.expand(x=l_tvo_norm, shape=tvo_ex_shape) - l_tvo_score = paddle.expand(x=l_score, shape=tvo_ex_shape) - l_tvo_mask = paddle.expand(x=l_mask, shape=tvo_ex_shape) + l_tvo_score = paddle.expand(x=l_score, shape=tvo_ex_shape) + l_tvo_mask = paddle.expand(x=l_mask, shape=tvo_ex_shape) # tvo_geo_diff = l_tvo_split - f_tvo_split - abs_tvo_geo_diff = paddle.abs(tvo_geo_diff) + abs_tvo_geo_diff = paddle.abs(tvo_geo_diff) tvo_sign = abs_tvo_geo_diff < 1.0 tvo_sign = paddle.cast(tvo_sign, dtype='float32') tvo_sign.stop_gradient = True @@ -91,15 +91,16 @@ class SASTLoss(nn.Layer): (paddle.sum(l_tvo_score * l_tvo_mask) + 1e-5) #tco_loss - l_tco_split, l_tco_norm = paddle.split(l_tco, num_or_sections=[2, 1], axis=1) + l_tco_split, l_tco_norm = paddle.split( + l_tco, num_or_sections=[2, 1], axis=1) f_tco_split = f_tco tco_ex_shape = l_tco_norm.shape * np.array([1, 2, 1, 1]) l_tco_norm_split = paddle.expand(x=l_tco_norm, shape=tco_ex_shape) - l_tco_score = paddle.expand(x=l_score, shape=tco_ex_shape) - l_tco_mask = paddle.expand(x=l_mask, shape=tco_ex_shape) - + l_tco_score = paddle.expand(x=l_score, shape=tco_ex_shape) + l_tco_mask = paddle.expand(x=l_mask, shape=tco_ex_shape) + tco_geo_diff = l_tco_split - f_tco_split - abs_tco_geo_diff = paddle.abs(tco_geo_diff) + abs_tco_geo_diff = paddle.abs(tco_geo_diff) tco_sign = abs_tco_geo_diff < 1.0 tco_sign = paddle.cast(tco_sign, dtype='float32') tco_sign.stop_gradient = True @@ -109,13 +110,12 @@ class SASTLoss(nn.Layer): tco_loss = paddle.sum(tco_out_loss * l_tco_score * l_tco_mask) / \ (paddle.sum(l_tco_score * l_tco_mask) + 1e-5) - # total loss tvo_lw, tco_lw = 1.5, 1.5 score_lw, border_lw = 1.0, 1.0 total_loss = score_loss * score_lw + border_loss * border_lw + \ tvo_loss * tvo_lw + tco_loss * tco_lw - + losses = {'loss':total_loss, "score_loss":score_loss,\ "border_loss":border_loss, 'tvo_loss':tvo_loss, 'tco_loss':tco_loss} - return losses \ No newline at end of file + return losses diff --git a/ppocr/metrics/rec_metric.py b/ppocr/metrics/rec_metric.py index bd0f92e0d759204b33b6cb9b261531d61134605e..a86fc8382f40b5b73edc7ec8e9d4dbe3e5822283 100644 --- a/ppocr/metrics/rec_metric.py +++ b/ppocr/metrics/rec_metric.py @@ -26,6 +26,8 @@ class RecMetric(object): all_num = 0 norm_edit_dis = 0.0 for (pred, pred_conf), (target, _) in zip(preds, labels): + pred = pred.replace(" ", "") + target = target.replace(" ", "") norm_edit_dis += Levenshtein.distance(pred, target) / max( len(pred), len(target)) if pred == target: diff --git a/ppocr/modeling/transforms/tps.py b/ppocr/modeling/transforms/tps.py index 86665bedfff726c174e676cb544000a37ada0dad..e7a152c1ccbb1d0175f14f671041285cb853e11a 100644 --- a/ppocr/modeling/transforms/tps.py 
+++ b/ppocr/modeling/transforms/tps.py @@ -16,6 +16,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import math import paddle from paddle import nn, ParamAttr from paddle.nn import functional as F @@ -88,11 +89,14 @@ class LocalizationNetwork(nn.Layer): in_channels = num_filters self.block_list.append(pool) name = "loc_fc1" + stdv = 1.0 / math.sqrt(num_filters_list[-1] * 1.0) self.fc1 = nn.Linear( in_channels, fc_dim, weight_attr=ParamAttr( - learning_rate=loc_lr, name=name + "_w"), + learning_rate=loc_lr, + name=name + "_w", + initializer=nn.initializer.Uniform(-stdv, stdv)), bias_attr=ParamAttr(name=name + '.b_0'), name=name) diff --git a/ppocr/optimizer/learning_rate.py b/ppocr/optimizer/learning_rate.py index 8f303e838c3dab79385198dd98c84c4afed1a7b1..e1b10992676cfdf73fb7573e5289c133981d1474 100644 --- a/ppocr/optimizer/learning_rate.py +++ b/ppocr/optimizer/learning_rate.py @@ -18,6 +18,7 @@ from __future__ import print_function from __future__ import unicode_literals from paddle.optimizer import lr +from .lr_scheduler import CyclicalCosineDecay class Linear(object): @@ -46,7 +47,7 @@ class Linear(object): self.end_lr = end_lr self.power = power self.last_epoch = last_epoch - self.warmup_epoch = warmup_epoch * step_each_epoch + self.warmup_epoch = round(warmup_epoch * step_each_epoch) def __call__(self): learning_rate = lr.PolynomialDecay( @@ -87,7 +88,7 @@ class Cosine(object): self.learning_rate = learning_rate self.T_max = step_each_epoch * epochs self.last_epoch = last_epoch - self.warmup_epoch = warmup_epoch * step_each_epoch + self.warmup_epoch = round(warmup_epoch * step_each_epoch) def __call__(self): learning_rate = lr.CosineAnnealingDecay( @@ -129,7 +130,7 @@ class Step(object): self.learning_rate = learning_rate self.gamma = gamma self.last_epoch = last_epoch - self.warmup_epoch = warmup_epoch * step_each_epoch + self.warmup_epoch = round(warmup_epoch * step_each_epoch) def __call__(self): learning_rate = lr.StepDecay( @@ -168,7 +169,7 @@ class Piecewise(object): self.boundaries = [step_each_epoch * e for e in decay_epochs] self.values = values self.last_epoch = last_epoch - self.warmup_epoch = warmup_epoch * step_each_epoch + self.warmup_epoch = round(warmup_epoch * step_each_epoch) def __call__(self): learning_rate = lr.PiecewiseDecay( @@ -183,3 +184,45 @@ class Piecewise(object): end_lr=self.values[0], last_epoch=self.last_epoch) return learning_rate + + +class CyclicalCosine(object): + """ + Cyclical cosine learning rate decay + Args: + learning_rate(float): initial learning rate + step_each_epoch(int): steps each epoch + epochs(int): total training epochs + cycle(int): period of the cosine learning rate + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. 
+    """

+    def __init__(self,
+                 learning_rate,
+                 step_each_epoch,
+                 epochs,
+                 cycle,
+                 warmup_epoch=0,
+                 last_epoch=-1,
+                 **kwargs):
+        super(CyclicalCosine, self).__init__()
+        self.learning_rate = learning_rate
+        self.T_max = step_each_epoch * epochs
+        self.last_epoch = last_epoch
+        self.warmup_epoch = round(warmup_epoch * step_each_epoch)
+        self.cycle = round(cycle * step_each_epoch)
+
+    def __call__(self):
+        learning_rate = CyclicalCosineDecay(
+            learning_rate=self.learning_rate,
+            T_max=self.T_max,
+            cycle=self.cycle,
+            last_epoch=self.last_epoch)
+        if self.warmup_epoch > 0:
+            learning_rate = lr.LinearWarmup(
+                learning_rate=learning_rate,
+                warmup_steps=self.warmup_epoch,
+                start_lr=0.0,
+                end_lr=self.learning_rate,
+                last_epoch=self.last_epoch)
+        return learning_rate
diff --git a/ppocr/optimizer/lr_scheduler.py b/ppocr/optimizer/lr_scheduler.py
new file mode 100644
index 0000000000000000000000000000000000000000..21aec737d0005e3dcd814ad7eff88988ab2c0796
--- /dev/null
+++ b/ppocr/optimizer/lr_scheduler.py
@@ -0,0 +1,49 @@
+# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from paddle.optimizer.lr import LRScheduler
+
+
+class CyclicalCosineDecay(LRScheduler):
+    def __init__(self,
+                 learning_rate,
+                 T_max,
+                 cycle=1,
+                 last_epoch=-1,
+                 eta_min=0.0,
+                 verbose=False):
+        """
+        Cyclical cosine learning rate decay, as described in
+        https://arxiv.org/pdf/2012.12645.pdf
+        Args:
+            learning_rate(float): initial learning rate
+            T_max(int): maximum epoch num
+            cycle(int): period of the cosine decay
+            last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. 
+            eta_min(float): minimum learning rate during training
+            verbose(bool): whether to print learning rate for each epoch
+        """
+        super(CyclicalCosineDecay, self).__init__(learning_rate, last_epoch,
+                                                  verbose)
+        self.cycle = cycle
+        self.eta_min = eta_min
+
+    def get_lr(self):
+        if self.last_epoch == 0:
+            return self.base_lr
+        relative_epoch = self.last_epoch % self.cycle
+        lr = self.eta_min + 0.5 * (self.base_lr - self.eta_min) * \
+             (1 + math.cos(math.pi * relative_epoch / self.cycle))
+        return lr
diff --git a/ppocr/utils/utility.py b/ppocr/utils/utility.py
index 28fbc2b1c6e79ade2ae0ba68b001b8a8e65a7f01..29576d971486326aec3c93601656d7b982ef3336 100755
--- a/ppocr/utils/utility.py
+++ b/ppocr/utils/utility.py
@@ -57,7 +57,7 @@ def get_image_file_list(img_file):
    elif os.path.isdir(img_file):
        for single_file in os.listdir(img_file):
            file_path = os.path.join(img_file, single_file)
-            if imghdr.what(file_path) in img_end:
+            if os.path.isfile(file_path) and imghdr.what(file_path) in img_end:
                imgs_lists.append(file_path)
    if len(imgs_lists) == 0:
        raise Exception("not found any img file in {}".format(img_file))
diff --git a/tools/infer/predict_cls.py b/tools/infer/predict_cls.py
index 420213ee5a6fce1f11c72b960d7e90344dd295ee..074172cc947cdc03b21392cf7b109971763f796a 100755
--- a/tools/infer/predict_cls.py
+++ b/tools/infer/predict_cls.py
@@ -18,13 +18,14 @@ __dir__ = os.path.dirname(os.path.abspath(__file__))
sys.path.append(__dir__)
sys.path.append(os.path.abspath(os.path.join(__dir__, '../..')))

+os.environ["FLAGS_allocator_strategy"] = 'auto_growth'
+
import cv2
import copy
import numpy as np
import math
import time
import traceback
-import paddle.fluid as fluid

import tools.infer.utility as utility
from ppocr.postprocess import build_post_process
@@ -39,7 +40,6 @@ class TextClassifier(object):
        self.cls_image_shape = [int(v) for v in args.cls_image_shape.split(",")]
        self.cls_batch_num = args.cls_batch_num
        self.cls_thresh = args.cls_thresh
-        self.use_zero_copy_run = args.use_zero_copy_run
        postprocess_params = {
            'name': 'ClsPostProcess',
            "label_list": args.label_list,
@@ -99,12 +99,8 @@ class TextClassifier(object):
            norm_img_batch = norm_img_batch.copy()
            starttime = time.time()

-            if self.use_zero_copy_run:
-                self.input_tensor.copy_from_cpu(norm_img_batch)
-                self.predictor.zero_copy_run()
-            else:
-                norm_img_batch = fluid.core.PaddleTensor(norm_img_batch)
-                self.predictor.run([norm_img_batch])
+            self.input_tensor.copy_from_cpu(norm_img_batch)
+            self.predictor.run()
            prob_out = self.output_tensors[0].copy_to_cpu()
            cls_result = self.postprocess_op(prob_out)
            elapse += time.time() - starttime
@@ -143,10 +139,11 @@ def main(args):
            "Please set --rec_image_shape='3,32,100' and --rec_char_type='en' ")
        exit()
    for ino in range(len(img_list)):
-        logger.info("Predicts of {}:{}".format(valid_image_file_list[ino], cls_res[
-            ino]))
+        logger.info("Predicts of {}:{}".format(valid_image_file_list[ino],
+                                               cls_res[ino]))
    logger.info("Total predict time for {} images, cost: {:.3f}".format(
        len(img_list), predict_time))
+
if __name__ == "__main__":
    main(utility.parse_args())
diff --git a/tools/infer/predict_det.py b/tools/infer/predict_det.py
index fe772991b7db06b192f4e3c4b99cef703c64b0df..077692afa84a745cb1b1fcb5b2c71f3dd5653013 100755
--- a/tools/infer/predict_det.py
+++ b/tools/infer/predict_det.py
@@ -18,11 +18,12 @@ __dir__ = os.path.dirname(os.path.abspath(__file__))
sys.path.append(__dir__)
sys.path.append(os.path.abspath(os.path.join(__dir__, '../..')))

+os.environ["FLAGS_allocator_strategy"] = 'auto_growth'
+
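+# NOTE (editorial): 'auto_growth' is assumed here to make Paddle allocate GPU
+# memory on demand rather than reserving a large fraction up front; the same
+# flag is set in the other predict_* and infer_* entry points in this patch.
+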
import cv2 import numpy as np import time import sys -import paddle import tools.infer.utility as utility from ppocr.utils.logging import get_logger @@ -37,7 +38,6 @@ class TextDetector(object): def __init__(self, args): self.args = args self.det_algorithm = args.det_algorithm - self.use_zero_copy_run = args.use_zero_copy_run pre_process_list = [{ 'DetResizeForTest': { 'limit_side_len': args.det_limit_side_len, @@ -72,7 +72,9 @@ class TextDetector(object): postprocess_params["nms_thresh"] = args.det_east_nms_thresh elif self.det_algorithm == "SAST": pre_process_list[0] = { - 'DetResizeForTest': {'resize_long': args.det_limit_side_len} + 'DetResizeForTest': { + 'resize_long': args.det_limit_side_len + } } postprocess_params['name'] = 'SASTPostProcess' postprocess_params["score_thresh"] = args.det_sast_score_thresh @@ -161,12 +163,8 @@ class TextDetector(object): img = img.copy() starttime = time.time() - if self.use_zero_copy_run: - self.input_tensor.copy_from_cpu(img) - self.predictor.zero_copy_run() - else: - im = paddle.fluid.core.PaddleTensor(img) - self.predictor.run([im]) + self.input_tensor.copy_from_cpu(img) + self.predictor.run() outputs = [] for output_tensor in self.output_tensors: output = output_tensor.copy_to_cpu() diff --git a/tools/infer/predict_rec.py b/tools/infer/predict_rec.py index c615fa0d36e9179d0e11d7e5588d223361aab349..974fdbb6c7f4d33bd39e818945be480d858c0d09 100755 --- a/tools/infer/predict_rec.py +++ b/tools/infer/predict_rec.py @@ -18,12 +18,13 @@ __dir__ = os.path.dirname(os.path.abspath(__file__)) sys.path.append(__dir__) sys.path.append(os.path.abspath(os.path.join(__dir__, '../..'))) +os.environ["FLAGS_allocator_strategy"] = 'auto_growth' + import cv2 import numpy as np import math import time import traceback -import paddle.fluid as fluid import tools.infer.utility as utility from ppocr.postprocess import build_post_process @@ -39,7 +40,6 @@ class TextRecognizer(object): self.character_type = args.rec_char_type self.rec_batch_num = args.rec_batch_num self.rec_algorithm = args.rec_algorithm - self.use_zero_copy_run = args.use_zero_copy_run postprocess_params = { 'name': 'CTCLabelDecode', "character_type": args.rec_char_type, @@ -101,12 +101,8 @@ class TextRecognizer(object): norm_img_batch = np.concatenate(norm_img_batch) norm_img_batch = norm_img_batch.copy() starttime = time.time() - if self.use_zero_copy_run: - self.input_tensor.copy_from_cpu(norm_img_batch) - self.predictor.zero_copy_run() - else: - norm_img_batch = fluid.core.PaddleTensor(norm_img_batch) - self.predictor.run([norm_img_batch]) + self.input_tensor.copy_from_cpu(norm_img_batch) + self.predictor.run() outputs = [] for output_tensor in self.output_tensors: output = output_tensor.copy_to_cpu() @@ -145,8 +141,8 @@ def main(args): "Please set --rec_image_shape='3,32,100' and --rec_char_type='en' ") exit() for ino in range(len(img_list)): - logger.info("Predicts of {}:{}".format(valid_image_file_list[ino], rec_res[ - ino])) + logger.info("Predicts of {}:{}".format(valid_image_file_list[ino], + rec_res[ino])) logger.info("Total predict time for {} images, cost: {:.3f}".format( len(img_list), predict_time)) diff --git a/tools/infer/predict_system.py b/tools/infer/predict_system.py index 07dfc216a2aaa509a6625df2a29a3429a6b05086..8c4f9214db9621fe4e0393ed3dac0e9a7ccedbf6 100755 --- a/tools/infer/predict_system.py +++ b/tools/infer/predict_system.py @@ -18,6 +18,8 @@ __dir__ = os.path.dirname(os.path.abspath(__file__)) sys.path.append(__dir__) sys.path.append(os.path.abspath(os.path.join(__dir__, 
'../..'))) +os.environ["FLAGS_allocator_strategy"] = 'auto_growth' + import cv2 import copy import numpy as np diff --git a/tools/infer/utility.py b/tools/infer/utility.py index c3d294e60091f68d93cab244dc495e4fca2aa5a6..966fa3cc4c8c4e721fa83e440c9c6181937c7e96 100755 --- a/tools/infer/utility.py +++ b/tools/infer/utility.py @@ -20,8 +20,7 @@ import numpy as np import json from PIL import Image, ImageDraw, ImageFont import math -from paddle.fluid.core import AnalysisConfig -from paddle.fluid.core import create_paddle_predictor +from paddle import inference def parse_args(): @@ -34,7 +33,7 @@ def parse_args(): parser.add_argument("--ir_optim", type=str2bool, default=True) parser.add_argument("--use_tensorrt", type=str2bool, default=False) parser.add_argument("--use_fp16", type=str2bool, default=False) - parser.add_argument("--gpu_mem", type=int, default=8000) + parser.add_argument("--gpu_mem", type=int, default=500) # params for text detector parser.add_argument("--image_dir", type=str) @@ -63,7 +62,7 @@ def parse_args(): parser.add_argument("--rec_model_dir", type=str) parser.add_argument("--rec_image_shape", type=str, default="3, 32, 320") parser.add_argument("--rec_char_type", type=str, default='ch') - parser.add_argument("--rec_batch_num", type=int, default=1) + parser.add_argument("--rec_batch_num", type=int, default=6) parser.add_argument("--max_text_length", type=int, default=25) parser.add_argument( "--rec_char_dict_path", @@ -83,8 +82,6 @@ def parse_args(): parser.add_argument("--cls_thresh", type=float, default=0.9) parser.add_argument("--enable_mkldnn", type=str2bool, default=False) - parser.add_argument("--use_zero_copy_run", type=str2bool, default=False) - parser.add_argument("--use_pdserving", type=str2bool, default=False) return parser.parse_args() @@ -110,14 +107,14 @@ def create_predictor(args, mode, logger): logger.info("not find params file path {}".format(params_file_path)) sys.exit(0) - config = AnalysisConfig(model_file_path, params_file_path) + config = inference.Config(model_file_path, params_file_path) if args.use_gpu: config.enable_use_gpu(args.gpu_mem, 0) if args.use_tensorrt: config.enable_tensorrt_engine( - precision_mode=AnalysisConfig.Precision.Half - if args.use_fp16 else AnalysisConfig.Precision.Float32, + precision_mode=inference.PrecisionType.Half + if args.use_fp16 else inference.PrecisionType.Float32, max_batch_size=args.max_batch_size) else: config.disable_gpu() @@ -126,24 +123,23 @@ def create_predictor(args, mode, logger): # cache 10 different shapes for mkldnn to avoid memory leak config.set_mkldnn_cache_capacity(10) config.enable_mkldnn() + args.rec_batch_num = 1 # config.enable_memory_optim() config.disable_glog_info() - if args.use_zero_copy_run: - config.delete_pass("conv_transpose_eltwiseadd_bn_fuse_pass") - config.switch_use_feed_fetch_ops(False) - else: - config.switch_use_feed_fetch_ops(True) + config.delete_pass("conv_transpose_eltwiseadd_bn_fuse_pass") + config.switch_use_feed_fetch_ops(False) - predictor = create_paddle_predictor(config) + # create predictor + predictor = inference.create_predictor(config) input_names = predictor.get_input_names() for name in input_names: - input_tensor = predictor.get_input_tensor(name) + input_tensor = predictor.get_input_handle(name) output_names = predictor.get_output_names() output_tensors = [] for output_name in output_names: - output_tensor = predictor.get_output_tensor(output_name) + output_tensor = predictor.get_output_handle(output_name) output_tensors.append(output_tensor) return predictor, 
input_tensor, output_tensors diff --git a/tools/infer_cls.py b/tools/infer_cls.py index 85e11ac3d48a5c054f39153c3c67c36d3f69974c..496964826b0b063f9f937c31342932c6cd95502f 100755 --- a/tools/infer_cls.py +++ b/tools/infer_cls.py @@ -25,6 +25,8 @@ __dir__ = os.path.dirname(os.path.abspath(__file__)) sys.path.append(__dir__) sys.path.append(os.path.abspath(os.path.join(__dir__, '..'))) +os.environ["FLAGS_allocator_strategy"] = 'auto_growth' + import paddle from ppocr.data import create_operators, transform diff --git a/tools/infer_det.py b/tools/infer_det.py index d1b1b7520368f9b1bccf871db921058fe09a6bbe..d890970ec14c25815fed8366d9257495f7485e0d 100755 --- a/tools/infer_det.py +++ b/tools/infer_det.py @@ -25,6 +25,8 @@ __dir__ = os.path.dirname(os.path.abspath(__file__)) sys.path.append(__dir__) sys.path.append(os.path.abspath(os.path.join(__dir__, '..'))) +os.environ["FLAGS_allocator_strategy"] = 'auto_growth' + import cv2 import json import paddle diff --git a/tools/infer_rec.py b/tools/infer_rec.py index e3e85b5d9e4fb129379ee294d209f030a5e80b3e..7e4b081140c37ff1eb8c5e0085185b8961198a0b 100755 --- a/tools/infer_rec.py +++ b/tools/infer_rec.py @@ -25,6 +25,8 @@ __dir__ = os.path.dirname(os.path.abspath(__file__)) sys.path.append(__dir__) sys.path.append(os.path.abspath(os.path.join(__dir__, '..'))) +os.environ["FLAGS_allocator_strategy"] = 'auto_growth' + import paddle from ppocr.data import create_operators, transform diff --git a/tools/program.py b/tools/program.py index 4331f9d46d053c8472389d338922ae23b3d2a4cf..c29154268588fe4c74e9a3feed699e6f3b4fd047 100755 --- a/tools/program.py +++ b/tools/program.py @@ -131,7 +131,7 @@ def check_gpu(use_gpu): "model on CPU" try: - if use_gpu and not paddle.fluid.is_compiled_with_cuda(): + if use_gpu and not paddle.is_compiled_with_cuda(): print(err) sys.exit(1) except Exception as e: @@ -179,9 +179,9 @@ def train(config, if 'start_epoch' in best_model_dict: start_epoch = best_model_dict['start_epoch'] else: - start_epoch = 0 + start_epoch = 1 - for epoch in range(start_epoch, epoch_num): + for epoch in range(start_epoch, epoch_num + 1): if epoch > 0: train_dataloader = build_dataloader(config, 'Train', device, logger) train_batch_cost = 0.0