diff --git a/PPOCRLabel/README.md b/PPOCRLabel/README.md index e8634ef8c06feae1f0adffb22c5694084dab78cd..9b882812f33a781a448a4f0a89fe15c349f587ae 100644 --- a/PPOCRLabel/README.md +++ b/PPOCRLabel/README.md @@ -196,18 +196,28 @@ For some data that are difficult to recognize, the recognition results will not ``` cd ./PPOCRLabel # Change the directory to the PPOCRLabel folder - python gen_ocr_train_val_test.py --trainValTestRatio 6:2:2 --labelRootPath ../train_data/label --detRootPath ../train_data/det --recRootPath ../train_data/rec + python gen_ocr_train_val_test.py --trainValTestRatio 6:2:2 --datasetRootPath ../train_data ``` Parameter Description: - `trainValTestRatio` is the division ratio of the number of images in the training set, validation set, and test set, set according to your actual situation, the default is `6:2:2` - - `labelRootPath` is the storage path of the dataset labeled by PPOCRLabel, the default is `../train_data/label` - - - `detRootPath` is the path where the text detection dataset is divided according to the dataset marked by PPOCRLabel. The default is `../train_data/det` - - - `recRootPath` is the path where the character recognition dataset is divided according to the dataset marked by PPOCRLabel. The default is `../train_data/rec` + - `datasetRootPath` is the storage path of the complete dataset labeled by PPOCRLabel. The default path is `PaddleOCR/train_data` . + ``` + |-train_data + |-crop_img + |- word_001_crop_0.png + |- word_002_crop_0.jpg + |- word_003_crop_0.jpg + | ... + | Label.txt + | rec_gt.txt + |- word_001.png + |- word_002.jpg + |- word_003.jpg + | ... + ``` ### 3.6 Error message @@ -231,4 +241,4 @@ For some data that are difficult to recognize, the recognition results will not ### 4. Related -1.[Tzutalin. LabelImg. Git code (2015)](https://github.com/tzutalin/labelImg) \ No newline at end of file +1.[Tzutalin. LabelImg. 
Git code (2015)](https://github.com/tzutalin/labelImg) diff --git a/PPOCRLabel/README_ch.md b/PPOCRLabel/README_ch.md index e1c391bc8637baa4adfa8852d805ed0f4bf04d6d..bdeb78cd4727318dbc8f02dfb341bfa4d4d214f6 100644 --- a/PPOCRLabel/README_ch.md +++ b/PPOCRLabel/README_ch.md @@ -181,18 +181,28 @@ PPOCRLabel支持三种导出方式: ``` cd ./PPOCRLabel # 将目录切换到PPOCRLabel文件夹下 -python gen_ocr_train_val_test.py --trainValTestRatio 6:2:2 --labelRootPath ../train_data/label --detRootPath ../train_data/det --recRootPath ../train_data/rec +python gen_ocr_train_val_test.py --trainValTestRatio 6:2:2 --datasetRootPath ../train_data ``` 参数说明: - `trainValTestRatio` 是训练集、验证集、测试集的图像数量划分比例,根据实际情况设定,默认是`6:2:2` -- `labelRootPath` 是PPOCRLabel标注的数据集存放路径,默认是`../train_data/label` - -- `detRootPath` 是根据PPOCRLabel标注的数据集划分后的文本检测数据集存放的路径,默认是`../train_data/det ` - -- `recRootPath` 是根据PPOCRLabel标注的数据集划分后的字符识别数据集存放的路径,默认是`../train_data/rec` +- `datasetRootPath` 是PPOCRLabel标注的完整数据集存放路径。默认路径是 `PaddleOCR/train_data` 分割数据集前应有如下结构: + ``` + |-train_data + |-crop_img + |- word_001_crop_0.png + |- word_002_crop_0.jpg + |- word_003_crop_0.jpg + | ... + | Label.txt + | rec_gt.txt + |- word_001.png + |- word_002.jpg + |- word_003.jpg + | ... 
+ ``` ### 3.6 错误提示 diff --git a/PPOCRLabel/gen_ocr_train_val_test.py b/PPOCRLabel/gen_ocr_train_val_test.py index 64cba612ae267835dd47aedc2b0356c9df462038..03ae566c6ec64d7ade229fb9571b0cd89ec189d4 100644 --- a/PPOCRLabel/gen_ocr_train_val_test.py +++ b/PPOCRLabel/gen_ocr_train_val_test.py @@ -17,15 +17,14 @@ def isCreateOrDeleteFolder(path, flag): return flagAbsPath -def splitTrainVal(root, dir, absTrainRootPath, absValRootPath, absTestRootPath, trainTxt, valTxt, testTxt, flag): +def splitTrainVal(root, absTrainRootPath, absValRootPath, absTestRootPath, trainTxt, valTxt, testTxt, flag): # 按照指定的比例划分训练集、验证集、测试集 - labelPath = os.path.join(root, dir) - labelAbsPath = os.path.abspath(labelPath) + dataAbsPath = os.path.abspath(root) if flag == "det": - labelFilePath = os.path.join(labelAbsPath, args.detLabelFileName) + labelFilePath = os.path.join(dataAbsPath, args.detLabelFileName) elif flag == "rec": - labelFilePath = os.path.join(labelAbsPath, args.recLabelFileName) + labelFilePath = os.path.join(dataAbsPath, args.recLabelFileName) labelFileRead = open(labelFilePath, "r", encoding="UTF-8") labelFileContent = labelFileRead.readlines() @@ -38,9 +37,9 @@ def splitTrainVal(root, dir, absTrainRootPath, absValRootPath, absTestRootPath, imageName = os.path.basename(imageRelativePath) if flag == "det": - imagePath = os.path.join(labelAbsPath, imageName) + imagePath = os.path.join(dataAbsPath, imageName) elif flag == "rec": - imagePath = os.path.join(labelAbsPath, "{}\\{}".format(args.recImageDirName, imageName)) + imagePath = os.path.join(dataAbsPath, "{}\\{}".format(args.recImageDirName, imageName)) # 按预设的比例划分训练集、验证集、测试集 trainValTestRatio = args.trainValTestRatio.split(":") @@ -90,15 +89,20 @@ def genDetRecTrainVal(args): recValTxt = open(os.path.join(args.recRootPath, "val.txt"), "a", encoding="UTF-8") recTestTxt = open(os.path.join(args.recRootPath, "test.txt"), "a", encoding="UTF-8") - for root, dirs, files in os.walk(args.labelRootPath): + 
splitTrainVal(args.datasetRootPath, detAbsTrainRootPath, detAbsValRootPath, detAbsTestRootPath, detTrainTxt, detValTxt, + detTestTxt, "det") + + for root, dirs, files in os.walk(args.datasetRootPath): for dir in dirs: - splitTrainVal(root, dir, detAbsTrainRootPath, detAbsValRootPath, detAbsTestRootPath, detTrainTxt, detValTxt, - detTestTxt, "det") - splitTrainVal(root, dir, recAbsTrainRootPath, recAbsValRootPath, recAbsTestRootPath, recTrainTxt, recValTxt, - recTestTxt, "rec") + if dir == 'crop_img': + splitTrainVal(root, recAbsTrainRootPath, recAbsValRootPath, recAbsTestRootPath, recTrainTxt, recValTxt, + recTestTxt, "rec") + else: + continue break + if __name__ == "__main__": # 功能描述:分别划分检测和识别的训练集、验证集、测试集 # 说明:可以根据自己的路径和需求调整参数,图像数据往往多人合作分批标注,每一批图像数据放在一个文件夹内用PPOCRLabel进行标注, @@ -110,9 +114,9 @@ if __name__ == "__main__": default="6:2:2", help="ratio of trainset:valset:testset") parser.add_argument( - "--labelRootPath", + "--datasetRootPath", type=str, - default="../train_data/label", + default="../train_data/", help="path to the dataset marked by ppocrlabel, E.g, dataset folder named 1,2,3..." 
) parser.add_argument( diff --git a/doc/doc_ch/code_and_doc.md b/doc/doc_ch/code_and_doc.md index ebf5c03cf7259cfdfa03f10146eea21a89101051..5ee0ce2e5baf0ed24cb89cb7b53b113485db7eea 100644 --- a/doc/doc_ch/code_and_doc.md +++ b/doc/doc_ch/code_and_doc.md @@ -16,7 +16,7 @@ PaddleOCR的Python代码遵循 [PEP8规范](https://www.python.org/dev/peps/pep- - 空格 - - 空格应该加在逗号、分号、冒号前,而非他们的后面 + - 空格应该加在逗号、分号、冒号后,而非他们的前面 ```python # 正确: @@ -323,4 +323,4 @@ git push origin new_branch 2)如果评审意见比较多: - 请给出总体的修改情况。 -- 请采用`start a review`进行回复,而非直接回复的方式。原因是每个回复都会发送一封邮件,会造成邮件灾难。 \ No newline at end of file +- 请采用`start a review`进行回复,而非直接回复的方式。原因是每个回复都会发送一封邮件,会造成邮件灾难。 diff --git a/doc/doc_ch/models_list.md b/doc/doc_ch/models_list.md index 6843ffdc19d5bde205124c30f1d0a5fc2144ce99..8db7e174cc0cfdf55043a2e6a42b23c80d1ffe0f 100644 --- a/doc/doc_ch/models_list.md +++ b/doc/doc_ch/models_list.md @@ -50,7 +50,7 @@ PaddleOCR提供的可下载模型包括`推理模型`、`训练模型`、`预训 |模型名称|模型简介|配置文件|推理模型大小|下载地址| | --- | --- | --- | --- | --- | |ch_PP-OCRv2_rec_slim|【最新】slim量化版超轻量模型,支持中英文、数字识别|[ch_PP-OCRv2_rec.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec.yml)| 9M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_train.tar) | -|ch_PP-OCRv2_rec|【最新】原始超轻量模型,支持中英文、数字识别|[ch_PP-OCRv2_rec.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec.yml)|8.5M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_train.tar) | +|ch_PP-OCRv2_rec|【最新】原始超轻量模型,支持中英文、数字识别|[ch_PP-OCRv2_rec_distillation.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec_distillation.yml)|8.5M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_train.tar) | 
|ch_ppocr_mobile_slim_v2.0_rec|slim裁剪量化版超轻量模型,支持中英文、数字识别|[rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml)| 6M |[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_slim_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_slim_train.tar) | |ch_ppocr_mobile_v2.0_rec|原始超轻量模型,支持中英文、数字识别|[rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml)|5.2M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_train.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_pre.tar) | |ch_ppocr_server_v2.0_rec|通用模型,支持中英文、数字识别|[rec_chinese_common_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_common_train_v2.0.yml)|94.8M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_train.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_pre.tar) | diff --git a/doc/doc_ch/thirdparty.md b/doc/doc_ch/thirdparty.md index 7466a6edf40d533b95cb6124da9d347dbcf877a6..a80ddef4ef31c76b50cbd10b13e8f67fc24d5c8e 100644 --- a/doc/doc_ch/thirdparty.md +++ b/doc/doc_ch/thirdparty.md @@ -16,22 +16,29 @@ PaddleOCR希望可以通过AI的力量助力任何一位有梦想的开发者实 ### 1.1 基于PaddleOCR的社区项目 -- 【最新】 [FastOCRLabel](https://gitee.com/BaoJianQiang/FastOCRLabel):完整的C#版本标注工具 (@ [包建强](https://gitee.com/BaoJianQiang) ) - -#### 1.1.1 通用工具 - -- [DangoOCR离线版](https://github.com/PantsuDango/DangoOCR):通用型桌面级即时翻译工具 (@ [PantsuDango](https://github.com/PantsuDango)) -- [scr2txt](https://github.com/lstwzd/scr2txt):截屏转文字工具 (@ [lstwzd](https://github.com/lstwzd)) -- [AI Studio项目](https://aistudio.baidu.com/aistudio/projectdetail/1054614?channelType=0&channel=0):英文视频自动生成字幕( @ 
[叶月水狐](https://aistudio.baidu.com/aistudio/personalcenter/thirdview/322052)) - -#### 1.1.2 垂类场景工具 - -- [id_card_ocr](https://github.com/baseli/id_card_ocr):身份证复印件识别(@ [baseli](https://github.com/baseli)) -- [Paddle_Table_Image_Reader](https://github.com/thunder95/Paddle_Table_Image_Reader):能看懂表格图片的数据助手(@ [thunder95](https://github.com/thunder95])) - -#### 1.1.3 前后处理 - -- [paddleOCRCorrectOutputs](https://github.com/yuranusduke/paddleOCRCorrectOutputs):获取OCR识别结果的key-value(@ [yuranusduke](https://github.com/yuranusduke)) +| 类别 | 项目 | 描述 | 开发者 | +| -------- | ------------------------------------------------------------ | -------------------------- | ------------------------------------------------------------ | +| 通用工具 | [FastOCRLabel](https://gitee.com/BaoJianQiang/FastOCRLabel) | 完整的C#版本标注GUI | [包建强](https://gitee.com/BaoJianQiang) | +| 通用工具 | [DangoOCR离线版](https://github.com/PantsuDango/DangoOCR) | 通用型桌面级即时翻译GUI | [PantsuDango](https://github.com/PantsuDango) | +| 通用工具 | [scr2txt](https://github.com/lstwzd/scr2txt) | 截屏转文字GUI | [lstwzd](https://github.com/lstwzd) | +| 通用工具 | [ocr_sdk](https://github.com/mymagicpower/AIAS/blob/main/1_image_sdks/text_recognition/ocr_sdk) | OCR java SDK工具箱 | [Calvin](https://github.com/mymagicpower) | +| 通用工具 | [iocr](https://github.com/mymagicpower/AIAS/blob/main/8_suite_hub/iocr) | IOCR 自定义模板识别(支持表格识别) | [Calvin](https://github.com/mymagicpower) | +| 通用工具 | [Lmdb Dataset Format Conversion Tool](https://github.com/OneYearIsEnough/PaddleOCR-Recog-LmdbDataset-Conversion) | 文本识别任务中lmdb数据格式转换工具 | [OneYearIsEnough](https://github.com/OneYearIsEnough) | +| 垂类工具 | [AI Studio项目](https://aistudio.baidu.com/aistudio/projectdetail/1054614?channelType=0&channel=0) | 英文视频自动生成字幕 | [叶月水狐](https://aistudio.baidu.com/aistudio/personalcenter/thirdview/322052) | +| 垂类工具 | [id_card_ocr](https://github.com/baseli/id_card_ocr) | 身份证复印件识别 | [baseli](https://github.com/baseli) | +| 垂类工具 | 
[Paddle_Table_Image_Reader](https://github.com/thunder95/Paddle_Table_Image_Reader) | 能看懂表格图片的数据助手 | [thunder95](https://github.com/thunder95]) | +| 垂类工具 | [AI Studio项目](https://aistudio.baidu.com/aistudio/projectdetail/3382897) | OCR流程中对手写体进行过滤 | [daassh](https://github.com/daassh) | +| 垂类场景调优 | [AI Studio项目](https://aistudio.baidu.com/aistudio/projectdetail/2803693) | 电表读数和编号识别 | [深渊上的坑](https://github.com/edencfc) | +| 垂类工具 | [AI Studio项目](https://aistudio.baidu.com/aistudio/projectdetail/3284199) | LCD液晶字符检测 | [Dream拒杰](https://github.com/zhangyingying520) | +| 前后处理 | [paddleOCRCorrectOutputs](https://github.com/yuranusduke/paddleOCRCorrectOutputs) | 获取OCR识别结果的key-value | [yuranusduke](https://github.com/yuranusduke) | +|前处理| [optlab](https://github.com/GreatV/optlab) |OCR前处理工具箱,基于Qt和Leptonica。|[GreatV](https://github.com/GreatV)| +|应用部署| [PaddleOCRSharp](https://github.com/raoyutian/PaddleOCRSharp) |PaddleOCR的.NET封装与应用部署。|[raoyutian](https://github.com/raoyutian/PaddleOCRSharp)| +|应用部署| [PaddleSharp](https://github.com/sdcb/PaddleSharp) |PaddleOCR的.NET封装与应用部署,支持跨平台、GPU|[sdcb](https://github.com/sdcb)| +| 应用部署 | [PaddleOCR-Streamlit-Demo](https://github.com/Lovely-Pig/PaddleOCR-Streamlit-Demo) | 使用Streamlit部署PaddleOCR | [Lovely-Pig](https://github.com/Lovely-Pig) | +| 应用部署 | [PaddleOCR-PyWebIO-Demo](https://github.com/Lovely-Pig/PaddleOCR-PyWebIO-Demo) | 使用PyWebIO部署PaddleOCR | [Lovely-Pig](https://github.com/Lovely-Pig) | +| 应用部署 | [PaddleOCR-Paddlejs-Vue-Demo](https://github.com/Lovely-Pig/PaddleOCR-Paddlejs-Vue-Demo) | 使用Paddle.js和Vue部署PaddleOCR | [Lovely-Pig](https://github.com/Lovely-Pig) | +| 应用部署 | [PaddleOCR-Paddlejs-React-Demo](https://github.com/Lovely-Pig/PaddleOCR-Paddlejs-React-Demo) | 使用Paddle.js和React部署PaddleOCR | [Lovely-Pig](https://github.com/Lovely-Pig) | +| 学术前沿模型训练与推理 | [AI Studio项目](https://aistudio.baidu.com/aistudio/projectdetail/3397137) | StarNet-MobileNetV3算法–中文训练 | [xiaoyangyang2](https://github.com/xiaoyangyang2) | ### 1.2 
为PaddleOCR新增功能 @@ -71,7 +78,7 @@ PaddleOCR非常欢迎社区贡献以PaddleOCR为核心的各种服务、部署 - 提交代码前请再三确认不会引入新的bug,并在PR中描述优化点。如果该PR解决了某个issue,请在PR中连接到该issue。所有的PR都应该遵守附录3中的[3.2.10 提交代码的一些约定。](./code_and_doc.md/#提交代码的一些约定) -- 请在提交之前参考下方的[附录3:Pull Request说明](./code_and_doc.md/#附录3)。如果您对git的提交流程不熟悉,同样可以参考附录3的3.2节。 +- 请在提交之前参考下方的[附录3:Pull Request说明](./code_and_doc.md#附录3)。如果您对git的提交流程不熟悉,同样可以参考附录3的3.2节。 **最后请在PR的题目中加上标签`【third-party】` , 在说明中@Evezerest,拥有此标签的PR将会被高优处理**。 diff --git a/doc/doc_en/code_and_doc_en.md b/doc/doc_en/code_and_doc_en.md new file mode 100644 index 0000000000000000000000000000000000000000..f3ee769e7dd7184226f5385056f2ef04c5000dbf --- /dev/null +++ b/doc/doc_en/code_and_doc_en.md @@ -0,0 +1,349 @@ + - Appendix + + This appendix contains python, document specifications and Pull Request process. Please follow the relevant contents + + - [Appendix 1:Python Code Specification](#Appendix1) + + - [Appendix 2:Document Specification](#Appendix2) + + - [Appendix 3:Pull Request Description](#Appendix3) + + + + ## Appendix 1:Python Code Specification + + The Python code of PaddleOCR follows [PEP8 Specification]( https://www.python.org/dev/peps/pep-0008/ ), some of the key concerns include the following + + - Space + + - Spaces should be added after commas, semicolons, colons, not before them + + ```python + # true: + print(x, y) + + # false: + print(x , y) + ``` + + - When specifying a keyword parameter or default parameter value in a function, do not use spaces on both sides of it + + ```python + # true: + def complex(real, imag=0.0) + # false: + def complex(real, imag = 0.0) + ``` + + - comment + + - Inline comments: inline comments are indicated by the` # `sign. 
Two spaces should be left between code and` # `, and one space should be left between` # `and comments, for example + + ```python + x = x + 1 # Compensate for border + ``` + + - Functions and methods: The definition of each function should include the following: + + - Function description: Utility, input and output of function + + - Args: Name and description of each parameter + - Returns: The meaning and type of the return value + + ```python + def fetch_bigtable_rows(big_table, keys, other_silly_variable=None): + """Fetches rows from a Bigtable. + + Retrieves rows pertaining to the given keys from the Table instance + represented by big_table. Silly things may happen if + other_silly_variable is not None. + + Args: + big_table: An open Bigtable Table instance. + keys: A sequence of strings representing the key of each table row + to fetch. + other_silly_variable: Another optional variable, that has a much + longer name than the other args, and which does nothing. + + Returns: + A dict mapping keys to the corresponding table row data + fetched. Each row is represented as a tuple of strings. For + example: + + {'Serak': ('Rigel VII', 'Preparer'), + 'Zim': ('Irk', 'Invader'), + 'Lrrr': ('Omicron Persei 8', 'Emperor')} + + If a key from the keys argument is missing from the dictionary, + then that row was not found in the table. + """ + pass + ``` + + + + ## Appendix 2: Document Specification + + ### 2.1 Overall Description + + - Document Location: If you add new features to your original Markdown file, please **Do not re-create** a new file. If you don't know where to add it, you can first PR the code and then ask the official in commit. 
+ + - New Markdown Document Name: Describe the content of the document in English, typically a combination of lowercase letters and underscores, such as `add_New_Algorithm.md` + + - New Markdown Document Format: Catalog - Body - FAQ + + > The directory generation method can use [this site](https://ecotrust-canada.github.io/markdown-toc/ ) Automatically extract directories after copying MD contents, and then add ` before each heading of the MD file + + - English and Chinese: Any changes or additions to the document need to be made in both Chinese and English documents. + + ### 2.2 Format Specification + + - Title format: The document title format follows the format of: Arabic decimal point combination-space-title (for example, `2.1 XXXX`, `2.XXXX`) + + - Code block: Displays code in code block format that needs to be run, describing the meaning of command parameters before the code block. For example: + + > Pipeline of detection + direction Classify + recognition: Vertical text can be recognized after setting the direction classifier parameter `--use_angle_cls true`. + > + > ``` + > paddleocr --image_dir ./imgs/11.jpg --use_angle_cls true + > ``` + + - Variable References: If code variables or command parameters are referenced in line, they need to be represented in line code, for example, above `--use_angle_cls true` with one space in front and one space in back + + - Uniform naming: e.g. PP-OCRv2, PP-OCR mobile, `paddleocr` whl package, PPOCRLabel, Paddle Lite, etc. + + - Supplementary notes: Supplementary notes by reference format `>`. + + - Picture: If a picture is added to the description document, specify the naming of the picture (describing its content) and add the picture under `doc/`. + + - Title: Capitalize the first letter of each word in the title. 
+ + + + ## Appendix 3: Pull Request Description + + ### 3.1 PaddleOCR Branch Description + + PaddleOCR will maintain two branches in the future, one for each: + + - release/x.x family branch: stable release version branch, also the default branch. PaddleOCR releases a new release branch based on feature updates and adapts to the release version of Paddle. As versions iterate, more and more release/x.x family branches are maintained by default with the latest version of the release branch. + - dygraph branch: For the development branch, adapts the dygraph version of the Paddle dynamic graph to primarily develop new functionality. If you need to redevelop, choose the dygraph branch. To ensure that the dygraph branch pulls out the release/x.x branch when needed, the code for the dygraph branch can only use the valid API in the latest release branch of Paddle. That is, if a new API has been developed in the Paddle dygraph branch but has not yet appeared in the release branch code, do not use it in Paddle OCR. In addition, performance optimization, parameter tuning, policy updates that do not involve API can be developed normally. + + The historical branch of PaddleOCR will no longer be maintained in the future. These branches will continue to be maintained, considering that some of you may still be using them: + + - Develop branch: This branch was used for the development and testing of static diagrams and is currently compatible with version >=1.7. If you have special needs, you can also use this branch to accommodate older versions of Paddle, but you won't update your code until you fix the bug. + + PaddleOCR welcomes you to actively contribute code to repo. Here are some basic processes for contributing code. 
+ + ### 3.2 PaddleOCR Code Submission Process And Specification + + > If you are familiar with Git use, you can jump directly to [Some Conventions For Submitting Code in 3.2.10](#Some_conventions_for_submitting_code) + + #### 3.2.1 Create Your `Remote Repo` + + - In PaddleOCR [GitHub Home]( https://github.com/PaddlePaddle/PaddleOCR ) Click the `Fork` button in the upper left corner to create a `remote repo`in your personal directory, such as ` https://github.com/ {your_name}/PaddleOCR`. + + ![banner](../banner.png) + + - Clone `Remote repo` + + ``` + # pull code of develop branch + git clone https://github.com/{your_name}/PaddleOCR.git -b dygraph + cd PaddleOCR + ``` + + > Clone failures are mostly due to network reasons, try again later or configure the proxy + + #### 3.2.2 Login And Connect Using Token + + Start by viewing the information for the current `remote repo`. + + ``` + git remote -v + # origin https://github.com/{your_name}/PaddleOCR.git (fetch) + # origin https://github.com/{your_name}/PaddleOCR.git (push) + ``` + + Only the information of the clone `remote repo`, i.e. the PaddleOCR under your username, is available. Due to the change in Github's login method, you need to reconfigure the `remote repo` address by means of a Token. The token is generated as follows: + + 1. Find Personal Access Tokens: Click on your avatar in the upper right corner of the Github page and choose Settings --> Developer settings --> Personal access tokens, + + 2. Click Generate new token: Fill in the token name in Note, such as 'paddle'. In Select scopes, select repo (required), admin:repo_hook, delete_repo, etc. You can check them according to your needs. Then click Generate token to generate the token, and finally copy the generated token. + + Delete the original origin configuration + + ``` + git remote rm origin + ``` + + Change the remote branch to `https://oauth2:{token}@github.com/{your_name}/PaddleOCR.git`. 
For example, if the token value is 12345 and your user name is PPOCR, run the following command + + ``` + git remote add origin https://oauth2:12345@github.com/PPOCR/PaddleOCR.git + ``` + + This establishes a connection to our own `remote repo`. Next we create a remote host of the original PaddleOCR repo, named upstream. + + ``` + git remote add upstream https://github.com/PaddlePaddle/PaddleOCR.git + ``` + + Use `git remote -v` to view current `remote warehouse` information, output as follows, found to include two origin and two upstream of `remote repo` . + + ``` + origin https://github.com/{your_name}/PaddleOCR.git (fetch) + origin https://github.com/{your_name}/PaddleOCR.git (push) + upstream https://github.com/PaddlePaddle/PaddleOCR.git (fetch) + upstream https://github.com/PaddlePaddle/PaddleOCR.git (push) + ``` + + This is mainly to keep the local repository up to date when subsequent pull request (PR) submissions are made. + + #### 3.2.3 Create Local Branch + + First get the latest code of upstream, then create a new_branch branch based on the dygraph of the upstream repo (upstream). + + ``` + git fetch upstream + git checkout -b new_branch upstream/dygraph + ``` + + > If for a newly forked PaddleOCR project, the user's remote repo (origin) has the same branch updates as the upstream repository (upstream), you can also create a new local branch based on the default branch of the origin repo or a specified branch with the following command + > + > ``` + > # Create new_branch branch on user remote repo (origin) based on develop branch + > git checkout -b new_branch origin/develop + > # Create new_branch branch based on upstream remote repo develop branch + > # If you need to create a new branch from upstream, + > # you need to first use git fetch upstream to get upstream code + > git checkout -b new_branch upstream/develop + > ``` + + The final switch to the new branch is displayed with the following output information. 
+ + ``` + Branch new_branch set up to track remote branch develop from upstream. + Switched to a new branch 'new_branch' + ``` + + After switching branches, file changes can be made on this branch + + #### 3.2.4 Use Pre-Commit Hook + + Paddle developers use the pre-commit tool to manage Git pre-submit hooks. It helps us format the source code (C++, Python) and automatically check for basic things (such as having only one EOL per file, not adding large files to Git) before committing it. + + The pre-commit test is part of the unit test in Travis-CI. PR that does not satisfy the hook cannot be submitted to PaddleOCR. Install it first and run it in the current directory: + + ``` + pip install pre-commit + pre-commit install + ``` + + > 1. Paddle uses clang-format to adjust the C/C++ source code format. Make sure the `clang-format` version is above 3.8. + > + > 2. Yapf installed through pip install pre-commit is slightly different from conda install -c conda-forge pre-commit, and PaddleOCR developers use `pip install pre-commit`. + + #### 3.2.5 Modify And Submit Code + + If you make some changes to `README.md` in PaddleOCR, you can view the changed file through `git status`, and then add the changed file using `git add`. + + ``` + git status # View change files + git add README.md + pre-commit + ``` + + Repeat these steps until the pre-commit format check reports no errors, as shown below. + + ![img](../precommit_pass.png) + + Use the following command to complete the submission. + + ``` + git commit -m "your commit info" + ``` + + #### 3.2.6 Keep Local Repo Up To Date + + Get the latest code for upstream and update the current branch. Here the upstream comes from section 3.2.2, `Login And Connect Using Token`. 
+ + ``` + git fetch upstream + # If you want to commit to another branch, you need to pull code from another branch of upstream, here is develop + git pull upstream develop + ``` + + #### 3.2.7 Push To Remote Repo + + ``` + git push origin new_branch + ``` + + #### 3.2.7 Submit Pull Request + + Click the new pull request to select the local branch and the target branch, as shown in the following figure. In the description of PR, fill in the functions completed by the PR. Next, wait for review, and if you need to modify something, update the corresponding branch in origin with the steps above. + + ![banner](../pr.png) + + #### 3.2.8 Sign CLA Agreement And Pass Unit Tests + + - Signing the CLA When submitting a Pull Request to PaddlePaddle for the first time, you need to sign a CLA (Contributor License Agreement) agreement to ensure that your code can be incorporated as follows: + + 1. Please check the Check section in PR, find the license/cla, and click on the right detail to enter the CLA website + + 2. Click Sign in with GitHub to agree on the CLA website and when clicked, it will jump back to your Pull Request page + + #### 3.2.9 Delete Branch + + - Remove remote branch + + After PR is merged into the main repo, we can delete the branch of the remote repo from the PR page. + You can also use `git push origin :branch_name` to delete remote branches, such as: + + ``` + git push origin :new_branch + ``` + +- Delete local branch + + ``` + # Switch to the development branch, otherwise the current branch cannot be deleted + git checkout develop + + # Delete the new_branch branch + git branch -D new_branch + ``` + + + + #### 3.2.10 Some Conventions For Submitting Code + + In order for official maintainers to better focus on the code itself when reviewing it, please follow the following conventions each time you submit your code: + + 1)Please ensure that the unit tests in Travis-CI pass smoothly. 
If not, this indicates that there is a problem with the submitted code, and the official maintainer generally does not review it. + + 2)Before submitting a Pull Request. + + - Note the number of commits. + + Reason: If you only modify one file and submit more than a dozen commits, each commit will only make a few modifications, which can be very confusing to the reviewer. The reviewer needs to look at each commit individually to see what changes have been made, and does not exclude the fact that changes between commits overlap each other. + + Suggestion: Keep as few commits as possible each time you submit, and supplement your last commit with git commit --amend. For multiple commits that have been pushed to a remote repo, you can refer to [squash commits after push](https://stackoverflow.com/questions/5667884/how-to-squash-commits-in-git-after-they-have-been-pushed ). + + - Note the name of each commit: it should reflect the content of the current commit, not be too arbitrary. + + + 3) If you have solved a problem, add in the first comment box of the Pull Request: `fix #issue_number`. This will automatically close the corresponding Issue when the Pull Request is merged. Key words include:close, closes, closed, fix, fixes, fixed, resolve, resolves, resolved,please choose the right vocabulary. Detailed reference [Closing issues via commit messages](https://help.github.com/articles/closing-issues-via-commit-messages). + + In addition, in response to the reviewer's comments, you are requested to abide by the following conventions: + + 1) Each review comment from an official maintainer would like a response, which would better enhance the contribution of the open source community. + + - If you agree to the review opinion and modify it accordingly, give a simple Done. + - If you disagree with the review, please give your own reasons for refuting. + + 2)If there are many reviews: + + - Please give an overview of the changes. + - Please reply with `start a review`, not directly. 
The reason is that each reply sends an e-mail message, which can cause a mail disaster. diff --git a/doc/doc_en/community_contribution_en.md b/doc/doc_en/community_contribution_en.md new file mode 100644 index 0000000000000000000000000000000000000000..a4a83cc5b5e942b94d0249586d06d24c3856f5e0 --- /dev/null +++ b/doc/doc_en/community_contribution_en.md @@ -0,0 +1,100 @@ +# Community Contribution + +Thank you for your support and interest in PaddleOCR. The goal of PaddleOCR is to build a professional, harmonious and supportive open source community with developers. This document presents existing community contributions, explanations for various contributions, and new opportunities and processes to make the contribution process more efficient and clear. + +PaddleOCR wants to help any developer with a dream realize their vision and enjoy the joy of creating value through the power of AI. + +--- + + + + + +> The picture above shows PaddleOCR's current contributors, updated regularly + +## 1. Community Contribution + +### 1.1 PaddleOCR Based Community Project + +- 【The latest】 [FastOCRLabel](https://gitee.com/BaoJianQiang/FastOCRLabel): Complete C# version annotation tool (@ [包建强](https://gitee.com/BaoJianQiang) ) + +#### 1.1.1 Universal Tools + +- [DangoOCR offline version](https://github.com/PantsuDango/DangoOCR):Universal desktop instant translation tool (@ [PantsuDango](https://github.com/PantsuDango)) +- [scr2txt](https://github.com/lstwzd/scr2txt):Screenshot to Text tool (@ [lstwzd](https://github.com/lstwzd)) +- [AI Studio project](https://aistudio.baidu.com/aistudio/projectdetail/1054614?channelType=0&channel=0):English video automatically generates subtitles( @ [叶月水狐](https://aistudio.baidu.com/aistudio/personalcenter/thirdview/322052)) + +#### 1.1.2 Vertical Scene Tools + +- [id_card_ocr](https://github.com/baseli/id_card_ocr):Identification of copy of ID card(@ [baseli](https://github.com/baseli)) +- 
[Paddle_Table_Image_Reader](https://github.com/thunder95/Paddle_Table_Image_Reader): A data assistant that can read tables and pictures(@ [thunder95](https://github.com/thunder95)) + +#### 1.1.3 Pre And Post Processing + +- [paddleOCRCorrectOutputs](https://github.com/yuranusduke/paddleOCRCorrectOutputs): Get the key-value of OCR recognition result (@ [yuranusduke](https://github.com/yuranusduke)) + +### 1.2 New Features For PaddleOCR + +- Thanks [authorfu](https://github.com/authorfu) for contributing Android([#340](https://github.com/PaddlePaddle/PaddleOCR/pull/340)) and [xiadeye](https://github.com/xiadeye) for contributing iOS demo code([#325](https://github.com/PaddlePaddle/PaddleOCR/pull/325)). +- Thanks [tangmq](https://gitee.com/tangmq) for adding docker deployment service to PaddleOCR to support quick release of callable restful API services([#507](https://github.com/PaddlePaddle/PaddleOCR/pull/507)). +- Thanks [lijinhan](https://github.com/lijinhan) for adding Java springboot to PaddleOCR and call OCR hubserving interface to complete the use of OCR service deployment([#1027](https://github.com/PaddlePaddle/PaddleOCR/pull/1027)). +- Thanks [Evezerest](https://github.com/Evezerest), [ninetailskim](https://github.com/ninetailskim), [edencfc](https://github.com/edencfc), [BeyondYourself](https://github.com/BeyondYourself), [1084667371](https://github.com/1084667371) for contributing complete code of [PPOCRLabel](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.3/PPOCRLabel/README_ch.md). + +### 1.3 Code And Document Optimization + +- Thanks [zhangxin](https://github.com/ZhangXinNan)([Blog](https://blog.csdn.net/sdlypyzq)) for contributing new visualization methods and adding .gitignore, handling the problem of manually setting the PYTHONPATH environment variable([#210](https://github.com/PaddlePaddle/PaddleOCR/pull/210)). 
+- Thanks [lyl120117](https://github.com/lyl120117) for contributing code to print network structure([#304](https://github.com/PaddlePaddle/PaddleOCR/pull/304)). +- Thanks [BeyondYourself](https://github.com/BeyondYourself) for making a lot of great suggestions for PaddleOCR and simplifying some code styles of paddleocr([so many commits](https://github.com/PaddlePaddle/PaddleOCR/commits?author=BeyondYourself)). +- Thanks [Khanh Tran](https://github.com/xxxpsyduck) and [Karl Horky](https://github.com/karlhorky) for contributing modifications to the English documents. + +### 1.4 Multilingual Corpus + +- Thanks [xiangyubo](https://github.com/xiangyubo) for contributing handwriting Chinese OCR dataset([#321](https://github.com/PaddlePaddle/PaddleOCR/pull/321)). +- Thanks [Mejans](https://github.com/Mejans) for contributing dictionary and corpus of the new language Occitan to PaddleOCR([#954](https://github.com/PaddlePaddle/PaddleOCR/pull/954)). + +## 2. Contribution Illustrating + +### 2.1 New Function Class + +PaddleOCR welcomes community contributions to various services, deployment examples and software applications with paddleOCR as the core. Certified community contributions will be added to the above community contribution table to increase exposure for the majority of developers, which is also the glory of PaddleOCR, including: + +- Project form: the project code certified by the official community shall have good specifications and structure, and shall be equipped with a detailed README.md, which describes how to use the project. By adding a line 'paddleocr' to its requirements.txt, the project can be automatically included in the usedby of paddleocr. + +- Integration method: if it is an update to the existing PaddleOCR tool, it will be integrated into the main repo. 
If a new function is expanded for paddleocr, please contact the official personnel first to confirm whether the project is integrated into the master repo, *even if the new function is not integrated into the master repo, we will also increase the exposure of your personal project in the way of community contribution.* + + +### 2.2 Code Optimization + +If you encounter code bugs and unexpected functions when using PaddleOCR, you can contribute your modifications to PaddleOCR, including: + +- Python code specifications are available for reference [Appendix 1:Python code specifications](./code_and_doc_en.md/#Appendix1). + +- Before submitting the code, please confirm again and again that no new bugs will be introduced, and describe the optimization points in the PR. If the PR solves an issue, please connect to the issue in the PR. All PR shall comply with the requirements in Appendix [3.2.10 Some conventions for submitting code.](./code_and_doc_en.md/#Some conventions for submitting code) + +- Please refer to the below before submitting. If you are not familiar with the git submission process, you can also refer to Section 3.2 of [Appendix 3: description of Pull Request](./code_and_doc_en.md/#Appendix3). + +**Finally, please add the label `[third-party]` in the title of the PR and @ Everest in the description; PRs with this label will be treated with high priority.** + +### 2.3 Document Optimization + +If you encounter problems such as unclear document description, missing description and invalid link when using PaddleOCR, you can contribute your modifications to PaddleOCR. For document writing specifications, please refer to [Appendix 2: document specifications](./code_and_doc_en.md/#Appendix2). 
**Finally, please add the label `[third-party]` in the title of the PR and @ Everest in the description; PRs with this label will be treated with high priority.** + +## 3. More Contribution Opportunities + +We encourage developers to use PaddleOCR to realize their ideas. At the same time, we also list some valuable development directions after analysis, which are collected in the regular season of community projects as a whole. + +## 4. Contact Us + +We very much welcome developers to contact us before they intend to contribute code, documents, corpus and other contents to PaddleOCR, which can greatly reduce the communication cost in the PR process. At the same time, if you find some ideas difficult to realize personally, we can also recruit like-minded developers for the project in the form of SIG. Projects funded through SIG channels will receive deep R & D support and operational resources (such as official account publicity, live broadcast lessons, etc.). + +Our recommended contribution process is: + +- By adding the `[Third Party]` mark in the topic of GitHub issue, explain the problems encountered (and the ideas to solve) or the functions to be expanded, and wait for the reply of the person on duty. For example, ` [Third Party] contributes IOS examples to PaddleOCR`. +- After communicating with us and confirming that the technical scheme or bugs and optimization points are correct, add functions or modify them accordingly, and the codes and documents shall comply with relevant specifications. +- PR links to the above issue and waits for review. + +## 5. Thanks And Follow-Up + + - After the code is merged, the information will be updated in the first section of this document. The default link is GitHub name and home page. If you need to change the home page, you can also contact us. + - New important function classes will be advertised in the user group and enjoy the honor of the open source community. 
+ - **If you have a PaddleOCR based project that does not appear in the above list, follow `4. Contact Us` .** diff --git a/doc/doc_en/enhanced_ctc_loss_en.md b/doc/doc_en/enhanced_ctc_loss_en.md new file mode 100644 index 0000000000000000000000000000000000000000..908f79e412e2a00e4fec027befc8a1430c077e27 --- /dev/null +++ b/doc/doc_en/enhanced_ctc_loss_en.md @@ -0,0 +1,110 @@ +# Enhanced CTC Loss + +In OCR recognition, CRNN is a text recognition algorithm widely applied in the industry. In the training phase, it uses CTCLoss to calculate the network loss. In the inference phase, it uses CTCDecode to obtain the decoding result. Although the CRNN algorithm has been proven to achieve reliable recognition results in actual business, users have endless requirements for recognition accuracy. So how to improve the accuracy of text recognition? Taking CTCLoss as the starting point, this paper explores the improved fusion scheme of CTCLoss from three different perspectives: Hard Example Mining, Multi-task Learning, and Metric Learning. Based on the exploration, we propose EnhancedCTCLoss, which includes the following 3 components: Focal-CTC Loss, A-CTC Loss, C-CTC Loss. + +## 1. Focal-CTC Loss + +Focal Loss was proposed by the paper, "[Focal Loss for Dense Object Detection](https://arxiv.org/abs/1708.02002)". When the loss was first proposed, it was mainly to solve the problem of a serious imbalance in the ratio of positive and negative samples in one-stage target detection. This loss function reduces the weight of a large number of simple negative samples in training and also can be understood as a kind of difficult sample mining. +The form of the loss function is as follows: + +
+ +
+ +Among them, y' is the output of the activation function, and the value is between 0-1. It adds a modulation factor (1-y’)^γ and a balance factor α on the basis of the original cross-entropy loss. When α = 1, y = 1, the comparison between the loss function and the cross-entropy loss is shown in the following figure: + +
+ +
+ + + +As can be seen from the above figure, when γ > 0, the adjustment coefficient (1-y’)^γ gives smaller weight to the easy-to-classify sample loss, making the network pay more attention to the difficult and misclassified samples. The adjustment factor γ is used to adjust the rate at which the weight of simple samples decreases. When γ = 0, it is the cross-entropy loss function. When γ increases, the influence of the adjustment factor will also increase. Experiments revealed that 2 is the optimal value of γ. The balance factor α is used to balance the uneven proportions of the positive and negative samples. In the text, α is taken as 0.25. + +For the classic CTC algorithm, suppose a certain feature sequence (f1, f2, ......ft), after CTC decoding, the probability that the result is equal to label is y', then the probability that the CTC decoding result is not equal to label is (1-y'); it is not difficult to find that the CTCLoss value and y' have the following relationship: + +
+ +
+ + + +Combining the idea of Focal Loss, assigning larger weights to difficult samples and smaller weights to simple samples can make the network focus more on the mining of difficult samples and further improve the accuracy of recognition. Therefore, we propose Focal-CTC Loss. Its definition is as follows: + +
+ +
+ + + +In the experiment, the value of γ is 2, α = 1, see this for specific implementation: [rec_ctc_loss.py](../../ppocr/losses/rec_ctc_loss.py) + + + +## 2. A-CTC Loss + +A-CTC Loss is short for CTC Loss + ACE Loss. Among them, ACE Loss was proposed by the paper, “[Aggregation Cross-Entropy for Sequence Recognition](https://arxiv.org/abs/1904.08364)”. Compared with CTCLoss, ACE Loss has the following two advantages: ++ ACE Loss can solve the recognition problem of 2-D text, while CTCLoss can only process 1-D text ++ ACE Loss is better than CTC loss in time complexity and space complexity + +The advantages and disadvantages of the OCR recognition algorithm summarized by the predecessors are shown in the following figure: + +
+ +
+ + +Although ACELoss does handle 2D predictions, as shown in the figure above, and has advantages in memory usage and inference speed, in practice, we found that using ACELoss alone, the recognition effect is not as good as CTCLoss. Consequently, we tried to combine CTCLoss and ACELoss, and CTCLoss is the mainstay while ACELoss acts as an auxiliary supervision loss. This attempt has achieved better results. On our internal experimental data set, compared to using CTCLoss alone, the recognition accuracy can be improved by about 1%. +A_CTC Loss is defined as follows: + +
+ +
+ + + +In the experiment, λ = 0.1. See the ACE loss implementation code: [ace_loss.py](../../ppocr/losses/ace_loss.py) + + + +## 3. C-CTC Loss + +C-CTC Loss is short for CTC Loss + Center Loss. Among them, Center Loss was proposed by the paper, “[A Discriminative Feature Learning Approach for Deep Face Recognition](https://link.springer.com/chapter/10.1007/978-3-319-46478-7_31)”. It was first used in face recognition tasks to increase the distance between classes and reduce the distance within classes. It is an earlier and also widely used algorithm. + +In the task of Chinese OCR recognition, through the analysis of bad cases, we found that a major difficulty in Chinese recognition is that there are many similar characters, which are easy to confuse. From this, we thought about whether we can learn from the idea of Metric Learning to increase the class spacing of similar characters, to improve recognition accuracy. However, Metric Learning is mainly used in the field of image recognition, and the label of the training data is a fixed value; for OCR recognition, it is a sequence recognition task essentially, and there is no explicit alignment between features and labels. Therefore, how to combine the two is still a direction worth exploring. + +By trying Arcmargin, Cosmargin and other methods, we finally found that Centerloss can help further improve the accuracy of recognition. C_CTC Loss is defined as follows: + +
+ +
+ +In the experiment, we set λ=0.25. See the center_loss implementation code: [center_loss.py](../../ppocr/losses/center_loss.py) + +It is worth mentioning that in C-CTC Loss, choosing to initialize the Center randomly does not bring significant improvement. Our Center initialization method is as follows: ++ Based on the original CTCLoss, a network N is obtained by training ++ Select the training set, identify the completely correct part, and form the set G ++ Send each sample in G to the network, perform forward calculation, and extract the correspondence between the input of the last FC layer (ie feature) and the result of argmax calculation (ie index) ++ Aggregate features with the same index, calculate the average, and get the initial center of each character. + +Taking the configuration file `configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec.yml` as an example, the center extraction command is as follows: + +``` +python tools/export_center.py -c configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec.yml -o Global.pretrained_model="./output/rec_mobile_pp-OCRv2/best_accuracy" +``` + +After running, `train_center.pkl` will be generated in the main directory of PaddleOCR. + + + +## 4. Experiment + +For the above three solutions, we conducted training and evaluation based on Baidu's internal data set. The experimental conditions are shown in the following table: + +| algorithm | Focal_CTC | A_CTC | C-CTC | +| :-------- | :-------- | ----: | :---: | +| gain | +0.3% | +0.7% | +1.7% | + +Based on the above experimental conclusions, we adopted the C-CTC strategy in PP-OCRv2. It is worth mentioning that, because PP-OCRv2 deals with the recognition task of 6625 Chinese characters, the character set is relatively large and there are many similar characters, so the C-CTC solution brings a significant improvement on this task. But if you switch to other OCR recognition tasks, the conclusion may be different. You can try Focal-CTC, A-CTC, C-CTC, and the combined solution EnhancedCTC. 
We believe it will bring different degrees of improvement. + +The unified combined plan is shown in the following file: [rec_enhanced_ctc_loss.py](../../ppocr/losses/rec_enhanced_ctc_loss.py) \ No newline at end of file diff --git a/doc/doc_en/models_list_en.md b/doc/doc_en/models_list_en.md index e3cf251c3439ba009a1de0ba48f7c0aa10b117c4..62e48309c83602112df38a4a8afed26bc4c5b6a7 100644 --- a/doc/doc_en/models_list_en.md +++ b/doc/doc_en/models_list_en.md @@ -43,8 +43,8 @@ Relationship of the above models is as follows. |model name|description|config|model size|download| | --- | --- | --- | --- | --- | -|ch_PP-OCRv2_rec_slim|[New] Slim qunatization with distillation lightweight model, supporting Chinese, English, multilingual text detection|[ch_PP-OCRv2_rec.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec.yml)| 9M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_train.tar) | -|ch_PP-OCRv2_rec|[New] Original lightweight model, supporting Chinese, English, multilingual text detection|[ch_PP-OCRv2_rec.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec.yml)|8.5M|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_train.tar) | +|ch_PP-OCRv2_rec_slim|[New] Slim qunatization with distillation lightweight model, supporting Chinese, English, multilingual text recognition|[ch_PP-OCRv2_rec.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec.yml)| 9M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_train.tar) | +|ch_PP-OCRv2_rec|[New] Original lightweight model, supporting Chinese, English, multilingual text 
recognition|[ch_PP-OCRv2_rec_distillation.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec_distillation.yml)|8.5M|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_train.tar) | |ch_ppocr_mobile_slim_v2.0_rec|Slim pruned and quantized lightweight model, supporting Chinese, English and number recognition|[rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml)| 6M | [inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_slim_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_slim_train.tar) | |ch_ppocr_mobile_v2.0_rec|Original lightweight model, supporting Chinese, English and number recognition|[rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml)|5.2M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_train.tar) / [pre-trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_pre.tar) | |ch_ppocr_server_v2.0_rec|General model, supporting Chinese, English and number recognition|[rec_chinese_common_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_common_train_v2.0.yml)|94.8M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_train.tar) / [pre-trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_pre.tar) | diff --git a/ppstructure/docs/kie_en.md b/ppstructure/docs/kie_en.md new file mode 100644 index 0000000000000000000000000000000000000000..a424968a9b5a33132afe52a4850cfe541919ae1c --- /dev/null +++ 
b/ppstructure/docs/kie_en.md @@ -0,0 +1,77 @@ + + +# Key Information Extraction(KIE) + +This section provides a tutorial example on how to quickly use, train, and evaluate a key information extraction(KIE) model, [SDMGR](https://arxiv.org/abs/2103.14470), in PaddleOCR. + +[SDMGR(Spatial Dual-Modality Graph Reasoning)](https://arxiv.org/abs/2103.14470) is a KIE algorithm that classifies each detected text region into predefined categories, such as order ID, invoice number, amount, etc. + + +* [1. Quick Use](#1-quick-use) +* [2. Model Training](#2-model-training) +* [3. Model Evaluation](#3-model-evaluation) + + + +## 1. Quick Use + +[Wildreceipt dataset](https://paperswithcode.com/dataset/wildreceipt) is used for this tutorial. It contains 1765 photos, with 25 classes, and 50000 text boxes, which can be downloaded by wget: + +```shell +wget https://paddleocr.bj.bcebos.com/dygraph_v2.1/kie/wildreceipt.tar && tar xf wildreceipt.tar +``` + +Download the pretrained model and predict the result: + +```shell +cd PaddleOCR/ +wget https://paddleocr.bj.bcebos.com/dygraph_v2.1/kie/kie_vgg16.tar && tar xf kie_vgg16.tar +python3.7 tools/infer_kie.py -c configs/kie/kie_unet_sdmgr.yml -o Global.checkpoints=kie_vgg16/best_accuracy Global.infer_img=../wildreceipt/1.txt +``` + +The prediction result is saved as `./output/sdmgr_kie/predicts_kie.txt`, and the visualization results are saved in the folder `./output/sdmgr_kie/kie_results/`. + +The visualization results are shown in the figure below: + +
+ +
+ + +## 2. Model Training + +Create a softlink to the folder, `PaddleOCR/train_data`: +```shell +cd PaddleOCR/ && mkdir train_data && cd train_data + +ln -s ../../wildreceipt ./ +``` + +The configuration file used for training is `configs/kie/kie_unet_sdmgr.yml`. The default training data path in the configuration file is `train_data/wildreceipt`. After preparing the data, you can execute the model training with the following command: +```shell +python3.7 tools/train.py -c configs/kie/kie_unet_sdmgr.yml -o Global.save_model_dir=./output/kie/ +``` + + +## 3. Model Evaluation + +After training, you can execute the model evaluation with the following command: + +```shell +python3.7 tools/eval.py -c configs/kie/kie_unet_sdmgr.yml -o Global.checkpoints=./output/kie/best_accuracy +``` + +**Reference:** + + + +```bibtex +@misc{sun2021spatial, + title={Spatial Dual-Modality Graph Reasoning for Key Information Extraction}, + author={Hongbin Sun and Zhanghui Kuang and Xiaoyu Yue and Chenhao Lin and Wayne Zhang}, + year={2021}, + eprint={2103.14470}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` diff --git a/ppstructure/docs/quickstart.md b/ppstructure/docs/quickstart.md index 446c577ec39cf24dd4b8699558c633a1308fa444..8d9c0a31c5cbd8d3c26acc6f516f733712e78f27 100644 --- a/ppstructure/docs/quickstart.md +++ b/ppstructure/docs/quickstart.md @@ -1,13 +1,13 @@ # PP-Structure 快速开始 -* [1. 安装PaddleOCR whl包](#1) -* [2. 便捷使用](#2) - + [2.1 命令行使用](#21) - + [2.2 Python脚本使用](#22) - + [2.3 返回结果说明](#23) - + [2.4 参数说明](#24) -* [3. Python脚本使用](#3) - +- [PP-Structure 快速开始](#pp-structure-快速开始) + - [1. 安装依赖包](#1-安装依赖包) + - [2. 便捷使用](#2-便捷使用) + - [2.1 命令行使用](#21-命令行使用) + - [2.2 Python脚本使用](#22-python脚本使用) + - [2.3 返回结果说明](#23-返回结果说明) + - [2.4 参数说明](#24-参数说明) + - [3. Python脚本使用](#3-python脚本使用) @@ -33,6 +33,7 @@ pip3 install -e . 
### 2.1 命令行使用 * 版面分析+表格识别 + ```bash paddleocr --image_dir=../doc/table/1.png --type=structure ``` @@ -46,6 +47,7 @@ coming soon ### 2.2 Python脚本使用 * 版面分析+表格识别 + ```python import os import cv2 @@ -79,9 +81,11 @@ comming soon ### 2.3 返回结果说明 + PP-Structure的返回结果为一个dict组成的list,示例如下 * 版面分析+表格识别 + ```shell [ { 'type': 'Text', @@ -91,13 +95,14 @@ PP-Structure的返回结果为一个dict组成的list,示例如下 } ] ``` + dict 里各个字段说明如下 -| 字段 | 说明 | -| --------------- | -------------| -|type|图片区域的类型| -|bbox|图片区域的在原图的坐标,分别[左上角x,左上角y,右下角x,右下角y]| -|res|图片区域的OCR或表格识别结果。
表格: 表格的HTML字符串;
OCR: 一个包含各个单行文字的检测坐标和识别结果的元组| +| 字段 | 说明 | +| ---- | -------------------------------------------------------------------------------------------------------------------------- | +| type | 图片区域的类型 | +| bbox | 图片区域的在原图的坐标,分别[左上角x,左上角y,右下角x,右下角y] | +| res | 图片区域的OCR或表格识别结果。`
` 表格: 表格的HTML字符串; `
` OCR: 一个包含各个单行文字的检测坐标和识别结果的元组 | * VQA @@ -107,20 +112,20 @@ comming soon ### 2.4 参数说明 -| 字段 | 说明 | 默认值 | -| --------------- | ---------------------------------------- | ------------------------------------------- | -| output | excel和识别结果保存的地址 | ./output/table | -| table_max_len | 表格结构模型预测时,图像的长边resize尺度 | 488 | -| table_model_dir | 表格结构模型 inference 模型地址 | None | -| table_char_type | 表格结构模型所用字典地址 | ../ppocr/utils/dict/table_structure_dict.txt | -| model_name_or_path | VQA SER模型地址 | None | -| max_seq_length | VQA SER模型最大支持token长度 | 512 | -| label_map_path | VQA SER 标签文件地址 | ./vqa/labels/labels_ser.txt | -| mode | pipeline预测模式,structure: 版面分析+表格识别; vqa: ser文档信息抽取 | structure | +| 字段 | 说明 | 默认值 | +| ------------------ | -------------------------------------------------------------------- | -------------------------------------------- | +| output | excel和识别结果保存的地址 | ./output/table | +| table_max_len | 表格结构模型预测时,图像的长边resize尺度 | 488 | +| table_model_dir | 表格结构模型 inference 模型地址 | None | +| table_char_type | 表格结构模型所用字典地址 | ../ppocr/utils/dict/table_structure_dict.txt | +| model_name_or_path | VQA SER模型地址 | None | +| max_seq_length | VQA SER模型最大支持token长度 | 512 | +| label_map_path | VQA SER 标签文件地址 | ./vqa/labels/labels_ser.txt | +| mode | pipeline预测模式,structure: 版面分析+表格识别; vqa: ser文档信息抽取 | structure | 大部分参数和paddleocr whl包保持一致,见 [whl包文档](../doc/doc_ch/whl.md) -运行完成后,每张图片会在`output`字段指定的目录下有一个同名目录,图片里的每个表格会存储为一个excel,图片区域会被裁剪之后保存下来,excel文件和图片名名为表格在图片里的坐标。 +运行完成后,每张图片会在 `output`字段指定的目录下有一个同名目录,图片里的每个表格会存储为一个excel,图片区域会被裁剪之后保存下来,excel文件和图片名名为表格在图片里的坐标。 @@ -133,16 +138,16 @@ cd ppstructure # 下载模型 mkdir inference && cd inference -# 下载超轻量级中文OCR模型的检测模型并解压 -wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar && tar xf ch_ppocr_mobile_v2.0_det_infer.tar -# 下载超轻量级中文OCR模型的识别模型并解压 -wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar && tar xf ch_ppocr_mobile_v2.0_rec_infer.tar -# 下载超轻量级英文表格英寸模型并解压 +# 
下载PP-OCRv2文本检测模型并解压 +wget https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_slim_quant_infer.tar && tar xf ch_PP-OCRv2_det_slim_quant_infer.tar +# 下载PP-OCRv2文本识别模型并解压 +wget https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_infer.tar && tar xf ch_PP-OCRv2_rec_slim_quant_infer.tar +# 下载超轻量级英文表格预测模型并解压 wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar && tar xf en_ppocr_mobile_v2.0_table_structure_infer.tar cd .. -python3 predict_system.py --det_model_dir=inference/ch_ppocr_mobile_v2.0_det_infer \ - --rec_model_dir=inference/ch_ppocr_mobile_v2.0_rec_infer \ +python3 predict_system.py --det_model_dir=inference/ch_PP-OCRv2_det_slim_quant_infer \ + --rec_model_dir=inference/ch_PP-OCRv2_rec_slim_quant_infer \ --table_model_dir=inference/en_ppocr_mobile_v2.0_table_structure_infer \ --image_dir=../doc/table/1.png \ --rec_char_dict_path=../ppocr/utils/ppocr_keys_v1.txt \ @@ -150,7 +155,8 @@ python3 predict_system.py --det_model_dir=inference/ch_ppocr_mobile_v2.0_det_inf --output=../output/table \ --vis_font_path=../doc/fonts/simfang.ttf ``` -运行完成后,每张图片会在`output`字段指定的目录下的`talbe`目录下有一个同名目录,图片里的每个表格会存储为一个excel,图片区域会被裁剪之后保存下来,excel文件和图片名名为表格在图片里的坐标。 + +运行完成后,每张图片会在 `output`字段指定的目录下的 `talbe`目录下有一个同名目录,图片里的每个表格会存储为一个excel,图片区域会被裁剪之后保存下来,excel文件和图片名名为表格在图片里的坐标。 * VQA @@ -168,4 +174,5 @@ python3 predict_system.py --model_name_or_path=vqa/PP-Layout_v1.0_ser_pretrained --image_dir=vqa/images/input/zh_val_0.jpg \ --vis_font_path=../doc/fonts/simfang.ttf ``` -运行完成后,每张图片会在`output`字段指定的目录下的`vqa`目录下存放可视化之后的图片,图片名和输入图片名一致。 + +运行完成后,每张图片会在 `output`字段指定的目录下的 `vqa`目录下存放可视化之后的图片,图片名和输入图片名一致。 diff --git a/ppstructure/table/README.md b/ppstructure/table/README.md index 30a11a20e5de90500d1408f671ba914f336a0b43..150ed34ebdbc375a918542eae883c070069b998b 100644 --- a/ppstructure/table/README.md +++ b/ppstructure/table/README.md @@ -1,7 +1,9 @@ # Table Recognition ## 1. 
pipeline + The table recognition mainly contains three models + 1. Single line text detection-DB 2. Single line text recognition-CRNN 3. Table structure and cell coordinate prediction-RARE @@ -16,13 +18,13 @@ The table recognition flow chart is as follows 4. The cell recognition result and the table structure together construct the html string of the table. ## 2. Performance -We evaluated the algorithm on the PubTabNet[1] eval dataset, and the performance is as follows: +We evaluated the algorithm on the PubTabNet``[1]`` eval dataset, and the performance is as follows: -|Method|[TEDS(Tree-Edit-Distance-based Similarity)](https://github.com/ibm-aur-nlp/PubTabNet/tree/master/src)| -| --- | --- | -| EDD[2] | 88.3 | -| Ours | 93.32 | +| Method | [TEDS(Tree-Edit-Distance-based Similarity)](https://github.com/ibm-aur-nlp/PubTabNet/tree/master/src) | +| ------------------------- | -------------------------------------------------------------------------------------------------- | +| EDD``[2]`` | 88.3 | +| Ours | 93.32 | ## 3. How to use @@ -41,8 +43,9 @@ wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_tab wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar && tar xf en_ppocr_mobile_v2.0_table_structure_infer.tar cd .. 
# run -python3 table/predict_table.py --det_model_dir=inference/en_ppocr_mobile_v2.0_table_det_infer --rec_model_dir=inference/en_ppocr_mobile_v2.0_table_rec_infer --table_model_dir=inference/en_ppocr_mobile_v2.0_table_structure_infer --image_dir=../doc/table/table.jpg --rec_char_dict_path=../ppocr/utils/dict/table_dict.txt --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt --rec_char_dict_path=../ppocr/utils/dict/en_dict.txt --det_limit_side_len=736 --det_limit_type=min --output ../output/table +python3 table/predict_table.py --det_model_dir=inference/en_ppocr_mobile_v2.0_table_det_infer --rec_model_dir=inference/en_ppocr_mobile_v2.0_table_rec_infer --table_model_dir=inference/en_ppocr_mobile_v2.0_table_structure_infer --image_dir=../doc/table/table.jpg --rec_char_dict_path=../ppocr/utils/dict/table_dict.txt --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt --det_limit_side_len=736 --det_limit_type=min --output ../output/table ``` + Note: The above model is trained on the PubLayNet dataset and only supports English scanning scenarios. If you need to identify other scenarios, you need to train the model yourself and replace the three fields `det_model_dir`, `rec_model_dir`, `table_model_dir`. 
After running, the excel sheet of each picture will be saved in the directory specified by the output field @@ -51,11 +54,14 @@ After running, the excel sheet of each picture will be saved in the directory sp In this chapter, we only introduce the training of the table structure model, For model training of [text detection](../../doc/doc_en/detection_en.md) and [text recognition](../../doc/doc_en/recognition_en.md), please refer to the corresponding documents -#### data preparation -The training data uses public data set [PubTabNet](https://arxiv.org/abs/1911.10683 ), Can be downloaded from the official [website](https://github.com/ibm-aur-nlp/PubTabNet) 。The PubTabNet data set contains about 500,000 images, as well as annotations in html format。 +#### data preparation + +The training data uses public data set [PubTabNet](https://arxiv.org/abs/1911.10683), Can be downloaded from the official [website](https://github.com/ibm-aur-nlp/PubTabNet) 。The PubTabNet data set contains about 500,000 images, as well as annotations in html format。 + +#### Start training -#### Start training *If you are installing the cpu version of paddle, please modify the `use_gpu` field in the configuration file to false* + ```shell # single GPU training python3 tools/train.py -c configs/table/table_mv3.yml @@ -80,6 +86,7 @@ python3 tools/train.py -c configs/table/table_mv3.yml -o Global.checkpoints=./yo ### 3.3 Eval The table uses [TEDS(Tree-Edit-Distance-based Similarity)](https://github.com/ibm-aur-nlp/PubTabNet/tree/master/src) as the evaluation metric of the model. Before the model evaluation, the three models in the pipeline need to be exported as inference models (we have provided them), and the gt for evaluation needs to be prepared. Examples of gt are as follows: + ```json {"PMC4289340_004_00.png": [ ["", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "
", "", "", "
", "", "", "
", "", ""], @@ -87,18 +94,22 @@ The table uses [TEDS(Tree-Edit-Distance-based Similarity)](https://github.com/ib [["", "F", "e", "a", "t", "u", "r", "e", ""], ["", "G", "b", "3", " ", "+", ""], ["", "G", "b", "3", " ", "-", ""], ["", "P", "a", "t", "i", "e", "n", "t", "s", ""], ["6", "2"], ["4", "5"]] ]} ``` + In gt json, the key is the image name, the value is the corresponding gt, and gt is a list composed of four items, and each item is + 1. HTML string list of table structure 2. The coordinates of each cell (not including the empty text in the cell) 3. The text information in each cell (not including the empty text in the cell) Use the following command to evaluate. After the evaluation is completed, the teds indicator will be output. + ```python cd PaddleOCR/ppstructure python3 table/eval_table.py --det_model_dir=path/to/det_model_dir --rec_model_dir=path/to/rec_model_dir --table_model_dir=path/to/table_model_dir --image_dir=../doc/table/1.png --rec_char_dict_path=../ppocr/utils/dict/table_dict.txt --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt --det_limit_side_len=736 --det_limit_type=min --gt_path=path/to/gt.json ``` If the PubLatNet eval dataset is used, it will be output + ```bash teds: 93.32 ``` @@ -109,8 +120,10 @@ teds: 93.32 cd PaddleOCR/ppstructure python3 table/predict_table.py --det_model_dir=path/to/det_model_dir --rec_model_dir=path/to/rec_model_dir --table_model_dir=path/to/table_model_dir --image_dir=../doc/table/1.png --rec_char_dict_path=../ppocr/utils/dict/table_dict.txt --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt --rec_char_type=EN --det_limit_side_len=736 --det_limit_type=min --output ../output/table ``` + After running, the excel sheet of each picture will be saved in the directory specified by the output field Reference + 1. https://github.com/ibm-aur-nlp/PubTabNet 2. 
https://arxiv.org/pdf/1911.10683 diff --git a/ppstructure/table/README_ch.md b/ppstructure/table/README_ch.md index 33276b36e4973e83d7efa673b90013cf5727dfe2..d0fae97dc462555a155bb645dc1a7f559e069f11 100644 --- a/ppstructure/table/README_ch.md +++ b/ppstructure/table/README_ch.md @@ -1,17 +1,23 @@ # 表格识别 -* [1. 表格识别 pipeline](#1) -* [2. 性能](#2) -* [3. 使用](#3) - + [3.1 快速开始](#31) - + [3.2 训练](#32) - + [3.3 评估](#33) - + [3.4 预测](#34) +- [表格识别](#表格识别) + - [1. 表格识别 pipeline](#1-表格识别-pipeline) + - [2. 性能](#2-性能) + - [3. 使用](#3-使用) + - [3.1 快速开始](#31-快速开始) + - [3.2 训练](#32-训练) + - [数据准备](#数据准备) + - [启动训练](#启动训练) + - [断点训练](#断点训练) + - [3.3 评估](#33-评估) + - [3.4 预测](#34-预测) + ## 1. 表格识别 pipeline 表格识别主要包含三个模型 + 1. 单行文本检测-DB 2. 单行文本识别-CRNN 3. 表格结构和cell坐标预测-RARE @@ -27,20 +33,23 @@ 3. 由单行文字的坐标、识别结果和单元格的坐标一起组合出单元格的识别结果。 4. 单元格的识别结果和表格结构一起构造表格的html字符串。 - + ## 2. 性能 -我们在 PubTabNet[1] 评估数据集上对算法进行了评估,性能如下 +我们在 PubTabNet``[1]`` 评估数据集上对算法进行了评估,性能如下 -|算法|[TEDS(Tree-Edit-Distance-based Similarity)](https://github.com/ibm-aur-nlp/PubTabNet/tree/master/src)| -| --- | --- | -| EDD[2] | 88.3 | -| Ours | 93.32 | +| 算法 | [TEDS(Tree-Edit-Distance-based Similarity)](https://github.com/ibm-aur-nlp/PubTabNet/tree/master/src) | +| ------------------------- | -------------------------------------------------------------------------------------------------- | +| EDD``[2]`` | 88.3 | +| Ours | 93.32 | + ## 3. 使用 + + ### 3.1 快速开始 ```python @@ -56,20 +65,27 @@ wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_tab wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar && tar xf en_ppocr_mobile_v2.0_table_structure_infer.tar cd .. 
# 执行预测 -python3 table/predict_table.py --det_model_dir=inference/en_ppocr_mobile_v2.0_table_det_infer --rec_model_dir=inference/en_ppocr_mobile_v2.0_table_rec_infer --table_model_dir=inference/en_ppocr_mobile_v2.0_table_structure_infer --image_dir=../doc/table/table.jpg --rec_char_dict_path=../ppocr/utils/dict/table_dict.txt --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt --rec_char_dict_path=../ppocr/utils/dict/en_dict.txt --det_limit_side_len=736 --det_limit_type=min --output ../output/table +python3 table/predict_table.py --det_model_dir=inference/en_ppocr_mobile_v2.0_table_det_infer --rec_model_dir=inference/en_ppocr_mobile_v2.0_table_rec_infer --table_model_dir=inference/en_ppocr_mobile_v2.0_table_structure_infer --image_dir=../doc/table/table.jpg --rec_char_dict_path=../ppocr/utils/dict/table_dict.txt --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt --det_limit_side_len=736 --det_limit_type=min --output ../output/table ``` + 运行完成后,每张图片的excel表格会保存到output字段指定的目录下 note: 上述模型是在 PubLayNet 数据集上训练的表格识别模型,仅支持英文扫描场景,如需识别其他场景需要自己训练模型后替换 `det_model_dir`,`rec_model_dir`,`table_model_dir`三个字段即可。 + + ### 3.2 训练 + 在这一章节中,我们仅介绍表格结构模型的训练,[文字检测](../../doc/doc_ch/detection.md)和[文字识别](../../doc/doc_ch/recognition.md)的模型训练请参考对应的文档。 -#### 数据准备 +#### 数据准备 + 训练数据使用公开数据集PubTabNet ([论文](https://arxiv.org/abs/1911.10683),[下载地址](https://github.com/ibm-aur-nlp/PubTabNet))。PubTabNet数据集包含约50万张表格数据的图像,以及图像对应的html格式的注释。 -#### 启动训练 +#### 启动训练 + *如果您安装的是cpu版本,请将配置文件中的 `use_gpu` 字段修改为false* + ```shell # 单机单卡训练 python3 tools/train.py -c configs/table/table_mv3.yml @@ -82,16 +98,19 @@ python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/ #### 断点训练 如果训练程序中断,如果希望加载训练中断的模型从而恢复训练,可以通过指定Global.checkpoints指定要加载的模型路径: + ```shell python3 tools/train.py -c configs/table/table_mv3.yml -o Global.checkpoints=./your/trained/model ``` 
-**注意**:`Global.checkpoints`的优先级高于`Global.pretrain_weights`的优先级,即同时指定两个参数时,优先加载`Global.checkpoints`指定的模型,如果`Global.checkpoints`指定的模型路径有误,会加载`Global.pretrain_weights`指定的模型。 +**注意**:`Global.checkpoints`的优先级高于 `Global.pretrain_weights`的优先级,即同时指定两个参数时,优先加载 `Global.checkpoints`指定的模型,如果 `Global.checkpoints`指定的模型路径有误,会加载 `Global.pretrain_weights`指定的模型。 + ### 3.3 评估 表格使用 [TEDS(Tree-Edit-Distance-based Similarity)](https://github.com/ibm-aur-nlp/PubTabNet/tree/master/src) 作为模型的评估指标。在进行模型评估之前,需要将pipeline中的三个模型分别导出为inference模型(我们已经提供好),还需要准备评估的gt, gt示例如下: + ```json {"PMC4289340_004_00.png": [ ["", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "
", "", "", "
", "", "", "
", "", ""], @@ -99,21 +118,28 @@ python3 tools/train.py -c configs/table/table_mv3.yml -o Global.checkpoints=./yo [["", "F", "e", "a", "t", "u", "r", "e", ""], ["", "G", "b", "3", " ", "+", ""], ["", "G", "b", "3", " ", "-", ""], ["", "P", "a", "t", "i", "e", "n", "t", "s", ""], ["6", "2"], ["4", "5"]] ]} ``` + json 中,key为图片名,value为对应的gt,gt是一个由三个item组成的list,每个item分别为 + 1. 表格结构的html字符串list 2. 每个cell的坐标 (不包括cell里文字为空的) 3. 每个cell里的文字信息 (不包括cell里文字为空的) 准备完成后使用如下命令进行评估,评估完成后会输出teds指标。 + ```python cd PaddleOCR/ppstructure python3 table/eval_table.py --det_model_dir=path/to/det_model_dir --rec_model_dir=path/to/rec_model_dir --table_model_dir=path/to/table_model_dir --image_dir=../doc/table/1.png --rec_char_dict_path=../ppocr/utils/dict/table_dict.txt --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt --det_limit_side_len=736 --det_limit_type=min --gt_path=path/to/gt.json ``` + 如使用PubLatNet评估数据集,将会输出 + ```bash teds: 93.32 ``` + + ### 3.4 预测 ```python @@ -122,5 +148,6 @@ python3 table/predict_table.py --det_model_dir=path/to/det_model_dir --rec_model ``` Reference + 1. https://github.com/ibm-aur-nlp/PubTabNet 2. https://arxiv.org/pdf/1911.10683 diff --git a/ppstructure/vqa/README-en.md b/ppstructure/vqa/README-en.md new file mode 100644 index 0000000000000000000000000000000000000000..168640874aa5e2339e81d7dc467e515d5aa9101e --- /dev/null +++ b/ppstructure/vqa/README-en.md @@ -0,0 +1,331 @@ +# Document Visual Q&A(DOC-VQA) + +Document Visual Q&A, mainly for the image content of the question and answer, DOC-VQA is a type of VQA task, DOC-VQA mainly asks questions about the textual content of text images. + +The DOC-VQA algorithm in PP-Structure is developed based on PaddleNLP natural language processing algorithm library. + +The main features are as follows: + +- Integrated LayoutXLM model and PP-OCR prediction engine. +- Support Semantic Entity Recognition (SER) and Relation Extraction (RE) tasks based on multi-modal methods. 
Based on the SER task, text recognition and classification in images can be completed. Based on the RE task, relations between the text contents in the image can be extracted, such as identifying question-answer pairs. + +- Support custom training for SER and RE tasks. + +- Support OCR+SER end-to-end system prediction and evaluation. + +- Support OCR+SER+RE end-to-end system prediction. + +**Note**: This project is based on the open-source implementation of [LayoutXLM](https://arxiv.org/pdf/2104.08836.pdf) on Paddle 2.2. It has been polished in depth by the PaddlePaddle team and the **Industrial and Commercial Bank of China** on real estate certificate scenarios, and is jointly open-sourced by both. + + +## 1.Performance + +We evaluated the algorithm on the Chinese dataset of [XFUND](https://github.com/doc-analysis/XFUND), and the performance is as follows + +| Model | Task | F1 | Model Download Link | +|:---:|:---:|:---:| :---:| +| LayoutXLM | RE | 0.7113 | [Link](https://paddleocr.bj.bcebos.com/pplayout/PP-Layout_v1.0_re_pretrained.tar) | +| LayoutXLM | SER | 0.9056 | [Link](https://paddleocr.bj.bcebos.com/pplayout/PP-Layout_v1.0_ser_pretrained.tar) | +| LayoutLM | SER | 0.78 | [Link](https://paddleocr.bj.bcebos.com/pplayout/LayoutLM_ser_pretrained.tar) | + + + +## 2.Demonstration + +**Note**: the test images are from the XFUND dataset. + +### 2.1 SER + +![](./images/result_ser/zh_val_0_ser.jpg) | ![](./images/result_ser/zh_val_42_ser.jpg) +---|--- + +Boxes of different colors in the figure represent different categories. For the XFUND dataset, there are three categories: query, answer and header: + +* Dark purple: header +* Light purple: query +* Army green: answer + +The corresponding category and OCR recognition results are also marked at the top left of the OCR detection box. 
+ + +### 2.2 RE + +![](./images/result_re/zh_val_21_re.jpg) | ![](./images/result_re/zh_val_40_re.jpg) +---|--- + + +In the figure, the red box represents the question, the blue box represents the answer, and the question and answer are connected by green lines. The corresponding category and OCR recognition results are also marked at the top left of the OCR detection box. + + +## 3. Setup + +### 3.1 Install dependencies + +- **(1) Install PaddlePaddle** + +```bash +pip3 install --upgrade pip + +# GPU PaddlePaddle Install +python3 -m pip install paddlepaddle-gpu==2.2 -i https://mirror.baidu.com/pypi/simple + +# CPU PaddlePaddle Install +python3 -m pip install paddlepaddle==2.2 -i https://mirror.baidu.com/pypi/simple + +``` +For more requirements, please refer to the [instructions](https://www.paddlepaddle.org.cn/install/quick) in the installation document. + + +### 3.2 Install PaddleOCR (including pp-ocr and VQA) + +- **(1) Quick install of the paddleocr whl package via pip (prediction only)** + +```bash +pip install paddleocr +``` + +- **(2) Download VQA source code (prediction + training)** + +```bash +[recommended] git clone https://github.com/PaddlePaddle/PaddleOCR + +# If you cannot pull successfully because of network problems, you can also use the mirror hosted on Gitee: +git clone https://gitee.com/paddlepaddle/PaddleOCR + +# Note: the code hosted on Gitee may not be synchronized with this GitHub project in real time; there can be a delay of 3 ~ 5 days. Please give priority to the recommended method. +``` + +- **(3) Install PaddleNLP** + +```bash +# You need to use the latest code version of paddlenlp for installation +git clone https://github.com/PaddlePaddle/PaddleNLP -b develop +cd PaddleNLP +pip3 install -e . 
+``` + + +- **(4) Install requirements for VQA** + +```bash +cd ppstructure/vqa +pip install -r requirements.txt +``` + +## 4.Usage + + +### 4.1 Data and pre-trained model preparation + +The processed XFUND Chinese dataset can be downloaded from: [https://paddleocr.bj.bcebos.com/dataset/XFUND.tar](https://paddleocr.bj.bcebos.com/dataset/XFUND.tar). + + +Download and unzip the dataset, and then place the dataset in the current directory. + +```shell +wget https://paddleocr.bj.bcebos.com/dataset/XFUND.tar +``` + +If you want to convert the XFUND datasets of other languages, you can refer to the [XFUND data conversion script](helper/trans_xfun_data.py). + +If you want to experience the prediction process directly, you can download the pre-trained models provided by us, skip the training process and predict directly. + + +### 4.2 SER Task + +* Start training + +```shell +python3.7 train_ser.py \ + --model_name_or_path "layoutxlm-base-uncased" \ + --ser_model_type "LayoutXLM" \ + --train_data_dir "XFUND/zh_train/image" \ + --train_label_path "XFUND/zh_train/xfun_normalize_train.json" \ + --eval_data_dir "XFUND/zh_val/image" \ + --eval_label_path "XFUND/zh_val/xfun_normalize_val.json" \ + --num_train_epochs 200 \ + --eval_steps 10 \ + --output_dir "./output/ser/" \ + --learning_rate 5e-5 \ + --warmup_steps 50 \ + --evaluate_during_training \ + --seed 2048 +``` + +Finally, Precision, Recall, F1 and other indicators will be printed, and the model and training log will be saved in the `./output/ser/` folder. 
+ +* Resume training + +```shell +python3.7 train_ser.py \ + --model_name_or_path "model_path" \ + --ser_model_type "LayoutXLM" \ + --train_data_dir "XFUND/zh_train/image" \ + --train_label_path "XFUND/zh_train/xfun_normalize_train.json" \ + --eval_data_dir "XFUND/zh_val/image" \ + --eval_label_path "XFUND/zh_val/xfun_normalize_val.json" \ + --num_train_epochs 200 \ + --eval_steps 10 \ + --output_dir "./output/ser/" \ + --learning_rate 5e-5 \ + --warmup_steps 50 \ + --evaluate_during_training \ + --num_workers 8 \ + --seed 2048 \ + --resume +``` + +* Evaluation +```shell +export CUDA_VISIBLE_DEVICES=0 +python3 eval_ser.py \ + --model_name_or_path "PP-Layout_v1.0_ser_pretrained/" \ + --ser_model_type "LayoutXLM" \ + --eval_data_dir "XFUND/zh_val/image" \ + --eval_label_path "XFUND/zh_val/xfun_normalize_val.json" \ + --per_gpu_eval_batch_size 8 \ + --num_workers 8 \ + --output_dir "output/ser/" \ + --seed 2048 +``` +Finally, Precision, Recall, F1 and other indicators will be printed. + +* Prediction using the OCR recognition results provided in the evaluation set + +```shell +export CUDA_VISIBLE_DEVICES=0 +python3.7 infer_ser.py \ + --model_name_or_path "PP-Layout_v1.0_ser_pretrained/" \ + --ser_model_type "LayoutXLM" \ + --output_dir "output/ser/" \ + --infer_imgs "XFUND/zh_val/image/" \ + --ocr_json_path "XFUND/zh_val/xfun_normalize_val.json" +``` + +Finally, the visualized images of the prediction results and the text file of the prediction results are saved in the `output_res` directory. The text file is named `infer_results.txt`. 
+ +* Using OCR engine + SER concatenation results + +```shell +export CUDA_VISIBLE_DEVICES=0 +python3.7 infer_ser_e2e.py \ + --model_name_or_path "PP-Layout_v1.0_ser_pretrained/" \ + --ser_model_type "LayoutXLM" \ + --max_seq_length 512 \ + --output_dir "output/ser_e2e/" \ + --infer_imgs "images/input/zh_val_0.jpg" +``` + +* End-to-end evaluation of OCR engine + SER prediction system + +```shell +export CUDA_VISIBLE_DEVICES=0 +python3.7 helper/eval_with_label_end2end.py --gt_json_path XFUND/zh_val/xfun_normalize_val.json --pred_json_path output_res/infer_results.txt +``` + + +### 4.3 RE Task + +* Start training + +```shell +export CUDA_VISIBLE_DEVICES=0 +python3 train_re.py \ + --model_name_or_path "layoutxlm-base-uncased" \ + --train_data_dir "XFUND/zh_train/image" \ + --train_label_path "XFUND/zh_train/xfun_normalize_train.json" \ + --eval_data_dir "XFUND/zh_val/image" \ + --eval_label_path "XFUND/zh_val/xfun_normalize_val.json" \ + --label_map_path "labels/labels_ser.txt" \ + --num_train_epochs 200 \ + --eval_steps 10 \ + --output_dir "output/re/" \ + --learning_rate 5e-5 \ + --warmup_steps 50 \ + --per_gpu_train_batch_size 8 \ + --per_gpu_eval_batch_size 8 \ + --num_workers 8 \ + --evaluate_during_training \ + --seed 2048 + +``` + +* Resume training + +```shell +export CUDA_VISIBLE_DEVICES=0 +python3 train_re.py \ + --model_name_or_path "model_path" \ + --train_data_dir "XFUND/zh_train/image" \ + --train_label_path "XFUND/zh_train/xfun_normalize_train.json" \ + --eval_data_dir "XFUND/zh_val/image" \ + --eval_label_path "XFUND/zh_val/xfun_normalize_val.json" \ + --label_map_path "labels/labels_ser.txt" \ + --num_train_epochs 2 \ + --eval_steps 10 \ + --output_dir "output/re/" \ + --learning_rate 5e-5 \ + --warmup_steps 50 \ + --per_gpu_train_batch_size 8 \ + --per_gpu_eval_batch_size 8 \ + --num_workers 8 \ + --evaluate_during_training \ + --seed 2048 \ + --resume + +``` + +Finally, Precision, Recall, F1 and other indicators will be printed, and the model and 
training log will be saved in the `output/re/` folder. + +* Evaluation +```shell +export CUDA_VISIBLE_DEVICES=0 +python3 eval_re.py \ + --model_name_or_path "PP-Layout_v1.0_re_pretrained/" \ + --max_seq_length 512 \ + --eval_data_dir "XFUND/zh_val/image" \ + --eval_label_path "XFUND/zh_val/xfun_normalize_val.json" \ + --label_map_path "labels/labels_ser.txt" \ + --output_dir "output/re/" \ + --per_gpu_eval_batch_size 8 \ + --num_workers 8 \ + --seed 2048 +``` +Finally, Precision, Recall, F1 and other indicators will be printed. + + +* Prediction using the OCR recognition results provided in the evaluation set + +```shell +export CUDA_VISIBLE_DEVICES=0 +python3 infer_re.py \ + --model_name_or_path "PP-Layout_v1.0_re_pretrained/" \ + --max_seq_length 512 \ + --eval_data_dir "XFUND/zh_val/image" \ + --eval_label_path "XFUND/zh_val/xfun_normalize_val.json" \ + --label_map_path "labels/labels_ser.txt" \ + --output_dir "output/re/" \ + --per_gpu_eval_batch_size 1 \ + --seed 2048 +``` + +The visualized images of the prediction results and the text file of the prediction results are saved in the `output_res` folder; the text file is named `infer_results.txt`. + +* Prediction using the cascaded OCR engine + SER + RE pipeline + +```shell +export CUDA_VISIBLE_DEVICES=0 +python3.7 infer_ser_re_e2e.py \ + --model_name_or_path "PP-Layout_v1.0_ser_pretrained/" \ + --re_model_name_or_path "PP-Layout_v1.0_re_pretrained/" \ + --ser_model_type "LayoutXLM" \ + --max_seq_length 512 \ + --output_dir "output/ser_re_e2e/" \ + --infer_imgs "images/input/zh_val_21.jpg" +``` + +## Reference + +- LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding, https://arxiv.org/pdf/2104.08836.pdf +- microsoft/unilm/layoutxlm, https://github.com/microsoft/unilm/tree/master/layoutxlm +- XFUND dataset, https://github.com/doc-analysis/XFUND diff --git a/ppstructure/vqa/README.md b/ppstructure/vqa/README.md index 
b5e95fd219961363d6c1c09330ea795e11725a4e..619ada71a82eacd88abd39199d0b220dc6c64c9b 100644 --- a/ppstructure/vqa/README.md +++ b/ppstructure/vqa/README.md @@ -15,7 +15,7 @@ PP-Structure 里的 DOC-VQA算法基于PaddleNLP自然语言处理算法库进 **Note**:本项目基于 [LayoutXLM](https://arxiv.org/pdf/2104.08836.pdf) 在Paddle 2.2上的开源实现,同时经过飞桨团队与**中国工商银行**在不动产证场景深入打磨,联合开源。 -## 1 性能 +## 1.性能 我们在 [XFUN](https://github.com/doc-analysis/XFUND) 的中文数据集上对算法进行了评估,性能如下 @@ -27,7 +27,7 @@ PP-Structure 里的 DOC-VQA算法基于PaddleNLP自然语言处理算法库进 -## 2. 效果演示 +## 2.效果演示 **注意:** 测试图片来源于XFUN数据集。 @@ -54,7 +54,7 @@ PP-Structure 里的 DOC-VQA算法基于PaddleNLP自然语言处理算法库进 图中红色框表示问题,蓝色框表示答案,问题和答案之间使用绿色线连接。在OCR检测框的左上方也标出了对应的类别和OCR识别结果。 -## 3. 安装 +## 3.安装 ### 3.1 安装依赖 @@ -211,7 +211,7 @@ python3 helper/eval_with_label_end2end.py --gt_json_path XFUND/zh_val/xfun_norma ``` -### 3.3 RE任务 +### 4.3 RE任务 * 启动训练 diff --git a/tools/infer_rec.py b/tools/infer_rec.py index adc3c1c3c49dcaad5ec8657f5d32b2eca8e10a40..b0c836ff3bcee5b9009b62c60483697db8091bde 100755 --- a/tools/infer_rec.py +++ b/tools/infer_rec.py @@ -137,7 +137,7 @@ def main(): if info is not None: logger.info("\t result: {}".format(info)) - fout.write(file + "\t" + info) + fout.write(os.path.basename(file) + "\t" + info + "\n") logger.info("success!")