diff --git a/.gitignore b/.gitignore index 3300be325f1f6c8b2b58301fc87a4f9d241afb84..3a05fb74687f2b12790f2f73fc96cf8a6abb2bd3 100644 --- a/.gitignore +++ b/.gitignore @@ -31,4 +31,4 @@ paddleocr.egg-info/ /deploy/android_demo/app/.cxx/ /deploy/android_demo/app/cache/ test_tipc/web/models/ -test_tipc/web/node_modules/ +test_tipc/web/node_modules/ \ No newline at end of file diff --git a/PPOCRLabel/PPOCRLabel.py b/PPOCRLabel/PPOCRLabel.py index 827f1cf76846d0e232e980bc21f45ae0cd1a640b..390c2b159575bf1c60387e42b5be3d917ba845f7 100644 --- a/PPOCRLabel/PPOCRLabel.py +++ b/PPOCRLabel/PPOCRLabel.py @@ -2285,7 +2285,7 @@ class MainWindow(QMainWindow): ''' Table Recegnition ''' - from paddleocr.ppstructure.table.predict_table import to_excel + from paddleocr import to_excel import time @@ -2309,7 +2309,7 @@ class MainWindow(QMainWindow): # ONLY SUPPORT ONE TABLE in one image hasTable = False for region in res: - if region['type'] == 'Table': + if region['type'] == 'table': if region['res']['boxes'] is None: msg = 'Can not recognise the detection box in ' + self.filePath + '. Please change manually' QMessageBox.information(self, "Information", msg) @@ -2335,10 +2335,7 @@ class MainWindow(QMainWindow): bbox = np.array(region['res']['boxes'][i]) rec_text = region['res']['rec_res'][i][0] - # polys to rectangles - x1, y1 = np.min(bbox[:, 0]), np.min(bbox[:, 1]) - x2, y2 = np.max(bbox[:, 0]), np.max(bbox[:, 1]) - rext_bbox = [[x1, y1], [x2, y1], [x2, y2], [x1, y2]] + rext_bbox = [[bbox[0], bbox[1]], [bbox[2], bbox[1]], [bbox[2], bbox[3]], [bbox[0], bbox[3]]] # save bbox to shape shape = Shape(label=rec_text, line_color=DEFAULT_LINE_COLOR, key_cls=None) @@ -2452,13 +2449,6 @@ class MainWindow(QMainWindow): export PPLabel and CSV to JSON (PubTabNet) ''' import pandas as pd - from libs.dataPartitionDialog import DataPartitionDialog - - # data partition user input - partitionDialog = DataPartitionDialog(parent=self) - partitionDialog.exec() - if partitionDialog.getStatus() == False: - return # automatically save annotations self.saveFilestate() @@ -2481,28 +2471,19 @@ class MainWindow(QMainWindow): labeldict[file] = eval(label) else: labeldict[file] = [] + + # read table recognition output + TableRec_excel_dir = os.path.join( + self.lastOpenDir, 'tableRec_excel_output') - train_split, val_split, test_split = partitionDialog.getDataPartition() - # check validate - if train_split + val_split + test_split > 100: - msg = "The sum of training, validation and testing data should be less than 100%" - QMessageBox.information(self, "Information", msg) - return - print(train_split, val_split, test_split) - train_split, val_split, test_split = float(train_split) / 100., float(val_split) / 100., float(test_split) / 100. 
- train_id = int(len(labeldict) * train_split) - val_id = int(len(labeldict) * (train_split + val_split)) - print('Data partition: train:', train_id, - 'validation:', val_id - train_id, - 'test:', len(labeldict) - val_id) - - TableRec_excel_dir = os.path.join(self.lastOpenDir, 'tableRec_excel_output') - json_results = [] - imgid = 0 + # save txt + fid = open( + "{}/gt.txt".format(self.lastOpenDir), "w", encoding='utf-8') for image_path in labeldict.keys(): # load csv annotations filename, _ = os.path.splitext(os.path.basename(image_path)) - csv_path = os.path.join(TableRec_excel_dir, filename + '.xlsx') + csv_path = os.path.join( + TableRec_excel_dir, filename + '.xlsx') if not os.path.exists(csv_path): continue @@ -2521,28 +2502,31 @@ class MainWindow(QMainWindow): cells = [] for anno in labeldict[image_path]: tokens = list(anno['transcription']) - obb = anno['points'] - hbb = OBB2HBB(np.array(obb)).tolist() - cells.append({'tokens': tokens, 'bbox': hbb}) - - # data split - if imgid < train_id: - split = 'train' - elif imgid < val_id: - split = 'val' - else: - split = 'test' - - # save dict - html = {'structure': {'tokens': token_list}, 'cell': cells} - json_results.append({'filename': os.path.basename(image_path), 'split': split, 'imgid': imgid, 'html': html}) - imgid += 1 - - # save json - with open("{}/annotation.json".format(self.lastOpenDir), "w", encoding='utf-8') as fid: - fid.write(json.dumps(json_results, ensure_ascii=False)) - - msg = 'JSON sucessfully saved in {}/annotation.json'.format(self.lastOpenDir) + cells.append({ + 'tokens': tokens, + 'bbox': anno['points'] + }) + + # 构造标注信息 + html = { + 'structure': { + 'tokens': token_list + }, + 'cells': cells + } + d = { + 'filename': os.path.basename(image_path), + 'html': html + } + # 重构HTML + d['gt'] = rebuild_html_from_ppstructure_label(d) + fid.write('{}\n'.format( + json.dumps( + d, ensure_ascii=False))) + + # convert to PP-Structure label format + fid.close() + msg = 'JSON sucessfully saved in {}/gt.txt'.format(self.lastOpenDir) QMessageBox.information(self, "Information", msg) def autolcm(self): diff --git a/PPOCRLabel/README.md b/PPOCRLabel/README.md index 3bdc336827adb87f52e9baa2c012304595b2c656..089a63fd55bb8c127104e7c404852ba52c3ac88c 100644 --- a/PPOCRLabel/README.md +++ b/PPOCRLabel/README.md @@ -1,10 +1,14 @@ English | [简体中文](README_ch.md) -# PPOCRLabel +# PPOCRLabelv2 -PPOCRLabel is a semi-automatic graphic annotation tool suitable for OCR field, with built-in PP-OCR model to automatically detect and re-recognize data. It is written in python3 and pyqt5, supporting rectangular box, table and multi-point annotation modes. Annotations can be directly used for the training of PP-OCR detection and recognition models. +PPOCRLabelv2 is a semi-automatic graphic annotation tool suitable for OCR field, with built-in PP-OCR model to automatically detect and re-recognize data. It is written in Python3 and PyQT5, supporting rectangular box, table, irregular text and key information annotation modes. Annotations can be directly used for the training of PP-OCR detection and recognition models. 
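A note on the `PPOCRLabel.py` hunks above: the table export now writes one JSON record per line to `gt.txt` (with `filename`, an `html` dict of structure tokens and cells, and a `gt` HTML string rebuilt by `rebuild_html_from_ppstructure_label`), and `to_excel` is imported from the top-level `paddleocr` package. A minimal sketch, not taken from this PR, of how such a `gt.txt` could be read back and re-exported to Excel (output paths are hypothetical):

```python
# Illustrative only: parse the gt.txt written by the new table export and
# convert each rebuilt HTML table back into an .xlsx file.
import json
import os

from paddleocr import to_excel  # exported at top level by the __init__.py change in this PR


def gt_txt_to_excel(gt_path, out_dir):
    os.makedirs(out_dir, exist_ok=True)
    with open(gt_path, "r", encoding="utf-8") as f:
        for line in f:
            info = json.loads(line.strip())
            # 'filename' is the image name, 'html' holds structure tokens and cells,
            # 'gt' is the HTML string produced by rebuild_html_from_ppstructure_label
            name, _ = os.path.splitext(info["filename"])
            to_excel(info["gt"], os.path.join(out_dir, name + ".xlsx"))


gt_txt_to_excel("./train_data/tables/gt.txt", "./tableRec_excel_output")  # hypothetical paths
```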
- +| regular text annotation | table annotation | +| :-------------------------------------------------: | :--------------------------------------------: | +| | | +| **irregular text annotation** | **key information annotation** | +| | | ### Recent Update diff --git a/PPOCRLabel/README_ch.md b/PPOCRLabel/README_ch.md index 107f902a68bd68b30d286e8dd88b29752f0c6ad0..3ea684a3f09a6084403fa0b91e2511b7fd790f4b 100644 --- a/PPOCRLabel/README_ch.md +++ b/PPOCRLabel/README_ch.md @@ -1,10 +1,14 @@ [English](README.md) | 简体中文 -# PPOCRLabel +# PPOCRLabelv2 PPOCRLabel是一款适用于OCR领域的半自动化图形标注工具,内置PP-OCR模型对数据自动标注和重新识别。使用Python3和PyQT5编写,支持矩形框标注和四点标注模式,导出格式可直接用于PaddleOCR检测和识别模型的训练。 - +| 常规标注 | 表格标注 | +| :-------------------------------------------------: | :--------------------------------------------: | +| | | +| **不规则文本标注** | **关键信息标注** | +| | | #### 近期更新 - 2022.05:**新增表格标注**,使用方法见下方`2.2 表格标注`(by [whjdark](https://github.com/peterh0323); [Evezerest](https://github.com/Evezerest)) diff --git a/PPOCRLabel/libs/canvas.py b/PPOCRLabel/libs/canvas.py index ae9511612a2ba83001c12ae8ed82498952207f98..81f37995126140b03650f5ddea37ea282d5ceb09 100644 --- a/PPOCRLabel/libs/canvas.py +++ b/PPOCRLabel/libs/canvas.py @@ -627,7 +627,7 @@ class Canvas(QWidget): # adaptive BBOX label & index font size if self.pixmap: h, w = self.pixmap.size().height(), self.pixmap.size().width() - fontszie = int(max(h, w) / 96) + fontszie = int(max(h, w) / 48) for s in self.shapes: s.fontsize = fontszie diff --git a/PPOCRLabel/libs/dataPartitionDialog.py b/PPOCRLabel/libs/dataPartitionDialog.py deleted file mode 100644 index 33bd491552fe773bd07020d82f7ea9bab76e7557..0000000000000000000000000000000000000000 --- a/PPOCRLabel/libs/dataPartitionDialog.py +++ /dev/null @@ -1,113 +0,0 @@ -try: - from PyQt5.QtGui import * - from PyQt5.QtCore import * - from PyQt5.QtWidgets import * -except ImportError: - from PyQt4.QtGui import * - from PyQt4.QtCore import * - -from libs.utils import newIcon - -import time -import datetime -import json -import cv2 -import numpy as np - - -BB = QDialogButtonBox - -class DataPartitionDialog(QDialog): - def __init__(self, parent=None): - super().__init__() - self.parnet = parent - self.title = 'DATA PARTITION' - - self.train_ratio = 70 - self.val_ratio = 15 - self.test_ratio = 15 - - self.initUI() - - def initUI(self): - self.setWindowTitle(self.title) - self.setWindowModality(Qt.ApplicationModal) - - self.flag_accept = True - - if self.parnet.lang == 'ch': - msg = "导出JSON前请保存所有图像的标注且关闭EXCEL!" - else: - msg = "Please save all the annotations and close the EXCEL before exporting JSON!" 
- - info_msg = QLabel(msg, self) - info_msg.setWordWrap(True) - info_msg.setStyleSheet("color: red") - info_msg.setFont(QFont('Arial', 12)) - - train_lbl = QLabel('Train split: ', self) - train_lbl.setFont(QFont('Arial', 15)) - val_lbl = QLabel('Valid split: ', self) - val_lbl.setFont(QFont('Arial', 15)) - test_lbl = QLabel('Test split: ', self) - test_lbl.setFont(QFont('Arial', 15)) - - self.train_input = QLineEdit(self) - self.train_input.setFont(QFont('Arial', 15)) - self.val_input = QLineEdit(self) - self.val_input.setFont(QFont('Arial', 15)) - self.test_input = QLineEdit(self) - self.test_input.setFont(QFont('Arial', 15)) - - self.train_input.setText(str(self.train_ratio)) - self.val_input.setText(str(self.val_ratio)) - self.test_input.setText(str(self.test_ratio)) - - validator = QIntValidator(0, 100) - self.train_input.setValidator(validator) - self.val_input.setValidator(validator) - self.test_input.setValidator(validator) - - gridlayout = QGridLayout() - gridlayout.addWidget(info_msg, 0, 0, 1, 2) - gridlayout.addWidget(train_lbl, 1, 0) - gridlayout.addWidget(val_lbl, 2, 0) - gridlayout.addWidget(test_lbl, 3, 0) - gridlayout.addWidget(self.train_input, 1, 1) - gridlayout.addWidget(self.val_input, 2, 1) - gridlayout.addWidget(self.test_input, 3, 1) - - bb = BB(BB.Ok | BB.Cancel, Qt.Horizontal, self) - bb.button(BB.Ok).setIcon(newIcon('done')) - bb.button(BB.Cancel).setIcon(newIcon('undo')) - bb.accepted.connect(self.validate) - bb.rejected.connect(self.cancel) - gridlayout.addWidget(bb, 4, 0, 1, 2) - - self.setLayout(gridlayout) - - self.show() - - def validate(self): - self.flag_accept = True - self.accept() - - def cancel(self): - self.flag_accept = False - self.reject() - - def getStatus(self): - return self.flag_accept - - def getDataPartition(self): - self.train_ratio = int(self.train_input.text()) - self.val_ratio = int(self.val_input.text()) - self.test_ratio = int(self.test_input.text()) - - return self.train_ratio, self.val_ratio, self.test_ratio - - def closeEvent(self, event): - self.flag_accept = False - self.reject() - - diff --git a/PPOCRLabel/libs/utils.py b/PPOCRLabel/libs/utils.py index e397f139e0cf34de4fd517f920dd3fef12cc2cd7..1bd46ab4dac65f4e63e4ac4b2af5a8d295d89671 100644 --- a/PPOCRLabel/libs/utils.py +++ b/PPOCRLabel/libs/utils.py @@ -176,18 +176,6 @@ def boxPad(box, imgShape, pad : int) -> np.array: return box -def OBB2HBB(obb) -> np.array: - """ - Convert Oriented Bounding Box to Horizontal Bounding Box. - """ - hbb = np.zeros(4, dtype=np.int32) - hbb[0] = min(obb[:, 0]) - hbb[1] = min(obb[:, 1]) - hbb[2] = max(obb[:, 0]) - hbb[3] = max(obb[:, 1]) - return hbb - - def expand_list(merged, html_list): ''' Fill blanks according to merged cells @@ -232,6 +220,26 @@ def convert_token(html_list): return token_list +def rebuild_html_from_ppstructure_label(label_info): + from html import escape + html_code = label_info['html']['structure']['tokens'].copy() + to_insert = [ + i for i, tag in enumerate(html_code) if tag in ('', '>') + ] + for i, cell in zip(to_insert[::-1], label_info['html']['cells'][::-1]): + if cell['tokens']: + cell = [ + escape(token) if len(token) == 1 else token + for token in cell['tokens'] + ] + cell = ''.join(cell) + html_code.insert(i + 1, cell) + html_code = ''.join(html_code) + html_code = '{}
'.format( + html_code) + return html_code + + def stepsInfo(lang='en'): if lang == 'ch': msg = "1. 安装与运行:使用上述命令安装与运行程序。\n" \ diff --git a/PPOCRLabel/setup.py b/PPOCRLabel/setup.py index 1ec54df11a75b8a7ad8f023ca4a5b24ef5343d71..1750f84b8259a237fb6bb1b5eb9dc33e29441bc1 100644 --- a/PPOCRLabel/setup.py +++ b/PPOCRLabel/setup.py @@ -33,7 +33,7 @@ setup( package_dir={'PPOCRLabel': ''}, include_package_data=True, entry_points={"console_scripts": ["PPOCRLabel= PPOCRLabel.PPOCRLabel:main"]}, - version='1.0.2', + version='2.1.1', install_requires=requirements, license='Apache License 2.0', description='PPOCRLabel is a semi-automatic graphic annotation tool suitable for OCR field, with built-in PPOCR model to automatically detect and re-recognize data. It is written in python3 and pyqt5, supporting rectangular box annotation and four-point annotation modes. Annotations can be directly used for the training of PPOCR detection and recognition models', diff --git a/README.md b/README.md index f57672e5055df042ede9ae03bbed590889c5941c..62cc8536da3e7cd6d49aea19b85e19cc2537d642 100644 --- a/README.md +++ b/README.md @@ -26,17 +26,19 @@ PaddleOCR aims to create multilingual, awesome, leading, and practical OCR tools ## Recent updates +- **🔥2022.8.24 Release PaddleOCR [release/2.6](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.6)** + - Release [PP-Structurev2](./ppstructure/),with functions and performance fully upgraded, adapted to Chinese scenes, and new support for [Layout Recovery](./ppstructure/recovery) and **one line command to convert PDF to Word**; + - [Layout Analysis](./ppstructure/layout) optimization: model storage reduced by 95%, while speed increased by 11 times, and the average CPU time-cost is only 41ms; + - [Table Recognition](./ppstructure/table) optimization: 3 optimization strategies are designed, and the model accuracy is improved by 6% under comparable time consumption; + - [Key Information Extraction](./ppstructure/kie) optimization:a visual-independent model structure is designed, the accuracy of semantic entity recognition is increased by 2.8%, and the accuracy of relation extraction is increased by 9.1%. + +- **🔥2022.7 Release [OCR scene application collection](./applications/README_en.md)** + - Release **9 vertical models** such as digital tube, LCD screen, license plate, handwriting recognition model, high-precision SVTR model, etc, covering the main OCR vertical applications in general, manufacturing, finance, and transportation industries. + - **🔥2022.5.9 Release PaddleOCR [release/2.5](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.5)** - Release [PP-OCRv3](./doc/doc_en/ppocr_introduction_en.md#pp-ocrv3): With comparable speed, the effect of Chinese scene is further improved by 5% compared with PP-OCRv2, the effect of English scene is improved by 11%, and the average recognition accuracy of 80 language multilingual models is improved by more than 5%. - Release [PPOCRLabelv2](./PPOCRLabel): Add the annotation function for table recognition task, key information extraction task and irregular text image. - Release interactive e-book [*"Dive into OCR"*](./doc/doc_en/ocr_book_en.md), covers the cutting-edge theory and code practice of OCR full stack technology. -- 2021.12.21 Release PaddleOCR [release/2.4](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.4) - - Release 1 text detection algorithm (PSENet), 3 text recognition algorithms (NRTR、SEED、SAR). 
- - Release 1 key information extraction algorithm (SDMGR, [tutorial](./ppstructure/docs/kie_en.md)) and 3 [DocVQA](./ppstructure/vqa) algorithms (LayoutLM, LayoutLMv2, LayoutXLM). -- 2021.9.7 Release PaddleOCR [release/2.3](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.3) - - Release [PP-OCRv2](./doc/doc_en/ppocr_introduction_en.md#pp-ocrv2). The inference speed of PP-OCRv2 is 220% higher than that of PP-OCR server in CPU device. The F-score of PP-OCRv2 is 7% higher than that of PP-OCR mobile. -- 2021.8.3 Release PaddleOCR [release/2.2](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.2) - - Release a new structured documents analysis toolkit, i.e., [PP-Structure](./ppstructure/README.md), support layout analysis and table recognition (One-key to export chart images to Excel files). - [more](./doc/doc_en/update_en.md) @@ -45,7 +47,9 @@ PaddleOCR aims to create multilingual, awesome, leading, and practical OCR tools PaddleOCR support a variety of cutting-edge algorithms related to OCR, and developed industrial featured models/solution [PP-OCR](./doc/doc_en/ppocr_introduction_en.md) and [PP-Structure](./ppstructure/README.md) on this basis, and get through the whole process of data production, model training, compression, inference and deployment. -![](./doc/features_en.png) +
+ [PaddleOCR features overview image]
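As context for the PP-Structurev2 items in the release notes above, a minimal sketch of how the pipeline is typically driven from Python (assuming the `PPStructure` quick-start API described in `ppstructure/docs/quickstart_en.md`; the sample image path is a placeholder):

```python
# Minimal PP-Structure sketch: layout analysis + table recognition on one image.
import os

import cv2
from paddleocr import PPStructure, save_structure_res

table_engine = PPStructure(show_log=True)

img_path = "ppstructure/docs/table/1.png"  # placeholder sample image
img = cv2.imread(img_path)
result = table_engine(img)

save_structure_res(result, "./output", os.path.basename(img_path).split(".")[0])
for region in result:
    # PP-Structurev2 reports lowercase region types such as 'table', which is why
    # PPOCRLabel.py above now checks region['type'] == 'table'
    print(region["type"], region["bbox"])
```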
> It is recommended to start with the “quick experience” in the document tutorial @@ -113,18 +117,19 @@ PaddleOCR support a variety of cutting-edge algorithms related to OCR, and devel - [Quick Start](./ppstructure/docs/quickstart_en.md) - [Model Zoo](./ppstructure/docs/models_list_en.md) - [Model training](./doc/doc_en/training_en.md) - - [Layout Parser](./ppstructure/layout/README.md) + - [Layout Analysis](./ppstructure/layout/README.md) - [Table Recognition](./ppstructure/table/README.md) - - [DocVQA](./ppstructure/vqa/README.md) - - [Key Information Extraction](./ppstructure/docs/kie_en.md) + - [Key Information Extraction](./ppstructure/kie/README.md) - [Inference and Deployment](./deploy/README.md) - [Python Inference](./ppstructure/docs/inference_en.md) - - [C++ Inference]() - - [Serving](./deploy/pdserving/README.md) -- [Academic algorithms](./doc/doc_en/algorithms_en.md) + - [C++ Inference](./deploy/cpp_infer/readme.md) + - [Serving](./deploy/hubserving/readme_en.md) +- [Academic Algorithms](./doc/doc_en/algorithm_overview_en.md) - [Text detection](./doc/doc_en/algorithm_overview_en.md) - [Text recognition](./doc/doc_en/algorithm_overview_en.md) - - [End-to-end](./doc/doc_en/algorithm_overview_en.md) + - [End-to-end OCR](./doc/doc_en/algorithm_overview_en.md) + - [Table Recognition](./doc/doc_en/algorithm_overview_en.md) + - [Key Information Extraction](./doc/doc_en/algorithm_overview_en.md) - [Add New Algorithms to PaddleOCR](./doc/doc_en/add_new_algorithm_en.md) - Data Annotation and Synthesis - [Semi-automatic Annotation Tool: PPOCRLabel](./PPOCRLabel/README.md) @@ -135,9 +140,9 @@ PaddleOCR support a variety of cutting-edge algorithms related to OCR, and devel - [General OCR Datasets(Chinese/English)](doc/doc_en/dataset/datasets_en.md) - [HandWritten_OCR_Datasets(Chinese)](doc/doc_en/dataset/handwritten_datasets_en.md) - [Various OCR Datasets(multilingual)](doc/doc_en/dataset/vertical_and_multilingual_datasets_en.md) - - [layout analysis](doc/doc_en/dataset/layout_datasets_en.md) - - [table recognition](doc/doc_en/dataset/table_datasets_en.md) - - [DocVQA](doc/doc_en/dataset/docvqa_datasets_en.md) + - [Layout Analysis](doc/doc_en/dataset/layout_datasets_en.md) + - [Table Recognition](doc/doc_en/dataset/table_datasets_en.md) + - [Key Information Extraction](doc/doc_en/dataset/kie_datasets_en.md) - [Code Structure](./doc/doc_en/tree_en.md) - [Visualization](#Visualization) - [Community](#Community) @@ -176,7 +181,7 @@ PaddleOCR support a variety of cutting-edge algorithms related to OCR, and devel
-PP-Structure +PP-Structurev2 - layout analysis + table recognition
@@ -185,12 +190,28 @@ PaddleOCR support a variety of cutting-edge algorithms related to OCR, and devel - SER (Semantic entity recognition)
- [SER visualization example image]
+ [SER visualization example images]
- RE (Relation Extraction)
- [RE visualization example image]
+ [RE visualization example images]
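Related to the Layout Recovery and one-line PDF-to-Word support announced above, the `__init__.py` change later in this diff also exports `sorted_layout_boxes` and `convert_info_docx`. A rough sketch of combining them with `PPStructure` for recovery; the `convert_info_docx` signature is an assumption and should be checked against `ppstructure/recovery`:

```python
# Rough layout-recovery sketch (image -> docx) using names newly exported from paddleocr.
import os

import cv2
from paddleocr import PPStructure, convert_info_docx, sorted_layout_boxes

engine = PPStructure(show_log=True)

img_path = "ppstructure/docs/table/1.png"  # placeholder sample image
img = cv2.imread(img_path)
result = engine(img)

h, w, _ = img.shape
result = sorted_layout_boxes(result, w)  # order detected regions in reading order
# assumed signature: (image, sorted results, save folder, document name)
convert_info_docx(img, result, "./output", os.path.basename(img_path).split(".")[0])
```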
diff --git a/README_ch.md b/README_ch.md index e801ce561cb41aafb376f81a3016f0a6b838320d..24a925f6c8092f28b58452e761ac74b0a5f3d2c3 100755 --- a/README_ch.md +++ b/README_ch.md @@ -27,28 +27,20 @@ PaddleOCR旨在打造一套丰富、领先、且实用的OCR工具库,助力 ## 近期更新 -- **🔥2022.5.11~13 每晚8:30【超强OCR技术详解与产业应用实战】三日直播课** - - 11日:开源最强OCR系统PP-OCRv3揭秘 - - 12日:云边端全覆盖的PP-OCRv3训练部署实战 - - 13日:OCR产业应用全流程拆解与实战 +- **🔥2022.8.24 发布 PaddleOCR [release/2.6](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.6)** + - 发布[PP-Structurev2](./ppstructure/),系统功能性能全面升级,适配中文场景,新增支持[版面复原](./ppstructure/recovery),支持**一行命令完成PDF转Word**; + - [版面分析](./ppstructure/layout)模型优化:模型存储减少95%,速度提升11倍,平均CPU耗时仅需41ms; + - [表格识别](./ppstructure/table)模型优化:设计3大优化策略,预测耗时不变情况下,模型精度提升6%; + - [关键信息抽取](./ppstructure/kie)模型优化:设计视觉无关模型结构,语义实体识别精度提升2.8%,关系抽取精度提升9.1%。 - 赶紧扫码报名吧! -
- -- **🔥2022.5.9 发布PaddleOCR [release/2.5](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.5)** +- **🔥2022.8 发布 [OCR场景应用集合](./applications)** + - 包含数码管、液晶屏、车牌、高精度SVTR模型、手写体识别等**9个垂类模型**,覆盖通用,制造、金融、交通行业的主要OCR垂类应用。 + +- **2022.5.9 发布 PaddleOCR [release/2.5](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.5)** - 发布[PP-OCRv3](./doc/doc_ch/ppocr_introduction.md#pp-ocrv3),速度可比情况下,中文场景效果相比于PP-OCRv2再提升5%,英文场景提升11%,80语种多语言模型平均识别准确率提升5%以上; - 发布半自动标注工具[PPOCRLabelv2](./PPOCRLabel):新增表格文字图像、图像关键信息抽取任务和不规则文字图像的标注功能; - 发布OCR产业落地工具集:打通22种训练部署软硬件环境与方式,覆盖企业90%的训练部署环境需求; - 发布交互式OCR开源电子书[《动手学OCR》](./doc/doc_ch/ocr_book.md),覆盖OCR全栈技术的前沿理论与代码实践,并配套教学视频。 -- 2021.12.21 发布PaddleOCR [release/2.4](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.4) - - OCR算法新增1种文本检测算法([PSENet](./doc/doc_ch/algorithm_det_psenet.md)),3种文本识别算法([NRTR](./doc/doc_ch/algorithm_rec_nrtr.md)、[SEED](./doc/doc_ch/algorithm_rec_seed.md)、[SAR](./doc/doc_ch/algorithm_rec_sar.md)); - - 文档结构化算法新增1种关键信息提取算法([SDMGR](./ppstructure/docs/kie.md)),3种[DocVQA](./ppstructure/vqa)算法(LayoutLM、LayoutLMv2,LayoutXLM)。 -- 2021.9.7 发布PaddleOCR [release/2.3](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.3) - - 发布[PP-OCRv2](./doc/doc_ch/ppocr_introduction.md#pp-ocrv2),CPU推理速度相比于PP-OCR server提升220%;效果相比于PP-OCR mobile 提升7%。 -- 2021.8.3 发布PaddleOCR [release/2.2](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.2) - - 发布文档结构分析[PP-Structure](./ppstructure/README_ch.md)工具包,支持版面分析与表格识别(含Excel导出)。 > [更多](./doc/doc_ch/update.md) @@ -56,7 +48,9 @@ PaddleOCR旨在打造一套丰富、领先、且实用的OCR工具库,助力 支持多种OCR相关前沿算法,在此基础上打造产业级特色模型[PP-OCR](./doc/doc_ch/ppocr_introduction.md)和[PP-Structure](./ppstructure/README_ch.md),并打通数据生产、模型训练、压缩、预测部署全流程。 -![](./doc/features.png) +
+ [PaddleOCR 特性概览图]
> 上述内容的使用方法建议从文档教程中的快速开始体验 @@ -71,24 +65,22 @@ PaddleOCR旨在打造一套丰富、领先、且实用的OCR工具库,助力 ## 《动手学OCR》电子书 - [《动手学OCR》电子书📚](./doc/doc_ch/ocr_book.md) -## 场景应用 -- PaddleOCR场景应用覆盖通用,制造、金融、交通行业的主要OCR垂类应用,在PP-OCR、PP-Structure的通用能力基础之上,以notebook的形式展示利用场景数据微调、模型优化方法、数据增广等内容,为开发者快速落地OCR应用提供示范与启发。详情可查看[README](./applications)。 ## 开源社区 - +- **项目合作📑:** 如果您是企业开发者且有明确的OCR垂类应用需求,填写[问卷](https://paddle.wjx.cn/vj/QwF7GKw.aspx)后可免费与官方团队展开不同层次的合作。 - **加入社区👬:** 微信扫描二维码并填写问卷之后,加入交流群领取福利 - - **获取5月11-13日每晚20:30《OCR超强技术详解与产业应用实战》的直播课链接** + - **获取PaddleOCR最新发版解说《OCR超强技术详解与产业应用实战》系列直播课回放链接** - **10G重磅OCR学习大礼包:**《动手学OCR》电子书,配套讲解视频和notebook项目;66篇OCR相关顶会前沿论文打包放送,包括CVPR、AAAI、IJCAI、ICCV等;PaddleOCR历次发版直播课视频;OCR社区优秀开发者项目分享视频。 - -- **社区贡献**🏅️:[社区贡献](./doc/doc_ch/thirdparty.md)文档中包含了社区用户**使用PaddleOCR开发的各种工具、应用**以及**为PaddleOCR贡献的功能、优化的文档与代码**等,是官方为社区开发者打造的荣誉墙,也是帮助优质项目宣传的广播站。 +- **社区项目**🏅️:[社区项目](./doc/doc_ch/thirdparty.md)文档中包含了社区用户**使用PaddleOCR开发的各种工具、应用**以及**为PaddleOCR贡献的功能、优化的文档与代码**等,是官方为社区开发者打造的荣誉墙,也是帮助优质项目宣传的广播站。 - **社区常规赛**🎁:社区常规赛是面向OCR开发者的积分赛事,覆盖文档、代码、模型和应用四大类型,以季度为单位评选并发放奖励,赛题详情与报名方法可参考[链接](https://github.com/PaddlePaddle/PaddleOCR/issues/4982)。
- [微信交流群二维码图片]
+ [微信交流群二维码图片]
+ ## PP-OCR系列模型列表(更新中) @@ -96,14 +88,21 @@ PaddleOCR旨在打造一套丰富、领先、且实用的OCR工具库,助力 | ------------------------------------- | ----------------------- | --------------- | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | | 中英文超轻量PP-OCRv3模型(16.2M) | ch_PP-OCRv3_xx | 移动端&服务器端 | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_train.tar) | | 英文超轻量PP-OCRv3模型(13.4M) | en_PP-OCRv3_xx | 移动端&服务器端 | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_distill_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_train.tar) | -| 中英文超轻量PP-OCRv2模型(13.0M) | ch_PP-OCRv2_xx | 移动端&服务器端 | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_distill_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_train.tar) | -| 中英文超轻量PP-OCR mobile模型(9.4M) | ch_ppocr_mobile_v2.0_xx | 移动端&服务器端 | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_pre.tar) | -| 中英文通用PP-OCR server模型(143.4M) | ch_ppocr_server_v2.0_xx | 服务器端 | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_pre.tar) | -更多模型下载(包括多语言),可以参考[PP-OCR 系列模型下载](./doc/doc_ch/models_list.md),文档分析相关模型参考[PP-Structure 系列模型下载](./ppstructure/docs/models_list.md) +- 
超轻量OCR系列更多模型下载(包括多语言),可以参考[PP-OCR系列模型下载](./doc/doc_ch/models_list.md),文档分析相关模型参考[PP-Structure系列模型下载](./ppstructure/docs/models_list.md) + +### PaddleOCR场景应用模型 +| 行业 | 类别 | 亮点 | 文档说明 | 模型下载 | +| ---- | ------------ | ---------------------------------- | ------------------------------------------------------------ | --------------------------------------------- | +| 制造 | 数码管识别 | 数码管数据合成、漏识别调优 | [光功率计数码管字符识别](./applications/光功率计数码管字符识别/光功率计数码管字符识别.md) | [下载链接](./applications/README.md#模型下载) | +| 金融 | 通用表单识别 | 多模态通用表单结构化提取 | [多模态表单识别](./applications/多模态表单识别.md) | [下载链接](./applications/README.md#模型下载) | +| 交通 | 车牌识别 | 多角度图像处理、轻量模型、端侧部署 | [轻量级车牌识别](./applications/轻量级车牌识别.md) | [下载链接](./applications/README.md#模型下载) | + +- 更多制造、金融、交通行业的主要OCR垂类应用模型(如电表、液晶屏、高精度SVTR模型等),可参考[场景应用模型下载](./applications) + ## 文档教程 - [运行环境准备](./doc/doc_ch/environment.md) @@ -120,7 +119,7 @@ PaddleOCR旨在打造一套丰富、领先、且实用的OCR工具库,助力 - [知识蒸馏](./doc/doc_ch/knowledge_distillation.md) - [推理部署](./deploy/README_ch.md) - [基于Python预测引擎推理](./doc/doc_ch/inference_ppocr.md) - - [基于C++预测引擎推理](./deploy/cpp_infer/readme.md) + - [基于C++预测引擎推理](./deploy/cpp_infer/readme_ch.md) - [服务化部署](./deploy/pdserving/README_CN.md) - [端侧部署](./deploy/lite/readme.md) - [Paddle2ONNX模型转化与预测](./deploy/paddle2onnx/readme.md) @@ -132,16 +131,17 @@ PaddleOCR旨在打造一套丰富、领先、且实用的OCR工具库,助力 - [模型训练](./doc/doc_ch/training.md) - [版面分析](./ppstructure/layout/README_ch.md) - [表格识别](./ppstructure/table/README_ch.md) - - [关键信息提取](./ppstructure/docs/kie.md) - - [DocVQA](./ppstructure/vqa/README_ch.md) + - [关键信息提取](./ppstructure/kie/README_ch.md) - [推理部署](./deploy/README_ch.md) - [基于Python预测引擎推理](./ppstructure/docs/inference.md) - - [基于C++预测引擎推理]() - - [服务化部署](./deploy/pdserving/README_CN.md) -- [前沿算法与模型🚀](./doc/doc_ch/algorithm.md) - - [文本检测算法](./doc/doc_ch/algorithm_overview.md#11-%E6%96%87%E6%9C%AC%E6%A3%80%E6%B5%8B%E7%AE%97%E6%B3%95) - - [文本识别算法](./doc/doc_ch/algorithm_overview.md#12-%E6%96%87%E6%9C%AC%E8%AF%86%E5%88%AB%E7%AE%97%E6%B3%95) - - [端到端算法](./doc/doc_ch/algorithm_overview.md#2-%E6%96%87%E6%9C%AC%E8%AF%86%E5%88%AB%E7%AE%97%E6%B3%95) + - [基于C++预测引擎推理](./deploy/cpp_infer/readme_ch.md) + - [服务化部署](./deploy/hubserving/readme.md) +- [前沿算法与模型🚀](./doc/doc_ch/algorithm_overview.md) + - [文本检测算法](./doc/doc_ch/algorithm_overview.md) + - [文本识别算法](./doc/doc_ch/algorithm_overview.md) + - [端到端OCR算法](./doc/doc_ch/algorithm_overview.md) + - [表格识别算法](./doc/doc_ch/algorithm_overview.md) + - [关键信息抽取算法](./doc/doc_ch/algorithm_overview.md) - [使用PaddleOCR架构添加新算法](./doc/doc_ch/add_new_algorithm.md) - [场景应用](./applications) - 数据标注与合成 @@ -155,7 +155,7 @@ PaddleOCR旨在打造一套丰富、领先、且实用的OCR工具库,助力 - [垂类多语言OCR数据集](doc/doc_ch/dataset/vertical_and_multilingual_datasets.md) - [版面分析数据集](doc/doc_ch/dataset/layout_datasets.md) - [表格识别数据集](doc/doc_ch/dataset/table_datasets.md) - - [DocVQA数据集](doc/doc_ch/dataset/docvqa_datasets.md) + - [关键信息提取数据集](doc/doc_ch/dataset/kie_datasets.md) - [代码组织结构](./doc/doc_ch/tree.md) - [效果展示](#效果展示) - [《动手学OCR》电子书📚](./doc/doc_ch/ocr_book.md) @@ -214,14 +214,30 @@ PaddleOCR旨在打造一套丰富、领先、且实用的OCR工具库,助力 - SER(语义实体识别)
- [SER 可视化结果示例图]
+ [SER 可视化结果示例图]
+ - RE(关系提取)
- [RE 可视化结果示例图]
+ [RE 可视化结果示例图]
+ diff --git a/__init__.py b/__init__.py index 15a9aca4da19a981b9e678e7cc93e33cf40fc81c..a7c32e9629d2e5ff04dc2ca45c6317caac8fa631 100644 --- a/__init__.py +++ b/__init__.py @@ -16,5 +16,6 @@ from .paddleocr import * __version__ = paddleocr.VERSION __all__ = [ 'PaddleOCR', 'PPStructure', 'draw_ocr', 'draw_structure_result', - 'save_structure_res', 'download_with_progressbar' + 'save_structure_res', 'download_with_progressbar', 'sorted_layout_boxes', + 'convert_info_docx', 'to_excel' ] diff --git a/applications/README.md b/applications/README.md index 017c2a9f6f696904e9bf2f1180104e66c90ee712..2637cd6eaf0c3c59d56673c5e2d294ee7fca2b8b 100644 --- a/applications/README.md +++ b/applications/README.md @@ -20,10 +20,10 @@ PaddleOCR场景应用覆盖通用,制造、金融、交通行业的主要OCR ### 通用 -| 类别 | 亮点 | 模型下载 | 教程 | -| ---------------------- | ------------ | -------------- | --------------------------------------- | -| 高精度中文识别模型SVTR | 比PP-OCRv3识别模型精度高3%,可用于数据挖掘或对预测效率要求不高的场景。| [模型下载](#2) | [中文](./高精度中文识别模型.md)/English | -| 手写体识别 | 新增字形支持 | | | +| 类别 | 亮点 | 模型下载 | 教程 | 示例图 | +| ---------------------- | ------------------------------------------------------------ | -------------- | --------------------------------------- | ------------------------------------------------------------ | +| 高精度中文识别模型SVTR | 比PP-OCRv3识别模型精度高3%,
可用于数据挖掘或对预测效率要求不高的场景。 | [模型下载](#2) | [中文](./高精度中文识别模型.md)/English | | +| 手写体识别 | 新增字形支持 | [模型下载](#2) | [中文](./手写文字识别.md)/English | | @@ -42,14 +42,14 @@ PaddleOCR场景应用覆盖通用,制造、金融、交通行业的主要OCR ### 金融 -| 类别 | 亮点 | 模型下载 | 教程 | 示例图 | -| -------------- | ------------------------ | -------------- | ----------------------------------- | ------------------------------------------------------------ | -| 表单VQA | 多模态通用表单结构化提取 | [模型下载](#2) | [中文](./多模态表单识别.md)/English | | -| 增值税发票 | 尽请期待 | | | | -| 印章检测与识别 | 端到端弯曲文本识别 | | | | -| 通用卡证识别 | 通用结构化提取 | | | | -| 身份证识别 | 结构化提取、图像阴影 | | | | -| 合同比对 | 密集文本检测、NLP串联 | | | | +| 类别 | 亮点 | 模型下载 | 教程 | 示例图 | +| -------------- | ----------------------------- | -------------- | ------------------------------------- | ------------------------------------------------------------ | +| 表单VQA | 多模态通用表单结构化提取 | [模型下载](#2) | [中文](./多模态表单识别.md)/English | | +| 增值税发票 | 关键信息抽取,SER、RE任务训练 | [模型下载](#2) | [中文](./发票关键信息抽取.md)/English | | +| 印章检测与识别 | 端到端弯曲文本识别 | | | | +| 通用卡证识别 | 通用结构化提取 | | | | +| 身份证识别 | 结构化提取、图像阴影 | | | | +| 合同比对 | 密集文本检测、NLP串联 | | | | diff --git a/applications/README_en.md b/applications/README_en.md new file mode 100644 index 0000000000000000000000000000000000000000..95c56a1f740faa95e1fe3adeaeb90bfe902f8ed8 --- /dev/null +++ b/applications/README_en.md @@ -0,0 +1,79 @@ +English| [简体中文](README.md) + +# Application + +PaddleOCR scene application covers general, manufacturing, finance, transportation industry of the main OCR vertical applications, on the basis of the general capabilities of PP-OCR, PP-Structure, in the form of notebook to show the use of scene data fine-tuning, model optimization methods, data augmentation and other content, for developers to quickly land OCR applications to provide demonstration and inspiration. 
+ +- [Tutorial](#1) + - [General](#11) + - [Manufacturing](#12) + - [Finance](#13) + - [Transportation](#14) + +- [Model Download](#2) + + + +## Tutorial + + + +### General + +| Case | Feature | Model Download | Tutorial | Example | +| ---------------------------------------------- | ---------------- | -------------------- | --------------------------------------- | ------------------------------------------------------------ | +| High-precision Chineses recognition model SVTR | New model | [Model Download](#2) | [中文](./高精度中文识别模型.md)/English | | +| Chinese handwriting recognition | New font support | [Model Download](#2) | [中文](./手写文字识别.md)/English | | + + + +### Manufacturing + +| Case | Feature | Model Download | Tutorial | Example | +| ------------------------------ | ------------------------------------------------------------ | -------------------- | ------------------------------------------------------------ | ------------------------------------------------------------ | +| Digital tube | Digital tube data sythesis, recognition model fine-tuning | [Model Download](#2) | [中文](./光功率计数码管字符识别/光功率计数码管字符识别.md)/English | | +| LCD screen | Detection model distillation, serving deployment | [Model Download](#2) | [中文](./液晶屏读数识别.md)/English | | +| Packaging production data | Dot matrix character synthesis, overexposure and overdark text recognition | [Model Download](#2) | [中文](./包装生产日期识别.md)/English | | +| PCB text recognition | Small size text detection and recognition | [Model Download](#2) | [中文](./PCB字符识别/PCB字符识别.md)/English | | +| Meter text recognition | High-resolution image detection fine-tuning | [Model Download](#2) | | | +| LCD character defect detection | Non-text character recognition | | | | + + + +### Finance + +| Case | Feature | Model Download | Tutorial | Example | +| ----------------------------------- | -------------------------------------------------- | -------------------- | ------------------------------------- | ------------------------------------------------------------ | +| Form visual question and answer | Multimodal general form structured extraction | [Model Download](#2) | [中文](./多模态表单识别.md)/English | | +| VAT invoice | Key information extraction, SER, RE task fine-tune | [Model Download](#2) | [中文](./发票关键信息抽取.md)/English | | +| Seal detection and recognition | End-to-end curved text recognition | | | | +| Universal card recognition | Universal structured extraction | | | | +| ID card recognition | Structured extraction, image shading | | | | +| Contract key information extraction | Dense text detection, NLP concatenation | | | | + + + +### Transportation + +| Case | Feature | Model Download | Tutorial | Example | +| ----------------------------------------------- | ------------------------------------------------------------ | -------------------- | ----------------------------------- | ------------------------------------------------------------ | +| License plate recognition | Multi-angle images, lightweight models, edge-side deployment | [Model Download](#2) | [中文](./轻量级车牌识别.md)/English | | +| Driver's license/driving license identification | coming soon | | | | +| Express text recognition | coming soon | | | | + + + +## Model Download + +- For international developers: We're building a way to download these trained models, and since the current tutorials are Chinese, if you are good at both Chinese and English, or willing to polish English documents, please let us know in [discussion](https://github.com/PaddlePaddle/PaddleOCR/discussions). 
+
+- For Chinese developers: If you want to download the trained application models for the above scenarios, scan the QR code below with WeChat, follow the PaddlePaddle official account to fill in the questionnaire, and join the PaddleOCR official group to get the 20G OCR learning materials (including the "Dive into OCR" e-book, course videos, application models, and other materials).
+
+ [WeChat QR code image]
+ + If you are an enterprise developer and have not found a suitable solution in the above scenarios, you can fill in the [OCR Application Cooperation Survey Questionnaire](https://paddle.wjx.cn/vj/QwF7GKw.aspx) to carry out different levels of cooperation with the official team **for free**, including but not limited to problem abstraction, technical solution determination, project Q&A, joint research and development, etc. If you have already used paddleOCR in your project, you can also fill out this questionnaire to jointly promote with the PaddlePaddle and enhance the technical publicity of enterprises. Looking forward to your submission! + + +trackgit-views + diff --git "a/applications/\345\217\221\347\245\250\345\205\263\351\224\256\344\277\241\346\201\257\346\212\275\345\217\226.md" "b/applications/\345\217\221\347\245\250\345\205\263\351\224\256\344\277\241\346\201\257\346\212\275\345\217\226.md" new file mode 100644 index 0000000000000000000000000000000000000000..82f5b8d48600c6bebb4d3183ee801305d305d531 --- /dev/null +++ "b/applications/\345\217\221\347\245\250\345\205\263\351\224\256\344\277\241\346\201\257\346\212\275\345\217\226.md" @@ -0,0 +1,343 @@ + +# 基于VI-LayoutXLM的发票关键信息抽取 + +- [1. 项目背景及意义](#1-项目背景及意义) +- [2. 项目内容](#2-项目内容) +- [3. 安装环境](#3-安装环境) +- [4. 关键信息抽取](#4-关键信息抽取) + - [4.1 文本检测](#41-文本检测) + - [4.2 文本识别](#42-文本识别) + - [4.3 语义实体识别](#43-语义实体识别) + - [4.4 关系抽取](#44-关系抽取) + + + +## 1. 项目背景及意义 + +关键信息抽取在文档场景中被广泛使用,如身份证中的姓名、住址信息抽取,快递单中的姓名、联系方式等关键字段内容的抽取。传统基于模板匹配的方案需要针对不同的场景制定模板并进行适配,较为繁琐,不够鲁棒。基于该问题,我们借助飞桨提供的PaddleOCR套件中的关键信息抽取方案,实现对增值税发票场景的关键信息抽取。 + +## 2. 项目内容 + +本项目基于PaddleOCR开源套件,以VI-LayoutXLM多模态关键信息抽取模型为基础,针对增值税发票场景进行适配,提取该场景的关键信息。 + +## 3. 安装环境 + +```bash +# 首先git官方的PaddleOCR项目,安装需要的依赖 +# 第一次运行打开该注释 +git clone https://gitee.com/PaddlePaddle/PaddleOCR.git +cd PaddleOCR +# 安装PaddleOCR的依赖 +pip install -r requirements.txt +# 安装关键信息抽取任务的依赖 +pip install -r ./ppstructure/kie/requirements.txt +``` + +## 4. 关键信息抽取 + +基于文档图像的关键信息抽取包含3个部分:(1)文本检测(2)文本识别(3)关键信息抽取方法,包括语义实体识别或者关系抽取,下面分别进行介绍。 + +### 4.1 文本检测 + + +本文重点关注发票的关键信息抽取模型训练与预测过程,因此在关键信息抽取过程中,直接使用标注的文本检测与识别标注信息进行测试,如果你希望自定义该场景的文本检测模型,完成端到端的关键信息抽取部分,请参考[文本检测模型训练教程](../doc/doc_ch/detection.md),按照训练数据格式准备数据,并完成该场景下垂类文本检测模型的微调过程。 + + +### 4.2 文本识别 + +本文重点关注发票的关键信息抽取模型训练与预测过程,因此在关键信息抽取过程中,直接使用提供的文本检测与识别标注信息进行测试,如果你希望自定义该场景的文本检测模型,完成端到端的关键信息抽取部分,请参考[文本识别模型训练教程](../doc/doc_ch/recognition.md),按照训练数据格式准备数据,并完成该场景下垂类文本识别模型的微调过程。 + +### 4.3 语义实体识别 (Semantic Entity Recognition) + +语义实体识别指的是给定一段文本行,确定其类别(如`姓名`、`住址`等类别)。PaddleOCR中提供了基于VI-LayoutXLM的多模态语义实体识别方法,融合文本、位置与版面信息,相比LayoutXLM多模态模型,去除了其中的视觉骨干网络特征提取部分,引入符合阅读顺序的文本行排序方法,同时使用UDML联合互蒸馏方法进行训练,最终在精度与速度方面均超越LayoutXLM。更多关于VI-LayoutXLM的算法介绍与精度指标,请参考:[VI-LayoutXLM算法介绍](../doc/doc_ch/algorithm_kie_vi_layoutxlm.md)。 + +#### 4.3.1 准备数据 + +发票场景为例,我们首先需要标注出其中的关键字段,我们将其标注为`问题-答案`的key-value pair,如下,编号No为12270830,则`No`字段标注为question,`12270830`字段标注为answer。如下图所示。 + +
+ [发票标注示例图]
+ +**注意:** + +* 如果文本检测模型数据标注过程中,没有标注 **非关键信息内容** 的检测框,那么在标注关键信息抽取任务的时候,也不需要标注该部分,如上图所示;如果标注的过程,如果同时标注了**非关键信息内容** 的检测框,那么我们需要将该部分的label记为other。 +* 标注过程中,需要以文本行为单位进行标注,无需标注单个字符的位置信息。 + + +已经处理好的增值税发票数据集从这里下载:[增值税发票数据集下载链接](https://aistudio.baidu.com/aistudio/datasetdetail/165561)。 + +下载好发票数据集,并解压在train_data目录下,目录结构如下所示。 + +``` +train_data + |--zzsfp + |---class_list.txt + |---imgs/ + |---train.json + |---val.json +``` + +其中`class_list.txt`是包含`other`, `question`, `answer`,3个种类的的类别列表(不区分大小写),`imgs`目录底下,`train.json`与`val.json`分别表示训练与评估集合的标注文件。训练集中包含30张图片,验证集中包含8张图片。部分标注如下所示。 + +```py +b33.jpg [{"transcription": "No", "label": "question", "points": [[2882, 472], [3026, 472], [3026, 588], [2882, 588]], }, {"transcription": "12269563", "label": "answer", "points": [[3066, 448], [3598, 448], [3598, 576], [3066, 576]], ]}] +``` + +相比于OCR检测的标注,仅多了`label`字段。 + + +#### 4.3.2 开始训练 + + +VI-LayoutXLM的配置为[ser_vi_layoutxlm_xfund_zh_udml.yml](../configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh_udml.yml),需要修改数据、类别数目以及配置文件。 + +```yml +Architecture: + model_type: &model_type "kie" + name: DistillationModel + algorithm: Distillation + Models: + Teacher: + pretrained: + freeze_params: false + return_all_feats: true + model_type: *model_type + algorithm: &algorithm "LayoutXLM" + Transform: + Backbone: + name: LayoutXLMForSer + pretrained: True + # one of base or vi + mode: vi + checkpoints: + # 定义类别数目 + num_classes: &num_classes 5 + ... + +PostProcess: + name: DistillationSerPostProcess + model_name: ["Student", "Teacher"] + key: backbone_out + # 定义类别文件 + class_path: &class_path train_data/zzsfp/class_list.txt + +Train: + dataset: + name: SimpleDataSet + # 定义训练数据目录与标注文件 + data_dir: train_data/zzsfp/imgs + label_file_list: + - train_data/zzsfp/train.json + ... + +Eval: + dataset: + # 定义评估数据目录与标注文件 + name: SimpleDataSet + data_dir: train_data/zzsfp/imgs + label_file_list: + - train_data/zzsfp/val.json + ... +``` + +LayoutXLM与VI-LayoutXLM针对该场景的训练结果如下所示。 + +| 模型 | 迭代轮数 | Hmean | +| :---: | :---: | :---: | +| LayoutXLM | 50 | 100% | +| VI-LayoutXLM | 50 | 100% | + +可以看出,由于当前数据量较少,场景比较简单,因此2个模型的Hmean均达到了100%。 + + +#### 4.3.3 模型评估 + +模型训练过程中,使用的是知识蒸馏的策略,最终保留了学生模型的参数,在评估时,我们需要针对学生模型的配置文件进行修改: [ser_vi_layoutxlm_xfund_zh.yml](../configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml),修改内容与训练配置相同,包括**类别数、类别映射文件、数据目录**。 + +修改完成后,执行下面的命令完成评估过程。 + +```bash +# 注意:需要根据你的配置文件地址与保存的模型地址,对评估命令进行修改 +python3 tools/eval.py -c ./fapiao/ser_vi_layoutxlm.yml -o Architecture.Backbone.checkpoints=fapiao/models/ser_vi_layoutxlm_fapiao_udml/best_accuracy +``` + +输出结果如下所示。 + +``` +[2022/08/18 08:49:58] ppocr INFO: metric eval *************** +[2022/08/18 08:49:58] ppocr INFO: precision:1.0 +[2022/08/18 08:49:58] ppocr INFO: recall:1.0 +[2022/08/18 08:49:58] ppocr INFO: hmean:1.0 +[2022/08/18 08:49:58] ppocr INFO: fps:1.9740402401574881 +``` + +#### 4.3.4 模型预测 + +使用下面的命令进行预测。 + +```bash +python3 tools/infer_kie_token_ser.py -c fapiao/ser_vi_layoutxlm.yml -o Architecture.Backbone.checkpoints=fapiao/models/ser_vi_layoutxlm_fapiao_udml/best_accuracy Global.infer_img=./train_data/XFUND/zh_val/val.json Global.infer_mode=False +``` + +预测结果会保存在配置文件中的`Global.save_res_path`目录中。 + +部分预测结果如下所示。 + +
+ [SER 预测结果示例图]
+ + +* 注意:在预测时,使用的文本检测与识别结果为标注的结果,直接从json文件里面进行读取。 + +如果希望使用OCR引擎结果得到的结果进行推理,则可以使用下面的命令进行推理。 + + +```bash +python3 tools/infer_kie_token_ser.py -c fapiao/ser_vi_layoutxlm.yml -o Architecture.Backbone.checkpoints=fapiao/models/ser_vi_layoutxlm_fapiao_udml/best_accuracy Global.infer_img=./train_data/zzsfp/imgs/b25.jpg Global.infer_mode=True +``` + +结果如下所示。 + +
+ [使用OCR引擎预测的SER结果示例图]
+ +它会使用PP-OCRv3的文本检测与识别模型进行获取文本位置与内容信息。 + +可以看出,由于训练的过程中,没有标注额外的字段为other类别,所以大多数检测出来的字段被预测为question或者answer。 + +如果希望构建基于你在垂类场景训练得到的OCR检测与识别模型,可以使用下面的方法传入检测与识别的inference 模型路径,即可完成OCR文本检测与识别以及SER的串联过程。 + +```bash +python3 tools/infer_kie_token_ser.py -c fapiao/ser_vi_layoutxlm.yml -o Architecture.Backbone.checkpoints=fapiao/models/ser_vi_layoutxlm_fapiao_udml/best_accuracy Global.infer_img=./train_data/zzsfp/imgs/b25.jpg Global.infer_mode=True Global.kie_rec_model_dir="your_rec_model" Global.kie_det_model_dir="your_det_model" +``` + +### 4.4 关系抽取(Relation Extraction) + +使用SER模型,可以获取图像中所有的question与answer的字段,继续这些字段的类别,我们需要进一步获取question与answer之间的连接,因此需要进一步训练关系抽取模型,解决该问题。本文也基于VI-LayoutXLM多模态预训练模型,进行下游RE任务的模型训练。 + +#### 4.4.1 准备数据 + +以发票场景为例,相比于SER任务,RE中还需要标记每个文本行的id信息以及链接关系linking,如下所示。 + +
+ [RE 标注示例图]
+ + +标注文件的部分内容如下所示。 + +```py +b33.jpg [{"transcription": "No", "label": "question", "points": [[2882, 472], [3026, 472], [3026, 588], [2882, 588]], "id": 0, "linking": [[0, 1]]}, {"transcription": "12269563", "label": "answer", "points": [[3066, 448], [3598, 448], [3598, 576], [3066, 576]], "id": 1, "linking": [[0, 1]]}] +``` + +相比与SER的标注,多了`id`与`linking`的信息,分别表示唯一标识以及连接关系。 + +已经处理好的增值税发票数据集从这里下载:[增值税发票数据集下载链接](https://aistudio.baidu.com/aistudio/datasetdetail/165561)。 + +#### 4.4.2 开始训练 + +基于VI-LayoutXLM的RE任务配置为[re_vi_layoutxlm_xfund_zh_udml.yml](../configs/kie/vi_layoutxlm/re_vi_layoutxlm_xfund_zh_udml.yml),需要修改**数据路径、类别列表文件**。 + +```yml +Train: + dataset: + name: SimpleDataSet + # 定义训练数据目录与标注文件 + data_dir: train_data/zzsfp/imgs + label_file_list: + - train_data/zzsfp/train.json + transforms: + - DecodeImage: # load image + img_mode: RGB + channel_first: False + - VQATokenLabelEncode: # Class handling label + contains_re: True + algorithm: *algorithm + class_path: &class_path train_data/zzsfp/class_list.txt + ... + +Eval: + dataset: + # 定义评估数据目录与标注文件 + name: SimpleDataSet + data_dir: train_data/zzsfp/imgs + label_file_list: + - train_data/zzsfp/val.json + ... + +``` + +LayoutXLM与VI-LayoutXLM针对该场景的训练结果如下所示。 + +| 模型 | 迭代轮数 | Hmean | +| :---: | :---: | :---: | +| LayoutXLM | 50 | 98.0% | +| VI-LayoutXLM | 50 | 99.3% | + +可以看出,对于VI-LayoutXLM相比LayoutXLM的Hmean高了1.3%。 + +如需获取已训练模型,请扫码填写问卷,加入PaddleOCR官方交流群获取全部OCR垂类模型下载链接、《动手学OCR》电子书等全套OCR学习资料🎁 + +
+ [官方交流群二维码图片]
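在进入下面的 4.4.3 模型评估之前,可以先用类似下面的示意脚本粗略检查 4.4.1 中准备的标注文件(字段名以上文标注示例中的 transcription、label、points、id、linking 为准;这里假设每行为"图片名\t标注JSON"的格式,脚本仅作示意,并非官方工具):

```python
# 示意脚本:检查RE标注中 linking 引用的 id 是否都存在于当前图片的标注里。
import json


def check_linking(label_file):
    with open(label_file, "r", encoding="utf-8") as f:
        for line in f:
            img_name, anno_str = line.strip().split("\t", 1)
            annos = json.loads(anno_str)
            ids = {anno["id"] for anno in annos}
            for anno in annos:
                for link in anno.get("linking", []):
                    if not set(link) <= ids:
                        print(f"{img_name}: linking {link} 引用了不存在的 id")


check_linking("train_data/zzsfp/train.json")
check_linking("train_data/zzsfp/val.json")
```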
+ + +#### 4.4.3 模型评估 + +模型训练过程中,使用的是知识蒸馏的策略,最终保留了学生模型的参数,在评估时,我们需要针对学生模型的配置文件进行修改: [re_vi_layoutxlm_xfund_zh.yml](../configs/kie/vi_layoutxlm/re_vi_layoutxlm_xfund_zh.yml),修改内容与训练配置相同,包括**类别映射文件、数据目录**。 + +修改完成后,执行下面的命令完成评估过程。 + +```bash +# 注意:需要根据你的配置文件地址与保存的模型地址,对评估命令进行修改 +python3 tools/eval.py -c ./fapiao/re_vi_layoutxlm.yml -o Architecture.Backbone.checkpoints=fapiao/models/re_vi_layoutxlm_fapiao_udml/best_accuracy +``` + +输出结果如下所示。 + +```py +[2022/08/18 12:17:14] ppocr INFO: metric eval *************** +[2022/08/18 12:17:14] ppocr INFO: precision:1.0 +[2022/08/18 12:17:14] ppocr INFO: recall:0.9873417721518988 +[2022/08/18 12:17:14] ppocr INFO: hmean:0.9936305732484078 +[2022/08/18 12:17:14] ppocr INFO: fps:2.765963539771157 +``` + +#### 4.4.4 模型预测 + +使用下面的命令进行预测。 + +```bash +# -c 后面的是RE任务的配置文件 +# -o 后面的字段是RE任务的配置 +# -c_ser 后面的是SER任务的配置文件 +# -c_ser 后面的字段是SER任务的配置 +python3 tools/infer_kie_token_ser_re.py -c fapiao/re_vi_layoutxlm.yml -o Architecture.Backbone.checkpoints=fapiao/models/re_vi_layoutxlm_fapiao_trained/best_accuracy Global.infer_img=./train_data/zzsfp/val.json Global.infer_mode=False -c_ser fapiao/ser_vi_layoutxlm.yml -o_ser Architecture.Backbone.checkpoints=fapiao/models/ser_vi_layoutxlm_fapiao_trained/best_accuracy +``` + +预测结果会保存在配置文件中的`Global.save_res_path`目录中。 + +部分预测结果如下所示。 + +
+ [RE 预测结果示例图]
+ + +* 注意:在预测时,使用的文本检测与识别结果为标注的结果,直接从json文件里面进行读取。 + +如果希望使用OCR引擎结果得到的结果进行推理,则可以使用下面的命令进行推理。 + +```bash +python3 tools/infer_kie_token_ser_re.py -c fapiao/re_vi_layoutxlm.yml -o Architecture.Backbone.checkpoints=fapiao/models/re_vi_layoutxlm_fapiao_udml/best_accuracy Global.infer_img=./train_data/zzsfp/val.json Global.infer_mode=True -c_ser fapiao/ser_vi_layoutxlm.yml -o_ser Architecture.Backbone.checkpoints=fapiao/models/ser_vi_layoutxlm_fapiao_udml/best_accuracy +``` + +如果希望构建基于你在垂类场景训练得到的OCR检测与识别模型,可以使用下面的方法传入,即可完成SER + RE的串联过程。 + +```bash +python3 tools/infer_kie_token_ser_re.py -c fapiao/re_vi_layoutxlm.yml -o Architecture.Backbone.checkpoints=fapiao/models/re_vi_layoutxlm_fapiao_udml/best_accuracy Global.infer_img=./train_data/zzsfp/val.json Global.infer_mode=True -c_ser fapiao/ser_vi_layoutxlm.yml -o_ser Architecture.Backbone.checkpoints=fapiao/models/ser_vi_layoutxlm_fapiao_udml/best_accuracy Global.kie_rec_model_dir="your_rec_model" Global.kie_det_model_dir="your_det_model" +``` diff --git a/configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml b/configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml index df429314cd0ec058aa6779a0ff55656f1b211bbf..0c6ab2a0d1d9733d647dc40a7b182fe201866a78 100644 --- a/configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml +++ b/configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml @@ -14,6 +14,9 @@ Global: use_visualdl: False infer_img: doc/imgs_en/img_10.jpg save_res_path: ./output/det_db/predicts_db.txt + use_amp: False + amp_level: O2 + amp_custom_black_list: ['exp'] Architecture: name: DistillationModel @@ -188,7 +191,6 @@ Eval: channel_first: False - DetLabelEncode: # Class handling label - DetResizeForTest: -# image_shape: [736, 1280] - NormalizeImage: scale: 1./255. mean: [0.485, 0.456, 0.406] diff --git a/configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml b/configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml index ef58befd694e26704c734d7fd072ebc3370c8554..000d95e892cb8e6dcceeb7c22264c28934d1000c 100644 --- a/configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml +++ b/configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml @@ -24,6 +24,7 @@ Architecture: model_type: det Models: Student: + pretrained: model_type: det algorithm: DB Transform: null @@ -40,6 +41,7 @@ Architecture: name: DBHead k: 50 Student2: + pretrained: model_type: det algorithm: DB Transform: null @@ -91,14 +93,11 @@ Loss: - ["Student", "Student2"] maps_name: "thrink_maps" weight: 1.0 - # act: None model_name_pairs: ["Student", "Student2"] key: maps - DistillationDBLoss: weight: 1.0 model_name_list: ["Student", "Student2"] - # key: maps - # name: DBLoss balance_loss: true main_loss_type: DiceLoss alpha: 5 @@ -197,6 +196,7 @@ Train: drop_last: false batch_size_per_card: 8 num_workers: 4 + Eval: dataset: name: SimpleDataSet @@ -204,31 +204,21 @@ Eval: label_file_list: - ./train_data/icdar2015/text_localization/test_icdar2015_label.txt transforms: - - DecodeImage: - img_mode: BGR - channel_first: false - - DetLabelEncode: null - - DetResizeForTest: null - - NormalizeImage: - scale: 1./255. - mean: - - 0.485 - - 0.456 - - 0.406 - std: - - 0.229 - - 0.224 - - 0.225 - order: hwc - - ToCHWImage: null - - KeepKeys: - keep_keys: - - image - - shape - - polys - - ignore_tags + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - DetLabelEncode: # Class handling label + - DetResizeForTest: + - NormalizeImage: + scale: 1./255. 
+ mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + - ToCHWImage: + - KeepKeys: + keep_keys: ['image', 'shape', 'polys', 'ignore_tags'] loader: - shuffle: false - drop_last: false - batch_size_per_card: 1 - num_workers: 2 + shuffle: False + drop_last: False + batch_size_per_card: 1 # must be 1 + num_workers: 2 \ No newline at end of file diff --git a/configs/det/det_r18_vd_ct.yml b/configs/det/det_r18_vd_ct.yml new file mode 100644 index 0000000000000000000000000000000000000000..42922dfd22c0e49d20d50534c76fedae16b27a4a --- /dev/null +++ b/configs/det/det_r18_vd_ct.yml @@ -0,0 +1,107 @@ +Global: + use_gpu: true + epoch_num: 600 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/det_ct/ + save_epoch_step: 10 + # evaluation is run every 2000 iterations + eval_batch_step: [0,1000] + cal_metric_during_train: False + pretrained_model: ./pretrain_models/ResNet18_vd_pretrained.pdparams + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: doc/imgs_en/img623.jpg + save_res_path: ./output/det_ct/predicts_ct.txt + +Architecture: + model_type: det + algorithm: CT + Transform: + Backbone: + name: ResNet_vd + layers: 18 + Neck: + name: CTFPN + Head: + name: CT_Head + in_channels: 512 + hidden_dim: 128 + num_classes: 3 + +Loss: + name: CTLoss + +Optimizer: + name: Adam + lr: #PolynomialDecay + name: Linear + learning_rate: 0.001 + end_lr: 0. + epochs: 600 + step_each_epoch: 1254 + power: 0.9 + +PostProcess: + name: CTPostProcess + box_type: poly + +Metric: + name: CTMetric + main_indicator: f_score + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/total_text/train + label_file_list: + - ./train_data/total_text/train/train.txt + ratio_list: [1.0] + transforms: + - DecodeImage: + img_mode: RGB + channel_first: False + - CTLabelEncode: # Class handling label + - RandomScale: + - MakeShrink: + - GroupRandomHorizontalFlip: + - GroupRandomRotate: + - GroupRandomCropPadding: + - MakeCentripetalShift: + - ColorJitter: + brightness: 0.125 + saturation: 0.5 + - ToCHWImage: + - NormalizeImage: + - KeepKeys: + keep_keys: ['image', 'gt_kernel', 'training_mask', 'gt_instance', 'gt_kernel_instance', 'training_mask_distance', 'gt_distance'] # the order of the dataloader list + loader: + shuffle: True + drop_last: True + batch_size_per_card: 4 + num_workers: 8 + +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data/total_text/test + label_file_list: + - ./train_data/total_text/test/test.txt + ratio_list: [1.0] + transforms: + - DecodeImage: + img_mode: RGB + channel_first: False + - CTLabelEncode: # Class handling label + - ScaleAlignedShort: + - NormalizeImage: + order: 'hwc' + - ToCHWImage: + - KeepKeys: + keep_keys: ['image', 'shape', 'polys', 'texts'] # the order of the dataloader list + loader: + shuffle: False + drop_last: False + batch_size_per_card: 1 + num_workers: 2 diff --git a/configs/e2e/e2e_r50_vd_pg.yml b/configs/e2e/e2e_r50_vd_pg.yml index c4c5226e796a42db723ce78ef65473e357c25dc6..4642f544868f720d413f7f5242740705bc9fd0a5 100644 --- a/configs/e2e/e2e_r50_vd_pg.yml +++ b/configs/e2e/e2e_r50_vd_pg.yml @@ -13,6 +13,7 @@ Global: save_inference_dir: use_visualdl: False infer_img: + infer_visual_type: EN # two mode: EN is for english datasets, CN is for chinese datasets valid_set: totaltext # two mode: totaltext valid curved words, partvgg valid non-curved words save_res_path: ./output/pgnet_r50_vd_totaltext/predicts_pgnet.txt character_dict_path: ppocr/utils/ic15_dict.txt @@ -32,6 +33,7 @@ Architecture: name: PGFPN 
Head: name: PGHead + character_dict_path: ppocr/utils/ic15_dict.txt # the same as Global:character_dict_path Loss: name: PGLoss @@ -45,16 +47,18 @@ Optimizer: beta1: 0.9 beta2: 0.999 lr: + name: Cosine learning_rate: 0.001 + warmup_epoch: 50 regularizer: name: 'L2' - factor: 0 - + factor: 0.0001 PostProcess: name: PGPostProcess score_thresh: 0.5 mode: fast # fast or slow two ways + point_gather_mode: align # same as PGProcessTrain: point_gather_mode Metric: name: E2EMetric @@ -76,9 +80,12 @@ Train: - E2ELabelEncodeTrain: - PGProcessTrain: batch_size: 14 # same as loader: batch_size_per_card + use_resize: True + use_random_crop: False min_crop_size: 24 min_text_size: 4 max_text_size: 512 + point_gather_mode: align # two mode: align and none, align mode is better than none mode - KeepKeys: keep_keys: [ 'images', 'tcl_maps', 'tcl_label_maps', 'border_maps','direction_maps', 'training_masks', 'label_list', 'pos_list', 'pos_mask' ] # dataloader will return list in this order loader: diff --git a/configs/kie/layoutlm_series/re_layoutlmv2_xfund_zh.yml b/configs/kie/layoutlm_series/re_layoutlmv2_xfund_zh.yml index 4b330d8d58bef2d549ec7e0fea5986746a23fbe4..3e3578d8cac1aadd484f583dbe0955f7c47fca73 100644 --- a/configs/kie/layoutlm_series/re_layoutlmv2_xfund_zh.yml +++ b/configs/kie/layoutlm_series/re_layoutlmv2_xfund_zh.yml @@ -11,11 +11,11 @@ Global: save_inference_dir: use_visualdl: False seed: 2022 - infer_img: ppstructure/docs/vqa/input/zh_val_21.jpg + infer_img: ppstructure/docs/kie/input/zh_val_21.jpg save_res_path: ./output/re_layoutlmv2_xfund_zh/res/ Architecture: - model_type: vqa + model_type: kie algorithm: &algorithm "LayoutLMv2" Transform: Backbone: diff --git a/configs/kie/layoutlm_series/re_layoutxlm_xfund_zh.yml b/configs/kie/layoutlm_series/re_layoutxlm_xfund_zh.yml index a092106eea10e0457419e5551dd75819adeddf1b..2401cf317987c5614a476065191e750587bc09b5 100644 --- a/configs/kie/layoutlm_series/re_layoutxlm_xfund_zh.yml +++ b/configs/kie/layoutlm_series/re_layoutxlm_xfund_zh.yml @@ -11,11 +11,11 @@ Global: save_inference_dir: use_visualdl: False seed: 2022 - infer_img: ppstructure/docs/vqa/input/zh_val_21.jpg + infer_img: ppstructure/docs/kie/input/zh_val_21.jpg save_res_path: ./output/re_layoutxlm_xfund_zh/res/ Architecture: - model_type: vqa + model_type: kie algorithm: &algorithm "LayoutXLM" Transform: Backbone: diff --git a/configs/kie/layoutlm_series/ser_layoutlm_xfund_zh.yml b/configs/kie/layoutlm_series/ser_layoutlm_xfund_zh.yml index 8c754dd8c542b12de4ee493052407bb0da687fd0..34c7d4114062e9227d48ad5684024e2776e68447 100644 --- a/configs/kie/layoutlm_series/ser_layoutlm_xfund_zh.yml +++ b/configs/kie/layoutlm_series/ser_layoutlm_xfund_zh.yml @@ -11,11 +11,11 @@ Global: save_inference_dir: use_visualdl: False seed: 2022 - infer_img: ppstructure/docs/vqa/input/zh_val_42.jpg + infer_img: ppstructure/docs/kie/input/zh_val_42.jpg save_res_path: ./output/re_layoutlm_xfund_zh/res Architecture: - model_type: vqa + model_type: kie algorithm: &algorithm "LayoutLM" Transform: Backbone: diff --git a/configs/kie/layoutlm_series/ser_layoutlmv2_xfund_zh.yml b/configs/kie/layoutlm_series/ser_layoutlmv2_xfund_zh.yml index 3c0ffabe4465e36e5699a135a9ed0b6254cbf20b..c5e833524011b03110db3bd6f4bf845db8473922 100644 --- a/configs/kie/layoutlm_series/ser_layoutlmv2_xfund_zh.yml +++ b/configs/kie/layoutlm_series/ser_layoutlmv2_xfund_zh.yml @@ -11,11 +11,11 @@ Global: save_inference_dir: use_visualdl: False seed: 2022 - infer_img: ppstructure/docs/vqa/input/zh_val_42.jpg + infer_img: 
ppstructure/docs/kie/input/zh_val_42.jpg save_res_path: ./output/ser_layoutlmv2_xfund_zh/res/ Architecture: - model_type: vqa + model_type: kie algorithm: &algorithm "LayoutLMv2" Transform: Backbone: diff --git a/configs/kie/layoutlm_series/ser_layoutxlm_xfund_zh.yml b/configs/kie/layoutlm_series/ser_layoutxlm_xfund_zh.yml index 18f87bdebc249940ef3ec1897af3ad1a240f3705..abcfec2d16f13d4b4266633dbb509e0fba6d931f 100644 --- a/configs/kie/layoutlm_series/ser_layoutxlm_xfund_zh.yml +++ b/configs/kie/layoutlm_series/ser_layoutxlm_xfund_zh.yml @@ -11,11 +11,11 @@ Global: save_inference_dir: use_visualdl: False seed: 2022 - infer_img: ppstructure/docs/vqa/input/zh_val_42.jpg + infer_img: ppstructure/docs/kie/input/zh_val_42.jpg save_res_path: ./output/ser_layoutxlm_xfund_zh/res Architecture: - model_type: vqa + model_type: kie algorithm: &algorithm "LayoutXLM" Transform: Backbone: diff --git a/configs/kie/vi_layoutxlm/re_vi_layoutxlm_xfund_zh.yml b/configs/kie/vi_layoutxlm/re_vi_layoutxlm_xfund_zh.yml index 89f7d5c3cb74854bb9fe7e28fdc8365ed37655be..ea9f50ef56ec8b169333263c1d5e96586f9472b3 100644 --- a/configs/kie/vi_layoutxlm/re_vi_layoutxlm_xfund_zh.yml +++ b/configs/kie/vi_layoutxlm/re_vi_layoutxlm_xfund_zh.yml @@ -11,11 +11,13 @@ Global: save_inference_dir: use_visualdl: False seed: 2022 - infer_img: ppstructure/docs/vqa/input/zh_val_21.jpg + infer_img: ppstructure/docs/kie/input/zh_val_21.jpg save_res_path: ./output/re/xfund_zh/with_gt + kie_rec_model_dir: + kie_det_model_dir: Architecture: - model_type: vqa + model_type: kie algorithm: &algorithm "LayoutXLM" Transform: Backbone: diff --git a/configs/kie/vi_layoutxlm/re_vi_layoutxlm_xfund_zh_udml.yml b/configs/kie/vi_layoutxlm/re_vi_layoutxlm_xfund_zh_udml.yml index c1bfdb6c6cee1c9618602016fec6cc1ec0a7b3bf..b96528d2738e7cfb2575feca4146af1eed0c5d2f 100644 --- a/configs/kie/vi_layoutxlm/re_vi_layoutxlm_xfund_zh_udml.yml +++ b/configs/kie/vi_layoutxlm/re_vi_layoutxlm_xfund_zh_udml.yml @@ -11,11 +11,11 @@ Global: save_inference_dir: use_visualdl: False seed: 2022 - infer_img: ppstructure/docs/vqa/input/zh_val_21.jpg + infer_img: ppstructure/docs/kie/input/zh_val_21.jpg save_res_path: ./output/re/xfund_zh/with_gt Architecture: - model_type: &model_type "vqa" + model_type: &model_type "kie" name: DistillationModel algorithm: Distillation Models: diff --git a/configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml b/configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml index d54125db64cef289457c4b855fe9bded3fa4149f..b8aa44dde8fd3fdc4ff14bbca20513b95178cdb0 100644 --- a/configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml +++ b/configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml @@ -11,16 +11,18 @@ Global: save_inference_dir: use_visualdl: False seed: 2022 - infer_img: ppstructure/docs/vqa/input/zh_val_42.jpg + infer_img: ppstructure/docs/kie/input/zh_val_42.jpg # if you want to predict using the groundtruth ocr info, # you can use the following config # infer_img: train_data/XFUND/zh_val/val.json # infer_mode: False save_res_path: ./output/ser/xfund_zh/res + kie_rec_model_dir: + kie_det_model_dir: Architecture: - model_type: vqa + model_type: kie algorithm: &algorithm "LayoutXLM" Transform: Backbone: diff --git a/configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh_udml.yml b/configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh_udml.yml index 6f0961c8e80312ab26a8d1649bf2bb10f8792efb..238bbd2b2c7083b5534062afd3e6c11a87494a56 100644 --- a/configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh_udml.yml +++ 
b/configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh_udml.yml @@ -11,12 +11,12 @@ Global: save_inference_dir: use_visualdl: False seed: 2022 - infer_img: ppstructure/docs/vqa/input/zh_val_42.jpg + infer_img: ppstructure/docs/kie/input/zh_val_42.jpg save_res_path: ./output/ser_layoutxlm_xfund_zh/res Architecture: - model_type: &model_type "vqa" + model_type: &model_type "kie" name: DistillationModel algorithm: Distillation Models: diff --git a/configs/rec/PP-OCRv3/multi_language/arabic_PP-OCRv3_rec.yml b/configs/rec/PP-OCRv3/multi_language/arabic_PP-OCRv3_rec.yml index 0ad1ab0adc189102ff07094fcda92d4f9ea9c662..8c650bd826d127f25c907f97d20d1a52f67f9203 100644 --- a/configs/rec/PP-OCRv3/multi_language/arabic_PP-OCRv3_rec.yml +++ b/configs/rec/PP-OCRv3/multi_language/arabic_PP-OCRv3_rec.yml @@ -12,7 +12,7 @@ Global: checkpoints: save_inference_dir: use_visualdl: false - infer_img: doc/imgs_words/ch/word_1.jpg + infer_img: ./doc/imgs_words/arabic/ar_2.jpg character_dict_path: ppocr/utils/dict/arabic_dict.txt max_text_length: &max_text_length 25 infer_mode: false diff --git a/configs/rec/rec_r31_robustscanner.yml b/configs/rec/rec_r31_robustscanner.yml new file mode 100644 index 0000000000000000000000000000000000000000..40d39aee3c42c18085ace035944dba057b923245 --- /dev/null +++ b/configs/rec/rec_r31_robustscanner.yml @@ -0,0 +1,109 @@ +Global: + use_gpu: true + epoch_num: 5 + log_smooth_window: 20 + print_batch_step: 20 + save_model_dir: ./output/rec/rec_r31_robustscanner/ + save_epoch_step: 1 + # evaluation is run every 2000 iterations + eval_batch_step: [0, 2000] + cal_metric_during_train: True + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: ./inference/rec_inference + # for data or label process + character_dict_path: ppocr/utils/dict90.txt + max_text_length: &max_text_length 40 + infer_mode: False + use_space_char: False + rm_symbol: True + save_res_path: ./output/rec/predicts_robustscanner.txt + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Piecewise + decay_epochs: [3, 4] + values: [0.001, 0.0001, 0.00001] + regularizer: + name: 'L2' + factor: 0 + +Architecture: + model_type: rec + algorithm: RobustScanner + Transform: + Backbone: + name: ResNet31 + init_type: KaimingNormal + Head: + name: RobustScannerHead + enc_outchannles: 128 + hybrid_dec_rnn_layers: 2 + hybrid_dec_dropout: 0 + position_dec_rnn_layers: 2 + start_idx: 91 + mask: True + padding_idx: 92 + encode_value: False + max_text_length: *max_text_length + +Loss: + name: SARLoss + +PostProcess: + name: SARLabelDecode + +Metric: + name: RecMetric + is_filter: True + + +Train: + dataset: + name: LMDBDataSet + data_dir: ./train_data/data_lmdb_release/training/ + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - SARLabelEncode: # Class handling label + - RobustScannerRecResizeImg: + image_shape: [3, 48, 48, 160] # h:48 w:[48,160] + width_downsample_ratio: 0.25 + max_text_length: *max_text_length + - KeepKeys: + keep_keys: ['image', 'label', 'valid_ratio', 'word_positons'] # dataloader will return list in this order + loader: + shuffle: True + batch_size_per_card: 64 + drop_last: True + num_workers: 8 + use_shared_memory: False + +Eval: + dataset: + name: LMDBDataSet + data_dir: ./train_data/data_lmdb_release/evaluation/ + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - SARLabelEncode: # Class handling label + - RobustScannerRecResizeImg: + image_shape: [3, 48, 48, 160] + max_text_length: 
*max_text_length + width_downsample_ratio: 0.25 + - KeepKeys: + keep_keys: ['image', 'label', 'valid_ratio', 'word_positons'] # dataloader will return list in this order + loader: + shuffle: False + drop_last: False + batch_size_per_card: 64 + num_workers: 4 + use_shared_memory: False + diff --git a/configs/sr/sr_tsrn_transformer_strock.yml b/configs/sr/sr_tsrn_transformer_strock.yml new file mode 100644 index 0000000000000000000000000000000000000000..c8c308c4337ddbb2933714391762efbfda44bf32 --- /dev/null +++ b/configs/sr/sr_tsrn_transformer_strock.yml @@ -0,0 +1,85 @@ +Global: + use_gpu: true + epoch_num: 500 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/sr/sr_tsrn_transformer_strock/ + save_epoch_step: 3 + # evaluation is run every 2000 iterations + eval_batch_step: [0, 1000] + cal_metric_during_train: False + pretrained_model: + checkpoints: + save_inference_dir: sr_output + use_visualdl: False + infer_img: doc/imgs_words_en/word_52.png + # for data or label process + character_dict_path: ./train_data/srdata/english_decomposition.txt + max_text_length: 100 + infer_mode: False + use_space_char: False + save_res_path: ./output/sr/predicts_gestalt.txt + +Optimizer: + name: Adam + beta1: 0.5 + beta2: 0.999 + clip_norm: 0.25 + lr: + learning_rate: 0.0001 + +Architecture: + model_type: sr + algorithm: Gestalt + Transform: + name: TSRN + STN: True + infer_mode: False + +Loss: + name: StrokeFocusLoss + character_dict_path: ./train_data/srdata/english_decomposition.txt + +PostProcess: + name: None + +Metric: + name: SRMetric + main_indicator: all + +Train: + dataset: + name: LMDBDataSetSR + data_dir: ./train_data/srdata/train + transforms: + - SRResize: + imgH: 32 + imgW: 128 + down_sample_scale: 2 + - SRLabelEncode: # Class handling label + - KeepKeys: + keep_keys: ['img_lr', 'img_hr', 'length', 'input_tensor', 'label'] # dataloader will return list in this order + loader: + shuffle: False + batch_size_per_card: 16 + drop_last: True + num_workers: 4 + +Eval: + dataset: + name: LMDBDataSetSR + data_dir: ./train_data/srdata/test + transforms: + - SRResize: + imgH: 32 + imgW: 128 + down_sample_scale: 2 + - SRLabelEncode: # Class handling label + - KeepKeys: + keep_keys: ['img_lr', 'img_hr','length', 'input_tensor', 'label'] # dataloader will return list in this order + loader: + shuffle: False + drop_last: False + batch_size_per_card: 16 + num_workers: 4 + diff --git a/configs/table/SLANet.yml b/configs/table/SLANet.yml new file mode 100644 index 0000000000000000000000000000000000000000..a896614556e36f77bd784218b6c2f29914219dbe --- /dev/null +++ b/configs/table/SLANet.yml @@ -0,0 +1,143 @@ +Global: + use_gpu: true + epoch_num: 100 + log_smooth_window: 20 + print_batch_step: 20 + save_model_dir: ./output/SLANet + save_epoch_step: 400 + # evaluation is run every 1000 iterations after the 0th iteration + eval_batch_step: [0, 1000] + cal_metric_during_train: True + pretrained_model: + checkpoints: + save_inference_dir: ./output/SLANet/infer + use_visualdl: False + infer_img: ppstructure/docs/table/table.jpg + # for data or label process + character_dict_path: ppocr/utils/dict/table_structure_dict.txt + character_type: en + max_text_length: &max_text_length 500 + box_format: &box_format 'xyxy' # 'xywh', 'xyxy', 'xyxyxyxy' + infer_mode: False + use_sync_bn: True + save_res_path: 'output/infer' + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + clip_norm: 5.0 + lr: + name: Piecewise + learning_rate: 0.001 + decay_epochs : [40, 50] + values : [0.001, 0.0001, 0.00005] + 
regularizer: + name: 'L2' + factor: 0.00000 + +Architecture: + model_type: table + algorithm: SLANet + Backbone: + name: PPLCNet + scale: 1.0 + pretrained: true + use_ssld: true + Neck: + name: CSPPAN + out_channels: 96 + Head: + name: SLAHead + hidden_size: 256 + max_text_length: *max_text_length + loc_reg_num: &loc_reg_num 4 + +Loss: + name: SLALoss + structure_weight: 1.0 + loc_weight: 2.0 + loc_loss: smooth_l1 + +PostProcess: + name: TableLabelDecode + merge_no_span_structure: &merge_no_span_structure True + +Metric: + name: TableMetric + main_indicator: acc + compute_bbox_metric: False + loc_reg_num: *loc_reg_num + box_format: *box_format + +Train: + dataset: + name: PubTabDataSet + data_dir: train_data/table/pubtabnet/train/ + label_file_list: [train_data/table/pubtabnet/PubTabNet_2.0.0_train.jsonl] + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - TableLabelEncode: + learn_empty_box: False + merge_no_span_structure: *merge_no_span_structure + replace_empty_cell_token: False + loc_reg_num: *loc_reg_num + max_text_length: *max_text_length + - TableBoxEncode: + in_box_format: *box_format + out_box_format: *box_format + - ResizeTableImage: + max_len: 488 + - NormalizeImage: + scale: 1./255. + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + - PaddingTableImage: + size: [488, 488] + - ToCHWImage: + - KeepKeys: + keep_keys: [ 'image', 'structure', 'bboxes', 'bbox_masks', 'shape' ] + loader: + shuffle: True + batch_size_per_card: 48 + drop_last: True + num_workers: 1 + +Eval: + dataset: + name: PubTabDataSet + data_dir: train_data/table/pubtabnet/val/ + label_file_list: [train_data/table/pubtabnet/PubTabNet_2.0.0_val.jsonl] + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - TableLabelEncode: + learn_empty_box: False + merge_no_span_structure: *merge_no_span_structure + replace_empty_cell_token: False + loc_reg_num: *loc_reg_num + max_text_length: *max_text_length + - TableBoxEncode: + in_box_format: *box_format + out_box_format: *box_format + - ResizeTableImage: + max_len: 488 + - NormalizeImage: + scale: 1./255. 
+ mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + - PaddingTableImage: + size: [488, 488] + - ToCHWImage: + - KeepKeys: + keep_keys: [ 'image', 'structure', 'bboxes', 'bbox_masks', 'shape' ] + loader: + shuffle: False + drop_last: False + batch_size_per_card: 48 + num_workers: 1 diff --git a/configs/table/SLANet_ch.yml b/configs/table/SLANet_ch.yml new file mode 100644 index 0000000000000000000000000000000000000000..3b1e5c6bd9dd4cd2a084d557a1285983a56bdf2a --- /dev/null +++ b/configs/table/SLANet_ch.yml @@ -0,0 +1,141 @@ +Global: + use_gpu: True + epoch_num: 400 + log_smooth_window: 20 + print_batch_step: 20 + save_model_dir: ./output/SLANet_ch + save_epoch_step: 400 + # evaluation is run every 331 iterations after the 0th iteration + eval_batch_step: [0, 331] + cal_metric_during_train: True + pretrained_model: + checkpoints: + save_inference_dir: ./output/SLANet_ch/infer + use_visualdl: False + infer_img: ppstructure/docs/table/table.jpg + # for data or label process + character_dict_path: ppocr/utils/dict/table_structure_dict_ch.txt + character_type: en + max_text_length: &max_text_length 500 + box_format: &box_format xyxyxyxy # 'xywh', 'xyxy', 'xyxyxyxy' + infer_mode: False + use_sync_bn: True + save_res_path: output/infer + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + clip_norm: 5.0 + lr: + learning_rate: 0.001 + regularizer: + name: 'L2' + factor: 0.00000 + +Architecture: + model_type: table + algorithm: SLANet + Backbone: + name: PPLCNet + scale: 1.0 + pretrained: True + use_ssld: True + Neck: + name: CSPPAN + out_channels: 96 + Head: + name: SLAHead + hidden_size: 256 + max_text_length: *max_text_length + loc_reg_num: &loc_reg_num 8 + +Loss: + name: SLALoss + structure_weight: 1.0 + loc_weight: 2.0 + loc_loss: smooth_l1 + +PostProcess: + name: TableLabelDecode + merge_no_span_structure: &merge_no_span_structure True + +Metric: + name: TableMetric + main_indicator: acc + compute_bbox_metric: False + loc_reg_num: *loc_reg_num + box_format: *box_format + del_thead_tbody: True + +Train: + dataset: + name: PubTabDataSet + data_dir: train_data/table/train/ + label_file_list: [train_data/table/train.txt] + transforms: + - DecodeImage: + img_mode: BGR + channel_first: False + - TableLabelEncode: + learn_empty_box: False + merge_no_span_structure: *merge_no_span_structure + replace_empty_cell_token: False + loc_reg_num: *loc_reg_num + max_text_length: *max_text_length + - TableBoxEncode: + in_box_format: *box_format + out_box_format: *box_format + - ResizeTableImage: + max_len: 488 + - NormalizeImage: + scale: 1./255. + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + - PaddingTableImage: + size: [488, 488] + - ToCHWImage: + - KeepKeys: + keep_keys: [ 'image', 'structure', 'bboxes', 'bbox_masks', 'shape' ] + loader: + shuffle: True + batch_size_per_card: 48 + drop_last: True + num_workers: 1 + +Eval: + dataset: + name: PubTabDataSet + data_dir: train_data/table/val/ + label_file_list: [train_data/table/val.txt] + transforms: + - DecodeImage: + img_mode: BGR + channel_first: False + - TableLabelEncode: + learn_empty_box: False + merge_no_span_structure: *merge_no_span_structure + replace_empty_cell_token: False + loc_reg_num: *loc_reg_num + max_text_length: *max_text_length + - TableBoxEncode: + in_box_format: *box_format + out_box_format: *box_format + - ResizeTableImage: + max_len: 488 + - NormalizeImage: + scale: 1./255. 
+ mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + - PaddingTableImage: + size: [488, 488] + - ToCHWImage: + - KeepKeys: + keep_keys: [ 'image', 'structure', 'bboxes', 'bbox_masks', 'shape' ] + loader: + shuffle: False + drop_last: False + batch_size_per_card: 48 + num_workers: 1 diff --git a/configs/table/table_master.yml b/configs/table/table_master.yml index b8daf3630755e61322665b6fc5f830e4a45875b8..df437f7c95523c5fe12f7166d011b4ad8473628b 100755 --- a/configs/table/table_master.yml +++ b/configs/table/table_master.yml @@ -8,16 +8,15 @@ Global: eval_batch_step: [0, 6259] cal_metric_during_train: true pretrained_model: null - checkpoints: + checkpoints: save_inference_dir: output/table_master/infer use_visualdl: false infer_img: ppstructure/docs/table/table.jpg save_res_path: ./output/table_master character_dict_path: ppocr/utils/dict/table_master_structure_dict.txt infer_mode: false - max_text_length: 500 - process_total_num: 0 - process_cut_num: 0 + max_text_length: &max_text_length 500 + box_format: &box_format 'xywh' # 'xywh', 'xyxy', 'xyxyxyxy' Optimizer: @@ -52,7 +51,8 @@ Architecture: headers: 8 dropout: 0 d_ff: 2024 - max_text_length: 500 + max_text_length: *max_text_length + loc_reg_num: &loc_reg_num 4 Loss: name: TableMasterLoss @@ -61,11 +61,13 @@ Loss: PostProcess: name: TableMasterLabelDecode box_shape: pad + merge_no_span_structure: &merge_no_span_structure True Metric: name: TableMetric main_indicator: acc compute_bbox_metric: False + box_format: *box_format Train: dataset: @@ -78,15 +80,18 @@ Train: channel_first: False - TableMasterLabelEncode: learn_empty_box: False - merge_no_span_structure: True + merge_no_span_structure: *merge_no_span_structure replace_empty_cell_token: True + loc_reg_num: *loc_reg_num + max_text_length: *max_text_length - ResizeTableImage: max_len: 480 resize_bboxes: True - PaddingTableImage: size: [480, 480] - TableBoxEncode: - use_xywh: True + in_box_format: *box_format + out_box_format: *box_format - NormalizeImage: scale: 1./255. mean: [0.5, 0.5, 0.5] @@ -112,15 +117,18 @@ Eval: channel_first: False - TableMasterLabelEncode: learn_empty_box: False - merge_no_span_structure: True + merge_no_span_structure: *merge_no_span_structure replace_empty_cell_token: True + loc_reg_num: *loc_reg_num + max_text_length: *max_text_length - ResizeTableImage: max_len: 480 resize_bboxes: True - PaddingTableImage: size: [480, 480] - TableBoxEncode: - use_xywh: True + in_box_format: *box_format + out_box_format: *box_format - NormalizeImage: scale: 1./255. 
mean: [0.5, 0.5, 0.5] diff --git a/configs/table/table_mv3.yml b/configs/table/table_mv3.yml index 66c1c83e124d4e94e1f4036a494dfd80c840f229..9355a236e15b60db18e8715c2702701fd5d36c71 100755 --- a/configs/table/table_mv3.yml +++ b/configs/table/table_mv3.yml @@ -17,10 +17,9 @@ Global: # for data or label process character_dict_path: ppocr/utils/dict/table_structure_dict.txt character_type: en - max_text_length: 800 + max_text_length: &max_text_length 500 + box_format: &box_format 'xyxy' # 'xywh', 'xyxy', 'xyxyxyxy' infer_mode: False - process_total_num: 0 - process_cut_num: 0 Optimizer: name: Adam @@ -39,12 +38,14 @@ Architecture: Backbone: name: MobileNetV3 scale: 1.0 - model_name: large + model_name: small + disable_se: true Head: name: TableAttentionHead hidden_size: 256 loc_type: 2 - max_text_length: 800 + max_text_length: *max_text_length + loc_reg_num: &loc_reg_num 4 Loss: name: TableAttentionLoss @@ -72,6 +73,8 @@ Train: learn_empty_box: False merge_no_span_structure: False replace_empty_cell_token: False + loc_reg_num: *loc_reg_num + max_text_length: *max_text_length - TableBoxEncode: - ResizeTableImage: max_len: 488 @@ -87,15 +90,15 @@ Train: keep_keys: [ 'image', 'structure', 'bboxes', 'bbox_masks', 'shape' ] loader: shuffle: True - batch_size_per_card: 32 + batch_size_per_card: 48 drop_last: True num_workers: 1 Eval: dataset: name: PubTabDataSet - data_dir: /home/zhoujun20/table/PubTabNe/pubtabnet/val/ - label_file_list: [/home/zhoujun20/table/PubTabNe/pubtabnet/val_500.jsonl] + data_dir: train_data/table/pubtabnet/val/ + label_file_list: [train_data/table/pubtabnet/PubTabNet_2.0.0_val.jsonl] transforms: - DecodeImage: # load image img_mode: BGR @@ -104,6 +107,8 @@ Eval: learn_empty_box: False merge_no_span_structure: False replace_empty_cell_token: False + loc_reg_num: *loc_reg_num + max_text_length: *max_text_length - TableBoxEncode: - ResizeTableImage: max_len: 488 @@ -120,5 +125,5 @@ Eval: loader: shuffle: False drop_last: False - batch_size_per_card: 16 + batch_size_per_card: 48 num_workers: 1 diff --git a/deploy/android_demo/app/src/main/cpp/native.cpp b/deploy/android_demo/app/src/main/cpp/native.cpp index ced932556f09244d1e9e962e7b75461203a7cc3a..4961e5ecf141bb50701ecf9c3654a54f062937ce 100644 --- a/deploy/android_demo/app/src/main/cpp/native.cpp +++ b/deploy/android_demo/app/src/main/cpp/native.cpp @@ -47,7 +47,7 @@ str_to_cpu_mode(const std::string &cpu_mode) { std::string upper_key; std::transform(cpu_mode.cbegin(), cpu_mode.cend(), upper_key.begin(), ::toupper); - auto index = cpu_mode_map.find(upper_key); + auto index = cpu_mode_map.find(upper_key.c_str()); if (index == cpu_mode_map.end()) { LOGE("cpu_mode not found %s", upper_key.c_str()); return paddle::lite_api::LITE_POWER_HIGH; @@ -116,4 +116,4 @@ Java_com_baidu_paddle_lite_demo_ocr_OCRPredictorNative_release( ppredictor::OCR_PPredictor *ppredictor = (ppredictor::OCR_PPredictor *)java_pointer; delete ppredictor; -} \ No newline at end of file +} diff --git a/deploy/android_demo/app/src/main/java/com/baidu/paddle/lite/demo/ocr/OCRPredictorNative.java b/deploy/android_demo/app/src/main/java/com/baidu/paddle/lite/demo/ocr/OCRPredictorNative.java index 622da2a3f9a1233167e777e62b687c1f246df01f..41fa183dea1d968582dbedf4e831c55b043ae00f 100644 --- a/deploy/android_demo/app/src/main/java/com/baidu/paddle/lite/demo/ocr/OCRPredictorNative.java +++ b/deploy/android_demo/app/src/main/java/com/baidu/paddle/lite/demo/ocr/OCRPredictorNative.java @@ -54,7 +54,7 @@ public class OCRPredictorNative { } public void destory() { - if 
(nativePointer > 0) { + if (nativePointer != 0) { release(nativePointer); nativePointer = 0; } diff --git a/deploy/cpp_infer/docs/windows_vs2019_build.md b/deploy/cpp_infer/docs/windows_vs2019_build.md index 4f391d925008b4bffcbd123e937eb608f502c646..bcaefa46f83a30a4c232add78dc2e9f521b9f84f 100644 --- a/deploy/cpp_infer/docs/windows_vs2019_build.md +++ b/deploy/cpp_infer/docs/windows_vs2019_build.md @@ -109,8 +109,10 @@ CUDA_LIB、CUDNN_LIB、TENSORRT_DIR、WITH_GPU、WITH_TENSORRT 运行之前,将下面文件拷贝到`build/Release/`文件夹下 1. `paddle_inference/paddle/lib/paddle_inference.dll` -2. `opencv/build/x64/vc15/bin/opencv_world455.dll` -3. 如果使用openblas版本的预测库还需要拷贝 `paddle_inference/third_party/install/openblas/lib/openblas.dll` +2. `paddle_inference/third_party/install/onnxruntime/lib/onnxruntime.dll` +3. `paddle_inference/third_party/install/paddle2onnx/lib/paddle2onnx.dll` +4. `opencv/build/x64/vc15/bin/opencv_world455.dll` +5. 如果使用openblas版本的预测库还需要拷贝 `paddle_inference/third_party/install/openblas/lib/openblas.dll` ### Step4: 预测 diff --git a/deploy/cpp_infer/include/args.h b/deploy/cpp_infer/include/args.h index 473ff25d981f8409f60a43940aaaec376375adf5..f7fac9c92c421ca85818b2d04097ce8e55ea117e 100644 --- a/deploy/cpp_infer/include/args.h +++ b/deploy/cpp_infer/include/args.h @@ -30,7 +30,8 @@ DECLARE_string(image_dir); DECLARE_string(type); // detection related DECLARE_string(det_model_dir); -DECLARE_int32(max_side_len); +DECLARE_string(limit_type); +DECLARE_int32(limit_side_len); DECLARE_double(det_db_thresh); DECLARE_double(det_db_box_thresh); DECLARE_double(det_db_unclip_ratio); @@ -48,7 +49,14 @@ DECLARE_int32(rec_batch_num); DECLARE_string(rec_char_dict_path); DECLARE_int32(rec_img_h); DECLARE_int32(rec_img_w); +// structure model related +DECLARE_string(table_model_dir); +DECLARE_int32(table_max_len); +DECLARE_int32(table_batch_num); +DECLARE_string(table_char_dict_path); +DECLARE_bool(merge_no_span_structure); // forward related DECLARE_bool(det); DECLARE_bool(rec); DECLARE_bool(cls); +DECLARE_bool(table); \ No newline at end of file diff --git a/deploy/cpp_infer/include/ocr_det.h b/deploy/cpp_infer/include/ocr_det.h index 7efd4d8f0f4ccb705fc34695bb9843e0b6af5a9b..d1421b103b28b44e15a7df53a63fd893ca60e529 100644 --- a/deploy/cpp_infer/include/ocr_det.h +++ b/deploy/cpp_infer/include/ocr_det.h @@ -41,8 +41,8 @@ public: explicit DBDetector(const std::string &model_dir, const bool &use_gpu, const int &gpu_id, const int &gpu_mem, const int &cpu_math_library_num_threads, - const bool &use_mkldnn, const int &max_side_len, - const double &det_db_thresh, + const bool &use_mkldnn, const string &limit_type, + const int &limit_side_len, const double &det_db_thresh, const double &det_db_box_thresh, const double &det_db_unclip_ratio, const std::string &det_db_score_mode, @@ -54,7 +54,8 @@ public: this->cpu_math_library_num_threads_ = cpu_math_library_num_threads; this->use_mkldnn_ = use_mkldnn; - this->max_side_len_ = max_side_len; + this->limit_type_ = limit_type; + this->limit_side_len_ = limit_side_len; this->det_db_thresh_ = det_db_thresh; this->det_db_box_thresh_ = det_db_box_thresh; @@ -84,7 +85,8 @@ private: int cpu_math_library_num_threads_ = 4; bool use_mkldnn_ = false; - int max_side_len_ = 960; + string limit_type_ = "max"; + int limit_side_len_ = 960; double det_db_thresh_ = 0.3; double det_db_box_thresh_ = 0.5; @@ -106,7 +108,7 @@ private: Permute permute_op_; // post-process - PostProcessor post_processor_; + DBPostProcessor post_processor_; }; } // namespace PaddleOCR \ No newline at end of file diff --git 
a/deploy/cpp_infer/include/paddleocr.h b/deploy/cpp_infer/include/paddleocr.h index 6db9d86cb152bfcc708a87c6a98be59d88a5d8db..a2c60b14acceaa90a8d8e4a70ccc50f02f254eb6 100644 --- a/deploy/cpp_infer/include/paddleocr.h +++ b/deploy/cpp_infer/include/paddleocr.h @@ -47,11 +47,7 @@ public: ocr(std::vector cv_all_img_names, bool det = true, bool rec = true, bool cls = true); -private: - DBDetector *detector_ = nullptr; - Classifier *classifier_ = nullptr; - CRNNRecognizer *recognizer_ = nullptr; - +protected: void det(cv::Mat img, std::vector &ocr_results, std::vector ×); void rec(std::vector img_list, @@ -62,6 +58,11 @@ private: std::vector ×); void log(std::vector &det_times, std::vector &rec_times, std::vector &cls_times, int img_num); + +private: + DBDetector *detector_ = nullptr; + Classifier *classifier_ = nullptr; + CRNNRecognizer *recognizer_ = nullptr; }; } // namespace PaddleOCR diff --git a/deploy/cpp_infer/include/paddlestructure.h b/deploy/cpp_infer/include/paddlestructure.h new file mode 100644 index 0000000000000000000000000000000000000000..6d2c8b7d203a05f531b8d038d885061c42897373 --- /dev/null +++ b/deploy/cpp_infer/include/paddlestructure.h @@ -0,0 +1,76 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "opencv2/core.hpp" +#include "opencv2/imgcodecs.hpp" +#include "opencv2/imgproc.hpp" +#include "paddle_api.h" +#include "paddle_inference_api.h" +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +using namespace paddle_infer; + +namespace PaddleOCR { + +class PaddleStructure : public PPOCR { +public: + explicit PaddleStructure(); + ~PaddleStructure(); + std::vector> + structure(std::vector cv_all_img_names, bool layout = false, + bool table = true); + +private: + StructureTableRecognizer *recognizer_ = nullptr; + + void table(cv::Mat img, StructurePredictResult &structure_result, + std::vector &time_info_table, + std::vector &time_info_det, + std::vector &time_info_rec, + std::vector &time_info_cls); + std::string rebuild_table(std::vector rec_html_tags, + std::vector> rec_boxes, + std::vector &ocr_result); + + float iou(std::vector &box1, std::vector &box2); + float dis(std::vector &box1, std::vector &box2); + + static bool comparison_dis(const std::vector &dis1, + const std::vector &dis2) { + if (dis1[1] < dis2[1]) { + return true; + } else if (dis1[1] == dis2[1]) { + return dis1[0] < dis2[0]; + } else { + return false; + } + } +}; + +} // namespace PaddleOCR \ No newline at end of file diff --git a/deploy/cpp_infer/include/postprocess_op.h b/deploy/cpp_infer/include/postprocess_op.h index 4a98b151bdcc53e2ab3fbda1dca55dd9746bd86c..f5db52a6097f0fb916fc96fd8c76095f2ed1a9fa 100644 --- a/deploy/cpp_infer/include/postprocess_op.h +++ b/deploy/cpp_infer/include/postprocess_op.h @@ -34,7 +34,7 @@ using namespace std; namespace PaddleOCR { -class PostProcessor { +class DBPostProcessor { public: void GetContourArea(const std::vector> &box, float unclip_ratio, float &distance); @@ -90,4 +90,20 @@ private: } }; +class TablePostProcessor { +public: + void init(std::string label_path, bool merge_no_span_structure = true); + void Run(std::vector &loc_preds, std::vector &structure_probs, + std::vector &rec_scores, std::vector &loc_preds_shape, + std::vector &structure_probs_shape, + std::vector> &rec_html_tag_batch, + std::vector>> &rec_boxes_batch, + std::vector &width_list, std::vector &height_list); + +private: + std::vector label_list_; + std::string end = "eos"; + std::string beg = "sos"; +}; + } // namespace PaddleOCR diff --git a/deploy/cpp_infer/include/preprocess_op.h b/deploy/cpp_infer/include/preprocess_op.h index 31217de301573e078f8e11ef88657f369ede9b31..078f19d5b808c81e88d7aa464d6bfaca7fe1b14e 100644 --- a/deploy/cpp_infer/include/preprocess_op.h +++ b/deploy/cpp_infer/include/preprocess_op.h @@ -48,11 +48,12 @@ class PermuteBatch { public: virtual void Run(const std::vector imgs, float *data); }; - + class ResizeImgType0 { public: - virtual void Run(const cv::Mat &img, cv::Mat &resize_img, int max_size_len, - float &ratio_h, float &ratio_w, bool use_tensorrt); + virtual void Run(const cv::Mat &img, cv::Mat &resize_img, string limit_type, + int limit_side_len, float &ratio_h, float &ratio_w, + bool use_tensorrt); }; class CrnnResizeImg { @@ -69,4 +70,16 @@ public: const std::vector &rec_image_shape = {3, 48, 192}); }; +class TableResizeImg { +public: + virtual void Run(const cv::Mat &img, cv::Mat &resize_img, + const int max_len = 488); +}; + +class TablePadImg { +public: + virtual void Run(const cv::Mat &img, cv::Mat &resize_img, + const int max_len = 488); +}; + } // namespace PaddleOCR \ No newline at end of file diff --git a/deploy/cpp_infer/include/structure_table.h 
b/deploy/cpp_infer/include/structure_table.h new file mode 100644 index 0000000000000000000000000000000000000000..c09e65654a7c8a4deb6729ddfd876531020f306b --- /dev/null +++ b/deploy/cpp_infer/include/structure_table.h @@ -0,0 +1,101 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "opencv2/core.hpp" +#include "opencv2/imgcodecs.hpp" +#include "opencv2/imgproc.hpp" +#include "paddle_api.h" +#include "paddle_inference_api.h" +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +using namespace paddle_infer; + +namespace PaddleOCR { + +class StructureTableRecognizer { +public: + explicit StructureTableRecognizer( + const std::string &model_dir, const bool &use_gpu, const int &gpu_id, + const int &gpu_mem, const int &cpu_math_library_num_threads, + const bool &use_mkldnn, const string &label_path, + const bool &use_tensorrt, const std::string &precision, + const int &table_batch_num, const int &table_max_len, + const bool &merge_no_span_structure) { + this->use_gpu_ = use_gpu; + this->gpu_id_ = gpu_id; + this->gpu_mem_ = gpu_mem; + this->cpu_math_library_num_threads_ = cpu_math_library_num_threads; + this->use_mkldnn_ = use_mkldnn; + this->use_tensorrt_ = use_tensorrt; + this->precision_ = precision; + this->table_batch_num_ = table_batch_num; + this->table_max_len_ = table_max_len; + + this->post_processor_.init(label_path, merge_no_span_structure); + LoadModel(model_dir); + } + + // Load Paddle inference model + void LoadModel(const std::string &model_dir); + + void Run(std::vector img_list, + std::vector> &rec_html_tags, + std::vector &rec_scores, + std::vector>> &rec_boxes, + std::vector ×); + +private: + std::shared_ptr predictor_; + + bool use_gpu_ = false; + int gpu_id_ = 0; + int gpu_mem_ = 4000; + int cpu_math_library_num_threads_ = 4; + bool use_mkldnn_ = false; + int table_max_len_ = 488; + + std::vector mean_ = {0.485f, 0.456f, 0.406f}; + std::vector scale_ = {1 / 0.229f, 1 / 0.224f, 1 / 0.225f}; + bool is_scale_ = true; + + bool use_tensorrt_ = false; + std::string precision_ = "fp32"; + int table_batch_num_ = 1; + + // pre-process + TableResizeImg resize_op_; + Normalize normalize_op_; + PermuteBatch permute_op_; + TablePadImg pad_op_; + + // post-process + TablePostProcessor post_processor_; + +}; // class StructureTableRecognizer + +} // namespace PaddleOCR \ No newline at end of file diff --git a/deploy/cpp_infer/include/utility.h b/deploy/cpp_infer/include/utility.h index eb18c0624492e9b47de156d60611d637d8dca6c3..85b280fe25a46be70dba529891c3470a729dfbf1 100644 --- a/deploy/cpp_infer/include/utility.h +++ b/deploy/cpp_infer/include/utility.h @@ -40,6 +40,15 @@ struct OCRPredictResult { int cls_label = -1; }; +struct StructurePredictResult { + std::vector box; + std::vector> cell_box; + std::string type; + std::vector text_res; + std::string html; + float html_score = -1; +}; + class Utility { public: static 
std::vector ReadDict(const std::string &path); @@ -48,6 +57,10 @@ public: const std::vector &ocr_result, const std::string &save_path); + static void VisualizeBboxes(const cv::Mat &srcimg, + const StructurePredictResult &structure_result, + const std::string &save_path); + template inline static size_t argmax(ForwardIterator first, ForwardIterator last) { return std::distance(first, std::max_element(first, last)); @@ -68,6 +81,25 @@ public: static void CreateDir(const std::string &path); static void print_result(const std::vector &ocr_result); + + static cv::Mat crop_image(cv::Mat &img, std::vector &area); + + static void sorted_boxes(std::vector &ocr_result); + + static std::vector xyxyxyxy2xyxy(std::vector> &box); + static std::vector xyxyxyxy2xyxy(std::vector &box); + +private: + static bool comparison_box(const OCRPredictResult &result1, + const OCRPredictResult &result2) { + if (result1.box[0][1] < result2.box[0][1]) { + return true; + } else if (result1.box[0][1] == result2.box[0][1]) { + return result1.box[0][0] < result2.box[0][0]; + } else { + return false; + } + } }; } // namespace PaddleOCR \ No newline at end of file diff --git a/deploy/cpp_infer/readme.md b/deploy/cpp_infer/readme.md index a87db7e6596bc2528bfb4a93c3170ebf0482ccad..2974f3227aa6f9cdd967665addc905f7b902bac2 100644 --- a/deploy/cpp_infer/readme.md +++ b/deploy/cpp_infer/readme.md @@ -171,6 +171,9 @@ inference/ |-- cls | |--inference.pdiparams | |--inference.pdmodel +|-- table +| |--inference.pdiparams +| |--inference.pdmodel ``` @@ -275,6 +278,17 @@ Specifically, --cls=true \ ``` + +##### 7. table +```shell +./build/ppocr --det_model_dir=inference/det_db \ + --rec_model_dir=inference/rec_rcnn \ + --table_model_dir=inference/table \ + --image_dir=../../ppstructure/docs/table/table.jpg \ + --type=structure \ + --table=true +``` + More parameters are as follows, - Common parameters @@ -293,9 +307,9 @@ More parameters are as follows, |parameter|data type|default|meaning| | :---: | :---: | :---: | :---: | -|det|bool|true|前向是否执行文字检测| -|rec|bool|true|前向是否执行文字识别| -|cls|bool|false|前向是否执行文字方向分类| +|det|bool|true|Whether to perform text detection in the forward direction| +|rec|bool|true|Whether to perform text recognition in the forward direction| +|cls|bool|false|Whether to perform text direction classification in the forward direction| - Detection related parameters @@ -329,6 +343,16 @@ More parameters are as follows, |rec_img_h|int|48|image height of recognition| |rec_img_w|int|320|image width of recognition| +- Table recognition related parameters + +|parameter|data type|default|meaning| +| :---: | :---: | :---: | :---: | +|table_model_dir|string|-|Address of table recognition inference model| +|table_char_dict_path|string|../../ppocr/utils/dict/table_structure_dict.txt|dictionary file| +|table_max_len|int|488|The size of the long side of the input image of the table recognition model, the final input image size of the network is(table_max_len,table_max_len)| +|merge_no_span_structure|bool|true|Whether to merge and to
`<td></td>`|
+
+- table
+
+```bash
+predict img: ../../ppstructure/docs/table/table.jpg
+0 type: table, region: [0,0,371,293], res: <html><body><table>
<tr><td>Methods</td><td>R</td><td>P</td><td>F</td><td>FPS</td></tr>
<tr><td>SegLink [26]</td><td>70.0</td><td>86.0</td><td>77.0</td><td>8.9</td></tr>
<tr><td>PixelLink [4]</td><td>73.2</td><td>83.0</td><td>77.8</td><td>-</td></tr>
<tr><td>TextSnake [18]</td><td>73.9</td><td>83.2</td><td>78.3</td><td>1.1</td></tr>
<tr><td>TextField [37]</td><td>75.9</td><td>87.4</td><td>81.3</td><td>5.2</td></tr>
<tr><td>MSR[38]</td><td>76.7</td><td>87.4</td><td>81.7</td><td>-</td></tr>
<tr><td>FTSN [3]</td><td>77.1</td><td>87.6</td><td>82.0</td><td>-</td></tr>
<tr><td>LSE[30]</td><td>81.7</td><td>84.2</td><td>82.9</td><td>-</td></tr>
<tr><td>CRAFT [2]</td><td>78.2</td><td>88.2</td><td>82.9</td><td>8.6</td></tr>
<tr><td>MCN [16]</td><td>79</td><td>88</td><td>83</td><td>-</td></tr>
<tr><td>ATRR[35]</td><td>82.1</td><td>85.2</td><td>83.6</td><td>-</td></tr>
<tr><td>PAN [34]</td><td>83.8</td><td>84.4</td><td>84.1</td><td>30.2</td></tr>
<tr><td>DB[12]</td><td>79.2</td><td>91.5</td><td>84.9</td><td>32.0</td></tr>
<tr><td>DRRG [41]</td><td>82.30</td><td>88.05</td><td>85.08</td><td>-</td></tr>
<tr><td>Ours (SynText)</td><td>80.68</td><td>85.40</td><td>82.97</td><td>12.68</td></tr>
<tr><td>Ours (MLT-17)</td><td>84.54</td><td>86.62</td><td>85.57</td><td>12.31</td></tr>
</table></body></html>
+``` ## 3. FAQ diff --git a/deploy/cpp_infer/readme_ch.md b/deploy/cpp_infer/readme_ch.md index 8c334851c0d44acd393c6daa79edf25dc9e6fa24..03394efdc64788d924e155c989b1fac95f8432da 100644 --- a/deploy/cpp_infer/readme_ch.md +++ b/deploy/cpp_infer/readme_ch.md @@ -181,6 +181,9 @@ inference/ |-- cls | |--inference.pdiparams | |--inference.pdmodel +|-- table +| |--inference.pdiparams +| |--inference.pdmodel ``` @@ -285,6 +288,16 @@ CUDNN_LIB_DIR=/your_cudnn_lib_dir --cls=true \ ``` +##### 7. 表格识别 +```shell +./build/ppocr --det_model_dir=inference/det_db \ + --rec_model_dir=inference/rec_rcnn \ + --table_model_dir=inference/table \ + --image_dir=../../ppstructure/docs/table/table.jpg \ + --type=structure \ + --table=true +``` + 更多支持的可调节参数解释如下: - 通用参数 @@ -328,21 +341,33 @@ CUDNN_LIB_DIR=/your_cudnn_lib_dir |cls_thresh|float|0.9|方向分类器的得分阈值| |cls_batch_num|int|1|方向分类器batchsize| -- 识别模型相关 +- 文字识别模型相关 |参数名称|类型|默认参数|意义| | :---: | :---: | :---: | :---: | -|rec_model_dir|string|-|识别模型inference model地址| +|rec_model_dir|string|-|文字识别模型inference model地址| |rec_char_dict_path|string|../../ppocr/utils/ppocr_keys_v1.txt|字典文件| -|rec_batch_num|int|6|识别模型batchsize| -|rec_img_h|int|48|识别模型输入图像高度| -|rec_img_w|int|320|识别模型输入图像宽度| +|rec_batch_num|int|6|文字识别模型batchsize| +|rec_img_h|int|48|文字识别模型输入图像高度| +|rec_img_w|int|320|文字识别模型输入图像宽度| + + +- 表格识别模型相关 + +|参数名称|类型|默认参数|意义| +| :---: | :---: | :---: | :---: | +|table_model_dir|string|-|表格识别模型inference model地址| +|table_char_dict_path|string|../../ppocr/utils/dict/table_structure_dict.txt|字典文件| +|table_max_len|int|488|表格识别模型输入图像长边大小,最终网络输入图像大小为(table_max_len,table_max_len)| +|merge_no_span_structure|bool|true|是否合并 和 为| * PaddleOCR也支持多语言的预测,更多支持的语言和模型可以参考[识别文档](../../doc/doc_ch/recognition.md)中的多语言字典与模型部分,如果希望进行多语言预测,只需将修改`rec_char_dict_path`(字典文件路径)以及`rec_model_dir`(inference模型路径)字段即可。 最终屏幕上会输出检测结果如下。 +- ocr + ```bash predict img: ../../doc/imgs/12.jpg ../../doc/imgs/12.jpg @@ -353,6 +378,13 @@ predict img: ../../doc/imgs/12.jpg The detection visualized image saved in ./output//12.jpg ``` +- table + +```bash +predict img: ../../ppstructure/docs/table/table.jpg +0 type: table, region: [0,0,371,293], res:
<html><body><table>
<tr><td>Methods</td><td>R</td><td>P</td><td>F</td><td>FPS</td></tr>
<tr><td>SegLink [26]</td><td>70.0</td><td>86.0</td><td>77.0</td><td>8.9</td></tr>
<tr><td>PixelLink [4]</td><td>73.2</td><td>83.0</td><td>77.8</td><td>-</td></tr>
<tr><td>TextSnake [18]</td><td>73.9</td><td>83.2</td><td>78.3</td><td>1.1</td></tr>
<tr><td>TextField [37]</td><td>75.9</td><td>87.4</td><td>81.3</td><td>5.2</td></tr>
<tr><td>MSR[38]</td><td>76.7</td><td>87.4</td><td>81.7</td><td>-</td></tr>
<tr><td>FTSN [3]</td><td>77.1</td><td>87.6</td><td>82.0</td><td>-</td></tr>
<tr><td>LSE[30]</td><td>81.7</td><td>84.2</td><td>82.9</td><td>-</td></tr>
<tr><td>CRAFT [2]</td><td>78.2</td><td>88.2</td><td>82.9</td><td>8.6</td></tr>
<tr><td>MCN [16]</td><td>79</td><td>88</td><td>83</td><td>-</td></tr>
<tr><td>ATRR[35]</td><td>82.1</td><td>85.2</td><td>83.6</td><td>-</td></tr>
<tr><td>PAN [34]</td><td>83.8</td><td>84.4</td><td>84.1</td><td>30.2</td></tr>
<tr><td>DB[12]</td><td>79.2</td><td>91.5</td><td>84.9</td><td>32.0</td></tr>
<tr><td>DRRG [41]</td><td>82.30</td><td>88.05</td><td>85.08</td><td>-</td></tr>
<tr><td>Ours (SynText)</td><td>80.68</td><td>85.40</td><td>82.97</td><td>12.68</td></tr>
<tr><td>Ours (MLT-17)</td><td>84.54</td><td>86.62</td><td>85.57</td><td>12.31</td></tr>
</table></body></html>
+``` + ## 3. FAQ diff --git a/deploy/cpp_infer/src/args.cpp b/deploy/cpp_infer/src/args.cpp index 93d0f5ea5fd07bdc3eb44537bc1c0d4e131736d3..17e9c8b625baf53c2583a6d778aba552cdd19e97 100644 --- a/deploy/cpp_infer/src/args.cpp +++ b/deploy/cpp_infer/src/args.cpp @@ -30,7 +30,8 @@ DEFINE_string( "Perform ocr or structure, the value is selected in ['ocr','structure']."); // detection related DEFINE_string(det_model_dir, "", "Path of det inference model."); -DEFINE_int32(max_side_len, 960, "max_side_len of input image."); +DEFINE_string(limit_type, "max", "limit_type of input image."); +DEFINE_int32(limit_side_len, 960, "limit_side_len of input image."); DEFINE_double(det_db_thresh, 0.3, "Threshold of det_db_thresh."); DEFINE_double(det_db_box_thresh, 0.6, "Threshold of det_db_box_thresh."); DEFINE_double(det_db_unclip_ratio, 1.5, "Threshold of det_db_unclip_ratio."); @@ -50,7 +51,18 @@ DEFINE_string(rec_char_dict_path, "../../ppocr/utils/ppocr_keys_v1.txt", DEFINE_int32(rec_img_h, 48, "rec image height"); DEFINE_int32(rec_img_w, 320, "rec image width"); +// structure model related +DEFINE_string(table_model_dir, "", "Path of table struture inference model."); +DEFINE_int32(table_max_len, 488, "max len size of input image."); +DEFINE_int32(table_batch_num, 1, "table_batch_num."); +DEFINE_bool(merge_no_span_structure, true, + "Whether merge and to "); +DEFINE_string(table_char_dict_path, + "../../ppocr/utils/dict/table_structure_dict_ch.txt", + "Path of dictionary."); + // ocr forward related DEFINE_bool(det, true, "Whether use det in forward."); DEFINE_bool(rec, true, "Whether use rec in forward."); -DEFINE_bool(cls, false, "Whether use cls in forward."); \ No newline at end of file +DEFINE_bool(cls, false, "Whether use cls in forward."); +DEFINE_bool(table, false, "Whether use table structure in forward."); \ No newline at end of file diff --git a/deploy/cpp_infer/src/main.cpp b/deploy/cpp_infer/src/main.cpp index c4b5b97ea8b2ebf77dd9a3e2af69a1a1ca19ed2a..34ffdc62674ef02b2d30c8e213a783495ceaff99 100644 --- a/deploy/cpp_infer/src/main.cpp +++ b/deploy/cpp_infer/src/main.cpp @@ -19,6 +19,7 @@ #include #include +#include using namespace PaddleOCR; @@ -32,6 +33,12 @@ void check_params() { } } if (FLAGS_rec) { + std::cout + << "In PP-OCRv3, rec_image_shape parameter defaults to '3, 48, 320'," + "if you are using recognition model with PP-OCRv2 or an older " + "version, " + "please set --rec_image_shape='3,32,320" + << std::endl; if (FLAGS_rec_model_dir.empty() || FLAGS_image_dir.empty()) { std::cout << "Usage[rec]: ./ppocr " "--rec_model_dir=/PATH/TO/REC_INFERENCE_MODEL/ " @@ -47,6 +54,17 @@ void check_params() { exit(1); } } + if (FLAGS_table) { + if (FLAGS_table_model_dir.empty() || FLAGS_det_model_dir.empty() || + FLAGS_rec_model_dir.empty() || FLAGS_image_dir.empty()) { + std::cout << "Usage[table]: ./ppocr " + << "--det_model_dir=/PATH/TO/DET_INFERENCE_MODEL/ " + << "--rec_model_dir=/PATH/TO/REC_INFERENCE_MODEL/ " + << "--table_model_dir=/PATH/TO/TABLE_INFERENCE_MODEL/ " + << "--image_dir=/PATH/TO/INPUT/IMAGE/" << std::endl; + exit(1); + } + } if (FLAGS_precision != "fp32" && FLAGS_precision != "fp16" && FLAGS_precision != "int8") { cout << "precison should be 'fp32'(default), 'fp16' or 'int8'. " << endl; @@ -54,21 +72,7 @@ void check_params() { } } -int main(int argc, char **argv) { - // Parsing command-line - google::ParseCommandLineFlags(&argc, &argv, true); - check_params(); - - if (!Utility::PathExists(FLAGS_image_dir)) { - std::cerr << "[ERROR] image path not exist! 
image_dir: " << FLAGS_image_dir - << endl; - exit(1); - } - - std::vector cv_all_img_names; - cv::glob(FLAGS_image_dir, cv_all_img_names); - std::cout << "total images num: " << cv_all_img_names.size() << endl; - +void ocr(std::vector &cv_all_img_names) { PPOCR ocr = PPOCR(); std::vector> ocr_results = @@ -109,3 +113,55 @@ int main(int argc, char **argv) { } } } + +void structure(std::vector &cv_all_img_names) { + PaddleOCR::PaddleStructure engine = PaddleOCR::PaddleStructure(); + std::vector> structure_results = + engine.structure(cv_all_img_names, false, FLAGS_table); + for (int i = 0; i < cv_all_img_names.size(); i++) { + cout << "predict img: " << cv_all_img_names[i] << endl; + cv::Mat srcimg = cv::imread(cv_all_img_names[i], cv::IMREAD_COLOR); + for (int j = 0; j < structure_results[i].size(); j++) { + std::cout << j << "\ttype: " << structure_results[i][j].type + << ", region: ["; + std::cout << structure_results[i][j].box[0] << "," + << structure_results[i][j].box[1] << "," + << structure_results[i][j].box[2] << "," + << structure_results[i][j].box[3] << "], res: "; + if (structure_results[i][j].type == "table") { + std::cout << structure_results[i][j].html << std::endl; + std::string file_name = Utility::basename(cv_all_img_names[i]); + + Utility::VisualizeBboxes(srcimg, structure_results[i][j], + FLAGS_output + "/" + std::to_string(j) + "_" + + file_name); + } else { + Utility::print_result(structure_results[i][j].text_res); + } + } + } +} + +int main(int argc, char **argv) { + // Parsing command-line + google::ParseCommandLineFlags(&argc, &argv, true); + check_params(); + + if (!Utility::PathExists(FLAGS_image_dir)) { + std::cerr << "[ERROR] image path not exist! image_dir: " << FLAGS_image_dir + << endl; + exit(1); + } + + std::vector cv_all_img_names; + cv::glob(FLAGS_image_dir, cv_all_img_names); + std::cout << "total images num: " << cv_all_img_names.size() << endl; + + if (FLAGS_type == "ocr") { + ocr(cv_all_img_names); + } else if (FLAGS_type == "structure") { + structure(cv_all_img_names); + } else { + std::cout << "only value in ['ocr','structure'] is supported" << endl; + } +} diff --git a/deploy/cpp_infer/src/ocr_cls.cpp b/deploy/cpp_infer/src/ocr_cls.cpp index 674630bf1e7e04841e027a7320d62af4a453ffc8..92d83600cea04419db231c0097caa53ed6fec58b 100644 --- a/deploy/cpp_infer/src/ocr_cls.cpp +++ b/deploy/cpp_infer/src/ocr_cls.cpp @@ -112,6 +112,11 @@ void Classifier::LoadModel(const std::string &model_dir) { precision = paddle_infer::Config::Precision::kInt8; } config.EnableTensorRtEngine(1 << 20, 10, 3, precision, false, false); + if (!Utility::PathExists("./trt_cls_shape.txt")){ + config.CollectShapeRangeInfo("./trt_cls_shape.txt"); + } else { + config.EnableTunedTensorRtDynamicShape("./trt_cls_shape.txt", true); + } } } else { config.DisableGpu(); diff --git a/deploy/cpp_infer/src/ocr_det.cpp b/deploy/cpp_infer/src/ocr_det.cpp index 550997e71937d23a7448e8ff1c4ffad579d2931c..030d5c2f359bba522662324d84c6ef1cc0bc83b8 100644 --- a/deploy/cpp_infer/src/ocr_det.cpp +++ b/deploy/cpp_infer/src/ocr_det.cpp @@ -32,49 +32,13 @@ void DBDetector::LoadModel(const std::string &model_dir) { if (this->precision_ == "int8") { precision = paddle_infer::Config::Precision::kInt8; } - config.EnableTensorRtEngine(1 << 20, 1, 20, precision, false, false); - std::map> min_input_shape = { - {"x", {1, 3, 50, 50}}, - {"conv2d_92.tmp_0", {1, 120, 20, 20}}, - {"conv2d_91.tmp_0", {1, 24, 10, 10}}, - {"conv2d_59.tmp_0", {1, 96, 20, 20}}, - {"nearest_interp_v2_1.tmp_0", {1, 256, 10, 10}}, - 
{"nearest_interp_v2_2.tmp_0", {1, 256, 20, 20}}, - {"conv2d_124.tmp_0", {1, 256, 20, 20}}, - {"nearest_interp_v2_3.tmp_0", {1, 64, 20, 20}}, - {"nearest_interp_v2_4.tmp_0", {1, 64, 20, 20}}, - {"nearest_interp_v2_5.tmp_0", {1, 64, 20, 20}}, - {"elementwise_add_7", {1, 56, 2, 2}}, - {"nearest_interp_v2_0.tmp_0", {1, 256, 2, 2}}}; - std::map> max_input_shape = { - {"x", {1, 3, this->max_side_len_, this->max_side_len_}}, - {"conv2d_92.tmp_0", {1, 120, 400, 400}}, - {"conv2d_91.tmp_0", {1, 24, 200, 200}}, - {"conv2d_59.tmp_0", {1, 96, 400, 400}}, - {"nearest_interp_v2_1.tmp_0", {1, 256, 200, 200}}, - {"nearest_interp_v2_2.tmp_0", {1, 256, 400, 400}}, - {"conv2d_124.tmp_0", {1, 256, 400, 400}}, - {"nearest_interp_v2_3.tmp_0", {1, 64, 400, 400}}, - {"nearest_interp_v2_4.tmp_0", {1, 64, 400, 400}}, - {"nearest_interp_v2_5.tmp_0", {1, 64, 400, 400}}, - {"elementwise_add_7", {1, 56, 400, 400}}, - {"nearest_interp_v2_0.tmp_0", {1, 256, 400, 400}}}; - std::map> opt_input_shape = { - {"x", {1, 3, 640, 640}}, - {"conv2d_92.tmp_0", {1, 120, 160, 160}}, - {"conv2d_91.tmp_0", {1, 24, 80, 80}}, - {"conv2d_59.tmp_0", {1, 96, 160, 160}}, - {"nearest_interp_v2_1.tmp_0", {1, 256, 80, 80}}, - {"nearest_interp_v2_2.tmp_0", {1, 256, 160, 160}}, - {"conv2d_124.tmp_0", {1, 256, 160, 160}}, - {"nearest_interp_v2_3.tmp_0", {1, 64, 160, 160}}, - {"nearest_interp_v2_4.tmp_0", {1, 64, 160, 160}}, - {"nearest_interp_v2_5.tmp_0", {1, 64, 160, 160}}, - {"elementwise_add_7", {1, 56, 40, 40}}, - {"nearest_interp_v2_0.tmp_0", {1, 256, 40, 40}}}; - - config.SetTRTDynamicShapeInfo(min_input_shape, max_input_shape, - opt_input_shape); + config.EnableTensorRtEngine(1 << 30, 1, 20, precision, false, false); + if (!Utility::PathExists("./trt_det_shape.txt")){ + config.CollectShapeRangeInfo("./trt_det_shape.txt"); + } else { + config.EnableTunedTensorRtDynamicShape("./trt_det_shape.txt", true); + } + } } else { config.DisableGpu(); @@ -109,7 +73,8 @@ void DBDetector::Run(cv::Mat &img, img.copyTo(srcimg); auto preprocess_start = std::chrono::steady_clock::now(); - this->resize_op_.Run(img, resize_img, this->max_side_len_, ratio_h, ratio_w, + this->resize_op_.Run(img, resize_img, this->limit_type_, + this->limit_side_len_, ratio_h, ratio_w, this->use_tensorrt_); this->normalize_op_.Run(&resize_img, this->mean_, this->scale_, diff --git a/deploy/cpp_infer/src/ocr_rec.cpp b/deploy/cpp_infer/src/ocr_rec.cpp index 0f90ddfab4872f97829da081e64cb7437e72493a..088cb942ba5ac4b09c9e8d1731a3b20d40967edf 100644 --- a/deploy/cpp_infer/src/ocr_rec.cpp +++ b/deploy/cpp_infer/src/ocr_rec.cpp @@ -147,20 +147,12 @@ void CRNNRecognizer::LoadModel(const std::string &model_dir) { if (this->precision_ == "int8") { precision = paddle_infer::Config::Precision::kInt8; } - config.EnableTensorRtEngine(1 << 20, 10, 15, precision, false, false); - int imgH = this->rec_image_shape_[1]; - int imgW = this->rec_image_shape_[2]; - std::map> min_input_shape = { - {"x", {1, 3, imgH, 10}}, {"lstm_0.tmp_0", {10, 1, 96}}}; - std::map> max_input_shape = { - {"x", {this->rec_batch_num_, 3, imgH, 2500}}, - {"lstm_0.tmp_0", {1000, 1, 96}}}; - std::map> opt_input_shape = { - {"x", {this->rec_batch_num_, 3, imgH, imgW}}, - {"lstm_0.tmp_0", {25, 1, 96}}}; - - config.SetTRTDynamicShapeInfo(min_input_shape, max_input_shape, - opt_input_shape); + if (!Utility::PathExists("./trt_rec_shape.txt")){ + config.CollectShapeRangeInfo("./trt_rec_shape.txt"); + } else { + config.EnableTunedTensorRtDynamicShape("./trt_rec_shape.txt", true); + } + } } else { config.DisableGpu(); diff --git 
a/deploy/cpp_infer/src/paddleocr.cpp b/deploy/cpp_infer/src/paddleocr.cpp index cd620a9206cad8ec2b1cd5924c714a8a1fa989b6..1de4fc7e9af8bf63cf68ef42d2a508cdc4b5f9f3 100644 --- a/deploy/cpp_infer/src/paddleocr.cpp +++ b/deploy/cpp_infer/src/paddleocr.cpp @@ -23,10 +23,10 @@ PPOCR::PPOCR() { if (FLAGS_det) { this->detector_ = new DBDetector( FLAGS_det_model_dir, FLAGS_use_gpu, FLAGS_gpu_id, FLAGS_gpu_mem, - FLAGS_cpu_threads, FLAGS_enable_mkldnn, FLAGS_max_side_len, - FLAGS_det_db_thresh, FLAGS_det_db_box_thresh, FLAGS_det_db_unclip_ratio, - FLAGS_det_db_score_mode, FLAGS_use_dilation, FLAGS_use_tensorrt, - FLAGS_precision); + FLAGS_cpu_threads, FLAGS_enable_mkldnn, FLAGS_limit_type, + FLAGS_limit_side_len, FLAGS_det_db_thresh, FLAGS_det_db_box_thresh, + FLAGS_det_db_unclip_ratio, FLAGS_det_db_score_mode, FLAGS_use_dilation, + FLAGS_use_tensorrt, FLAGS_precision); } if (FLAGS_cls && FLAGS_use_angle_cls) { @@ -56,7 +56,8 @@ void PPOCR::det(cv::Mat img, std::vector &ocr_results, res.box = boxes[i]; ocr_results.push_back(res); } - + // sort boex from top to bottom, from left to right + Utility::sorted_boxes(ocr_results); times[0] += det_times[0]; times[1] += det_times[1]; times[2] += det_times[2]; diff --git a/deploy/cpp_infer/src/paddlestructure.cpp b/deploy/cpp_infer/src/paddlestructure.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ea69977a1e45b0f7c1235a647d7c56db4d3cbc74 --- /dev/null +++ b/deploy/cpp_infer/src/paddlestructure.cpp @@ -0,0 +1,265 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include "auto_log/autolog.h" +#include +#include + +namespace PaddleOCR { + +PaddleStructure::PaddleStructure() { + if (FLAGS_table) { + this->recognizer_ = new StructureTableRecognizer( + FLAGS_table_model_dir, FLAGS_use_gpu, FLAGS_gpu_id, FLAGS_gpu_mem, + FLAGS_cpu_threads, FLAGS_enable_mkldnn, FLAGS_table_char_dict_path, + FLAGS_use_tensorrt, FLAGS_precision, FLAGS_table_batch_num, + FLAGS_table_max_len, FLAGS_merge_no_span_structure); + } +}; + +std::vector> +PaddleStructure::structure(std::vector cv_all_img_names, + bool layout, bool table) { + std::vector time_info_det = {0, 0, 0}; + std::vector time_info_rec = {0, 0, 0}; + std::vector time_info_cls = {0, 0, 0}; + std::vector time_info_table = {0, 0, 0}; + + std::vector> structure_results; + + if (!Utility::PathExists(FLAGS_output) && FLAGS_det) { + Utility::CreateDir(FLAGS_output); + } + for (int i = 0; i < cv_all_img_names.size(); ++i) { + std::vector structure_result; + cv::Mat srcimg = cv::imread(cv_all_img_names[i], cv::IMREAD_COLOR); + if (!srcimg.data) { + std::cerr << "[ERROR] image read failed! 
image path: " + << cv_all_img_names[i] << endl; + exit(1); + } + if (layout) { + } else { + StructurePredictResult res; + res.type = "table"; + res.box = std::vector(4, 0); + res.box[2] = srcimg.cols; + res.box[3] = srcimg.rows; + structure_result.push_back(res); + } + cv::Mat roi_img; + for (int i = 0; i < structure_result.size(); i++) { + // crop image + roi_img = Utility::crop_image(srcimg, structure_result[i].box); + if (structure_result[i].type == "table") { + this->table(roi_img, structure_result[i], time_info_table, + time_info_det, time_info_rec, time_info_cls); + } + } + structure_results.push_back(structure_result); + } + return structure_results; +}; + +void PaddleStructure::table(cv::Mat img, + StructurePredictResult &structure_result, + std::vector &time_info_table, + std::vector &time_info_det, + std::vector &time_info_rec, + std::vector &time_info_cls) { + // predict structure + std::vector> structure_html_tags; + std::vector structure_scores(1, 0); + std::vector>> structure_boxes; + std::vector structure_imes; + std::vector img_list; + img_list.push_back(img); + this->recognizer_->Run(img_list, structure_html_tags, structure_scores, + structure_boxes, structure_imes); + time_info_table[0] += structure_imes[0]; + time_info_table[1] += structure_imes[1]; + time_info_table[2] += structure_imes[2]; + + std::vector ocr_result; + std::string html; + int expand_pixel = 3; + + for (int i = 0; i < img_list.size(); i++) { + // det + this->det(img_list[i], ocr_result, time_info_det); + // crop image + std::vector rec_img_list; + std::vector ocr_box; + for (int j = 0; j < ocr_result.size(); j++) { + ocr_box = Utility::xyxyxyxy2xyxy(ocr_result[j].box); + ocr_box[0] = max(0, ocr_box[0] - expand_pixel); + ocr_box[1] = max(0, ocr_box[1] - expand_pixel), + ocr_box[2] = min(img_list[i].cols, ocr_box[2] + expand_pixel); + ocr_box[3] = min(img_list[i].rows, ocr_box[3] + expand_pixel); + + cv::Mat crop_img = Utility::crop_image(img_list[i], ocr_box); + rec_img_list.push_back(crop_img); + } + // rec + this->rec(rec_img_list, ocr_result, time_info_rec); + // rebuild table + html = this->rebuild_table(structure_html_tags[i], structure_boxes[i], + ocr_result); + structure_result.html = html; + structure_result.cell_box = structure_boxes[i]; + structure_result.html_score = structure_scores[i]; + } +}; + +std::string +PaddleStructure::rebuild_table(std::vector structure_html_tags, + std::vector> structure_boxes, + std::vector &ocr_result) { + // match text in same cell + std::vector> matched(structure_boxes.size(), + std::vector()); + + std::vector ocr_box; + std::vector structure_box; + for (int i = 0; i < ocr_result.size(); i++) { + ocr_box = Utility::xyxyxyxy2xyxy(ocr_result[i].box); + ocr_box[0] -= 1; + ocr_box[1] -= 1; + ocr_box[2] += 1; + ocr_box[3] += 1; + std::vector> dis_list(structure_boxes.size(), + std::vector(3, 100000.0)); + for (int j = 0; j < structure_boxes.size(); j++) { + if (structure_boxes[i].size() == 8) { + structure_box = Utility::xyxyxyxy2xyxy(structure_boxes[j]); + } else { + structure_box = structure_boxes[j]; + } + dis_list[j][0] = this->dis(ocr_box, structure_box); + dis_list[j][1] = 1 - this->iou(ocr_box, structure_box); + dis_list[j][2] = j; + } + // find min dis idx + std::sort(dis_list.begin(), dis_list.end(), + PaddleStructure::comparison_dis); + matched[dis_list[0][2]].push_back(ocr_result[i].text); + } + + // get pred html + std::string html_str = ""; + int td_tag_idx = 0; + for (int i = 0; i < structure_html_tags.size(); i++) { + if 
(structure_html_tags[i].find("") != std::string::npos) { + if (structure_html_tags[i].find("") != std::string::npos) { + html_str += ""; + } + if (matched[td_tag_idx].size() > 0) { + bool b_with = false; + if (matched[td_tag_idx][0].find("") != std::string::npos && + matched[td_tag_idx].size() > 1) { + b_with = true; + html_str += ""; + } + for (int j = 0; j < matched[td_tag_idx].size(); j++) { + std::string content = matched[td_tag_idx][j]; + if (matched[td_tag_idx].size() > 1) { + // remove blank, and + if (content.length() > 0 && content.at(0) == ' ') { + content = content.substr(0); + } + if (content.length() > 2 && content.substr(0, 3) == "") { + content = content.substr(3); + } + if (content.length() > 4 && + content.substr(content.length() - 4) == "") { + content = content.substr(0, content.length() - 4); + } + if (content.empty()) { + continue; + } + // add blank + if (j != matched[td_tag_idx].size() - 1 && + content.at(content.length() - 1) != ' ') { + content += ' '; + } + } + html_str += content; + } + if (b_with) { + html_str += ""; + } + } + if (structure_html_tags[i].find("") != std::string::npos) { + html_str += ""; + } else { + html_str += structure_html_tags[i]; + } + td_tag_idx += 1; + } else { + html_str += structure_html_tags[i]; + } + } + return html_str; +} + +float PaddleStructure::iou(std::vector &box1, std::vector &box2) { + int area1 = max(0, box1[2] - box1[0]) * max(0, box1[3] - box1[1]); + int area2 = max(0, box2[2] - box2[0]) * max(0, box2[3] - box2[1]); + + // computing the sum_area + int sum_area = area1 + area2; + + // find the each point of intersect rectangle + int x1 = max(box1[0], box2[0]); + int y1 = max(box1[1], box2[1]); + int x2 = min(box1[2], box2[2]); + int y2 = min(box1[3], box2[3]); + + // judge if there is an intersect + if (y1 >= y2 || x1 >= x2) { + return 0.0; + } else { + int intersect = (x2 - x1) * (y2 - y1); + return intersect / (sum_area - intersect + 0.00000001); + } +} + +float PaddleStructure::dis(std::vector &box1, std::vector &box2) { + int x1_1 = box1[0]; + int y1_1 = box1[1]; + int x2_1 = box1[2]; + int y2_1 = box1[3]; + + int x1_2 = box2[0]; + int y1_2 = box2[1]; + int x2_2 = box2[2]; + int y2_2 = box2[3]; + + float dis = + abs(x1_2 - x1_1) + abs(y1_2 - y1_1) + abs(x2_2 - x2_1) + abs(y2_2 - y2_1); + float dis_2 = abs(x1_2 - x1_1) + abs(y1_2 - y1_1); + float dis_3 = abs(x2_2 - x2_1) + abs(y2_2 - y2_1); + return dis + min(dis_2, dis_3); +} + +PaddleStructure::~PaddleStructure() { + if (this->recognizer_ != nullptr) { + delete this->recognizer_; + } +}; + +} // namespace PaddleOCR \ No newline at end of file diff --git a/deploy/cpp_infer/src/postprocess_op.cpp b/deploy/cpp_infer/src/postprocess_op.cpp index 5374fb1a4eba68d8055a52ec91d97c290832aa9d..4b0c693c80467bceb75da2b3fef6e816b0690979 100644 --- a/deploy/cpp_infer/src/postprocess_op.cpp +++ b/deploy/cpp_infer/src/postprocess_op.cpp @@ -17,8 +17,8 @@ namespace PaddleOCR { -void PostProcessor::GetContourArea(const std::vector> &box, - float unclip_ratio, float &distance) { +void DBPostProcessor::GetContourArea(const std::vector> &box, + float unclip_ratio, float &distance) { int pts_num = 4; float area = 0.0f; float dist = 0.0f; @@ -35,8 +35,8 @@ void PostProcessor::GetContourArea(const std::vector> &box, distance = area * unclip_ratio / dist; } -cv::RotatedRect PostProcessor::UnClip(std::vector> box, - const float &unclip_ratio) { +cv::RotatedRect DBPostProcessor::UnClip(std::vector> box, + const float &unclip_ratio) { float distance = 1.0; GetContourArea(box, unclip_ratio, 
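`rebuild_table` assigns each recognized text box to the structure cell with the smallest key `(dis, 1 - iou)`, using the `dis` and `iou` helpers defined above. The sketch below reproduces that matching rule on axis-aligned `{x1, y1, x2, y2}` boxes; the helper names and the sample boxes are assumptions for illustration.

```
// Hedged sketch: match one OCR text box to the closest predicted cell box
// using the same two keys as rebuild_table: L1 corner distance first,
// then 1 - IoU as the tie-breaker.
#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <utility>
#include <vector>

using Box = std::vector<int>;  // {x1, y1, x2, y2}

static float Iou(const Box &a, const Box &b) {
  int area_a = std::max(0, a[2] - a[0]) * std::max(0, a[3] - a[1]);
  int area_b = std::max(0, b[2] - b[0]) * std::max(0, b[3] - b[1]);
  int x1 = std::max(a[0], b[0]), y1 = std::max(a[1], b[1]);
  int x2 = std::min(a[2], b[2]), y2 = std::min(a[3], b[3]);
  if (x2 <= x1 || y2 <= y1) return 0.f;
  float inter = float(x2 - x1) * float(y2 - y1);
  return inter / (area_a + area_b - inter + 1e-8f);
}

static float Dis(const Box &a, const Box &b) {
  float d = std::abs(b[0] - a[0]) + std::abs(b[1] - a[1]) +
            std::abs(b[2] - a[2]) + std::abs(b[3] - a[3]);
  float d_tl = std::abs(b[0] - a[0]) + std::abs(b[1] - a[1]);
  float d_br = std::abs(b[2] - a[2]) + std::abs(b[3] - a[3]);
  return d + std::min(d_tl, d_br);
}

// Return the index of the cell that best matches the text box.
static int MatchCell(const Box &text, const std::vector<Box> &cells) {
  int best = 0;
  std::pair<float, float> best_key = {1e9f, 1e9f};
  for (size_t j = 0; j < cells.size(); ++j) {
    std::pair<float, float> key = {Dis(text, cells[j]),
                                   1.f - Iou(text, cells[j])};
    if (key < best_key) {
      best_key = key;
      best = int(j);
    }
  }
  return best;
}

int main() {
  std::vector<Box> cells = {{0, 0, 100, 40}, {100, 0, 200, 40}};
  Box text = {110, 8, 180, 32};
  std::printf("matched cell: %d\n", MatchCell(text, cells));  // expects 1
  return 0;
}
```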
distance); @@ -67,7 +67,7 @@ cv::RotatedRect PostProcessor::UnClip(std::vector> box, return res; } -float **PostProcessor::Mat2Vec(cv::Mat mat) { +float **DBPostProcessor::Mat2Vec(cv::Mat mat) { auto **array = new float *[mat.rows]; for (int i = 0; i < mat.rows; ++i) array[i] = new float[mat.cols]; @@ -81,7 +81,7 @@ float **PostProcessor::Mat2Vec(cv::Mat mat) { } std::vector> -PostProcessor::OrderPointsClockwise(std::vector> pts) { +DBPostProcessor::OrderPointsClockwise(std::vector> pts) { std::vector> box = pts; std::sort(box.begin(), box.end(), XsortInt); @@ -99,7 +99,7 @@ PostProcessor::OrderPointsClockwise(std::vector> pts) { return rect; } -std::vector> PostProcessor::Mat2Vector(cv::Mat mat) { +std::vector> DBPostProcessor::Mat2Vector(cv::Mat mat) { std::vector> img_vec; std::vector tmp; @@ -113,20 +113,20 @@ std::vector> PostProcessor::Mat2Vector(cv::Mat mat) { return img_vec; } -bool PostProcessor::XsortFp32(std::vector a, std::vector b) { +bool DBPostProcessor::XsortFp32(std::vector a, std::vector b) { if (a[0] != b[0]) return a[0] < b[0]; return false; } -bool PostProcessor::XsortInt(std::vector a, std::vector b) { +bool DBPostProcessor::XsortInt(std::vector a, std::vector b) { if (a[0] != b[0]) return a[0] < b[0]; return false; } -std::vector> PostProcessor::GetMiniBoxes(cv::RotatedRect box, - float &ssid) { +std::vector> +DBPostProcessor::GetMiniBoxes(cv::RotatedRect box, float &ssid) { ssid = std::max(box.size.width, box.size.height); cv::Mat points; @@ -160,8 +160,8 @@ std::vector> PostProcessor::GetMiniBoxes(cv::RotatedRect box, return array; } -float PostProcessor::PolygonScoreAcc(std::vector contour, - cv::Mat pred) { +float DBPostProcessor::PolygonScoreAcc(std::vector contour, + cv::Mat pred) { int width = pred.cols; int height = pred.rows; std::vector box_x; @@ -206,8 +206,8 @@ float PostProcessor::PolygonScoreAcc(std::vector contour, return score; } -float PostProcessor::BoxScoreFast(std::vector> box_array, - cv::Mat pred) { +float DBPostProcessor::BoxScoreFast(std::vector> box_array, + cv::Mat pred) { auto array = box_array; int width = pred.cols; int height = pred.rows; @@ -244,7 +244,7 @@ float PostProcessor::BoxScoreFast(std::vector> box_array, return score; } -std::vector>> PostProcessor::BoxesFromBitmap( +std::vector>> DBPostProcessor::BoxesFromBitmap( const cv::Mat pred, const cv::Mat bitmap, const float &box_thresh, const float &det_db_unclip_ratio, const std::string &det_db_score_mode) { const int min_size = 3; @@ -321,9 +321,9 @@ std::vector>> PostProcessor::BoxesFromBitmap( return boxes; } -std::vector>> -PostProcessor::FilterTagDetRes(std::vector>> boxes, - float ratio_h, float ratio_w, cv::Mat srcimg) { +std::vector>> DBPostProcessor::FilterTagDetRes( + std::vector>> boxes, float ratio_h, + float ratio_w, cv::Mat srcimg) { int oriimg_h = srcimg.rows; int oriimg_w = srcimg.cols; @@ -352,4 +352,92 @@ PostProcessor::FilterTagDetRes(std::vector>> boxes, return root_points; } +void TablePostProcessor::init(std::string label_path, + bool merge_no_span_structure) { + this->label_list_ = Utility::ReadDict(label_path); + if (merge_no_span_structure) { + this->label_list_.push_back(""); + std::vector::iterator it; + for (it = this->label_list_.begin(); it != this->label_list_.end();) { + if (*it == "") { + it = this->label_list_.erase(it); + } else { + ++it; + } + } + } + // add_special_char + this->label_list_.insert(this->label_list_.begin(), this->beg); + this->label_list_.push_back(this->end); +} + +void TablePostProcessor::Run( + std::vector &loc_preds, 
std::vector &structure_probs, + std::vector &rec_scores, std::vector &loc_preds_shape, + std::vector &structure_probs_shape, + std::vector> &rec_html_tag_batch, + std::vector>> &rec_boxes_batch, + std::vector &width_list, std::vector &height_list) { + for (int batch_idx = 0; batch_idx < structure_probs_shape[0]; batch_idx++) { + // image tags and boxs + std::vector rec_html_tags; + std::vector> rec_boxes; + + float score = 0.f; + int count = 0; + float char_score = 0.f; + int char_idx = 0; + + // step + for (int step_idx = 0; step_idx < structure_probs_shape[1]; step_idx++) { + std::string html_tag; + std::vector rec_box; + // html tag + int step_start_idx = (batch_idx * structure_probs_shape[1] + step_idx) * + structure_probs_shape[2]; + char_idx = int(Utility::argmax( + &structure_probs[step_start_idx], + &structure_probs[step_start_idx + structure_probs_shape[2]])); + char_score = float(*std::max_element( + &structure_probs[step_start_idx], + &structure_probs[step_start_idx + structure_probs_shape[2]])); + html_tag = this->label_list_[char_idx]; + + if (step_idx > 0 && html_tag == this->end) { + break; + } + if (html_tag == this->beg) { + continue; + } + count += 1; + score += char_score; + rec_html_tags.push_back(html_tag); + + // box + if (html_tag == "" || html_tag == "") { + for (int point_idx = 0; point_idx < loc_preds_shape[2]; point_idx++) { + step_start_idx = (batch_idx * structure_probs_shape[1] + step_idx) * + loc_preds_shape[2] + + point_idx; + float point = loc_preds[step_start_idx]; + if (point_idx % 2 == 0) { + point = int(point * width_list[batch_idx]); + } else { + point = int(point * height_list[batch_idx]); + } + rec_box.push_back(point); + } + rec_boxes.push_back(rec_box); + } + } + score /= count; + if (isnan(score) || rec_boxes.size() == 0) { + score = -1; + } + rec_scores.push_back(score); + rec_boxes_batch.push_back(rec_boxes); + rec_html_tag_batch.push_back(rec_html_tags); + } +} + } // namespace PaddleOCR diff --git a/deploy/cpp_infer/src/preprocess_op.cpp b/deploy/cpp_infer/src/preprocess_op.cpp index fff49ba2c2cd0e68f0c1d93e5877ab6276bdc337..ac185e22d68955ef440e22c327b835dbce6c4e1b 100644 --- a/deploy/cpp_infer/src/preprocess_op.cpp +++ b/deploy/cpp_infer/src/preprocess_op.cpp @@ -69,18 +69,28 @@ void Normalize::Run(cv::Mat *im, const std::vector &mean, } void ResizeImgType0::Run(const cv::Mat &img, cv::Mat &resize_img, - int max_size_len, float &ratio_h, float &ratio_w, - bool use_tensorrt) { + string limit_type, int limit_side_len, float &ratio_h, + float &ratio_w, bool use_tensorrt) { int w = img.cols; int h = img.rows; - float ratio = 1.f; - int max_wh = w >= h ? w : h; - if (max_wh > max_size_len) { - if (h > w) { - ratio = float(max_size_len) / float(h); - } else { - ratio = float(max_size_len) / float(w); + if (limit_type == "min") { + int min_wh = min(h, w); + if (min_wh < limit_side_len) { + if (h < w) { + ratio = float(limit_side_len) / float(h); + } else { + ratio = float(limit_side_len) / float(w); + } + } + } else { + int max_wh = max(h, w); + if (max_wh > limit_side_len) { + if (h > w) { + ratio = float(limit_side_len) / float(h); + } else { + ratio = float(limit_side_len) / float(w); + } } } @@ -143,4 +153,26 @@ void ClsResizeImg::Run(const cv::Mat &img, cv::Mat &resize_img, } } +void TableResizeImg::Run(const cv::Mat &img, cv::Mat &resize_img, + const int max_len) { + int w = img.cols; + int h = img.rows; + + int max_wh = w >= h ? w : h; + float ratio = w >= h ? 
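`TablePostProcessor::Run` decodes the structure head greedily: per step it takes the argmax tag, stops at the end token, skips the begin token, and averages the kept per-step scores. A compact sketch of that decode over a flattened `[steps x classes]` probability matrix follows; the tag vocabulary and the `sos`/`eos` token names are assumptions here.

```
// Hedged sketch: greedy decode of table-structure probabilities.
// probs is a flattened [steps x classes] matrix; label_list holds the tag
// vocabulary with "sos"/"eos" as assumed begin and end tokens.
#include <algorithm>
#include <cstdio>
#include <string>
#include <vector>

static std::vector<std::string> DecodeTags(
    const std::vector<float> &probs, int steps, int classes,
    const std::vector<std::string> &label_list, float &avg_score) {
  std::vector<std::string> tags;
  float score_sum = 0.f;
  int count = 0;
  for (int step = 0; step < steps; ++step) {
    const float *row = probs.data() + step * classes;
    int idx = int(std::max_element(row, row + classes) - row);
    const std::string &tag = label_list[idx];
    if (step > 0 && tag == "eos") break;  // stop at the end token
    if (tag == "sos") continue;           // skip the begin token
    score_sum += row[idx];
    ++count;
    tags.push_back(tag);
  }
  avg_score = count > 0 ? score_sum / count : -1.f;
  return tags;
}

int main() {
  std::vector<std::string> labels = {"sos", "<tr>", "<td>",
                                     "</td>", "</tr>", "eos"};
  // Two decode steps over a 6-class head.
  std::vector<float> probs = {0.1f, 0.7f, 0.1f, 0.05f, 0.03f, 0.02f,
                              0.1f, 0.1f, 0.6f, 0.1f,  0.05f, 0.05f};
  float score = 0.f;
  std::vector<std::string> tags = DecodeTags(probs, 2, 6, labels, score);
  for (const std::string &t : tags) std::printf("%s ", t.c_str());
  std::printf("\nscore: %.3f\n", score);
  return 0;
}
```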
float(max_len) / float(w) : float(max_len) / float(h); + + int resize_h = int(float(h) * ratio); + int resize_w = int(float(w) * ratio); + + cv::resize(img, resize_img, cv::Size(resize_w, resize_h)); +} + +void TablePadImg::Run(const cv::Mat &img, cv::Mat &resize_img, + const int max_len) { + int w = img.cols; + int h = img.rows; + cv::copyMakeBorder(img, resize_img, 0, max_len - h, 0, max_len - w, + cv::BORDER_CONSTANT, cv::Scalar(0, 0, 0)); +} + } // namespace PaddleOCR diff --git a/deploy/cpp_infer/src/structure_table.cpp b/deploy/cpp_infer/src/structure_table.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7df0ab94b5df8a62148ceb01f48b35d73b14f78c --- /dev/null +++ b/deploy/cpp_infer/src/structure_table.cpp @@ -0,0 +1,157 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +namespace PaddleOCR { + +void StructureTableRecognizer::Run( + std::vector img_list, + std::vector> &structure_html_tags, + std::vector &structure_scores, + std::vector>> &structure_boxes, + std::vector ×) { + std::chrono::duration preprocess_diff = + std::chrono::steady_clock::now() - std::chrono::steady_clock::now(); + std::chrono::duration inference_diff = + std::chrono::steady_clock::now() - std::chrono::steady_clock::now(); + std::chrono::duration postprocess_diff = + std::chrono::steady_clock::now() - std::chrono::steady_clock::now(); + + int img_num = img_list.size(); + for (int beg_img_no = 0; beg_img_no < img_num; + beg_img_no += this->table_batch_num_) { + // preprocess + auto preprocess_start = std::chrono::steady_clock::now(); + int end_img_no = min(img_num, beg_img_no + this->table_batch_num_); + int batch_num = end_img_no - beg_img_no; + std::vector norm_img_batch; + std::vector width_list; + std::vector height_list; + for (int ino = beg_img_no; ino < end_img_no; ino++) { + cv::Mat srcimg; + img_list[ino].copyTo(srcimg); + cv::Mat resize_img; + cv::Mat pad_img; + this->resize_op_.Run(srcimg, resize_img, this->table_max_len_); + this->normalize_op_.Run(&resize_img, this->mean_, this->scale_, + this->is_scale_); + this->pad_op_.Run(resize_img, pad_img, this->table_max_len_); + norm_img_batch.push_back(pad_img); + width_list.push_back(srcimg.cols); + height_list.push_back(srcimg.rows); + } + + std::vector input( + batch_num * 3 * this->table_max_len_ * this->table_max_len_, 0.0f); + this->permute_op_.Run(norm_img_batch, input.data()); + auto preprocess_end = std::chrono::steady_clock::now(); + preprocess_diff += preprocess_end - preprocess_start; + // inference. 
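`TableResizeImg` and `TablePadImg` together bring every table crop to a fixed square input: scale so the longer side equals `table_max_len`, then pad the bottom and right edges with black. A hedged OpenCV sketch of that pipeline is shown below; `488` is used as an assumed `table_max_len`, not a value taken from this diff.

```
// Hedged sketch: resize the longer side of an image to max_len, then pad the
// bottom and right edges to get a fixed max_len x max_len network input.
#include <cstdio>
#include <opencv2/opencv.hpp>

static cv::Mat ResizeAndPad(const cv::Mat &img, int max_len) {
  float ratio = img.cols >= img.rows ? float(max_len) / img.cols
                                     : float(max_len) / img.rows;
  cv::Mat resized;
  cv::resize(img, resized,
             cv::Size(int(img.cols * ratio), int(img.rows * ratio)));
  cv::Mat padded;
  cv::copyMakeBorder(resized, padded, 0, max_len - resized.rows, 0,
                     max_len - resized.cols, cv::BORDER_CONSTANT,
                     cv::Scalar(0, 0, 0));
  return padded;
}

int main() {
  cv::Mat img(300, 600, CV_8UC3, cv::Scalar(255, 255, 255));
  cv::Mat out = ResizeAndPad(img, 488);       // 488 is an assumed table_max_len
  std::printf("%d x %d\n", out.cols, out.rows);  // 488 x 488
  return 0;
}
```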
+ auto input_names = this->predictor_->GetInputNames(); + auto input_t = this->predictor_->GetInputHandle(input_names[0]); + input_t->Reshape( + {batch_num, 3, this->table_max_len_, this->table_max_len_}); + auto inference_start = std::chrono::steady_clock::now(); + input_t->CopyFromCpu(input.data()); + this->predictor_->Run(); + auto output_names = this->predictor_->GetOutputNames(); + auto output_tensor0 = this->predictor_->GetOutputHandle(output_names[0]); + auto output_tensor1 = this->predictor_->GetOutputHandle(output_names[1]); + std::vector predict_shape0 = output_tensor0->shape(); + std::vector predict_shape1 = output_tensor1->shape(); + + int out_num0 = std::accumulate(predict_shape0.begin(), predict_shape0.end(), + 1, std::multiplies()); + int out_num1 = std::accumulate(predict_shape1.begin(), predict_shape1.end(), + 1, std::multiplies()); + std::vector loc_preds; + std::vector structure_probs; + loc_preds.resize(out_num0); + structure_probs.resize(out_num1); + + output_tensor0->CopyToCpu(loc_preds.data()); + output_tensor1->CopyToCpu(structure_probs.data()); + auto inference_end = std::chrono::steady_clock::now(); + inference_diff += inference_end - inference_start; + // postprocess + auto postprocess_start = std::chrono::steady_clock::now(); + std::vector> structure_html_tag_batch; + std::vector structure_score_batch; + std::vector>> structure_boxes_batch; + this->post_processor_.Run(loc_preds, structure_probs, structure_score_batch, + predict_shape0, predict_shape1, + structure_html_tag_batch, structure_boxes_batch, + width_list, height_list); + for (int m = 0; m < predict_shape0[0]; m++) { + + structure_html_tag_batch[m].insert(structure_html_tag_batch[m].begin(), + ""); + structure_html_tag_batch[m].insert(structure_html_tag_batch[m].begin(), + ""); + structure_html_tag_batch[m].insert(structure_html_tag_batch[m].begin(), + ""); + structure_html_tag_batch[m].push_back("
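After decoding, the per-image tag list is wrapped with an outer HTML scaffold before it is returned: opening tags are inserted at the front and the matching closing tags are appended at the back. The sketch below illustrates that wrapping; the exact `<html><body><table>` literals are an assumption for illustration.

```
// Hedged sketch: wrap the decoded structure tags with an outer HTML scaffold.
#include <cstdio>
#include <deque>
#include <string>

int main() {
  std::deque<std::string> tags = {"<tr>", "<td>", "</td>", "</tr>"};
  // Prepend the opening scaffold; pushing front in this order yields
  // <html><body><table> at the head of the sequence.
  for (const char *open : {"<table>", "<body>", "<html>"}) {
    tags.push_front(open);
  }
  // Append the matching closing tags.
  for (const char *close : {"</table>", "</body>", "</html>"}) {
    tags.push_back(close);
  }
  std::string html;
  for (const std::string &t : tags) html += t;
  std::printf("%s\n", html.c_str());
  return 0;
}
```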
"); + structure_html_tag_batch[m].push_back(""); + structure_html_tag_batch[m].push_back(""); + structure_html_tags.push_back(structure_html_tag_batch[m]); + structure_scores.push_back(structure_score_batch[m]); + structure_boxes.push_back(structure_boxes_batch[m]); + } + auto postprocess_end = std::chrono::steady_clock::now(); + postprocess_diff += postprocess_end - postprocess_start; + times.push_back(double(preprocess_diff.count() * 1000)); + times.push_back(double(inference_diff.count() * 1000)); + times.push_back(double(postprocess_diff.count() * 1000)); + } +} + +void StructureTableRecognizer::LoadModel(const std::string &model_dir) { + AnalysisConfig config; + config.SetModel(model_dir + "/inference.pdmodel", + model_dir + "/inference.pdiparams"); + + if (this->use_gpu_) { + config.EnableUseGpu(this->gpu_mem_, this->gpu_id_); + if (this->use_tensorrt_) { + auto precision = paddle_infer::Config::Precision::kFloat32; + if (this->precision_ == "fp16") { + precision = paddle_infer::Config::Precision::kHalf; + } + if (this->precision_ == "int8") { + precision = paddle_infer::Config::Precision::kInt8; + } + config.EnableTensorRtEngine(1 << 20, 10, 3, precision, false, false); + } + } else { + config.DisableGpu(); + if (this->use_mkldnn_) { + config.EnableMKLDNN(); + } + config.SetCpuMathLibraryNumThreads(this->cpu_math_library_num_threads_); + } + + // false for zero copy tensor + config.SwitchUseFeedFetchOps(false); + // true for multiple input + config.SwitchSpecifyInputNames(true); + + config.SwitchIrOptim(true); + + config.EnableMemoryOptim(); + config.DisableGlogInfo(); + + this->predictor_ = CreatePredictor(config); +} +} // namespace PaddleOCR diff --git a/deploy/cpp_infer/src/utility.cpp b/deploy/cpp_infer/src/utility.cpp index 45b8104626cfc3d128e14ece8ba6763f0986cfe4..0e6ba17fc3bab5b5e005f8b5e41640899bee39d0 100644 --- a/deploy/cpp_infer/src/utility.cpp +++ b/deploy/cpp_infer/src/utility.cpp @@ -65,6 +65,37 @@ void Utility::VisualizeBboxes(const cv::Mat &srcimg, << std::endl; } +void Utility::VisualizeBboxes(const cv::Mat &srcimg, + const StructurePredictResult &structure_result, + const std::string &save_path) { + cv::Mat img_vis; + srcimg.copyTo(img_vis); + for (int n = 0; n < structure_result.cell_box.size(); n++) { + if (structure_result.cell_box[n].size() == 8) { + cv::Point rook_points[4]; + for (int m = 0; m < structure_result.cell_box[n].size(); m += 2) { + rook_points[m / 2] = + cv::Point(int(structure_result.cell_box[n][m]), + int(structure_result.cell_box[n][m + 1])); + } + const cv::Point *ppt[1] = {rook_points}; + int npt[] = {4}; + cv::polylines(img_vis, ppt, npt, 1, 1, CV_RGB(0, 255, 0), 2, 8, 0); + } else if (structure_result.cell_box[n].size() == 4) { + cv::Point rook_points[2]; + rook_points[0] = cv::Point(int(structure_result.cell_box[n][0]), + int(structure_result.cell_box[n][1])); + rook_points[1] = cv::Point(int(structure_result.cell_box[n][2]), + int(structure_result.cell_box[n][3])); + cv::rectangle(img_vis, rook_points[0], rook_points[1], CV_RGB(0, 255, 0), + 2, 8, 0); + } + } + + cv::imwrite(save_path, img_vis); + std::cout << "The table visualized image saved in " + save_path << std::endl; +} + // list all files under a directory void Utility::GetAllFiles(const char *dir_name, std::vector &all_inputs) { @@ -248,4 +279,66 @@ void Utility::print_result(const std::vector &ocr_result) { std::cout << std::endl; } } + +cv::Mat Utility::crop_image(cv::Mat &img, std::vector &area) { + cv::Mat crop_im; + int crop_x1 = std::max(0, area[0]); + int crop_y1 = 
std::max(0, area[1]); + int crop_x2 = std::min(img.cols - 1, area[2] - 1); + int crop_y2 = std::min(img.rows - 1, area[3] - 1); + + crop_im = cv::Mat::zeros(area[3] - area[1], area[2] - area[0], 16); + cv::Mat crop_im_window = + crop_im(cv::Range(crop_y1 - area[1], crop_y2 + 1 - area[1]), + cv::Range(crop_x1 - area[0], crop_x2 + 1 - area[0])); + cv::Mat roi_img = + img(cv::Range(crop_y1, crop_y2 + 1), cv::Range(crop_x1, crop_x2 + 1)); + crop_im_window += roi_img; + return crop_im; +} + +void Utility::sorted_boxes(std::vector &ocr_result) { + std::sort(ocr_result.begin(), ocr_result.end(), Utility::comparison_box); + if (ocr_result.size() > 0) { + for (int i = 0; i < ocr_result.size() - 1; i++) { + for (int j = i; j > 0; j--) { + if (abs(ocr_result[j + 1].box[0][1] - ocr_result[j].box[0][1]) < 10 && + (ocr_result[j + 1].box[0][0] < ocr_result[j].box[0][0])) { + std::swap(ocr_result[i], ocr_result[i + 1]); + } + } + } + } +} + +std::vector Utility::xyxyxyxy2xyxy(std::vector> &box) { + int x_collect[4] = {box[0][0], box[1][0], box[2][0], box[3][0]}; + int y_collect[4] = {box[0][1], box[1][1], box[2][1], box[3][1]}; + int left = int(*std::min_element(x_collect, x_collect + 4)); + int right = int(*std::max_element(x_collect, x_collect + 4)); + int top = int(*std::min_element(y_collect, y_collect + 4)); + int bottom = int(*std::max_element(y_collect, y_collect + 4)); + std::vector box1(4, 0); + box1[0] = left; + box1[1] = top; + box1[2] = right; + box1[3] = bottom; + return box1; +} + +std::vector Utility::xyxyxyxy2xyxy(std::vector &box) { + int x_collect[4] = {box[0], box[2], box[4], box[6]}; + int y_collect[4] = {box[1], box[3], box[5], box[7]}; + int left = int(*std::min_element(x_collect, x_collect + 4)); + int right = int(*std::max_element(x_collect, x_collect + 4)); + int top = int(*std::min_element(y_collect, y_collect + 4)); + int bottom = int(*std::max_element(y_collect, y_collect + 4)); + std::vector box1(4, 0); + box1[0] = left; + box1[1] = top; + box1[2] = right; + box1[3] = bottom; + return box1; +} + } // namespace PaddleOCR \ No newline at end of file diff --git a/deploy/hubserving/ocr_system/module.py b/deploy/hubserving/ocr_system/module.py index 71a19c6b7049ec1d779377e7c84cbfe7d2820991..dff3abb48010946a9817b832383f1c95b7053970 100644 --- a/deploy/hubserving/ocr_system/module.py +++ b/deploy/hubserving/ocr_system/module.py @@ -118,7 +118,7 @@ class OCRSystem(hub.Module): all_results.append([]) continue starttime = time.time() - dt_boxes, rec_res = self.text_sys(img) + dt_boxes, rec_res, _ = self.text_sys(img) elapse = time.time() - starttime logger.info("Predict time: {}".format(elapse)) diff --git a/deploy/hubserving/readme.md b/deploy/hubserving/readme.md index 183a25912c2c62371e6db6af2fde5c792fbcbecb..c583cc96ede437a1f65f9b1bddb69e84b7c54852 100755 --- a/deploy/hubserving/readme.md +++ b/deploy/hubserving/readme.md @@ -20,13 +20,14 @@ PaddleOCR提供2种服务部署方式: # 基于PaddleHub Serving的服务部署 -hubserving服务部署目录下包括文本检测、文本方向分类,文本识别、文本检测+文本方向分类+文本识别3阶段串联,表格识别和PP-Structure六种服务包,请根据需求选择相应的服务包进行安装和启动。目录结构如下: +hubserving服务部署目录下包括文本检测、文本方向分类,文本识别、文本检测+文本方向分类+文本识别3阶段串联,版面分析、表格识别和PP-Structure七种服务包,请根据需求选择相应的服务包进行安装和启动。目录结构如下: ``` deploy/hubserving/ └─ ocr_cls 文本方向分类模块服务包 └─ ocr_det 文本检测模块服务包 └─ ocr_rec 文本识别模块服务包 └─ ocr_system 文本检测+文本方向分类+文本识别串联服务包 + └─ structure_layout 版面分析服务包 └─ structure_table 表格识别服务包 └─ structure_system PP-Structure服务包 ``` @@ -41,6 +42,7 @@ deploy/hubserving/ocr_system/ ``` ## 1. 
近期更新 +* 2022.08.23 新增版面分析服务。 * 2022.05.05 新增PP-OCRv3检测和识别模型。 * 2022.03.30 新增PP-Structure和表格识别两种服务。 @@ -59,8 +61,9 @@ pip3 install paddlehub==2.1.0 --upgrade -i https://mirror.baidu.com/pypi/simple 检测模型:./inference/ch_PP-OCRv3_det_infer/ 识别模型:./inference/ch_PP-OCRv3_rec_infer/ 方向分类器:./inference/ch_ppocr_mobile_v2.0_cls_infer/ -表格结构识别模型:./inference/en_ppocr_mobile_v2.0_table_structure_infer/ -``` +版面分析模型:./inference/picodet_lcnet_x1_0_fgd_layout_infer/ +表格结构识别模型:./inference/ch_ppstructure_mobile_v2.0_SLANet_infer/ +``` **模型路径可在`params.py`中查看和修改。** 更多模型可以从PaddleOCR提供的模型库[PP-OCR](../../doc/doc_ch/models_list.md)和[PP-Structure](../../ppstructure/docs/models_list.md)下载,也可以替换成自己训练转换好的模型。 @@ -86,6 +89,9 @@ hub install deploy/hubserving/structure_table/ # 或,安装PP-Structure服务模块: hub install deploy/hubserving/structure_system/ + +# 或,安装版面分析服务模块: +hub install deploy/hubserving/structure_layout/ ``` * 在Windows环境下(文件夹的分隔符为`\`),安装示例如下: @@ -107,6 +113,9 @@ hub install deploy\hubserving\structure_table\ # 或,安装PP-Structure服务模块: hub install deploy\hubserving\structure_system\ + +# 或,安装版面分析服务模块: +hub install deploy\hubserving\structure_layout\ ``` ### 2.4 启动服务 @@ -117,7 +126,7 @@ $ hub serving start --modules [Module1==Version1, Module2==Version2, ...] \ --port XXXX \ --use_multiprocess \ --workers \ -``` +``` **参数:** @@ -167,12 +176,12 @@ $ hub serving start --modules [Module1==Version1, Module2==Version2, ...] \ ```shell export CUDA_VISIBLE_DEVICES=3 hub serving start -c deploy/hubserving/ocr_system/config.json -``` +``` ## 3. 发送预测请求 配置好服务端,可使用以下命令发送预测请求,获取预测结果: -```python tools/test_hubserving.py server_url image_path``` +```python tools/test_hubserving.py --server_url=server_url --image_dir=image_path``` 需要给脚本传递2个参数: - **server_url**:服务地址,格式为 @@ -184,6 +193,7 @@ hub serving start -c deploy/hubserving/ocr_system/config.json `http://127.0.0.1:8868/predict/ocr_system` `http://127.0.0.1:8869/predict/structure_table` `http://127.0.0.1:8870/predict/structure_system` +`http://127.0.0.1:8870/predict/structure_layout` - **image_dir**:测试图像路径,可以是单张图片路径,也可以是图像集合目录路径 - **visualize**:是否可视化结果,默认为False - **output**:可视化结果保存路径,默认为`./hubserving_result` @@ -202,17 +212,19 @@ hub serving start -c deploy/hubserving/ocr_system/config.json |text_region|list|文本位置坐标| |html|str|表格的html字符串| |regions|list|版面分析+表格识别+OCR的结果,每一项为一个list,包含表示区域坐标的`bbox`,区域类型的`type`和区域结果的`res`三个字段| +|layout|list|版面分析的结果,每一项一个dict,包含版面区域坐标的`bbox`,区域类型的`label`| 不同模块返回的字段不同,如,文本识别服务模块返回结果不含`text_region`字段,具体信息如下: -| 字段名/模块名 | ocr_det | ocr_cls | ocr_rec | ocr_system | structure_table | structure_system | -| --- | --- | --- | --- | --- | --- |--- | -|angle| | ✔ | | ✔ | || -|text| | |✔|✔| | ✔ | -|confidence| |✔ |✔| | | ✔| -|text_region| ✔| | |✔ | | ✔| -|html| | | | |✔ |✔| -|regions| | | | |✔ |✔ | +| 字段名/模块名 | ocr_det | ocr_cls | ocr_rec | ocr_system | structure_table | structure_system | Structure_layout | +| --- | --- | --- | --- | --- | --- | --- | --- | +|angle| | ✔ | | ✔ | ||| +|text| | |✔|✔| | ✔ | | +|confidence| |✔ |✔| | | ✔| | +|text_region| ✔| | |✔ | | ✔| | +|html| | | | |✔ |✔|| +|regions| | | | |✔ |✔ | | +|layout| | | | | | | ✔ | **说明:** 如果需要增加、删除、修改返回字段,可在相应模块的`module.py`文件中进行修改,完整流程参考下一节自定义修改服务模块。 diff --git a/deploy/hubserving/readme_en.md b/deploy/hubserving/readme_en.md index 27eccbb5e9c465f20b3725f04aa1652e6829fa3c..f09fe46417c7567305e5ce05a14be74d33450c31 100755 --- a/deploy/hubserving/readme_en.md +++ b/deploy/hubserving/readme_en.md @@ -20,13 +20,14 @@ PaddleOCR provides 2 service deployment methods: # Service deployment based on PaddleHub 
Serving -The hubserving service deployment directory includes six service packages: text detection, text angle class, text recognition, text detection+text angle class+text recognition three-stage series connection, table recognition and PP-Structure. Please select the corresponding service package to install and start service according to your needs. The directory is as follows: +The hubserving service deployment directory includes seven service packages: text detection, text angle class, text recognition, text detection+text angle class+text recognition three-stage series connection, layout analysis, table recognition and PP-Structure. Please select the corresponding service package to install and start service according to your needs. The directory is as follows: ``` deploy/hubserving/ └─ ocr_det text detection module service package └─ ocr_cls text angle class module service package └─ ocr_rec text recognition module service package └─ ocr_system text detection+text angle class+text recognition three-stage series connection service package + └─ structure_layout layout analysis service package └─ structure_table table recognition service package └─ structure_system PP-Structure service package ``` @@ -43,6 +44,7 @@ deploy/hubserving/ocr_system/ * 2022.05.05 add PP-OCRv3 text detection and recognition models. * 2022.03.30 add PP-Structure and table recognition services。 +* 2022.08.23 add layout analysis services。 ## 2. Quick start service @@ -61,7 +63,8 @@ Before installing the service module, you need to prepare the inference model an text detection model: ./inference/ch_PP-OCRv3_det_infer/ text recognition model: ./inference/ch_PP-OCRv3_rec_infer/ text angle classifier: ./inference/ch_ppocr_mobile_v2.0_cls_infer/ -tanle recognition: ./inference/en_ppocr_mobile_v2.0_table_structure_infer/ +layout parse model: ./inference/picodet_lcnet_x1_0_fgd_layout_infer/ +tanle recognition: ./inference/ch_ppstructure_mobile_v2.0_SLANet_infer/ ``` **The model path can be found and modified in `params.py`.** More models provided by PaddleOCR can be obtained from the [model library](../../doc/doc_en/models_list_en.md). You can also use models trained by yourself. @@ -88,6 +91,9 @@ hub install deploy/hubserving/structure_table/ # Or install PP-Structure service module hub install deploy/hubserving/structure_system/ + +# Or install layout analysis service module +hub install deploy/hubserving/structure_layout/ ``` * On Windows platform, the examples are as follows. @@ -109,6 +115,9 @@ hub install deploy/hubserving/structure_table/ # Or install PP-Structure service module hub install deploy\hubserving\structure_system\ + +# Or install layout analysis service module +hub install deploy\hubserving\structure_layout\ ``` ### 2.4 Start service @@ -177,7 +186,7 @@ hub serving start -c deploy/hubserving/ocr_system/config.json ## 3. 
Send prediction requests After the service starts, you can use the following command to send a prediction request to obtain the prediction result: ```shell -python tools/test_hubserving.py server_url image_path +python tools/test_hubserving.py --server_url=server_url --image_dir=image_path ``` Two parameters need to be passed to the script: @@ -189,8 +198,9 @@ For example, if using the configuration file to start the text angle classificat `http://127.0.0.1:8866/predict/ocr_cls` `http://127.0.0.1:8867/predict/ocr_rec` `http://127.0.0.1:8868/predict/ocr_system` -`http://127.0.0.1:8869/predict/structure_table` +`http://127.0.0.1:8869/predict/structure_table` `http://127.0.0.1:8870/predict/structure_system` +`http://127.0.0.1:8870/predict/structure_layout` - **image_dir**:Test image path, can be a single image path or an image directory path - **visualize**:Whether to visualize the results, the default value is False - **output**:The floder to save Visualization result, default value is `./hubserving_result` @@ -211,17 +221,19 @@ The returned result is a list. Each item in the list is a dict. The dict may con |text_region|list|text location coordinates| |html|str|table html str| |regions|list|The result of layout analysis + table recognition + OCR, each item is a list, including `bbox` indicating area coordinates, `type` of area type and `res` of area results| +|layout|list|The result of layout analysis, each item is a dict, including `bbox` indicating area coordinates, `label` of area type| The fields returned by different modules are different. For example, the results returned by the text recognition service module do not contain `text_region`. The details are as follows: -| field name/module name | ocr_det | ocr_cls | ocr_rec | ocr_system | structure_table | structure_system | -| --- | --- | --- | --- | --- | --- |--- | -|angle| | ✔ | | ✔ | || -|text| | |✔|✔| | ✔ | -|confidence| |✔ |✔| | | ✔| -|text_region| ✔| | |✔ | | ✔| -|html| | | | |✔ |✔| -|regions| | | | |✔ |✔ | +| field name/module name | ocr_det | ocr_cls | ocr_rec | ocr_system | structure_table | structure_system | structure_layout | +| --- | --- | --- | --- | --- | --- |--- |--- | +|angle| | ✔ | | ✔ | || | +|text| | |✔|✔| | ✔ | | +|confidence| |✔ |✔| | | ✔| | +|text_region| ✔| | |✔ | | ✔| | +|html| | | | |✔ |✔| | +|regions| | | | |✔ |✔ | | +|layout| | | | | | |✔ | **Note:** If you need to add, delete or modify the returned fields, you can modify the file `module.py` of the corresponding module. For the complete process, refer to the user-defined modification service module in the next section. diff --git a/deploy/hubserving/structure_layout/__init__.py b/deploy/hubserving/structure_layout/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c747d3e7aeca842933e083dffc01ef1fba3f4e85 --- /dev/null +++ b/deploy/hubserving/structure_layout/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
\ No newline at end of file diff --git a/deploy/hubserving/structure_layout/config.json b/deploy/hubserving/structure_layout/config.json new file mode 100644 index 0000000000000000000000000000000000000000..bc52c1ab603d5659f90a5ed8a72cdb06638fb9e5 --- /dev/null +++ b/deploy/hubserving/structure_layout/config.json @@ -0,0 +1,16 @@ +{ + "modules_info": { + "structure_layout": { + "init_args": { + "version": "1.0.0", + "use_gpu": true + }, + "predict_args": { + } + } + }, + "port": 8871, + "use_multiprocess": false, + "workers": 2 +} + diff --git a/deploy/hubserving/structure_layout/module.py b/deploy/hubserving/structure_layout/module.py new file mode 100644 index 0000000000000000000000000000000000000000..7091f123fc0039e4886d8763096952d7c445184c --- /dev/null +++ b/deploy/hubserving/structure_layout/module.py @@ -0,0 +1,143 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import sys +sys.path.insert(0, ".") +import copy + +import time +import paddlehub +from paddlehub.common.logger import logger +from paddlehub.module.module import moduleinfo, runnable, serving +import cv2 +import paddlehub as hub + +from tools.infer.utility import base64_to_cv2 +from ppstructure.layout.predict_layout import LayoutPredictor as _LayoutPredictor +from ppstructure.utility import parse_args +from deploy.hubserving.structure_layout.params import read_params + + +@moduleinfo( + name="structure_layout", + version="1.0.0", + summary="PP-Structure layout service", + author="paddle-dev", + author_email="paddle-dev@baidu.com", + type="cv/structure_layout") +class LayoutPredictor(hub.Module): + def _initialize(self, use_gpu=False, enable_mkldnn=False): + """ + initialize with the necessary elements + """ + cfg = self.merge_configs() + cfg.use_gpu = use_gpu + if use_gpu: + try: + _places = os.environ["CUDA_VISIBLE_DEVICES"] + int(_places[0]) + print("use gpu: ", use_gpu) + print("CUDA_VISIBLE_DEVICES: ", _places) + cfg.gpu_mem = 8000 + except: + raise RuntimeError( + "Environment Variable CUDA_VISIBLE_DEVICES is not set correctly. If you wanna use gpu, please set CUDA_VISIBLE_DEVICES via export CUDA_VISIBLE_DEVICES=cuda_device_id." 
+ ) + cfg.ir_optim = True + cfg.enable_mkldnn = enable_mkldnn + + self.layout_predictor = _LayoutPredictor(cfg) + + def merge_configs(self): + # deafult cfg + backup_argv = copy.deepcopy(sys.argv) + sys.argv = sys.argv[:1] + cfg = parse_args() + + update_cfg_map = vars(read_params()) + + for key in update_cfg_map: + cfg.__setattr__(key, update_cfg_map[key]) + + sys.argv = copy.deepcopy(backup_argv) + return cfg + + def read_images(self, paths=[]): + images = [] + for img_path in paths: + assert os.path.isfile( + img_path), "The {} isn't a valid file.".format(img_path) + img = cv2.imread(img_path) + if img is None: + logger.info("error in loading image:{}".format(img_path)) + continue + images.append(img) + return images + + def predict(self, images=[], paths=[]): + """ + Get the chinese texts in the predicted images. + Args: + images (list(numpy.ndarray)): images data, shape of each is [H, W, C]. If images not paths + paths (list[str]): The paths of images. If paths not images + Returns: + res (list): The layout results of images. + """ + + if images != [] and isinstance(images, list) and paths == []: + predicted_data = images + elif images == [] and isinstance(paths, list) and paths != []: + predicted_data = self.read_images(paths) + else: + raise TypeError("The input data is inconsistent with expectations.") + + assert predicted_data != [], "There is not any image to be predicted. Please check the input data." + + all_results = [] + for img in predicted_data: + if img is None: + logger.info("error in loading image") + all_results.append([]) + continue + starttime = time.time() + res, _ = self.layout_predictor(img) + elapse = time.time() - starttime + logger.info("Predict time: {}".format(elapse)) + + for item in res: + item['bbox'] = item['bbox'].tolist() + all_results.append({'layout': res}) + return all_results + + @serving + def serving_method(self, images, **kwargs): + """ + Run as a service. + """ + images_decode = [base64_to_cv2(image) for image in images] + results = self.predict(images_decode, **kwargs) + return results + + +if __name__ == '__main__': + layout = LayoutPredictor() + layout._initialize() + image_path = ['./ppstructure/docs/table/1.png'] + res = layout.predict(paths=image_path) + print(res) diff --git a/deploy/hubserving/structure_layout/params.py b/deploy/hubserving/structure_layout/params.py new file mode 100755 index 0000000000000000000000000000000000000000..448b66ac42dac555f084299f525ee9e91ad481d8 --- /dev/null +++ b/deploy/hubserving/structure_layout/params.py @@ -0,0 +1,32 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + + +class Config(object): + pass + + +def read_params(): + cfg = Config() + + # params for layout analysis + cfg.layout_model_dir = './inference/picodet_lcnet_x1_0_fgd_layout_infer/' + cfg.layout_dict_path = './ppocr/utils/dict/layout_dict/layout_publaynet_dict.txt' + cfg.layout_score_threshold = 0.5 + cfg.layout_nms_threshold = 0.5 + return cfg diff --git a/deploy/hubserving/structure_system/module.py b/deploy/hubserving/structure_system/module.py index 92846edc6698d0d75224a2b2a844c572fcb17a56..61c93bb146ab11998bc7ed3350cb2686b73e3d3b 100644 --- a/deploy/hubserving/structure_system/module.py +++ b/deploy/hubserving/structure_system/module.py @@ -119,7 +119,7 @@ class StructureSystem(hub.Module): all_results.append([]) continue starttime = time.time() - res = self.table_sys(img) + res, _ = self.table_sys(img) elapse = time.time() - starttime logger.info("Predict time: {}".format(elapse)) @@ -144,6 +144,6 @@ class StructureSystem(hub.Module): if __name__ == '__main__': structure_system = StructureSystem() structure_system._initialize() - image_path = ['./doc/table/1.png'] + image_path = ['./ppstructure/docs/table/1.png'] res = structure_system.predict(paths=image_path) print(res) diff --git a/deploy/hubserving/structure_system/params.py b/deploy/hubserving/structure_system/params.py index 3cc6a2794f80bcd68e254b82e45a05eb17811f65..fe691fbc2d172cc1ad32115abd5a4ee850d8ab2e 100755 --- a/deploy/hubserving/structure_system/params.py +++ b/deploy/hubserving/structure_system/params.py @@ -23,8 +23,10 @@ def read_params(): cfg = table_read_params() # params for layout parser model - cfg.layout_path_model = 'lp://PubLayNet/ppyolov2_r50vd_dcn_365e_publaynet/config' - cfg.layout_label_map = None + cfg.layout_model_dir = '' + cfg.layout_dict_path = './ppocr/utils/dict/layout_publaynet_dict.txt' + cfg.layout_score_threshold = 0.5 + cfg.layout_nms_threshold = 0.5 cfg.mode = 'structure' cfg.output = './output' diff --git a/deploy/hubserving/structure_table/module.py b/deploy/hubserving/structure_table/module.py index 00393daa037368191201a5afed4aa29a3920c268..b4432b2d7b8764bc0327e7b12fe7887530e825c4 100644 --- a/deploy/hubserving/structure_table/module.py +++ b/deploy/hubserving/structure_table/module.py @@ -118,11 +118,11 @@ class TableSystem(hub.Module): all_results.append([]) continue starttime = time.time() - pred_html = self.table_sys(img) + res, _ = self.table_sys(img) elapse = time.time() - starttime logger.info("Predict time: {}".format(elapse)) - all_results.append({'html': pred_html}) + all_results.append({'html': res['html']}) return all_results @serving @@ -138,6 +138,6 @@ class TableSystem(hub.Module): if __name__ == '__main__': table_system = TableSystem() table_system._initialize() - image_path = ['./doc/table/table.jpg'] + image_path = ['./ppstructure/docs/table/table.jpg'] res = table_system.predict(paths=image_path) print(res) diff --git a/deploy/lite/config.txt b/deploy/lite/config.txt index dda0d2b0320544d3a82f59b0672c086c64d83d3d..404249323b6cb5de345438056a9a10abd64b38bc 100644 --- a/deploy/lite/config.txt +++ b/deploy/lite/config.txt @@ -5,4 +5,4 @@ det_db_unclip_ratio 1.6 det_db_use_dilate 0 det_use_polygon_score 1 use_direction_classify 1 -rec_image_height 32 \ No newline at end of file +rec_image_height 48 \ No newline at end of file diff --git a/deploy/lite/readme.md b/deploy/lite/readme.md index 
a1bef8120e52dd91db0fda4ac2a4d91cc2800818..fc91cbfa7d69f6a8c1086243e4df3f820bd78339 100644 --- a/deploy/lite/readme.md +++ b/deploy/lite/readme.md @@ -99,6 +99,8 @@ The following table also provides a series of models that can be deployed on mob |Version|Introduction|Model size|Detection model|Text Direction model|Recognition model|Paddle-Lite branch| |---|---|---|---|---|---|---| +|PP-OCRv3|extra-lightweight chinese OCR optimized model|16.2M|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_ppocr_mobile_v2.0_cls_infer_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.nb)|v2.10| +|PP-OCRv3(slim)|extra-lightweight chinese OCR optimized model|5.9M|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_infer.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_ppocr_mobile_v2.0_cls_slim_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.nb)|v2.10| |PP-OCRv2|extra-lightweight chinese OCR optimized model|11M|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_PP-OCRv2_det_infer_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_ppocr_mobile_v2.0_cls_infer_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_PP-OCRv2_rec_infer_opt.nb)|v2.10| |PP-OCRv2(slim)|extra-lightweight chinese OCR optimized model|4.6M|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_PP-OCRv2_det_slim_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_ppocr_mobile_v2.0_cls_slim_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_PP-OCRv2_rec_slim_opt.nb)|v2.10| @@ -134,17 +136,16 @@ Introduction to paddle_lite_opt parameters: The following takes the ultra-lightweight Chinese model of PaddleOCR as an example to introduce the use of the compiled opt file to complete the conversion of the inference model to the Paddle-Lite optimized model ``` -# 【[Recommendation] Download the Chinese and English inference model of PP-OCRv2 -wget https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_slim_quant_infer.tar && tar xf ch_PP-OCRv2_det_slim_quant_infer.tar -wget https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_infer.tar && tar xf ch_PP-OCRv2_rec_slim_quant_infer.tar +# 【[Recommendation] Download the Chinese and English inference model of PP-OCRv3 +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_infer.tar && tar xf ch_PP-OCRv3_det_slim_infer.tar +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.tar && tar xf ch_PP-OCRv2_rec_slim_quant_infer.tar wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/slim/ch_ppocr_mobile_v2.0_cls_slim_infer.tar && tar xf ch_ppocr_mobile_v2.0_cls_slim_infer.tar # Convert detection model -./opt --model_file=./ch_PP-OCRv2_det_slim_quant_infer/inference.pdmodel --param_file=./ch_PP-OCRv2_det_slim_quant_infer/inference.pdiparams --optimize_out=./ch_PP-OCRv2_det_slim_opt --valid_targets=arm --optimize_out_type=naive_buffer +paddle_lite_opt --model_file=./ch_PP-OCRv3_det_slim_infer/inference.pdmodel --param_file=./ch_PP-OCRv3_det_slim_infer/inference.pdiparams --optimize_out=./ch_PP-OCRv3_det_slim_opt --valid_targets=arm --optimize_out_type=naive_buffer # Convert recognition model -./opt --model_file=./ch_PP-OCRv2_rec_slim_quant_infer/inference.pdmodel 
--param_file=./ch_PP-OCRv2_rec_slim_quant_infer/inference.pdiparams --optimize_out=./ch_PP-OCRv2_rec_slim_opt --valid_targets=arm --optimize_out_type=naive_buffer +paddle_lite_opt --model_file=./ch_PP-OCRv3_rec_slim_infer/inference.pdmodel --param_file=./ch_PP-OCRv3_rec_slim_infer/inference.pdiparams --optimize_out=./ch_PP-OCRv3_rec_slim_opt --valid_targets=arm --optimize_out_type=naive_buffer # Convert angle classifier model -./opt --model_file=./ch_ppocr_mobile_v2.0_cls_slim_infer/inference.pdmodel --param_file=./ch_ppocr_mobile_v2.0_cls_slim_infer/inference.pdiparams --optimize_out=./ch_ppocr_mobile_v2.0_cls_slim_opt --valid_targets=arm --optimize_out_type=naive_buffer - +paddle_lite_opt --model_file=./ch_ppocr_mobile_v2.0_cls_slim_infer/inference.pdmodel --param_file=./ch_ppocr_mobile_v2.0_cls_slim_infer/inference.pdiparams --optimize_out=./ch_ppocr_mobile_v2.0_cls_slim_opt --valid_targets=arm --optimize_out_type=naive_buffer ``` After the conversion is successful, there will be more files ending with `.nb` in the inference model directory, which is the successfully converted model file. @@ -197,15 +198,15 @@ Some preparatory work is required first. cp ../../../cxx/lib/libpaddle_light_api_shared.so ./debug/ ``` -Prepare the test image, taking PaddleOCR/doc/imgs/11.jpg as an example, copy the image file to the demo/cxx/ocr/debug/ folder. Prepare the model files optimized by the lite opt tool, ch_det_mv3_db_opt.nb, ch_rec_mv3_crnn_opt.nb, and place them under the demo/cxx/ocr/debug/ folder. +Prepare the test image, taking PaddleOCR/doc/imgs/11.jpg as an example, copy the image file to the demo/cxx/ocr/debug/ folder. Prepare the model files optimized by the lite opt tool, ch_PP-OCRv3_det_slim_opt.nb , ch_PP-OCRv3_rec_slim_opt.nb , and place them under the demo/cxx/ocr/debug/ folder. The structure of the OCR demo is as follows after the above command is executed: ``` demo/cxx/ocr/ |-- debug/ -| |--ch_PP-OCRv2_det_slim_opt.nb Detection model -| |--ch_PP-OCRv2_rec_slim_opt.nb Recognition model +| |--ch_PP-OCRv3_det_slim_opt.nb Detection model +| |--ch_PP-OCRv3_rec_slim_opt.nb Recognition model | |--ch_ppocr_mobile_v2.0_cls_slim_opt.nb Text direction classification model | |--11.jpg Image for OCR | |--ppocr_keys_v1.txt Dictionary file @@ -240,7 +241,7 @@ det_db_thresh 0.3 # Used to filter the binarized image of DB prediction, det_db_box_thresh 0.5 # DDB post-processing filter box threshold, if there is a missing box detected, it can be reduced as appropriate det_db_unclip_ratio 1.6 # Indicates the compactness of the text box, the smaller the value, the closer the text box to the text use_direction_classify 0 # Whether to use the direction classifier, 0 means not to use, 1 means to use -rec_image_height 32 # The height of the input image of the recognition model, the PP-OCRv3 model needs to be set to 48, and the PP-OCRv2 model needs to be set to 32 +rec_image_height 48 # The height of the input image of the recognition model, the PP-OCRv3 model needs to be set to 48, and the PP-OCRv2 model needs to be set to 32 ``` 5. 
Run Model on phone @@ -260,14 +261,14 @@ After the above steps are completed, you can use adb to push the file to the pho export LD_LIBRARY_PATH=${PWD}:$LD_LIBRARY_PATH # The use of ocr_db_crnn is: # ./ocr_db_crnn Mode Detection model file Orientation classifier model file Recognition model file Hardware Precision Threads Batchsize Test image path Dictionary file path - ./ocr_db_crnn system ch_PP-OCRv2_det_slim_opt.nb ch_PP-OCRv2_rec_slim_opt.nb ch_ppocr_mobile_v2.0_cls_slim_opt.nb arm8 INT8 10 1 ./11.jpg config.txt ppocr_keys_v1.txt True + ./ocr_db_crnn system ch_PP-OCRv3_det_slim_opt.nb ch_PP-OCRv3_rec_slim_opt.nb ch_ppocr_mobile_v2.0_cls_slim_opt.nb arm8 INT8 10 1 ./11.jpg config.txt ppocr_keys_v1.txt True # precision can be INT8 for quantitative model or FP32 for normal model. # Only using detection model -./ocr_db_crnn det ch_PP-OCRv2_det_slim_opt.nb arm8 INT8 10 1 ./11.jpg config.txt +./ocr_db_crnn det ch_PP-OCRv3_det_slim_opt.nb arm8 INT8 10 1 ./11.jpg config.txt # Only using recognition model -./ocr_db_crnn rec ch_PP-OCRv2_rec_slim_opt.nb arm8 INT8 10 1 word_1.jpg ppocr_keys_v1.txt config.txt +./ocr_db_crnn rec ch_PP-OCRv3_rec_slim_opt.nb arm8 INT8 10 1 word_1.jpg ppocr_keys_v1.txt config.txt ``` If you modify the code, you need to recompile and push to the phone. diff --git a/deploy/lite/readme_ch.md b/deploy/lite/readme_ch.md index 0793827fe647c470944fc36e2b243c8f7e704e99..78e2510917e0fd85c4a724ec74eccb0b7cfc6118 100644 --- a/deploy/lite/readme_ch.md +++ b/deploy/lite/readme_ch.md @@ -97,6 +97,8 @@ Paddle-Lite 提供了多种策略来自动优化原始的模型,其中包括 |模型版本|模型简介|模型大小|检测模型|文本方向分类模型|识别模型|Paddle-Lite版本| |---|---|---|---|---|---|---| +|PP-OCRv3|蒸馏版超轻量中文OCR移动端模型|16.2M|[下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.nb)|[下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_ppocr_mobile_v2.0_cls_infer_opt.nb)|[下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.nb)|v2.10| +|PP-OCRv3(slim)|蒸馏版超轻量中文OCR移动端模型|5.9M|[下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_infer.nb)|[下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_ppocr_mobile_v2.0_cls_slim_opt.nb)|[下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.nb)|v2.10| |PP-OCRv2|蒸馏版超轻量中文OCR移动端模型|11M|[下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_PP-OCRv2_det_infer_opt.nb)|[下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_ppocr_mobile_v2.0_cls_infer_opt.nb)|[下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_PP-OCRv2_rec_infer_opt.nb)|v2.10| |PP-OCRv2(slim)|蒸馏版超轻量中文OCR移动端模型|4.6M|[下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_PP-OCRv2_det_slim_opt.nb)|[下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_ppocr_mobile_v2.0_cls_slim_opt.nb)|[下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_PP-OCRv2_rec_slim_opt.nb)|v2.10| @@ -131,16 +133,16 @@ paddle_lite_opt 参数介绍: 下面以PaddleOCR的超轻量中文模型为例,介绍使用编译好的opt文件完成inference模型到Paddle-Lite优化模型的转换。 ``` -# 【推荐】 下载 PP-OCRv2版本的中英文 inference模型 -wget https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_slim_quant_infer.tar && tar xf ch_PP-OCRv2_det_slim_quant_infer.tar -wget https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_infer.tar && tar xf ch_PP-OCRv2_rec_slim_quant_infer.tar +# 【推荐】 下载 PP-OCRv3版本的中英文 inference模型 +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_infer.tar && tar xf ch_PP-OCRv3_det_slim_infer.tar +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.tar && tar xf 
ch_PP-OCRv2_rec_slim_quant_infer.tar wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/slim/ch_ppocr_mobile_v2.0_cls_slim_infer.tar && tar xf ch_ppocr_mobile_v2.0_cls_slim_infer.tar # 转换检测模型 -./opt --model_file=./ch_PP-OCRv2_det_slim_quant_infer/inference.pdmodel --param_file=./ch_PP-OCRv2_det_slim_quant_infer/inference.pdiparams --optimize_out=./ch_PP-OCRv2_det_slim_opt --valid_targets=arm --optimize_out_type=naive_buffer +paddle_lite_opt --model_file=./ch_PP-OCRv3_det_slim_infer/inference.pdmodel --param_file=./ch_PP-OCRv3_det_slim_infer/inference.pdiparams --optimize_out=./ch_PP-OCRv3_det_slim_opt --valid_targets=arm --optimize_out_type=naive_buffer # 转换识别模型 -./opt --model_file=./ch_PP-OCRv2_rec_slim_quant_infer/inference.pdmodel --param_file=./ch_PP-OCRv2_rec_slim_quant_infer/inference.pdiparams --optimize_out=./ch_PP-OCRv2_rec_slim_opt --valid_targets=arm --optimize_out_type=naive_buffer +paddle_lite_opt --model_file=./ch_PP-OCRv3_rec_slim_infer/inference.pdmodel --param_file=./ch_PP-OCRv3_rec_slim_infer/inference.pdiparams --optimize_out=./ch_PP-OCRv3_rec_slim_opt --valid_targets=arm --optimize_out_type=naive_buffer # 转换方向分类器模型 -./opt --model_file=./ch_ppocr_mobile_v2.0_cls_slim_infer/inference.pdmodel --param_file=./ch_ppocr_mobile_v2.0_cls_slim_infer/inference.pdiparams --optimize_out=./ch_ppocr_mobile_v2.0_cls_slim_opt --valid_targets=arm --optimize_out_type=naive_buffer +paddle_lite_opt --model_file=./ch_ppocr_mobile_v2.0_cls_slim_infer/inference.pdmodel --param_file=./ch_ppocr_mobile_v2.0_cls_slim_infer/inference.pdiparams --optimize_out=./ch_ppocr_mobile_v2.0_cls_slim_opt --valid_targets=arm --optimize_out_type=naive_buffer ``` @@ -194,15 +196,15 @@ wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/slim/ch_ppocr_mobile_v2.0_cls ``` 准备测试图像,以`PaddleOCR/doc/imgs/11.jpg`为例,将测试的图像复制到`demo/cxx/ocr/debug/`文件夹下。 - 准备lite opt工具优化后的模型文件,比如使用`ch_PP-OCRv2_det_slim_opt.ch_PP-OCRv2_rec_slim_rec.nb, ch_ppocr_mobile_v2.0_cls_slim_opt.nb`,模型文件放置在`demo/cxx/ocr/debug/`文件夹下。 + 准备lite opt工具优化后的模型文件,比如使用`ch_PP-OCRv3_det_slim_opt.ch_PP-OCRv3_rec_slim_rec.nb, ch_ppocr_mobile_v2.0_cls_slim_opt.nb`,模型文件放置在`demo/cxx/ocr/debug/`文件夹下。 执行完成后,ocr文件夹下将有如下文件格式: ``` demo/cxx/ocr/ |-- debug/ -| |--ch_PP-OCRv2_det_slim_opt.nb 优化后的检测模型文件 -| |--ch_PP-OCRv2_rec_slim_opt.nb 优化后的识别模型文件 +| |--ch_PP-OCRv3_det_slim_opt.nb 优化后的检测模型文件 +| |--ch_PP-OCRv3_rec_slim_opt.nb 优化后的识别模型文件 | |--ch_ppocr_mobile_v2.0_cls_slim_opt.nb 优化后的文字方向分类器模型文件 | |--11.jpg 待测试图像 | |--ppocr_keys_v1.txt 中文字典文件 @@ -239,7 +241,7 @@ det_db_thresh 0.3 # 用于过滤DB预测的二值化图像,设置为0. det_db_box_thresh 0.5 # 检测器后处理过滤box的阈值,如果检测存在漏框情况,可酌情减小 det_db_unclip_ratio 1.6 # 表示文本框的紧致程度,越小则文本框更靠近文本 use_direction_classify 0 # 是否使用方向分类器,0表示不使用,1表示使用 -rec_image_height 32 # 识别模型输入图像的高度,PP-OCRv3模型设置为48,PP-OCRv2模型需要设置为32 +rec_image_height 48 # 识别模型输入图像的高度,PP-OCRv3模型设置为48,PP-OCRv2模型需要设置为32 ``` 5. 
启动调试 @@ -259,13 +261,13 @@ rec_image_height 32 # 识别模型输入图像的高度,PP-OCRv3模型 export LD_LIBRARY_PATH=${PWD}:$LD_LIBRARY_PATH # 开始使用,ocr_db_crnn可执行文件的使用方式为: # ./ocr_db_crnn 预测模式 检测模型文件 方向分类器模型文件 识别模型文件 运行硬件 运行精度 线程数 batchsize 测试图像路径 参数配置路径 字典文件路径 是否使用benchmark参数 - ./ocr_db_crnn system ch_PP-OCRv2_det_slim_opt.nb ch_PP-OCRv2_rec_slim_opt.nb ch_ppocr_mobile_v2.0_cls_slim_opt.nb arm8 INT8 10 1 ./11.jpg config.txt ppocr_keys_v1.txt True + ./ocr_db_crnn system ch_PP-OCRv3_det_slim_opt.nb ch_PP-OCRv3_rec_slim_opt.nb ch_ppocr_mobile_v2.0_cls_slim_opt.nb arm8 INT8 10 1 ./11.jpg config.txt ppocr_keys_v1.txt True # 仅使用文本检测模型,使用方式如下: -./ocr_db_crnn det ch_PP-OCRv2_det_slim_opt.nb arm8 INT8 10 1 ./11.jpg config.txt +./ocr_db_crnn det ch_PP-OCRv3_det_slim_opt.nb arm8 INT8 10 1 ./11.jpg config.txt # 仅使用文本识别模型,使用方式如下: -./ocr_db_crnn rec ch_PP-OCRv2_rec_slim_opt.nb arm8 INT8 10 1 word_1.jpg ppocr_keys_v1.txt config.txt +./ocr_db_crnn rec ch_PP-OCRv3_rec_slim_opt.nb arm8 INT8 10 1 word_1.jpg ppocr_keys_v1.txt config.txt ``` 如果对代码做了修改,则需要重新编译并push到手机上。 diff --git a/deploy/slim/quantization/README.md b/deploy/slim/quantization/README.md index 4c1d784b99aade614d78b4bd6fb20afef15f0f6f..7f1ff7ae22e78cded28f1689d66a5e41dd8950a2 100644 --- a/deploy/slim/quantization/README.md +++ b/deploy/slim/quantization/README.md @@ -22,7 +22,7 @@ ### 1. 安装PaddleSlim ```bash -pip3 install paddleslim==2.2.2 +pip3 install paddleslim==2.3.2 ``` ### 2. 准备训练好的模型 @@ -33,17 +33,7 @@ PaddleOCR提供了一系列训练好的[模型](../../../doc/doc_ch/models_list. 量化训练包括离线量化训练和在线量化训练,在线量化训练效果更好,需加载预训练模型,在定义好量化策略后即可对模型进行量化。 -量化训练的代码位于slim/quantization/quant.py 中,比如训练检测模型,训练指令如下: -```bash -python deploy/slim/quantization/quant.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml -o Global.pretrained_model='your trained model' Global.save_model_dir=./output/quant_model - -# 比如下载提供的训练模型 -wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_train.tar -tar -xf ch_ppocr_mobile_v2.0_det_train.tar -python deploy/slim/quantization/quant.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml -o Global.pretrained_model=./ch_ppocr_mobile_v2.0_det_train/best_accuracy Global.save_model_dir=./output/quant_model -``` - -模型蒸馏和模型量化可以同时使用,以PPOCRv3检测模型为例: +量化训练的代码位于slim/quantization/quant.py 中,比如训练检测模型,以PPOCRv3检测模型为例,训练指令如下: ``` # 下载检测预训练模型: wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar @@ -58,7 +48,7 @@ python deploy/slim/quantization/quant.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_ 在得到量化训练保存的模型后,我们可以将其导出为inference_model,用于预测部署: ```bash -python deploy/slim/quantization/export_model.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml -o Global.checkpoints=output/quant_model/best_accuracy Global.save_inference_dir=./output/quant_inference_model +python deploy/slim/quantization/export_model.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml -o Global.checkpoints=output/quant_model/best_accuracy Global.save_inference_dir=./output/quant_inference_model ``` ### 5. 量化模型部署 diff --git a/deploy/slim/quantization/README_en.md b/deploy/slim/quantization/README_en.md index 33b2c4784afa4be68c8b9db1a02d83013c886655..f82c3d844e292ee76b95624f7632ed40301e5a4c 100644 --- a/deploy/slim/quantization/README_en.md +++ b/deploy/slim/quantization/README_en.md @@ -25,7 +25,7 @@ After training, if you want to further compress the model size and accelerate th ### 1. 
Install PaddleSlim ```bash -pip3 install paddleslim==2.2.2 +pip3 install paddleslim==2.3.2 ``` @@ -39,18 +39,7 @@ Quantization training includes offline quantization training and online quantiza Online quantization training is more effective. It is necessary to load the pre-trained model. After the quantization strategy is defined, the model can be quantified. -The code for quantization training is located in `slim/quantization/quant.py`. For example, to train a detection model, the training instructions are as follows: -```bash -python deploy/slim/quantization/quant.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml -o Global.pretrained_model='your trained model' Global.save_model_dir=./output/quant_model - -# download provided model -wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_train.tar -tar -xf ch_ppocr_mobile_v2.0_det_train.tar -python deploy/slim/quantization/quant.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml -o Global.pretrained_model=./ch_ppocr_mobile_v2.0_det_train/best_accuracy Global.save_model_dir=./output/quant_model -``` - - -Model distillation and model quantization can be used at the same time, taking the PPOCRv3 detection model as an example: +The code for quantization training is located in `slim/quantization/quant.py`. For example, the training instructions of slim PPOCRv3 detection model are as follows: ``` # download provided model wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar @@ -66,11 +55,11 @@ If you want to quantify the text recognition model, you can modify the configura Once we got the model after pruning and fine-tuning, we can export it as an inference model for the deployment of predictive tasks: ```bash -python deploy/slim/quantization/export_model.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml -o Global.checkpoints=output/quant_model/best_accuracy Global.save_inference_dir=./output/quant_inference_model +python deploy/slim/quantization/export_model.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml -o Global.checkpoints=output/quant_model/best_accuracy Global.save_inference_dir=./output/quant_inference_model ``` ### 5. Deploy The numerical range of the quantized model parameters derived from the above steps is still FP32, but the numerical range of the parameters is int8. The derived model can be converted through the `opt tool` of PaddleLite. 
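As a quick sanity check that the exported quantized detection model loads and runs before handing it to the PaddleLite `opt tool`, the standard Paddle Inference Python API can be used. The snippet below is only an illustrative sketch: the model directory follows the export command above, and the 640x640 dummy input is an arbitrary size that is a multiple of 32.

```python
import numpy as np
from paddle.inference import Config, create_predictor

# directory produced by deploy/slim/quantization/export_model.py (see command above)
model_dir = "./output/quant_inference_model"
config = Config(model_dir + "/inference.pdmodel",
                model_dir + "/inference.pdiparams")
config.disable_gpu()  # CPU is enough for a quick load/shape check

predictor = create_predictor(config)
input_handle = predictor.get_input_handle(predictor.get_input_names()[0])

# dummy NCHW image; any H/W that is a multiple of 32 works for the DB detector
input_handle.copy_from_cpu(np.random.rand(1, 3, 640, 640).astype("float32"))
predictor.run()

output_handle = predictor.get_output_handle(predictor.get_output_names()[0])
print("output shape:", output_handle.copy_to_cpu().shape)
```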
-For quantitative model deployment, please refer to [Mobile terminal model deployment](../../lite/readme_en.md) +For quantitative model deployment, please refer to [Mobile terminal model deployment](../../lite/readme.md) diff --git a/deploy/slim/quantization/export_model.py b/deploy/slim/quantization/export_model.py index fd1c3e5e109667fa74f5ade18b78f634e4d325db..bd132b625181cab853961efd2e2c38c411e9edf4 100755 --- a/deploy/slim/quantization/export_model.py +++ b/deploy/slim/quantization/export_model.py @@ -151,17 +151,24 @@ def main(): arch_config = config["Architecture"] - arch_config = config["Architecture"] + if arch_config["algorithm"] == "SVTR" and arch_config["Head"][ + "name"] != 'MultiHead': + input_shape = config["Eval"]["dataset"]["transforms"][-2][ + 'SVTRRecResizeImg']['image_shape'] + else: + input_shape = None if arch_config["algorithm"] in ["Distillation", ]: # distillation model archs = list(arch_config["Models"].values()) for idx, name in enumerate(model.model_name_list): sub_model_save_path = os.path.join(save_path, name, "inference") export_single_model(model.model_list[idx], archs[idx], - sub_model_save_path, logger, quanter) + sub_model_save_path, logger, input_shape, + quanter) else: save_path = os.path.join(save_path, "inference") - export_single_model(model, arch_config, save_path, logger, quanter) + export_single_model(model, arch_config, save_path, logger, input_shape, + quanter) if __name__ == "__main__": diff --git a/deploy/slim/quantization/quant.py b/deploy/slim/quantization/quant.py index 64521b5e06df61cf656da4087e6cd49f82adfadd..ef2c3e28f94e8b72d1aa7822fc88ecfd5c406b89 100755 --- a/deploy/slim/quantization/quant.py +++ b/deploy/slim/quantization/quant.py @@ -158,8 +158,7 @@ def main(config, device, logger, vdl_writer): pre_best_model_dict = dict() # load fp32 model to begin quantization - if config["Global"]["pretrained_model"] is not None: - pre_best_model_dict = load_model(config, model) + pre_best_model_dict = load_model(config, model, None, config['Architecture']["model_type"]) freeze_params = False if config['Architecture']["algorithm"] in ["Distillation"]: @@ -184,8 +183,7 @@ def main(config, device, logger, vdl_writer): model=model) # resume PACT training process - if config["Global"]["checkpoints"] is not None: - pre_best_model_dict = load_model(config, model, optimizer) + pre_best_model_dict = load_model(config, model, optimizer, config['Architecture']["model_type"]) # build metric eval_class = build_metric(config['Metric']) diff --git a/deploy/slim/quantization/quant_kl.py b/deploy/slim/quantization/quant_kl.py index cc3a455b971937fbb2e401b87112475341bd41f3..73e1a957e8606fd7cc8269e96eec1e274484db06 100755 --- a/deploy/slim/quantization/quant_kl.py +++ b/deploy/slim/quantization/quant_kl.py @@ -97,6 +97,17 @@ def sample_generator(loader): return __reader__ +def sample_generator_layoutxlm_ser(loader): + def __reader__(): + for indx, data in enumerate(loader): + input_ids = np.array(data[0]) + bbox = np.array(data[1]) + attention_mask = np.array(data[2]) + token_type_ids = np.array(data[3]) + images = np.array(data[4]) + yield [input_ids, bbox, attention_mask, token_type_ids, images] + + return __reader__ def main(config, device, logger, vdl_writer): # init dist environment @@ -107,16 +118,18 @@ def main(config, device, logger, vdl_writer): # build dataloader config['Train']['loader']['num_workers'] = 0 + is_layoutxlm_ser = config['Architecture']['model_type'] =='kie' and config['Architecture']['Backbone']['name'] == 'LayoutXLMForSer' train_dataloader 
= build_dataloader(config, 'Train', device, logger) if config['Eval']: config['Eval']['loader']['num_workers'] = 0 valid_dataloader = build_dataloader(config, 'Eval', device, logger) + if is_layoutxlm_ser: + train_dataloader = valid_dataloader else: valid_dataloader = None paddle.enable_static() - place = paddle.CPUPlace() - exe = paddle.static.Executor(place) + exe = paddle.static.Executor(device) if 'inference_model' in global_config.keys(): # , 'inference_model'): inference_model_dir = global_config['inference_model'] @@ -127,6 +140,11 @@ def main(config, device, logger, vdl_writer): raise ValueError( "Please set inference model dir in Global.inference_model or Global.pretrained_model for post-quantazition" ) + + if is_layoutxlm_ser: + generator = sample_generator_layoutxlm_ser(train_dataloader) + else: + generator = sample_generator(train_dataloader) paddleslim.quant.quant_post_static( executor=exe, @@ -134,7 +152,7 @@ def main(config, device, logger, vdl_writer): model_filename='inference.pdmodel', params_filename='inference.pdiparams', quantize_model_path=global_config['save_inference_dir'], - sample_generator=sample_generator(train_dataloader), + sample_generator=generator, save_model_filename='inference.pdmodel', save_params_filename='inference.pdiparams', batch_size=1, diff --git a/doc/datasets/wildreceipt_demo/1bbe854b8817dedb8585e0732089fd1f752d2cec.jpeg b/doc/datasets/wildreceipt_demo/1bbe854b8817dedb8585e0732089fd1f752d2cec.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..dfed3a0c0e943ca6716ea446f0bf10c9dac38cd0 Binary files /dev/null and b/doc/datasets/wildreceipt_demo/1bbe854b8817dedb8585e0732089fd1f752d2cec.jpeg differ diff --git a/doc/datasets/wildreceipt_demo/2769.jpeg b/doc/datasets/wildreceipt_demo/2769.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..d5a28763c9ef08a57afdfa6218322a4e9b8ab199 Binary files /dev/null and b/doc/datasets/wildreceipt_demo/2769.jpeg differ diff --git a/doc/doc_ch/algorithm.md b/doc/doc_ch/algorithm.md deleted file mode 100644 index d50a5aa4e80336036424bddace9579db98c699c3..0000000000000000000000000000000000000000 --- a/doc/doc_ch/algorithm.md +++ /dev/null @@ -1,14 +0,0 @@ -# 前沿算法与模型 - -PaddleOCR将**持续新增**支持OCR领域前沿算法与模型,已支持的模型与使用教程可点击下方列表查看: - -- [文本检测算法](./algorithm_overview.md#11-%E6%96%87%E6%9C%AC%E6%A3%80%E6%B5%8B%E7%AE%97%E6%B3%95) -- [文本识别算法](./algorithm_overview.md#12-%E6%96%87%E6%9C%AC%E8%AF%86%E5%88%AB%E7%AE%97%E6%B3%95) -- [端到端算法](./algorithm_overview.md#2-%E6%96%87%E6%9C%AC%E8%AF%86%E5%88%AB%E7%AE%97%E6%B3%95) -- [表格识别]](./algorithm_overview.md#3-%E8%A1%A8%E6%A0%BC%E8%AF%86%E5%88%AB%E7%AE%97%E6%B3%95) - -**欢迎广大开发者合作共建,贡献更多算法,合入有奖🎁!具体可查看[社区常规赛](https://github.com/PaddlePaddle/PaddleOCR/issues/4982)。** - -新增算法可参考如下教程: - -- [使用PaddleOCR架构添加新算法](./add_new_algorithm.md) diff --git a/doc/doc_ch/algorithm_det_ct.md b/doc/doc_ch/algorithm_det_ct.md new file mode 100644 index 0000000000000000000000000000000000000000..ea3522b7bf3c2dc17ef4f645bc47738477f07cf1 --- /dev/null +++ b/doc/doc_ch/algorithm_det_ct.md @@ -0,0 +1,95 @@ +# CT + +- [1. 算法简介](#1) +- [2. 环境配置](#2) +- [3. 模型训练、评估、预测](#3) + - [3.1 训练](#3-1) + - [3.2 评估](#3-2) + - [3.3 预测](#3-3) +- [4. 推理部署](#4) + - [4.1 Python推理](#4-1) + - [4.2 C++推理](#4-2) + - [4.3 Serving服务化部署](#4-3) + - [4.4 更多推理部署](#4-4) +- [5. FAQ](#5) + + +## 1. 
算法简介 + +论文信息: +> [CentripetalText: An Efficient Text Instance Representation for Scene Text Detection](https://arxiv.org/abs/2107.05945) +> Tao Sheng, Jie Chen, Zhouhui Lian +> NeurIPS, 2021 + + +在Total-Text文本检测公开数据集上,算法复现效果如下: + +|模型|骨干网络|配置文件|precision|recall|Hmean|下载链接| +| --- | --- | --- | --- | --- | --- | --- | +|CT|ResNet18_vd|[configs/det/det_r18_vd_ct.yml](../../configs/det/det_r18_vd_ct.yml)|88.68%|81.70%|85.05%|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r18_ct_train.tar)| + + + +## 2. 环境配置 +请先参考[《运行环境准备》](./environment.md)配置PaddleOCR运行环境,参考[《项目克隆》](./clone.md)克隆项目代码。 + + + +## 3. 模型训练、评估、预测 + +CT模型使用Total-Text文本检测公开数据集训练得到,数据集下载可参考 [Total-Text-Dataset](https://github.com/cs-chan/Total-Text-Dataset/tree/master/Dataset), 我们将标签文件转成了paddleocr格式,转换好的标签文件下载参考[train.txt](https://paddleocr.bj.bcebos.com/dataset/ct_tipc/train.txt), [text.txt](https://paddleocr.bj.bcebos.com/dataset/ct_tipc/test.txt)。 + +请参考[文本检测训练教程](./detection.md)。PaddleOCR对代码进行了模块化,训练不同的检测模型只需要**更换配置文件**即可。 + + + +## 4. 推理部署 + + +### 4.1 Python推理 +首先将CT文本检测训练过程中保存的模型,转换成inference model。以基于Resnet18_vd骨干网络,在Total-Text英文数据集训练的模型为例( [模型下载地址](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r18_ct_train.tar) ),可以使用如下命令进行转换: + +```shell +python3 tools/export_model.py -c configs/det/det_r18_vd_ct.yml -o Global.pretrained_model=./det_r18_ct_train/best_accuracy Global.save_inference_dir=./inference/det_ct +``` + +CT文本检测模型推理,可以执行如下命令: + +```shell +python3 tools/infer/predict_det.py --image_dir="./doc/imgs_en/img623.jpg" --det_model_dir="./inference/det_ct/" --det_algorithm="CT" +``` + +可视化文本检测结果默认保存到`./inference_results`文件夹里面,结果文件的名称前缀为'det_res'。结果示例如下: + +![](../imgs_results/det_res_img623_ct.jpg) + + + +### 4.2 C++推理 + +暂不支持 + + +### 4.3 Serving服务化部署 + +暂不支持 + + +### 4.4 更多推理部署 + +暂不支持 + + +## 5. FAQ + + +## 引用 + +```bibtex +@inproceedings{sheng2021centripetaltext, + title={CentripetalText: An Efficient Text Instance Representation for Scene Text Detection}, + author={Tao Sheng and Jie Chen and Zhouhui Lian}, + booktitle={Thirty-Fifth Conference on Neural Information Processing Systems}, + year={2021} +} +``` diff --git a/doc/doc_ch/algorithm_kie_layoutxlm.md b/doc/doc_ch/algorithm_kie_layoutxlm.md new file mode 100644 index 0000000000000000000000000000000000000000..e693be49b7bc89e04b169fe74cf76525b2494948 --- /dev/null +++ b/doc/doc_ch/algorithm_kie_layoutxlm.md @@ -0,0 +1,172 @@ +# 关键信息抽取算法-LayoutXLM + +- [1. 算法简介](#1-算法简介) +- [2. 环境配置](#2-环境配置) +- [3. 模型训练、评估、预测](#3-模型训练评估预测) +- [4. 推理部署](#4-推理部署) + - [4.1 Python推理](#41-python推理) + - [4.2 C++推理部署](#42-推理部署) + - [4.3 Serving服务化部署](#43-serving服务化部署) + - [4.4 更多推理部署](#44-更多推理部署) +- [5. FAQ](#5-faq) +- [引用](#引用) + + + + +## 1. 
算法简介 + + +论文信息: + +> [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) +> +> Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei +> +> 2021 + +在XFUND_zh数据集上,算法复现效果如下: + +|模型|骨干网络|任务|配置文件|hmean|下载链接| +| --- | --- |--|--- | --- | --- | +|LayoutXLM|LayoutXLM-base|SER |[ser_layoutxlm_xfund_zh.yml](../../configs/kie/layoutlm_series/ser_layoutxlm_xfund_zh.yml)|90.38%|[训练模型](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh.tar)/[推理模型](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh_infer.tar)| +|LayoutXLM|LayoutXLM-base|RE | [re_layoutxlm_xfund_zh.yml](../../configs/kie/layoutlm_series/re_layoutxlm_xfund_zh.yml)|74.83%|[训练模型](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutXLM_xfun_zh.tar)/[推理模型(coming soon)]()| + + + +## 2. 环境配置 +请先参考[《运行环境准备》](./environment.md)配置PaddleOCR运行环境,参考[《项目克隆》](./clone.md)克隆项目代码。 + + + + +## 3. 模型训练、评估、预测 + +请参考[关键信息抽取教程](./kie.md)。PaddleOCR对代码进行了模块化,训练不同的关键信息抽取模型只需要**更换配置文件**即可。 + + + +## 4. 推理部署 + + + +### 4.1 Python推理 + +**注:** 目前RE任务推理过程仍在适配中,下面以SER任务为例,介绍基于LayoutXLM模型的关键信息抽取过程。 + +首先将训练得到的模型转换成inference model。LayoutXLM模型在XFUND_zh数据集上训练的模型为例([模型下载地址](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh.tar)),可以使用下面的命令进行转换。 + +``` bash +wget https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh.tar +tar -xf ser_LayoutXLM_xfun_zh.tar +python3 tools/export_model.py -c configs/kie/layoutlm_series/ser_layoutxlm_xfund_zh.yml -o Architecture.Backbone.checkpoints=./ser_LayoutXLM_xfun_zh/best_accuracy Global.save_inference_dir=./inference/ser_layoutxlm +``` + +LayoutXLM模型基于SER任务进行推理,可以执行如下命令: + +```bash +cd ppstructure +python3 kie/predict_kie_token_ser.py \ + --kie_algorithm=LayoutXLM \ + --ser_model_dir=../inference/ser_layoutxlm_infer \ + --image_dir=./docs/kie/input/zh_val_42.jpg \ + --ser_dict_path=../train_data/XFUND/class_list_xfun.txt \ + --vis_font_path=../doc/fonts/simfang.ttf +``` + +SER可视化结果默认保存到`./output`文件夹里面,结果示例如下: + +
+ +
+ + + +### 4.2 C++推理部署 + +暂不支持 + + +### 4.3 Serving服务化部署 + +暂不支持 + + +### 4.4 更多推理部署 + +暂不支持 + + + +## 5. FAQ + +## 引用 + + +```bibtex +@article{DBLP:journals/corr/abs-2104-08836, + author = {Yiheng Xu and + Tengchao Lv and + Lei Cui and + Guoxin Wang and + Yijuan Lu and + Dinei Flor{\^{e}}ncio and + Cha Zhang and + Furu Wei}, + title = {LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich + Document Understanding}, + journal = {CoRR}, + volume = {abs/2104.08836}, + year = {2021}, + url = {https://arxiv.org/abs/2104.08836}, + eprinttype = {arXiv}, + eprint = {2104.08836}, + timestamp = {Thu, 14 Oct 2021 09:17:23 +0200}, + biburl = {https://dblp.org/rec/journals/corr/abs-2104-08836.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@article{DBLP:journals/corr/abs-1912-13318, + author = {Yiheng Xu and + Minghao Li and + Lei Cui and + Shaohan Huang and + Furu Wei and + Ming Zhou}, + title = {LayoutLM: Pre-training of Text and Layout for Document Image Understanding}, + journal = {CoRR}, + volume = {abs/1912.13318}, + year = {2019}, + url = {http://arxiv.org/abs/1912.13318}, + eprinttype = {arXiv}, + eprint = {1912.13318}, + timestamp = {Mon, 01 Jun 2020 16:20:46 +0200}, + biburl = {https://dblp.org/rec/journals/corr/abs-1912-13318.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@article{DBLP:journals/corr/abs-2012-14740, + author = {Yang Xu and + Yiheng Xu and + Tengchao Lv and + Lei Cui and + Furu Wei and + Guoxin Wang and + Yijuan Lu and + Dinei A. F. Flor{\^{e}}ncio and + Cha Zhang and + Wanxiang Che and + Min Zhang and + Lidong Zhou}, + title = {LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding}, + journal = {CoRR}, + volume = {abs/2012.14740}, + year = {2020}, + url = {https://arxiv.org/abs/2012.14740}, + eprinttype = {arXiv}, + eprint = {2012.14740}, + timestamp = {Tue, 27 Jul 2021 09:53:52 +0200}, + biburl = {https://dblp.org/rec/journals/corr/abs-2012-14740.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} +``` diff --git a/doc/doc_ch/algorithm_kie_sdmgr.md b/doc/doc_ch/algorithm_kie_sdmgr.md new file mode 100644 index 0000000000000000000000000000000000000000..10f3ca063596942618466723ed69a9047e9c828d --- /dev/null +++ b/doc/doc_ch/algorithm_kie_sdmgr.md @@ -0,0 +1,144 @@ + +# 关键信息抽取算法-SDMGR + +- [1. 算法简介](#1-算法简介) +- [2. 环境配置](#2-环境配置) +- [3. 模型训练、评估、预测](#3-模型训练评估预测) + - [3.1 模型训练](#31-模型训练) + - [3.2 模型评估](#32-模型评估) + - [3.3 模型预测](#33-模型预测) +- [4. 推理部署](#4-推理部署) + - [4.1 Python推理](#41-python推理) + - [4.2 C++推理部署](#42-c推理部署) + - [4.3 Serving服务化部署](#43-serving服务化部署) + - [4.4 更多推理部署](#44-更多推理部署) +- [5. FAQ](#5-faq) +- [引用](#引用) + + + + +## 1. 算法简介 + + +论文信息: + +> [Spatial Dual-Modality Graph Reasoning for Key Information Extraction](https://arxiv.org/abs/2103.14470) +> +> Hongbin Sun and Zhanghui Kuang and Xiaoyu Yue and Chenhao Lin and Wayne Zhang +> +> 2021 + +在wildreceipt发票公开数据集上,算法复现效果如下: + +|模型|骨干网络|配置文件|hmean|下载链接| +| --- | --- | --- | --- | --- | +|SDMGR|VGG6|[configs/kie/sdmgr/kie_unet_sdmgr.yml](../../configs/kie/sdmgr/kie_unet_sdmgr.yml)|86.7%|[训练模型]( https://paddleocr.bj.bcebos.com/dygraph_v2.1/kie/kie_vgg16.tar)/[推理模型(coming soon)]()| + + + + +## 2. 环境配置 +请先参考[《运行环境准备》](./environment.md)配置PaddleOCR运行环境,参考[《项目克隆》](./clone.md)克隆项目代码。 + + + + +## 3. 
模型训练、评估、预测 + +SDMGR是一个关键信息提取算法,将每个检测到的文本区域分类为预定义的类别,如订单ID、发票号码,金额等。 + +训练和测试的数据采用wildreceipt数据集,通过如下指令下载数据集: + +```bash +wget https://paddleocr.bj.bcebos.com/ppstructure/dataset/wildreceipt.tar && tar xf wildreceipt.tar +``` + +创建数据集软链到PaddleOCR/train_data目录下: +``` +cd PaddleOCR/ && mkdir train_data && cd train_data + +ln -s ../../wildreceipt ./ +``` + + +### 3.1 模型训练 + +训练采用的配置文件是`configs/kie/sdmgr/kie_unet_sdmgr.yml`,配置文件中默认训练数据路径是`train_data/wildreceipt`,准备好数据后,可以通过如下指令执行训练: + +``` +python3 tools/train.py -c configs/kie/sdmgr/kie_unet_sdmgr.yml -o Global.save_model_dir=./output/kie/ +``` + +### 3.2 模型评估 + +执行下面的命令进行模型评估 + +```bash +python3 tools/eval.py -c configs/kie/sdmgr/kie_unet_sdmgr.yml -o Global.checkpoints=./output/kie/best_accuracy +``` + +输出信息示例如下所示。 + +```py +[2022/08/10 05:22:23] ppocr INFO: metric eval *************** +[2022/08/10 05:22:23] ppocr INFO: hmean:0.8670120239257812 +[2022/08/10 05:22:23] ppocr INFO: fps:10.18816520530961 +``` + +### 3.3 模型预测 + +执行下面的命令进行模型预测,预测的时候需要预先加载存储图片路径以及OCR信息的文本文件,使用`Global.infer_img`进行指定。 + +```bash +python3 tools/infer_kie.py -c configs/kie/kie_unet_sdmgr.yml -o Global.checkpoints=kie_vgg16/best_accuracy Global.infer_img=./train_data/wildreceipt/1.txt +``` + +执行预测后的结果保存在`./output/sdmgr_kie/predicts_kie.txt`文件中,可视化结果保存在`/output/sdmgr_kie/kie_results/`目录下。 + +可视化结果如下图所示: + +
+ +
+ + +## 4. 推理部署 + + +### 4.1 Python推理 + +暂不支持 + + +### 4.2 C++推理部署 + +暂不支持 + + +### 4.3 Serving服务化部署 + +暂不支持 + + +### 4.4 更多推理部署 + +暂不支持 + + + +## 5. FAQ + +## 引用 + + +```bibtex +@misc{sun2021spatial, + title={Spatial Dual-Modality Graph Reasoning for Key Information Extraction}, + author={Hongbin Sun and Zhanghui Kuang and Xiaoyu Yue and Chenhao Lin and Wayne Zhang}, + year={2021}, + eprint={2103.14470}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` diff --git a/doc/doc_ch/algorithm_kie_vi_layoutxlm.md b/doc/doc_ch/algorithm_kie_vi_layoutxlm.md new file mode 100644 index 0000000000000000000000000000000000000000..f1bb4b1e62736e88594196819dcc41980f1716bf --- /dev/null +++ b/doc/doc_ch/algorithm_kie_vi_layoutxlm.md @@ -0,0 +1,166 @@ +# 关键信息抽取算法-VI-LayoutXLM + +- [1. 算法简介](#1-算法简介) +- [2. 环境配置](#2-环境配置) +- [3. 模型训练、评估、预测](#3-模型训练评估预测) +- [4. 推理部署](#4-推理部署) + - [4.1 Python推理](#41-python推理) + - [4.2 C++推理部署](#42-c推理部署) + - [4.3 Serving服务化部署](#43-serving服务化部署) + - [4.4 更多推理部署](#44-更多推理部署) +- [5. FAQ](#5-faq) +- [引用](#引用) + + + + +## 1. 算法简介 + +VI-LayoutXLM基于LayoutXLM进行改进,在下游任务训练过程中,去除视觉骨干网络模块,最终精度基本无损的情况下,模型推理速度进一步提升。 + +在XFUND_zh数据集上,算法复现效果如下: + +|模型|骨干网络|任务|配置文件|hmean|下载链接| +| --- | --- |---| --- | --- | --- | +|VI-LayoutXLM |VI-LayoutXLM-base | SER |[ser_vi_layoutxlm_xfund_zh_udml.yml](../../configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh_udml.yml)|93.19%|[训练模型](https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_pretrained.tar)/[推理模型](https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_infer.tar)| +|VI-LayoutXLM |VI-LayoutXLM-base |RE | [re_vi_layoutxlm_xfund_zh_udml.yml](../../configs/kie/vi_layoutxlm/re_vi_layoutxlm_xfund_zh_udml.yml)|83.92%|[训练模型](https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/re_vi_layoutxlm_xfund_pretrained.tar)/[推理模型(coming soon)]()| + + + +## 2. 环境配置 +请先参考[《运行环境准备》](./environment.md)配置PaddleOCR运行环境,参考[《项目克隆》](./clone.md)克隆项目代码。 + + + + +## 3. 模型训练、评估、预测 + +请参考[关键信息抽取教程](./kie.md)。PaddleOCR对代码进行了模块化,训练不同的关键信息抽取模型只需要**更换配置文件**即可。 + + + +## 4. 推理部署 + + + +### 4.1 Python推理 + +**注:** 目前RE任务推理过程仍在适配中,下面以SER任务为例,介绍基于VI-LayoutXLM模型的关键信息抽取过程。 + +首先将训练得到的模型转换成inference model。以VI-LayoutXLM模型在XFUND_zh数据集上训练的模型为例([模型下载地址](https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_pretrained.tar)),可以使用下面的命令进行转换。 + +``` bash +wget https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_pretrained.tar +tar -xf ser_vi_layoutxlm_xfund_pretrained.tar +python3 tools/export_model.py -c configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml -o Architecture.Backbone.checkpoints=./ser_vi_layoutxlm_xfund_pretrained/best_accuracy Global.save_inference_dir=./inference/ser_vi_layoutxlm_infer +``` + +VI-LayoutXLM模型基于SER任务进行推理,可以执行如下命令: + +```bash +cd ppstructure +python3 kie/predict_kie_token_ser.py \ + --kie_algorithm=LayoutXLM \ + --ser_model_dir=../inference/ser_vi_layoutxlm_infer \ + --image_dir=./docs/kie/input/zh_val_42.jpg \ + --ser_dict_path=../train_data/XFUND/class_list_xfun.txt \ + --vis_font_path=../doc/fonts/simfang.ttf \ + --ocr_order_method="tb-yx" +``` + +SER可视化结果默认保存到`./output`文件夹里面,结果示例如下: + +
+ +
+ + + +### 4.2 C++推理部署 + +暂不支持 + + +### 4.3 Serving服务化部署 + +暂不支持 + + +### 4.4 更多推理部署 + +暂不支持 + + + +## 5. FAQ + +## 引用 + + +```bibtex +@article{DBLP:journals/corr/abs-2104-08836, + author = {Yiheng Xu and + Tengchao Lv and + Lei Cui and + Guoxin Wang and + Yijuan Lu and + Dinei Flor{\^{e}}ncio and + Cha Zhang and + Furu Wei}, + title = {LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich + Document Understanding}, + journal = {CoRR}, + volume = {abs/2104.08836}, + year = {2021}, + url = {https://arxiv.org/abs/2104.08836}, + eprinttype = {arXiv}, + eprint = {2104.08836}, + timestamp = {Thu, 14 Oct 2021 09:17:23 +0200}, + biburl = {https://dblp.org/rec/journals/corr/abs-2104-08836.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@article{DBLP:journals/corr/abs-1912-13318, + author = {Yiheng Xu and + Minghao Li and + Lei Cui and + Shaohan Huang and + Furu Wei and + Ming Zhou}, + title = {LayoutLM: Pre-training of Text and Layout for Document Image Understanding}, + journal = {CoRR}, + volume = {abs/1912.13318}, + year = {2019}, + url = {http://arxiv.org/abs/1912.13318}, + eprinttype = {arXiv}, + eprint = {1912.13318}, + timestamp = {Mon, 01 Jun 2020 16:20:46 +0200}, + biburl = {https://dblp.org/rec/journals/corr/abs-1912-13318.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@article{DBLP:journals/corr/abs-2012-14740, + author = {Yang Xu and + Yiheng Xu and + Tengchao Lv and + Lei Cui and + Furu Wei and + Guoxin Wang and + Yijuan Lu and + Dinei A. F. Flor{\^{e}}ncio and + Cha Zhang and + Wanxiang Che and + Min Zhang and + Lidong Zhou}, + title = {LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding}, + journal = {CoRR}, + volume = {abs/2012.14740}, + year = {2020}, + url = {https://arxiv.org/abs/2012.14740}, + eprinttype = {arXiv}, + eprint = {2012.14740}, + timestamp = {Tue, 27 Jul 2021 09:53:52 +0200}, + biburl = {https://dblp.org/rec/journals/corr/abs-2012-14740.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} +``` diff --git a/doc/doc_ch/algorithm_overview.md b/doc/doc_ch/algorithm_overview.md index 9d725a86ab8f48051fdb36fe20e94fbe88abc2f6..ecb0e9dfefbfdef2f8cea273c4e3de468aa29415 100755 --- a/doc/doc_ch/algorithm_overview.md +++ b/doc/doc_ch/algorithm_overview.md @@ -1,13 +1,19 @@ -# OCR算法 +# 前沿算法与模型 -- [1. 两阶段算法](#1) +- [1. 两阶段OCR算法](#1) - [1.1 文本检测算法](#11) - [1.2 文本识别算法](#12) -- [2. 端到端算法](#2) +- [2. 端到端OCR算法](#2) - [3. 表格识别算法](#3) +- [4. 
关键信息抽取算法](#4) +本文给出了PaddleOCR已支持的OCR算法列表,以及每个算法在**英文公开数据集**上的模型和指标,主要用于算法简介和算法性能对比,更多包括中文在内的其他数据集上的模型请参考[PP-OCRv3 系列模型下载](./models_list.md)。 + +>> +PaddleOCR将**持续新增**支持OCR领域前沿算法与模型,**欢迎广大开发者合作共建,贡献更多算法,合入有奖🎁!具体可查看[社区常规赛](https://github.com/PaddlePaddle/PaddleOCR/issues/4982)。** +>> +新增算法可参考教程:[使用PaddleOCR架构添加新算法](./add_new_algorithm.md) -本文给出了PaddleOCR已支持的OCR算法列表,以及每个算法在**英文公开数据集**上的模型和指标,主要用于算法简介和算法性能对比,更多包括中文在内的其他数据集上的模型请参考[PP-OCR v2.0 系列模型下载](./models_list.md)。 @@ -18,7 +24,7 @@ ### 1.1 文本检测算法 已支持的文本检测算法列表(戳链接获取使用教程): -- [x] [DB](./algorithm_det_db.md) +- [x] [DB与DB++](./algorithm_det_db.md) - [x] [EAST](./algorithm_det_east.md) - [x] [SAST](./algorithm_det_sast.md) - [x] [PSENet](./algorithm_det_psenet.md) @@ -35,6 +41,7 @@ |SAST|ResNet50_vd|91.39%|83.77%|87.42%|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_sast_icdar15_v2.0_train.tar)| |PSE|ResNet50_vd|85.81%|79.53%|82.55%|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/en_det/det_r50_vd_pse_v2.0_train.tar)| |PSE|MobileNetV3|82.20%|70.48%|75.89%|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/en_det/det_mv3_pse_v2.0_train.tar)| +|DB++|ResNet50|90.89%|82.66%|86.58%|[合成数据预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/en_det/ResNet50_dcn_asf_synthtext_pretrained.pdparams)/[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/en_det/det_r50_db%2B%2B_icdar15_train.tar)| 在Total-text文本检测公开数据集上,算法效果如下: @@ -71,6 +78,7 @@ - [x] [ABINet](./algorithm_rec_abinet.md) - [x] [VisionLAN](./algorithm_rec_visionlan.md) - [x] [SPIN](./algorithm_rec_spin.md) +- [x] [RobustScanner](./algorithm_rec_robustscanner.md) 参考[DTRB](https://arxiv.org/abs/1904.01906)[3]文字识别训练和评估流程,使用MJSynth和SynthText两个文字识别数据集训练,在IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE数据集上进行评估,算法效果如下: @@ -93,6 +101,7 @@ |ABINet|Resnet45| 90.75% | rec_r45_abinet | [训练模型](https://paddleocr.bj.bcebos.com/rec_r45_abinet_train.tar) | |VisionLAN|Resnet45| 90.30% | rec_r45_visionlan | [训练模型](https://paddleocr.bj.bcebos.com/rec_r45_visionlan_train.tar) | |SPIN|ResNet32| 90.00% | rec_r32_gaspin_bilstm_att | coming soon | +|RobustScanner|ResNet31| 87.77% | rec_r31_robustscanner | coming soon | @@ -114,3 +123,34 @@ |模型|骨干网络|配置文件|acc|下载链接| |---|---|---|---|---| |TableMaster|TableResNetExtra|[configs/table/table_master.yml](../../configs/table/table_master.yml)|77.47%|[训练模型](https://paddleocr.bj.bcebos.com/ppstructure/models/tablemaster/table_structure_tablemaster_train.tar) / [推理模型](https://paddleocr.bj.bcebos.com/ppstructure/models/tablemaster/table_structure_tablemaster_infer.tar)| + + + +## 4. 
关键信息抽取算法 + +已支持的关键信息抽取算法列表(戳链接获取使用教程): + +- [x] [VI-LayoutXLM](./algorithm_kie_vi_layoutxlm.md) +- [x] [LayoutLM](./algorithm_kie_layoutxlm.md) +- [x] [LayoutLMv2](./algorithm_kie_layoutxlm.md) +- [x] [LayoutXLM](./algorithm_kie_layoutxlm.md) +- [x] [SDMGR](././algorithm_kie_sdmgr.md) + +在wildreceipt发票公开数据集上,算法复现效果如下: + +|模型|骨干网络|配置文件|hmean|下载链接| +| --- | --- | --- | --- | --- | +|SDMGR|VGG6|[configs/kie/sdmgr/kie_unet_sdmgr.yml](../../configs/kie/sdmgr/kie_unet_sdmgr.yml)|86.7%|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/kie/kie_vgg16.tar)| + + +在XFUND_zh公开数据集上,算法效果如下: + +|模型|骨干网络|任务|配置文件|hmean|下载链接| +| --- | --- | --- | --- | --- | --- | +|VI-LayoutXLM| VI-LayoutXLM-base | SER | [ser_vi_layoutxlm_xfund_zh_udml.yml](../../configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh_udml.yml)|**93.19%**|[训练模型](https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_pretrained.tar)| +|LayoutXLM| LayoutXLM-base | SER | [ser_layoutxlm_xfund_zh.yml](../../configs/kie/layoutlm_series/ser_layoutxlm_xfund_zh.yml)|90.38%|[训练模型](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh.tar)| +|LayoutLM| LayoutLM-base | SER | [ser_layoutlm_xfund_zh.yml](../../configs/kie/layoutlm_series/ser_layoutlm_xfund_zh.yml)|77.31%|[训练模型](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLM_xfun_zh.tar)| +|LayoutLMv2| LayoutLMv2-base | SER | [ser_layoutlmv2_xfund_zh.yml](../../configs/kie/layoutlm_series/ser_layoutlmv2_xfund_zh.yml)|85.44%|[训练模型](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLMv2_xfun_zh.tar)| +|VI-LayoutXLM| VI-LayoutXLM-base | RE | [re_vi_layoutxlm_xfund_zh_udml.yml](../../configs/kie/vi_layoutxlm/re_vi_layoutxlm_xfund_zh_udml.yml)|**83.92%**|[训练模型](https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/re_vi_layoutxlm_xfund_pretrained.tar)| +|LayoutXLM| LayoutXLM-base | RE | [re_layoutxlm_xfund_zh.yml](../../configs/kie/layoutlm_series/re_layoutxlm_xfund_zh.yml)|74.83%|[训练模型](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutXLM_xfun_zh.tar)| +|LayoutLMv2| LayoutLMv2-base | RE | [re_layoutlmv2_xfund_zh.yml](../../configs/kie/layoutlm_series/re_layoutlmv2_xfund_zh.yml)|67.77%|[训练模型](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutLMv2_xfun_zh.tar)| diff --git a/doc/doc_ch/algorithm_rec_robustscanner.md b/doc/doc_ch/algorithm_rec_robustscanner.md new file mode 100644 index 0000000000000000000000000000000000000000..869f9a7c00b617de87ab3c96326e18e536bc18a8 --- /dev/null +++ b/doc/doc_ch/algorithm_rec_robustscanner.md @@ -0,0 +1,113 @@ +# RobustScanner + +- [1. 算法简介](#1) +- [2. 环境配置](#2) +- [3. 模型训练、评估、预测](#3) + - [3.1 训练](#3-1) + - [3.2 评估](#3-2) + - [3.3 预测](#3-3) +- [4. 推理部署](#4) + - [4.1 Python推理](#4-1) + - [4.2 C++推理](#4-2) + - [4.3 Serving服务化部署](#4-3) + - [4.4 更多推理部署](#4-4) +- [5. FAQ](#5) + + +## 1. 算法简介 + +论文信息: +> [RobustScanner: Dynamically Enhancing Positional Clues for Robust Text Recognition](https://arxiv.org/pdf/2007.07542.pdf) +> Xiaoyu Yue, Zhanghui Kuang, Chenhao Lin, Hongbin Sun, Wayne +Zhang +> ECCV, 2020 + +使用MJSynth和SynthText两个合成文字识别数据集训练,在IIIT, SVT, IC13, IC15, SVTP, CUTE数据集上进行评估,算法复现效果如下: + +|模型|骨干网络|配置文件|Acc|下载链接| +| --- | --- | --- | --- | --- | +|RobustScanner|ResNet31|[rec_r31_robustscanner.yml](../../configs/rec/rec_r31_robustscanner.yml)|87.77%|coming soon| + +注:除了使用MJSynth和SynthText两个文字识别数据集外,还加入了[SynthAdd](https://pan.baidu.com/share/init?surl=uV0LtoNmcxbO-0YA7Ch4dg)数据(提取码:627x),和部分真实数据,具体数据细节可以参考论文。 + + +## 2. 环境配置 +请先参考[《运行环境准备》](./environment.md)配置PaddleOCR运行环境,参考[《项目克隆》](./clone.md)克隆项目代码。 + + + +## 3. 
模型训练、评估、预测 + +请参考[文本识别教程](./recognition.md)。PaddleOCR对代码进行了模块化,训练不同的识别模型只需要**更换配置文件**即可。 + +训练 + +具体地,在完成数据准备后,便可以启动训练,训练命令如下: + +``` +#单卡训练(训练周期长,不建议) +python3 tools/train.py -c configs/rec/rec_r31_robustscanner.yml + +#多卡训练,通过--gpus参数指定卡号 +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_r31_robustscanner.yml +``` + +评估 + +``` +# GPU 评估, Global.pretrained_model 为待测权重 +python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_r31_robustscanner.yml -o Global.pretrained_model={path/to/weights}/best_accuracy +``` + +预测: + +``` +# 预测使用的配置文件必须与训练一致 +python3 tools/infer_rec.py -c configs/rec/rec_r31_robustscanner.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.infer_img=doc/imgs_words/en/word_1.png +``` + + +## 4. 推理部署 + + +### 4.1 Python推理 +首先将RobustScanner文本识别训练过程中保存的模型,转换成inference model。可以使用如下命令进行转换: + +``` +python3 tools/export_model.py -c configs/rec/rec_r31_robustscanner.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.save_inference_dir=./inference/rec_r31_robustscanner +``` +RobustScanner文本识别模型推理,可以执行如下命令: + +``` +python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words/en/word_1.png" --rec_model_dir="./inference/rec_r31_robustscanner/" --rec_image_shape="3, 48, 48, 160" --rec_algorithm="RobustScanner" --rec_char_dict_path="ppocr/utils/dict90.txt" --use_space_char=False +``` + + +### 4.2 C++推理 + +由于C++预处理后处理还未支持RobustScanner,所以暂未支持 + + +### 4.3 Serving服务化部署 + +暂不支持 + + +### 4.4 更多推理部署 + +暂不支持 + + +## 5. FAQ + + +## 引用 + +```bibtex +@article{2020RobustScanner, + title={RobustScanner: Dynamically Enhancing Positional Clues for Robust Text Recognition}, + author={Xiaoyu Yue and Zhanghui Kuang and Chenhao Lin and Hongbin Sun and Wayne Zhang}, + journal={ECCV2020}, + year={2020}, +} +``` diff --git a/doc/doc_ch/algorithm_rec_sar.md b/doc/doc_ch/algorithm_rec_sar.md index b8304313994754480a89d708e39149d67f828c0d..cfb1de25390bda8c6ba4be1db9101269873e8b5b 100644 --- a/doc/doc_ch/algorithm_rec_sar.md +++ b/doc/doc_ch/algorithm_rec_sar.md @@ -79,7 +79,7 @@ python3 tools/export_model.py -c configs/rec/rec_r31_sar.yml -o Global.pretraine SAR文本识别模型推理,可以执行如下命令: ``` -python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words/en/word_1.png" --rec_model_dir="./inference/rec_sar/" --rec_image_shape="3, 48, 48, 160" --rec_char_type="ch" --rec_algorithm="SAR" --rec_char_dict_path="ppocr/utils/dict90.txt" --max_text_length=30 --use_space_char=False +python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words/en/word_1.png" --rec_model_dir="./inference/rec_sar/" --rec_image_shape="3, 48, 48, 160" --rec_algorithm="SAR" --rec_char_dict_path="ppocr/utils/dict90.txt" --max_text_length=30 --use_space_char=False ``` diff --git a/doc/doc_ch/algorithm_rec_srn.md b/doc/doc_ch/algorithm_rec_srn.md index ca7961359eb902fafee959b26d02f324aece233a..dd61a388c7024fabdadec1c120bd3341ed0197cc 100644 --- a/doc/doc_ch/algorithm_rec_srn.md +++ b/doc/doc_ch/algorithm_rec_srn.md @@ -78,7 +78,7 @@ python3 tools/export_model.py -c configs/rec/rec_r50_fpn_srn.yml -o Global.pretr SRN文本识别模型推理,可以执行如下命令: ``` -python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words/en/word_1.png" --rec_model_dir="./inference/rec_srn/" --rec_image_shape="1,64,256" --rec_char_type="ch" --rec_algorithm="SRN" --rec_char_dict_path=./ppocr/utils/ic15_dict.txt --use_space_char=False +python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words/en/word_1.png" --rec_model_dir="./inference/rec_srn/" --rec_image_shape="1,64,256" 
--rec_algorithm="SRN" --rec_char_dict_path=./ppocr/utils/ic15_dict.txt --use_space_char=False ``` diff --git a/doc/doc_ch/algorithm_rec_visionlan.md b/doc/doc_ch/algorithm_rec_visionlan.md index 0c4fe86e58831f4f5480483f5c21ff1da4176d2b..df039491d49e192349d57b44cc448c57e4211098 100644 --- a/doc/doc_ch/algorithm_rec_visionlan.md +++ b/doc/doc_ch/algorithm_rec_visionlan.md @@ -101,7 +101,7 @@ python3 tools/export_model.py -c configs/rec/rec_r45_visionlan.yml -o Global.pre 执行如下命令进行模型推理: ```shell -python3 tools/infer/predict_rec.py --image_dir='./doc/imgs_words/en/word_2.png' --rec_model_dir='./inference/rec_r45_visionlan/' --rec_algorithm='VisionLAN' --rec_image_shape='3,64,256' --rec_char_dict_path='./ppocr/utils/dict36.txt' +python3 tools/infer/predict_rec.py --image_dir='./doc/imgs_words/en/word_2.png' --rec_model_dir='./inference/rec_r45_visionlan/' --rec_algorithm='VisionLAN' --rec_image_shape='3,64,256' --rec_char_dict_path='./ppocr/utils/ic15_dict.txt' --use_space_char=False # 预测文件夹下所有图像时,可修改image_dir为文件夹,如 --image_dir='./doc/imgs_words_en/'。 ``` @@ -110,7 +110,7 @@ python3 tools/infer/predict_rec.py --image_dir='./doc/imgs_words/en/word_2.png' 执行命令后,上面图像的预测结果(识别的文本和得分)会打印到屏幕上,示例如下: 结果如下: ```shell -Predicts of ./doc/imgs_words/en/word_2.png:('yourself', 0.97076982) +Predicts of ./doc/imgs_words/en/word_2.png:('yourself', 0.9999493) ``` **注意**: diff --git a/doc/doc_ch/algorithm_sr_gestalt.md b/doc/doc_ch/algorithm_sr_gestalt.md new file mode 100644 index 0000000000000000000000000000000000000000..aac82b1b62b10d070b7b67702198f462219acb6c --- /dev/null +++ b/doc/doc_ch/algorithm_sr_gestalt.md @@ -0,0 +1,127 @@ +# Text Gestalt + +- [1. 算法简介](#1) +- [2. 环境配置](#2) +- [3. 模型训练、评估、预测](#3) + - [3.1 训练](#3-1) + - [3.2 评估](#3-2) + - [3.3 预测](#3-3) +- [4. 推理部署](#4) + - [4.1 Python推理](#4-1) + - [4.2 C++推理](#4-2) + - [4.3 Serving服务化部署](#4-3) + - [4.4 更多推理部署](#4-4) +- [5. FAQ](#5) + + +## 1. 算法简介 + +论文信息: +> [Text Gestalt: Stroke-Aware Scene Text Image Super-Resolution](https://arxiv.org/pdf/2112.08171.pdf) + +> Chen, Jingye and Yu, Haiyang and Ma, Jianqi and Li, Bin and Xue, Xiangyang + +> AAAI, 2022 + +参考[FudanOCR](https://github.com/FudanVI/FudanOCR/tree/main/text-gestalt) 数据下载说明,在TextZoom测试集合上超分算法效果如下: + +|模型|骨干网络|PSNR_Avg|SSIM_Avg|配置文件|下载链接| +|---|---|---|---|---|---| +|Text Gestalt|tsrn|19.28|0.6560| [configs/sr/sr_tsrn_transformer_strock.yml](../../configs/sr/sr_tsrn_transformer_strock.yml)|[训练模型](https://paddleocr.bj.bcebos.com/sr_tsrn_transformer_strock_train.tar)| + + + +## 2. 环境配置 +请先参考[《运行环境准备》](./environment.md)配置PaddleOCR运行环境,参考[《项目克隆》](./clone.md)克隆项目代码。 + + + +## 3. 模型训练、评估、预测 + +请参考[文本识别训练教程](./recognition.md)。PaddleOCR对代码进行了模块化,训练不同的识别模型只需要**更换配置文件**即可。 + +- 训练 + +在完成数据准备后,便可以启动训练,训练命令如下: + +``` +#单卡训练(训练周期长,不建议) +python3 tools/train.py -c configs/sr/sr_tsrn_transformer_strock.yml + +#多卡训练,通过--gpus参数指定卡号 +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/sr/sr_tsrn_transformer_strock.yml + +``` + +- 评估 + +``` +# GPU 评估, Global.pretrained_model 为待测权重 +python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/sr/sr_tsrn_transformer_strock.yml -o Global.pretrained_model={path/to/weights}/best_accuracy +``` + +- 预测: + +``` +# 预测使用的配置文件必须与训练一致 +python3 tools/infer_sr.py -c configs/sr/sr_tsrn_transformer_strock.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.infer_img=doc/imgs_words_en/word_52.png +``` + +![](../imgs_words_en/word_52.png) + +执行命令后,上面图像的超分结果如下: + +![](../imgs_results/sr_word_52.png) + + +## 4. 
推理部署 + + +### 4.1 Python推理 + +首先将文本超分训练过程中保存的模型,转换成inference model。以 Text-Gestalt 训练的[模型](https://paddleocr.bj.bcebos.com/sr_tsrn_transformer_strock_train.tar) 为例,可以使用如下命令进行转换: +```shell +python3 tools/export_model.py -c configs/sr/sr_tsrn_transformer_strock.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.save_inference_dir=./inference/sr_out +``` +Text-Gestalt 文本超分模型推理,可以执行如下命令: +``` +python3 tools/infer/predict_sr.py --sr_model_dir=./inference/sr_out --image_dir=doc/imgs_words_en/word_52.png --sr_image_shape=3,32,128 + +``` + +执行命令后,图像的超分结果如下: + +![](../imgs_results/sr_word_52.png) + + +### 4.2 C++推理 + +暂未支持 + + +### 4.3 Serving服务化部署 + +暂未支持 + + +### 4.4 更多推理部署 + +暂未支持 + + +## 5. FAQ + + +## 引用 + +```bibtex +@inproceedings{chen2022text, + title={Text gestalt: Stroke-aware scene text image super-resolution}, + author={Chen, Jingye and Yu, Haiyang and Ma, Jianqi and Li, Bin and Xue, Xiangyang}, + booktitle={Proceedings of the AAAI Conference on Artificial Intelligence}, + volume={36}, + number={1}, + pages={285--293}, + year={2022} +} +``` diff --git a/doc/doc_ch/dataset/docvqa_datasets.md b/doc/doc_ch/dataset/kie_datasets.md similarity index 56% rename from doc/doc_ch/dataset/docvqa_datasets.md rename to doc/doc_ch/dataset/kie_datasets.md index 3ec1865ee42be99ec19343428cd9ad6439686f15..be5624dbf257150745a79db25f0367ccee339559 100644 --- a/doc/doc_ch/dataset/docvqa_datasets.md +++ b/doc/doc_ch/dataset/kie_datasets.md @@ -1,10 +1,15 @@ -## DocVQA数据集 -这里整理了常见的DocVQA数据集,持续更新中,欢迎各位小伙伴贡献数据集~ +# 关键信息抽取数据集 + +这里整理了常见的关键信息抽取数据集,持续更新中,欢迎各位小伙伴贡献数据集~ + - [FUNSD数据集](#funsd) - [XFUND数据集](#xfund) +- [wildreceipt数据集](#wildreceipt) -#### 1、FUNSD数据集 + +## 1. FUNSD数据集 + - **数据来源**:https://guillaumejaume.github.io/FUNSD/ - **数据简介**:FUNSD数据集是一个用于表单理解的数据集,它包含199张真实的、完全标注的扫描版图片,类型包括市场报告、广告以及学术报告等,并分为149张训练集以及50张测试集。FUNSD数据集适用于多种类型的DocVQA任务,如字段级实体分类、字段级实体连接等。部分图像以及标注框可视化如下所示:
@@ -16,12 +21,33 @@ - **下载地址**:https://guillaumejaume.github.io/FUNSD/download/ -#### 2、XFUND数据集 + +## 2. XFUND数据集 - **数据来源**:https://github.com/doc-analysis/XFUND - **数据简介**:XFUND是一个多语种表单理解数据集,它包含7种不同语种的表单数据,并且全部用人工进行了键-值对形式的标注。其中每个语种的数据都包含了199张表单数据,并分为149张训练集以及50张测试集。部分图像以及标注框可视化如下所示: +
- **下载地址**:https://github.com/doc-analysis/XFUND/releases/tag/v1.0 + + + + +## 3. wildreceipt数据集 + +- **数据来源**:https://arxiv.org/abs/2103.14470 +- **数据简介**:wildreceipt数据集是英文发票数据集,包含26个类别(此处类别体系包含`Ignore`类别),共标注了50000个文本框。其中训练集包含1267张图片,测试集包含472张图片。部分图像以及标注框可视化如下所示: + +
+ + +
+ +**注:** 这里对于类别为`Ignore`或者`Others`的文本,没有进行可视化。 + +- **下载地址**: + - 原始数据下载地址:[链接](https://download.openmmlab.com/mmocr/data/wildreceipt.tar) + - 数据格式转换后适配于PaddleOCR训练的数据下载地址:[链接](https://paddleocr.bj.bcebos.com/ppstructure/dataset/wildreceipt.tar) diff --git a/doc/doc_ch/dataset/table_datasets.md b/doc/doc_ch/dataset/table_datasets.md index ae902b23ccf985d522386b7454c7f76a74917502..58f4cf470542ff7ef20f518efb8b6942a3caa2f0 100644 --- a/doc/doc_ch/dataset/table_datasets.md +++ b/doc/doc_ch/dataset/table_datasets.md @@ -3,6 +3,7 @@ - [数据集汇总](#数据集汇总) - [1. PubTabNet数据集](#1-pubtabnet数据集) - [2. 好未来表格识别竞赛数据集](#2-好未来表格识别竞赛数据集) +- [3. WTW中文场景表格数据集](#3-wtw中文场景表格数据集) 这里整理了常用表格识别数据集,持续更新中,欢迎各位小伙伴贡献数据集~ @@ -12,6 +13,7 @@ |---|---|---| | PubTabNet |https://github.com/ibm-aur-nlp/PubTabNet| jsonl格式,可直接用[pubtab_dataset.py](../../../ppocr/data/pubtab_dataset.py)加载 | | 好未来表格识别竞赛数据集 |https://ai.100tal.com/dataset| jsonl格式,可直接用[pubtab_dataset.py](../../../ppocr/data/pubtab_dataset.py)加载 | +| WTW中文场景表格数据集 |https://github.com/wangwen-whu/WTW-Dataset| 需要进行转换后才能用[pubtab_dataset.py](../../../ppocr/data/pubtab_dataset.py)加载 | ## 1. PubTabNet数据集 - **数据简介**:PubTabNet数据集的训练集合中包含50万张图像,验证集合中包含0.9万张图像。部分图像可视化如下所示。 @@ -31,3 +33,12 @@
+ +## 3. WTW中文场景表格数据集 +- **数据简介**:WTW中文场景表格数据集包含表格检测和表格数据两部分数据,数据集中同时包含扫描和拍照两种场景的图像。 + +https://github.com/wangwen-whu/WTW-Dataset/blob/main/demo/20210816_210413.gif +
+ +
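上表中提到 PubTabNet、好未来等数据集采用 jsonl 格式标注(每行一条 JSON 记录)。下面给出一个读取这类标注文件的最小 Python 示意,仅供参考:其中文件路径为假设值,字段组织按 PaddleOCR 表格识别文档中描述的 PubTabNet 标注格式(`filename`、`html.structure.tokens`、`html.cell`),实际字段名请以所用数据集为准。

```python
import json

# 假设的标注文件路径,请替换为实际下载得到的 jsonl 标注文件
label_file = "train_data/pubtabnet/train.jsonl"

with open(label_file, "r", encoding="utf-8") as f:
    for line in f:
        info = json.loads(line.strip())
        # 每行为一条样本:图像名、表格 HTML 结构 token、各单元格文本与坐标
        structure_tokens = info["html"]["structure"]["tokens"]
        # 部分数据集使用 "cell" 字段,原始 PubTabNet 使用 "cells",这里做兼容处理
        cells = info["html"].get("cell", info["html"].get("cells", []))
        print(info["filename"],
              "结构token数:", len(structure_tokens),
              "单元格数:", len(cells))
```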
diff --git a/doc/doc_ch/inference_args.md b/doc/doc_ch/inference_args.md index fa188ab7c800eaabae8a4ff54413af162dd60e43..36efc6fbf7a6ec62bc700964dc13261fecdb9bd5 100644 --- a/doc/doc_ch/inference_args.md +++ b/doc/doc_ch/inference_args.md @@ -15,7 +15,7 @@ | save_crop_res | bool | False | 是否保存OCR的识别文本图像 | | crop_res_save_dir | str | "./output" | 保存OCR识别出来的文本图像路径 | | use_mp | bool | False | 是否开启多进程预测 | -| total_process_num | int | 6 | 开启的进城数,`use_mp`为`True`时生效 | +| total_process_num | int | 6 | 开启的进程数,`use_mp`为`True`时生效 | | process_id | int | 0 | 当前进程的id号,无需自己修改 | | benchmark | bool | False | 是否开启benchmark,对预测速度、显存占用等进行统计 | | save_log_path | str | "./log_output/" | 开启`benchmark`时,日志结果的保存文件夹 | @@ -39,10 +39,10 @@ | 参数名称 | 类型 | 默认值 | 含义 | | :--: | :--: | :--: | :--: | -| det_algorithm | str | "DB" | 文本检测算法名称,目前支持`DB`, `EAST`, `SAST`, `PSE` | +| det_algorithm | str | "DB" | 文本检测算法名称,目前支持`DB`, `EAST`, `SAST`, `PSE`, `DB++`, `FCE` | | det_model_dir | str | xx | 检测inference模型路径 | | det_limit_side_len | int | 960 | 检测的图像边长限制 | -| det_limit_type | str | "max" | 检测的变成限制类型,目前支持`min`, `max`,`min`表示保证图像最短边不小于`det_limit_side_len`,`max`表示保证图像最长边不大于`det_limit_side_len` | +| det_limit_type | str | "max" | 检测的边长限制类型,目前支持`min`和`max`,`min`表示保证图像最短边不小于`det_limit_side_len`,`max`表示保证图像最长边不大于`det_limit_side_len` | 其中,DB算法相关参数如下 @@ -85,9 +85,9 @@ PSE算法相关参数如下 | 参数名称 | 类型 | 默认值 | 含义 | | :--: | :--: | :--: | :--: | -| rec_algorithm | str | "CRNN" | 文本识别算法名称,目前支持`CRNN`, `SRN`, `RARE`, `NETR`, `SAR` | +| rec_algorithm | str | "CRNN" | 文本识别算法名称,目前支持`CRNN`, `SRN`, `RARE`, `NETR`, `SAR`, `ViTSTR`, `ABINet`, `VisionLAN`, `SPIN`, `RobustScanner`, `SVTR`, `SVTR_LCNet` | | rec_model_dir | str | 无,如果使用识别模型,该项是必填项 | 识别inference模型路径 | -| rec_image_shape | list | [3, 32, 320] | 识别时的图像尺寸, | +| rec_image_shape | list | [3, 48, 320] | 识别时的图像尺寸 | | rec_batch_num | int | 6 | 识别的batch size | | max_text_length | int | 25 | 识别结果最大长度,在`SRN`中有效 | | rec_char_dict_path | str | "./ppocr/utils/ppocr_keys_v1.txt" | 识别的字符字典文件 | diff --git a/doc/doc_ch/inference_ppocr.md b/doc/doc_ch/inference_ppocr.md index 622ac995d37ce290ee51af06164b0c2aef8b5a14..514f905393984e2189b4c9c920ca4aeb91ac6da1 100644 --- a/doc/doc_ch/inference_ppocr.md +++ b/doc/doc_ch/inference_ppocr.md @@ -158,3 +158,5 @@ python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --de 执行命令后,识别结果图像如下: ![](../imgs_results/system_res_00018069_v3.jpg) + +更多关于推理超参数的配置与解释,请参考:[模型推理超参数解释教程](./inference_args.md)。 diff --git a/doc/doc_ch/kie.md b/doc/doc_ch/kie.md new file mode 100644 index 0000000000000000000000000000000000000000..b6f38a662fd98597011c5a51ff29c417d880ca17 --- /dev/null +++ b/doc/doc_ch/kie.md @@ -0,0 +1,469 @@ +# 关键信息抽取 + +本文提供了PaddleOCR关键信息抽取的全流程指南,包括语义实体识别 (Semantic Entity Recognition) 以及关系抽取 (Relation Extraction, RE) 任务的数据准备、模型训练、调优、评估、预测,各个阶段的详细说明。 + +- [1. 数据准备](#1-数据准备) + - [1.1. 准备数据集](#11-准备数据集) + - [1.2. 自定义数据集](#12-自定义数据集) + - [1.3. 数据下载](#13-数据下载) +- [2. 开始训练](#2-开始训练) + - [2.1. 启动训练](#21-启动训练) + - [2.2. 断点训练](#22-断点训练) + - [2.3. 混合精度训练](#24-混合精度训练) + - [2.4. 分布式训练](#25-分布式训练) + - [2.5. 知识蒸馏训练](#26-知识蒸馏训练) + - [2.6. 其他训练环境](#27-其他训练环境) +- [3. 模型评估与预测](#3-模型评估与预测) + - [3.1. 指标评估](#31-指标评估) + - [3.2. 测试信息抽取效果](#32-测试识别效果) +- [4. 模型导出与预测](#4-模型导出与预测) +- [5. FAQ](#5-faq) + +# 1. 数据准备 + +## 1.1. 
准备数据集 + +在训练信息抽取相关模型时,PaddleOCR支持以下数据格式。 + - `通用数据` 用于训练以文本文件存储的数据集(SimpleDataSet); + +训练数据的默认存储路径是 `PaddleOCR/train_data`,如果您的磁盘上已有数据集,只需创建软链接至数据集目录: + +``` +# linux and mac os +ln -sf /train_data/dataset +# windows +mklink /d /train_data/dataset +``` + +## 1.2. 自定义数据集 + +训练过程中一般包含训练集与验证集,二者数据格式相同,下面介绍如何自定义数据集。 + +**(1)训练集** + +建议将训练图片放入同一个文件夹,并用一个文本文件记录图片路径和标签,文本文件里的内容如下: + +```py +" 图像文件名 图像标注信息 " +zh_train_0.jpg [{"transcription": "汇丰晋信", "label": "other", "points": [[104, 114], [530, 114], [530, 175], [104, 175]], "id": 1, "linking": []}, {"transcription": "受理时间:", "label": "question", "points": [[126, 267], [266, 267], [266, 305], [126, 305]], "id": 7, "linking": [[7, 13]]}, {"transcription": "2020.6.15", "label": "answer", "points": [[321, 239], [537, 239], [537, 285], [321, 285]], "id": 13, "linking": [[7, 13]]}] +zh_train_1.jpg [{"transcription": "中国人体器官捐献", "label": "other", "points": [[544, 459], [954, 459], [954, 517], [544, 517]], "id": 1, "linking": []}, {"transcription": ">编号:MC545715483585", "label": "other", "points": [[1462, 470], [2054, 470], [2054, 543], [1462, 543]], "id": 10, "linking": []}, {"transcription": "CHINAORGANDONATION", "label": "other", "points": [[543, 516], [958, 516], [958, 551], [543, 551]], "id": 14, "linking": []}, {"transcription": "中国人体器官捐献志愿登记表", "label": "header", "points": [[635, 793], [1892, 793], [1892, 904], [635, 904]], "id": 18, "linking": []}] +... +``` + +**注意:** 文本文件中默认请将图片路径和图片标签用 `\t` 分割,如用其他方式分割将造成训练报错。 + +其中图像标注信息字符串经过json解析之后可以得到一个列表信息,列表中每个元素是一个字典,存储了每个文本行的需要信息,各个字段的含义如下。 + +- transcription: 存储了文本行的文字内容 +- label: 该文本行内容所属的类别 +- points: 存储文本行的四点位置信息 +- id: 存储文本行的id信息,用于RE任务的训练 +- linking: 存储文本行的之间的连接信息,用于RE任务的训练 + +**(2)验证集** + +验证集构建方式与训练集相同。 + +**(3)字典文件** + +训练集与验证集中的文本行包含标签信息,所有标签的列表存在字典文件中(如`class_list.txt`),字典文件中的每一行表示为一个类别名称。 + +以XFUND_zh数据为例,共包含4个类别,字典文件内容如下所示。 + +``` +OTHER +QUESTION +ANSWER +HEADER +``` + +在标注文件中,每个标注的文本行内容的`label`字段标注信息需要属于字典内容。 + +最终数据集应有如下文件结构: + +``` +|-train_data + |-data_name + |- train.json + |- train + |- zh_train_0.png + |- zh_train_1.jpg + | ... + |- val.json + |- val + |- zh_val_0.png + |- zh_val_1.jpg + | ... +``` + +**注:** + +- 标注文件中的类别信息不区分大小写,如`HEADER`与`header`会被解析为相同的类别id,因此在标注的时候,不能使用小写处理后相同的字符串表示不同的类别。 +- 在整理标注文件的时候,建议将other这个类别(其他,无需关注的文本行可以标注为other)放在第一行,在解析的时候,会将`other`类别的类别id解析为0,后续不会对该类进行可视化。 + +## 1.3. 数据下载 + +如果你没有本地数据集,可以从[XFUND](https://github.com/doc-analysis/XFUND)或者[FUNSD](https://guillaumejaume.github.io/FUNSD/)官网下载数据,然后使用XFUND与FUNSD的处理脚本([XFUND](../../ppstructure/kie/tools/trans_xfun_data.py), [FUNSD](../../ppstructure/kie/tools/trans_funsd_label.py)),生成用于PaddleOCR训练的数据格式,并使用公开数据集快速体验关键信息抽取的流程。 + +更多关于公开数据集的介绍,请参考[关键信息抽取数据集说明文档](./dataset/kie_datasets.md)。 + +PaddleOCR也支持了关键信息抽取模型的标注,具体使用方法请参考:[PPOCRLabel使用文档](../../PPOCRLabel/README_ch.md)。 + + +# 2. 开始训练 + +PaddleOCR提供了训练脚本、评估脚本和预测脚本,本节将以 VI-LayoutXLM 多模态预训练模型为例进行讲解。 + +> 如果希望使用基于SDMGR的关键信息抽取算法,请参考:[SDMGR使用](./algorithm_kie_sdmgr.md)。 + +## 2.1. 启动训练 + +如果你没有使用自定义数据集,可以使用PaddleOCR中已经处理好的XFUND_zh数据集进行快速体验。 + +```bash +mkdir train_data +cd train_data +wget https://paddleocr.bj.bcebos.com/ppstructure/dataset/XFUND.tar && tar -xf XFUND.tar +cd .. 
+``` + +如果不希望训练,直接体验后面的模型评估、预测、动转静、推理的流程,可以下载PaddleOCR中提供的预训练模型,并跳过2.1部分。 + +使用下面的方法,下载基于XFUND数据的SER与RE任务预训练模型。 + +```bash +mkdir pretrained_model +cd pretrained_model +# 下载并解压SER预训练模型 +wget https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_pretrained.tar & tar -xf ser_vi_layoutxlm_xfund_pretrained.tar + +# 下载并解压RE预训练模型 +wget https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/re_vi_layoutxlm_xfund_pretrained.tar & tar -xf re_vi_layoutxlm_xfund_pretrained.tar +``` + +开始训练: + +- 如果您安装的是cpu版本,请将配置文件中的 `use_gpu` 字段修改为false +- PaddleOCR在训练时,会默认下载VI-LayoutXLM预训练模型,这里无需预先下载。 + +```bash +# GPU训练 支持单卡,多卡训练 +# 训练日志会自动保存到 配置文件中"{Global.save_model_dir}" 下的train.log文件中 + +# SER单卡训练 +python3 tools/train.py -c configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml + +# SER多卡训练,通过--gpus参数指定卡号 +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml + +# RE任务单卡训练 +python3 tools/train.py -c configs/kie/vi_layoutxlm/re_vi_layoutxlm_xfund_zh.yml +``` + +以SER任务为例,正常启动训练后,会看到以下log输出: + +``` +[2022/08/08 16:28:28] ppocr INFO: epoch: [1/200], global_step: 10, lr: 0.000006, loss: 1.871535, avg_reader_cost: 0.28200 s, avg_batch_cost: 0.82318 s, avg_samples: 8.0, ips: 9.71838 samples/s, eta: 0:51:59 +[2022/08/08 16:28:33] ppocr INFO: epoch: [1/200], global_step: 19, lr: 0.000018, loss: 1.461939, avg_reader_cost: 0.00042 s, avg_batch_cost: 0.32037 s, avg_samples: 6.9, ips: 21.53773 samples/s, eta: 0:37:55 +[2022/08/08 16:28:39] ppocr INFO: cur metric, precision: 0.11526348939743859, recall: 0.19776657060518732, hmean: 0.14564265817747712, fps: 34.008392345050055 +[2022/08/08 16:28:45] ppocr INFO: save best model is to ./output/ser_vi_layoutxlm_xfund_zh/best_accuracy +[2022/08/08 16:28:45] ppocr INFO: best metric, hmean: 0.14564265817747712, precision: 0.11526348939743859, recall: 0.19776657060518732, fps: 34.008392345050055, best_epoch: 1 +[2022/08/08 16:28:51] ppocr INFO: save model in ./output/ser_vi_layoutxlm_xfund_zh/latest +``` + +log 中自动打印如下信息: + +| 字段 | 含义 | +| :----: | :------: | +| epoch | 当前迭代轮次 | +| iter | 当前迭代次数 | +| lr | 当前学习率 | +| loss | 当前损失函数 | +| reader_cost | 当前 batch 数据处理耗时 | +| batch_cost | 当前 batch 总耗时 | +| samples | 当前 batch 内的样本数 | +| ips | 每秒处理图片的数量 | + + +PaddleOCR支持训练和评估交替进行, 可以在 `configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml` 中修改 `eval_batch_step` 设置评估频率,默认每19个iter评估一次。评估过程中默认将最佳hmean模型,保存为 `output/ser_vi_layoutxlm_xfund_zh/best_accuracy/` 。 + +如果验证集很大,测试将会比较耗时,建议减少评估次数,或训练完再进行评估。 + +**提示:** 可通过 -c 参数选择 `configs/kie/` 路径下的多种模型配置进行训练,PaddleOCR支持的信息抽取算法可以参考[前沿算法列表](./algorithm_overview.md)。 + + +如果你希望训练自己的数据集,需要修改配置文件中的数据配置、字典文件以及类别数。 + + +以 `configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml` 为例,修改的内容如下所示。 + +```yaml + +Architecture: + # ... + Backbone: + name: LayoutXLMForSer + pretrained: True + mode: vi + # 假设字典中包含n个字段(包含other),由于采用BIO标注,则类别数为2n-1 + num_classes: &num_classes 7 + +PostProcess: + name: kieSerTokenLayoutLMPostProcess + # 修改字典文件的路径为你自定义的数据集的字典路径 + class_path: &class_path train_data/XFUND/class_list_xfun.txt + +Train: + dataset: + name: SimpleDataSet + # 修改为你自己的训练数据目录 + data_dir: train_data/XFUND/zh_train/image + # 修改为你自己的训练数据标签文件 + label_file_list: + - train_data/XFUND/zh_train/train.json + ... + loader: + # 训练时的单卡batch_size + batch_size_per_card: 8 + ... + +Eval: + dataset: + name: SimpleDataSet + # 修改为你自己的验证数据目录 + data_dir: train_data/XFUND/zh_val/image + # 修改为你自己的验证数据标签文件 + label_file_list: + - train_data/XFUND/zh_val/val.json + ... 
+ loader: + # 验证时的单卡batch_size + batch_size_per_card: 8 +``` + +**注意,预测/评估时的配置文件请务必与训练一致。** + +## 2.2. 断点训练 + +如果训练程序中断,如果希望加载训练中断的模型从而恢复训练,可以通过指定` Architecture.Backbone.checkpoints`指定要加载的模型路径: + +```bash +python3 tools/train.py -c configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml -o Architecture.Backbone.checkpoints=./output/ser_vi_layoutxlm_xfund_zh/best_accuracy +``` + +**注意**: + +- `Architecture.Backbone.checkpoints`的优先级高于`Architecture.Backbone.pretrained`,需要加载之前训练好的训练模型进行模型微调、恢复训练、模型评估时,需要使用`Architecture.Backbone.checkpoints`指定模型参数路径;如果需要使用默认提供的通用预训练模型进行训练,则需要指定`Architecture.Backbone.pretrained`为`True`,同时指定`Architecture.Backbone.checkpoints`为空(`null`)。 +- LayoutXLM系列模型均是调用了PaddleNLP中的预训练模型,模型加载与保存的逻辑与PaddleNLP基本一致,因此在这里不需要指定`Global.pretrained_model`或者`Global.checkpoints`参数;此外,LayoutXLM系列模型的蒸馏训练目前不支持断点训练。 + + +## 2.3. 混合精度训练 + +coming soon! + +## 2.4. 分布式训练 + +多机多卡训练时,通过 `--ips` 参数设置使用的机器IP地址,通过 `--gpus` 参数设置使用的GPU ID: + +```bash +python3 -m paddle.distributed.launch --ips="xx.xx.xx.xx,xx.xx.xx.xx" --gpus '0,1,2,3' tools/train.py -c configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml +``` + +**注意:** (1)采用多机多卡训练时,需要替换上面命令中的ips值为您机器的地址,机器之间需要能够相互ping通;(2)训练时需要在多个机器上分别启动命令。查看机器ip地址的命令为`ifconfig`;(3)更多关于分布式训练的性能优势等信息,请参考:[分布式训练教程](./distributed_training.md)。 + +## 2.5. 知识蒸馏训练 + +PaddleOCR支持了基于U-DML知识蒸馏的关键信息抽取模型训练过程,配置文件请参考:[ser_vi_layoutxlm_xfund_zh_udml.yml](../../configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh_udml.yml),更多关于知识蒸馏的说明文档请参考:[知识蒸馏说明文档](./knowledge_distillation.md)。 + +**注意**: PaddleOCR中LayoutXLM系列关键信息抽取模型的保存与加载逻辑与PaddleNLP保持一致,因此在蒸馏的过程中仅保存了学生模型的参数,如果希望使用保存的模型进行评估,需要使用学生模型的配置(上面的蒸馏文件对应的学生模型为[ser_vi_layoutxlm_xfund_zh.yml](../../configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml)) + + +## 2.6. 其他训练环境 + +- Windows GPU/CPU +在Windows平台上与Linux平台略有不同: +Windows平台只支持`单卡`的训练与预测,指定GPU进行训练`set CUDA_VISIBLE_DEVICES=0` +在Windows平台,DataLoader只支持单进程模式,因此需要设置 `num_workers` 为0; + +- macOS +不支持GPU模式,需要在配置文件中设置`use_gpu`为False,其余训练评估预测命令与Linux GPU完全相同。 + +- Linux DCU +DCU设备上运行需要设置环境变量 `export HIP_VISIBLE_DEVICES=0,1,2,3`,其余训练评估预测命令与Linux GPU完全相同。 + + +# 3. 模型评估与预测 + +## 3.1. 指标评估 + +训练中模型参数默认保存在`Global.save_model_dir`目录下。在评估指标时,需要设置`Architecture.Backbone.checkpoints`指向保存的参数文件。评估数据集可以通过 `configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml` 修改Eval中的 `label_file_path` 设置。 + +```bash +# GPU 评估, Global.checkpoints 为待测权重 +python3 tools/eval.py -c configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml -o Architecture.Backbone.checkpoints=./output/ser_vi_layoutxlm_xfund_zh/best_accuracy +``` + +会输出以下信息,打印出precision、recall、hmean等信息。 + +```py +[2022/08/09 07:59:28] ppocr INFO: metric eval *************** +[2022/08/09 07:59:28] ppocr INFO: precision:0.697476609016161 +[2022/08/09 07:59:28] ppocr INFO: recall:0.8861671469740634 +[2022/08/09 07:59:28] ppocr INFO: hmean:0.7805806758686339 +[2022/08/09 07:59:28] ppocr INFO: fps:17.367364606899105 +``` + + +## 3.2. 
测试信息抽取结果 + +使用 PaddleOCR 训练好的模型,可以通过以下脚本进行快速预测。 + +默认预测的图片存储在 `infer_img` 里,通过 `-o Architecture.Backbone.checkpoints` 加载训练好的参数文件: + +根据配置文件中设置的 `save_model_dir` 和 `save_epoch_step` 字段,会有以下几种参数被保存下来: + +``` +output/ser_vi_layoutxlm_xfund_zh/ +├── best_accuracy + ├── metric.states + ├── model_config.json + ├── model_state.pdparams +├── best_accuracy.pdopt +├── config.yml +├── train.log +├── latest + ├── metric.states + ├── model_config.json + ├── model_state.pdparams +├── latest.pdopt +``` + +其中 best_accuracy.* 是评估集上的最优模型;latest.* 是最新保存的一个模型。 + + +预测使用的配置文件必须与训练一致,如您通过 `python3 tools/train.py -c configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml` 完成了模型的训练过程。 + +您可以使用如下命令进行中文模型预测。 + + +```bash +python3 tools/infer_kie_token_ser.py -c configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml -o Architecture.Backbone.checkpoints=./output/ser_vi_layoutxlm_xfund_zh/best_accuracy Global.infer_img=./ppstructure/docs/kie/input/zh_val_42.jpg +``` + +预测图片如下所示,图片会存储在`Global.save_res_path`路径中。 + +
+ +
+ +预测过程中,默认会加载PP-OCRv3的检测识别模型,用于OCR的信息抽取,如果希望加载预先获取的OCR结果,可以使用下面的方式进行预测,指定`Global.infer_img`为标注文件,其中包含图片路径以及OCR信息,同时指定`Global.infer_mode`为False,表示此时不使用OCR预测引擎。 + +```bash +python3 tools/infer_kie_token_ser.py -c configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml -o Architecture.Backbone.checkpoints=./output/ser_vi_layoutxlm_xfund_zh/best_accuracy Global.infer_img=./train_data/XFUND/zh_val/val.json Global.infer_mode=False +``` + +对于上述图片,如果使用标注的OCR结果进行信息抽取,预测结果如下。 + +
+ +
+ +可以看出,部分检测框信息更加准确,但是整体信息抽取识别结果基本一致。 + + +在RE任务模型预测时,需要先给出模型SER结果,因此需要同时加载SER的配置文件与模型权重,示例如下。 + + +```bash +python3 ./tools/infer_kie_token_ser_re.py \ + -c configs/kie/vi_layoutxlm/re_vi_layoutxlm_xfund_zh.yml \ + -o Architecture.Backbone.checkpoints=./pretrain_models/re_vi_layoutxlm_udml_xfund_zh/best_accuracy/ \ + Global.infer_img=./train_data/XFUND/zh_val/image/ \ + -c_ser configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml \ + -o_ser Architecture.Backbone.checkpoints=pretrain_models/ \ + ser_vi_layoutxlm_udml_xfund_zh/best_accuracy/ +``` + +预测结果如下所示。 + +
+ +
+ + +如果希望使用标注或者预先获取的OCR信息进行关键信息抽取,同上,可以指定`Global.infer_mode`为False,指定`Global.infer_img`为标注文件。 + +```bash +python3 ./tools/infer_kie_token_ser_re.py -c configs/kie/vi_layoutxlm/re_vi_layoutxlm_xfund_zh.yml -o Architecture.Backbone.checkpoints=./pretrain_models/re_vi_layoutxlm_udml_xfund_zh/re_layoutxlm_xfund_zh_v4_udml/best_accuracy/ Global.infer_img=./train_data/XFUND/zh_val/val.json Global.infer_mode=False -c_ser configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml -o_ser Architecture.Backbone.checkpoints=pretrain_models/ser_vi_layoutxlm_udml_xfund_zh/best_accuracy/ +``` + +其中`c_ser`表示SER的配置文件,`o_ser` 后面需要加上待修改的SER模型与配置文件,如预训练权重等。 + + +预测结果如下所示。 + +
+ +
+
+可以看出,直接使用标注的OCR结果的RE预测结果要更加准确一些。
+
+# 4. 模型导出与预测
+
+
+## 4.1 模型导出
+
+inference 模型(`paddle.jit.save`保存的模型)一般是模型训练完成后保存的固化模型,其中同时保存了模型结构和模型参数,多用于预测部署场景。
+训练过程中保存的模型是checkpoints模型,只保存了模型的参数,多用于恢复训练等。
+与checkpoints模型相比,inference 模型会额外保存模型的结构信息,在预测部署、加速推理上性能优越,灵活方便,适合于实际系统集成。
+
+信息抽取模型中的SER任务转inference模型步骤如下:
+
+```bash
+# -c 后面设置训练算法的yml配置文件
+# -o 配置可选参数
+# Architecture.Backbone.checkpoints 参数设置待转换的训练模型地址
+# Global.save_inference_dir 参数设置转换的模型将保存的地址
+
+python3 tools/export_model.py -c configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml -o Architecture.Backbone.checkpoints=./output/ser_vi_layoutxlm_xfund_zh/best_accuracy Global.save_inference_dir=./inference/ser_vi_layoutxlm
+```
+
+转换成功后,在目录下有三个文件:
+
+```
+inference/ser_vi_layoutxlm/
+    ├── inference.pdiparams         # inference模型的参数文件
+    ├── inference.pdiparams.info    # inference模型的参数信息,可忽略
+    └── inference.pdmodel           # inference模型的模型结构文件
+```
+
+RE任务的动转静过程适配中,敬请期待。
+
+## 4.2 模型推理
+
+VI-LayoutXLM模型基于SER任务进行推理,可以执行如下命令:
+
+```bash
+cd ppstructure
+python3 kie/predict_kie_token_ser.py \
+  --kie_algorithm=LayoutXLM \
+  --ser_model_dir=../inference/ser_vi_layoutxlm \
+  --image_dir=./docs/kie/input/zh_val_42.jpg \
+  --ser_dict_path=../train_data/XFUND/class_list_xfun.txt \
+  --vis_font_path=../doc/fonts/simfang.ttf \
+  --ocr_order_method="tb-yx"
+```
+
+可视化SER结果默认保存到`./output`文件夹里面。结果示例如下:
+
+
+ +
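+
+如果希望把上述模型导出与推理两个步骤串联起来自动执行,可以参考下面的脚本。脚本仅为示意,其中的命令与路径均来自上文示例,请按实际情况调整:
+
+```python
+import subprocess
+
+# 1. 将训练得到的 SER 模型导出为 inference 模型(命令同 4.1 节)
+subprocess.run(
+    "python3 tools/export_model.py "
+    "-c configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml "
+    "-o Architecture.Backbone.checkpoints=./output/ser_vi_layoutxlm_xfund_zh/best_accuracy "
+    "Global.save_inference_dir=./inference/ser_vi_layoutxlm",
+    shell=True, check=True)
+
+# 2. 使用导出的 inference 模型进行 SER 推理(命令同 4.2 节,在 ppstructure 目录下执行)
+subprocess.run(
+    "python3 kie/predict_kie_token_ser.py "
+    "--kie_algorithm=LayoutXLM "
+    "--ser_model_dir=../inference/ser_vi_layoutxlm "
+    "--image_dir=./docs/kie/input/zh_val_42.jpg "
+    "--ser_dict_path=../train_data/XFUND/class_list_xfun.txt "
+    "--vis_font_path=../doc/fonts/simfang.ttf "
+    "--ocr_order_method=tb-yx",
+    shell=True, check=True, cwd="ppstructure")
+```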
+ + +# 5. FAQ + +Q1: 训练模型转inference 模型之后预测效果不一致? + +**A**:该问题多是trained model预测时候的预处理、后处理参数和inference model预测的时候的预处理、后处理参数不一致导致的。可以对比训练使用的配置文件中的预处理、后处理和预测时是否存在差异。 diff --git a/doc/doc_ch/table_recognition.md b/doc/doc_ch/table_recognition.md new file mode 100644 index 0000000000000000000000000000000000000000..e076149441eca410a25578fac8214862dfea1020 --- /dev/null +++ b/doc/doc_ch/table_recognition.md @@ -0,0 +1,343 @@ +# 表格识别 + +本文提供了PaddleOCR表格识别模型的全流程指南,包括数据准备、模型训练、调优、评估、预测,各个阶段的详细说明: + +- [1. 数据准备](#1-数据准备) + - [1.1. 数据集格式](#11-数据集格式) + - [1.2. 数据下载](#12-数据下载) + - [1.3. 数据集生成](#13-数据集生成) +- [2. 开始训练](#2-开始训练) + - [2.1. 启动训练](#21-启动训练) + - [2.2. 断点训练](#22-断点训练) + - [2.3. 更换Backbone 训练](#23-更换backbone-训练) + - [2.4. 混合精度训练](#24-混合精度训练) + - [2.5. 分布式训练](#25-分布式训练) + - [2.6. 其他训练环境](#26-其他训练环境) + - [2.7. 模型微调](#27-模型微调) +- [3. 模型评估与预测](#3-模型评估与预测) + - [3.1. 指标评估](#31-指标评估) + - [3.2. 测试表格结构识别效果](#32-测试表格结构识别效果) +- [4. 模型导出与预测](#4-模型导出与预测) + - [4.1 模型导出](#41-模型导出) + - [4.2 模型预测](#42-模型预测) +- [5. FAQ](#5-faq) + +# 1. 数据准备 + +## 1.1. 数据集格式 + +PaddleOCR 表格识别模型数据集格式如下: +```txt +img_label # 每张图片标注经过json.dumps()之后的字符串 +... +img_label +``` + +每一行的json格式为: +```txt +{ + 'filename': PMC5755158_010_01.png, # 图像名 + 'split': ’train‘, # 图像属于训练集还是验证集 + 'imgid': 0, # 图像的index + 'html': { + 'structure': {'tokens': ['', '', '', ...]}, # 表格的HTML字符串 + 'cell': [ + { + 'tokens': ['P', 'a', 'd', 'd', 'l', 'e', 'P', 'a', 'd', 'd', 'l', 'e'], # 表格中的单个文本 + 'bbox': [x0, y0, x1, y1] # 表格中的单个文本的坐标 + } + ] + } +} +``` + +训练数据的默认存储路径是 `PaddleOCR/train_data`,如果您的磁盘上已有数据集,只需创建软链接至数据集目录: + +``` +# linux and mac os +ln -sf /train_data/dataset +# windows +mklink /d /train_data/dataset +``` + +## 1.2. 数据下载 + +公开数据集下载可参考 [table_datasets](dataset/table_datasets.md)。 + +## 1.3. 数据集生成 + +使用[TableGeneration](https://github.com/WenmuZhou/TableGeneration)可进行扫描表格图像的生成。 + +TableGeneration是一个开源表格数据集生成工具,其通过浏览器渲染的方式对html字符串进行渲染后获得表格图像。部分样张如下: + +|类型|样例| +|---|---| +|简单表格|![](https://raw.githubusercontent.com/WenmuZhou/TableGeneration/main/imgs/simple.jpg)| +|彩色表格|![](https://raw.githubusercontent.com/WenmuZhou/TableGeneration/main/imgs/color.jpg)| + +# 2. 开始训练 + +PaddleOCR提供了训练脚本、评估脚本和预测脚本,本节将以 [SLANet](../../configs/table/SLANet.yml) 模型训练PubTabNet英文数据集为例: + +## 2.1. 
启动训练 + +*如果您安装的是cpu版本,请将配置文件中的 `use_gpu` 字段修改为false* + +``` +# GPU训练 支持单卡,多卡训练 +# 训练日志会自动保存为 "{save_model_dir}" 下的train.log + +#单卡训练(训练周期长,不建议) +python3 tools/train.py -c configs/table/SLANet.yml + +#多卡训练,通过--gpus参数指定卡号 +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/table/SLANet.yml +``` + +正常启动训练后,会看到以下log输出: + +``` +[2022/08/16 03:07:33] ppocr INFO: epoch: [1/400], global_step: 20, lr: 0.000100, acc: 0.000000, loss: 3.915012, structure_loss: 3.229450, loc_loss: 0.670590, avg_reader_cost: 2.63382 s, avg_batch_cost: 6.32390 s, avg_samples: 48.0, ips: 7.59025 samples/s, eta: 9 days, 2:29:27 +[2022/08/16 03:08:41] ppocr INFO: epoch: [1/400], global_step: 40, lr: 0.000100, acc: 0.000000, loss: 1.750859, structure_loss: 1.082116, loc_loss: 0.652822, avg_reader_cost: 0.02533 s, avg_batch_cost: 3.37251 s, avg_samples: 48.0, ips: 14.23271 samples/s, eta: 6 days, 23:28:43 +[2022/08/16 03:09:46] ppocr INFO: epoch: [1/400], global_step: 60, lr: 0.000100, acc: 0.000000, loss: 1.395154, structure_loss: 0.776803, loc_loss: 0.625030, avg_reader_cost: 0.02550 s, avg_batch_cost: 3.26261 s, avg_samples: 48.0, ips: 14.71214 samples/s, eta: 6 days, 5:11:48 +``` + +log 中自动打印如下信息: + +| 字段 | 含义 | +| :----: | :------: | +| epoch | 当前迭代轮次 | +| global_step | 当前迭代次数 | +| lr | 当前学习率 | +| acc | 当前batch的准确率 | +| loss | 当前损失函数 | +| structure_loss | 表格结构损失值 | +| loc_loss | 单元格坐标损失值 | +| avg_reader_cost | 当前 batch 数据处理耗时 | +| avg_batch_cost | 当前 batch 总耗时 | +| avg_samples | 当前 batch 内的样本数 | +| ips | 每秒处理图片的数量 | + + +PaddleOCR支持训练和评估交替进行, 可以在 `configs/table/SLANet.yml` 中修改 `eval_batch_step` 设置评估频率,默认每1000个iter评估一次。评估过程中默认将最佳acc模型,保存为 `output/SLANet/best_accuracy` 。 + +如果验证集很大,测试将会比较耗时,建议减少评估次数,或训练完再进行评估。 + +**提示:** 可通过 -c 参数选择 `configs/table/` 路径下的多种模型配置进行训练,PaddleOCR支持的表格识别算法可以参考[前沿算法列表](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/doc/doc_ch/algorithm_overview.md#3-%E8%A1%A8%E6%A0%BC%E8%AF%86%E5%88%AB%E7%AE%97%E6%B3%95): + +**注意,预测/评估时的配置文件请务必与训练一致。** + +## 2.2. 断点训练 + +如果训练程序中断,如果希望加载训练中断的模型从而恢复训练,可以通过指定Global.checkpoints指定要加载的模型路径: +```shell +python3 tools/train.py -c configs/table/SLANet.yml -o Global.checkpoints=./your/trained/model +``` + +**注意**:`Global.checkpoints`的优先级高于`Global.pretrained_model`的优先级,即同时指定两个参数时,优先加载`Global.checkpoints`指定的模型,如果`Global.checkpoints`指定的模型路径有误,会加载`Global.pretrained_model`指定的模型。 + +## 2.3. 更换Backbone 训练 + +PaddleOCR将网络划分为四部分,分别在[ppocr/modeling](../../ppocr/modeling)下。 进入网络的数据将按照顺序(transforms->backbones->necks->heads)依次通过这四个部分。 + +```bash +├── architectures # 网络的组网代码 +├── transforms # 网络的图像变换模块 +├── backbones # 网络的特征提取模块 +├── necks # 网络的特征增强模块 +└── heads # 网络的输出模块 +``` +如果要更换的Backbone 在PaddleOCR中有对应实现,直接修改配置yml文件中`Backbone`部分的参数即可。 + +如果要使用新的Backbone,更换backbones的例子如下: + +1. 在 [ppocr/modeling/backbones](../../ppocr/modeling/backbones) 文件夹下新建文件,如my_backbone.py。 +2. 在 my_backbone.py 文件内添加相关代码,示例代码如下: + +```python +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +class MyBackbone(nn.Layer): + def __init__(self, *args, **kwargs): + super(MyBackbone, self).__init__() + # your init code + self.conv = nn.xxxx + + def forward(self, inputs): + # your network forward + y = self.conv(inputs) + return y +``` + +3. 在 [ppocr/modeling/backbones/\__init\__.py](../../ppocr/modeling/backbones/__init__.py)文件内导入添加的`MyBackbone`模块,然后修改配置文件中Backbone进行配置即可使用,格式如下: + +```yaml +Backbone: +name: MyBackbone +args1: args1 +``` + +**注意**:如果要更换网络的其他模块,可以参考[文档](./add_new_algorithm.md)。 + +## 2.4. 
混合精度训练 + +如果您想进一步加快训练速度,可以使用[自动混合精度训练](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/01_paddle2.0_introduction/basic_concept/amp_cn.html), 以单机单卡为例,命令如下: + +```shell +python3 tools/train.py -c configs/table/SLANet.yml \ + -o Global.pretrained_model=./pretrain_models/SLANet/best_accuracy \ + Global.use_amp=True Global.scale_loss=1024.0 Global.use_dynamic_loss_scaling=True + ``` + +## 2.5. 分布式训练 + +多机多卡训练时,通过 `--ips` 参数设置使用的机器IP地址,通过 `--gpus` 参数设置使用的GPU ID: + +```bash +python3 -m paddle.distributed.launch --ips="xx.xx.xx.xx,xx.xx.xx.xx" --gpus '0,1,2,3' tools/train.py -c configs/table/SLANet.yml \ + -o Global.pretrained_model=./pretrain_models/SLANet/best_accuracy +``` + +**注意:** (1)采用多机多卡训练时,需要替换上面命令中的ips值为您机器的地址,机器之间需要能够相互ping通;(2)训练时需要在多个机器上分别启动命令。查看机器ip地址的命令为`ifconfig`;(3)更多关于分布式训练的性能优势等信息,请参考:[分布式训练教程](./distributed_training.md)。 + + +## 2.6. 其他训练环境 + +- Windows GPU/CPU +在Windows平台上与Linux平台略有不同: +Windows平台只支持`单卡`的训练与预测,指定GPU进行训练`set CUDA_VISIBLE_DEVICES=0` +在Windows平台,DataLoader只支持单进程模式,因此需要设置 `num_workers` 为0; + +- macOS +不支持GPU模式,需要在配置文件中设置`use_gpu`为False,其余训练评估预测命令与Linux GPU完全相同。 + +- Linux DCU +DCU设备上运行需要设置环境变量 `export HIP_VISIBLE_DEVICES=0,1,2,3`,其余训练评估预测命令与Linux GPU完全相同。 + +## 2.7. 模型微调 + +实际使用过程中,建议加载官方提供的预训练模型,在自己的数据集中进行微调,关于模型的微调方法,请参考:[模型微调教程](./finetune.md)。 + + +# 3. 模型评估与预测 + +## 3.1. 指标评估 + +训练中模型参数默认保存在`Global.save_model_dir`目录下。在评估指标时,需要设置`Global.checkpoints`指向保存的参数文件。评估数据集可以通过 `configs/table/SLANet.yml` 修改Eval中的 `label_file_list` 设置。 + + +``` +# GPU 评估, Global.checkpoints 为待测权重 +python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/table/SLANet.yml -o Global.checkpoints={path/to/weights}/best_accuracy +``` + +运行完成后,会输出模型的acc指标,如对英文表格识别模型进行评估,会见到如下输出。 +```bash +[2022/08/16 07:59:55] ppocr INFO: acc:0.7622245132160782 +[2022/08/16 07:59:55] ppocr INFO: fps:30.991640622573044 +``` + +## 3.2. 测试表格结构识别效果 + +使用 PaddleOCR 训练好的模型,可以通过以下脚本进行快速预测。 + +默认预测图片存储在 `infer_img` 里,通过 `-o Global.checkpoints` 加载训练好的参数文件: + +根据配置文件中设置的 `save_model_dir` 和 `save_epoch_step` 字段,会有以下几种参数被保存下来: + +``` +output/SLANet/ +├── best_accuracy.pdopt +├── best_accuracy.pdparams +├── best_accuracy.states +├── config.yml +├── latest.pdopt +├── latest.pdparams +├── latest.states +└── train.log +``` +其中 best_accuracy.* 是评估集上的最优模型;latest.* 是最后一个epoch的模型。 + +``` +# 预测表格图像 +python3 tools/infer_table.py -c configs/table/SLANet.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.infer_img=ppstructure/docs/table/table.jpg +``` + +预测图片: + +![](../../ppstructure/docs/table/table.jpg) + +得到输入图像的预测结果: + +``` +['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '
', '', ''],[[320.0562438964844, 197.83375549316406, 350.0928955078125, 214.4309539794922], ... , [318.959228515625, 271.0166931152344, 353.7394104003906, 286.4538269042969]] +``` + +单元格坐标可视化结果为 + +![](../../ppstructure/docs/imgs/slanet_result.jpg) + +# 4. 模型导出与预测 + +## 4.1 模型导出 + +inference 模型(`paddle.jit.save`保存的模型) +一般是模型训练,把模型结构和模型参数保存在文件中的固化模型,多用于预测部署场景。 +训练过程中保存的模型是checkpoints模型,保存的只有模型的参数,多用于恢复训练等。 +与checkpoints模型相比,inference 模型会额外保存模型的结构信息,在预测部署、加速推理上性能优越,灵活方便,适合于实际系统集成。 + +表格识别模型转inference模型与文字检测识别的方式相同,如下: + +``` +# -c 后面设置训练算法的yml配置文件 +# -o 配置可选参数 +# Global.pretrained_model 参数设置待转换的训练模型地址,不用添加文件后缀 .pdmodel,.pdopt或.pdparams。 +# Global.save_inference_dir参数设置转换的模型将保存的地址。 + +python3 tools/export_model.py -c configs/table/SLANet.yml -o Global.pretrained_model=./pretrain_models/SLANet/best_accuracy Global.save_inference_dir=./inference/SLANet/ +``` + +转换成功后,在目录下有三个文件: + +``` +inference/SLANet/ + ├── inference.pdiparams # inference模型的参数文件 + ├── inference.pdiparams.info # inference模型的参数信息,可忽略 + └── inference.pdmodel # inference模型的program文件 +``` + +## 4.2 模型预测 + +模型导出后,使用如下命令即可完成inference模型的预测 + +```python +python3.7 table/predict_structure.py \ + --table_model_dir={path/to/inference model} \ + --table_char_dict_path=../ppocr/utils/dict/table_structure_dict_ch.txt \ + --image_dir=docs/table/table.jpg \ + --output=../output/table +``` + +预测图片: + +![](../../ppstructure/docs/table/table.jpg) + +得到输入图像的预测结果: + +``` +['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '
', '', ''],[[320.0562438964844, 197.83375549316406, 350.0928955078125, 214.4309539794922], ... , [318.959228515625, 271.0166931152344, 353.7394104003906, 286.4538269042969]] +``` + +单元格坐标可视化结果为 + +![](../../ppstructure/docs/imgs/slanet_result.jpg) + + +# 5. FAQ + +Q1: 训练模型转inference 模型之后预测效果不一致? + +**A**:此类问题出现较多,问题多是trained model预测时候的预处理、后处理参数和inference model预测的时候的预处理、后处理参数不一致导致的。可以对比训练使用的配置文件中的预处理、后处理和预测时是否存在差异。 diff --git a/doc/doc_ch/whl.md b/doc/doc_ch/whl.md index 511e0421f1e249e340f2002a900b59633e31880e..315329464f15aa1127e34a38d3407a9c81dbc627 100644 --- a/doc/doc_ch/whl.md +++ b/doc/doc_ch/whl.md @@ -390,6 +390,7 @@ im_show.save('result.jpg') | det_db_thresh | DB模型输出预测图的二值化阈值 | 0.3 | | det_db_box_thresh | DB模型输出框的阈值,低于此值的预测框会被丢弃 | 0.5 | | det_db_unclip_ratio | DB模型输出框扩大的比例 | 2 | +| det_db_score_mode | 计算检测框score的方式,有'fast'和'slow',如果要检测的文字有弯曲,建议用'slow','slow'模式计算的box的score偏大,box不容易被过滤掉 | 'fast' | | det_east_score_thresh | EAST模型输出预测图的二值化阈值 | 0.8 | | det_east_cover_thresh | EAST模型输出框的阈值,低于此值的预测框会被丢弃 | 0.1 | | det_east_nms_thresh | EAST模型输出框NMS的阈值 | 0.2 | diff --git a/doc/doc_en/algorithm_det_ct_en.md b/doc/doc_en/algorithm_det_ct_en.md new file mode 100644 index 0000000000000000000000000000000000000000..d56b3fc6b3353bacb1f26fba3873ba5276b10c8b --- /dev/null +++ b/doc/doc_en/algorithm_det_ct_en.md @@ -0,0 +1,96 @@ +# CT + +- [1. Introduction](#1) +- [2. Environment](#2) +- [3. Model Training / Evaluation / Prediction](#3) + - [3.1 Training](#3-1) + - [3.2 Evaluation](#3-2) + - [3.3 Prediction](#3-3) +- [4. Inference and Deployment](#4) + - [4.1 Python Inference](#4-1) + - [4.2 C++ Inference](#4-2) + - [4.3 Serving](#4-3) + - [4.4 More](#4-4) +- [5. FAQ](#5) + + +## 1. Introduction + +Paper: +> [CentripetalText: An Efficient Text Instance Representation for Scene Text Detection](https://arxiv.org/abs/2107.05945) +> Tao Sheng, Jie Chen, Zhouhui Lian +> NeurIPS, 2021 + + +On the Total-Text dataset, the text detection result is as follows: + +|Model|Backbone|Configuration|Precision|Recall|Hmean|Download| +| --- | --- | --- | --- | --- | --- | --- | +|CT|ResNet18_vd|[configs/det/det_r18_vd_ct.yml](../../configs/det/det_r18_vd_ct.yml)|88.68%|81.70%|85.05%|[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r18_ct_train.tar)| + + + +## 2. Environment +Please prepare your environment referring to [prepare the environment](./environment_en.md) and [clone the repo](./clone_en.md). + + + +## 3. Model Training / Evaluation / Prediction + + +The above CT model is trained using the Total-Text text detection public dataset. For the download of the dataset, please refer to [Total-Text-Dataset](https://github.com/cs-chan/Total-Text-Dataset/tree/master/Dataset). PaddleOCR format annotation download link [train.txt](https://paddleocr.bj.bcebos.com/dataset/ct_tipc/train.txt), [test.txt](https://paddleocr.bj.bcebos.com/dataset/ct_tipc/test.txt). + + +Please refer to [text detection training tutorial](./detection_en.md). PaddleOCR has modularized the code structure, so that you only need to **replace the configuration file** to train different detection models. + + +## 4. Inference and Deployment + + +### 4.1 Python Inference +First, convert the model saved in the CT text detection training process into an inference model. 
Taking the model based on the Resnet18_vd backbone network and trained on the Total Text English dataset as example ([model download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r18_ct_train.tar)), you can use the following command to convert: + +```shell +python3 tools/export_model.py -c configs/det/det_r18_vd_ct.yml -o Global.pretrained_model=./det_r18_ct_train/best_accuracy Global.save_inference_dir=./inference/det_ct +``` + +CT text detection model inference, you can execute the following command: + +```shell +python3 tools/infer/predict_det.py --image_dir="./doc/imgs_en/img623.jpg" --det_model_dir="./inference/det_ct/" --det_algorithm="CT" +``` + +The visualized text detection results are saved to the `./inference_results` folder by default, and the name of the result file is prefixed with 'det_res'. Examples of results are as follows: + +![](../imgs_results/det_res_img623_ct.jpg) + + + +### 4.2 C++ Inference + +Not supported + + +### 4.3 Serving + +Not supported + + +### 4.4 More + +Not supported + + +## 5. FAQ + + +## Citation + +```bibtex +@inproceedings{sheng2021centripetaltext, + title={CentripetalText: An Efficient Text Instance Representation for Scene Text Detection}, + author={Tao Sheng and Jie Chen and Zhouhui Lian}, + booktitle={Thirty-Fifth Conference on Neural Information Processing Systems}, + year={2021} +} +``` diff --git a/doc/doc_en/algorithm_det_db_en.md b/doc/doc_en/algorithm_det_db_en.md index f5f333a039acded88f0f28d302821c5eb10d7402..fde344c3572f771e3e0fe5f9f62282cd1ae0a024 100644 --- a/doc/doc_en/algorithm_det_db_en.md +++ b/doc/doc_en/algorithm_det_db_en.md @@ -1,4 +1,4 @@ -# DB +# DB && DB++ - [1. Introduction](#1) - [2. Environment](#2) @@ -21,13 +21,23 @@ Paper: > Liao, Minghui and Wan, Zhaoyi and Yao, Cong and Chen, Kai and Bai, Xiang > AAAI, 2020 +> [Real-Time Scene Text Detection with Differentiable Binarization and Adaptive Scale Fusion](https://arxiv.org/abs/2202.10304) +> Liao, Minghui and Zou, Zhisheng and Wan, Zhaoyi and Yao, Cong and Bai, Xiang +> TPAMI, 2022 + On the ICDAR2015 dataset, the text detection result is as follows: |Model|Backbone|Configuration|Precision|Recall|Hmean|Download| | --- | --- | --- | --- | --- | --- | --- | |DB|ResNet50_vd|[configs/det/det_r50_vd_db.yml](../../configs/det/det_r50_vd_db.yml)|86.41%|78.72%|82.38%|[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_db_v2.0_train.tar)| |DB|MobileNetV3|[configs/det/det_mv3_db.yml](../../configs/det/det_mv3_db.yml)|77.29%|73.08%|75.12%|[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_mv3_db_v2.0_train.tar)| +|DB++|ResNet50|[configs/det/det_r50_db++_ic15.yml](../../configs/det/det_r50_db++_ic15.yml)|90.89%|82.66%|86.58%|[pretrained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/en_det/ResNet50_dcn_asf_synthtext_pretrained.pdparams)/[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/en_det/det_r50_db%2B%2B_icdar15_train.tar)| + +On the TD_TR dataset, the text detection result is as follows: +|Model|Backbone|Configuration|Precision|Recall|Hmean|Download| +| --- | --- | --- | --- | --- | --- | --- | +|DB++|ResNet50|[configs/det/det_r50_db++_td_tr.yml](../../configs/det/det_r50_db++_td_tr.yml)|92.92%|86.48%|89.58%|[pretrained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/en_det/ResNet50_dcn_asf_synthtext_pretrained.pdparams)/[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/en_det/det_r50_db%2B%2B_td_tr_train.tar)| ## 2. 
Environment @@ -96,4 +106,12 @@ More deployment schemes supported for DB: pages={11474--11481}, year={2020} } -``` \ No newline at end of file + +@article{liao2022real, + title={Real-Time Scene Text Detection with Differentiable Binarization and Adaptive Scale Fusion}, + author={Liao, Minghui and Zou, Zhisheng and Wan, Zhaoyi and Yao, Cong and Bai, Xiang}, + journal={IEEE Transactions on Pattern Analysis and Machine Intelligence}, + year={2022}, + publisher={IEEE} +} +``` diff --git a/doc/doc_en/algorithm_en.md b/doc/doc_en/algorithm_en.md deleted file mode 100644 index c880336b4ad528eab2cce479edf11fce0b43f435..0000000000000000000000000000000000000000 --- a/doc/doc_en/algorithm_en.md +++ /dev/null @@ -1,11 +0,0 @@ -# Academic Algorithms and Models - -PaddleOCR will add cutting-edge OCR algorithms and models continuously. Check out the supported models and tutorials by clicking the following list: - - -- [text detection algorithms](./algorithm_overview_en.md#11) -- [text recognition algorithms](./algorithm_overview_en.md#12) -- [end-to-end algorithms](./algorithm_overview_en.md#2) -- [table recognition algorithms](./algorithm_overview_en.md#3) - -Developers are welcome to contribute more algorithms! Please refer to [add new algorithm](./add_new_algorithm_en.md) guideline. diff --git a/doc/doc_en/algorithm_kie_layoutxlm_en.md b/doc/doc_en/algorithm_kie_layoutxlm_en.md new file mode 100644 index 0000000000000000000000000000000000000000..910c1f4d497a6e503f0a7a5ec26dbeceb2d321a1 --- /dev/null +++ b/doc/doc_en/algorithm_kie_layoutxlm_en.md @@ -0,0 +1,162 @@ +# KIE Algorithm - LayoutXLM + + +- [1. Introduction](#1-introduction) +- [2. Environment](#2-environment) +- [3. Model Training / Evaluation / Prediction](#3-model-training--evaluation--prediction) +- [4. Inference and Deployment](#4-inference-and-deployment) + - [4.1 Python Inference](#41-python-inference) + - [4.2 C++ Inference](#42-c-inference) + - [4.3 Serving](#43-serving) + - [4.4 More](#44-more) +- [5. FAQ](#5-faq) +- [Citation](#Citation) + + +## 1. Introduction + +Paper: + +> [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) +> +> Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei +> +> 2021 + +On XFUND_zh dataset, the algorithm reproduction Hmean is as follows. + +|Model|Backbone|Task |Cnnfig|Hmean|Download link| +| --- | --- |--|--- | --- | --- | +|LayoutXLM|LayoutXLM-base|SER |[ser_layoutxlm_xfund_zh.yml](../../configs/kie/layoutlm_series/ser_layoutxlm_xfund_zh.yml)|90.38%|[trained model](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh.tar)/[inference model](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh_infer.tar)| +|LayoutXLM|LayoutXLM-base|RE | [re_layoutxlm_xfund_zh.yml](../../configs/kie/layoutlm_series/re_layoutxlm_xfund_zh.yml)|74.83%|[trained model](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutXLM_xfun_zh.tar)/[inference model(coming soon)]()| + + +## 2. Environment + +Please refer to ["Environment Preparation"](./environment_en.md) to configure the PaddleOCR environment, and refer to ["Project Clone"](./clone_en.md) to clone the project code. + + +## 3. Model Training / Evaluation / Prediction + +Please refer to [KIE tutorial](./kie_en.md)。PaddleOCR has modularized the code structure, so that you only need to **replace the configuration file** to train different models. + + + +## 4. 
Inference and Deployment
+
+### 4.1 Python Inference
+
+**Note:** Currently, the RE model inference process is still being adapted. We take the SER model as an example to introduce the KIE process based on the LayoutXLM model.
+
+First, we need to export the trained model into an inference model. Take the LayoutXLM model trained on XFUND_zh as an example ([trained model download link](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh.tar)). Use the following command to export it.
+
+
+```bash
+wget https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh.tar
+tar -xf ser_LayoutXLM_xfun_zh.tar
+python3 tools/export_model.py -c configs/kie/layoutlm_series/ser_layoutxlm_xfund_zh.yml -o Architecture.Backbone.checkpoints=./ser_LayoutXLM_xfun_zh/best_accuracy Global.save_inference_dir=./inference/ser_layoutxlm_infer
+```
+
+Use the following command to run inference with the LayoutXLM SER model.
+
+```bash
+cd ppstructure
+python3 kie/predict_kie_token_ser.py \
+  --kie_algorithm=LayoutXLM \
+  --ser_model_dir=../inference/ser_layoutxlm_infer \
+  --image_dir=./docs/kie/input/zh_val_42.jpg \
+  --ser_dict_path=../train_data/XFUND/class_list_xfun.txt \
+  --vis_font_path=../doc/fonts/simfang.ttf
+```
+
+The SER visualization results are saved in the `./output` directory by default. The results are as follows.
+
+
+ +
+ + +### 4.2 C++ Inference + +Not supported + +### 4.3 Serving + +Not supported + +### 4.4 More + +Not supported + +## 5. FAQ + +## Citation + +```bibtex +@article{DBLP:journals/corr/abs-2104-08836, + author = {Yiheng Xu and + Tengchao Lv and + Lei Cui and + Guoxin Wang and + Yijuan Lu and + Dinei Flor{\^{e}}ncio and + Cha Zhang and + Furu Wei}, + title = {LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich + Document Understanding}, + journal = {CoRR}, + volume = {abs/2104.08836}, + year = {2021}, + url = {https://arxiv.org/abs/2104.08836}, + eprinttype = {arXiv}, + eprint = {2104.08836}, + timestamp = {Thu, 14 Oct 2021 09:17:23 +0200}, + biburl = {https://dblp.org/rec/journals/corr/abs-2104-08836.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@article{DBLP:journals/corr/abs-1912-13318, + author = {Yiheng Xu and + Minghao Li and + Lei Cui and + Shaohan Huang and + Furu Wei and + Ming Zhou}, + title = {LayoutLM: Pre-training of Text and Layout for Document Image Understanding}, + journal = {CoRR}, + volume = {abs/1912.13318}, + year = {2019}, + url = {http://arxiv.org/abs/1912.13318}, + eprinttype = {arXiv}, + eprint = {1912.13318}, + timestamp = {Mon, 01 Jun 2020 16:20:46 +0200}, + biburl = {https://dblp.org/rec/journals/corr/abs-1912-13318.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@article{DBLP:journals/corr/abs-2012-14740, + author = {Yang Xu and + Yiheng Xu and + Tengchao Lv and + Lei Cui and + Furu Wei and + Guoxin Wang and + Yijuan Lu and + Dinei A. F. Flor{\^{e}}ncio and + Cha Zhang and + Wanxiang Che and + Min Zhang and + Lidong Zhou}, + title = {LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding}, + journal = {CoRR}, + volume = {abs/2012.14740}, + year = {2020}, + url = {https://arxiv.org/abs/2012.14740}, + eprinttype = {arXiv}, + eprint = {2012.14740}, + timestamp = {Tue, 27 Jul 2021 09:53:52 +0200}, + biburl = {https://dblp.org/rec/journals/corr/abs-2012-14740.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} +``` diff --git a/doc/doc_en/algorithm_kie_sdmgr_en.md b/doc/doc_en/algorithm_kie_sdmgr_en.md new file mode 100644 index 0000000000000000000000000000000000000000..5b12b8c959e830015ffb173626ac5752ee9ecee0 --- /dev/null +++ b/doc/doc_en/algorithm_kie_sdmgr_en.md @@ -0,0 +1,130 @@ + +# KIE Algorithm - SDMGR + +- [1. Introduction](#1-introduction) +- [2. Environment](#2-environment) +- [3. Model Training / Evaluation / Prediction](#3-model-training--evaluation--prediction) +- [4. Inference and Deployment](#4-inference-and-deployment) + - [4.1 Python Inference](#41-python-inference) + - [4.2 C++ Inference](#42-c-inference) + - [4.3 Serving](#43-serving) + - [4.4 More](#44-more) +- [5. FAQ](#5-faq) +- [Citation](#Citation) + +## 1. Introduction + +Paper: + +> [Spatial Dual-Modality Graph Reasoning for Key Information Extraction](https://arxiv.org/abs/2103.14470) +> +> Hongbin Sun and Zhanghui Kuang and Xiaoyu Yue and Chenhao Lin and Wayne Zhang +> +> 2021 + +On wildreceipt dataset, the algorithm reproduction Hmean is as follows. + +|Model|Backbone |Cnnfig|Hmean|Download link| +| --- | --- | --- | --- | --- | +|SDMGR|VGG6|[configs/kie/sdmgr/kie_unet_sdmgr.yml](../../configs/kie/sdmgr/kie_unet_sdmgr.yml)|86.7%|[trained model]( https://paddleocr.bj.bcebos.com/dygraph_v2.1/kie/kie_vgg16.tar)/[inference model(coming soon)]()| + + + +## 2. 
Environment
+
+Please refer to ["Environment Preparation"](./environment_en.md) to configure the PaddleOCR environment, and refer to ["Project Clone"](./clone_en.md) to clone the project code.
+
+
+
+## 3. Model Training / Evaluation / Prediction
+
+SDMGR is a key information extraction algorithm that classifies each detected textline into predefined categories, such as order ID, invoice number, amount, etc.
+
+The training and test data are collected in the wildreceipt dataset. Use the following command to download the dataset.
+
+
+```bash
+wget https://paddleocr.bj.bcebos.com/ppstructure/dataset/wildreceipt.tar && tar xf wildreceipt.tar
+```
+
+Create a soft link to the dataset in the `PaddleOCR/train_data` directory.
+
+```bash
+cd PaddleOCR/ && mkdir train_data && cd train_data
+ln -s ../../wildreceipt ./
+```
+
+
+### 3.1 Model training
+
+The config file is `configs/kie/sdmgr/kie_unet_sdmgr.yml`, and the default dataset path is `train_data/wildreceipt`.
+
+Use the following command to train the model.
+
+```bash
+python3 tools/train.py -c configs/kie/sdmgr/kie_unet_sdmgr.yml -o Global.save_model_dir=./output/kie/
+```
+
+### 3.2 Model evaluation
+
+Use the following command to evaluate the model.
+
+```bash
+python3 tools/eval.py -c configs/kie/sdmgr/kie_unet_sdmgr.yml -o Global.checkpoints=./output/kie/best_accuracy
+```
+
+An example of output information is shown below.
+
+```py
+[2022/08/10 05:22:23] ppocr INFO: metric eval ***************
+[2022/08/10 05:22:23] ppocr INFO: hmean:0.8670120239257812
+[2022/08/10 05:22:23] ppocr INFO: fps:10.18816520530961
+```
+
+### 3.3 Model prediction
+
+Use the following command to load the model and predict. During the prediction, the text file storing the image path and OCR information needs to be loaded in advance. Use `Global.infer_img` to specify it.
+
+```bash
+python3 tools/infer_kie.py -c configs/kie/sdmgr/kie_unet_sdmgr.yml -o Global.checkpoints=kie_vgg16/best_accuracy Global.infer_img=./train_data/wildreceipt/1.txt
+```
+
+The visualization results and texts are saved in the `./output/sdmgr_kie/` directory by default. The results are as follows.
+
+
+ +
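+
+The text file passed to `Global.infer_img` above (for example `./train_data/wildreceipt/1.txt`) stores, per line, an image path together with its OCR results. A minimal sketch of how such a line could be assembled is shown below; the field layout is an assumption based on the converted wildreceipt annotation files, so please double-check it against the data you downloaded.
+
+```python
+import json
+
+# One OCR text box: 8 corner coordinates, the recognized text and a category id.
+# The field names here are an assumption based on the wildreceipt annotation format.
+annotations = [
+    {"box": [550.0, 190.0, 937.0, 190.0, 937.0, 104.0, 550.0, 104.0],
+     "text": "SAFEWAY",
+     "label": 1},
+]
+
+record = {"height": 1200, "width": 1600, "annotations": annotations}
+
+# Each line: image path + tab + JSON-serialized OCR information.
+line = "image_files/example_receipt.jpeg" + "\t" + json.dumps(record, ensure_ascii=False)
+print(line)
+```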
+
+## 4. Inference and Deployment
+
+### 4.1 Python Inference
+
+Not supported
+
+### 4.2 C++ Inference
+
+Not supported
+
+### 4.3 Serving
+
+Not supported
+
+### 4.4 More
+
+Not supported
+
+## 5. FAQ
+
+## Citation
+
+```bibtex
+@misc{sun2021spatial,
+    title={Spatial Dual-Modality Graph Reasoning for Key Information Extraction},
+    author={Hongbin Sun and Zhanghui Kuang and Xiaoyu Yue and Chenhao Lin and Wayne Zhang},
+    year={2021},
+    eprint={2103.14470},
+    archivePrefix={arXiv},
+    primaryClass={cs.CV}
+}
+```
diff --git a/doc/doc_en/algorithm_kie_vi_layoutxlm_en.md b/doc/doc_en/algorithm_kie_vi_layoutxlm_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..12b6e1bddbd03b820ce33ba86de3d430a44f8987
--- /dev/null
+++ b/doc/doc_en/algorithm_kie_vi_layoutxlm_en.md
@@ -0,0 +1,156 @@
+# KIE Algorithm - VI-LayoutXLM
+
+
+- [1. Introduction](#1-introduction)
+- [2. Environment](#2-environment)
+- [3. Model Training / Evaluation / Prediction](#3-model-training--evaluation--prediction)
+- [4. Inference and Deployment](#4-inference-and-deployment)
+  - [4.1 Python Inference](#41-python-inference)
+  - [4.2 C++ Inference](#42-c-inference)
+  - [4.3 Serving](#43-serving)
+  - [4.4 More](#44-more)
+- [5. FAQ](#5-faq)
+- [Citation](#Citation)
+
+
+## 1. Introduction
+
+VI-LayoutXLM is improved based on LayoutXLM. During downstream fine-tuning, the visual backbone network module is removed, so the model inference speed is further improved with almost no loss of accuracy.
+
+On the XFUND_zh dataset, the algorithm reproduction Hmean is as follows.
+
+|Model|Backbone|Task|Config|Hmean|Download link|
+| --- | --- |---| --- | --- | --- |
+|VI-LayoutXLM |VI-LayoutXLM-base | SER |[ser_vi_layoutxlm_xfund_zh_udml.yml](../../configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh_udml.yml)|93.19%|[trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_pretrained.tar)/[inference model](https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_infer.tar)|
+|VI-LayoutXLM |VI-LayoutXLM-base |RE | [re_vi_layoutxlm_xfund_zh_udml.yml](../../configs/kie/vi_layoutxlm/re_vi_layoutxlm_xfund_zh_udml.yml)|83.92%|[trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/re_vi_layoutxlm_xfund_pretrained.tar)/[inference model(coming soon)]()|
+
+
+## 2. Environment
+
+Please refer to ["Environment Preparation"](./environment_en.md) to configure the PaddleOCR environment, and refer to ["Project Clone"](./clone_en.md) to clone the project code.
+
+
+## 3. Model Training / Evaluation / Prediction
+
+Please refer to the [KIE tutorial](./kie_en.md). PaddleOCR has modularized the code structure, so that you only need to **replace the configuration file** to train different models.
+
+
+## 4. Inference and Deployment
+
+### 4.1 Python Inference
+
+**Note:** Currently, the RE model inference process is still being adapted. We take the SER model as an example to introduce the KIE process based on the VI-LayoutXLM model.
+
+First, we need to export the trained model into an inference model. Take the VI-LayoutXLM model trained on XFUND_zh as an example ([trained model download link](https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_pretrained.tar)). Use the following command to export it.
+ + +``` bash +wget https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_pretrained.tar +tar -xf ser_vi_layoutxlm_xfund_pretrained.tar +python3 tools/export_model.py -c configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml -o Architecture.Backbone.checkpoints=./ser_vi_layoutxlm_xfund_pretrained/best_accuracy Global.save_inference_dir=./inference/ser_vi_layoutxlm_infer +``` + +Use the following command to infer using VI-LayoutXLM SER model. + + +```bash +cd ppstructure +python3 kie/predict_kie_token_ser.py \ + --kie_algorithm=LayoutXLM \ + --ser_model_dir=../inference/ser_vi_layoutxlm_infer \ + --image_dir=./docs/kie/input/zh_val_42.jpg \ + --ser_dict_path=../train_data/XFUND/class_list_xfun.txt \ + --vis_font_path=../doc/fonts/simfang.ttf \ + --ocr_order_method="tb-yx" +``` + +The SER visualization results are saved in the `./output` folder by default. The results are as follows. + + +
+ +
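+
+The `--ocr_order_method="tb-yx"` option above asks the OCR results to be re-ordered roughly top-to-bottom first and then left-to-right before they are fed into VI-LayoutXLM. The snippet below is only an illustrative approximation of that ordering, not the exact implementation used in PaddleOCR:
+
+```python
+def sort_boxes_tb_yx(boxes, y_tolerance=10):
+    """Sort quadrangle boxes top-to-bottom, then left-to-right.
+
+    boxes: list of 4-point boxes [[x1, y1], [x2, y2], [x3, y3], [x4, y4]].
+    y_tolerance: boxes whose top edges differ by less than this many
+                 pixels are treated as lying on the same text line.
+    """
+    # Sort primarily by the top y coordinate (snapped to a coarse grid so that
+    # boxes on roughly the same line stay together), secondarily by the left x.
+    return sorted(boxes, key=lambda b: (b[0][1] // y_tolerance, b[0][0]))
+
+
+boxes = [[[300, 12], [400, 12], [400, 40], [300, 40]],
+         [[20, 15], [120, 15], [120, 42], [20, 42]],
+         [[25, 80], [150, 80], [150, 110], [25, 110]]]
+print(sort_boxes_tb_yx(boxes))
+```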
+ + +### 4.2 C++ Inference + +Not supported + +### 4.3 Serving + +Not supported + +### 4.4 More + +Not supported + +## 5. FAQ + +## Citation + + +```bibtex +@article{DBLP:journals/corr/abs-2104-08836, + author = {Yiheng Xu and + Tengchao Lv and + Lei Cui and + Guoxin Wang and + Yijuan Lu and + Dinei Flor{\^{e}}ncio and + Cha Zhang and + Furu Wei}, + title = {LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich + Document Understanding}, + journal = {CoRR}, + volume = {abs/2104.08836}, + year = {2021}, + url = {https://arxiv.org/abs/2104.08836}, + eprinttype = {arXiv}, + eprint = {2104.08836}, + timestamp = {Thu, 14 Oct 2021 09:17:23 +0200}, + biburl = {https://dblp.org/rec/journals/corr/abs-2104-08836.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@article{DBLP:journals/corr/abs-1912-13318, + author = {Yiheng Xu and + Minghao Li and + Lei Cui and + Shaohan Huang and + Furu Wei and + Ming Zhou}, + title = {LayoutLM: Pre-training of Text and Layout for Document Image Understanding}, + journal = {CoRR}, + volume = {abs/1912.13318}, + year = {2019}, + url = {http://arxiv.org/abs/1912.13318}, + eprinttype = {arXiv}, + eprint = {1912.13318}, + timestamp = {Mon, 01 Jun 2020 16:20:46 +0200}, + biburl = {https://dblp.org/rec/journals/corr/abs-1912-13318.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@article{DBLP:journals/corr/abs-2012-14740, + author = {Yang Xu and + Yiheng Xu and + Tengchao Lv and + Lei Cui and + Furu Wei and + Guoxin Wang and + Yijuan Lu and + Dinei A. F. Flor{\^{e}}ncio and + Cha Zhang and + Wanxiang Che and + Min Zhang and + Lidong Zhou}, + title = {LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding}, + journal = {CoRR}, + volume = {abs/2012.14740}, + year = {2020}, + url = {https://arxiv.org/abs/2012.14740}, + eprinttype = {arXiv}, + eprint = {2012.14740}, + timestamp = {Tue, 27 Jul 2021 09:53:52 +0200}, + biburl = {https://dblp.org/rec/journals/corr/abs-2012-14740.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} +``` diff --git a/doc/doc_en/algorithm_overview_en.md b/doc/doc_en/algorithm_overview_en.md index dfd8ecda5c306aeb41902caccc2b6079f4f86542..bca22f78482980bed18d6447d0cf07b27c26720d 100755 --- a/doc/doc_en/algorithm_overview_en.md +++ b/doc/doc_en/algorithm_overview_en.md @@ -1,24 +1,28 @@ -# OCR Algorithms +# Algorithms -- [1. Two-stage Algorithms](#1) +- [1. Two-stage OCR Algorithms](#1) - [1.1 Text Detection Algorithms](#11) - [1.2 Text Recognition Algorithms](#12) -- [2. End-to-end Algorithms](#2) +- [2. End-to-end OCR Algorithms](#2) - [3. Table Recognition Algorithms](#3) +- [4. Key Information Extraction Algorithms](#4) +This tutorial lists the OCR algorithms supported by PaddleOCR, as well as the models and metrics of each algorithm on **English public datasets**. It is mainly used for algorithm introduction and algorithm performance comparison. For more models on other datasets including Chinese, please refer to [PP-OCRv3 models list](./models_list_en.md). + +>> +Developers are welcome to contribute more algorithms! Please refer to [add new algorithm](./add_new_algorithm_en.md) guideline. -This tutorial lists the OCR algorithms supported by PaddleOCR, as well as the models and metrics of each algorithm on **English public datasets**. It is mainly used for algorithm introduction and algorithm performance comparison. 
For more models on other datasets including Chinese, please refer to [PP-OCR v2.0 models list](./models_list_en.md). -## 1. Two-stage Algorithms +## 1. Two-stage OCR Algorithms ### 1.1 Text Detection Algorithms Supported text detection algorithms (Click the link to get the tutorial): -- [x] [DB](./algorithm_det_db_en.md) +- [x] [DB && DB++](./algorithm_det_db_en.md) - [x] [EAST](./algorithm_det_east_en.md) - [x] [SAST](./algorithm_det_sast_en.md) - [x] [PSENet](./algorithm_det_psenet_en.md) @@ -35,6 +39,7 @@ On the ICDAR2015 dataset, the text detection result is as follows: |SAST|ResNet50_vd|91.39%|83.77%|87.42%|[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_sast_icdar15_v2.0_train.tar)| |PSE|ResNet50_vd|85.81%|79.53%|82.55%|[trianed model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/en_det/det_r50_vd_pse_v2.0_train.tar)| |PSE|MobileNetV3|82.20%|70.48%|75.89%|[trianed model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/en_det/det_mv3_pse_v2.0_train.tar)| +|DB++|ResNet50|90.89%|82.66%|86.58%|[pretrained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/en_det/ResNet50_dcn_asf_synthtext_pretrained.pdparams)/[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/en_det/det_r50_db%2B%2B_icdar15_train.tar)| On Total-Text dataset, the text detection result is as follows: @@ -70,6 +75,7 @@ Supported text recognition algorithms (Click the link to get the tutorial): - [x] [ABINet](./algorithm_rec_abinet_en.md) - [x] [VisionLAN](./algorithm_rec_visionlan_en.md) - [x] [SPIN](./algorithm_rec_spin_en.md) +- [x] [RobustScanner](./algorithm_rec_robustscanner_en.md) Refer to [DTRB](https://arxiv.org/abs/1904.01906), the training and evaluation result of these above text recognition (using MJSynth and SynthText for training, evaluate on IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE) is as follow: @@ -92,15 +98,17 @@ Refer to [DTRB](https://arxiv.org/abs/1904.01906), the training and evaluation r |ABINet|Resnet45| 90.75% | rec_r45_abinet | [trained model](https://paddleocr.bj.bcebos.com/rec_r45_abinet_train.tar) | |VisionLAN|Resnet45| 90.30% | rec_r45_visionlan | [trained model](https://paddleocr.bj.bcebos.com/rec_r45_visionlan_train.tar) | |SPIN|ResNet32| 90.00% | rec_r32_gaspin_bilstm_att | coming soon | +|RobustScanner|ResNet31| 87.77% | rec_r31_robustscanner | coming soon | -## 2. End-to-end Algorithms +## 2. End-to-end OCR Algorithms Supported end-to-end algorithms (Click the link to get the tutorial): - [x] [PGNet](./algorithm_e2e_pgnet_en.md) + ## 3. Table Recognition Algorithms @@ -112,3 +120,34 @@ On the PubTabNet dataset, the algorithm result is as follows: |Model|Backbone|Config|Acc|Download link| |---|---|---|---|---| |TableMaster|TableResNetExtra|[configs/table/table_master.yml](../../configs/table/table_master.yml)|77.47%|[trained](https://paddleocr.bj.bcebos.com/ppstructure/models/tablemaster/table_structure_tablemaster_train.tar) / [inference model](https://paddleocr.bj.bcebos.com/ppstructure/models/tablemaster/table_structure_tablemaster_infer.tar)| + + + + +## 4. 
Key Information Extraction Algorithms + +Supported KIE algorithms (Click the link to get the tutorial): + +- [x] [VI-LayoutXLM](./algorithm_kie_vi_layoutxlm_en.md) +- [x] [LayoutLM](./algorithm_kie_layoutxlm_en.md) +- [x] [LayoutLMv2](./algorithm_kie_layoutxlm_en.md) +- [x] [LayoutXLM](./algorithm_kie_layoutxlm_en.md) +- [x] [SDMGR](./algorithm_kie_sdmgr_en.md) + +On wildreceipt dataset, the algorithm result is as follows: + +|Model|Backbone|Config|Hmean|Download link| +| --- | --- | --- | --- | --- | +|SDMGR|VGG6|[configs/kie/sdmgr/kie_unet_sdmgr.yml](../../configs/kie/sdmgr/kie_unet_sdmgr.yml)|86.7%|[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/kie/kie_vgg16.tar)| + +On XFUND_zh dataset, the algorithm result is as follows: + +|Model|Backbone|Task|Config|Hmean|Download link| +| --- | --- | --- | --- | --- | --- | +|VI-LayoutXLM| VI-LayoutXLM-base | SER | [ser_vi_layoutxlm_xfund_zh_udml.yml](../../configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh_udml.yml)|**93.19%**|[trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_pretrained.tar)| +|LayoutXLM| LayoutXLM-base | SER | [ser_layoutxlm_xfund_zh.yml](../../configs/kie/layoutlm_series/ser_layoutxlm_xfund_zh.yml)|90.38%|[trained model](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh.tar)| +|LayoutLM| LayoutLM-base | SER | [ser_layoutlm_xfund_zh.yml](../../configs/kie/layoutlm_series/ser_layoutlm_xfund_zh.yml)|77.31%|[trained model](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLM_xfun_zh.tar)| +|LayoutLMv2| LayoutLMv2-base | SER | [ser_layoutlmv2_xfund_zh.yml](../../configs/kie/layoutlm_series/ser_layoutlmv2_xfund_zh.yml)|85.44%|[trained model](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLMv2_xfun_zh.tar)| +|VI-LayoutXLM| VI-LayoutXLM-base | RE | [re_vi_layoutxlm_xfund_zh_udml.yml](../../configs/kie/vi_layoutxlm/re_vi_layoutxlm_xfund_zh_udml.yml)|**83.92%**|[trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/re_vi_layoutxlm_xfund_pretrained.tar)| +|LayoutXLM| LayoutXLM-base | RE | [re_layoutxlm_xfund_zh.yml](../../configs/kie/layoutlm_series/re_layoutxlm_xfund_zh.yml)|74.83%|[trained model](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutXLM_xfun_zh.tar)| +|LayoutLMv2| LayoutLMv2-base | RE | [re_layoutlmv2_xfund_zh.yml](../../configs/kie/layoutlm_series/re_layoutlmv2_xfund_zh.yml)|67.77%|[trained model](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutLMv2_xfun_zh.tar)| diff --git a/doc/doc_en/algorithm_rec_robustscanner_en.md b/doc/doc_en/algorithm_rec_robustscanner_en.md new file mode 100644 index 0000000000000000000000000000000000000000..a324a6d547a9e448566276234c750ad4497abf9c --- /dev/null +++ b/doc/doc_en/algorithm_rec_robustscanner_en.md @@ -0,0 +1,114 @@ +# RobustScanner + +- [1. Introduction](#1) +- [2. Environment](#2) +- [3. Model Training / Evaluation / Prediction](#3) + - [3.1 Training](#3-1) + - [3.2 Evaluation](#3-2) + - [3.3 Prediction](#3-3) +- [4. Inference and Deployment](#4) + - [4.1 Python Inference](#4-1) + - [4.2 C++ Inference](#4-2) + - [4.3 Serving](#4-3) + - [4.4 More](#4-4) +- [5. FAQ](#5) + + +## 1. 
Introduction + +Paper: +> [RobustScanner: Dynamically Enhancing Positional Clues for Robust Text Recognition](https://arxiv.org/pdf/2007.07542.pdf) +> Xiaoyu Yue, Zhanghui Kuang, Chenhao Lin, Hongbin Sun, Wayne +Zhang +> ECCV, 2020 + +Using MJSynth and SynthText two text recognition datasets for training, and evaluating on IIIT, SVT, IC13, IC15, SVTP, CUTE datasets, the algorithm reproduction effect is as follows: + +|Model|Backbone|config|Acc|Download link| +| --- | --- | --- | --- | --- | +|RobustScanner|ResNet31|[rec_r31_robustscanner.yml](../../configs/rec/rec_r31_robustscanner.yml)|87.77%|coming soon| + +Note:In addition to using the two text recognition datasets MJSynth and SynthText, [SynthAdd](https://pan.baidu.com/share/init?surl=uV0LtoNmcxbO-0YA7Ch4dg) data (extraction code: 627x), and some real data are used in training, the specific data details can refer to the paper. + + +## 2. Environment +Please refer to ["Environment Preparation"](./environment_en.md) to configure the PaddleOCR environment, and refer to ["Project Clone"](./clone_en.md) to clone the project code. + + + +## 3. Model Training / Evaluation / Prediction + +Please refer to [Text Recognition Tutorial](./recognition_en.md). PaddleOCR modularizes the code, and training different recognition models only requires **changing the configuration file**. + +Training: + +Specifically, after the data preparation is completed, the training can be started. The training command is as follows: + +``` +#Single GPU training (long training period, not recommended) +python3 tools/train.py -c configs/rec/rec_r31_robustscanner.yml + +#Multi GPU training, specify the gpu number through the --gpus parameter +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_r31_robustscanner.yml +``` + +Evaluation: + +``` +# GPU evaluation +python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_r31_robustscanner.yml -o Global.pretrained_model={path/to/weights}/best_accuracy +``` + +Prediction: + +``` +# The configuration file used for prediction must match the training +python3 tools/infer_rec.py -c configs/rec/rec_r31_robustscanner.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.infer_img=doc/imgs_words/en/word_1.png +``` + + +## 4. Inference and Deployment + + +### 4.1 Python Inference +First, the model saved during the RobustScanner text recognition training process is converted into an inference model. you can use the following command to convert: + +``` +python3 tools/export_model.py -c configs/rec/rec_r31_robustscanner.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.save_inference_dir=./inference/rec_r31_robustscanner +``` + +For RobustScanner text recognition model inference, the following commands can be executed: + +``` +python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words/en/word_1.png" --rec_model_dir="./inference/rec_r31_robustscanner/" --rec_image_shape="3, 48, 48, 160" --rec_algorithm="RobustScanner" --rec_char_dict_path="ppocr/utils/dict90.txt" --use_space_char=False +``` + + +### 4.2 C++ Inference + +Not supported + + +### 4.3 Serving + +Not supported + + +### 4.4 More + +Not supported + + +## 5. 
FAQ + + +## Citation + +```bibtex +@article{2020RobustScanner, + title={RobustScanner: Dynamically Enhancing Positional Clues for Robust Text Recognition}, + author={Xiaoyu Yue and Zhanghui Kuang and Chenhao Lin and Hongbin Sun and Wayne Zhang}, + journal={ECCV2020}, + year={2020}, +} +``` diff --git a/doc/doc_en/algorithm_rec_sar_en.md b/doc/doc_en/algorithm_rec_sar_en.md index 24b87c10c3b2839909392bf3de0e0c850112fcdc..5c8319da3bc63dce55b0d5eae749ed4500b9d2f6 100644 --- a/doc/doc_en/algorithm_rec_sar_en.md +++ b/doc/doc_en/algorithm_rec_sar_en.md @@ -79,7 +79,7 @@ python3 tools/export_model.py -c configs/rec/rec_r31_sar.yml -o Global.pretraine For SAR text recognition model inference, the following commands can be executed: ``` -python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words/en/word_1.png" --rec_model_dir="./inference/rec_sar/" --rec_image_shape="3, 48, 48, 160" --rec_char_type="ch" --rec_algorithm="SAR" --rec_char_dict_path="ppocr/utils/dict90.txt" --max_text_length=30 --use_space_char=False +python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words/en/word_1.png" --rec_model_dir="./inference/rec_sar/" --rec_image_shape="3, 48, 48, 160" --rec_algorithm="SAR" --rec_char_dict_path="ppocr/utils/dict90.txt" --max_text_length=30 --use_space_char=False ``` diff --git a/doc/doc_en/algorithm_rec_visionlan_en.md b/doc/doc_en/algorithm_rec_visionlan_en.md index ebd02d52f4252c672b4a76c940ccdd621f5354ef..70c2ccc470af0a03485d9d234e86e384c087617f 100644 --- a/doc/doc_en/algorithm_rec_visionlan_en.md +++ b/doc/doc_en/algorithm_rec_visionlan_en.md @@ -90,7 +90,7 @@ After the conversion is successful, there are three files in the directory: For VisionLAN text recognition model inference, the following commands can be executed: ``` -python3 tools/infer/predict_rec.py --image_dir='./doc/imgs_words/en/word_2.png' --rec_model_dir='./inference/rec_r45_visionlan/' --rec_algorithm='VisionLAN' --rec_image_shape='3,64,256' --rec_char_dict_path='./ppocr/utils/dict36.txt' +python3 tools/infer/predict_rec.py --image_dir='./doc/imgs_words/en/word_2.png' --rec_model_dir='./inference/rec_r45_visionlan/' --rec_algorithm='VisionLAN' --rec_image_shape='3,64,256' --rec_char_dict_path='./ppocr/utils/ic15_dict.txt' --use_space_char=False ``` ![](../imgs_words/en/word_2.png) @@ -98,7 +98,7 @@ python3 tools/infer/predict_rec.py --image_dir='./doc/imgs_words/en/word_2.png' After executing the command, the prediction result (recognized text and score) of the image above is printed to the screen, an example is as follows: The result is as follows: ```shell -Predicts of ./doc/imgs_words/en/word_2.png:('yourself', 0.97076982) +Predicts of ./doc/imgs_words/en/word_2.png:('yourself', 0.9999493) ``` diff --git a/ppstructure/docs/kie_en.md b/doc/doc_en/algorithm_sdmgr_en.md similarity index 100% rename from ppstructure/docs/kie_en.md rename to doc/doc_en/algorithm_sdmgr_en.md diff --git a/doc/doc_en/algorithm_sr_gestalt_en.md b/doc/doc_en/algorithm_sr_gestalt_en.md new file mode 100644 index 0000000000000000000000000000000000000000..516b90cb3099c0627cf23ef608ffb7da31aacc35 --- /dev/null +++ b/doc/doc_en/algorithm_sr_gestalt_en.md @@ -0,0 +1,136 @@ +# Text Gestalt + +- [1. Introduction](#1) +- [2. Environment](#2) +- [3. Model Training / Evaluation / Prediction](#3) + - [3.1 Training](#3-1) + - [3.2 Evaluation](#3-2) + - [3.3 Prediction](#3-3) +- [4. Inference and Deployment](#4) + - [4.1 Python Inference](#4-1) + - [4.2 C++ Inference](#4-2) + - [4.3 Serving](#4-3) + - [4.4 More](#4-4) +- [5. 
FAQ](#5) + + + +## 1. Introduction + +Paper: +> [Text Gestalt: Stroke-Aware Scene Text Image Super-Resolution](https://arxiv.org/pdf/2112.08171.pdf) + +> Chen, Jingye and Yu, Haiyang and Ma, Jianqi and Li, Bin and Xue, Xiangyang + +> AAAI, 2022 + +Referring to the [FudanOCR](https://github.com/FudanVI/FudanOCR/tree/main/text-gestalt) data download instructions, the effect of the super-score algorithm on the TextZoom test set is as follows: + +|Model|Backbone|config|Acc|Download link| +|---|---|---|---|---|---| +|Text Gestalt|tsrn|19.28|0.6560| [configs/sr/sr_tsrn_transformer_strock.yml](../../configs/sr/sr_tsrn_transformer_strock.yml)|[train model](https://paddleocr.bj.bcebos.com/sr_tsrn_transformer_strock_train.tar)| + + + +## 2. Environment +Please refer to ["Environment Preparation"](./environment_en.md) to configure the PaddleOCR environment, and refer to ["Project Clone"](./clone_en.md) to clone the project code. + + + +## 3. Model Training / Evaluation / Prediction + +Please refer to [Text Recognition Tutorial](./recognition_en.md). PaddleOCR modularizes the code, and training different models only requires **changing the configuration file**. + +Training: + +Specifically, after the data preparation is completed, the training can be started. The training command is as follows: + +``` +#Single GPU training (long training period, not recommended) + +python3 tools/train.py -c configs/sr/sr_tsrn_transformer_strock.yml + +#Multi GPU training, specify the gpu number through the --gpus parameter + +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/sr/sr_tsrn_transformer_strock.yml + +``` + + +Evaluation: + +``` +# GPU evaluation +python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/sr/sr_tsrn_transformer_strock.yml -o Global.pretrained_model={path/to/weights}/best_accuracy +``` + +Prediction: + +``` +# The configuration file used for prediction must match the training + +python3 tools/infer_sr.py -c configs/sr/sr_tsrn_transformer_strock.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.infer_img=doc/imgs_words_en/word_52.png +``` + +![](../imgs_words_en/word_52.png) + +After executing the command, the super-resolution result of the above image is as follows: + +![](../imgs_results/sr_word_52.png) + + +## 4. Inference and Deployment + + +### 4.1 Python Inference + +First, the model saved during the training process is converted into an inference model. ( [Model download link](https://paddleocr.bj.bcebos.com/sr_tsrn_transformer_strock_train.tar) ), you can use the following command to convert: + +```shell +python3 tools/export_model.py -c configs/sr/sr_tsrn_transformer_strock.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.save_inference_dir=./inference/sr_out +``` + +For Text-Gestalt super-resolution model inference, the following commands can be executed: + +``` +python3 tools/infer/predict_sr.py --sr_model_dir=./inference/sr_out --image_dir=doc/imgs_words_en/word_52.png --sr_image_shape=3,32,128 + +``` + +After executing the command, the super-resolution result of the above image is as follows: + +![](../imgs_results/sr_word_52.png) + + + +### 4.2 C++ Inference + +Not supported + + +### 4.3 Serving + +Not supported + + +### 4.4 More + +Not supported + + +## 5. 
FAQ + + +## Citation + +```bibtex +@inproceedings{chen2022text, + title={Text gestalt: Stroke-aware scene text image super-resolution}, + author={Chen, Jingye and Yu, Haiyang and Ma, Jianqi and Li, Bin and Xue, Xiangyang}, + booktitle={Proceedings of the AAAI Conference on Artificial Intelligence}, + volume={36}, + number={1}, + pages={285--293}, + year={2022} +} +``` diff --git a/doc/doc_en/dataset/docvqa_datasets_en.md b/doc/doc_en/dataset/kie_datasets_en.md similarity index 59% rename from doc/doc_en/dataset/docvqa_datasets_en.md rename to doc/doc_en/dataset/kie_datasets_en.md index 820462c324318a391abe409412e8996f11b36279..7b476f77d0380496d026c448937e59b23ee24c87 100644 --- a/doc/doc_en/dataset/docvqa_datasets_en.md +++ b/doc/doc_en/dataset/kie_datasets_en.md @@ -1,7 +1,10 @@ -## DocVQA dataset -Here are the common DocVQA datasets, which are being updated continuously. Welcome to contribute datasets~ +## Key Information Extraction dataset + +Here are the common datasets key information extraction, which are being updated continuously. Welcome to contribute datasets. + - [FUNSD dataset](#funsd) - [XFUND dataset](#xfund) +- [wildreceipt dataset](#wildreceipt-dataset) #### 1. FUNSD dataset @@ -18,10 +21,29 @@ Here are the common DocVQA datasets, which are being updated continuously. Welco #### 2. XFUND dataset - **Data source**: https://github.com/doc-analysis/XFUND -- **Data introduction**: XFUND is a multilingual form comprehension dataset, which contains form data in 7 different languages, and all are manually annotated in the form of key-value pairs. The data for each language contains 199 form data, which are divided into 149 training sets and 50 test sets. Part of the image and the annotation box visualization are shown below: +- **Data introduction**: XFUND is a multilingual form comprehension dataset, which contains form data in 7 different languages, and all are manually annotated in the form of key-value pairs. The data for each language contains 199 form data, which are divided into 149 training sets and 50 test sets. Part of the image and the annotation box visualization are shown below. +
- **Download address**: https://github.com/doc-analysis/XFUND/releases/tag/v1.0 + + + +## 3. wildreceipt dataset + +- **Data source**: https://arxiv.org/abs/2103.14470 +- **Data introduction**: XFUND is an English receipt dataset, which contains 26 different categories. There are 1267 training images and 472 evaluation images, in which 50,000 textlines and boxes are annotated. Part of the image and the annotation box visualization are shown below. + +
+ + +
+
+**Note:** Boxes with category `Ignore` or `Others` are not visualized here.
+
+- **Download address**:
+  - Official dataset: [link](https://download.openmmlab.com/mmocr/data/wildreceipt.tar)
+  - Dataset converted for PaddleOCR training process: [link](https://paddleocr.bj.bcebos.com/ppstructure/dataset/wildreceipt.tar)
diff --git a/doc/doc_en/dataset/layout_datasets_en.md b/doc/doc_en/dataset/layout_datasets_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..54c88609d0f25f65b4878fac96a43de5f1cc3164
--- /dev/null
+++ b/doc/doc_en/dataset/layout_datasets_en.md
@@ -0,0 +1,55 @@
+## Layout Analysis Dataset
+
+Here are the common layout analysis datasets, which are being updated continuously. Welcome to contribute datasets.
+
+- [PubLayNet dataset](#publaynet)
+- [CDLA dataset](#CDLA)
+- [TableBank dataset](#TableBank)
+
+
+Most of the layout analysis datasets are object detection datasets. In addition to open source datasets, you can also label or synthesize datasets using tools such as [labelme](https://github.com/wkentaro/labelme).
+
+
+
+
+
+#### 1. PubLayNet dataset
+
+- **Data source**: https://github.com/ibm-aur-nlp/PubLayNet
+- **Data introduction**: The PubLayNet dataset contains 350000 training images and 11000 validation images. There are 5 categories in total, namely: `text, title, list, table, figure`. Some images and their annotations are shown below.
+
+
+ + +
+
+- **Download address**: https://developer.ibm.com/exchanges/data/all/publaynet/
+- **Note**: When using this dataset, you need to follow the [CDLA-Permissive](https://cdla.io/permissive-1-0/) license.
+
+
+
+
+#### 2. CDLA dataset
+- **Data source**: https://github.com/buptlihang/CDLA
+- **Data introduction**: The CDLA dataset contains 5,000 training images and 1,000 validation images with 10 categories, which are `Text, Title, Figure, Figure caption, Table, Table caption, Header, Footer, Reference, Equation`. Some images and their annotations are shown below.
+
+
+ + +
+
+- **Download address**: https://github.com/buptlihang/CDLA
+- **Note**: When you train a detection model on the CDLA dataset using [PaddleDetection](https://github.com/PaddlePaddle/PaddleDetection/tree/develop), you need to remove the labels `__ignore__` and `_background_`.
+
+
+
+#### 3. TableBank dataset
+- **Data source**: https://doc-analysis.github.io/tablebank-page/index.html
+- **Data introduction**: The TableBank dataset contains two types of documents: LaTeX (187,199 training images, 7,265 validation images and 5,719 testing images) and Word (73,383 training images, 2,735 validation images and 2,281 testing images). Some images and their annotations are shown below.
+
+
+ + +
+
+- **Download address**: https://doc-analysis.github.io/tablebank-page/index.html
+- **Note**: When using this dataset, you need to follow the [Apache-2.0](https://github.com/doc-analysis/TableBank/blob/master/LICENSE) license.
diff --git a/doc/doc_en/dataset/table_datasets_en.md b/doc/doc_en/dataset/table_datasets_en.md
index e30147909812a153f311add50f0bef5d1d1e0e32..70ca8309798994c6225ab0c10d4689da2387962b 100644
--- a/doc/doc_en/dataset/table_datasets_en.md
+++ b/doc/doc_en/dataset/table_datasets_en.md
@@ -3,6 +3,7 @@
- [Dataset Summary](#dataset-summary)
- [1. PubTabNet](#1-pubtabnet)
- [2. TAL Table Recognition Competition Dataset](#2-tal-table-recognition-competition-dataset)
+- [3. WTW Chinese scene table dataset](#3-wtw-chinese-scene-table-dataset)
Here are the commonly used table recognition datasets, which are being updated continuously. Welcome to contribute datasets~
@@ -12,6 +13,7 @@ Here are the commonly used table recognition datasets, which are being updated c
|---|---|---|
| PubTabNet |https://github.com/ibm-aur-nlp/PubTabNet| jsonl format, which can be loaded directly with [pubtab_dataset.py](../../../ppocr/data/pubtab_dataset.py) |
| TAL Table Recognition Competition Dataset |https://ai.100tal.com/dataset| jsonl format, which can be loaded directly with [pubtab_dataset.py](../../../ppocr/data/pubtab_dataset.py) |
+| WTW Chinese scene table dataset |https://github.com/wangwen-whu/WTW-Dataset| Conversion is required to load with [pubtab_dataset.py](../../../ppocr/data/pubtab_dataset.py)|
## 1. PubTabNet
- **Data Introduction**:The training set of the PubTabNet dataset contains 500,000 images and the validation set contains 9000 images. Part of the image visualization is shown below.
@@ -30,3 +32,11 @@ Here are the commonly used table recognition datasets, which are being updated c
+
+## 3. WTW Chinese scene table dataset
+- **Data Introduction**:The WTW Chinese scene table dataset consists of two parts: table detection and table data. The dataset contains images of two scenes, scanned and photographed.
+https://github.com/wangwen-whu/WTW-Dataset/blob/main/demo/20210816_210413.gif
+
+
+ +
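+
+As a quick way to check the jsonl labels referenced in the summary table above, the file can be inspected with a few lines of Python. This is only an illustrative sketch: `train.jsonl` is a placeholder path, and the keys follow the per-line structure described in `doc_en/table_recognition_en.md` (`filename`, `html.structure.tokens`, `html.cell`).
+
+```python
+import json
+
+def load_table_annotations(label_path):
+    """Read a PubTabNet-style jsonl label file: one JSON object per line, one table image each."""
+    samples = []
+    with open(label_path, "r", encoding="utf-8") as f:
+        for line in f:
+            if not line.strip():
+                continue
+            info = json.loads(line.strip())
+            structure_tokens = info["html"]["structure"]["tokens"]  # table structure as HTML tokens
+            cells = info["html"]["cell"]                            # per-cell text tokens and bounding boxes
+            samples.append((info["filename"], structure_tokens, cells))
+    return samples
+
+if __name__ == "__main__":
+    for filename, tokens, cells in load_table_annotations("train.jsonl")[:3]:
+        print(filename, len(tokens), "structure tokens,", len(cells), "cells")
+```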
diff --git a/doc/doc_en/inference_args_en.md b/doc/doc_en/inference_args_en.md new file mode 100644 index 0000000000000000000000000000000000000000..f2c99fc8297d47f27a219bf7d8e7f2ea518257f0 --- /dev/null +++ b/doc/doc_en/inference_args_en.md @@ -0,0 +1,120 @@ +# PaddleOCR Model Inference Parameter Explanation + +When using PaddleOCR for model inference, you can customize the modification parameters to modify the model, data, preprocessing, postprocessing, etc.(parameter file:[utility.py](../../tools/infer/utility.py)),The detailed parameter explanation is as follows: + +* Global parameters + +| parameters | type | default | implication | +| :--: | :--: | :--: | :--: | +| image_dir | str | None, must be specified explicitly | Image or folder path | +| vis_font_path | str | "./doc/fonts/simfang.ttf" | font path for visualization | +| drop_score | float | 0.5 | Results with a recognition score less than this value will be discarded and will not be returned as results | +| use_pdserving | bool | False | Whether to use Paddle Serving for prediction | +| warmup | bool | False | Whether to enable warmup, this method can be used when statistical prediction time | +| draw_img_save_dir | str | "./inference_results" | The saving folder of the system's tandem prediction OCR results | +| save_crop_res | bool | False | Whether to save the recognized text image for OCR | +| crop_res_save_dir | str | "./output" | Save the text image path recognized by OCR | +| use_mp | bool | False | Whether to enable multi-process prediction | +| total_process_num | int | 6 | The number of processes, which takes effect when `use_mp` is `True` | +| process_id | int | 0 | The id number of the current process, no need to modify it yourself | +| benchmark | bool | False | Whether to enable benchmark, and make statistics on prediction speed, memory usage, etc. | +| save_log_path | str | "./log_output/" | Folder where log results are saved when `benchmark` is enabled | +| show_log | bool | True | Whether to show the log information in the inference | +| use_onnx | bool | False | Whether to enable onnx prediction | + + +* Prediction engine related parameters + +| parameters | type | default | implication | +| :--: | :--: | :--: | :--: | +| use_gpu | bool | True | Whether to use GPU for prediction | +| ir_optim | bool | True | Whether to analyze and optimize the calculation graph. The prediction process can be accelerated when `ir_optim` is enabled | +| use_tensorrt | bool | False | Whether to enable tensorrt | +| min_subgraph_size | int | 15 | The minimum subgraph size in tensorrt. When the size of the subgraph is greater than this value, it will try to use the trt engine to calculate the subgraph. | +| precision | str | fp32 | The precision of prediction, supports `fp32`, `fp16`, `int8` | +| enable_mkldnn | bool | True | Whether to enable mkldnn | +| cpu_threads | int | 10 | When mkldnn is enabled, the number of threads predicted by the cpu | + +* Text detection model related parameters + +| parameters | type | default | implication | +| :--: | :--: | :--: | :--: | +| det_algorithm | str | "DB" | Text detection algorithm name, currently supports `DB`, `EAST`, `SAST`, `PSE`, `DB++`, `FCE` | +| det_model_dir | str | xx | Detection inference model paths | +| det_limit_side_len | int | 960 | image side length limit | +| det_limit_type | str | "max" | The side length limit type, currently supports `min`and `max`. 
`min` means to ensure that the shortest side of the image is not less than `det_limit_side_len`, `max` means to ensure that the longest side of the image is not greater than `det_limit_side_len` | + +The relevant parameters of the DB algorithm are as follows + +| parameters | type | default | implication | +| :--: | :--: | :--: | :--: | +| det_db_thresh | float | 0.3 | In the probability map output by DB, only pixels with a score greater than this threshold will be considered as text pixels | +| det_db_box_thresh | float | 0.6 | Within the detection box, when the average score of all pixels is greater than the threshold, the result will be considered as a text area | +| det_db_unclip_ratio | float | 1.5 | The expansion factor of the `Vatti clipping` algorithm, which is used to expand the text area | +| max_batch_size | int | 10 | max batch size | +| use_dilation | bool | False | Whether to inflate the segmentation results to obtain better detection results | +| det_db_score_mode | str | "fast" | DB detection result score calculation method, supports `fast` and `slow`, `fast` calculates the average score according to all pixels within the bounding rectangle of the polygon, `slow` calculates the average score according to all pixels within the original polygon, The calculation speed is relatively slower, but more accurate. | + +The relevant parameters of the EAST algorithm are as follows + +| parameters | type | default | implication | +| :--: | :--: | :--: | :--: | +| det_east_score_thresh | float | 0.8 | Threshold for score map in EAST postprocess | +| det_east_cover_thresh | float | 0.1 | Average score threshold for text boxes in EAST postprocess | +| det_east_nms_thresh | float | 0.2 | Threshold of nms in EAST postprocess | + +The relevant parameters of the SAST algorithm are as follows + +| parameters | type | default | implication | +| :--: | :--: | :--: | :--: | +| det_sast_score_thresh | float | 0.5 | Score thresholds in SAST postprocess | +| det_sast_nms_thresh | float | 0.5 | Thresholding of nms in SAST postprocess | +| det_sast_polygon | bool | False | Whether polygon detection, curved text scene (such as Total-Text) is set to True | + +The relevant parameters of the PSE algorithm are as follows + +| parameters | type | default | implication | +| :--: | :--: | :--: | :--: | +| det_pse_thresh | float | 0.0 | Threshold for binarizing the output image | +| det_pse_box_thresh | float | 0.85 | Threshold for filtering boxes, below this threshold is discarded | +| det_pse_min_area | float | 16 | The minimum area of the box, below this threshold is discarded | +| det_pse_box_type | str | "box" | The type of the returned box, box: four point coordinates, poly: all point coordinates of the curved text | +| det_pse_scale | int | 1 | The ratio of the input image relative to the post-processed image, such as an image of `640*640`, the network output is `160*160`, and when the scale is 2, the shape of the post-processed image is `320*320`. 
Increasing this value can speed up the post-processing speed, but it will bring about a decrease in accuracy | + +* Text recognition model related parameters + +| parameters | type | default | implication | +| :--: | :--: | :--: | :--: | +| rec_algorithm | str | "CRNN" | Text recognition algorithm name, currently supports `CRNN`, `SRN`, `RARE`, `NETR`, `SAR`, `ViTSTR`, `ABINet`, `VisionLAN`, `SPIN`, `RobustScanner`, `SVTR`, `SVTR_LCNet` | +| rec_model_dir | str | None, it is required if using the recognition model | recognition inference model paths | +| rec_image_shape | list | [3, 48, 320] | Image size at the time of recognition | +| rec_batch_num | int | 6 | batch size | +| max_text_length | int | 25 | The maximum length of the recognition result, valid in `SRN` | +| rec_char_dict_path | str | "./ppocr/utils/ppocr_keys_v1.txt" | character dictionary file | +| use_space_char | bool | True | Whether to include spaces, if `True`, the `space` character will be added at the end of the character dictionary | + + +* End-to-end text detection and recognition model related parameters + +| parameters | type | default | implication | +| :--: | :--: | :--: | :--: | +| e2e_algorithm | str | "PGNet" | End-to-end algorithm name, currently supports `PGNet` | +| e2e_model_dir | str | None, it is required if using the end-to-end model | end-to-end model inference model path | +| e2e_limit_side_len | int | 768 | End-to-end input image side length limit | +| e2e_limit_type | str | "max" | End-to-end side length limit type, currently supports `min` and `max`. `min` means to ensure that the shortest side of the image is not less than `e2e_limit_side_len`, `max` means to ensure that the longest side of the image is not greater than `e2e_limit_side_len` | +| e2e_pgnet_score_thresh | float | 0.5 | End-to-end score threshold, results below this threshold are discarded | +| e2e_char_dict_path | str | "./ppocr/utils/ic15_dict.txt" | Recognition dictionary file path | +| e2e_pgnet_valid_set | str | "totaltext" | The name of the validation set, currently supports `totaltext`, `partvgg`, the post-processing methods corresponding to different data sets are different, and it can be consistent with the training process | +| e2e_pgnet_mode | str | "fast" | PGNet's detection result score calculation method, supports `fast` and `slow`, `fast` calculates the average score according to all pixels within the bounding rectangle of the polygon, `slow` calculates the average score according to all pixels within the original polygon, The calculation speed is relatively slower, but more accurate. 
| + + +* Angle classifier model related parameters + +| parameters | type | default | implication | +| :--: | :--: | :--: | :--: | +| use_angle_cls | bool | False | whether to use an angle classifier | +| cls_model_dir | str | None, if you need to use, you must specify the path explicitly | angle classifier inference model path | +| cls_image_shape | list | [3, 48, 192] | prediction shape | +| label_list | list | ['0', '180'] | The angle value corresponding to the class id | +| cls_batch_num | int | 6 | batch size | +| cls_thresh | float | 0.9 | Prediction threshold, when the model prediction result is 180 degrees, and the score is greater than the threshold, the final prediction result is considered to be 180 degrees and needs to be flipped | diff --git a/doc/doc_en/inference_ppocr_en.md b/doc/doc_en/inference_ppocr_en.md index 0f57b0ba6b226c19ecb1e0b60afdfa34302b8e78..4c9db51e1d23e5ac05cfcb3ec43748df75c0b36c 100755 --- a/doc/doc_en/inference_ppocr_en.md +++ b/doc/doc_en/inference_ppocr_en.md @@ -160,3 +160,5 @@ python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --de After executing the command, the recognition result image is as follows: ![](../imgs_results/system_res_00018069_v3.jpg) + +For more configuration and explanation of inference parameters, please refer to:[Model Inference Parameters Explained Tutorial](./inference_args_en.md)。 diff --git a/doc/doc_en/kie_en.md b/doc/doc_en/kie_en.md new file mode 100644 index 0000000000000000000000000000000000000000..0c335a5ceb8991b80bc0cab6facdf402878abb50 --- /dev/null +++ b/doc/doc_en/kie_en.md @@ -0,0 +1,491 @@ +# Key Information Extraction + +This tutorial provides a guide to the whole process of key information extraction using PaddleOCR, including data preparation, model training, optimization, evaluation, prediction of semantic entity recognition (SER) and relationship extraction (RE) tasks. + + +- [1. Data Preparation](#Data-Preparation) + - [1.1. Prepare for dataset](#11-Prepare-for-dataset) + - [1.2. Custom Dataset](#12-Custom-Dataset) + - [1.3. Download data](#13-Download-data) +- [2. Training](#2-Training) + - [2.1. Start Training](#21-start-training) + - [2.2. Resume Training](#22-Resume-Training) + - [2.3. Mixed Precision Training](#23-Mixed-Precision-Training) + - [2.4. Distributed Training](#24-Distributed-Training) + - [2.5. Train using knowledge distillation](#25-Train-using-knowledge-distillation) + - [2.6. Training on other platform](#26-Training-on-other-platform) +- [3. Evaluation and Test](#3-Evaluation-and-Test) + - [3.1. Evaluation](#31-指标评估) + - [3.2. Test](#32-Test) +- [4. Model inference](#4-Model-inference) +- [5. FAQ](#5-faq) + + +# 1. Data Preparation + +## 1.1. Prepare for dataset + +PaddleOCR supports the following data format when training KIE models. + +- `general data` is used to train a dataset whose annotation is stored in a text file (SimpleDataset). + + +The default storage path of training data is `PaddleOCR/train_data`. If you already have datasets on your disk, you only need to create a soft link to the dataset directory. + +``` +# linux and mac os +ln -sf /train_data/dataset +# windows +mklink /d /train_data/dataset +``` + +## 1.2. Custom Dataset + +The training process generally includes the training set and the evaluation set. The data formats of the two sets are same. + +**(1) Training set** + +It is recommended to put the training images into the same folder, record the path and annotation of images in a text file. 
The contents of the text file are as follows: + + +```py +" image path annotation information " +zh_train_0.jpg [{"transcription": "汇丰晋信", "label": "other", "points": [[104, 114], [530, 114], [530, 175], [104, 175]], "id": 1, "linking": []}, {"transcription": "受理时间:", "label": "question", "points": [[126, 267], [266, 267], [266, 305], [126, 305]], "id": 7, "linking": [[7, 13]]}, {"transcription": "2020.6.15", "label": "answer", "points": [[321, 239], [537, 239], [537, 285], [321, 285]], "id": 13, "linking": [[7, 13]]}] +zh_train_1.jpg [{"transcription": "中国人体器官捐献", "label": "other", "points": [[544, 459], [954, 459], [954, 517], [544, 517]], "id": 1, "linking": []}, {"transcription": ">编号:MC545715483585", "label": "other", "points": [[1462, 470], [2054, 470], [2054, 543], [1462, 543]], "id": 10, "linking": []}, {"transcription": "CHINAORGANDONATION", "label": "other", "points": [[543, 516], [958, 516], [958, 551], [543, 551]], "id": 14, "linking": []}, {"transcription": "中国人体器官捐献志愿登记表", "label": "header", "points": [[635, 793], [1892, 793], [1892, 904], [635, 904]], "id": 18, "linking": []}] +... +``` + +**Note:** In the text file, please split the image path and annotation with `\t`. Otherwise, error will happen when training. + +The annotation can be parsed by `json` into a list of sub-annotations. Each element in the list is a dict, which stores the required information of each text line. The required fields are as follows. + +- transcription: stores the text content of the text line +- label: the category of the text line content +- points: stores the four point position information of the text line +- id: stores the ID information of the text line for RE model training +- linking: stores the connection information between text lines for RE model training + +**(2) Evaluation set** + +The evaluation set is constructed in the same way as the training set. + +**(3) Dictionary file** + +The textlines in the training set and the evaluation set contain label information. The list of all labels is stored in the dictionary file (such as `class_list.txt`). Each line in the dictionary file is represented as a label name. + +For example, FUND_zh data contains four categories. The contents of the dictionary file are as follows. + +``` +OTHER +QUESTION +ANSWER +HEADER +``` + +In the annotation file, the annotation information of the `label` field of the text line content of each annotation needs to belong to the dictionary content. + + +The final dataset shall have the following file structure. + +``` +|-train_data + |-data_name + |- train.json + |- train + |- zh_train_0.png + |- zh_train_1.jpg + | ... + |- val.json + |- val + |- zh_val_0.png + |- zh_val_1.jpg + | ... +``` + +**Note:** + +-The category information in the annotation file is not case sensitive. For example, 'HEADER' and 'header' will be seen as the same category ID. +- In the dictionary file, it is recommended to put the `other` category (other textlines that need not be paid attention to can be labeled as `other`) on the first line. When parsing, the category ID of the 'other' category will be resolved to 0, and the textlines predicted as `other` will not be visualized later. + +## 1.3. 
Download data
+
+If you do not have a local dataset, you can download the source files of [XFUND](https://github.com/doc-analysis/XFUND) or [FUNSD](https://guillaumejaume.github.io/FUNSD) and use the scripts of [XFUND](../../ppstructure/kie/tools/trans_xfun_data.py) or [FUNSD](../../ppstructure/kie/tools/trans_funsd_label.py) to transform them into the PaddleOCR format. Then you can use the public dataset to quickly experience KIE.
+
+For more information about public KIE datasets, please refer to [KIE dataset tutorial](./dataset/kie_datasets_en.md).
+
+PaddleOCR also supports the annotation of KIE models. Please refer to [PPOCRLabel tutorial](../../PPOCRLabel/README.md).
+
+# 2. Training
+
+PaddleOCR provides training scripts, evaluation scripts and inference scripts. This section will take the VI-LayoutXLM multimodal pre-trained model as an example to introduce them.
+
+> If you want to use the SDMGR based KIE algorithm, please refer to: [SDMGR tutorial](./algorithm_kie_sdmgr_en.md).
+
+
+## 2.1. Start Training
+
+If you do not use a custom dataset, you can use the XFUND_zh dataset that has already been processed into the PaddleOCR format for a quick experience.
+
+
+```bash
+mkdir train_data
+cd train_data
+wget https://paddleocr.bj.bcebos.com/ppstructure/dataset/XFUND.tar && tar -xf XFUND.tar
+cd ..
+```
+
+If you don't want to train and just want to experience model evaluation, prediction and inference directly, you can download the trained model provided in PaddleOCR and skip section 2.1.
+
+
+Use the following command to download the trained model.
+
+```bash
+mkdir pretrained_model
+cd pretrained_model
+# download and uncompress SER model
+wget https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_pretrained.tar && tar -xf ser_vi_layoutxlm_xfund_pretrained.tar
+
+# download and uncompress RE model
+wget https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/re_vi_layoutxlm_xfund_pretrained.tar && tar -xf re_vi_layoutxlm_xfund_pretrained.tar
+```
+
+Start training:
+
+- If you installed the CPU version of PaddlePaddle, you need to set `Global.use_gpu=False` in your config file.
+- During training, PaddleOCR will download the VI-LayoutXLM pre-trained model by default. There is no need to download it in advance.
+
+```bash
+# GPU training, support single card and multi-cards
+# The training log will be saved in "{Global.save_model_dir}/train.log"
+
+# train SER model using single card
+python3 tools/train.py -c configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml
+
+# train SER model using multi-cards, you can use --gpus to assign the GPU ids.
+python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml
+
+# train RE model using single card
+python3 tools/train.py -c configs/kie/vi_layoutxlm/re_vi_layoutxlm_xfund_zh.yml
+```
+
+Take the SER model training as an example. After the training is started, you will see the following log output.
+ +``` +[2022/08/08 16:28:28] ppocr INFO: epoch: [1/200], global_step: 10, lr: 0.000006, loss: 1.871535, avg_reader_cost: 0.28200 s, avg_batch_cost: 0.82318 s, avg_samples: 8.0, ips: 9.71838 samples/s, eta: 0:51:59 +[2022/08/08 16:28:33] ppocr INFO: epoch: [1/200], global_step: 19, lr: 0.000018, loss: 1.461939, avg_reader_cost: 0.00042 s, avg_batch_cost: 0.32037 s, avg_samples: 6.9, ips: 21.53773 samples/s, eta: 0:37:55 +[2022/08/08 16:28:39] ppocr INFO: cur metric, precision: 0.11526348939743859, recall: 0.19776657060518732, hmean: 0.14564265817747712, fps: 34.008392345050055 +[2022/08/08 16:28:45] ppocr INFO: save best model is to ./output/ser_vi_layoutxlm_xfund_zh/best_accuracy +[2022/08/08 16:28:45] ppocr INFO: best metric, hmean: 0.14564265817747712, precision: 0.11526348939743859, recall: 0.19776657060518732, fps: 34.008392345050055, best_epoch: 1 +[2022/08/08 16:28:51] ppocr INFO: save model in ./output/ser_vi_layoutxlm_xfund_zh/latest +``` + +The following information will be automatically printed. + + +|Field | meaning| +| :----: | :------: | +|epoch | current iteration round| +|iter | current iteration times| +|lr | current learning rate| +|loss | current loss function| +| reader_cost | current batch data processing time| +| batch_ Cost | total current batch time| +|samples | number of samples in the current batch| +|ips | number of samples processed per second| + + +PaddleOCR supports evaluation during training. you can modify `eval_batch_step` in the config file `configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml` (default as 19 iters). Trained model with best hmean will be saved as `output/ser_vi_layoutxlm_xfund_zh/best_accuracy/`. + +If the evaluation dataset is very large, it's recommended to enlarge the eval interval or evaluate the model after training. + +**Note:** for more KIE models training and configuration files, you can go into `configs/kie/` or refer to [Frontier KIE algorithms](./algorithm_overview_en.md). + + +If you want to train model on your own dataset, you need to modify the data path, dictionary file and category number in the configuration file. + + +Take `configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml` as an example, contents we need to fix is as follows. + +```yaml +Architecture: + # ... + Backbone: + name: LayoutXLMForSer + pretrained: True + mode: vi + # Assuming that n categroies are included in the dictionary file (other is included), the the num_classes is set as 2n-1 + num_classes: &num_classes 7 + +PostProcess: + name: kieSerTokenLayoutLMPostProcess + # Modify the dictionary file path for your custom dataset + class_path: &class_path train_data/XFUND/class_list_xfun.txt + +Train: + dataset: + name: SimpleDataSet + # Modify the data path for your training dataset + data_dir: train_data/XFUND/zh_train/image + # Modify the data annotation path for your training dataset + label_file_list: + - train_data/XFUND/zh_train/train.json + ... + loader: + # batch size for single card when training + batch_size_per_card: 8 + ... + +Eval: + dataset: + name: SimpleDataSet + # Modify the data path for your evaluation dataset + data_dir: train_data/XFUND/zh_val/image + # Modify the data annotation path for your evaluation dataset + label_file_list: + - train_data/XFUND/zh_val/val.json + ... + loader: + # batch size for single card when evaluation + batch_size_per_card: 8 +``` + +**Note that the configuration file for prediction/evaluation must be consistent with the training file.** + + +## 2.2. 
Resume Training
+
+If the training process is interrupted and you want to load the saved model to resume training, you can specify the path of the model to be loaded by specifying `Architecture.Backbone.checkpoints`.
+
+
+```bash
+python3 tools/train.py -c configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml -o Architecture.Backbone.checkpoints=./output/ser_vi_layoutxlm_xfund_zh/best_accuracy
+```
+
+**Note:**
+
+- Priority of `Architecture.Backbone.checkpoints` is higher than `Architecture.Backbone.pretrained`. You need to set `Architecture.Backbone.checkpoints` for model fine-tuning, resuming training and evaluation. If you want to train with the NLP pre-trained model, you need to set `Architecture.Backbone.pretrained` as `True` and set `Architecture.Backbone.checkpoints` as null (`null`).
+- PaddleNLP pre-trained models are used here for the LayoutXLM series models, and the model loading and saving logic is the same as that in PaddleNLP. Therefore we do not need to set `Global.pretrained_model` or `Global.checkpoints` here.
+- If you use knowledge distillation to train the LayoutXLM series models, resuming training is not supported now.
+
+## 2.3. Mixed Precision Training
+
+coming soon!
+
+## 2.4. Distributed Training
+
+During multi-machine multi-gpu training, use the `--ips` parameter to set the used machine IP address, and the `--gpus` parameter to set the used GPU ID:
+
+```bash
+python3 -m paddle.distributed.launch --ips="xx.xx.xx.xx,xx.xx.xx.xx" --gpus '0,1,2,3' tools/train.py -c configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml
+```
+
+**Note:** (1) When using multi-machine and multi-gpu training, you need to replace the ips value in the above command with the address of your machine, and the machines need to be able to ping each other. (2) Training needs to be launched separately on multiple machines. The command to view the ip address of the machine is `ifconfig`. (3) For more details about the distributed training speedup ratio, please refer to [Distributed Training Tutorial](./distributed_training_en.md).
+
+
+## 2.5. Train using knowledge distillation
+
+Knowledge distillation is supported in PaddleOCR for the KIE model training process. The configuration file is [ser_vi_layoutxlm_xfund_zh_udml.yml](../../configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh_udml.yml). For more information, please refer to [doc](./knowledge_distillation_en.md).
+
+**Note:** The saving and loading logic of the LayoutXLM series KIE models in PaddleOCR is consistent with PaddleNLP, so only the parameters of the student model are saved in the distillation process. If you want to use the saved model for evaluation, you need to use the configuration of the student model (the student model corresponding to the distillation file above is [ser_vi_layoutxlm_xfund_zh.yml](../../configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml)).
+
+
+
+## 2.6. Training on other platform
+
+- Windows GPU/CPU
+The Windows platform is slightly different from the Linux platform:
+Windows platform only supports `single gpu` training and inference; specify the GPU for training with `set CUDA_VISIBLE_DEVICES=0`.
+On the Windows platform, DataLoader only supports single-process mode, so you need to set `num_workers` to 0;
+
+- macOS
+GPU mode is not supported; you need to set `use_gpu` to False in the configuration file, and the rest of the training, evaluation and prediction commands are exactly the same as for Linux GPU.
+
+- Linux DCU
+Running on a DCU device requires setting the environment variable `export HIP_VISIBLE_DEVICES=0,1,2,3`, and the rest of the training and evaluation prediction commands are exactly the same as for Linux GPU.
+
+# 3. Evaluation and Test
+
+## 3.1. Evaluation
+
+The trained model will be saved in `Global.save_model_dir`. When evaluating, you need to set `Architecture.Backbone.checkpoints` to your model directory. The evaluation dataset can be set by modifying the `Eval.dataset.label_file_list` field in the `configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml` file.
+
+
+```bash
+# GPU evaluation, Global.checkpoints is the weight to be tested
+python3 tools/eval.py -c configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml -o Architecture.Backbone.checkpoints=./output/ser_vi_layoutxlm_xfund_zh/best_accuracy
+```
+
+The following information will be printed, such as precision, recall, hmean and so on.
+
+```py
+[2022/08/09 07:59:28] ppocr INFO: metric eval ***************
+[2022/08/09 07:59:28] ppocr INFO: precision:0.697476609016161
+[2022/08/09 07:59:28] ppocr INFO: recall:0.8861671469740634
+[2022/08/09 07:59:28] ppocr INFO: hmean:0.7805806758686339
+[2022/08/09 07:59:28] ppocr INFO: fps:17.367364606899105
+```
+
+
+## 3.2. Test
+
+Using the model trained by PaddleOCR, we can quickly get predictions through the following script.
+
+The default prediction image is stored in `Global.infer_img`, and the trained model weight is specified via `-o Global.checkpoints`.
+
+According to the `Global.save_model_dir` and `save_epoch_step` fields set in the configuration file, the following parameters will be saved.
+
+
+```
+output/ser_vi_layoutxlm_xfund_zh/
+├── best_accuracy
+  ├── metric.states
+  ├── model_config.json
+  ├── model_state.pdparams
+├── best_accuracy.pdopt
+├── config.yml
+├── train.log
+├── latest
+  ├── metric.states
+  ├── model_config.json
+  ├── model_state.pdparams
+├── latest.pdopt
+```
+
+Among them, best_accuracy.* is the best model on the evaluation set; latest.* is the model of the last epoch.
+
+The configuration file for prediction must be consistent with the training file. If you finished the training process using `python3 tools/train.py -c configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml`, you can use the following command for prediction.
+
+
+```bash
+python3 tools/infer_kie_token_ser.py -c configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml -o Architecture.Backbone.checkpoints=./output/ser_vi_layoutxlm_xfund_zh/best_accuracy Global.infer_img=./ppstructure/docs/kie/input/zh_val_42.jpg
+```
+
+The output image is as follows, which is also saved in `Global.save_res_path`.
+
+ +
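+
+Before moving on to the two prediction modes below, it can be convenient to sanity-check the annotation file described in section 1.2. The following is only a small sketch; `train_data/XFUND/zh_val/val.json` is the XFUND_zh validation file used elsewhere in this tutorial, so adjust the path to your own data.
+
+```python
+import json
+
+def load_kie_labels(label_path):
+    """Read a KIE label file where every line is '<image path>\t<json list of text-line dicts>'."""
+    samples = []
+    with open(label_path, "r", encoding="utf-8") as f:
+        for line in f:
+            if not line.strip():
+                continue
+            image_path, annotation = line.strip().split("\t", 1)
+            samples.append((image_path, json.loads(annotation)))
+    return samples
+
+if __name__ == "__main__":
+    for image_path, text_lines in load_kie_labels("train_data/XFUND/zh_val/val.json")[:2]:
+        labels = sorted({t["label"] for t in text_lines})
+        print(image_path, "->", len(text_lines), "text lines, labels:", labels)
+```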
+ +During the prediction process, the detection and recognition model of PP-OCRv3 will be loaded by default for information extraction of OCR. If you want to load the OCR results obtained in advance, you can use the following method to predict, and specify `Global.infer_img` as the annotation file, which contains the image path and OCR information, and specifies `Global.infer_mode` as False, indicating that the OCR inference engine is not used at this time. + +```bash +python3 tools/infer_kie_token_ser.py -c configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml -o Architecture.Backbone.checkpoints=./output/ser_vi_layoutxlm_xfund_zh/best_accuracy Global.infer_img=./train_data/XFUND/zh_val/val.json Global.infer_mode=False +``` + +For the above image, if information extraction is performed using the labeled OCR results, the prediction results are as follows. + +
+ +
+ +It can be seen that part of the detection information is more accurate, but the overall information extraction results are basically the same. + +In RE model prediction, the SER model result needs to be given first, so the configuration file and model weight of SER need to be loaded at the same time, as shown in the following example. + +```bash +python3 ./tools/infer_kie_token_ser_re.py \ + -c configs/kie/vi_layoutxlm/re_vi_layoutxlm_xfund_zh.yml \ + -o Architecture.Backbone.checkpoints=./pretrain_models/re_vi_layoutxlm_udml_xfund_zh/best_accuracy/ \ + Global.infer_img=./train_data/XFUND/zh_val/image/ \ + -c_ser configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml \ + -o_ser Architecture.Backbone.checkpoints=pretrain_models/ \ + ser_vi_layoutxlm_udml_xfund_zh/best_accuracy/ +``` + +The result is as follows. + +
+ +
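+
+To make the relation extraction results above more concrete, the ground-truth `linking` field from the label format in section 1.2 can be turned into question-answer pairs with a short script. This is only an illustrative sketch based on the annotated links (not the model's own postprocessing), and it reuses the XFUND_zh validation file path from this tutorial.
+
+```python
+import json
+
+def extract_qa_pairs(label_line):
+    """Pair question/answer text lines of one image via the annotated 'linking' ids."""
+    image_path, annotation = label_line.strip().split("\t", 1)
+    text_lines = json.loads(annotation)
+    by_id = {t["id"]: t for t in text_lines}
+    pairs = set()
+    for t in text_lines:
+        for src, dst in t.get("linking", []):
+            if (src in by_id and dst in by_id
+                    and by_id[src]["label"].lower() == "question"
+                    and by_id[dst]["label"].lower() == "answer"):
+                pairs.add((by_id[src]["transcription"], by_id[dst]["transcription"]))
+    return image_path, sorted(pairs)
+
+if __name__ == "__main__":
+    with open("train_data/XFUND/zh_val/val.json", "r", encoding="utf-8") as f:
+        for line in list(f)[:2]:
+            if line.strip():
+                print(extract_qa_pairs(line))
+```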
+ + +If you want to load the OCR results obtained in advance, you can use the following method to predict, and specify `Global.infer_img` as the annotation file, which contains the image path and OCR information, and specifies `Global.infer_mode` as False, indicating that the OCR inference engine is not used at this time. + +```bash +python3 ./tools/infer_kie_token_ser_re.py \ + -c configs/kie/vi_layoutxlm/re_vi_layoutxlm_xfund_zh.yml \ + -o Architecture.Backbone.checkpoints=./pretrain_models/re_vi_layoutxlm_udml_xfund_zh/best_accuracy/ \ + Global.infer_img=./train_data/XFUND/zh_val/val.json \ + Global.infer_mode=False \ + -c_ser configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml \ + -o_ser Architecture.Backbone.checkpoints=pretrain_models/ser_vi_layoutxlm_udml_xfund_zh/best_accuracy/ +``` + +`c_ser` denotes SER configurations file, `o_ser` denotes the SER model configurations that will override corresponding content in the file. + + +The result is as follows. + +
+ +
+ + +It can be seen that the re prediction results directly using the annotated OCR results are more accurate. + + +# 4. Model inference + + +## 4.1 Export the model + +The inference model (the model saved by `paddle.jit.save`) is generally a solidified model saved after the model training is completed, and is mostly used to give prediction in deployment. + +The model saved during the training process is the checkpoints model, which saves the parameters of the model and is mostly used to resume training. + +Compared with the checkpoints model, the inference model will additionally save the structural information of the model. Therefore, it is easier to deploy because the model structure and model parameters are already solidified in the inference model file, and is suitable for integration with actual systems. + +The SER model can be converted to the inference model using the following command. + + +```bash +# -c Set the training algorithm yml configuration file. +# -o Set optional parameters. +# Architecture.Backbone.checkpoints Set the training model address. +# Global.save_inference_dir Set the address where the converted model will be saved. +python3 tools/export_model.py -c configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml -o Architecture.Backbone.checkpoints=./output/ser_vi_layoutxlm_xfund_zh/best_accuracy Global.save_inference_dir=./inference/ser_vi_layoutxlm +``` + +After the conversion is successful, there are three files in the model save directory: + +``` +inference/ser_vi_layoutxlm/ + ├── inference.pdiparams # The parameter file of recognition inference model + ├── inference.pdiparams.info # The parameter information of recognition inference model, which can be ignored + └── inference.pdmodel # The program file of recognition +``` + +Export of RE model is also in adaptation. + +## 4.2 Model inference + +The VI layoutxlm model performs reasoning based on the ser task, and can execute the following commands: + + +Using the following command to infer the VI-LayoutXLM model. + +```bash +cd ppstructure +python3 kie/predict_kie_token_ser.py \ + --kie_algorithm=LayoutXLM \ + --ser_model_dir=../inference/ser_vi_layoutxlm \ + --image_dir=./docs/kie/input/zh_val_42.jpg \ + --ser_dict_path=../train_data/XFUND/class_list_xfun.txt \ + --vis_font_path=../doc/fonts/simfang.ttf \ + --ocr_order_method="tb-yx" +``` + +The visualized result will be saved in `./output`, which is shown as follows. + +
+ +
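+
+As a quick sanity check after export, the solidified model can also be opened directly with the Paddle Inference API. The sketch below only loads the graph and prints its input and output names; the real preprocessing and postprocessing are still handled by `predict_kie_token_ser.py`, and the model directory is the one produced in section 4.1 (adjust the path if yours differs).
+
+```python
+from paddle.inference import Config, create_predictor
+
+model_dir = "./inference/ser_vi_layoutxlm"
+config = Config(model_dir + "/inference.pdmodel", model_dir + "/inference.pdiparams")
+config.disable_gpu()  # keep this sketch on CPU
+predictor = create_predictor(config)
+
+# The exported (solidified) graph exposes named inputs and outputs that the deployment script feeds.
+print("inputs :", predictor.get_input_names())
+print("outputs:", predictor.get_output_names())
+```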
+ + +# 5. FAQ + +Q1: After the training model is transferred to the inference model, the prediction effect is inconsistent? + +**A**:The problems are mostly caused by inconsistent preprocessing and postprocessing parameters when the trained model predicts and the preprocessing and postprocessing parameters when the inference model predicts. You can compare whether there are differences in preprocessing, postprocessing, and prediction in the configuration files used for training. diff --git a/doc/doc_en/quickstart_en.md b/doc/doc_en/quickstart_en.md index c678dc47625f4289a93621144bf5577b059d52b3..9e1de839ff0ed8291f1822186f43cb24c9f9ebce 100644 --- a/doc/doc_en/quickstart_en.md +++ b/doc/doc_en/quickstart_en.md @@ -3,14 +3,14 @@ **Note:** This tutorial mainly introduces the usage of PP-OCR series models, please refer to [PP-Structure Quick Start](../../ppstructure/docs/quickstart_en.md) for the quick use of document analysis related functions. - [1. Installation](#1-installation) - - [1.1 Install PaddlePaddle](#11-install-paddlepaddle) - - [1.2 Install PaddleOCR Whl Package](#12-install-paddleocr-whl-package) + - [1.1 Install PaddlePaddle](#11-install-paddlepaddle) + - [1.2 Install PaddleOCR Whl Package](#12-install-paddleocr-whl-package) - [2. Easy-to-Use](#2-easy-to-use) - - [2.1 Use by Command Line](#21-use-by-command-line) - - [2.1.1 Chinese and English Model](#211-chinese-and-english-model) - - [2.1.2 Multi-language Model](#212-multi-language-model) - - [2.2 Use by Code](#22-use-by-code) - - [2.2.1 Chinese & English Model and Multilingual Model](#221-chinese--english-model-and-multilingual-model) + - [2.1 Use by Command Line](#21-use-by-command-line) + - [2.1.1 Chinese and English Model](#211-chinese-and-english-model) + - [2.1.2 Multi-language Model](#212-multi-language-model) + - [2.2 Use by Code](#22-use-by-code) + - [2.2.1 Chinese & English Model and Multilingual Model](#221-chinese--english-model-and-multilingual-model) - [3. Summary](#3-summary) @@ -51,12 +51,6 @@ pip install "paddleocr>=2.0.1" # Recommend to use version 2.0.1+ Reference: [Solve shapely installation on windows](https://stackoverflow.com/questions/44398265/install-shapely-oserror-winerror-126-the-specified-module-could-not-be-found) -- **For layout analysis users**, run the following command to install **Layout-Parser** - - ```bash - pip3 install -U https://paddleocr.bj.bcebos.com/whl/layoutparser-0.0.0-py3-none-any.whl - ``` - ## 2. Easy-to-Use diff --git a/doc/doc_en/table_recognition_en.md b/doc/doc_en/table_recognition_en.md new file mode 100644 index 0000000000000000000000000000000000000000..aacf9ca673a5ce281cf7ae49bfead02b2c73db09 --- /dev/null +++ b/doc/doc_en/table_recognition_en.md @@ -0,0 +1,354 @@ +# Table Recognition + +This article provides a full-process guide for the PaddleOCR table recognition model, including data preparation, model training, tuning, evaluation, prediction, and detailed descriptions of each stage: + +- [1. Data Preparation](#1-data-preparation) + - [1.1. DataSet Format](#11-dataset-format) + - [1.2. Data Download](#12-data-download) + - [1.3. Dataset Generation](#13-dataset-generation) +- [2. Training](#2-training) + - [2.1. Start Training](#21-start-training) + - [2.2. Resume Training](#22-resume-training) + - [2.3. Training with New Backbone](#23-training-with-new-backbone) + - [2.4. Mixed Precision Training](#24-mixed-precision-training) + - [2.5. Distributed Training](#25-distributed-training) + - [2.6. 
Training on other platform(Windows/macOS/Linux DCU)](#26-training-on-other-platformwindowsmacoslinux-dcu) + - [2.7. Fine-tuning](#27-fine-tuning) +- [3. Evaluation and Test](#3-evaluation-and-test) + - [3.1. Evaluation](#31-evaluation) + - [3.2. Test table structure recognition effect](#32-test-table-structure-recognition-effect) +- [4. Model export and prediction](#4-model-export-and-prediction) + - [4.1 Model export](#41-model-export) + - [4.2 Prediction](#42-prediction) +- [5. FAQ](#5-faq) + +# 1. Data Preparation + +## 1.1. DataSet Format + +The format of the PaddleOCR table recognition model dataset is as follows: +```txt +img_label # Each image is marked with a string after json.dumps() +... +img_label +``` + +The json format of each line is: +```txt +{ + 'filename': PMC5755158_010_01.png,# image name + 'split': ’train‘, # whether the image belongs to the training set or the validation set + 'imgid': 0,# index of image + 'html': { + 'structure': {'tokens': ['', '', '', ...]}, # HTML string of the table + 'cell': [ + { + 'tokens': ['P', 'a', 'd', 'd', 'l', 'e', 'P', 'a', 'd', 'd', 'l', 'e'], # text in cell + 'bbox': [x0, y0, x1, y1] # bbox of cell + } + ] + } +} +``` + +The default storage path for training data is `PaddleOCR/train_data`, if you already have a dataset on disk, just create a soft link to the dataset directory: + +``` +# linux and mac os +ln -sf /train_data/dataset +# windows +mklink /d /train_data/dataset +``` + +## 1.2. Data Download + +Download the public dataset reference [table_datasets](dataset/table_datasets_en.md)。 + +## 1.3. Dataset Generation + +Use [TableGeneration](https://github.com/WenmuZhou/TableGeneration) to generate scanned table images. + +TableGeneration is an open source table dataset generation tool, which renders html strings through browser rendering to obtain table images. + +Some samples are as follows: + +|Type|Sample| +|---|---| +|Simple Table|![](https://raw.githubusercontent.com/WenmuZhou/TableGeneration/main/imgs/simple.jpg)| +|Simple Color Table|![](https://raw.githubusercontent.com/WenmuZhou/TableGeneration/main/imgs/color.jpg)| + +# 2. Training + +PaddleOCR provides training scripts, evaluation scripts, and prediction scripts. In this section, the [SLANet](../../configs/table/SLANet.yml) model will be used as an example: + +## 2.1. 
Start Training + +*If you are installing the cpu version, please modify the `use_gpu` field in the configuration file to false* + +``` +# GPU training Support single card and multi-card training +# The training log will be automatically saved as train.log under "{save_model_dir}" + +# specify the single card training(Long training time, not recommended) +python3 tools/train.py -c configs/table/SLANet.yml + +# specify the card number through --gpus +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/table/SLANet.yml +``` + +After starting training normally, you will see the following log output: + +``` +[2022/08/16 03:07:33] ppocr INFO: epoch: [1/400], global_step: 20, lr: 0.000100, acc: 0.000000, loss: 3.915012, structure_loss: 3.229450, loc_loss: 0.670590, avg_reader_cost: 2.63382 s, avg_batch_cost: 6.32390 s, avg_samples: 48.0, ips: 7.59025 samples/s, eta: 9 days, 2:29:27 +[2022/08/16 03:08:41] ppocr INFO: epoch: [1/400], global_step: 40, lr: 0.000100, acc: 0.000000, loss: 1.750859, structure_loss: 1.082116, loc_loss: 0.652822, avg_reader_cost: 0.02533 s, avg_batch_cost: 3.37251 s, avg_samples: 48.0, ips: 14.23271 samples/s, eta: 6 days, 23:28:43 +[2022/08/16 03:09:46] ppocr INFO: epoch: [1/400], global_step: 60, lr: 0.000100, acc: 0.000000, loss: 1.395154, structure_loss: 0.776803, loc_loss: 0.625030, avg_reader_cost: 0.02550 s, avg_batch_cost: 3.26261 s, avg_samples: 48.0, ips: 14.71214 samples/s, eta: 6 days, 5:11:48 +``` + +The following information is automatically printed in the log: + +| Field | Meaning | +| :----: | :------: | +| epoch | current iteration round | +| global_step | current iteration count | +| lr | current learning rate | +| acc | The accuracy of the current batch | +| loss | current loss function | +| structure_loss | Table Structure Loss Values | +| loc_loss | Cell Coordinate Loss Value | +| avg_reader_cost | Current batch data processing time | +| avg_batch_cost | The total time spent in the current batch | +| avg_samples | The number of samples in the current batch | +| ips | Number of images processed per second | + + +PaddleOCR supports alternating training and evaluation. You can modify `eval_batch_step` in `configs/table/SLANet.yml` to set the evaluation frequency. By default, it is evaluated once every 1000 iters. During the evaluation process, the best acc model is saved as `output/SLANet/best_accuracy` by default. + +If the validation set is large, the test will be time-consuming. It is recommended to reduce the number of evaluations, or perform evaluation after training. + +**Tips:** You can use the -c parameter to select various model configurations under the `configs/table/` path for training. For the table recognition algorithms supported by PaddleOCR, please refer to [Table Algorithms List](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/doc/doc_en/algorithm_overview_en.md#3): + +**Note that the configuration file for prediction/evaluation must be the same as training. ** + +## 2.2. 
Resume Training
+
+If the training program is interrupted and you want to load the interrupted model to resume training, you can specify the path of the model to be loaded by specifying Global.checkpoints:
+
+```shell
+python3 tools/train.py -c configs/table/SLANet.yml -o Global.checkpoints=./your/trained/model
+```
+**Note**: The priority of `Global.checkpoints` is higher than that of `Global.pretrained_model`, that is, when the two parameters are specified at the same time, the model specified by `Global.checkpoints` will be loaded first. If the model path specified by `Global.checkpoints` is incorrect, the model specified by `Global.pretrained_model` will be loaded.
+
+## 2.3. Training with New Backbone
+
+The network part completes the construction of the network, and PaddleOCR divides the network into four parts, which are under [ppocr/modeling](../../ppocr/modeling). The data entering the network will pass through these four parts in sequence (transforms->backbones->necks->heads).
+
+```bash
+├── architectures # Code for building network
+├── transforms    # Image Transformation Module
+├── backbones     # Feature extraction module
+├── necks         # Feature enhancement module
+└── heads         # Output module
+```
+
+If the Backbone to be replaced has a corresponding implementation in PaddleOCR, you can directly modify the parameters in the `Backbone` part of the configuration yml file.
+
+However, if you want to use a new Backbone, an example of replacing the backbones is as follows:
+
+1. Create a new file under the [ppocr/modeling/backbones](../../ppocr/modeling/backbones) folder, such as my_backbone.py.
+2. Add code to the my_backbone.py file; the sample code is as follows:
+
+```python
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+
+class MyBackbone(nn.Layer):
+    def __init__(self, *args, **kwargs):
+        super(MyBackbone, self).__init__()
+        # your init code
+        self.conv = nn.xxxx
+
+    def forward(self, inputs):
+        # your network forward
+        y = self.conv(inputs)
+        return y
+```
+
+3. Import the added module in the [ppocr/modeling/backbones/\__init\__.py](../../ppocr/modeling/backbones/__init__.py) file.
+
+After adding the four-part modules of the network, you only need to configure them in the configuration file to use them, such as:
+
+```yaml
+  Backbone:
+    name: MyBackbone
+    args1: args1
+```
+
+**NOTE**: More details about replacing the Backbone and other modules can be found in [doc](add_new_algorithm_en.md).
+
+## 2.4. Mixed Precision Training
+
+If you want to speed up your training further, you can use [Auto Mixed Precision Training](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/01_paddle2.0_introduction/basic_concept/amp_cn.html). Taking a single machine and a single GPU as an example, the commands are as follows:
+
+```shell
+python3 tools/train.py -c configs/table/SLANet.yml \
+     -o Global.pretrained_model=./pretrain_models/SLANet/best_accuracy \
+     Global.use_amp=True Global.scale_loss=1024.0 Global.use_dynamic_loss_scaling=True
+```
+
+## 2.5. 
Distributed Training + +During multi-machine multi-gpu training, use the `--ips` parameter to set the used machine IP address, and the `--gpus` parameter to set the used GPU ID: + +```bash +python3 -m paddle.distributed.launch --ips="xx.xx.xx.xx,xx.xx.xx.xx" --gpus '0,1,2,3' tools/train.py -c configs/table/SLANet.yml \ + -o Global.pretrained_model=./pretrain_models/SLANet/best_accuracy +``` + + +**Note:** (1) When using multi-machine and multi-gpu training, you need to replace the ips value in the above command with the address of your machine, and the machines need to be able to ping each other. (2) Training needs to be launched separately on multiple machines. The command to view the ip address of the machine is `ifconfig`. (3) For more details about the distributed training speedup ratio, please refer to [Distributed Training Tutorial](./distributed_training_en.md). + +## 2.6. Training on other platform(Windows/macOS/Linux DCU) + +- Windows GPU/CPU +The Windows platform is slightly different from the Linux platform: +Windows platform only supports `single gpu` training and inference, specify GPU for training `set CUDA_VISIBLE_DEVICES=0` +On the Windows platform, DataLoader only supports single-process mode, so you need to set `num_workers` to 0; + +- macOS +GPU mode is not supported, you need to set `use_gpu` to False in the configuration file, and the rest of the training evaluation prediction commands are exactly the same as Linux GPU. + +- Linux DCU +Running on a DCU device requires setting the environment variable `export HIP_VISIBLE_DEVICES=0,1,2,3`, and the rest of the training and evaluation prediction commands are exactly the same as the Linux GPU. + + +## 2.7. Fine-tuning + +In the actual use process, it is recommended to load the officially provided pre-training model and fine-tune it in your own data set. For the fine-tuning method of the table recognition model, please refer to: [Model fine-tuning tutorial](./finetune.md). + + +# 3. Evaluation and Test + +## 3.1. Evaluation + +The model parameters during training are saved in the `Global.save_model_dir` directory by default. When evaluating metrics, you need to set `Global.checkpoints` to point to the saved parameter file. Evaluation datasets can be modified via the `label_file_list` setting in Eval via `configs/table/SLANet.yml`. + +``` +# GPU evaluation, Global.checkpoints is the weight to be tested +python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/table/SLANet.yml -o Global.checkpoints={path/to/weights}/best_accuracy +``` + +After the operation is completed, the acc indicator of the model will be output. If you evaluate the English table recognition model, you will see the following output. + +```bash +[2022/08/16 07:59:55] ppocr INFO: acc:0.7622245132160782 +[2022/08/16 07:59:55] ppocr INFO: fps:30.991640622573044 +``` + +## 3.2. Test table structure recognition effect + +Using the model trained by PaddleOCR, you can quickly get prediction through the following script. 
+ +The default prediction picture is stored in `infer_img`, and the trained weight is specified via `-o Global.checkpoints`: + + +According to the `save_model_dir` and `save_epoch_step` fields set in the configuration file, the following parameters will be saved: + + +``` +output/SLANet/ +├── best_accuracy.pdopt +├── best_accuracy.pdparams +├── best_accuracy.states +├── config.yml +├── latest.pdopt +├── latest.pdparams +├── latest.states +└── train.log +``` +Among them, best_accuracy.* is the best model on the evaluation set; latest.* is the model of the last epoch. + +``` +# Predict table image +python3 tools/infer_table.py -c configs/table/SLANet.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.infer_img=ppstructure/docs/table/table.jpg +``` + +Input image: + +![](../../ppstructure/docs/table/table.jpg) + +Get the prediction result of the input image: + +``` +['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '
', '', ''],[[320.0562438964844, 197.83375549316406, 350.0928955078125, 214.4309539794922], ... , [318.959228515625, 271.0166931152344, 353.7394104003906, 286.4538269042969]] +``` + +The cell coordinates are visualized as + +![](../../ppstructure/docs/imgs/slanet_result.jpg) + +# 4. Model export and prediction + +## 4.1 Model export + +inference model (model saved by `paddle.jit.save`) +Generally, it is model training, a solidified model that saves the model structure and model parameters in a file, and is mostly used to predict deployment scenarios. +The model saved during the training process is the checkpoints model, and only the parameters of the model are saved, which are mostly used to resume training. +Compared with the checkpoints model, the inference model will additionally save the structural information of the model. It has superior performance in predicting deployment and accelerating reasoning, and is flexible and convenient, and is suitable for actual system integration. + +The way to convert the form recognition model to the inference model is the same as the text detection and recognition, as follows: + +``` +# -c Set the training algorithm yml configuration file +# -o Set optional parameters +# Global.pretrained_model parameter Set the training model address to be converted without adding the file suffix .pdmodel, .pdopt or .pdparams. +# Global.save_inference_dir Set the address where the converted model will be saved. + +python3 tools/export_model.py -c configs/table/SLANet.yml -o Global.pretrained_model=./pretrain_models/SLANet/best_accuracy Global.save_inference_dir=./inference/SLANet/ +``` + +After the conversion is successful, there are three files in the model save directory: + + +``` +inference/SLANet/ + ├── inference.pdiparams # The parameter file of inference model + ├── inference.pdiparams.info # The parameter information of inference model, which can be ignored + └── inference.pdmodel # The program file of model +``` + +## 4.2 Prediction + +After the model is exported, use the following command to complete the prediction of the inference model + +```python +python3.7 table/predict_structure.py \ + --table_model_dir={path/to/inference model} \ + --table_char_dict_path=../ppocr/utils/dict/table_structure_dict_ch.txt \ + --image_dir=docs/table/table.jpg \ + --output=../output/table +``` + +Input image: + +![](../../ppstructure/docs/table/table.jpg) + +Get the prediction result of the input image: + +``` +['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '
', '', ''],[[320.0562438964844, 197.83375549316406, 350.0928955078125, 214.4309539794922], ... , [318.959228515625, 271.0166931152344, 353.7394104003906, 286.4538269042969]] +``` + +The cell coordinates are visualized as + +![](../../ppstructure/docs/imgs/slanet_result.jpg) + + + +# 5. FAQ + +Q1: After the training model is transferred to the inference model, the prediction effect is inconsistent? + +**A**: There are many such problems, and the problems are mostly caused by inconsistent preprocessing and postprocessing parameters when the trained model predicts and the preprocessing and postprocessing parameters when the inference model predicts. You can compare whether there are differences in preprocessing, postprocessing, and prediction in the configuration files used for training. diff --git a/doc/doc_en/whl_en.md b/doc/doc_en/whl_en.md index d81e5532cf1db0193abf61b972420bdc3bacfd0b..da2dff67c16b4a9a0a653934b1f1df64cb6e9707 100644 --- a/doc/doc_en/whl_en.md +++ b/doc/doc_en/whl_en.md @@ -342,6 +342,7 @@ im_show.save('result.jpg') | det_db_thresh | Binarization threshold value of DB output map | 0.3 | | det_db_box_thresh | The threshold value of the DB output box. Boxes score lower than this value will be discarded | 0.5 | | det_db_unclip_ratio | The expanded ratio of DB output box | 2 | +| det_db_score_mode | The parameter that control how the score of the detection frame is calculated. There are 'fast' and 'slow' options. If the text to be detected is curved, it is recommended to use 'slow' | 'fast' | | det_east_score_thresh | Binarization threshold value of EAST output map | 0.8 | | det_east_cover_thresh | The threshold value of the EAST output box. Boxes score lower than this value will be discarded | 0.1 | | det_east_nms_thresh | The NMS threshold value of EAST model output box | 0.2 | diff --git a/doc/features.png b/doc/features.png deleted file mode 100644 index 273e4beb74771b723ab732f703863fa2a3a4c21c..0000000000000000000000000000000000000000 Binary files a/doc/features.png and /dev/null differ diff --git a/doc/features_en.png b/doc/features_en.png deleted file mode 100644 index 310a1b7e50920304521a5fa68c5c2e2a881d3917..0000000000000000000000000000000000000000 Binary files a/doc/features_en.png and /dev/null differ diff --git a/doc/imgs_results/det_res_img623_ct.jpg b/doc/imgs_results/det_res_img623_ct.jpg new file mode 100644 index 0000000000000000000000000000000000000000..2c5f57d96cca896c70d9e0d33ba80a0177a8aeb9 Binary files /dev/null and b/doc/imgs_results/det_res_img623_ct.jpg differ diff --git a/doc/overview_en.png b/doc/overview_en.png deleted file mode 100644 index b44da4e9874d6a2162a8bb05ff1b479875bd65f3..0000000000000000000000000000000000000000 Binary files a/doc/overview_en.png and /dev/null differ diff --git a/doc/ppocr_v3/svtr_tiny.jpg b/doc/ppocr_v3/svtr_tiny.jpg deleted file mode 100644 index 26261047ef253e9802956f4c64449870d10de850..0000000000000000000000000000000000000000 Binary files a/doc/ppocr_v3/svtr_tiny.jpg and /dev/null differ diff --git a/paddleocr.py b/paddleocr.py index 470dc60da3b15195bcd401aff5e50be5a2cfd13e..fa732fc110dc7873f8d89b2ca2a21817a1e6d20d 100644 --- a/paddleocr.py +++ b/paddleocr.py @@ -35,26 +35,26 @@ from tools.infer import predict_system from ppocr.utils.logging import get_logger logger = get_logger() -from ppocr.utils.utility import check_and_read_gif, get_image_file_list +from ppocr.utils.utility import check_and_read, get_image_file_list from ppocr.utils.network import maybe_download, download_with_progressbar, is_link, 
confirm_model_dir_url from tools.infer.utility import draw_ocr, str2bool, check_gpu from ppstructure.utility import init_args, draw_structure_result -from ppstructure.predict_system import StructureSystem, save_structure_res +from ppstructure.predict_system import StructureSystem, save_structure_res, to_excel __all__ = [ 'PaddleOCR', 'PPStructure', 'draw_ocr', 'draw_structure_result', - 'save_structure_res', 'download_with_progressbar' + 'save_structure_res', 'download_with_progressbar', 'to_excel' ] SUPPORT_DET_MODEL = ['DB'] -VERSION = '2.5.0.3' +VERSION = '2.6.0.1' SUPPORT_REC_MODEL = ['CRNN', 'SVTR_LCNet'] BASE_DIR = os.path.expanduser("~/.paddleocr/") DEFAULT_OCR_MODEL_VERSION = 'PP-OCRv3' SUPPORT_OCR_MODEL_VERSION = ['PP-OCR', 'PP-OCRv2', 'PP-OCRv3'] -DEFAULT_STRUCTURE_MODEL_VERSION = 'PP-STRUCTURE' -SUPPORT_STRUCTURE_MODEL_VERSION = ['PP-STRUCTURE'] +DEFAULT_STRUCTURE_MODEL_VERSION = 'PP-Structurev2' +SUPPORT_STRUCTURE_MODEL_VERSION = ['PP-Structure', 'PP-Structurev2'] MODEL_URLS = { 'OCR': { 'PP-OCRv3': { @@ -263,7 +263,7 @@ MODEL_URLS = { } }, 'STRUCTURE': { - 'PP-STRUCTURE': { + 'PP-Structure': { 'table': { 'en': { 'url': @@ -271,6 +271,34 @@ MODEL_URLS = { 'dict_path': 'ppocr/utils/dict/table_structure_dict.txt' } } + }, + 'PP-Structurev2': { + 'table': { + 'en': { + 'url': + 'https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/en_ppstructure_mobile_v2.0_SLANet_infer.tar', + 'dict_path': 'ppocr/utils/dict/table_structure_dict.txt' + }, + 'ch': { + 'url': + 'https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_infer.tar', + 'dict_path': 'ppocr/utils/dict/table_structure_dict_ch.txt' + } + }, + 'layout': { + 'en': { + 'url': + 'https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_infer.tar', + 'dict_path': + 'ppocr/utils/dict/layout_dict/layout_publaynet_dict.txt' + }, + 'ch': { + 'url': + 'https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_cdla_infer.tar', + 'dict_path': + 'ppocr/utils/dict/layout_dict/layout_cdla_dict.txt' + } + } } } } @@ -298,12 +326,15 @@ def parse_args(mMain=True): "--structure_version", type=str, choices=SUPPORT_STRUCTURE_MODEL_VERSION, - default='PP-STRUCTURE', + default='PP-Structurev2', help='Model version, the current model support list is as follows:' - ' 1. STRUCTURE Support en table structure model.') + ' 1. PP-Structure Support en table structure model.' + ' 2. 
PP-Structurev2 Support ch and en table structure model.') for action in parser._actions: - if action.dest in ['rec_char_dict_path', 'table_char_dict_path']: + if action.dest in [ + 'rec_char_dict_path', 'table_char_dict_path', 'layout_dict_path' + ]: action.default = None if mMain: return parser.parse_args() @@ -383,6 +414,33 @@ def get_model_config(type, version, model_type, lang): return model_urls[version][model_type][lang] +def img_decode(content: bytes): + np_arr = np.frombuffer(content, dtype=np.uint8) + return cv2.imdecode(np_arr, cv2.IMREAD_COLOR) + + +def check_img(img): + if isinstance(img, bytes): + img = img_decode(img) + if isinstance(img, str): + # download net image + if is_link(img): + download_with_progressbar(img, 'tmp.jpg') + img = 'tmp.jpg' + image_file = img + img, flag, _ = check_and_read(image_file) + if not flag: + with open(image_file, 'rb') as f: + img = img_decode(f.read()) + if img is None: + logger.error("error in loading image:{}".format(image_file)) + return None + if isinstance(img, np.ndarray) and len(img.shape) == 2: + img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) + + return img + + class PaddleOCR(predict_system.TextSystem): def __init__(self, **kwargs): """ @@ -451,7 +509,7 @@ class PaddleOCR(predict_system.TextSystem): rec: use text recognition or not. If false, only det will be exec. Default is True cls: use angle classifier or not. Default is True. If true, the text with rotation of 180 degrees can be recognized. If no text is rotated by 180 degrees, use cls=False to get better performance. Text with rotation of 90 or 270 degrees can be recognized even if cls=False. """ - assert isinstance(img, (np.ndarray, list, str)) + assert isinstance(img, (np.ndarray, list, str, bytes)) if isinstance(img, list) and det == True: logger.error('When input a list of images, det must be false') exit(0) @@ -460,24 +518,10 @@ class PaddleOCR(predict_system.TextSystem): 'Since the angle classifier is not initialized, the angle classifier will not be uesd during the forward process' ) - if isinstance(img, str): - # download net image - if img.startswith('http'): - download_with_progressbar(img, 'tmp.jpg') - img = 'tmp.jpg' - image_file = img - img, flag = check_and_read_gif(image_file) - if not flag: - with open(image_file, 'rb') as f: - np_arr = np.frombuffer(f.read(), dtype=np.uint8) - img = cv2.imdecode(np_arr, cv2.IMREAD_COLOR) - if img is None: - logger.error("error in loading image:{}".format(image_file)) - return None - if isinstance(img, np.ndarray) and len(img.shape) == 2: - img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) + img = check_img(img) + if det and rec: - dt_boxes, rec_res = self.__call__(img, cls) + dt_boxes, rec_res, _ = self.__call__(img, cls) return [[box.tolist(), res] for box, res in zip(dt_boxes, rec_res)] elif det and not rec: dt_boxes, elapse = self.text_detector(img) @@ -506,6 +550,12 @@ class PPStructure(StructureSystem): if not params.show_log: logger.setLevel(logging.INFO) lang, det_lang = parse_lang(params.lang) + if lang == 'ch': + table_lang = 'ch' + else: + table_lang = 'en' + if params.structure_version == 'PP-Structure': + params.merge_no_span_structure = False # init model dir det_model_config = get_model_config('OCR', params.ocr_version, 'det', @@ -520,14 +570,20 @@ class PPStructure(StructureSystem): params.rec_model_dir, os.path.join(BASE_DIR, 'whl', 'rec', lang), rec_model_config['url']) table_model_config = get_model_config( - 'STRUCTURE', params.structure_version, 'table', 'en') + 'STRUCTURE', params.structure_version, 'table', 
table_lang) params.table_model_dir, table_url = confirm_model_dir_url( params.table_model_dir, os.path.join(BASE_DIR, 'whl', 'table'), table_model_config['url']) + layout_model_config = get_model_config( + 'STRUCTURE', params.structure_version, 'layout', lang) + params.layout_model_dir, layout_url = confirm_model_dir_url( + params.layout_model_dir, + os.path.join(BASE_DIR, 'whl', 'layout'), layout_model_config['url']) # download model maybe_download(params.det_model_dir, det_url) maybe_download(params.rec_model_dir, rec_url) maybe_download(params.table_model_dir, table_url) + maybe_download(params.layout_model_dir, layout_url) if params.rec_char_dict_path is None: params.rec_char_dict_path = str( @@ -535,29 +591,16 @@ class PPStructure(StructureSystem): if params.table_char_dict_path is None: params.table_char_dict_path = str( Path(__file__).parent / table_model_config['dict_path']) - + if params.layout_dict_path is None: + params.layout_dict_path = str( + Path(__file__).parent / layout_model_config['dict_path']) logger.debug(params) super().__init__(params) - def __call__(self, img, return_ocr_result_in_table=False): - if isinstance(img, str): - # download net image - if img.startswith('http'): - download_with_progressbar(img, 'tmp.jpg') - img = 'tmp.jpg' - image_file = img - img, flag = check_and_read_gif(image_file) - if not flag: - with open(image_file, 'rb') as f: - np_arr = np.frombuffer(f.read(), dtype=np.uint8) - img = cv2.imdecode(np_arr, cv2.IMREAD_COLOR) - if img is None: - logger.error("error in loading image:{}".format(image_file)) - return None - if isinstance(img, np.ndarray) and len(img.shape) == 2: - img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) - - res = super().__call__(img, return_ocr_result_in_table) + def __call__(self, img, return_ocr_result_in_table=False, img_idx=0): + img = check_img(img) + res, _ = super().__call__( + img, return_ocr_result_in_table, img_idx=img_idx) return res @@ -592,9 +635,55 @@ def main(): for line in result: logger.info(line) elif args.type == 'structure': - result = engine(img_path) - save_structure_res(result, args.output, img_name) - - for item in result: + img, flag_gif, flag_pdf = check_and_read(img_path) + if not flag_gif and not flag_pdf: + img = cv2.imread(img_path) + + if not flag_pdf: + if img is None: + logger.error("error in loading image:{}".format(img_path)) + continue + img_paths = [[img_path, img]] + else: + img_paths = [] + for index, pdf_img in enumerate(img): + os.makedirs( + os.path.join(args.output, img_name), exist_ok=True) + pdf_img_path = os.path.join( + args.output, img_name, + img_name + '_' + str(index) + '.jpg') + cv2.imwrite(pdf_img_path, pdf_img) + img_paths.append([pdf_img_path, pdf_img]) + + all_res = [] + for index, (new_img_path, img) in enumerate(img_paths): + logger.info('processing {}/{} page:'.format(index + 1, + len(img_paths))) + new_img_name = os.path.basename(new_img_path).split('.')[0] + result = engine(new_img_path, img_idx=index) + save_structure_res(result, args.output, img_name, index) + + if args.recovery and result != []: + from copy import deepcopy + from ppstructure.recovery.recovery_to_doc import sorted_layout_boxes + h, w, _ = img.shape + result_cp = deepcopy(result) + result_sorted = sorted_layout_boxes(result_cp, w) + all_res += result_sorted + + if args.recovery and all_res != []: + try: + from ppstructure.recovery.recovery_to_doc import convert_info_docx + convert_info_docx(img, all_res, args.output, img_name, + args.save_pdf) + except Exception as ex: + logger.error( + "error in layout 
recovery image:{}, err msg: {}".format( + img_name, ex)) + continue + + for item in all_res: item.pop('img') + item.pop('res') logger.info(item) + logger.info('result save to {}'.format(args.output)) diff --git a/ppocr/data/__init__.py b/ppocr/data/__init__.py index 78c3279656e184a3a34bff3847d3936b5e8977b6..b602a346dbe4b0d45af287f25f05ead0f62daf44 100644 --- a/ppocr/data/__init__.py +++ b/ppocr/data/__init__.py @@ -34,7 +34,7 @@ import paddle.distributed as dist from ppocr.data.imaug import transform, create_operators from ppocr.data.simple_dataset import SimpleDataSet -from ppocr.data.lmdb_dataset import LMDBDataSet +from ppocr.data.lmdb_dataset import LMDBDataSet, LMDBDataSetSR from ppocr.data.pgnet_dataset import PGDataSet from ppocr.data.pubtab_dataset import PubTabDataSet @@ -54,7 +54,8 @@ def build_dataloader(config, mode, device, logger, seed=None): config = copy.deepcopy(config) support_dict = [ - 'SimpleDataSet', 'LMDBDataSet', 'PGDataSet', 'PubTabDataSet' + 'SimpleDataSet', 'LMDBDataSet', 'PGDataSet', 'PubTabDataSet', + 'LMDBDataSetSR' ] module_name = config[mode]['dataset']['name'] assert module_name in support_dict, Exception( diff --git a/ppocr/data/imaug/__init__.py b/ppocr/data/imaug/__init__.py index a2332b6c07be63ecfe2fa9003cbe9d0c1b0e8001..863988cccfa9d9f2c865a444410d4245687f49ee 100644 --- a/ppocr/data/imaug/__init__.py +++ b/ppocr/data/imaug/__init__.py @@ -26,8 +26,7 @@ from .make_pse_gt import MakePseGt from .rec_img_aug import BaseDataAugmentation, RecAug, RecConAug, RecResizeImg, ClsResizeImg, \ SRNRecResizeImg, GrayRecResizeImg, SARRecResizeImg, PRENResizeImg, \ - ABINetRecResizeImg, SVTRRecResizeImg, ABINetRecAug, VLRecResizeImg, SPINRecResizeImg - + ABINetRecResizeImg, SVTRRecResizeImg, ABINetRecAug, VLRecResizeImg, SPINRecResizeImg, RobustScannerRecResizeImg from .ssl_img_aug import SSLRotateResize from .randaugment import RandAugment from .copy_paste import CopyPaste @@ -44,6 +43,7 @@ from .vqa import * from .fce_aug import * from .fce_targets import FCENetTargets +from .ct_process import * def transform(data, ops=None): diff --git a/ppocr/data/imaug/copy_paste.py b/ppocr/data/imaug/copy_paste.py index 0b3386c896792bd670cd2bfc757eb3b80f22bac4..79343da60fd40f8dc0ffe8927398b70cb751b532 100644 --- a/ppocr/data/imaug/copy_paste.py +++ b/ppocr/data/imaug/copy_paste.py @@ -35,10 +35,12 @@ class CopyPaste(object): point_num = data['polys'].shape[1] src_img = data['image'] src_polys = data['polys'].tolist() + src_texts = data['texts'] src_ignores = data['ignore_tags'].tolist() ext_data = data['ext_data'][0] ext_image = ext_data['image'] ext_polys = ext_data['polys'] + ext_texts = ext_data['texts'] ext_ignores = ext_data['ignore_tags'] indexs = [i for i in range(len(ext_ignores)) if not ext_ignores[i]] @@ -53,7 +55,7 @@ class CopyPaste(object): src_img = cv2.cvtColor(src_img, cv2.COLOR_BGR2RGB) ext_image = cv2.cvtColor(ext_image, cv2.COLOR_BGR2RGB) src_img = Image.fromarray(src_img).convert('RGBA') - for poly, tag in zip(select_polys, select_ignores): + for idx, poly, tag in zip(select_idxs, select_polys, select_ignores): box_img = get_rotate_crop_image(ext_image, poly) src_img, box = self.paste_img(src_img, box_img, src_polys) @@ -62,6 +64,7 @@ class CopyPaste(object): for _ in range(len(box), point_num): box.append(box[-1]) src_polys.append(box) + src_texts.append(ext_texts[idx]) src_ignores.append(tag) src_img = cv2.cvtColor(np.array(src_img), cv2.COLOR_RGB2BGR) h, w = src_img.shape[:2] @@ -70,6 +73,7 @@ class CopyPaste(object): src_polys[:, :, 1] = 
np.clip(src_polys[:, :, 1], 0, h) data['image'] = src_img data['polys'] = src_polys + data['texts'] = src_texts data['ignore_tags'] = np.array(src_ignores) return data diff --git a/ppocr/data/imaug/ct_process.py b/ppocr/data/imaug/ct_process.py new file mode 100644 index 0000000000000000000000000000000000000000..59715090036e1020800950b02b9ea06ab5c8d4c2 --- /dev/null +++ b/ppocr/data/imaug/ct_process.py @@ -0,0 +1,355 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import cv2 +import random +import pyclipper +import paddle + +import numpy as np +import Polygon as plg +import scipy.io as scio + +from PIL import Image +import paddle.vision.transforms as transforms + + +class RandomScale(): + def __init__(self, short_size=640, **kwargs): + self.short_size = short_size + + def scale_aligned(self, img, scale): + oh, ow = img.shape[0:2] + h = int(oh * scale + 0.5) + w = int(ow * scale + 0.5) + if h % 32 != 0: + h = h + (32 - h % 32) + if w % 32 != 0: + w = w + (32 - w % 32) + img = cv2.resize(img, dsize=(w, h)) + factor_h = h / oh + factor_w = w / ow + return img, factor_h, factor_w + + def __call__(self, data): + img = data['image'] + + h, w = img.shape[0:2] + random_scale = np.array([0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3]) + scale = (np.random.choice(random_scale) * self.short_size) / min(h, w) + img, factor_h, factor_w = self.scale_aligned(img, scale) + + data['scale_factor'] = (factor_w, factor_h) + data['image'] = img + return data + + +class MakeShrink(): + def __init__(self, kernel_scale=0.7, **kwargs): + self.kernel_scale = kernel_scale + + def dist(self, a, b): + return np.linalg.norm((a - b), ord=2, axis=0) + + def perimeter(self, bbox): + peri = 0.0 + for i in range(bbox.shape[0]): + peri += self.dist(bbox[i], bbox[(i + 1) % bbox.shape[0]]) + return peri + + def shrink(self, bboxes, rate, max_shr=20): + rate = rate * rate + shrinked_bboxes = [] + for bbox in bboxes: + area = plg.Polygon(bbox).area() + peri = self.perimeter(bbox) + + try: + pco = pyclipper.PyclipperOffset() + pco.AddPath(bbox, pyclipper.JT_ROUND, + pyclipper.ET_CLOSEDPOLYGON) + offset = min( + int(area * (1 - rate) / (peri + 0.001) + 0.5), max_shr) + + shrinked_bbox = pco.Execute(-offset) + if len(shrinked_bbox) == 0: + shrinked_bboxes.append(bbox) + continue + + shrinked_bbox = np.array(shrinked_bbox[0]) + if shrinked_bbox.shape[0] <= 2: + shrinked_bboxes.append(bbox) + continue + + shrinked_bboxes.append(shrinked_bbox) + except Exception as e: + shrinked_bboxes.append(bbox) + + return shrinked_bboxes + + def __call__(self, data): + img = data['image'] + bboxes = data['polys'] + words = data['texts'] + scale_factor = data['scale_factor'] + + gt_instance = np.zeros(img.shape[0:2], dtype='uint8') # h,w + training_mask = np.ones(img.shape[0:2], dtype='uint8') + training_mask_distance = np.ones(img.shape[0:2], dtype='uint8') + + for i in range(len(bboxes)): + bboxes[i] = np.reshape(bboxes[i] * ( + [scale_factor[0], scale_factor[1]] * 
(bboxes[i].shape[0] // 2)), + (bboxes[i].shape[0] // 2, 2)).astype('int32') + + for i in range(len(bboxes)): + #different value for different bbox + cv2.drawContours(gt_instance, [bboxes[i]], -1, i + 1, -1) + + # set training mask to 0 + cv2.drawContours(training_mask, [bboxes[i]], -1, 0, -1) + + # for not accurate annotation, use training_mask_distance + if words[i] == '###' or words[i] == '???': + cv2.drawContours(training_mask_distance, [bboxes[i]], -1, 0, -1) + + # make shrink + gt_kernel_instance = np.zeros(img.shape[0:2], dtype='uint8') + kernel_bboxes = self.shrink(bboxes, self.kernel_scale) + for i in range(len(bboxes)): + cv2.drawContours(gt_kernel_instance, [kernel_bboxes[i]], -1, i + 1, + -1) + + # for training mask, kernel and background= 1, box region=0 + if words[i] != '###' and words[i] != '???': + cv2.drawContours(training_mask, [kernel_bboxes[i]], -1, 1, -1) + + gt_kernel = gt_kernel_instance.copy() + # for gt_kernel, kernel = 1 + gt_kernel[gt_kernel > 0] = 1 + + # shrink 2 times + tmp1 = gt_kernel_instance.copy() + erode_kernel = np.ones((3, 3), np.uint8) + tmp1 = cv2.erode(tmp1, erode_kernel, iterations=1) + tmp2 = tmp1.copy() + tmp2 = cv2.erode(tmp2, erode_kernel, iterations=1) + + # compute text region + gt_kernel_inner = tmp1 - tmp2 + + # gt_instance: text instance, bg=0, diff word use diff value + # training_mask: text instance mask, word=0,kernel and bg=1 + # gt_kernel_instance: text kernel instance, bg=0, diff word use diff value + # gt_kernel: text_kernel, bg=0,diff word use same value + # gt_kernel_inner: text kernel reference + # training_mask_distance: word without anno = 0, else 1 + + data['image'] = [ + img, gt_instance, training_mask, gt_kernel_instance, gt_kernel, + gt_kernel_inner, training_mask_distance + ] + return data + + +class GroupRandomHorizontalFlip(): + def __init__(self, p=0.5, **kwargs): + self.p = p + + def __call__(self, data): + imgs = data['image'] + + if random.random() < self.p: + for i in range(len(imgs)): + imgs[i] = np.flip(imgs[i], axis=1).copy() + data['image'] = imgs + return data + + +class GroupRandomRotate(): + def __init__(self, **kwargs): + pass + + def __call__(self, data): + imgs = data['image'] + + max_angle = 10 + angle = random.random() * 2 * max_angle - max_angle + for i in range(len(imgs)): + img = imgs[i] + w, h = img.shape[:2] + rotation_matrix = cv2.getRotationMatrix2D((h / 2, w / 2), angle, 1) + img_rotation = cv2.warpAffine( + img, rotation_matrix, (h, w), flags=cv2.INTER_NEAREST) + imgs[i] = img_rotation + + data['image'] = imgs + return data + + +class GroupRandomCropPadding(): + def __init__(self, target_size=(640, 640), **kwargs): + self.target_size = target_size + + def __call__(self, data): + imgs = data['image'] + + h, w = imgs[0].shape[0:2] + t_w, t_h = self.target_size + p_w, p_h = self.target_size + if w == t_w and h == t_h: + return data + + t_h = t_h if t_h < h else h + t_w = t_w if t_w < w else w + + if random.random() > 3.0 / 8.0 and np.max(imgs[1]) > 0: + # make sure to crop the text region + tl = np.min(np.where(imgs[1] > 0), axis=1) - (t_h, t_w) + tl[tl < 0] = 0 + br = np.max(np.where(imgs[1] > 0), axis=1) - (t_h, t_w) + br[br < 0] = 0 + br[0] = min(br[0], h - t_h) + br[1] = min(br[1], w - t_w) + + i = random.randint(tl[0], br[0]) if tl[0] < br[0] else 0 + j = random.randint(tl[1], br[1]) if tl[1] < br[1] else 0 + else: + i = random.randint(0, h - t_h) if h - t_h > 0 else 0 + j = random.randint(0, w - t_w) if w - t_w > 0 else 0 + + n_imgs = [] + for idx in range(len(imgs)): + if len(imgs[idx].shape) 
== 3: + s3_length = int(imgs[idx].shape[-1]) + img = imgs[idx][i:i + t_h, j:j + t_w, :] + img_p = cv2.copyMakeBorder( + img, + 0, + p_h - t_h, + 0, + p_w - t_w, + borderType=cv2.BORDER_CONSTANT, + value=tuple(0 for i in range(s3_length))) + else: + img = imgs[idx][i:i + t_h, j:j + t_w] + img_p = cv2.copyMakeBorder( + img, + 0, + p_h - t_h, + 0, + p_w - t_w, + borderType=cv2.BORDER_CONSTANT, + value=(0, )) + n_imgs.append(img_p) + + data['image'] = n_imgs + return data + + +class MakeCentripetalShift(): + def __init__(self, **kwargs): + pass + + def jaccard(self, As, Bs): + A = As.shape[0] # small + B = Bs.shape[0] # large + + dis = np.sqrt( + np.sum((As[:, np.newaxis, :].repeat( + B, axis=1) - Bs[np.newaxis, :, :].repeat( + A, axis=0))**2, + axis=-1)) + + ind = np.argmin(dis, axis=-1) + + return ind + + def __call__(self, data): + imgs = data['image'] + + img, gt_instance, training_mask, gt_kernel_instance, gt_kernel, gt_kernel_inner, training_mask_distance = \ + imgs[0], imgs[1], imgs[2], imgs[3], imgs[4], imgs[5], imgs[6] + + max_instance = np.max(gt_instance) # num bbox + + # make centripetal shift + gt_distance = np.zeros((2, *img.shape[0:2]), dtype=np.float32) + for i in range(1, max_instance + 1): + # kernel_reference + ind = (gt_kernel_inner == i) + + if np.sum(ind) == 0: + training_mask[gt_instance == i] = 0 + training_mask_distance[gt_instance == i] = 0 + continue + + kpoints = np.array(np.where(ind)).transpose( + (1, 0))[:, ::-1].astype('float32') + + ind = (gt_instance == i) * (gt_kernel_instance == 0) + if np.sum(ind) == 0: + continue + pixels = np.where(ind) + + points = np.array(pixels).transpose( + (1, 0))[:, ::-1].astype('float32') + + bbox_ind = self.jaccard(points, kpoints) + + offset_gt = kpoints[bbox_ind] - points + + gt_distance[:, pixels[0], pixels[1]] = offset_gt.T * 0.1 + + img = Image.fromarray(img) + img = img.convert('RGB') + + data["image"] = img + data["gt_kernel"] = gt_kernel.astype("int64") + data["training_mask"] = training_mask.astype("int64") + data["gt_instance"] = gt_instance.astype("int64") + data["gt_kernel_instance"] = gt_kernel_instance.astype("int64") + data["training_mask_distance"] = training_mask_distance.astype("int64") + data["gt_distance"] = gt_distance.astype("float32") + + return data + + +class ScaleAlignedShort(): + def __init__(self, short_size=640, **kwargs): + self.short_size = short_size + + def __call__(self, data): + img = data['image'] + + org_img_shape = img.shape + + h, w = img.shape[0:2] + scale = self.short_size * 1.0 / min(h, w) + h = int(h * scale + 0.5) + w = int(w * scale + 0.5) + if h % 32 != 0: + h = h + (32 - h % 32) + if w % 32 != 0: + w = w + (32 - w % 32) + img = cv2.resize(img, dsize=(w, h)) + + new_img_shape = img.shape + img_shape = np.array(org_img_shape + new_img_shape) + + data['shape'] = img_shape + data['image'] = img + + return data \ No newline at end of file diff --git a/ppocr/data/imaug/label_ops.py b/ppocr/data/imaug/label_ops.py index 1656c69529e19ee04fcb4343f28fe742dabb83b0..dbfb93176cc782bedc8f7b33367b59046c4abec8 100644 --- a/ppocr/data/imaug/label_ops.py +++ b/ppocr/data/imaug/label_ops.py @@ -575,7 +575,7 @@ class TableLabelEncode(AttnLabelEncode): replace_empty_cell_token=False, merge_no_span_structure=False, learn_empty_box=False, - point_num=2, + loc_reg_num=4, **kwargs): self.max_text_len = max_text_length self.lower = False @@ -590,6 +590,12 @@ class TableLabelEncode(AttnLabelEncode): line = line.decode('utf-8').strip("\n").strip("\r\n") dict_character.append(line) + if 
self.merge_no_span_structure: + if "" not in dict_character: + dict_character.append("") + if "" in dict_character: + dict_character.remove("") + dict_character = self.add_special_char(dict_character) self.dict = {} for i, char in enumerate(dict_character): @@ -597,7 +603,7 @@ class TableLabelEncode(AttnLabelEncode): self.idx2char = {v: k for k, v in self.dict.items()} self.character = dict_character - self.point_num = point_num + self.loc_reg_num = loc_reg_num self.pad_idx = self.dict[self.beg_str] self.start_idx = self.dict[self.beg_str] self.end_idx = self.dict[self.end_str] @@ -653,7 +659,7 @@ class TableLabelEncode(AttnLabelEncode): # encode box bboxes = np.zeros( - (self._max_text_len, self.point_num * 2), dtype=np.float32) + (self._max_text_len, self.loc_reg_num), dtype=np.float32) bbox_masks = np.zeros((self._max_text_len, 1), dtype=np.float32) bbox_idx = 0 @@ -718,11 +724,11 @@ class TableMasterLabelEncode(TableLabelEncode): replace_empty_cell_token=False, merge_no_span_structure=False, learn_empty_box=False, - point_num=2, + loc_reg_num=4, **kwargs): super(TableMasterLabelEncode, self).__init__( max_text_length, character_dict_path, replace_empty_cell_token, - merge_no_span_structure, learn_empty_box, point_num, **kwargs) + merge_no_span_structure, learn_empty_box, loc_reg_num, **kwargs) self.pad_idx = self.dict[self.pad_str] self.unknown_idx = self.dict[self.unknown_str] @@ -743,27 +749,35 @@ class TableMasterLabelEncode(TableLabelEncode): class TableBoxEncode(object): - def __init__(self, use_xywh=False, **kwargs): - self.use_xywh = use_xywh + def __init__(self, in_box_format='xyxy', out_box_format='xyxy', **kwargs): + assert out_box_format in ['xywh', 'xyxy', 'xyxyxyxy'] + self.in_box_format = in_box_format + self.out_box_format = out_box_format def __call__(self, data): img_height, img_width = data['image'].shape[:2] bboxes = data['bboxes'] - if self.use_xywh and bboxes.shape[1] == 4: - bboxes = self.xyxy2xywh(bboxes) + if self.in_box_format != self.out_box_format: + if self.out_box_format == 'xywh': + if self.in_box_format == 'xyxyxyxy': + bboxes = self.xyxyxyxy2xywh(bboxes) + elif self.in_box_format == 'xyxy': + bboxes = self.xyxy2xywh(bboxes) + bboxes[:, 0::2] /= img_width bboxes[:, 1::2] /= img_height data['bboxes'] = bboxes return data + def xyxyxyxy2xywh(self, boxes): + new_bboxes = np.zeros([len(bboxes), 4]) + new_bboxes[:, 0] = bboxes[:, 0::2].min() # x1 + new_bboxes[:, 1] = bboxes[:, 1::2].min() # y1 + new_bboxes[:, 2] = bboxes[:, 0::2].max() - new_bboxes[:, 0] # w + new_bboxes[:, 3] = bboxes[:, 1::2].max() - new_bboxes[:, 1] # h + return new_bboxes + def xyxy2xywh(self, bboxes): - """ - Convert coord (x1,y1,x2,y2) to (x,y,w,h). - where (x1,y1) is top-left, (x2,y2) is bottom-right. - (x,y) is bbox center and (w,h) is width and height. 
- :param bboxes: (x1, y1, x2, y2) - :return: - """ new_bboxes = np.empty_like(bboxes) new_bboxes[:, 0] = (bboxes[:, 0] + bboxes[:, 2]) / 2 # x center new_bboxes[:, 1] = (bboxes[:, 1] + bboxes[:, 3]) / 2 # y center @@ -1236,6 +1250,54 @@ class ABINetLabelEncode(BaseRecLabelEncode): return dict_character +class SRLabelEncode(BaseRecLabelEncode): + def __init__(self, + max_text_length, + character_dict_path=None, + use_space_char=False, + **kwargs): + super(SRLabelEncode, self).__init__(max_text_length, + character_dict_path, use_space_char) + self.dic = {} + with open(character_dict_path, 'r') as fin: + for line in fin.readlines(): + line = line.strip() + character, sequence = line.split() + self.dic[character] = sequence + english_stroke_alphabet = '0123456789' + self.english_stroke_dict = {} + for index in range(len(english_stroke_alphabet)): + self.english_stroke_dict[english_stroke_alphabet[index]] = index + + def encode(self, label): + stroke_sequence = '' + for character in label: + if character not in self.dic: + continue + else: + stroke_sequence += self.dic[character] + stroke_sequence += '0' + label = stroke_sequence + + length = len(label) + + input_tensor = np.zeros(self.max_text_len).astype("int64") + for j in range(length - 1): + input_tensor[j + 1] = self.english_stroke_dict[label[j]] + + return length, input_tensor + + def __call__(self, data): + text = data['label'] + length, input_tensor = self.encode(text) + + data["length"] = length + data["input_tensor"] = input_tensor + if text is None: + return None + return data + + class SPINLabelEncode(AttnLabelEncode): """ Convert between text-label and text-index """ @@ -1333,3 +1395,29 @@ class VLLabelEncode(BaseRecLabelEncode): data['label_res'] = np.array(label_res) data['label_sub'] = np.array(label_sub) return data + + +class CTLabelEncode(object): + def __init__(self, **kwargs): + pass + + def __call__(self, data): + label = data['label'] + + label = json.loads(label) + nBox = len(label) + boxes, txts = [], [] + for bno in range(0, nBox): + box = label[bno]['points'] + box = np.array(box) + + boxes.append(box) + txt = label[bno]['transcription'] + txts.append(txt) + + if len(boxes) == 0: + return None + + data['polys'] = boxes + data['texts'] = txts + return data \ No newline at end of file diff --git a/ppocr/data/imaug/operators.py b/ppocr/data/imaug/operators.py index 04cc2848fb4d25baaf553c6eda235ddb0e86511f..5e84b1aac9c54d8a8283468af6826ca917ba0384 100644 --- a/ppocr/data/imaug/operators.py +++ b/ppocr/data/imaug/operators.py @@ -24,6 +24,7 @@ import six import cv2 import numpy as np import math +from PIL import Image class DecodeImage(object): @@ -224,6 +225,8 @@ class DetResizeForTest(object): def __call__(self, data): img = data['image'] src_h, src_w, _ = img.shape + if sum([src_h, src_w]) < 64: + img = self.image_padding(img) if self.resize_type == 0: # img, shape = self.resize_image_type0(img) @@ -237,6 +240,12 @@ class DetResizeForTest(object): data['shape'] = np.array([src_h, src_w, ratio_h, ratio_w]) return data + def image_padding(self, im, value=0): + h, w, c = im.shape + im_pad = np.zeros((max(32, h), max(32, w), c), np.uint8) + value + im_pad[:h, :w, :] = im + return im_pad + def resize_image_type1(self, img): resize_h, resize_w = self.image_shape ori_h, ori_w = img.shape[:2] # (h, w, c) @@ -440,3 +449,52 @@ class KieResize(object): points[:, 0::2] = np.clip(points[:, 0::2], 0, img_shape[1]) points[:, 1::2] = np.clip(points[:, 1::2], 0, img_shape[0]) return points + + +class SRResize(object): + def 
__init__(self, + imgH=32, + imgW=128, + down_sample_scale=4, + keep_ratio=False, + min_ratio=1, + mask=False, + infer_mode=False, + **kwargs): + self.imgH = imgH + self.imgW = imgW + self.keep_ratio = keep_ratio + self.min_ratio = min_ratio + self.down_sample_scale = down_sample_scale + self.mask = mask + self.infer_mode = infer_mode + + def __call__(self, data): + imgH = self.imgH + imgW = self.imgW + images_lr = data["image_lr"] + transform2 = ResizeNormalize( + (imgW // self.down_sample_scale, imgH // self.down_sample_scale)) + images_lr = transform2(images_lr) + data["img_lr"] = images_lr + if self.infer_mode: + return data + + images_HR = data["image_hr"] + label_strs = data["label"] + transform = ResizeNormalize((imgW, imgH)) + images_HR = transform(images_HR) + data["img_hr"] = images_HR + return data + + +class ResizeNormalize(object): + def __init__(self, size, interpolation=Image.BICUBIC): + self.size = size + self.interpolation = interpolation + + def __call__(self, img): + img = img.resize(self.size, self.interpolation) + img_numpy = np.array(img).astype("float32") + img_numpy = img_numpy.transpose((2, 0, 1)) / 255 + return img_numpy diff --git a/ppocr/data/imaug/pg_process.py b/ppocr/data/imaug/pg_process.py index 53031064c019ddce00c7546f898ac67a7f0459f9..f1e5f912b7a55dc3b9e883a9f4f8c5de482dcd5a 100644 --- a/ppocr/data/imaug/pg_process.py +++ b/ppocr/data/imaug/pg_process.py @@ -15,6 +15,8 @@ import math import cv2 import numpy as np +from skimage.morphology._skeletonize import thin +from ppocr.utils.e2e_utils.extract_textpoint_fast import sort_and_expand_with_direction_v2 __all__ = ['PGProcessTrain'] @@ -26,17 +28,24 @@ class PGProcessTrain(object): max_text_nums, tcl_len, batch_size=14, + use_resize=True, + use_random_crop=False, min_crop_size=24, min_text_size=4, max_text_size=512, + point_gather_mode=None, **kwargs): self.tcl_len = tcl_len self.max_text_length = max_text_length self.max_text_nums = max_text_nums self.batch_size = batch_size - self.min_crop_size = min_crop_size + if use_random_crop is True: + self.min_crop_size = min_crop_size + self.use_random_crop = use_random_crop self.min_text_size = min_text_size self.max_text_size = max_text_size + self.use_resize = use_resize + self.point_gather_mode = point_gather_mode self.Lexicon_Table = self.get_dict(character_dict_path) self.pad_num = len(self.Lexicon_Table) self.img_id = 0 @@ -282,6 +291,95 @@ class PGProcessTrain(object): pos_m[:keep] = 1.0 return pos_l, pos_m + def fit_and_gather_tcl_points_v3(self, + min_area_quad, + poly, + max_h, + max_w, + fixed_point_num=64, + img_id=0, + reference_height=3): + """ + Find the center point of poly as key_points, then fit and gather. 
+ """ + det_mask = np.zeros((int(max_h / self.ds_ratio), + int(max_w / self.ds_ratio))).astype(np.float32) + + # score_big_map + cv2.fillPoly(det_mask, + np.round(poly / self.ds_ratio).astype(np.int32), 1.0) + det_mask = cv2.resize( + det_mask, dsize=None, fx=self.ds_ratio, fy=self.ds_ratio) + det_mask = np.array(det_mask > 1e-3, dtype='float32') + + f_direction = self.f_direction + skeleton_map = thin(det_mask.astype(np.uint8)) + instance_count, instance_label_map = cv2.connectedComponents( + skeleton_map.astype(np.uint8), connectivity=8) + + ys, xs = np.where(instance_label_map == 1) + pos_list = list(zip(ys, xs)) + if len(pos_list) < 3: + return None + pos_list_sorted = sort_and_expand_with_direction_v2( + pos_list, f_direction, det_mask) + + pos_list_sorted = np.array(pos_list_sorted) + length = len(pos_list_sorted) - 1 + insert_num = 0 + for index in range(length): + stride_y = np.abs(pos_list_sorted[index + insert_num][0] - + pos_list_sorted[index + 1 + insert_num][0]) + stride_x = np.abs(pos_list_sorted[index + insert_num][1] - + pos_list_sorted[index + 1 + insert_num][1]) + max_points = int(max(stride_x, stride_y)) + + stride = (pos_list_sorted[index + insert_num] - + pos_list_sorted[index + 1 + insert_num]) / (max_points) + insert_num_temp = max_points - 1 + + for i in range(int(insert_num_temp)): + insert_value = pos_list_sorted[index + insert_num] - (i + 1 + ) * stride + insert_index = index + i + 1 + insert_num + pos_list_sorted = np.insert( + pos_list_sorted, insert_index, insert_value, axis=0) + insert_num += insert_num_temp + + pos_info = np.array(pos_list_sorted).reshape(-1, 2).astype( + np.float32) # xy-> yx + + point_num = len(pos_info) + if point_num > fixed_point_num: + keep_ids = [ + int((point_num * 1.0 / fixed_point_num) * x) + for x in range(fixed_point_num) + ] + pos_info = pos_info[keep_ids, :] + + keep = int(min(len(pos_info), fixed_point_num)) + reference_width = (np.abs(poly[0, 0, 0] - poly[-1, 1, 0]) + + np.abs(poly[0, 3, 0] - poly[-1, 2, 0])) // 2 + if np.random.rand() < 1: + dh = (np.random.rand(keep) - 0.5) * reference_height + offset = np.random.rand() - 0.5 + dw = np.array([[0, offset * reference_width * 0.2]]) + random_float_h = np.array([1, 0]).reshape([1, 2]) * dh.reshape( + [keep, 1]) + random_float_w = dw.repeat(keep, axis=0) + pos_info += random_float_h + pos_info += random_float_w + pos_info[:, 0] = np.clip(pos_info[:, 0], 0, max_h - 1) + pos_info[:, 1] = np.clip(pos_info[:, 1], 0, max_w - 1) + + # padding to fixed length + pos_l = np.zeros((self.tcl_len, 3), dtype=np.int32) + pos_l[:, 0] = np.ones((self.tcl_len, )) * img_id + pos_m = np.zeros((self.tcl_len, 1), dtype=np.float32) + pos_l[:keep, 1:] = np.round(pos_info).astype(np.int32) + pos_m[:keep] = 1.0 + return pos_l, pos_m + def generate_direction_map(self, poly_quads, n_char, direction_map): """ """ @@ -334,6 +432,7 @@ class PGProcessTrain(object): """ Generate polygon. 
""" + self.ds_ratio = ds_ratio score_map_big = np.zeros( ( h, @@ -384,7 +483,6 @@ class PGProcessTrain(object): text_label = text_strs[poly_idx] text_label = self.prepare_text_label(text_label, self.Lexicon_Table) - text_label_index_list = [[self.Lexicon_Table.index(c_)] for c_ in text_label if c_ in self.Lexicon_Table] @@ -432,14 +530,30 @@ class PGProcessTrain(object): # pos info average_shrink_height = self.calculate_average_height( stcl_quads) - pos_l, pos_m = self.fit_and_gather_tcl_points_v2( - min_area_quad, - poly, - max_h=h, - max_w=w, - fixed_point_num=64, - img_id=self.img_id, - reference_height=average_shrink_height) + + if self.point_gather_mode == 'align': + self.f_direction = direction_map[:, :, :-1].copy() + pos_res = self.fit_and_gather_tcl_points_v3( + min_area_quad, + stcl_quads, + max_h=h, + max_w=w, + fixed_point_num=64, + img_id=self.img_id, + reference_height=average_shrink_height) + if pos_res is None: + continue + pos_l, pos_m = pos_res[0], pos_res[1] + + else: + pos_l, pos_m = self.fit_and_gather_tcl_points_v2( + min_area_quad, + poly, + max_h=h, + max_w=w, + fixed_point_num=64, + img_id=self.img_id, + reference_height=average_shrink_height) label_l = text_label_index_list if len(text_label_index_list) < 2: @@ -770,27 +884,41 @@ class PGProcessTrain(object): text_polys[:, :, 0] *= asp_wx text_polys[:, :, 1] *= asp_hy - h, w, _ = im.shape - if max(h, w) > 2048: - rd_scale = 2048.0 / max(h, w) - im = cv2.resize(im, dsize=None, fx=rd_scale, fy=rd_scale) - text_polys *= rd_scale - h, w, _ = im.shape - if min(h, w) < 16: - return None - - # no background - im, text_polys, text_tags, hv_tags, text_strs = self.crop_area( - im, - text_polys, - text_tags, - hv_tags, - text_strs, - crop_background=False) + if self.use_resize is True: + ori_h, ori_w, _ = im.shape + if max(ori_h, ori_w) < 200: + ratio = 200 / max(ori_h, ori_w) + im = cv2.resize(im, (int(ori_w * ratio), int(ori_h * ratio))) + text_polys[:, :, 0] *= ratio + text_polys[:, :, 1] *= ratio + + if max(ori_h, ori_w) > 512: + ratio = 512 / max(ori_h, ori_w) + im = cv2.resize(im, (int(ori_w * ratio), int(ori_h * ratio))) + text_polys[:, :, 0] *= ratio + text_polys[:, :, 1] *= ratio + elif self.use_random_crop is True: + h, w, _ = im.shape + if max(h, w) > 2048: + rd_scale = 2048.0 / max(h, w) + im = cv2.resize(im, dsize=None, fx=rd_scale, fy=rd_scale) + text_polys *= rd_scale + h, w, _ = im.shape + if min(h, w) < 16: + return None + + # no background + im, text_polys, text_tags, hv_tags, text_strs = self.crop_area( + im, + text_polys, + text_tags, + hv_tags, + text_strs, + crop_background=False) if text_polys.shape[0] == 0: return None - # # continue for all ignore case + # continue for all ignore case if np.sum((text_tags * 1.0)) >= text_tags.size: return None new_h, new_w, _ = im.shape diff --git a/ppocr/data/imaug/rec_img_aug.py b/ppocr/data/imaug/rec_img_aug.py index 725b4b0617c2f0808c7bf99077e2f62caa3afbf0..89022d85ad8f24f61ef7725319ab46be01fe4d16 100644 --- a/ppocr/data/imaug/rec_img_aug.py +++ b/ppocr/data/imaug/rec_img_aug.py @@ -414,6 +414,23 @@ class SVTRRecResizeImg(object): data['valid_ratio'] = valid_ratio return data +class RobustScannerRecResizeImg(object): + def __init__(self, image_shape, max_text_length, width_downsample_ratio=0.25, **kwargs): + self.image_shape = image_shape + self.width_downsample_ratio = width_downsample_ratio + self.max_text_length = max_text_length + + def __call__(self, data): + img = data['image'] + norm_img, resize_shape, pad_shape, valid_ratio = resize_norm_img_sar( + img, 
self.image_shape, self.width_downsample_ratio) + word_positons = np.array(range(0, self.max_text_length)).astype('int64') + data['image'] = norm_img + data['resized_shape'] = resize_shape + data['pad_shape'] = pad_shape + data['valid_ratio'] = valid_ratio + data['word_positons'] = word_positons + return data def resize_norm_img_sar(img, image_shape, width_downsample_ratio=0.25): imgC, imgH, imgW_min, imgW_max = image_shape @@ -485,7 +502,7 @@ def resize_norm_img_chinese(img, image_shape): max_wh_ratio = imgW * 1.0 / imgH h, w = img.shape[0], img.shape[1] ratio = w * 1.0 / h - max_wh_ratio = max(max_wh_ratio, ratio) + max_wh_ratio = min(max(max_wh_ratio, ratio), max_wh_ratio) imgW = int(imgH * max_wh_ratio) if math.ceil(imgH * ratio) > imgW: resized_w = imgW diff --git a/ppocr/data/imaug/table_ops.py b/ppocr/data/imaug/table_ops.py index 8d139190ab4b22c553036ddc8e31cfbc7ec3423d..c2c2fb2be6c80fdeb637717af2bbe122e1be999c 100644 --- a/ppocr/data/imaug/table_ops.py +++ b/ppocr/data/imaug/table_ops.py @@ -206,7 +206,7 @@ class ResizeTableImage(object): data['bboxes'] = data['bboxes'] * ratio data['image'] = resize_img data['src_img'] = img - data['shape'] = np.array([resize_h, resize_w, ratio, ratio]) + data['shape'] = np.array([height, width, ratio, ratio]) data['max_len'] = self.max_len return data diff --git a/ppocr/data/lmdb_dataset.py b/ppocr/data/lmdb_dataset.py index e1b49809d199096ad06b90c4562aa5dbfa634db1..3a51cefec2f1da2c96cceb6482d8303aa136b78a 100644 --- a/ppocr/data/lmdb_dataset.py +++ b/ppocr/data/lmdb_dataset.py @@ -16,6 +16,9 @@ import os from paddle.io import Dataset import lmdb import cv2 +import string +import six +from PIL import Image from .imaug import transform, create_operators @@ -116,3 +119,58 @@ class LMDBDataSet(Dataset): def __len__(self): return self.data_idx_order_list.shape[0] + + +class LMDBDataSetSR(LMDBDataSet): + def buf2PIL(self, txn, key, type='RGB'): + imgbuf = txn.get(key) + buf = six.BytesIO() + buf.write(imgbuf) + buf.seek(0) + im = Image.open(buf).convert(type) + return im + + def str_filt(self, str_, voc_type): + alpha_dict = { + 'digit': string.digits, + 'lower': string.digits + string.ascii_lowercase, + 'upper': string.digits + string.ascii_letters, + 'all': string.digits + string.ascii_letters + string.punctuation + } + if voc_type == 'lower': + str_ = str_.lower() + for char in str_: + if char not in alpha_dict[voc_type]: + str_ = str_.replace(char, '') + return str_ + + def get_lmdb_sample_info(self, txn, index): + self.voc_type = 'upper' + self.max_len = 100 + self.test = False + label_key = b'label-%09d' % index + word = str(txn.get(label_key).decode()) + img_HR_key = b'image_hr-%09d' % index # 128*32 + img_lr_key = b'image_lr-%09d' % index # 64*16 + try: + img_HR = self.buf2PIL(txn, img_HR_key, 'RGB') + img_lr = self.buf2PIL(txn, img_lr_key, 'RGB') + except IOError or len(word) > self.max_len: + return self[index + 1] + label_str = self.str_filt(word, self.voc_type) + return img_HR, img_lr, label_str + + def __getitem__(self, idx): + lmdb_idx, file_idx = self.data_idx_order_list[idx] + lmdb_idx = int(lmdb_idx) + file_idx = int(file_idx) + sample_info = self.get_lmdb_sample_info(self.lmdb_sets[lmdb_idx]['txn'], + file_idx) + if sample_info is None: + return self.__getitem__(np.random.randint(self.__len__())) + img_HR, img_lr, label_str = sample_info + data = {'image_hr': img_HR, 'image_lr': img_lr, 'label': label_str} + outs = transform(data, self.ops) + if outs is None: + return self.__getitem__(np.random.randint(self.__len__())) + return outs 
diff --git a/ppocr/losses/__init__.py b/ppocr/losses/__init__.py index bb82c7e0060fc561b6ebd8a71968e4f0ce7003e1..02525b3d50ad87509a6cba6fb2c1b00cb0add56e 100755 --- a/ppocr/losses/__init__.py +++ b/ppocr/losses/__init__.py @@ -25,6 +25,7 @@ from .det_east_loss import EASTLoss from .det_sast_loss import SASTLoss from .det_pse_loss import PSELoss from .det_fce_loss import FCELoss +from .det_ct_loss import CTLoss # rec loss from .rec_ctc_loss import CTCLoss @@ -52,11 +53,14 @@ from .basic_loss import DistanceLoss from .combined_loss import CombinedLoss # table loss -from .table_att_loss import TableAttentionLoss +from .table_att_loss import TableAttentionLoss, SLALoss from .table_master_loss import TableMasterLoss # vqa token loss from .vqa_token_layoutlm_loss import VQASerTokenLayoutLMLoss +# sr loss +from .stroke_focus_loss import StrokeFocusLoss + def build_loss(config): support_dict = [ @@ -64,7 +68,8 @@ def build_loss(config): 'ClsLoss', 'AttentionLoss', 'SRNLoss', 'PGLoss', 'CombinedLoss', 'CELoss', 'TableAttentionLoss', 'SARLoss', 'AsterLoss', 'SDMGRLoss', 'VQASerTokenLayoutLMLoss', 'LossFromOutput', 'PRENLoss', 'MultiLoss', - 'TableMasterLoss', 'SPINAttentionLoss', 'VLLoss' + 'TableMasterLoss', 'SPINAttentionLoss', 'VLLoss', 'StrokeFocusLoss', + 'SLALoss', 'CTLoss' ] config = copy.deepcopy(config) module_name = config.pop('name') diff --git a/ppocr/losses/basic_loss.py b/ppocr/losses/basic_loss.py index da9faa08bc5ca35c5d65f7a7bfbbdd67192f052b..58410b4db2157074c2cb0f7db590c84021e10ace 100644 --- a/ppocr/losses/basic_loss.py +++ b/ppocr/losses/basic_loss.py @@ -60,19 +60,19 @@ class KLJSLoss(object): ], "mode can only be one of ['kl', 'KL', 'js', 'JS']" self.mode = mode - def __call__(self, p1, p2, reduction="mean"): + def __call__(self, p1, p2, reduction="mean", eps=1e-5): if self.mode.lower() == 'kl': loss = paddle.multiply(p2, - paddle.log((p2 + 1e-5) / (p1 + 1e-5) + 1e-5)) - loss += paddle.multiply( - p1, paddle.log((p1 + 1e-5) / (p2 + 1e-5) + 1e-5)) + paddle.log((p2 + eps) / (p1 + eps) + eps)) + loss += paddle.multiply(p1, + paddle.log((p1 + eps) / (p2 + eps) + eps)) loss *= 0.5 elif self.mode.lower() == "js": loss = paddle.multiply( - p2, paddle.log((2 * p2 + 1e-5) / (p1 + p2 + 1e-5) + 1e-5)) + p2, paddle.log((2 * p2 + eps) / (p1 + p2 + eps) + eps)) loss += paddle.multiply( - p1, paddle.log((2 * p1 + 1e-5) / (p1 + p2 + 1e-5) + 1e-5)) + p1, paddle.log((2 * p1 + eps) / (p1 + p2 + eps) + eps)) loss *= 0.5 else: raise ValueError( diff --git a/ppocr/losses/det_ct_loss.py b/ppocr/losses/det_ct_loss.py new file mode 100755 index 0000000000000000000000000000000000000000..f48c95be4f84e2d8520363379b3061fa4245c105 --- /dev/null +++ b/ppocr/losses/det_ct_loss.py @@ -0,0 +1,276 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +This code is refer from: +https://github.com/shengtao96/CentripetalText/tree/main/models/loss +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import nn +import paddle.nn.functional as F +import numpy as np + + +def ohem_single(score, gt_text, training_mask): + # online hard example mining + + pos_num = int(paddle.sum(gt_text > 0.5)) - int( + paddle.sum((gt_text > 0.5) & (training_mask <= 0.5))) + + if pos_num == 0: + # selected_mask = gt_text.copy() * 0 # may be not good + selected_mask = training_mask + selected_mask = paddle.cast( + selected_mask.reshape( + (1, selected_mask.shape[0], selected_mask.shape[1])), "float32") + return selected_mask + + neg_num = int(paddle.sum((gt_text <= 0.5) & (training_mask > 0.5))) + neg_num = int(min(pos_num * 3, neg_num)) + + if neg_num == 0: + selected_mask = training_mask + selected_mask = paddle.cast( + selected_mask.reshape( + (1, selected_mask.shape[0], selected_mask.shape[1])), "float32") + return selected_mask + + # hard example + neg_score = score[(gt_text <= 0.5) & (training_mask > 0.5)] + neg_score_sorted = paddle.sort(-neg_score) + threshold = -neg_score_sorted[neg_num - 1] + + selected_mask = ((score >= threshold) | + (gt_text > 0.5)) & (training_mask > 0.5) + selected_mask = paddle.cast( + selected_mask.reshape( + (1, selected_mask.shape[0], selected_mask.shape[1])), "float32") + return selected_mask + + +def ohem_batch(scores, gt_texts, training_masks): + selected_masks = [] + for i in range(scores.shape[0]): + selected_masks.append( + ohem_single(scores[i, :, :], gt_texts[i, :, :], training_masks[ + i, :, :])) + + selected_masks = paddle.cast(paddle.concat(selected_masks, 0), "float32") + return selected_masks + + +def iou_single(a, b, mask, n_class): + EPS = 1e-6 + valid = mask == 1 + a = a[valid] + b = b[valid] + miou = [] + + # iou of each class + for i in range(n_class): + inter = paddle.cast(((a == i) & (b == i)), "float32") + union = paddle.cast(((a == i) | (b == i)), "float32") + + miou.append(paddle.sum(inter) / (paddle.sum(union) + EPS)) + miou = sum(miou) / len(miou) + return miou + + +def iou(a, b, mask, n_class=2, reduce=True): + batch_size = a.shape[0] + + a = a.reshape((batch_size, -1)) + b = b.reshape((batch_size, -1)) + mask = mask.reshape((batch_size, -1)) + + iou = paddle.zeros((batch_size, ), dtype="float32") + for i in range(batch_size): + iou[i] = iou_single(a[i], b[i], mask[i], n_class) + + if reduce: + iou = paddle.mean(iou) + return iou + + +class DiceLoss(nn.Layer): + def __init__(self, loss_weight=1.0): + super(DiceLoss, self).__init__() + self.loss_weight = loss_weight + + def forward(self, input, target, mask, reduce=True): + batch_size = input.shape[0] + input = F.sigmoid(input) # scale to 0-1 + + input = input.reshape((batch_size, -1)) + target = paddle.cast(target.reshape((batch_size, -1)), "float32") + mask = paddle.cast(mask.reshape((batch_size, -1)), "float32") + + input = input * mask + target = target * mask + + a = paddle.sum(input * target, axis=1) + b = paddle.sum(input * input, axis=1) + 0.001 + c = paddle.sum(target * target, axis=1) + 0.001 + d = (2 * a) / (b + c) + loss = 1 - d + + loss = self.loss_weight * loss + + if reduce: + loss = paddle.mean(loss) + + return loss + + +class SmoothL1Loss(nn.Layer): + def __init__(self, beta=1.0, loss_weight=1.0): + super(SmoothL1Loss, self).__init__() + self.beta = beta + self.loss_weight = loss_weight + + np_coord = np.zeros(shape=[640, 640, 2], 
dtype=np.int64) + for i in range(640): + for j in range(640): + np_coord[i, j, 0] = j + np_coord[i, j, 1] = i + np_coord = np_coord.reshape((-1, 2)) + + self.coord = self.create_parameter( + shape=[640 * 640, 2], + dtype="int32", # NOTE: not support "int64" before paddle 2.3.1 + default_initializer=nn.initializer.Assign(value=np_coord)) + self.coord.stop_gradient = True + + def forward_single(self, input, target, mask, beta=1.0, eps=1e-6): + batch_size = input.shape[0] + + diff = paddle.abs(input - target) * mask.unsqueeze(1) + loss = paddle.where(diff < beta, 0.5 * diff * diff / beta, + diff - 0.5 * beta) + loss = paddle.cast(loss.reshape((batch_size, -1)), "float32") + mask = paddle.cast(mask.reshape((batch_size, -1)), "float32") + loss = paddle.sum(loss, axis=-1) + loss = loss / (mask.sum(axis=-1) + eps) + + return loss + + def select_single(self, distance, gt_instance, gt_kernel_instance, + training_mask): + + with paddle.no_grad(): + # paddle 2.3.1, paddle.slice not support: + # distance[:, self.coord[:, 1], self.coord[:, 0]] + select_distance_list = [] + for i in range(2): + tmp1 = distance[i, :] + tmp2 = tmp1[self.coord[:, 1], self.coord[:, 0]] + select_distance_list.append(tmp2.unsqueeze(0)) + select_distance = paddle.concat(select_distance_list, axis=0) + + off_points = paddle.cast( + self.coord, "float32") + 10 * select_distance.transpose((1, 0)) + + off_points = paddle.cast(off_points, "int64") + off_points = paddle.clip(off_points, 0, distance.shape[-1] - 1) + + selected_mask = ( + gt_instance[self.coord[:, 1], self.coord[:, 0]] != + gt_kernel_instance[off_points[:, 1], off_points[:, 0]]) + selected_mask = paddle.cast( + selected_mask.reshape((1, -1, distance.shape[-1])), "int64") + selected_training_mask = selected_mask * training_mask + + return selected_training_mask + + def forward(self, + distances, + gt_instances, + gt_kernel_instances, + training_masks, + gt_distances, + reduce=True): + + selected_training_masks = [] + for i in range(distances.shape[0]): + selected_training_masks.append( + self.select_single(distances[i, :, :, :], gt_instances[i, :, :], + gt_kernel_instances[i, :, :], training_masks[ + i, :, :])) + selected_training_masks = paddle.cast( + paddle.concat(selected_training_masks, 0), "float32") + + loss = self.forward_single(distances, gt_distances, + selected_training_masks, self.beta) + loss = self.loss_weight * loss + + with paddle.no_grad(): + batch_size = distances.shape[0] + false_num = selected_training_masks.reshape((batch_size, -1)) + false_num = false_num.sum(axis=-1) + total_num = paddle.cast( + training_masks.reshape((batch_size, -1)), "float32") + total_num = total_num.sum(axis=-1) + iou_text = (total_num - false_num) / (total_num + 1e-6) + + if reduce: + loss = paddle.mean(loss) + + return loss, iou_text + + +class CTLoss(nn.Layer): + def __init__(self): + super(CTLoss, self).__init__() + self.kernel_loss = DiceLoss() + self.loc_loss = SmoothL1Loss(beta=0.1, loss_weight=0.05) + + def forward(self, preds, batch): + imgs = batch[0] + out = preds['maps'] + gt_kernels, training_masks, gt_instances, gt_kernel_instances, training_mask_distances, gt_distances = batch[ + 1:] + + kernels = out[:, 0, :, :] + distances = out[:, 1:, :, :] + + # kernel loss + selected_masks = ohem_batch(kernels, gt_kernels, training_masks) + + loss_kernel = self.kernel_loss( + kernels, gt_kernels, selected_masks, reduce=False) + + iou_kernel = iou(paddle.cast((kernels > 0), "int64"), + gt_kernels, + training_masks, + reduce=False) + losses = dict(loss_kernels=loss_kernel, 
) + + # loc loss + loss_loc, iou_text = self.loc_loss( + distances, + gt_instances, + gt_kernel_instances, + training_mask_distances, + gt_distances, + reduce=False) + losses.update(dict(loss_loc=loss_loc, )) + + loss_all = loss_kernel + loss_loc + losses = {'loss': loss_all} + + return losses diff --git a/ppocr/losses/e2e_pg_loss.py b/ppocr/losses/e2e_pg_loss.py index 10a8ed0aa907123b155976ba498426604f23c2b0..aff67b7ce3c208bf9c7b1371e095eac8c70ce9df 100644 --- a/ppocr/losses/e2e_pg_loss.py +++ b/ppocr/losses/e2e_pg_loss.py @@ -89,12 +89,13 @@ class PGLoss(nn.Layer): tcl_pos = paddle.reshape(tcl_pos, [-1, 3]) tcl_pos = paddle.cast(tcl_pos, dtype=int) f_tcl_char = paddle.gather_nd(f_char, tcl_pos) - f_tcl_char = paddle.reshape(f_tcl_char, - [-1, 64, 37]) # len(Lexicon_Table)+1 - f_tcl_char_fg, f_tcl_char_bg = paddle.split(f_tcl_char, [36, 1], axis=2) + f_tcl_char = paddle.reshape( + f_tcl_char, [-1, 64, self.pad_num + 1]) # len(Lexicon_Table)+1 + f_tcl_char_fg, f_tcl_char_bg = paddle.split( + f_tcl_char, [self.pad_num, 1], axis=2) f_tcl_char_bg = f_tcl_char_bg * tcl_mask + (1.0 - tcl_mask) * 20.0 b, c, l = tcl_mask.shape - tcl_mask_fg = paddle.expand(x=tcl_mask, shape=[b, c, 36 * l]) + tcl_mask_fg = paddle.expand(x=tcl_mask, shape=[b, c, self.pad_num * l]) tcl_mask_fg.stop_gradient = True f_tcl_char_fg = f_tcl_char_fg * tcl_mask_fg + (1.0 - tcl_mask_fg) * ( -20.0) diff --git a/ppocr/losses/stroke_focus_loss.py b/ppocr/losses/stroke_focus_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..002bbc34774cc80599015492762ca448f593df0f --- /dev/null +++ b/ppocr/losses/stroke_focus_loss.py @@ -0,0 +1,68 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +This code is refer from: +https://github.com/FudanVI/FudanOCR/blob/main/text-gestalt/loss/stroke_focus_loss.py +""" +import cv2 +import sys +import time +import string +import random +import numpy as np +import paddle.nn as nn +import paddle + + +class StrokeFocusLoss(nn.Layer): + def __init__(self, character_dict_path=None, **kwargs): + super(StrokeFocusLoss, self).__init__(character_dict_path) + self.mse_loss = nn.MSELoss() + self.ce_loss = nn.CrossEntropyLoss() + self.l1_loss = nn.L1Loss() + self.english_stroke_alphabet = '0123456789' + self.english_stroke_dict = {} + for index in range(len(self.english_stroke_alphabet)): + self.english_stroke_dict[self.english_stroke_alphabet[ + index]] = index + + stroke_decompose_lines = open(character_dict_path, 'r').readlines() + self.dic = {} + for line in stroke_decompose_lines: + line = line.strip() + character, sequence = line.split() + self.dic[character] = sequence + + def forward(self, pred, data): + + sr_img = pred["sr_img"] + hr_img = pred["hr_img"] + + mse_loss = self.mse_loss(sr_img, hr_img) + word_attention_map_gt = pred["word_attention_map_gt"] + word_attention_map_pred = pred["word_attention_map_pred"] + + hr_pred = pred["hr_pred"] + sr_pred = pred["sr_pred"] + + attention_loss = paddle.nn.functional.l1_loss(word_attention_map_gt, + word_attention_map_pred) + + loss = (mse_loss + attention_loss * 50) * 100 + + return { + "mse_loss": mse_loss, + "attention_loss": attention_loss, + "loss": loss + } diff --git a/ppocr/losses/table_att_loss.py b/ppocr/losses/table_att_loss.py index 3496c9072553d839017eaa017fe47dfb66fb9d3b..f1771847b46b99d8cf2a3ae69e7e990ee02f26a5 100644 --- a/ppocr/losses/table_att_loss.py +++ b/ppocr/losses/table_att_loss.py @@ -22,65 +22,11 @@ from paddle.nn import functional as F class TableAttentionLoss(nn.Layer): - def __init__(self, - structure_weight, - loc_weight, - use_giou=False, - giou_weight=1.0, - **kwargs): + def __init__(self, structure_weight, loc_weight, **kwargs): super(TableAttentionLoss, self).__init__() self.loss_func = nn.CrossEntropyLoss(weight=None, reduction='none') self.structure_weight = structure_weight self.loc_weight = loc_weight - self.use_giou = use_giou - self.giou_weight = giou_weight - - def giou_loss(self, preds, bbox, eps=1e-7, reduction='mean'): - ''' - :param preds:[[x1,y1,x2,y2], [x1,y1,x2,y2],,,] - :param bbox:[[x1,y1,x2,y2], [x1,y1,x2,y2],,,] - :return: loss - ''' - ix1 = paddle.maximum(preds[:, 0], bbox[:, 0]) - iy1 = paddle.maximum(preds[:, 1], bbox[:, 1]) - ix2 = paddle.minimum(preds[:, 2], bbox[:, 2]) - iy2 = paddle.minimum(preds[:, 3], bbox[:, 3]) - - iw = paddle.clip(ix2 - ix1 + 1e-3, 0., 1e10) - ih = paddle.clip(iy2 - iy1 + 1e-3, 0., 1e10) - - # overlap - inters = iw * ih - - # union - uni = (preds[:, 2] - preds[:, 0] + 1e-3) * ( - preds[:, 3] - preds[:, 1] + 1e-3) + (bbox[:, 2] - bbox[:, 0] + 1e-3 - ) * (bbox[:, 3] - bbox[:, 1] + - 1e-3) - inters + eps - - # ious - ious = inters / uni - - ex1 = paddle.minimum(preds[:, 0], bbox[:, 0]) - ey1 = paddle.minimum(preds[:, 1], bbox[:, 1]) - ex2 = paddle.maximum(preds[:, 2], bbox[:, 2]) - ey2 = paddle.maximum(preds[:, 3], bbox[:, 3]) - ew = paddle.clip(ex2 - ex1 + 1e-3, 0., 1e10) - eh = paddle.clip(ey2 - ey1 + 1e-3, 0., 1e10) - - # enclose erea - enclose = ew * eh + eps - giou = ious - (enclose - uni) / enclose - - loss = 1 - giou - - if reduction == 'mean': - loss = paddle.mean(loss) - elif reduction == 'sum': - loss = paddle.sum(loss) - else: - raise NotImplementedError - return loss def forward(self, predicts, batch): 
structure_probs = predicts['structure_probs'] @@ -100,20 +46,48 @@ class TableAttentionLoss(nn.Layer): loc_targets_mask = loc_targets_mask[:, 1:, :] loc_loss = F.mse_loss(loc_preds * loc_targets_mask, loc_targets) * self.loc_weight - if self.use_giou: - loc_loss_giou = self.giou_loss(loc_preds * loc_targets_mask, - loc_targets) * self.giou_weight - total_loss = structure_loss + loc_loss + loc_loss_giou - return { - 'loss': total_loss, - "structure_loss": structure_loss, - "loc_loss": loc_loss, - "loc_loss_giou": loc_loss_giou - } - else: - total_loss = structure_loss + loc_loss - return { - 'loss': total_loss, - "structure_loss": structure_loss, - "loc_loss": loc_loss - } + + total_loss = structure_loss + loc_loss + return { + 'loss': total_loss, + "structure_loss": structure_loss, + "loc_loss": loc_loss + } + + +class SLALoss(nn.Layer): + def __init__(self, structure_weight, loc_weight, loc_loss='mse', **kwargs): + super(SLALoss, self).__init__() + self.loss_func = nn.CrossEntropyLoss(weight=None, reduction='mean') + self.structure_weight = structure_weight + self.loc_weight = loc_weight + self.loc_loss = loc_loss + self.eps = 1e-12 + + def forward(self, predicts, batch): + structure_probs = predicts['structure_probs'] + structure_targets = batch[1].astype("int64") + structure_targets = structure_targets[:, 1:] + + structure_loss = self.loss_func(structure_probs, structure_targets) + + structure_loss = paddle.mean(structure_loss) * self.structure_weight + + loc_preds = predicts['loc_preds'] + loc_targets = batch[2].astype("float32") + loc_targets_mask = batch[3].astype("float32") + loc_targets = loc_targets[:, 1:, :] + loc_targets_mask = loc_targets_mask[:, 1:, :] + + loc_loss = F.smooth_l1_loss( + loc_preds * loc_targets_mask, + loc_targets * loc_targets_mask, + reduction='sum') * self.loc_weight + + loc_loss = loc_loss / (loc_targets_mask.sum() + self.eps) + total_loss = structure_loss + loc_loss + return { + 'loss': total_loss, + "structure_loss": structure_loss, + "loc_loss": loc_loss + } diff --git a/ppocr/metrics/__init__.py b/ppocr/metrics/__init__.py index c244066c9f35570143403dd485e3422786711832..a39d0a464f3f96b44d23cec55768223ca41311fa 100644 --- a/ppocr/metrics/__init__.py +++ b/ppocr/metrics/__init__.py @@ -30,13 +30,15 @@ from .table_metric import TableMetric from .kie_metric import KIEMetric from .vqa_token_ser_metric import VQASerTokenMetric from .vqa_token_re_metric import VQAReTokenMetric +from .sr_metric import SRMetric +from .ct_metric import CTMetric def build_metric(config): support_dict = [ "DetMetric", "DetFCEMetric", "RecMetric", "ClsMetric", "E2EMetric", "DistillationMetric", "TableMetric", 'KIEMetric', 'VQASerTokenMetric', - 'VQAReTokenMetric' + 'VQAReTokenMetric', 'SRMetric', 'CTMetric' ] config = copy.deepcopy(config) diff --git a/ppocr/metrics/ct_metric.py b/ppocr/metrics/ct_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..a7634230a23027a5dd5c32a7b8eb87ee4a229076 --- /dev/null +++ b/ppocr/metrics/ct_metric.py @@ -0,0 +1,52 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +from scipy import io +import numpy as np + +from ppocr.utils.e2e_metric.Deteval import combine_results, get_score_C + + +class CTMetric(object): + def __init__(self, main_indicator, delimiter='\t', **kwargs): + self.delimiter = delimiter + self.main_indicator = main_indicator + self.reset() + + def reset(self): + self.results = [] # clear results + + def __call__(self, preds, batch, **kwargs): + # NOTE: only support bs=1 now, as the label length of different sample is Unequal + assert len( + preds) == 1, "CentripetalText test now only suuport batch_size=1." + label = batch[2] + text = batch[3] + pred = preds[0]['points'] + result = get_score_C(label, text, pred) + + self.results.append(result) + + def get_metric(self): + """ + Input format: y0,x0, ..... yn,xn. Each detection is separated by the end of line token ('\n')' + """ + metrics = combine_results(self.results, rec_flag=False) + self.reset() + return metrics diff --git a/ppocr/metrics/rec_metric.py b/ppocr/metrics/rec_metric.py index 515b9372e38a7213cde29fdc9834ed6df45a0a80..9863978116b1340fa809e8919a6a37d598d6bbdf 100644 --- a/ppocr/metrics/rec_metric.py +++ b/ppocr/metrics/rec_metric.py @@ -12,10 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -import Levenshtein +from rapidfuzz.distance import Levenshtein import string + class RecMetric(object): def __init__(self, main_indicator='acc', @@ -45,8 +46,7 @@ class RecMetric(object): if self.is_filter: pred = self._normalize_text(pred) target = self._normalize_text(target) - norm_edit_dis += Levenshtein.distance(pred, target) / max( - len(pred), len(target), 1) + norm_edit_dis += Levenshtein.normalized_distance(pred, target) if pred == target: correct_num += 1 all_num += 1 diff --git a/ppocr/metrics/sr_metric.py b/ppocr/metrics/sr_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..51c3ad66564e61abdd91432e6dc9ea1d8918583b --- /dev/null +++ b/ppocr/metrics/sr_metric.py @@ -0,0 +1,155 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
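The rec_metric.py hunk above replaces the `Levenshtein` package with `rapidfuzz.distance.Levenshtein` and lets `normalized_distance` handle the length normalisation that was previously written out by hand. A small equivalence check, assuming rapidfuzz is installed; the example strings are arbitrary:

    from rapidfuzz.distance import Levenshtein

    pred, target = "hello", "helo"
    # previous formulation: raw edit distance divided by the longer length
    manual = Levenshtein.distance(pred, target) / max(len(pred), len(target), 1)
    # formulation now used by RecMetric
    builtin = Levenshtein.normalized_distance(pred, target)
    print(manual, builtin)  # both 0.2 for this pair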
+""" +https://github.com/FudanVI/FudanOCR/blob/main/text-gestalt/utils/ssim_psnr.py +""" + +from math import exp + +import paddle +import paddle.nn.functional as F +import paddle.nn as nn +import string + + +class SSIM(nn.Layer): + def __init__(self, window_size=11, size_average=True): + super(SSIM, self).__init__() + self.window_size = window_size + self.size_average = size_average + self.channel = 1 + self.window = self.create_window(window_size, self.channel) + + def gaussian(self, window_size, sigma): + gauss = paddle.to_tensor([ + exp(-(x - window_size // 2)**2 / float(2 * sigma**2)) + for x in range(window_size) + ]) + return gauss / gauss.sum() + + def create_window(self, window_size, channel): + _1D_window = self.gaussian(window_size, 1.5).unsqueeze(1) + _2D_window = _1D_window.mm(_1D_window.t()).unsqueeze(0).unsqueeze(0) + window = _2D_window.expand([channel, 1, window_size, window_size]) + return window + + def _ssim(self, img1, img2, window, window_size, channel, + size_average=True): + mu1 = F.conv2d(img1, window, padding=window_size // 2, groups=channel) + mu2 = F.conv2d(img2, window, padding=window_size // 2, groups=channel) + + mu1_sq = mu1.pow(2) + mu2_sq = mu2.pow(2) + mu1_mu2 = mu1 * mu2 + + sigma1_sq = F.conv2d( + img1 * img1, window, padding=window_size // 2, + groups=channel) - mu1_sq + sigma2_sq = F.conv2d( + img2 * img2, window, padding=window_size // 2, + groups=channel) - mu2_sq + sigma12 = F.conv2d( + img1 * img2, window, padding=window_size // 2, + groups=channel) - mu1_mu2 + + C1 = 0.01**2 + C2 = 0.03**2 + + ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) / ( + (mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2)) + + if size_average: + return ssim_map.mean() + else: + return ssim_map.mean([1, 2, 3]) + + def ssim(self, img1, img2, window_size=11, size_average=True): + (_, channel, _, _) = img1.shape + window = self.create_window(window_size, channel) + + return self._ssim(img1, img2, window, window_size, channel, + size_average) + + def forward(self, img1, img2): + (_, channel, _, _) = img1.shape + + if channel == self.channel and self.window.dtype == img1.dtype: + window = self.window + else: + window = self.create_window(self.window_size, channel) + + self.window = window + self.channel = channel + + return self._ssim(img1, img2, window, self.window_size, channel, + self.size_average) + + +class SRMetric(object): + def __init__(self, main_indicator='all', **kwargs): + self.main_indicator = main_indicator + self.eps = 1e-5 + self.psnr_result = [] + self.ssim_result = [] + self.calculate_ssim = SSIM() + self.reset() + + def reset(self): + self.correct_num = 0 + self.all_num = 0 + self.norm_edit_dis = 0 + self.psnr_result = [] + self.ssim_result = [] + + def calculate_psnr(self, img1, img2): + # img1 and img2 have range [0, 1] + mse = ((img1 * 255 - img2 * 255)**2).mean() + if mse == 0: + return float('inf') + return 20 * paddle.log10(255.0 / paddle.sqrt(mse)) + + def _normalize_text(self, text): + text = ''.join( + filter(lambda x: x in (string.digits + string.ascii_letters), text)) + return text.lower() + + def __call__(self, pred_label, *args, **kwargs): + metric = {} + images_sr = pred_label["sr_img"] + images_hr = pred_label["hr_img"] + psnr = self.calculate_psnr(images_sr, images_hr) + ssim = self.calculate_ssim(images_sr, images_hr) + self.psnr_result.append(psnr) + self.ssim_result.append(ssim) + + def get_metric(self): + """ + return metrics { + 'acc': 0, + 'norm_edit_dis': 0, + } + """ + self.psnr_avg = sum(self.psnr_result) / 
len(self.psnr_result) + self.psnr_avg = round(self.psnr_avg.item(), 6) + self.ssim_avg = sum(self.ssim_result) / len(self.ssim_result) + self.ssim_avg = round(self.ssim_avg.item(), 6) + + self.all_avg = self.psnr_avg + self.ssim_avg + + self.reset() + return { + 'psnr_avg': self.psnr_avg, + "ssim_avg": self.ssim_avg, + "all": self.all_avg + } diff --git a/ppocr/metrics/table_metric.py b/ppocr/metrics/table_metric.py index fd2631e442b8d111c64d5cf4b34ea9063d8c60dd..c0b247efa672caacb9a9a09a8ef0da58e47367e4 100644 --- a/ppocr/metrics/table_metric.py +++ b/ppocr/metrics/table_metric.py @@ -16,9 +16,14 @@ from ppocr.metrics.det_metric import DetMetric class TableStructureMetric(object): - def __init__(self, main_indicator='acc', eps=1e-6, **kwargs): + def __init__(self, + main_indicator='acc', + eps=1e-6, + del_thead_tbody=False, + **kwargs): self.main_indicator = main_indicator self.eps = eps + self.del_thead_tbody = del_thead_tbody self.reset() def __call__(self, pred_label, batch=None, *args, **kwargs): @@ -31,6 +36,13 @@ class TableStructureMetric(object): gt_structure_batch_list): pred_str = ''.join(pred) target_str = ''.join(target) + if self.del_thead_tbody: + pred_str = pred_str.replace('', '').replace( + '', '').replace('', '').replace('', + '') + target_str = target_str.replace('', '').replace( + '', '').replace('', '').replace('', + '') if pred_str == target_str: correct_num += 1 all_num += 1 @@ -59,7 +71,8 @@ class TableMetric(object): def __init__(self, main_indicator='acc', compute_bbox_metric=False, - point_num=2, + box_format='xyxy', + del_thead_tbody=False, **kwargs): """ @@ -67,10 +80,11 @@ class TableMetric(object): @param main_matric: main_matric for save best_model @param kwargs: """ - self.structure_metric = TableStructureMetric() + self.structure_metric = TableStructureMetric( + del_thead_tbody=del_thead_tbody) self.bbox_metric = DetMetric() if compute_bbox_metric else None self.main_indicator = main_indicator - self.point_num = point_num + self.box_format = box_format self.reset() def __call__(self, pred_label, batch=None, *args, **kwargs): @@ -129,10 +143,14 @@ class TableMetric(object): self.bbox_metric.reset() def format_box(self, box): - if self.point_num == 2: + if self.box_format == 'xyxy': x1, y1, x2, y2 = box box = [[x1, y1], [x2, y1], [x2, y2], [x1, y2]] - elif self.point_num == 4: + elif self.box_format == 'xywh': + x, y, w, h = box + x1, y1, x2, y2 = x - w // 2, y - h // 2, x + w // 2, y + h // 2 + box = [[x1, y1], [x2, y1], [x2, y2], [x1, y2]] + elif self.box_format == 'xyxyxyxy': x1, y1, x2, y2, x3, y3, x4, y4 = box box = [[x1, y1], [x2, y2], [x3, y3], [x4, y4]] return box diff --git a/ppocr/modeling/architectures/base_model.py b/ppocr/modeling/architectures/base_model.py index ed2a909cb58d56ec5a67b897de1a171658228acb..5612d366ea9ccf3f45ab675fbaa374fd4fe5d773 100644 --- a/ppocr/modeling/architectures/base_model.py +++ b/ppocr/modeling/architectures/base_model.py @@ -14,6 +14,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function + from paddle import nn from ppocr.modeling.transforms import build_transform from ppocr.modeling.backbones import build_backbone @@ -46,9 +47,13 @@ class BaseModel(nn.Layer): in_channels = self.transform.out_channels # build backbone, backbone is need for del, rec and cls - config["Backbone"]['in_channels'] = in_channels - self.backbone = build_backbone(config["Backbone"], model_type) - in_channels = self.backbone.out_channels + if 'Backbone' not in config or 
config['Backbone'] is None: + self.use_backbone = False + else: + self.use_backbone = True + config["Backbone"]['in_channels'] = in_channels + self.backbone = build_backbone(config["Backbone"], model_type) + in_channels = self.backbone.out_channels # build neck # for rec, neck can be cnn,rnn or reshape(None) @@ -77,7 +82,8 @@ class BaseModel(nn.Layer): y = dict() if self.use_transform: x = self.transform(x) - x = self.backbone(x) + if self.use_backbone: + x = self.backbone(x) if isinstance(x, dict): y.update(x) else: @@ -109,4 +115,4 @@ class BaseModel(nn.Layer): else: return {final_name: x} else: - return x + return x \ No newline at end of file diff --git a/ppocr/modeling/backbones/__init__.py b/ppocr/modeling/backbones/__init__.py index d4f5b15f56d34a9f6a6501058179a643ac7e8318..6fdcc4a759e59027b1457d1e46757c64c4dcad9e 100755 --- a/ppocr/modeling/backbones/__init__.py +++ b/ppocr/modeling/backbones/__init__.py @@ -21,7 +21,10 @@ def build_backbone(config, model_type): from .det_resnet import ResNet from .det_resnet_vd import ResNet_vd from .det_resnet_vd_sast import ResNet_SAST - support_dict = ["MobileNetV3", "ResNet", "ResNet_vd", "ResNet_SAST"] + from .det_pp_lcnet import PPLCNet + support_dict = [ + "MobileNetV3", "ResNet", "ResNet_vd", "ResNet_SAST", "PPLCNet" + ] if model_type == "table": from .table_master_resnet import TableResNetExtra support_dict.append('TableResNetExtra') @@ -49,17 +52,15 @@ def build_backbone(config, model_type): support_dict = ['ResNet'] elif model_type == 'kie': from .kie_unet_sdmgr import Kie_backbone - support_dict = ['Kie_backbone'] + from .vqa_layoutlm import LayoutLMForSer, LayoutLMv2ForSer, LayoutLMv2ForRe, LayoutXLMForSer, LayoutXLMForRe + support_dict = [ + 'Kie_backbone', 'LayoutLMForSer', 'LayoutLMv2ForSer', + 'LayoutLMv2ForRe', 'LayoutXLMForSer', 'LayoutXLMForRe' + ] elif model_type == 'table': from .table_resnet_vd import ResNet from .table_mobilenet_v3 import MobileNetV3 support_dict = ['ResNet', 'MobileNetV3'] - elif model_type == 'vqa': - from .vqa_layoutlm import LayoutLMForSer, LayoutLMv2ForSer, LayoutLMv2ForRe, LayoutXLMForSer, LayoutXLMForRe - support_dict = [ - 'LayoutLMForSer', 'LayoutLMv2ForSer', 'LayoutLMv2ForRe', - 'LayoutXLMForSer', 'LayoutXLMForRe' - ] else: raise NotImplementedError diff --git a/ppocr/modeling/backbones/det_pp_lcnet.py b/ppocr/modeling/backbones/det_pp_lcnet.py new file mode 100644 index 0000000000000000000000000000000000000000..3f719e92bc67452b482e5b2053ee1a09540ffc0e --- /dev/null +++ b/ppocr/modeling/backbones/det_pp_lcnet.py @@ -0,0 +1,271 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
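det_pp_lcnet.py below registers PPLCNet as a detection backbone. Its stage widths are the base `NET_CONFIG` channel counts multiplied by `scale` and rounded by `make_divisible`; a quick worked example of that rounding (the helper mirrors the one defined below, the scale value is arbitrary):

    # Round to a multiple of 8 without dropping below ~90% of the scaled count.
    def make_divisible(v, divisor=8, min_value=None):
        if min_value is None:
            min_value = divisor
        new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
        if new_v < 0.9 * v:
            new_v += divisor
        return new_v

    print([make_divisible(c * 0.5) for c in (16, 32, 64, 128, 256, 512)])
    # -> [8, 16, 32, 64, 128, 256] for scale = 0.5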
+ +from __future__ import absolute_import, division, print_function + +import os +import paddle +import paddle.nn as nn +from paddle import ParamAttr +from paddle.nn import AdaptiveAvgPool2D, BatchNorm, Conv2D, Dropout, Linear +from paddle.regularizer import L2Decay +from paddle.nn.initializer import KaimingNormal +from paddle.utils.download import get_path_from_url + +MODEL_URLS = { + "PPLCNet_x0.25": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNet_x0_25_pretrained.pdparams", + "PPLCNet_x0.35": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNet_x0_35_pretrained.pdparams", + "PPLCNet_x0.5": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNet_x0_5_pretrained.pdparams", + "PPLCNet_x0.75": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNet_x0_75_pretrained.pdparams", + "PPLCNet_x1.0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNet_x1_0_pretrained.pdparams", + "PPLCNet_x1.5": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNet_x1_5_pretrained.pdparams", + "PPLCNet_x2.0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNet_x2_0_pretrained.pdparams", + "PPLCNet_x2.5": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNet_x2_5_pretrained.pdparams" +} + +MODEL_STAGES_PATTERN = { + "PPLCNet": ["blocks2", "blocks3", "blocks4", "blocks5", "blocks6"] +} + +__all__ = list(MODEL_URLS.keys()) + +# Each element(list) represents a depthwise block, which is composed of k, in_c, out_c, s, use_se. +# k: kernel_size +# in_c: input channel number in depthwise block +# out_c: output channel number in depthwise block +# s: stride in depthwise block +# use_se: whether to use SE block + +NET_CONFIG = { + "blocks2": + # k, in_c, out_c, s, use_se + [[3, 16, 32, 1, False]], + "blocks3": [[3, 32, 64, 2, False], [3, 64, 64, 1, False]], + "blocks4": [[3, 64, 128, 2, False], [3, 128, 128, 1, False]], + "blocks5": + [[3, 128, 256, 2, False], [5, 256, 256, 1, False], [5, 256, 256, 1, False], + [5, 256, 256, 1, False], [5, 256, 256, 1, False], [5, 256, 256, 1, False]], + "blocks6": [[5, 256, 512, 2, True], [5, 512, 512, 1, True]] +} + + +def make_divisible(v, divisor=8, min_value=None): + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + if new_v < 0.9 * v: + new_v += divisor + return new_v + + +class ConvBNLayer(nn.Layer): + def __init__(self, + num_channels, + filter_size, + num_filters, + stride, + num_groups=1): + super().__init__() + + self.conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=num_groups, + weight_attr=ParamAttr(initializer=KaimingNormal()), + bias_attr=False) + + self.bn = BatchNorm( + num_filters, + param_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + self.hardswish = nn.Hardswish() + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + x = self.hardswish(x) + return x + + +class DepthwiseSeparable(nn.Layer): + def __init__(self, + num_channels, + num_filters, + stride, + dw_size=3, + use_se=False): + super().__init__() + self.use_se = use_se + self.dw_conv = ConvBNLayer( + num_channels=num_channels, + num_filters=num_channels, + filter_size=dw_size, + stride=stride, + num_groups=num_channels) + if use_se: 
+ self.se = SEModule(num_channels) + self.pw_conv = ConvBNLayer( + num_channels=num_channels, + filter_size=1, + num_filters=num_filters, + stride=1) + + def forward(self, x): + x = self.dw_conv(x) + if self.use_se: + x = self.se(x) + x = self.pw_conv(x) + return x + + +class SEModule(nn.Layer): + def __init__(self, channel, reduction=4): + super().__init__() + self.avg_pool = AdaptiveAvgPool2D(1) + self.conv1 = Conv2D( + in_channels=channel, + out_channels=channel // reduction, + kernel_size=1, + stride=1, + padding=0) + self.relu = nn.ReLU() + self.conv2 = Conv2D( + in_channels=channel // reduction, + out_channels=channel, + kernel_size=1, + stride=1, + padding=0) + self.hardsigmoid = nn.Hardsigmoid() + + def forward(self, x): + identity = x + x = self.avg_pool(x) + x = self.conv1(x) + x = self.relu(x) + x = self.conv2(x) + x = self.hardsigmoid(x) + x = paddle.multiply(x=identity, y=x) + return x + + +class PPLCNet(nn.Layer): + def __init__(self, + in_channels=3, + scale=1.0, + pretrained=False, + use_ssld=False): + super().__init__() + self.out_channels = [ + int(NET_CONFIG["blocks3"][-1][2] * scale), + int(NET_CONFIG["blocks4"][-1][2] * scale), + int(NET_CONFIG["blocks5"][-1][2] * scale), + int(NET_CONFIG["blocks6"][-1][2] * scale) + ] + self.scale = scale + + self.conv1 = ConvBNLayer( + num_channels=in_channels, + filter_size=3, + num_filters=make_divisible(16 * scale), + stride=2) + + self.blocks2 = nn.Sequential(* [ + DepthwiseSeparable( + num_channels=make_divisible(in_c * scale), + num_filters=make_divisible(out_c * scale), + dw_size=k, + stride=s, + use_se=se) + for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks2"]) + ]) + + self.blocks3 = nn.Sequential(* [ + DepthwiseSeparable( + num_channels=make_divisible(in_c * scale), + num_filters=make_divisible(out_c * scale), + dw_size=k, + stride=s, + use_se=se) + for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks3"]) + ]) + + self.blocks4 = nn.Sequential(* [ + DepthwiseSeparable( + num_channels=make_divisible(in_c * scale), + num_filters=make_divisible(out_c * scale), + dw_size=k, + stride=s, + use_se=se) + for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks4"]) + ]) + + self.blocks5 = nn.Sequential(* [ + DepthwiseSeparable( + num_channels=make_divisible(in_c * scale), + num_filters=make_divisible(out_c * scale), + dw_size=k, + stride=s, + use_se=se) + for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks5"]) + ]) + + self.blocks6 = nn.Sequential(* [ + DepthwiseSeparable( + num_channels=make_divisible(in_c * scale), + num_filters=make_divisible(out_c * scale), + dw_size=k, + stride=s, + use_se=se) + for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks6"]) + ]) + + if pretrained: + self._load_pretrained( + MODEL_URLS['PPLCNet_x{}'.format(scale)], use_ssld=use_ssld) + + def forward(self, x): + outs = [] + x = self.conv1(x) + x = self.blocks2(x) + x = self.blocks3(x) + outs.append(x) + x = self.blocks4(x) + outs.append(x) + x = self.blocks5(x) + outs.append(x) + x = self.blocks6(x) + outs.append(x) + return outs + + def _load_pretrained(self, pretrained_url, use_ssld=False): + if use_ssld: + pretrained_url = pretrained_url.replace("_pretrained", + "_ssld_pretrained") + print(pretrained_url) + local_weight_path = get_path_from_url( + pretrained_url, os.path.expanduser("~/.paddleclas/weights")) + param_state_dict = paddle.load(local_weight_path) + self.set_dict(param_state_dict) + return diff --git a/ppocr/modeling/backbones/rec_resnet_31.py b/ppocr/modeling/backbones/rec_resnet_31.py 
index 965170138d00a53fca720b3b5f535a3dd34272d9..46dc374008b56a20dbd4be257775368e9cbbace4 100644 --- a/ppocr/modeling/backbones/rec_resnet_31.py +++ b/ppocr/modeling/backbones/rec_resnet_31.py @@ -29,27 +29,29 @@ import numpy as np __all__ = ["ResNet31"] - -def conv3x3(in_channel, out_channel, stride=1): +def conv3x3(in_channel, out_channel, stride=1, conv_weight_attr=None): return nn.Conv2D( in_channel, out_channel, kernel_size=3, stride=stride, padding=1, + weight_attr=conv_weight_attr, bias_attr=False) class BasicBlock(nn.Layer): expansion = 1 - def __init__(self, in_channels, channels, stride=1, downsample=False): + def __init__(self, in_channels, channels, stride=1, downsample=False, conv_weight_attr=None, bn_weight_attr=None): super().__init__() - self.conv1 = conv3x3(in_channels, channels, stride) - self.bn1 = nn.BatchNorm2D(channels) + self.conv1 = conv3x3(in_channels, channels, stride, + conv_weight_attr=conv_weight_attr) + self.bn1 = nn.BatchNorm2D(channels, weight_attr=bn_weight_attr) self.relu = nn.ReLU() - self.conv2 = conv3x3(channels, channels) - self.bn2 = nn.BatchNorm2D(channels) + self.conv2 = conv3x3(channels, channels, + conv_weight_attr=conv_weight_attr) + self.bn2 = nn.BatchNorm2D(channels, weight_attr=bn_weight_attr) self.downsample = downsample if downsample: self.downsample = nn.Sequential( @@ -58,8 +60,9 @@ class BasicBlock(nn.Layer): channels * self.expansion, 1, stride, + weight_attr=conv_weight_attr, bias_attr=False), - nn.BatchNorm2D(channels * self.expansion), ) + nn.BatchNorm2D(channels * self.expansion, weight_attr=bn_weight_attr)) else: self.downsample = nn.Sequential() self.stride = stride @@ -91,6 +94,7 @@ class ResNet31(nn.Layer): channels (list[int]): List of out_channels of Conv2d layer. out_indices (None | Sequence[int]): Indices of output stages. last_stage_pool (bool): If True, add `MaxPool2d` layer to last stage. + init_type (None | str): the config to control the initialization. 
''' def __init__(self, @@ -98,7 +102,8 @@ class ResNet31(nn.Layer): layers=[1, 2, 5, 3], channels=[64, 128, 256, 256, 512, 512, 512], out_indices=None, - last_stage_pool=False): + last_stage_pool=False, + init_type=None): super(ResNet31, self).__init__() assert isinstance(in_channels, int) assert isinstance(last_stage_pool, bool) @@ -106,42 +111,55 @@ class ResNet31(nn.Layer): self.out_indices = out_indices self.last_stage_pool = last_stage_pool + conv_weight_attr = None + bn_weight_attr = None + + if init_type is not None: + support_dict = ['KaimingNormal'] + assert init_type in support_dict, Exception( + "resnet31 only support {}".format(support_dict)) + conv_weight_attr = nn.initializer.KaimingNormal() + bn_weight_attr = ParamAttr(initializer=nn.initializer.Uniform(), learning_rate=1) + # conv 1 (Conv Conv) self.conv1_1 = nn.Conv2D( - in_channels, channels[0], kernel_size=3, stride=1, padding=1) - self.bn1_1 = nn.BatchNorm2D(channels[0]) + in_channels, channels[0], kernel_size=3, stride=1, padding=1, weight_attr=conv_weight_attr) + self.bn1_1 = nn.BatchNorm2D(channels[0], weight_attr=bn_weight_attr) self.relu1_1 = nn.ReLU() self.conv1_2 = nn.Conv2D( - channels[0], channels[1], kernel_size=3, stride=1, padding=1) - self.bn1_2 = nn.BatchNorm2D(channels[1]) + channels[0], channels[1], kernel_size=3, stride=1, padding=1, weight_attr=conv_weight_attr) + self.bn1_2 = nn.BatchNorm2D(channels[1], weight_attr=bn_weight_attr) self.relu1_2 = nn.ReLU() # conv 2 (Max-pooling, Residual block, Conv) self.pool2 = nn.MaxPool2D( kernel_size=2, stride=2, padding=0, ceil_mode=True) - self.block2 = self._make_layer(channels[1], channels[2], layers[0]) + self.block2 = self._make_layer(channels[1], channels[2], layers[0], + conv_weight_attr=conv_weight_attr, bn_weight_attr=bn_weight_attr) self.conv2 = nn.Conv2D( - channels[2], channels[2], kernel_size=3, stride=1, padding=1) - self.bn2 = nn.BatchNorm2D(channels[2]) + channels[2], channels[2], kernel_size=3, stride=1, padding=1, weight_attr=conv_weight_attr) + self.bn2 = nn.BatchNorm2D(channels[2], weight_attr=bn_weight_attr) self.relu2 = nn.ReLU() # conv 3 (Max-pooling, Residual block, Conv) self.pool3 = nn.MaxPool2D( kernel_size=2, stride=2, padding=0, ceil_mode=True) - self.block3 = self._make_layer(channels[2], channels[3], layers[1]) + self.block3 = self._make_layer(channels[2], channels[3], layers[1], + conv_weight_attr=conv_weight_attr, bn_weight_attr=bn_weight_attr) self.conv3 = nn.Conv2D( - channels[3], channels[3], kernel_size=3, stride=1, padding=1) - self.bn3 = nn.BatchNorm2D(channels[3]) + channels[3], channels[3], kernel_size=3, stride=1, padding=1, weight_attr=conv_weight_attr) + self.bn3 = nn.BatchNorm2D(channels[3], weight_attr=bn_weight_attr) self.relu3 = nn.ReLU() # conv 4 (Max-pooling, Residual block, Conv) self.pool4 = nn.MaxPool2D( kernel_size=(2, 1), stride=(2, 1), padding=0, ceil_mode=True) - self.block4 = self._make_layer(channels[3], channels[4], layers[2]) + self.block4 = self._make_layer(channels[3], channels[4], layers[2], + conv_weight_attr=conv_weight_attr, bn_weight_attr=bn_weight_attr) self.conv4 = nn.Conv2D( - channels[4], channels[4], kernel_size=3, stride=1, padding=1) - self.bn4 = nn.BatchNorm2D(channels[4]) + channels[4], channels[4], kernel_size=3, stride=1, padding=1, weight_attr=conv_weight_attr) + self.bn4 = nn.BatchNorm2D(channels[4], weight_attr=bn_weight_attr) self.relu4 = nn.ReLU() # conv 5 ((Max-pooling), Residual block, Conv) @@ -149,15 +167,16 @@ class ResNet31(nn.Layer): if self.last_stage_pool: self.pool5 = 
nn.MaxPool2D( kernel_size=2, stride=2, padding=0, ceil_mode=True) - self.block5 = self._make_layer(channels[4], channels[5], layers[3]) + self.block5 = self._make_layer(channels[4], channels[5], layers[3], + conv_weight_attr=conv_weight_attr, bn_weight_attr=bn_weight_attr) self.conv5 = nn.Conv2D( - channels[5], channels[5], kernel_size=3, stride=1, padding=1) - self.bn5 = nn.BatchNorm2D(channels[5]) + channels[5], channels[5], kernel_size=3, stride=1, padding=1, weight_attr=conv_weight_attr) + self.bn5 = nn.BatchNorm2D(channels[5], weight_attr=bn_weight_attr) self.relu5 = nn.ReLU() self.out_channels = channels[-1] - def _make_layer(self, input_channels, output_channels, blocks): + def _make_layer(self, input_channels, output_channels, blocks, conv_weight_attr=None, bn_weight_attr=None): layers = [] for _ in range(blocks): downsample = None @@ -168,12 +187,14 @@ class ResNet31(nn.Layer): output_channels, kernel_size=1, stride=1, + weight_attr=conv_weight_attr, bias_attr=False), - nn.BatchNorm2D(output_channels), ) + nn.BatchNorm2D(output_channels, weight_attr=bn_weight_attr)) layers.append( BasicBlock( - input_channels, output_channels, downsample=downsample)) + input_channels, output_channels, downsample=downsample, + conv_weight_attr=conv_weight_attr, bn_weight_attr=bn_weight_attr)) input_channels = output_channels return nn.Sequential(*layers) diff --git a/ppocr/modeling/backbones/vqa_layoutlm.py b/ppocr/modeling/backbones/vqa_layoutlm.py index d4ced350885bd54e6c6065cb0f21c45780c136b0..8e10ed7b48e9aff344b71e5a04970d1a5dab8a71 100644 --- a/ppocr/modeling/backbones/vqa_layoutlm.py +++ b/ppocr/modeling/backbones/vqa_layoutlm.py @@ -29,14 +29,14 @@ __all__ = ["LayoutXLMForSer", "LayoutLMForSer"] pretrained_model_dict = { LayoutXLMModel: { "base": "layoutxlm-base-uncased", - "vi": "layoutxlm-wo-backbone-base-uncased", + "vi": "vi-layoutxlm-base-uncased", }, LayoutLMModel: { "base": "layoutlm-base-uncased", }, LayoutLMv2Model: { "base": "layoutlmv2-base-uncased", - "vi": "layoutlmv2-wo-backbone-base-uncased", + "vi": "vi-layoutlmv2-base-uncased", }, } @@ -113,7 +113,6 @@ class LayoutLMv2ForSer(NLPBaseModel): pretrained, checkpoints, num_classes=num_classes) - self.use_visual_backbone = True if hasattr(self.model.layoutlmv2, "use_visual_backbone" ) and self.model.layoutlmv2.use_visual_backbone is False: self.use_visual_backbone = False @@ -155,7 +154,9 @@ class LayoutXLMForSer(NLPBaseModel): pretrained, checkpoints, num_classes=num_classes) - self.use_visual_backbone = True + if hasattr(self.model.layoutxlm, "use_visual_backbone" + ) and self.model.layoutxlm.use_visual_backbone is False: + self.use_visual_backbone = False def forward(self, x): if self.use_visual_backbone is True: @@ -185,6 +186,9 @@ class LayoutLMv2ForRe(NLPBaseModel): super(LayoutLMv2ForRe, self).__init__( LayoutLMv2Model, LayoutLMv2ForRelationExtraction, mode, "re", pretrained, checkpoints) + if hasattr(self.model.layoutlmv2, "use_visual_backbone" + ) and self.model.layoutlmv2.use_visual_backbone is False: + self.use_visual_backbone = False def forward(self, x): x = self.model( @@ -207,7 +211,6 @@ class LayoutXLMForRe(NLPBaseModel): super(LayoutXLMForRe, self).__init__( LayoutXLMModel, LayoutXLMForRelationExtraction, mode, "re", pretrained, checkpoints) - self.use_visual_backbone = True if hasattr(self.model.layoutxlm, "use_visual_backbone" ) and self.model.layoutxlm.use_visual_backbone is False: self.use_visual_backbone = False diff --git a/ppocr/modeling/heads/__init__.py b/ppocr/modeling/heads/__init__.py index 
b465cedba3b22e2d24f54b8617cc4071316cb676..751757e5f176119688e2db47a68c514850b91823 100755 --- a/ppocr/modeling/heads/__init__.py +++ b/ppocr/modeling/heads/__init__.py @@ -23,6 +23,7 @@ def build_head(config): from .det_pse_head import PSEHead from .det_fce_head import FCEHead from .e2e_pg_head import PGHead + from .det_ct_head import CT_Head # rec head from .rec_ctc_head import CTCHead @@ -35,6 +36,7 @@ def build_head(config): from .rec_multi_head import MultiHead from .rec_spin_att_head import SPINAttentionHead from .rec_abinet_head import ABINetHead + from .rec_robustscanner_head import RobustScannerHead from .rec_visionlan_head import VLHead # cls head @@ -43,7 +45,7 @@ def build_head(config): #kie head from .kie_sdmgr_head import SDMGRHead - from .table_att_head import TableAttentionHead + from .table_att_head import TableAttentionHead, SLAHead from .table_master_head import TableMasterHead support_dict = [ @@ -51,7 +53,7 @@ def build_head(config): 'ClsHead', 'AttentionHead', 'SRNHead', 'PGHead', 'Transformer', 'TableAttentionHead', 'SARHead', 'AsterHead', 'SDMGRHead', 'PRENHead', 'MultiHead', 'ABINetHead', 'TableMasterHead', 'SPINAttentionHead', - 'VLHead' + 'VLHead', 'SLAHead', 'RobustScannerHead', 'CT_Head' ] #table head diff --git a/ppocr/modeling/heads/det_ct_head.py b/ppocr/modeling/heads/det_ct_head.py new file mode 100644 index 0000000000000000000000000000000000000000..08e6719e8f0ade6887eb4ad7f44a2bc36ec132db --- /dev/null +++ b/ppocr/modeling/heads/det_ct_head.py @@ -0,0 +1,69 @@ +# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import paddle +from paddle import nn +import paddle.nn.functional as F +from paddle import ParamAttr + +import math +from paddle.nn.initializer import TruncatedNormal, Constant, Normal +ones_ = Constant(value=1.) +zeros_ = Constant(value=0.) + + +class CT_Head(nn.Layer): + def __init__(self, + in_channels, + hidden_dim, + num_classes, + loss_kernel=None, + loss_loc=None): + super(CT_Head, self).__init__() + self.conv1 = nn.Conv2D( + in_channels, hidden_dim, kernel_size=3, stride=1, padding=1) + self.bn1 = nn.BatchNorm2D(hidden_dim) + self.relu1 = nn.ReLU() + + self.conv2 = nn.Conv2D( + hidden_dim, num_classes, kernel_size=1, stride=1, padding=0) + + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + n = m._kernel_size[0] * m._kernel_size[1] * m._out_channels + normal_ = Normal(mean=0.0, std=math.sqrt(2. 
/ n)) + normal_(m.weight) + elif isinstance(m, nn.BatchNorm2D): + zeros_(m.bias) + ones_(m.weight) + + def _upsample(self, x, scale=1): + return F.upsample(x, scale_factor=scale, mode='bilinear') + + def forward(self, f, targets=None): + out = self.conv1(f) + out = self.relu1(self.bn1(out)) + out = self.conv2(out) + + if self.training: + out = self._upsample(out, scale=4) + return {'maps': out} + else: + score = F.sigmoid(out[:, 0, :, :]) + return {'maps': out, 'score': score} diff --git a/ppocr/modeling/heads/e2e_pg_head.py b/ppocr/modeling/heads/e2e_pg_head.py index 274e1cdac5172f45590c9f7d7b50522c74db6750..514962ef97e503d331b6351c6d314070dfd8b15f 100644 --- a/ppocr/modeling/heads/e2e_pg_head.py +++ b/ppocr/modeling/heads/e2e_pg_head.py @@ -66,8 +66,17 @@ class PGHead(nn.Layer): """ """ - def __init__(self, in_channels, **kwargs): + def __init__(self, + in_channels, + character_dict_path='ppocr/utils/ic15_dict.txt', + **kwargs): super(PGHead, self).__init__() + + # get character_length + with open(character_dict_path, "rb") as fin: + lines = fin.readlines() + character_length = len(lines) + 1 + self.conv_f_score1 = ConvBNLayer( in_channels=in_channels, out_channels=64, @@ -178,7 +187,7 @@ class PGHead(nn.Layer): name="conv_f_char{}".format(5)) self.conv3 = nn.Conv2D( in_channels=256, - out_channels=37, + out_channels=character_length, kernel_size=3, stride=1, padding=1, diff --git a/ppocr/modeling/heads/rec_robustscanner_head.py b/ppocr/modeling/heads/rec_robustscanner_head.py new file mode 100644 index 0000000000000000000000000000000000000000..7956059ecfe01f27db364d3d748d6af24dad0aac --- /dev/null +++ b/ppocr/modeling/heads/rec_robustscanner_head.py @@ -0,0 +1,709 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This code is refer from: +https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/textrecog/encoders/channel_reduction_encoder.py +https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/textrecog/decoders/robust_scanner_decoder.py +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F + +class BaseDecoder(nn.Layer): + def __init__(self, **kwargs): + super().__init__() + + def forward_train(self, feat, out_enc, targets, img_metas): + raise NotImplementedError + + def forward_test(self, feat, out_enc, img_metas): + raise NotImplementedError + + def forward(self, + feat, + out_enc, + label=None, + valid_ratios=None, + word_positions=None, + train_mode=True): + self.train_mode = train_mode + + if train_mode: + return self.forward_train(feat, out_enc, label, valid_ratios, word_positions) + return self.forward_test(feat, out_enc, valid_ratios, word_positions) + +class ChannelReductionEncoder(nn.Layer): + """Change the channel number with a one by one convoluational layer. + + Args: + in_channels (int): Number of input channels. 
+ out_channels (int): Number of output channels. + """ + + def __init__(self, + in_channels, + out_channels, + **kwargs): + super(ChannelReductionEncoder, self).__init__() + + self.layer = nn.Conv2D( + in_channels, out_channels, kernel_size=1, stride=1, padding=0, weight_attr=nn.initializer.XavierNormal()) + + def forward(self, feat): + """ + Args: + feat (Tensor): Image features with the shape of + :math:`(N, C_{in}, H, W)`. + + Returns: + Tensor: A tensor of shape :math:`(N, C_{out}, H, W)`. + """ + return self.layer(feat) + + +def masked_fill(x, mask, value): + y = paddle.full(x.shape, value, x.dtype) + return paddle.where(mask, y, x) + +class DotProductAttentionLayer(nn.Layer): + + def __init__(self, dim_model=None): + super().__init__() + + self.scale = dim_model**-0.5 if dim_model is not None else 1. + + def forward(self, query, key, value, h, w, valid_ratios=None): + query = paddle.transpose(query, (0, 2, 1)) + logits = paddle.matmul(query, key) * self.scale + n, c, t = logits.shape + # reshape to (n, c, h, w) + logits = paddle.reshape(logits, [n, c, h, w]) + if valid_ratios is not None: + # cal mask of attention weight + for i, valid_ratio in enumerate(valid_ratios): + valid_width = min(w, int(w * valid_ratio + 0.5)) + if valid_width < w: + logits[i, :, :, valid_width:] = float('-inf') + + # reshape to (n, c, h, w) + logits = paddle.reshape(logits, [n, c, t]) + weights = F.softmax(logits, axis=2) + value = paddle.transpose(value, (0, 2, 1)) + glimpse = paddle.matmul(weights, value) + glimpse = paddle.transpose(glimpse, (0, 2, 1)) + return glimpse + +class SequenceAttentionDecoder(BaseDecoder): + """Sequence attention decoder for RobustScanner. + + RobustScanner: `RobustScanner: Dynamically Enhancing Positional Clues for + Robust Text Recognition `_ + + Args: + num_classes (int): Number of output classes :math:`C`. + rnn_layers (int): Number of RNN layers. + dim_input (int): Dimension :math:`D_i` of input vector ``feat``. + dim_model (int): Dimension :math:`D_m` of the model. Should also be the + same as encoder output vector ``out_enc``. + max_seq_len (int): Maximum output sequence length :math:`T`. + start_idx (int): The index of ``. + mask (bool): Whether to mask input features according to + ``img_meta['valid_ratio']``. + padding_idx (int): The index of ``. + dropout (float): Dropout rate. + return_feature (bool): Return feature or logits as the result. + encode_value (bool): Whether to use the output of encoder ``out_enc`` + as `value` of attention layer. If False, the original feature + ``feat`` will be used. + + Warning: + This decoder will not predict the final class which is assumed to be + ``. Therefore, its output size is always :math:`C - 1`. `` + is also ignored by loss as specified in + :obj:`mmocr.models.textrecog.recognizer.EncodeDecodeRecognizer`. 
+ """ + + def __init__(self, + num_classes=None, + rnn_layers=2, + dim_input=512, + dim_model=128, + max_seq_len=40, + start_idx=0, + mask=True, + padding_idx=None, + dropout=0, + return_feature=False, + encode_value=False): + super().__init__() + + self.num_classes = num_classes + self.dim_input = dim_input + self.dim_model = dim_model + self.return_feature = return_feature + self.encode_value = encode_value + self.max_seq_len = max_seq_len + self.start_idx = start_idx + self.mask = mask + + self.embedding = nn.Embedding( + self.num_classes, self.dim_model, padding_idx=padding_idx) + + self.sequence_layer = nn.LSTM( + input_size=dim_model, + hidden_size=dim_model, + num_layers=rnn_layers, + time_major=False, + dropout=dropout) + + self.attention_layer = DotProductAttentionLayer() + + self.prediction = None + if not self.return_feature: + pred_num_classes = num_classes - 1 + self.prediction = nn.Linear( + dim_model if encode_value else dim_input, pred_num_classes) + + def forward_train(self, feat, out_enc, targets, valid_ratios): + """ + Args: + feat (Tensor): Tensor of shape :math:`(N, D_i, H, W)`. + out_enc (Tensor): Encoder output of shape + :math:`(N, D_m, H, W)`. + targets (Tensor): a tensor of shape :math:`(N, T)`. Each element is the index of a + character. + valid_ratios (Tensor): valid length ratio of img. + Returns: + Tensor: A raw logit tensor of shape :math:`(N, T, C-1)` if + ``return_feature=False``. Otherwise it would be the hidden feature + before the prediction projection layer, whose shape is + :math:`(N, T, D_m)`. + """ + + tgt_embedding = self.embedding(targets) + + n, c_enc, h, w = out_enc.shape + assert c_enc == self.dim_model + _, c_feat, _, _ = feat.shape + assert c_feat == self.dim_input + _, len_q, c_q = tgt_embedding.shape + assert c_q == self.dim_model + assert len_q <= self.max_seq_len + + query, _ = self.sequence_layer(tgt_embedding) + query = paddle.transpose(query, (0, 2, 1)) + key = paddle.reshape(out_enc, [n, c_enc, h * w]) + if self.encode_value: + value = key + else: + value = paddle.reshape(feat, [n, c_feat, h * w]) + + attn_out = self.attention_layer(query, key, value, h, w, valid_ratios) + attn_out = paddle.transpose(attn_out, (0, 2, 1)) + + if self.return_feature: + return attn_out + + out = self.prediction(attn_out) + + return out + + def forward_test(self, feat, out_enc, valid_ratios): + """ + Args: + feat (Tensor): Tensor of shape :math:`(N, D_i, H, W)`. + out_enc (Tensor): Encoder output of shape + :math:`(N, D_m, H, W)`. + valid_ratios (Tensor): valid length ratio of img. + + Returns: + Tensor: The output logit sequence tensor of shape + :math:`(N, T, C-1)`. + """ + seq_len = self.max_seq_len + batch_size = feat.shape[0] + + decode_sequence = (paddle.ones((batch_size, seq_len), dtype='int64') * self.start_idx) + + outputs = [] + for i in range(seq_len): + step_out = self.forward_test_step(feat, out_enc, decode_sequence, + i, valid_ratios) + outputs.append(step_out) + max_idx = paddle.argmax(step_out, axis=1, keepdim=False) + if i < seq_len - 1: + decode_sequence[:, i + 1] = max_idx + + outputs = paddle.stack(outputs, 1) + + return outputs + + def forward_test_step(self, feat, out_enc, decode_sequence, current_step, + valid_ratios): + """ + Args: + feat (Tensor): Tensor of shape :math:`(N, D_i, H, W)`. + out_enc (Tensor): Encoder output of shape + :math:`(N, D_m, H, W)`. + decode_sequence (Tensor): Shape :math:`(N, T)`. The tensor that + stores history decoding result. + current_step (int): Current decoding step. 
+ valid_ratios (Tensor): valid length ratio of img + + Returns: + Tensor: Shape :math:`(N, C-1)`. The logit tensor of predicted + tokens at current time step. + """ + + embed = self.embedding(decode_sequence) + + n, c_enc, h, w = out_enc.shape + assert c_enc == self.dim_model + _, c_feat, _, _ = feat.shape + assert c_feat == self.dim_input + _, _, c_q = embed.shape + assert c_q == self.dim_model + + query, _ = self.sequence_layer(embed) + query = paddle.transpose(query, (0, 2, 1)) + key = paddle.reshape(out_enc, [n, c_enc, h * w]) + if self.encode_value: + value = key + else: + value = paddle.reshape(feat, [n, c_feat, h * w]) + + # [n, c, l] + attn_out = self.attention_layer(query, key, value, h, w, valid_ratios) + out = attn_out[:, :, current_step] + + if self.return_feature: + return out + + out = self.prediction(out) + out = F.softmax(out, dim=-1) + + return out + + +class PositionAwareLayer(nn.Layer): + + def __init__(self, dim_model, rnn_layers=2): + super().__init__() + + self.dim_model = dim_model + + self.rnn = nn.LSTM( + input_size=dim_model, + hidden_size=dim_model, + num_layers=rnn_layers, + time_major=False) + + self.mixer = nn.Sequential( + nn.Conv2D( + dim_model, dim_model, kernel_size=3, stride=1, padding=1), + nn.ReLU(), + nn.Conv2D( + dim_model, dim_model, kernel_size=3, stride=1, padding=1)) + + def forward(self, img_feature): + n, c, h, w = img_feature.shape + rnn_input = paddle.transpose(img_feature, (0, 2, 3, 1)) + rnn_input = paddle.reshape(rnn_input, (n * h, w, c)) + rnn_output, _ = self.rnn(rnn_input) + rnn_output = paddle.reshape(rnn_output, (n, h, w, c)) + rnn_output = paddle.transpose(rnn_output, (0, 3, 1, 2)) + out = self.mixer(rnn_output) + return out + + +class PositionAttentionDecoder(BaseDecoder): + """Position attention decoder for RobustScanner. + + RobustScanner: `RobustScanner: Dynamically Enhancing Positional Clues for + Robust Text Recognition `_ + + Args: + num_classes (int): Number of output classes :math:`C`. + rnn_layers (int): Number of RNN layers. + dim_input (int): Dimension :math:`D_i` of input vector ``feat``. + dim_model (int): Dimension :math:`D_m` of the model. Should also be the + same as encoder output vector ``out_enc``. + max_seq_len (int): Maximum output sequence length :math:`T`. + mask (bool): Whether to mask input features according to + ``img_meta['valid_ratio']``. + return_feature (bool): Return feature or logits as the result. + encode_value (bool): Whether to use the output of encoder ``out_enc`` + as `value` of attention layer. If False, the original feature + ``feat`` will be used. + + Warning: + This decoder will not predict the final class which is assumed to be + ``. Therefore, its output size is always :math:`C - 1`. 
`` + is also ignored by loss + + """ + + def __init__(self, + num_classes=None, + rnn_layers=2, + dim_input=512, + dim_model=128, + max_seq_len=40, + mask=True, + return_feature=False, + encode_value=False): + super().__init__() + + self.num_classes = num_classes + self.dim_input = dim_input + self.dim_model = dim_model + self.max_seq_len = max_seq_len + self.return_feature = return_feature + self.encode_value = encode_value + self.mask = mask + + self.embedding = nn.Embedding(self.max_seq_len + 1, self.dim_model) + + self.position_aware_module = PositionAwareLayer( + self.dim_model, rnn_layers) + + self.attention_layer = DotProductAttentionLayer() + + self.prediction = None + if not self.return_feature: + pred_num_classes = num_classes - 1 + self.prediction = nn.Linear( + dim_model if encode_value else dim_input, pred_num_classes) + + def _get_position_index(self, length, batch_size): + position_index_list = [] + for i in range(batch_size): + position_index = paddle.arange(0, end=length, step=1, dtype='int64') + position_index_list.append(position_index) + batch_position_index = paddle.stack(position_index_list, axis=0) + return batch_position_index + + def forward_train(self, feat, out_enc, targets, valid_ratios, position_index): + """ + Args: + feat (Tensor): Tensor of shape :math:`(N, D_i, H, W)`. + out_enc (Tensor): Encoder output of shape + :math:`(N, D_m, H, W)`. + targets (dict): A dict with the key ``padded_targets``, a + tensor of shape :math:`(N, T)`. Each element is the index of a + character. + valid_ratios (Tensor): valid length ratio of img. + position_index (Tensor): The position of each word. + + Returns: + Tensor: A raw logit tensor of shape :math:`(N, T, C-1)` if + ``return_feature=False``. Otherwise it will be the hidden feature + before the prediction projection layer, whose shape is + :math:`(N, T, D_m)`. + """ + n, c_enc, h, w = out_enc.shape + assert c_enc == self.dim_model + _, c_feat, _, _ = feat.shape + assert c_feat == self.dim_input + _, len_q = targets.shape + assert len_q <= self.max_seq_len + + position_out_enc = self.position_aware_module(out_enc) + + query = self.embedding(position_index) + query = paddle.transpose(query, (0, 2, 1)) + key = paddle.reshape(position_out_enc, (n, c_enc, h * w)) + if self.encode_value: + value = paddle.reshape(out_enc,(n, c_enc, h * w)) + else: + value = paddle.reshape(feat,(n, c_feat, h * w)) + + attn_out = self.attention_layer(query, key, value, h, w, valid_ratios) + attn_out = paddle.transpose(attn_out, (0, 2, 1)) # [n, len_q, dim_v] + + if self.return_feature: + return attn_out + + return self.prediction(attn_out) + + def forward_test(self, feat, out_enc, valid_ratios, position_index): + """ + Args: + feat (Tensor): Tensor of shape :math:`(N, D_i, H, W)`. + out_enc (Tensor): Encoder output of shape + :math:`(N, D_m, H, W)`. + valid_ratios (Tensor): valid length ratio of img + position_index (Tensor): The position of each word. + + Returns: + Tensor: A raw logit tensor of shape :math:`(N, T, C-1)` if + ``return_feature=False``. Otherwise it would be the hidden feature + before the prediction projection layer, whose shape is + :math:`(N, T, D_m)`. 
+ """ + n, c_enc, h, w = out_enc.shape + assert c_enc == self.dim_model + _, c_feat, _, _ = feat.shape + assert c_feat == self.dim_input + + position_out_enc = self.position_aware_module(out_enc) + + query = self.embedding(position_index) + query = paddle.transpose(query, (0, 2, 1)) + key = paddle.reshape(position_out_enc, (n, c_enc, h * w)) + if self.encode_value: + value = paddle.reshape(out_enc,(n, c_enc, h * w)) + else: + value = paddle.reshape(feat,(n, c_feat, h * w)) + + attn_out = self.attention_layer(query, key, value, h, w, valid_ratios) + attn_out = paddle.transpose(attn_out, (0, 2, 1)) # [n, len_q, dim_v] + + if self.return_feature: + return attn_out + + return self.prediction(attn_out) + +class RobustScannerFusionLayer(nn.Layer): + + def __init__(self, dim_model, dim=-1): + super(RobustScannerFusionLayer, self).__init__() + + self.dim_model = dim_model + self.dim = dim + self.linear_layer = nn.Linear(dim_model * 2, dim_model * 2) + + def forward(self, x0, x1): + assert x0.shape == x1.shape + fusion_input = paddle.concat([x0, x1], self.dim) + output = self.linear_layer(fusion_input) + output = F.glu(output, self.dim) + return output + +class RobustScannerDecoder(BaseDecoder): + """Decoder for RobustScanner. + + RobustScanner: `RobustScanner: Dynamically Enhancing Positional Clues for + Robust Text Recognition `_ + + Args: + num_classes (int): Number of output classes :math:`C`. + dim_input (int): Dimension :math:`D_i` of input vector ``feat``. + dim_model (int): Dimension :math:`D_m` of the model. Should also be the + same as encoder output vector ``out_enc``. + max_seq_len (int): Maximum output sequence length :math:`T`. + start_idx (int): The index of ``. + mask (bool): Whether to mask input features according to + ``img_meta['valid_ratio']``. + padding_idx (int): The index of ``. + encode_value (bool): Whether to use the output of encoder ``out_enc`` + as `value` of attention layer. If False, the original feature + ``feat`` will be used. + + Warning: + This decoder will not predict the final class which is assumed to be + ``. Therefore, its output size is always :math:`C - 1`. `` + is also ignored by loss as specified in + :obj:`mmocr.models.textrecog.recognizer.EncodeDecodeRecognizer`. 
+ """ + + def __init__(self, + num_classes=None, + dim_input=512, + dim_model=128, + hybrid_decoder_rnn_layers=2, + hybrid_decoder_dropout=0, + position_decoder_rnn_layers=2, + max_seq_len=40, + start_idx=0, + mask=True, + padding_idx=None, + encode_value=False): + super().__init__() + self.num_classes = num_classes + self.dim_input = dim_input + self.dim_model = dim_model + self.max_seq_len = max_seq_len + self.encode_value = encode_value + self.start_idx = start_idx + self.padding_idx = padding_idx + self.mask = mask + + # init hybrid decoder + self.hybrid_decoder = SequenceAttentionDecoder( + num_classes=num_classes, + rnn_layers=hybrid_decoder_rnn_layers, + dim_input=dim_input, + dim_model=dim_model, + max_seq_len=max_seq_len, + start_idx=start_idx, + mask=mask, + padding_idx=padding_idx, + dropout=hybrid_decoder_dropout, + encode_value=encode_value, + return_feature=True + ) + + # init position decoder + self.position_decoder = PositionAttentionDecoder( + num_classes=num_classes, + rnn_layers=position_decoder_rnn_layers, + dim_input=dim_input, + dim_model=dim_model, + max_seq_len=max_seq_len, + mask=mask, + encode_value=encode_value, + return_feature=True + ) + + + self.fusion_module = RobustScannerFusionLayer( + self.dim_model if encode_value else dim_input) + + pred_num_classes = num_classes - 1 + self.prediction = nn.Linear(dim_model if encode_value else dim_input, + pred_num_classes) + + def forward_train(self, feat, out_enc, target, valid_ratios, word_positions): + """ + Args: + feat (Tensor): Tensor of shape :math:`(N, D_i, H, W)`. + out_enc (Tensor): Encoder output of shape + :math:`(N, D_m, H, W)`. + target (dict): A dict with the key ``padded_targets``, a + tensor of shape :math:`(N, T)`. Each element is the index of a + character. + valid_ratios (Tensor): + word_positions (Tensor): The position of each word. + + Returns: + Tensor: A raw logit tensor of shape :math:`(N, T, C-1)`. + """ + hybrid_glimpse = self.hybrid_decoder.forward_train( + feat, out_enc, target, valid_ratios) + position_glimpse = self.position_decoder.forward_train( + feat, out_enc, target, valid_ratios, word_positions) + + fusion_out = self.fusion_module(hybrid_glimpse, position_glimpse) + + out = self.prediction(fusion_out) + + return out + + def forward_test(self, feat, out_enc, valid_ratios, word_positions): + """ + Args: + feat (Tensor): Tensor of shape :math:`(N, D_i, H, W)`. + out_enc (Tensor): Encoder output of shape + :math:`(N, D_m, H, W)`. + valid_ratios (Tensor): + word_positions (Tensor): The position of each word. + Returns: + Tensor: The output logit sequence tensor of shape + :math:`(N, T, C-1)`. 
+ """ + seq_len = self.max_seq_len + batch_size = feat.shape[0] + + decode_sequence = (paddle.ones((batch_size, seq_len), dtype='int64') * self.start_idx) + + position_glimpse = self.position_decoder.forward_test( + feat, out_enc, valid_ratios, word_positions) + + outputs = [] + for i in range(seq_len): + hybrid_glimpse_step = self.hybrid_decoder.forward_test_step( + feat, out_enc, decode_sequence, i, valid_ratios) + + fusion_out = self.fusion_module(hybrid_glimpse_step, + position_glimpse[:, i, :]) + + char_out = self.prediction(fusion_out) + char_out = F.softmax(char_out, -1) + outputs.append(char_out) + max_idx = paddle.argmax(char_out, axis=1, keepdim=False) + if i < seq_len - 1: + decode_sequence[:, i + 1] = max_idx + + outputs = paddle.stack(outputs, 1) + + return outputs + +class RobustScannerHead(nn.Layer): + def __init__(self, + out_channels, # 90 + unknown + start + padding + in_channels, + enc_outchannles=128, + hybrid_dec_rnn_layers=2, + hybrid_dec_dropout=0, + position_dec_rnn_layers=2, + start_idx=0, + max_text_length=40, + mask=True, + padding_idx=None, + encode_value=False, + **kwargs): + super(RobustScannerHead, self).__init__() + + # encoder module + self.encoder = ChannelReductionEncoder( + in_channels=in_channels, out_channels=enc_outchannles) + + # decoder module + self.decoder =RobustScannerDecoder( + num_classes=out_channels, + dim_input=in_channels, + dim_model=enc_outchannles, + hybrid_decoder_rnn_layers=hybrid_dec_rnn_layers, + hybrid_decoder_dropout=hybrid_dec_dropout, + position_decoder_rnn_layers=position_dec_rnn_layers, + max_seq_len=max_text_length, + start_idx=start_idx, + mask=mask, + padding_idx=padding_idx, + encode_value=encode_value) + + def forward(self, inputs, targets=None): + ''' + targets: [label, valid_ratio, word_positions] + ''' + out_enc = self.encoder(inputs) + valid_ratios = None + word_positions = targets[-1] + + if len(targets) > 1: + valid_ratios = targets[-2] + + if self.training: + label = targets[0] # label + label = paddle.to_tensor(label, dtype='int64') + final_out = self.decoder( + inputs, out_enc, label, valid_ratios, word_positions) + if not self.training: + final_out = self.decoder( + inputs, + out_enc, + label=None, + valid_ratios=valid_ratios, + word_positions=word_positions, + train_mode=False) + return final_out diff --git a/ppocr/modeling/heads/sr_rensnet_transformer.py b/ppocr/modeling/heads/sr_rensnet_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..a004a12663ac2061a329236c58e147a017c80ba6 --- /dev/null +++ b/ppocr/modeling/heads/sr_rensnet_transformer.py @@ -0,0 +1,430 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +This code is refer from: +https://github.com/FudanVI/FudanOCR/blob/main/text-gestalt/loss/transformer_english_decomposition.py +""" +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import math, copy +import numpy as np + +# stroke-level alphabet +alphabet = '0123456789' + + +def get_alphabet_len(): + return len(alphabet) + + +def subsequent_mask(size): + """Generate a square mask for the sequence. The masked positions are filled with float('-inf'). + Unmasked positions are filled with float(0.0). + """ + mask = paddle.ones([1, size, size], dtype='float32') + mask_inf = paddle.triu( + paddle.full( + shape=[1, size, size], dtype='float32', fill_value='-inf'), + diagonal=1) + mask = mask + mask_inf + padding_mask = paddle.equal(mask, paddle.to_tensor(1, dtype=mask.dtype)) + return padding_mask + + +def clones(module, N): + return nn.LayerList([copy.deepcopy(module) for _ in range(N)]) + + +def masked_fill(x, mask, value): + y = paddle.full(x.shape, value, x.dtype) + return paddle.where(mask, y, x) + + +def attention(query, key, value, mask=None, dropout=None, attention_map=None): + d_k = query.shape[-1] + scores = paddle.matmul(query, + paddle.transpose(key, [0, 1, 3, 2])) / math.sqrt(d_k) + + if mask is not None: + scores = masked_fill(scores, mask == 0, float('-inf')) + else: + pass + + p_attn = F.softmax(scores, axis=-1) + + if dropout is not None: + p_attn = dropout(p_attn) + return paddle.matmul(p_attn, value), p_attn + + +class MultiHeadedAttention(nn.Layer): + def __init__(self, h, d_model, dropout=0.1, compress_attention=False): + super(MultiHeadedAttention, self).__init__() + assert d_model % h == 0 + self.d_k = d_model // h + self.h = h + self.linears = clones(nn.Linear(d_model, d_model), 4) + self.attn = None + self.dropout = nn.Dropout(p=dropout, mode="downscale_in_infer") + self.compress_attention = compress_attention + self.compress_attention_linear = nn.Linear(h, 1) + + def forward(self, query, key, value, mask=None, attention_map=None): + if mask is not None: + mask = mask.unsqueeze(1) + nbatches = query.shape[0] + + query, key, value = \ + [paddle.transpose(l(x).reshape([nbatches, -1, self.h, self.d_k]), [0,2,1,3]) + for l, x in zip(self.linears, (query, key, value))] + + x, attention_map = attention( + query, + key, + value, + mask=mask, + dropout=self.dropout, + attention_map=attention_map) + + x = paddle.reshape( + paddle.transpose(x, [0, 2, 1, 3]), + [nbatches, -1, self.h * self.d_k]) + + return self.linears[-1](x), attention_map + + +class ResNet(nn.Layer): + def __init__(self, num_in, block, layers): + super(ResNet, self).__init__() + + self.conv1 = nn.Conv2D(num_in, 64, kernel_size=3, stride=1, padding=1) + self.bn1 = nn.BatchNorm2D(64, use_global_stats=True) + self.relu1 = nn.ReLU() + self.pool = nn.MaxPool2D((2, 2), (2, 2)) + + self.conv2 = nn.Conv2D(64, 128, kernel_size=3, stride=1, padding=1) + self.bn2 = nn.BatchNorm2D(128, use_global_stats=True) + self.relu2 = nn.ReLU() + + self.layer1_pool = nn.MaxPool2D((2, 2), (2, 2)) + self.layer1 = self._make_layer(block, 128, 256, layers[0]) + self.layer1_conv = nn.Conv2D(256, 256, 3, 1, 1) + self.layer1_bn = nn.BatchNorm2D(256, use_global_stats=True) + self.layer1_relu = nn.ReLU() + + self.layer2_pool = nn.MaxPool2D((2, 2), (2, 2)) + self.layer2 = self._make_layer(block, 256, 256, layers[1]) + self.layer2_conv = nn.Conv2D(256, 256, 3, 1, 1) + self.layer2_bn = nn.BatchNorm2D(256, use_global_stats=True) + self.layer2_relu = nn.ReLU() + + self.layer3_pool = nn.MaxPool2D((2, 2), (2, 2)) + 
self.layer3 = self._make_layer(block, 256, 512, layers[2]) + self.layer3_conv = nn.Conv2D(512, 512, 3, 1, 1) + self.layer3_bn = nn.BatchNorm2D(512, use_global_stats=True) + self.layer3_relu = nn.ReLU() + + self.layer4_pool = nn.MaxPool2D((2, 2), (2, 2)) + self.layer4 = self._make_layer(block, 512, 512, layers[3]) + self.layer4_conv2 = nn.Conv2D(512, 1024, 3, 1, 1) + self.layer4_conv2_bn = nn.BatchNorm2D(1024, use_global_stats=True) + self.layer4_conv2_relu = nn.ReLU() + + def _make_layer(self, block, inplanes, planes, blocks): + + if inplanes != planes: + downsample = nn.Sequential( + nn.Conv2D(inplanes, planes, 3, 1, 1), + nn.BatchNorm2D( + planes, use_global_stats=True), ) + else: + downsample = None + layers = [] + layers.append(block(inplanes, planes, downsample)) + for i in range(1, blocks): + layers.append(block(planes, planes, downsample=None)) + + return nn.Sequential(*layers) + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu1(x) + x = self.pool(x) + + x = self.conv2(x) + x = self.bn2(x) + x = self.relu2(x) + + x = self.layer1_pool(x) + x = self.layer1(x) + x = self.layer1_conv(x) + x = self.layer1_bn(x) + x = self.layer1_relu(x) + + x = self.layer2(x) + x = self.layer2_conv(x) + x = self.layer2_bn(x) + x = self.layer2_relu(x) + + x = self.layer3(x) + x = self.layer3_conv(x) + x = self.layer3_bn(x) + x = self.layer3_relu(x) + + x = self.layer4(x) + x = self.layer4_conv2(x) + x = self.layer4_conv2_bn(x) + x = self.layer4_conv2_relu(x) + + return x + + +class Bottleneck(nn.Layer): + def __init__(self, input_dim): + super(Bottleneck, self).__init__() + self.conv1 = nn.Conv2D(input_dim, input_dim, 1) + self.bn1 = nn.BatchNorm2D(input_dim, use_global_stats=True) + self.relu = nn.ReLU() + + self.conv2 = nn.Conv2D(input_dim, input_dim, 3, 1, 1) + self.bn2 = nn.BatchNorm2D(input_dim, use_global_stats=True) + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + out += residual + out = self.relu(out) + + return out + + +class PositionalEncoding(nn.Layer): + "Implement the PE function." + + def __init__(self, dropout, dim, max_len=5000): + super(PositionalEncoding, self).__init__() + self.dropout = nn.Dropout(p=dropout, mode="downscale_in_infer") + + pe = paddle.zeros([max_len, dim]) + position = paddle.arange(0, max_len, dtype=paddle.float32).unsqueeze(1) + div_term = paddle.exp( + paddle.arange(0, dim, 2).astype('float32') * + (-math.log(10000.0) / dim)) + pe[:, 0::2] = paddle.sin(position * div_term) + pe[:, 1::2] = paddle.cos(position * div_term) + pe = paddle.unsqueeze(pe, 0) + self.register_buffer('pe', pe) + + def forward(self, x): + x = x + self.pe[:, :paddle.shape(x)[1]] + return self.dropout(x) + + +class PositionwiseFeedForward(nn.Layer): + "Implements FFN equation." + + def __init__(self, d_model, d_ff, dropout=0.1): + super(PositionwiseFeedForward, self).__init__() + self.w_1 = nn.Linear(d_model, d_ff) + self.w_2 = nn.Linear(d_ff, d_model) + self.dropout = nn.Dropout(dropout, mode="downscale_in_infer") + + def forward(self, x): + return self.w_2(self.dropout(F.relu(self.w_1(x)))) + + +class Generator(nn.Layer): + "Define standard linear + softmax generation step." 
+ + def __init__(self, d_model, vocab): + super(Generator, self).__init__() + self.proj = nn.Linear(d_model, vocab) + self.relu = nn.ReLU() + + def forward(self, x): + out = self.proj(x) + return out + + +class Embeddings(nn.Layer): + def __init__(self, d_model, vocab): + super(Embeddings, self).__init__() + self.lut = nn.Embedding(vocab, d_model) + self.d_model = d_model + + def forward(self, x): + embed = self.lut(x) * math.sqrt(self.d_model) + return embed + + +class LayerNorm(nn.Layer): + "Construct a layernorm module (See citation for details)." + + def __init__(self, features, eps=1e-6): + super(LayerNorm, self).__init__() + self.a_2 = self.create_parameter( + shape=[features], + default_initializer=paddle.nn.initializer.Constant(1.0)) + self.b_2 = self.create_parameter( + shape=[features], + default_initializer=paddle.nn.initializer.Constant(0.0)) + self.eps = eps + + def forward(self, x): + mean = x.mean(-1, keepdim=True) + std = x.std(-1, keepdim=True) + return self.a_2 * (x - mean) / (std + self.eps) + self.b_2 + + +class Decoder(nn.Layer): + def __init__(self): + super(Decoder, self).__init__() + + self.mask_multihead = MultiHeadedAttention( + h=16, d_model=1024, dropout=0.1) + self.mul_layernorm1 = LayerNorm(1024) + + self.multihead = MultiHeadedAttention(h=16, d_model=1024, dropout=0.1) + self.mul_layernorm2 = LayerNorm(1024) + + self.pff = PositionwiseFeedForward(1024, 2048) + self.mul_layernorm3 = LayerNorm(1024) + + def forward(self, text, conv_feature, attention_map=None): + text_max_length = text.shape[1] + mask = subsequent_mask(text_max_length) + result = text + result = self.mul_layernorm1(result + self.mask_multihead( + text, text, text, mask=mask)[0]) + b, c, h, w = conv_feature.shape + conv_feature = paddle.transpose( + conv_feature.reshape([b, c, h * w]), [0, 2, 1]) + word_image_align, attention_map = self.multihead( + result, + conv_feature, + conv_feature, + mask=None, + attention_map=attention_map) + result = self.mul_layernorm2(result + word_image_align) + result = self.mul_layernorm3(result + self.pff(result)) + + return result, attention_map + + +class BasicBlock(nn.Layer): + def __init__(self, inplanes, planes, downsample): + super(BasicBlock, self).__init__() + self.conv1 = nn.Conv2D( + inplanes, planes, kernel_size=3, stride=1, padding=1) + self.bn1 = nn.BatchNorm2D(planes, use_global_stats=True) + self.relu = nn.ReLU() + self.conv2 = nn.Conv2D( + planes, planes, kernel_size=3, stride=1, padding=1) + self.bn2 = nn.BatchNorm2D(planes, use_global_stats=True) + self.downsample = downsample + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample != None: + residual = self.downsample(residual) + + out += residual + out = self.relu(out) + + return out + + +class Encoder(nn.Layer): + def __init__(self): + super(Encoder, self).__init__() + self.cnn = ResNet(num_in=1, block=BasicBlock, layers=[1, 2, 5, 3]) + + def forward(self, input): + conv_result = self.cnn(input) + return conv_result + + +class Transformer(nn.Layer): + def __init__(self, in_channels=1): + super(Transformer, self).__init__() + + word_n_class = get_alphabet_len() + self.embedding_word_with_upperword = Embeddings(512, word_n_class) + self.pe = PositionalEncoding(dim=512, dropout=0.1, max_len=5000) + + self.encoder = Encoder() + self.decoder = Decoder() + self.generator_word_with_upperword = Generator(1024, word_n_class) + + for p in self.parameters(): + if p.dim() > 1: + 
nn.initializer.XavierNormal(p) + + def forward(self, image, text_length, text_input, attention_map=None): + if image.shape[1] == 3: + R = image[:, 0:1, :, :] + G = image[:, 1:2, :, :] + B = image[:, 2:3, :, :] + image = 0.299 * R + 0.587 * G + 0.114 * B + + conv_feature = self.encoder(image) # batch, 1024, 8, 32 + max_length = max(text_length) + text_input = text_input[:, :max_length] + + text_embedding = self.embedding_word_with_upperword( + text_input) # batch, text_max_length, 512 + postion_embedding = self.pe( + paddle.zeros(text_embedding.shape)) # batch, text_max_length, 512 + text_input_with_pe = paddle.concat([text_embedding, postion_embedding], + 2) # batch, text_max_length, 1024 + batch, seq_len, _ = text_input_with_pe.shape + + text_input_with_pe, word_attention_map = self.decoder( + text_input_with_pe, conv_feature) + + word_decoder_result = self.generator_word_with_upperword( + text_input_with_pe) + + if self.training: + total_length = paddle.sum(text_length) + probs_res = paddle.zeros([total_length, get_alphabet_len()]) + start = 0 + + for index, length in enumerate(text_length): + length = int(length.numpy()) + probs_res[start:start + length, :] = word_decoder_result[ + index, 0:0 + length, :] + + start = start + length + + return probs_res, word_attention_map, None + else: + return word_decoder_result diff --git a/ppocr/modeling/heads/table_att_head.py b/ppocr/modeling/heads/table_att_head.py index 4f39d6253d8d596fecdc4736666a6d3106601a82..d3c86e22b02e08c18d8d5cb193f2ffb8b07ad785 100644 --- a/ppocr/modeling/heads/table_att_head.py +++ b/ppocr/modeling/heads/table_att_head.py @@ -18,12 +18,26 @@ from __future__ import print_function import paddle import paddle.nn as nn +from paddle import ParamAttr import paddle.nn.functional as F import numpy as np from .rec_att_head import AttentionGRUCell +def get_para_bias_attr(l2_decay, k): + if l2_decay > 0: + regularizer = paddle.regularizer.L2Decay(l2_decay) + stdv = 1.0 / math.sqrt(k * 1.0) + initializer = nn.initializer.Uniform(-stdv, stdv) + else: + regularizer = None + initializer = None + weight_attr = ParamAttr(regularizer=regularizer, initializer=initializer) + bias_attr = ParamAttr(regularizer=regularizer, initializer=initializer) + return [weight_attr, bias_attr] + + class TableAttentionHead(nn.Layer): def __init__(self, in_channels, @@ -32,7 +46,7 @@ class TableAttentionHead(nn.Layer): in_max_len=488, max_text_length=800, out_channels=30, - point_num=2, + loc_reg_num=4, **kwargs): super(TableAttentionHead, self).__init__() self.input_size = in_channels[-1] @@ -56,7 +70,7 @@ class TableAttentionHead(nn.Layer): else: self.loc_fea_trans = nn.Linear(256, self.max_text_length + 1) self.loc_generator = nn.Linear(self.input_size + hidden_size, - point_num * 2) + loc_reg_num) def _char_to_onehot(self, input_char, onehot_dim): input_ont_hot = F.one_hot(input_char, onehot_dim) @@ -129,3 +143,122 @@ class TableAttentionHead(nn.Layer): loc_preds = self.loc_generator(loc_concat) loc_preds = F.sigmoid(loc_preds) return {'structure_probs': structure_probs, 'loc_preds': loc_preds} + + +class SLAHead(nn.Layer): + def __init__(self, + in_channels, + hidden_size, + out_channels=30, + max_text_length=500, + loc_reg_num=4, + fc_decay=0.0, + **kwargs): + """ + @param in_channels: input shape + @param hidden_size: hidden_size for RNN and Embedding + @param out_channels: num_classes to rec + @param max_text_length: max text pred + """ + super().__init__() + in_channels = in_channels[-1] + self.hidden_size = hidden_size + self.max_text_length = 
max_text_length + self.emb = self._char_to_onehot + self.num_embeddings = out_channels + self.loc_reg_num = loc_reg_num + + # structure + self.structure_attention_cell = AttentionGRUCell( + in_channels, hidden_size, self.num_embeddings) + weight_attr, bias_attr = get_para_bias_attr( + l2_decay=fc_decay, k=hidden_size) + weight_attr1_1, bias_attr1_1 = get_para_bias_attr( + l2_decay=fc_decay, k=hidden_size) + weight_attr1_2, bias_attr1_2 = get_para_bias_attr( + l2_decay=fc_decay, k=hidden_size) + self.structure_generator = nn.Sequential( + nn.Linear( + self.hidden_size, + self.hidden_size, + weight_attr=weight_attr1_2, + bias_attr=bias_attr1_2), + nn.Linear( + hidden_size, + out_channels, + weight_attr=weight_attr, + bias_attr=bias_attr)) + # loc + weight_attr1, bias_attr1 = get_para_bias_attr( + l2_decay=fc_decay, k=self.hidden_size) + weight_attr2, bias_attr2 = get_para_bias_attr( + l2_decay=fc_decay, k=self.hidden_size) + self.loc_generator = nn.Sequential( + nn.Linear( + self.hidden_size, + self.hidden_size, + weight_attr=weight_attr1, + bias_attr=bias_attr1), + nn.Linear( + self.hidden_size, + loc_reg_num, + weight_attr=weight_attr2, + bias_attr=bias_attr2), + nn.Sigmoid()) + + def forward(self, inputs, targets=None): + fea = inputs[-1] + batch_size = fea.shape[0] + # reshape + fea = paddle.reshape(fea, [fea.shape[0], fea.shape[1], -1]) + fea = fea.transpose([0, 2, 1]) # (NTC)(batch, width, channels) + + hidden = paddle.zeros((batch_size, self.hidden_size)) + structure_preds = paddle.zeros((batch_size, self.max_text_length + 1, self.num_embeddings)) + loc_preds = paddle.zeros((batch_size, self.max_text_length + 1, self.loc_reg_num)) + structure_preds.stop_gradient = True + loc_preds.stop_gradient = True + if self.training and targets is not None: + structure = targets[0] + for i in range(self.max_text_length + 1): + hidden, structure_step, loc_step = self._decode(structure[:, i], + fea, hidden) + structure_preds[:, i, :] = structure_step + loc_preds[:, i, :] = loc_step + else: + pre_chars = paddle.zeros(shape=[batch_size], dtype="int32") + max_text_length = paddle.to_tensor(self.max_text_length) + # for export + loc_step, structure_step = None, None + for i in range(max_text_length + 1): + hidden, structure_step, loc_step = self._decode(pre_chars, fea, + hidden) + pre_chars = structure_step.argmax(axis=1, dtype="int32") + structure_preds[:, i, :] = structure_step + loc_preds[:, i, :] = loc_step + if not self.training: + structure_preds = F.softmax(structure_preds) + return {'structure_probs': structure_preds, 'loc_preds': loc_preds} + + def _decode(self, pre_chars, features, hidden): + """ + Predict table label and coordinates for each step + @param pre_chars: Table label in previous step + @param features: + @param hidden: hidden status in previous step + @return: + """ + emb_feature = self.emb(pre_chars) + # output shape is b * self.hidden_size + (output, hidden), alpha = self.structure_attention_cell( + hidden, features, emb_feature) + + # structure + structure_step = self.structure_generator(output) + # loc + loc_step = self.loc_generator(output) + return hidden, structure_step, loc_step + + def _char_to_onehot(self, input_char): + input_ont_hot = F.one_hot(input_char, self.num_embeddings) + return input_ont_hot diff --git a/ppocr/modeling/heads/table_master_head.py b/ppocr/modeling/heads/table_master_head.py index fddbcc63fcd6d5380f9fdd96f9ca85756d666442..486f9cbea13c15b0f3a6d608789163f18f678914 100644 --- a/ppocr/modeling/heads/table_master_head.py +++ 
b/ppocr/modeling/heads/table_master_head.py @@ -37,7 +37,7 @@ class TableMasterHead(nn.Layer): d_ff=2048, dropout=0, max_text_length=500, - point_num=2, + loc_reg_num=4, **kwargs): super(TableMasterHead, self).__init__() hidden_size = in_channels[-1] @@ -50,7 +50,7 @@ class TableMasterHead(nn.Layer): self.cls_fc = nn.Linear(hidden_size, out_channels) self.bbox_fc = nn.Sequential( # nn.Linear(hidden_size, hidden_size), - nn.Linear(hidden_size, point_num * 2), + nn.Linear(hidden_size, loc_reg_num), nn.Sigmoid()) self.norm = nn.LayerNorm(hidden_size) self.embedding = Embeddings(d_model=hidden_size, vocab=out_channels) @@ -59,7 +59,7 @@ class TableMasterHead(nn.Layer): self.SOS = out_channels - 3 self.PAD = out_channels - 1 self.out_channels = out_channels - self.point_num = point_num + self.loc_reg_num = loc_reg_num self.max_text_length = max_text_length def make_mask(self, tgt): @@ -105,7 +105,7 @@ class TableMasterHead(nn.Layer): output = paddle.zeros( [input.shape[0], self.max_text_length + 1, self.out_channels]) bbox_output = paddle.zeros( - [input.shape[0], self.max_text_length + 1, self.point_num * 2]) + [input.shape[0], self.max_text_length + 1, self.loc_reg_num]) max_text_length = paddle.to_tensor(self.max_text_length) for i in range(max_text_length + 1): target_mask = self.make_mask(input) diff --git a/ppocr/modeling/necks/__init__.py b/ppocr/modeling/necks/__init__.py index e10b082d11be69b1865f0093b6fec442b255f03a..c7e8dd068b4a68e56b066ca8fa629644a8f302c6 100644 --- a/ppocr/modeling/necks/__init__.py +++ b/ppocr/modeling/necks/__init__.py @@ -25,13 +25,16 @@ def build_neck(config): from .fpn import FPN from .fce_fpn import FCEFPN from .pren_fpn import PRENFPN + from .csp_pan import CSPPAN + from .ct_fpn import CTFPN support_dict = [ 'FPN', 'FCEFPN', 'LKPAN', 'DBFPN', 'RSEFPN', 'EASTFPN', 'SASTFPN', - 'SequenceEncoder', 'PGFPN', 'TableFPN', 'PRENFPN' + 'SequenceEncoder', 'PGFPN', 'TableFPN', 'PRENFPN', 'CSPPAN', 'CTFPN' ] module_name = config.pop('name') assert module_name in support_dict, Exception('neck only support {}'.format( support_dict)) + module_class = eval(module_name)(**config) return module_class diff --git a/ppocr/modeling/necks/csp_pan.py b/ppocr/modeling/necks/csp_pan.py new file mode 100755 index 0000000000000000000000000000000000000000..f4f8547f7d80d25edfe66824aa4f104341ae29ef --- /dev/null +++ b/ppocr/modeling/necks/csp_pan.py @@ -0,0 +1,324 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# The code is based on: +# https://github.com/PaddlePaddle/PaddleDetection/blob/release%2F2.3/ppdet/modeling/necks/csp_pan.py + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr + +__all__ = ['CSPPAN'] + + +class ConvBNLayer(nn.Layer): + def __init__(self, + in_channel=96, + out_channel=96, + kernel_size=3, + stride=1, + groups=1, + act='leaky_relu'): + super(ConvBNLayer, self).__init__() + initializer = nn.initializer.KaimingUniform() + self.act = act + assert self.act in ['leaky_relu', "hard_swish"] + self.conv = nn.Conv2D( + in_channels=in_channel, + out_channels=out_channel, + kernel_size=kernel_size, + groups=groups, + padding=(kernel_size - 1) // 2, + stride=stride, + weight_attr=ParamAttr(initializer=initializer), + bias_attr=False) + self.bn = nn.BatchNorm2D(out_channel) + + def forward(self, x): + x = self.bn(self.conv(x)) + if self.act == "leaky_relu": + x = F.leaky_relu(x) + elif self.act == "hard_swish": + x = F.hardswish(x) + return x + + +class DPModule(nn.Layer): + """ + Depth-wise and point-wise module. + Args: + in_channel (int): The input channels of this Module. + out_channel (int): The output channels of this Module. + kernel_size (int): The conv2d kernel size of this Module. + stride (int): The conv2d's stride of this Module. + act (str): The activation function of this Module, + Now support `leaky_relu` and `hard_swish`. + """ + + def __init__(self, + in_channel=96, + out_channel=96, + kernel_size=3, + stride=1, + act='leaky_relu'): + super(DPModule, self).__init__() + initializer = nn.initializer.KaimingUniform() + self.act = act + self.dwconv = nn.Conv2D( + in_channels=in_channel, + out_channels=out_channel, + kernel_size=kernel_size, + groups=out_channel, + padding=(kernel_size - 1) // 2, + stride=stride, + weight_attr=ParamAttr(initializer=initializer), + bias_attr=False) + self.bn1 = nn.BatchNorm2D(out_channel) + self.pwconv = nn.Conv2D( + in_channels=out_channel, + out_channels=out_channel, + kernel_size=1, + groups=1, + padding=0, + weight_attr=ParamAttr(initializer=initializer), + bias_attr=False) + self.bn2 = nn.BatchNorm2D(out_channel) + + def act_func(self, x): + if self.act == "leaky_relu": + x = F.leaky_relu(x) + elif self.act == "hard_swish": + x = F.hardswish(x) + return x + + def forward(self, x): + x = self.act_func(self.bn1(self.dwconv(x))) + x = self.act_func(self.bn2(self.pwconv(x))) + return x + + +class DarknetBottleneck(nn.Layer): + """The basic bottleneck block used in Darknet. + Each Block consists of two ConvModules and the input is added to the + final output. Each ConvModule is composed of Conv, BN, and act. + The first convLayer has filter size of 1x1 and the second one has the + filter size of 3x3. + Args: + in_channels (int): The input channels of this Module. + out_channels (int): The output channels of this Module. + expansion (int): The kernel size of the convolution. Default: 0.5 + add_identity (bool): Whether to add identity to the out. + Default: True + use_depthwise (bool): Whether to use depthwise separable convolution. 
+ Default: False + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size=3, + expansion=0.5, + add_identity=True, + use_depthwise=False, + act="leaky_relu"): + super(DarknetBottleneck, self).__init__() + hidden_channels = int(out_channels * expansion) + conv_func = DPModule if use_depthwise else ConvBNLayer + self.conv1 = ConvBNLayer( + in_channel=in_channels, + out_channel=hidden_channels, + kernel_size=1, + act=act) + self.conv2 = conv_func( + in_channel=hidden_channels, + out_channel=out_channels, + kernel_size=kernel_size, + stride=1, + act=act) + self.add_identity = \ + add_identity and in_channels == out_channels + + def forward(self, x): + identity = x + out = self.conv1(x) + out = self.conv2(out) + + if self.add_identity: + return out + identity + else: + return out + + +class CSPLayer(nn.Layer): + """Cross Stage Partial Layer. + Args: + in_channels (int): The input channels of the CSP layer. + out_channels (int): The output channels of the CSP layer. + expand_ratio (float): Ratio to adjust the number of channels of the + hidden layer. Default: 0.5 + num_blocks (int): Number of blocks. Default: 1 + add_identity (bool): Whether to add identity in blocks. + Default: True + use_depthwise (bool): Whether to depthwise separable convolution in + blocks. Default: False + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size=3, + expand_ratio=0.5, + num_blocks=1, + add_identity=True, + use_depthwise=False, + act="leaky_relu"): + super().__init__() + mid_channels = int(out_channels * expand_ratio) + self.main_conv = ConvBNLayer(in_channels, mid_channels, 1, act=act) + self.short_conv = ConvBNLayer(in_channels, mid_channels, 1, act=act) + self.final_conv = ConvBNLayer( + 2 * mid_channels, out_channels, 1, act=act) + + self.blocks = nn.Sequential(* [ + DarknetBottleneck( + mid_channels, + mid_channels, + kernel_size, + 1.0, + add_identity, + use_depthwise, + act=act) for _ in range(num_blocks) + ]) + + def forward(self, x): + x_short = self.short_conv(x) + + x_main = self.main_conv(x) + x_main = self.blocks(x_main) + + x_final = paddle.concat((x_main, x_short), axis=1) + return self.final_conv(x_final) + + +class Channel_T(nn.Layer): + def __init__(self, + in_channels=[116, 232, 464], + out_channels=96, + act="leaky_relu"): + super(Channel_T, self).__init__() + self.convs = nn.LayerList() + for i in range(len(in_channels)): + self.convs.append( + ConvBNLayer( + in_channels[i], out_channels, 1, act=act)) + + def forward(self, x): + outs = [self.convs[i](x[i]) for i in range(len(x))] + return outs + + +class CSPPAN(nn.Layer): + """Path Aggregation Network with CSP module. + Args: + in_channels (List[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale) + kernel_size (int): The conv2d kernel size of this Module. + num_csp_blocks (int): Number of bottlenecks in CSPLayer. Default: 1 + use_depthwise (bool): Whether to depthwise separable convolution in + blocks. 
Default: True + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size=5, + num_csp_blocks=1, + use_depthwise=True, + act='hard_swish'): + super(CSPPAN, self).__init__() + self.in_channels = in_channels + self.out_channels = [out_channels] * len(in_channels) + conv_func = DPModule if use_depthwise else ConvBNLayer + + self.conv_t = Channel_T(in_channels, out_channels, act=act) + + # build top-down blocks + self.upsample = nn.Upsample(scale_factor=2, mode='nearest') + self.top_down_blocks = nn.LayerList() + for idx in range(len(in_channels) - 1, 0, -1): + self.top_down_blocks.append( + CSPLayer( + out_channels * 2, + out_channels, + kernel_size=kernel_size, + num_blocks=num_csp_blocks, + add_identity=False, + use_depthwise=use_depthwise, + act=act)) + + # build bottom-up blocks + self.downsamples = nn.LayerList() + self.bottom_up_blocks = nn.LayerList() + for idx in range(len(in_channels) - 1): + self.downsamples.append( + conv_func( + out_channels, + out_channels, + kernel_size=kernel_size, + stride=2, + act=act)) + self.bottom_up_blocks.append( + CSPLayer( + out_channels * 2, + out_channels, + kernel_size=kernel_size, + num_blocks=num_csp_blocks, + add_identity=False, + use_depthwise=use_depthwise, + act=act)) + + def forward(self, inputs): + """ + Args: + inputs (tuple[Tensor]): input features. + Returns: + tuple[Tensor]: CSPPAN features. + """ + assert len(inputs) == len(self.in_channels) + inputs = self.conv_t(inputs) + + # top-down path + inner_outs = [inputs[-1]] + for idx in range(len(self.in_channels) - 1, 0, -1): + feat_heigh = inner_outs[0] + feat_low = inputs[idx - 1] + upsample_feat = F.upsample( + feat_heigh, size=paddle.shape(feat_low)[2:4], mode="nearest") + + inner_out = self.top_down_blocks[len(self.in_channels) - 1 - idx]( + paddle.concat([upsample_feat, feat_low], 1)) + inner_outs.insert(0, inner_out) + + # bottom-up path + outs = [inner_outs[0]] + for idx in range(len(self.in_channels) - 1): + feat_low = outs[-1] + feat_height = inner_outs[idx + 1] + downsample_feat = self.downsamples[idx](feat_low) + out = self.bottom_up_blocks[idx](paddle.concat( + [downsample_feat, feat_height], 1)) + outs.append(out) + + return tuple(outs) diff --git a/ppocr/modeling/necks/ct_fpn.py b/ppocr/modeling/necks/ct_fpn.py new file mode 100644 index 0000000000000000000000000000000000000000..ee4d25e901b5b3093588571f0412a931eaf6f364 --- /dev/null +++ b/ppocr/modeling/necks/ct_fpn.py @@ -0,0 +1,185 @@ +# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import nn +import paddle.nn.functional as F +from paddle import ParamAttr +import os +import sys + +import math +from paddle.nn.initializer import TruncatedNormal, Constant, Normal +ones_ = Constant(value=1.) +zeros_ = Constant(value=0.) 
+ +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(__dir__) +sys.path.insert(0, os.path.abspath(os.path.join(__dir__, '../../..'))) + + +class Conv_BN_ReLU(nn.Layer): + def __init__(self, + in_planes, + out_planes, + kernel_size=1, + stride=1, + padding=0): + super(Conv_BN_ReLU, self).__init__() + self.conv = nn.Conv2D( + in_planes, + out_planes, + kernel_size=kernel_size, + stride=stride, + padding=padding, + bias_attr=False) + self.bn = nn.BatchNorm2D(out_planes) + self.relu = nn.ReLU() + + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + n = m._kernel_size[0] * m._kernel_size[1] * m._out_channels + normal_ = Normal(mean=0.0, std=math.sqrt(2. / n)) + normal_(m.weight) + elif isinstance(m, nn.BatchNorm2D): + zeros_(m.bias) + ones_(m.weight) + + def forward(self, x): + return self.relu(self.bn(self.conv(x))) + + +class FPEM(nn.Layer): + def __init__(self, in_channels, out_channels): + super(FPEM, self).__init__() + planes = out_channels + self.dwconv3_1 = nn.Conv2D( + planes, + planes, + kernel_size=3, + stride=1, + padding=1, + groups=planes, + bias_attr=False) + self.smooth_layer3_1 = Conv_BN_ReLU(planes, planes) + + self.dwconv2_1 = nn.Conv2D( + planes, + planes, + kernel_size=3, + stride=1, + padding=1, + groups=planes, + bias_attr=False) + self.smooth_layer2_1 = Conv_BN_ReLU(planes, planes) + + self.dwconv1_1 = nn.Conv2D( + planes, + planes, + kernel_size=3, + stride=1, + padding=1, + groups=planes, + bias_attr=False) + self.smooth_layer1_1 = Conv_BN_ReLU(planes, planes) + + self.dwconv2_2 = nn.Conv2D( + planes, + planes, + kernel_size=3, + stride=2, + padding=1, + groups=planes, + bias_attr=False) + self.smooth_layer2_2 = Conv_BN_ReLU(planes, planes) + + self.dwconv3_2 = nn.Conv2D( + planes, + planes, + kernel_size=3, + stride=2, + padding=1, + groups=planes, + bias_attr=False) + self.smooth_layer3_2 = Conv_BN_ReLU(planes, planes) + + self.dwconv4_2 = nn.Conv2D( + planes, + planes, + kernel_size=3, + stride=2, + padding=1, + groups=planes, + bias_attr=False) + self.smooth_layer4_2 = Conv_BN_ReLU(planes, planes) + + def _upsample_add(self, x, y): + return F.upsample(x, scale_factor=2, mode='bilinear') + y + + def forward(self, f1, f2, f3, f4): + # up-down + f3 = self.smooth_layer3_1(self.dwconv3_1(self._upsample_add(f4, f3))) + f2 = self.smooth_layer2_1(self.dwconv2_1(self._upsample_add(f3, f2))) + f1 = self.smooth_layer1_1(self.dwconv1_1(self._upsample_add(f2, f1))) + + # down-up + f2 = self.smooth_layer2_2(self.dwconv2_2(self._upsample_add(f2, f1))) + f3 = self.smooth_layer3_2(self.dwconv3_2(self._upsample_add(f3, f2))) + f4 = self.smooth_layer4_2(self.dwconv4_2(self._upsample_add(f4, f3))) + + return f1, f2, f3, f4 + + +class CTFPN(nn.Layer): + def __init__(self, in_channels, out_channel=128): + super(CTFPN, self).__init__() + self.out_channels = out_channel * 4 + + self.reduce_layer1 = Conv_BN_ReLU(in_channels[0], 128) + self.reduce_layer2 = Conv_BN_ReLU(in_channels[1], 128) + self.reduce_layer3 = Conv_BN_ReLU(in_channels[2], 128) + self.reduce_layer4 = Conv_BN_ReLU(in_channels[3], 128) + + self.fpem1 = FPEM(in_channels=(64, 128, 256, 512), out_channels=128) + self.fpem2 = FPEM(in_channels=(64, 128, 256, 512), out_channels=128) + + def _upsample(self, x, scale=1): + return F.upsample(x, scale_factor=scale, mode='bilinear') + + def forward(self, f): + # # reduce channel + f1 = self.reduce_layer1(f[0]) # N,64,160,160 --> N, 128, 160, 160 + f2 = self.reduce_layer2(f[1]) # N, 128, 80, 80 --> N, 128, 80, 80 + f3 = self.reduce_layer3(f[2]) # N, 256, 40, 
40 --> N, 128, 40, 40 + f4 = self.reduce_layer4(f[3]) # N, 512, 20, 20 --> N, 128, 20, 20 + + # FPEM + f1_1, f2_1, f3_1, f4_1 = self.fpem1(f1, f2, f3, f4) + f1_2, f2_2, f3_2, f4_2 = self.fpem2(f1_1, f2_1, f3_1, f4_1) + + # FFM + f1 = f1_1 + f1_2 + f2 = f2_1 + f2_2 + f3 = f3_1 + f3_2 + f4 = f4_1 + f4_2 + + f2 = self._upsample(f2, scale=2) + f3 = self._upsample(f3, scale=4) + f4 = self._upsample(f4, scale=8) + ff = paddle.concat((f1, f2, f3, f4), 1) # N,512, 160,160 + return ff diff --git a/ppocr/modeling/transforms/__init__.py b/ppocr/modeling/transforms/__init__.py index 7e4ffdf46854416f71e1c8f4e131d1f0283bb725..b22c60bb3d5e1933056d37bad208f4c311139c8e 100755 --- a/ppocr/modeling/transforms/__init__.py +++ b/ppocr/modeling/transforms/__init__.py @@ -18,10 +18,10 @@ __all__ = ['build_transform'] def build_transform(config): from .tps import TPS from .stn import STN_ON + from .tsrn import TSRN from .gaspin_transformer import GA_SPIN_Transformer as GA_SPIN - - support_dict = ['TPS', 'STN_ON', 'GA_SPIN'] + support_dict = ['TPS', 'STN_ON', 'GA_SPIN', 'TSRN'] module_name = config.pop('name') assert module_name in support_dict, Exception( diff --git a/ppocr/modeling/transforms/tps_spatial_transformer.py b/ppocr/modeling/transforms/tps_spatial_transformer.py index cb1cb10aaa98dffa2f720dc81afdf82d25e071ca..e7ec2c848f192d766722f824962a7f8d0fed41f9 100644 --- a/ppocr/modeling/transforms/tps_spatial_transformer.py +++ b/ppocr/modeling/transforms/tps_spatial_transformer.py @@ -153,4 +153,4 @@ class TPSSpatialTransformer(nn.Layer): # the input to grid_sample is normalized [-1, 1], but what we get is [0, 1] grid = 2.0 * grid - 1.0 output_maps = grid_sample(input, grid, canvas=None) - return output_maps, source_coordinate + return output_maps, source_coordinate \ No newline at end of file diff --git a/ppocr/modeling/transforms/tsrn.py b/ppocr/modeling/transforms/tsrn.py new file mode 100644 index 0000000000000000000000000000000000000000..31aa90ea4b5d5e8f071487899b72219f3e5b36f5 --- /dev/null +++ b/ppocr/modeling/transforms/tsrn.py @@ -0,0 +1,219 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +This code is refer from: +https://github.com/FudanVI/FudanOCR/blob/main/text-gestalt/model/tsrn.py +""" + +import math +import paddle +import paddle.nn.functional as F +from paddle import nn +from collections import OrderedDict +import sys +import numpy as np +import warnings +import math, copy +import cv2 + +warnings.filterwarnings("ignore") + +from .tps_spatial_transformer import TPSSpatialTransformer +from .stn import STN as STN_model +from ppocr.modeling.heads.sr_rensnet_transformer import Transformer + + +class TSRN(nn.Layer): + def __init__(self, + in_channels, + scale_factor=2, + width=128, + height=32, + STN=False, + srb_nums=5, + mask=False, + hidden_units=32, + infer_mode=False, + **kwargs): + super(TSRN, self).__init__() + in_planes = 3 + if mask: + in_planes = 4 + assert math.log(scale_factor, 2) % 1 == 0 + upsample_block_num = int(math.log(scale_factor, 2)) + self.block1 = nn.Sequential( + nn.Conv2D( + in_planes, 2 * hidden_units, kernel_size=9, padding=4), + nn.PReLU()) + self.srb_nums = srb_nums + for i in range(srb_nums): + setattr(self, 'block%d' % (i + 2), + RecurrentResidualBlock(2 * hidden_units)) + + setattr( + self, + 'block%d' % (srb_nums + 2), + nn.Sequential( + nn.Conv2D( + 2 * hidden_units, + 2 * hidden_units, + kernel_size=3, + padding=1), + nn.BatchNorm2D(2 * hidden_units))) + + block_ = [ + UpsampleBLock(2 * hidden_units, 2) + for _ in range(upsample_block_num) + ] + block_.append( + nn.Conv2D( + 2 * hidden_units, in_planes, kernel_size=9, padding=4)) + setattr(self, 'block%d' % (srb_nums + 3), nn.Sequential(*block_)) + self.tps_inputsize = [height // scale_factor, width // scale_factor] + tps_outputsize = [height // scale_factor, width // scale_factor] + num_control_points = 20 + tps_margins = [0.05, 0.05] + self.stn = STN + if self.stn: + self.tps = TPSSpatialTransformer( + output_image_size=tuple(tps_outputsize), + num_control_points=num_control_points, + margins=tuple(tps_margins)) + + self.stn_head = STN_model( + in_channels=in_planes, + num_ctrlpoints=num_control_points, + activation='none') + self.out_channels = in_channels + + self.r34_transformer = Transformer() + for param in self.r34_transformer.parameters(): + param.trainable = False + self.infer_mode = infer_mode + + def forward(self, x): + output = {} + if self.infer_mode: + output["lr_img"] = x + y = x + else: + output["lr_img"] = x[0] + output["hr_img"] = x[1] + y = x[0] + if self.stn and self.training: + _, ctrl_points_x = self.stn_head(y) + y, _ = self.tps(y, ctrl_points_x) + block = {'1': self.block1(y)} + for i in range(self.srb_nums + 1): + block[str(i + 2)] = getattr(self, + 'block%d' % (i + 2))(block[str(i + 1)]) + + block[str(self.srb_nums + 3)] = getattr(self, 'block%d' % (self.srb_nums + 3)) \ + ((block['1'] + block[str(self.srb_nums + 2)])) + + sr_img = paddle.tanh(block[str(self.srb_nums + 3)]) + + output["sr_img"] = sr_img + + if self.training: + hr_img = x[1] + length = x[2] + input_tensor = x[3] + + # add transformer + sr_pred, word_attention_map_pred, _ = self.r34_transformer( + sr_img, length, input_tensor) + + hr_pred, word_attention_map_gt, _ = self.r34_transformer( + hr_img, length, input_tensor) + + output["hr_img"] = hr_img + output["hr_pred"] = hr_pred + output["word_attention_map_gt"] = word_attention_map_gt + output["sr_pred"] = sr_pred + output["word_attention_map_pred"] = word_attention_map_pred + + return output + + +class RecurrentResidualBlock(nn.Layer): + def __init__(self, channels): + super(RecurrentResidualBlock, self).__init__() + self.conv1 = 
nn.Conv2D(channels, channels, kernel_size=3, padding=1) + self.bn1 = nn.BatchNorm2D(channels) + self.gru1 = GruBlock(channels, channels) + self.prelu = mish() + self.conv2 = nn.Conv2D(channels, channels, kernel_size=3, padding=1) + self.bn2 = nn.BatchNorm2D(channels) + self.gru2 = GruBlock(channels, channels) + + def forward(self, x): + residual = self.conv1(x) + residual = self.bn1(residual) + residual = self.prelu(residual) + residual = self.conv2(residual) + residual = self.bn2(residual) + residual = self.gru1(residual.transpose([0, 1, 3, 2])).transpose( + [0, 1, 3, 2]) + + return self.gru2(x + residual) + + +class UpsampleBLock(nn.Layer): + def __init__(self, in_channels, up_scale): + super(UpsampleBLock, self).__init__() + self.conv = nn.Conv2D( + in_channels, in_channels * up_scale**2, kernel_size=3, padding=1) + + self.pixel_shuffle = nn.PixelShuffle(up_scale) + self.prelu = mish() + + def forward(self, x): + x = self.conv(x) + x = self.pixel_shuffle(x) + x = self.prelu(x) + return x + + +class mish(nn.Layer): + def __init__(self, ): + super(mish, self).__init__() + self.activated = True + + def forward(self, x): + if self.activated: + x = x * (paddle.tanh(F.softplus(x))) + return x + + +class GruBlock(nn.Layer): + def __init__(self, in_channels, out_channels): + super(GruBlock, self).__init__() + assert out_channels % 2 == 0 + self.conv1 = nn.Conv2D( + in_channels, out_channels, kernel_size=1, padding=0) + self.gru = nn.GRU(out_channels, + out_channels // 2, + direction='bidirectional') + + def forward(self, x): + # x: b, c, w, h + x = self.conv1(x) + x = x.transpose([0, 2, 3, 1]) # b, w, h, c + batch_size, w, h, c = x.shape + x = x.reshape([-1, h, c]) # b*w, h, c + x, _ = self.gru(x) + x = x.reshape([-1, w, h, c]) + x = x.transpose([0, 3, 1, 2]) + return x diff --git a/ppocr/postprocess/__init__.py b/ppocr/postprocess/__init__.py index 7c0c7fd003a38966a24fd116d8cfd3805aed6797..35b7a6800da422264a796da14236ae8a484c30d9 100644 --- a/ppocr/postprocess/__init__.py +++ b/ppocr/postprocess/__init__.py @@ -34,6 +34,8 @@ from .pg_postprocess import PGPostProcess from .vqa_token_ser_layoutlm_postprocess import VQASerTokenLayoutLMPostProcess, DistillationSerPostProcess from .vqa_token_re_layoutlm_postprocess import VQAReTokenLayoutLMPostProcess, DistillationRePostProcess from .table_postprocess import TableMasterLabelDecode, TableLabelDecode +from .picodet_postprocess import PicoDetPostProcess +from .ct_postprocess import CTPostProcess def build_post_process(config, global_config=None): @@ -47,7 +49,7 @@ def build_post_process(config, global_config=None): 'DistillationSARLabelDecode', 'ViTSTRLabelDecode', 'ABINetLabelDecode', 'TableMasterLabelDecode', 'SPINLabelDecode', 'DistillationSerPostProcess', 'DistillationRePostProcess', - 'VLLabelDecode' + 'VLLabelDecode', 'PicoDetPostProcess', 'CTPostProcess' ] if config['name'] == 'PSEPostProcess': diff --git a/ppocr/postprocess/ct_postprocess.py b/ppocr/postprocess/ct_postprocess.py new file mode 100755 index 0000000000000000000000000000000000000000..3ab90be24d65888339698a5abe2ed692ceaab4c7 --- /dev/null +++ b/ppocr/postprocess/ct_postprocess.py @@ -0,0 +1,154 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refered from: +https://github.com/shengtao96/CentripetalText/blob/main/test.py +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import os.path as osp +import numpy as np +import cv2 +import paddle +import pyclipper + + +class CTPostProcess(object): + """ + The post process for Centripetal Text (CT). + """ + + def __init__(self, min_score=0.88, min_area=16, box_type='poly', **kwargs): + self.min_score = min_score + self.min_area = min_area + self.box_type = box_type + + self.coord = np.zeros((2, 300, 300), dtype=np.int32) + for i in range(300): + for j in range(300): + self.coord[0, i, j] = j + self.coord[1, i, j] = i + + def __call__(self, preds, batch): + outs = preds['maps'] + out_scores = preds['score'] + + if isinstance(outs, paddle.Tensor): + outs = outs.numpy() + if isinstance(out_scores, paddle.Tensor): + out_scores = out_scores.numpy() + + batch_size = outs.shape[0] + boxes_batch = [] + for idx in range(batch_size): + bboxes = [] + scores = [] + + img_shape = batch[idx] + + org_img_size = img_shape[:3] + img_shape = img_shape[3:] + img_size = img_shape[:2] + + out = np.expand_dims(outs[idx], axis=0) + outputs = dict() + + score = np.expand_dims(out_scores[idx], axis=0) + + kernel = out[:, 0, :, :] > 0.2 + loc = out[:, 1:, :, :].astype("float32") + + score = score[0].astype(np.float32) + kernel = kernel[0].astype(np.uint8) + loc = loc[0].astype(np.float32) + + label_num, label_kernel = cv2.connectedComponents( + kernel, connectivity=4) + + for i in range(1, label_num): + ind = (label_kernel == i) + if ind.sum( + ) < 10: # pixel number less than 10, treated as background + label_kernel[ind] = 0 + + label = np.zeros_like(label_kernel) + h, w = label_kernel.shape + pixels = self.coord[:, :h, :w].reshape(2, -1) + points = pixels.transpose([1, 0]).astype(np.float32) + + off_points = (points + 10. / 4. 
* loc[:, pixels[1], pixels[0]].T + ).astype(np.int32) + off_points[:, 0] = np.clip(off_points[:, 0], 0, label.shape[1] - 1) + off_points[:, 1] = np.clip(off_points[:, 1], 0, label.shape[0] - 1) + + label[pixels[1], pixels[0]] = label_kernel[off_points[:, 1], + off_points[:, 0]] + label[label_kernel > 0] = label_kernel[label_kernel > 0] + + score_pocket = [0.0] + for i in range(1, label_num): + ind = (label_kernel == i) + if ind.sum() == 0: + score_pocket.append(0.0) + continue + score_i = np.mean(score[ind]) + score_pocket.append(score_i) + + label_num = np.max(label) + 1 + label = cv2.resize( + label, (img_size[1], img_size[0]), + interpolation=cv2.INTER_NEAREST) + + scale = (float(org_img_size[1]) / float(img_size[1]), + float(org_img_size[0]) / float(img_size[0])) + + for i in range(1, label_num): + ind = (label == i) + points = np.array(np.where(ind)).transpose((1, 0)) + + if points.shape[0] < self.min_area: + continue + + score_i = score_pocket[i] + if score_i < self.min_score: + continue + + if self.box_type == 'rect': + rect = cv2.minAreaRect(points[:, ::-1]) + bbox = cv2.boxPoints(rect) * scale + z = bbox.mean(0) + bbox = z + (bbox - z) * 0.85 + elif self.box_type == 'poly': + binary = np.zeros(label.shape, dtype='uint8') + binary[ind] = 1 + try: + _, contours, _ = cv2.findContours( + binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + except BaseException: + contours, _ = cv2.findContours( + binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + + bbox = contours[0] * scale + + bbox = bbox.astype('int32') + bboxes.append(bbox.reshape(-1, 2)) + scores.append(score_i) + + boxes_batch.append({'points': bboxes}) + + return boxes_batch diff --git a/ppocr/postprocess/pg_postprocess.py b/ppocr/postprocess/pg_postprocess.py index 0b1455181fddb0adb5347406bb2eb3093ee6fb30..058cf8b907de296094d3ed2fc7e6981939ced328 100644 --- a/ppocr/postprocess/pg_postprocess.py +++ b/ppocr/postprocess/pg_postprocess.py @@ -30,12 +30,18 @@ class PGPostProcess(object): The post process for PGNet. """ - def __init__(self, character_dict_path, valid_set, score_thresh, mode, + def __init__(self, + character_dict_path, + valid_set, + score_thresh, + mode, + point_gather_mode=None, **kwargs): self.character_dict_path = character_dict_path self.valid_set = valid_set self.score_thresh = score_thresh self.mode = mode + self.point_gather_mode = point_gather_mode # c++ la-nms is faster, but only support python 3.5 self.is_python35 = False @@ -43,8 +49,13 @@ class PGPostProcess(object): self.is_python35 = True def __call__(self, outs_dict, shape_list): - post = PGNet_PostProcess(self.character_dict_path, self.valid_set, - self.score_thresh, outs_dict, shape_list) + post = PGNet_PostProcess( + self.character_dict_path, + self.valid_set, + self.score_thresh, + outs_dict, + shape_list, + point_gather_mode=self.point_gather_mode) if self.mode == 'fast': data = post.pg_postprocess_fast() else: diff --git a/ppocr/postprocess/picodet_postprocess.py b/ppocr/postprocess/picodet_postprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..1a0aeb4387ea4778c1c6bec910262f1c4e136084 --- /dev/null +++ b/ppocr/postprocess/picodet_postprocess.py @@ -0,0 +1,250 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +from scipy.special import softmax + + +def hard_nms(box_scores, iou_threshold, top_k=-1, candidate_size=200): + """ + Args: + box_scores (N, 5): boxes in corner-form and probabilities. + iou_threshold: intersection over union threshold. + top_k: keep top_k results. If k <= 0, keep all the results. + candidate_size: only consider the candidates with the highest scores. + Returns: + picked: a list of indexes of the kept boxes + """ + scores = box_scores[:, -1] + boxes = box_scores[:, :-1] + picked = [] + indexes = np.argsort(scores) + indexes = indexes[-candidate_size:] + while len(indexes) > 0: + current = indexes[-1] + picked.append(current) + if 0 < top_k == len(picked) or len(indexes) == 1: + break + current_box = boxes[current, :] + indexes = indexes[:-1] + rest_boxes = boxes[indexes, :] + iou = iou_of( + rest_boxes, + np.expand_dims( + current_box, axis=0), ) + indexes = indexes[iou <= iou_threshold] + + return box_scores[picked, :] + + +def iou_of(boxes0, boxes1, eps=1e-5): + """Return intersection-over-union (Jaccard index) of boxes. + Args: + boxes0 (N, 4): ground truth boxes. + boxes1 (N or 1, 4): predicted boxes. + eps: a small number to avoid 0 as denominator. + Returns: + iou (N): IoU values. + """ + overlap_left_top = np.maximum(boxes0[..., :2], boxes1[..., :2]) + overlap_right_bottom = np.minimum(boxes0[..., 2:], boxes1[..., 2:]) + + overlap_area = area_of(overlap_left_top, overlap_right_bottom) + area0 = area_of(boxes0[..., :2], boxes0[..., 2:]) + area1 = area_of(boxes1[..., :2], boxes1[..., 2:]) + return overlap_area / (area0 + area1 - overlap_area + eps) + + +def area_of(left_top, right_bottom): + """Compute the areas of rectangles given two corners. + Args: + left_top (N, 2): left top corner. + right_bottom (N, 2): right bottom corner. + Returns: + area (N): return the area. 
+ """ + hw = np.clip(right_bottom - left_top, 0.0, None) + return hw[..., 0] * hw[..., 1] + + +class PicoDetPostProcess(object): + """ + Args: + input_shape (int): network input image size + ori_shape (int): ori image shape of before padding + scale_factor (float): scale factor of ori image + enable_mkldnn (bool): whether to open MKLDNN + """ + + def __init__(self, + layout_dict_path, + strides=[8, 16, 32, 64], + score_threshold=0.4, + nms_threshold=0.5, + nms_top_k=1000, + keep_top_k=100): + self.labels = self.load_layout_dict(layout_dict_path) + self.strides = strides + self.score_threshold = score_threshold + self.nms_threshold = nms_threshold + self.nms_top_k = nms_top_k + self.keep_top_k = keep_top_k + + def load_layout_dict(self, layout_dict_path): + with open(layout_dict_path, 'r', encoding='utf-8') as fp: + labels = fp.readlines() + return [label.strip('\n') for label in labels] + + def warp_boxes(self, boxes, ori_shape): + """Apply transform to boxes + """ + width, height = ori_shape[1], ori_shape[0] + n = len(boxes) + if n: + # warp points + xy = np.ones((n * 4, 3)) + xy[:, :2] = boxes[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape( + n * 4, 2) # x1y1, x2y2, x1y2, x2y1 + # xy = xy @ M.T # transform + xy = (xy[:, :2] / xy[:, 2:3]).reshape(n, 8) # rescale + # create new boxes + x = xy[:, [0, 2, 4, 6]] + y = xy[:, [1, 3, 5, 7]] + xy = np.concatenate( + (x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, n).T + # clip boxes + xy[:, [0, 2]] = xy[:, [0, 2]].clip(0, width) + xy[:, [1, 3]] = xy[:, [1, 3]].clip(0, height) + return xy.astype(np.float32) + else: + return boxes + + def img_info(self, ori_img, img): + origin_shape = ori_img.shape + resize_shape = img.shape + im_scale_y = resize_shape[2] / float(origin_shape[0]) + im_scale_x = resize_shape[3] / float(origin_shape[1]) + scale_factor = np.array([im_scale_y, im_scale_x], dtype=np.float32) + img_shape = np.array(img.shape[2:], dtype=np.float32) + + input_shape = np.array(img).astype('float32').shape[2:] + ori_shape = np.array((img_shape, )).astype('float32') + scale_factor = np.array((scale_factor, )).astype('float32') + return ori_shape, input_shape, scale_factor + + def __call__(self, ori_img, img, preds): + scores, raw_boxes = preds['boxes'], preds['boxes_num'] + batch_size = raw_boxes[0].shape[0] + reg_max = int(raw_boxes[0].shape[-1] / 4 - 1) + out_boxes_num = [] + out_boxes_list = [] + results = [] + ori_shape, input_shape, scale_factor = self.img_info(ori_img, img) + + for batch_id in range(batch_size): + # generate centers + decode_boxes = [] + select_scores = [] + for stride, box_distribute, score in zip(self.strides, raw_boxes, + scores): + box_distribute = box_distribute[batch_id] + score = score[batch_id] + # centers + fm_h = input_shape[0] / stride + fm_w = input_shape[1] / stride + h_range = np.arange(fm_h) + w_range = np.arange(fm_w) + ww, hh = np.meshgrid(w_range, h_range) + ct_row = (hh.flatten() + 0.5) * stride + ct_col = (ww.flatten() + 0.5) * stride + center = np.stack((ct_col, ct_row, ct_col, ct_row), axis=1) + + # box distribution to distance + reg_range = np.arange(reg_max + 1) + box_distance = box_distribute.reshape((-1, reg_max + 1)) + box_distance = softmax(box_distance, axis=1) + box_distance = box_distance * np.expand_dims(reg_range, axis=0) + box_distance = np.sum(box_distance, axis=1).reshape((-1, 4)) + box_distance = box_distance * stride + + # top K candidate + topk_idx = np.argsort(score.max(axis=1))[::-1] + topk_idx = topk_idx[:self.nms_top_k] + center = center[topk_idx] + score = score[topk_idx] + 
box_distance = box_distance[topk_idx] + + # decode box + decode_box = center + [-1, -1, 1, 1] * box_distance + + select_scores.append(score) + decode_boxes.append(decode_box) + + # nms + bboxes = np.concatenate(decode_boxes, axis=0) + confidences = np.concatenate(select_scores, axis=0) + picked_box_probs = [] + picked_labels = [] + for class_index in range(0, confidences.shape[1]): + probs = confidences[:, class_index] + mask = probs > self.score_threshold + probs = probs[mask] + if probs.shape[0] == 0: + continue + subset_boxes = bboxes[mask, :] + box_probs = np.concatenate( + [subset_boxes, probs.reshape(-1, 1)], axis=1) + box_probs = hard_nms( + box_probs, + iou_threshold=self.nms_threshold, + top_k=self.keep_top_k, ) + picked_box_probs.append(box_probs) + picked_labels.extend([class_index] * box_probs.shape[0]) + + if len(picked_box_probs) == 0: + out_boxes_list.append(np.empty((0, 4))) + out_boxes_num.append(0) + + else: + picked_box_probs = np.concatenate(picked_box_probs) + + # resize output boxes + picked_box_probs[:, :4] = self.warp_boxes( + picked_box_probs[:, :4], ori_shape[batch_id]) + im_scale = np.concatenate([ + scale_factor[batch_id][::-1], scale_factor[batch_id][::-1] + ]) + picked_box_probs[:, :4] /= im_scale + # clas score box + out_boxes_list.append( + np.concatenate( + [ + np.expand_dims( + np.array(picked_labels), + axis=-1), np.expand_dims( + picked_box_probs[:, 4], axis=-1), + picked_box_probs[:, :4] + ], + axis=1)) + out_boxes_num.append(len(picked_labels)) + + out_boxes_list = np.concatenate(out_boxes_list, axis=0) + out_boxes_num = np.asarray(out_boxes_num).astype(np.int32) + + for dt in out_boxes_list: + clsid, bbox, score = int(dt[0]), dt[2:], dt[1] + label = self.labels[clsid] + result = {'bbox': bbox, 'label': label} + results.append(result) + return results diff --git a/ppocr/postprocess/rec_postprocess.py b/ppocr/postprocess/rec_postprocess.py index 7b994f810d6747a91aceec82641f433d816b3feb..749060a053f1442f4bf5df6c5f4b56205e893be8 100644 --- a/ppocr/postprocess/rec_postprocess.py +++ b/ppocr/postprocess/rec_postprocess.py @@ -24,7 +24,7 @@ class BaseRecLabelDecode(object): def __init__(self, character_dict_path=None, use_space_char=False): self.beg_str = "sos" self.end_str = "eos" - + self.reverse = False self.character_str = [] if character_dict_path is None: self.character_str = "0123456789abcdefghijklmnopqrstuvwxyz" @@ -38,6 +38,8 @@ class BaseRecLabelDecode(object): if use_space_char: self.character_str.append(" ") dict_character = list(self.character_str) + if 'arabic' in character_dict_path: + self.reverse = True dict_character = self.add_special_char(dict_character) self.dict = {} @@ -45,6 +47,22 @@ class BaseRecLabelDecode(object): self.dict[char] = i self.character = dict_character + def pred_reverse(self, pred): + pred_re = [] + c_current = '' + for c in pred: + if not bool(re.search('[a-zA-Z0-9 :*./%+-]', c)): + if c_current != '': + pred_re.append(c_current) + pred_re.append(c) + c_current = '' + else: + c_current += c + if c_current != '': + pred_re.append(c_current) + + return ''.join(pred_re[::-1]) + def add_special_char(self, dict_character): return dict_character @@ -73,6 +91,10 @@ class BaseRecLabelDecode(object): conf_list = [0] text = ''.join(char_list) + + if self.reverse: # for arabic rec + text = self.pred_reverse(text) + result_list.append((text, np.mean(conf_list).tolist())) return result_list @@ -780,7 +802,7 @@ class VLLabelDecode(BaseRecLabelDecode): ) + length[i])].topk(1)[0][:, 0] preds_prob = paddle.exp( 
paddle.log(preds_prob).sum() / (preds_prob.shape[0] + 1e-6)) - text.append((preds_text, preds_prob)) + text.append((preds_text, preds_prob.numpy()[0])) if label is None: return text label = self.decode(label) diff --git a/ppocr/postprocess/table_postprocess.py b/ppocr/postprocess/table_postprocess.py index 4396ec4f701478e7bdcdd8c7752738c5c8ef148d..a47061f935e31b24fdb624df170f8abb38e01f40 100644 --- a/ppocr/postprocess/table_postprocess.py +++ b/ppocr/postprocess/table_postprocess.py @@ -21,9 +21,29 @@ from .rec_postprocess import AttnLabelDecode class TableLabelDecode(AttnLabelDecode): """ """ - def __init__(self, character_dict_path, **kwargs): - super(TableLabelDecode, self).__init__(character_dict_path) - self.td_token = ['', '', ''] + def __init__(self, + character_dict_path, + merge_no_span_structure=False, + **kwargs): + dict_character = [] + with open(character_dict_path, "rb") as fin: + lines = fin.readlines() + for line in lines: + line = line.decode('utf-8').strip("\n").strip("\r\n") + dict_character.append(line) + + if merge_no_span_structure: + if "" not in dict_character: + dict_character.append("") + if "" in dict_character: + dict_character.remove("") + + dict_character = self.add_special_char(dict_character) + self.dict = {} + for i, char in enumerate(dict_character): + self.dict[char] = i + self.character = dict_character + self.td_token = ['', ''] def __call__(self, preds, batch=None): structure_probs = preds['structure_probs'] @@ -114,18 +134,21 @@ class TableLabelDecode(AttnLabelDecode): def _bbox_decode(self, bbox, shape): h, w, ratio_h, ratio_w, pad_h, pad_w = shape - src_h = h / ratio_h - src_w = w / ratio_w - bbox[0::2] *= src_w - bbox[1::2] *= src_h + bbox[0::2] *= w + bbox[1::2] *= h return bbox class TableMasterLabelDecode(TableLabelDecode): """ """ - def __init__(self, character_dict_path, box_shape='ori', **kwargs): - super(TableMasterLabelDecode, self).__init__(character_dict_path) + def __init__(self, + character_dict_path, + box_shape='ori', + merge_no_span_structure=True, + **kwargs): + super(TableMasterLabelDecode, self).__init__(character_dict_path, + merge_no_span_structure) self.box_shape = box_shape assert box_shape in [ 'ori', 'pad' @@ -157,4 +180,7 @@ class TableMasterLabelDecode(TableLabelDecode): bbox[1::2] *= h bbox[0::2] /= ratio_w bbox[1::2] /= ratio_h + x, y, w, h = bbox + x1, y1, x2, y2 = x - w // 2, y - h // 2, x + w // 2, y + h // 2 + bbox = np.array([x1, y1, x2, y2]) return bbox diff --git a/ppocr/utils/dict/arabic_dict.txt b/ppocr/utils/dict/arabic_dict.txt index e97abf39274df77fbad066ee4635aebc6743140c..916d421c53bad563dfd980c1b64dcce07a3c9d24 100644 --- a/ppocr/utils/dict/arabic_dict.txt +++ b/ppocr/utils/dict/arabic_dict.txt @@ -1,4 +1,3 @@ - ! 
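For reference, the Arabic handling added to `BaseRecLabelDecode` above groups consecutive Latin/digit characters into runs and reverses only the order of the segments, so right-to-left text is emitted in reading order while embedded numbers and Latin words keep their internal order. A minimal standalone sketch of that behaviour (re-implemented here purely for illustration, not part of the patch):

```python
import re

def pred_reverse(pred):
    # Mirrors BaseRecLabelDecode.pred_reverse above: split the prediction into
    # Latin/digit runs and single non-Latin characters, then reverse the segment order.
    pred_re, c_current = [], ''
    for c in pred:
        if not re.search('[a-zA-Z0-9 :*./%+-]', c):
            if c_current != '':
                pred_re.append(c_current)
            pred_re.append(c)
            c_current = ''
        else:
            c_current += c
    if c_current != '':
        pred_re.append(c_current)
    return ''.join(pred_re[::-1])

# The two Arabic letters swap ends while "123" keeps its digit order.
print(pred_reverse('س 123 م'))
```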
# $ diff --git a/ppocr/utils/dict/kie_dict/xfund_class_list.txt b/ppocr/utils/dict/kie_dict/xfund_class_list.txt new file mode 100644 index 0000000000000000000000000000000000000000..faded9f9b8f56bd258909bec9b8f1755aa688367 --- /dev/null +++ b/ppocr/utils/dict/kie_dict/xfund_class_list.txt @@ -0,0 +1,4 @@ +OTHER +QUESTION +ANSWER +HEADER diff --git a/ppocr/utils/dict/layout_dict/layout_cdla_dict.txt b/ppocr/utils/dict/layout_dict/layout_cdla_dict.txt new file mode 100644 index 0000000000000000000000000000000000000000..8be0f48600a88463d840fffe27eebd63629576ce --- /dev/null +++ b/ppocr/utils/dict/layout_dict/layout_cdla_dict.txt @@ -0,0 +1,10 @@ +text +title +figure +figure_caption +table +table_caption +header +footer +reference +equation \ No newline at end of file diff --git a/ppocr/utils/dict/layout_dict/layout_publaynet_dict.txt b/ppocr/utils/dict/layout_dict/layout_publaynet_dict.txt new file mode 100644 index 0000000000000000000000000000000000000000..ca6acf4eef8d4d5f9ba5a4ced4858a119a4ef983 --- /dev/null +++ b/ppocr/utils/dict/layout_dict/layout_publaynet_dict.txt @@ -0,0 +1,5 @@ +text +title +list +table +figure \ No newline at end of file diff --git a/ppocr/utils/dict/layout_dict/layout_table_dict.txt b/ppocr/utils/dict/layout_dict/layout_table_dict.txt new file mode 100644 index 0000000000000000000000000000000000000000..faea15ea07d7d1a6f77dbd4287bb9fa87165cbb9 --- /dev/null +++ b/ppocr/utils/dict/layout_dict/layout_table_dict.txt @@ -0,0 +1 @@ +table \ No newline at end of file diff --git a/ppocr/utils/dict/table_structure_dict_ch.txt b/ppocr/utils/dict/table_structure_dict_ch.txt new file mode 100644 index 0000000000000000000000000000000000000000..0c59c0e9998a31f9d32f703625aa1c5ca7718c8d --- /dev/null +++ b/ppocr/utils/dict/table_structure_dict_ch.txt @@ -0,0 +1,48 @@ + + + + + + + + + + colspan="2" + colspan="3" + colspan="4" + colspan="5" + colspan="6" + colspan="7" + colspan="8" + colspan="9" + colspan="10" + colspan="11" + colspan="12" + colspan="13" + colspan="14" + colspan="15" + colspan="16" + colspan="17" + colspan="18" + colspan="19" + colspan="20" + rowspan="2" + rowspan="3" + rowspan="4" + rowspan="5" + rowspan="6" + rowspan="7" + rowspan="8" + rowspan="9" + rowspan="10" + rowspan="11" + rowspan="12" + rowspan="13" + rowspan="14" + rowspan="15" + rowspan="16" + rowspan="17" + rowspan="18" + rowspan="19" + rowspan="20" diff --git a/ppocr/utils/e2e_metric/Deteval.py b/ppocr/utils/e2e_metric/Deteval.py index 45567a7dd2d82b6c583abd4a4eabef52974be081..6ce56eda2aa9f38fdc712d49ae64945c558b418d 100755 --- a/ppocr/utils/e2e_metric/Deteval.py +++ b/ppocr/utils/e2e_metric/Deteval.py @@ -12,8 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +import json import numpy as np import scipy.io as io +import Polygon as plg from ppocr.utils.e2e_metric.polygon_fast import iod, area_of_intersection, area @@ -269,7 +271,124 @@ def get_socre_B(gt_dir, img_id, pred_dict): return single_data -def combine_results(all_data): +def get_score_C(gt_label, text, pred_bboxes): + """ + get score for CentripetalText (CT) prediction. 
+ """ + + def gt_reading_mod(gt_label, text): + """This helper reads groundtruths from mat files""" + groundtruths = [] + nbox = len(gt_label) + for i in range(nbox): + label = {"transcription": text[i][0], "points": gt_label[i].numpy()} + groundtruths.append(label) + + return groundtruths + + def get_union(pD, pG): + areaA = pD.area() + areaB = pG.area() + return areaA + areaB - get_intersection(pD, pG) + + def get_intersection(pD, pG): + pInt = pD & pG + if len(pInt) == 0: + return 0 + return pInt.area() + + def detection_filtering(detections, groundtruths, threshold=0.5): + for gt in groundtruths: + point_num = gt['points'].shape[1] // 2 + if gt['transcription'] == '###' and (point_num > 1): + gt_p = np.array(gt['points']).reshape(point_num, + 2).astype('int32') + gt_p = plg.Polygon(gt_p) + + for det_id, detection in enumerate(detections): + det_y = detection[0::2] + det_x = detection[1::2] + + det_p = np.concatenate((np.array(det_x), np.array(det_y))) + det_p = det_p.reshape(2, -1).transpose() + det_p = plg.Polygon(det_p) + + try: + det_gt_iou = get_intersection(det_p, + gt_p) / det_p.area() + except: + print(det_x, det_y, gt_p) + if det_gt_iou > threshold: + detections[det_id] = [] + + detections[:] = [item for item in detections if item != []] + return detections + + def sigma_calculation(det_p, gt_p): + """ + sigma = inter_area / gt_area + """ + if gt_p.area() == 0.: + return 0 + return get_intersection(det_p, gt_p) / gt_p.area() + + def tau_calculation(det_p, gt_p): + """ + tau = inter_area / det_area + """ + if det_p.area() == 0.: + return 0 + return get_intersection(det_p, gt_p) / det_p.area() + + detections = [] + + for item in pred_bboxes: + detections.append(item[:, ::-1].reshape(-1)) + + groundtruths = gt_reading_mod(gt_label, text) + + detections = detection_filtering( + detections, groundtruths) # filters detections overlapping with DC area + + for idx in range(len(groundtruths) - 1, -1, -1): + #NOTE: source code use 'orin' to indicate '#', here we use 'anno', + # which may cause slight drop in fscore, about 0.12 + if groundtruths[idx]['transcription'] == '###': + groundtruths.pop(idx) + + local_sigma_table = np.zeros((len(groundtruths), len(detections))) + local_tau_table = np.zeros((len(groundtruths), len(detections))) + + for gt_id, gt in enumerate(groundtruths): + if len(detections) > 0: + for det_id, detection in enumerate(detections): + point_num = gt['points'].shape[1] // 2 + + gt_p = np.array(gt['points']).reshape(point_num, + 2).astype('int32') + gt_p = plg.Polygon(gt_p) + + det_y = detection[0::2] + det_x = detection[1::2] + + det_p = np.concatenate((np.array(det_x), np.array(det_y))) + + det_p = det_p.reshape(2, -1).transpose() + det_p = plg.Polygon(det_p) + + local_sigma_table[gt_id, det_id] = sigma_calculation(det_p, + gt_p) + local_tau_table[gt_id, det_id] = tau_calculation(det_p, gt_p) + + data = {} + data['sigma'] = local_sigma_table + data['global_tau'] = local_tau_table + data['global_pred_str'] = '' + data['global_gt_str'] = '' + return data + + +def combine_results(all_data, rec_flag=True): tr = 0.7 tp = 0.6 fsc_k = 0.8 @@ -278,6 +397,7 @@ def combine_results(all_data): global_tau = [] global_pred_str = [] global_gt_str = [] + for data in all_data: global_sigma.append(data['sigma']) global_tau.append(data['global_tau']) @@ -294,7 +414,7 @@ def combine_results(all_data): def one_to_one(local_sigma_table, local_tau_table, local_accumulative_recall, local_accumulative_precision, global_accumulative_recall, global_accumulative_precision, - gt_flag, det_flag, 
idy): + gt_flag, det_flag, idy, rec_flag): hit_str_num = 0 for gt_id in range(num_gt): gt_matching_qualified_sigma_candidates = np.where( @@ -328,14 +448,15 @@ def combine_results(all_data): gt_flag[0, gt_id] = 1 matched_det_id = np.where(local_sigma_table[gt_id, :] > tr) # recg start - gt_str_cur = global_gt_str[idy][gt_id] - pred_str_cur = global_pred_str[idy][matched_det_id[0].tolist()[ - 0]] - if pred_str_cur == gt_str_cur: - hit_str_num += 1 - else: - if pred_str_cur.lower() == gt_str_cur.lower(): + if rec_flag: + gt_str_cur = global_gt_str[idy][gt_id] + pred_str_cur = global_pred_str[idy][matched_det_id[0] + .tolist()[0]] + if pred_str_cur == gt_str_cur: hit_str_num += 1 + else: + if pred_str_cur.lower() == gt_str_cur.lower(): + hit_str_num += 1 # recg end det_flag[0, matched_det_id] = 1 return local_accumulative_recall, local_accumulative_precision, global_accumulative_recall, global_accumulative_precision, gt_flag, det_flag, hit_str_num @@ -343,7 +464,7 @@ def combine_results(all_data): def one_to_many(local_sigma_table, local_tau_table, local_accumulative_recall, local_accumulative_precision, global_accumulative_recall, global_accumulative_precision, - gt_flag, det_flag, idy): + gt_flag, det_flag, idy, rec_flag): hit_str_num = 0 for gt_id in range(num_gt): # skip the following if the groundtruth was matched @@ -374,28 +495,30 @@ def combine_results(all_data): gt_flag[0, gt_id] = 1 det_flag[0, qualified_tau_candidates] = 1 # recg start - gt_str_cur = global_gt_str[idy][gt_id] - pred_str_cur = global_pred_str[idy][ - qualified_tau_candidates[0].tolist()[0]] - if pred_str_cur == gt_str_cur: - hit_str_num += 1 - else: - if pred_str_cur.lower() == gt_str_cur.lower(): + if rec_flag: + gt_str_cur = global_gt_str[idy][gt_id] + pred_str_cur = global_pred_str[idy][ + qualified_tau_candidates[0].tolist()[0]] + if pred_str_cur == gt_str_cur: hit_str_num += 1 + else: + if pred_str_cur.lower() == gt_str_cur.lower(): + hit_str_num += 1 # recg end elif (np.sum(local_sigma_table[gt_id, qualified_tau_candidates]) >= tr): gt_flag[0, gt_id] = 1 det_flag[0, qualified_tau_candidates] = 1 # recg start - gt_str_cur = global_gt_str[idy][gt_id] - pred_str_cur = global_pred_str[idy][ - qualified_tau_candidates[0].tolist()[0]] - if pred_str_cur == gt_str_cur: - hit_str_num += 1 - else: - if pred_str_cur.lower() == gt_str_cur.lower(): + if rec_flag: + gt_str_cur = global_gt_str[idy][gt_id] + pred_str_cur = global_pred_str[idy][ + qualified_tau_candidates[0].tolist()[0]] + if pred_str_cur == gt_str_cur: hit_str_num += 1 + else: + if pred_str_cur.lower() == gt_str_cur.lower(): + hit_str_num += 1 # recg end global_accumulative_recall = global_accumulative_recall + fsc_k @@ -409,7 +532,7 @@ def combine_results(all_data): def many_to_one(local_sigma_table, local_tau_table, local_accumulative_recall, local_accumulative_precision, global_accumulative_recall, global_accumulative_precision, - gt_flag, det_flag, idy): + gt_flag, det_flag, idy, rec_flag): hit_str_num = 0 for det_id in range(num_det): # skip the following if the detection was matched @@ -440,6 +563,30 @@ def combine_results(all_data): gt_flag[0, qualified_sigma_candidates] = 1 det_flag[0, det_id] = 1 # recg start + if rec_flag: + pred_str_cur = global_pred_str[idy][det_id] + gt_len = len(qualified_sigma_candidates[0]) + for idx in range(gt_len): + ele_gt_id = qualified_sigma_candidates[ + 0].tolist()[idx] + if ele_gt_id not in global_gt_str[idy]: + continue + gt_str_cur = global_gt_str[idy][ele_gt_id] + if pred_str_cur == gt_str_cur: + hit_str_num += 1 + 
break + else: + if pred_str_cur.lower() == gt_str_cur.lower( + ): + hit_str_num += 1 + break + # recg end + elif (np.sum(local_tau_table[qualified_sigma_candidates, + det_id]) >= tp): + det_flag[0, det_id] = 1 + gt_flag[0, qualified_sigma_candidates] = 1 + # recg start + if rec_flag: pred_str_cur = global_pred_str[idy][det_id] gt_len = len(qualified_sigma_candidates[0]) for idx in range(gt_len): @@ -454,27 +601,7 @@ def combine_results(all_data): else: if pred_str_cur.lower() == gt_str_cur.lower(): hit_str_num += 1 - break - # recg end - elif (np.sum(local_tau_table[qualified_sigma_candidates, - det_id]) >= tp): - det_flag[0, det_id] = 1 - gt_flag[0, qualified_sigma_candidates] = 1 - # recg start - pred_str_cur = global_pred_str[idy][det_id] - gt_len = len(qualified_sigma_candidates[0]) - for idx in range(gt_len): - ele_gt_id = qualified_sigma_candidates[0].tolist()[idx] - if ele_gt_id not in global_gt_str[idy]: - continue - gt_str_cur = global_gt_str[idy][ele_gt_id] - if pred_str_cur == gt_str_cur: - hit_str_num += 1 - break - else: - if pred_str_cur.lower() == gt_str_cur.lower(): - hit_str_num += 1 - break + break # recg end global_accumulative_recall = global_accumulative_recall + num_qualified_sigma_candidates * fsc_k @@ -504,7 +631,7 @@ def combine_results(all_data): gt_flag, det_flag, hit_str_num = one_to_one(local_sigma_table, local_tau_table, local_accumulative_recall, local_accumulative_precision, global_accumulative_recall, global_accumulative_precision, - gt_flag, det_flag, idx) + gt_flag, det_flag, idx, rec_flag) hit_str_count += hit_str_num #######then check for one-to-many case########## @@ -512,14 +639,14 @@ def combine_results(all_data): gt_flag, det_flag, hit_str_num = one_to_many(local_sigma_table, local_tau_table, local_accumulative_recall, local_accumulative_precision, global_accumulative_recall, global_accumulative_precision, - gt_flag, det_flag, idx) + gt_flag, det_flag, idx, rec_flag) hit_str_count += hit_str_num #######then check for many-to-one case########## local_accumulative_recall, local_accumulative_precision, global_accumulative_recall, global_accumulative_precision, \ gt_flag, det_flag, hit_str_num = many_to_one(local_sigma_table, local_tau_table, local_accumulative_recall, local_accumulative_precision, global_accumulative_recall, global_accumulative_precision, - gt_flag, det_flag, idx) + gt_flag, det_flag, idx, rec_flag) hit_str_count += hit_str_num try: diff --git a/ppocr/utils/e2e_utils/extract_textpoint_fast.py b/ppocr/utils/e2e_utils/extract_textpoint_fast.py index 787cd3017fafa6fc554bead0cc05b5bfe682df42..a85b8e78ead00e64630b57400b9e5141eb0181a8 100644 --- a/ppocr/utils/e2e_utils/extract_textpoint_fast.py +++ b/ppocr/utils/e2e_utils/extract_textpoint_fast.py @@ -88,8 +88,35 @@ def ctc_greedy_decoder(probs_seq, blank=95, keep_blank_in_idxs=True): return dst_str, keep_idx_list -def instance_ctc_greedy_decoder(gather_info, logits_map, pts_num=4): +def instance_ctc_greedy_decoder(gather_info, + logits_map, + pts_num=4, + point_gather_mode=None): _, _, C = logits_map.shape + if point_gather_mode == 'align': + insert_num = 0 + gather_info = np.array(gather_info) + length = len(gather_info) - 1 + for index in range(length): + stride_y = np.abs(gather_info[index + insert_num][0] - gather_info[ + index + 1 + insert_num][0]) + stride_x = np.abs(gather_info[index + insert_num][1] - gather_info[ + index + 1 + insert_num][1]) + max_points = int(max(stride_x, stride_y)) + stride = (gather_info[index + insert_num] - + gather_info[index + 1 + insert_num]) / 
(max_points) + insert_num_temp = max_points - 1 + + for i in range(int(insert_num_temp)): + insert_value = gather_info[index + insert_num] - (i + 1 + ) * stride + insert_index = index + i + 1 + insert_num + gather_info = np.insert( + gather_info, insert_index, insert_value, axis=0) + insert_num += insert_num_temp + gather_info = gather_info.tolist() + else: + pass ys, xs = zip(*gather_info) logits_seq = logits_map[list(ys), list(xs)] probs_seq = logits_seq @@ -104,7 +131,8 @@ def instance_ctc_greedy_decoder(gather_info, logits_map, pts_num=4): def ctc_decoder_for_image(gather_info_list, logits_map, Lexicon_Table, - pts_num=6): + pts_num=6, + point_gather_mode=None): """ CTC decoder using multiple processes. """ @@ -114,7 +142,10 @@ def ctc_decoder_for_image(gather_info_list, if len(gather_info) < pts_num: continue dst_str, xys_list = instance_ctc_greedy_decoder( - gather_info, logits_map, pts_num=pts_num) + gather_info, + logits_map, + pts_num=pts_num, + point_gather_mode=point_gather_mode) dst_str_readable = ''.join([Lexicon_Table[idx] for idx in dst_str]) if len(dst_str_readable) < 2: continue @@ -356,7 +387,8 @@ def generate_pivot_list_fast(p_score, p_char_maps, f_direction, Lexicon_Table, - score_thresh=0.5): + score_thresh=0.5, + point_gather_mode=None): """ return center point and end point of TCL instance; filter with the char maps; """ @@ -384,7 +416,10 @@ def generate_pivot_list_fast(p_score, p_char_maps = p_char_maps.transpose([1, 2, 0]) decoded_str, keep_yxs_list = ctc_decoder_for_image( - all_pos_yxs, logits_map=p_char_maps, Lexicon_Table=Lexicon_Table) + all_pos_yxs, + logits_map=p_char_maps, + Lexicon_Table=Lexicon_Table, + point_gather_mode=point_gather_mode) return keep_yxs_list, decoded_str diff --git a/ppocr/utils/e2e_utils/pgnet_pp_utils.py b/ppocr/utils/e2e_utils/pgnet_pp_utils.py index a15503c0a88f735cc5f5eef924b0d022e5684eed..06a766b0e714e2792c0b0d3069963de998eb9eb7 100644 --- a/ppocr/utils/e2e_utils/pgnet_pp_utils.py +++ b/ppocr/utils/e2e_utils/pgnet_pp_utils.py @@ -28,13 +28,19 @@ from extract_textpoint_fast import generate_pivot_list_fast, restore_poly class PGNet_PostProcess(object): # two different post-process - def __init__(self, character_dict_path, valid_set, score_thresh, outs_dict, - shape_list): + def __init__(self, + character_dict_path, + valid_set, + score_thresh, + outs_dict, + shape_list, + point_gather_mode=None): self.Lexicon_Table = get_dict(character_dict_path) self.valid_set = valid_set self.score_thresh = score_thresh self.outs_dict = outs_dict self.shape_list = shape_list + self.point_gather_mode = point_gather_mode def pg_postprocess_fast(self): p_score = self.outs_dict['f_score'] @@ -58,7 +64,8 @@ class PGNet_PostProcess(object): p_char, p_direction, self.Lexicon_Table, - score_thresh=self.score_thresh) + score_thresh=self.score_thresh, + point_gather_mode=self.point_gather_mode) poly_list, keep_str_list = restore_poly(instance_yxs_list, seq_strs, p_border, ratio_w, ratio_h, src_w, src_h, self.valid_set) diff --git a/ppocr/utils/network.py b/ppocr/utils/network.py index 118d1be364925d9416134cffe21d636fcac753e9..080a5d160116cfdd3b255a883525281d97ee9cc9 100644 --- a/ppocr/utils/network.py +++ b/ppocr/utils/network.py @@ -41,9 +41,7 @@ def download_with_progressbar(url, save_path): def maybe_download(model_storage_directory, url): # using custom model - tar_file_name_list = [ - 'inference.pdiparams', 'inference.pdiparams.info', 'inference.pdmodel' - ] + tar_file_name_list = ['.pdiparams', '.pdiparams.info', '.pdmodel'] if not os.path.exists( 
os.path.join(model_storage_directory, 'inference.pdiparams') ) or not os.path.exists( @@ -57,8 +55,8 @@ def maybe_download(model_storage_directory, url): for member in tarObj.getmembers(): filename = None for tar_file_name in tar_file_name_list: - if tar_file_name in member.name: - filename = tar_file_name + if member.name.endswith(tar_file_name): + filename = 'inference' + tar_file_name if filename is None: continue file = tarObj.extractfile(member) diff --git a/ppocr/utils/save_load.py b/ppocr/utils/save_load.py index e77a6ce0183611569193e1996e935f4bd30400a0..aa65f290c0a5f4f13b3103fb4404815e2ae74a88 100644 --- a/ppocr/utils/save_load.py +++ b/ppocr/utils/save_load.py @@ -54,13 +54,15 @@ def load_model(config, model, optimizer=None, model_type='det'): pretrained_model = global_config.get('pretrained_model') best_model_dict = {} is_float16 = False + is_nlp_model = model_type == 'kie' and config["Architecture"][ + "algorithm"] not in ["SDMGR"] - if model_type == 'vqa': - # NOTE: for vqa model, resume training is not supported now + if is_nlp_model is True: + # NOTE: for kie model dsitillation, resume training is not supported now if config["Architecture"]["algorithm"] in ["Distillation"]: return best_model_dict checkpoints = config['Architecture']['Backbone']['checkpoints'] - # load vqa method metric + # load kie method metric if checkpoints: if os.path.exists(os.path.join(checkpoints, 'metric.states')): with open(os.path.join(checkpoints, 'metric.states'), @@ -102,8 +104,9 @@ def load_model(config, model, optimizer=None, model_type='det'): continue pre_value = params[key] if pre_value.dtype == paddle.float16: - pre_value = pre_value.astype(paddle.float32) is_float16 = True + if pre_value.dtype != value.dtype: + pre_value = pre_value.astype(value.dtype) if list(value.shape) == list(pre_value.shape): new_state_dict[key] = pre_value else: @@ -148,16 +151,21 @@ def load_pretrained_params(model, path): "The {}.pdparams does not exists!".format(path) params = paddle.load(path + '.pdparams') + state_dict = model.state_dict() + new_state_dict = {} is_float16 = False + for k1 in params.keys(): + if k1 not in state_dict.keys(): logger.warning("The pretrained params {} not in model".format(k1)) else: if params[k1].dtype == paddle.float16: - params[k1] = params[k1].astype(paddle.float32) is_float16 = True + if params[k1].dtype != state_dict[k1].dtype: + params[k1] = params[k1].astype(state_dict[k1].dtype) if list(state_dict[k1].shape) == list(params[k1].shape): new_state_dict[k1] = params[k1] else: @@ -187,12 +195,14 @@ def save_model(model, """ _mkdir_if_not_exist(model_path, logger) model_prefix = os.path.join(model_path, prefix) - if config['Architecture']["model_type"] != 'vqa': - paddle.save(optimizer.state_dict(), model_prefix + '.pdopt') - if config['Architecture']["model_type"] != 'vqa': + paddle.save(optimizer.state_dict(), model_prefix + '.pdopt') + + is_nlp_model = config['Architecture']["model_type"] == 'kie' and config[ + "Architecture"]["algorithm"] not in ["SDMGR"] + if is_nlp_model is not True: paddle.save(model.state_dict(), model_prefix + '.pdparams') metric_prefix = model_prefix - else: # for vqa system, we follow the save/load rules in NLP + else: # for kie system, we follow the save/load rules in NLP if config['Global']['distributed']: arch = model._layers else: diff --git a/ppocr/utils/utility.py b/ppocr/utils/utility.py index b881fcab20bc5ca076a0002bd72349768c7d881a..18357c8e97bcea8ee321856a87146a4a7b901469 100755 --- a/ppocr/utils/utility.py +++ b/ppocr/utils/utility.py @@ 
-50,7 +50,7 @@ def get_check_global_params(mode): def _check_image_file(path): - img_end = {'jpg', 'bmp', 'png', 'jpeg', 'rgb', 'tif', 'tiff', 'gif'} + img_end = {'jpg', 'bmp', 'png', 'jpeg', 'rgb', 'tif', 'tiff', 'gif', 'pdf'} return any([path.lower().endswith(e) for e in img_end]) @@ -59,7 +59,7 @@ def get_image_file_list(img_file): if img_file is None or not os.path.exists(img_file): raise Exception("not found any img file in {}".format(img_file)) - img_end = {'jpg', 'bmp', 'png', 'jpeg', 'rgb', 'tif', 'tiff', 'gif'} + img_end = {'jpg', 'bmp', 'png', 'jpeg', 'rgb', 'tif', 'tiff', 'gif', 'pdf'} if os.path.isfile(img_file) and _check_image_file(img_file): imgs_lists.append(img_file) elif os.path.isdir(img_file): @@ -73,7 +73,7 @@ def get_image_file_list(img_file): return imgs_lists -def check_and_read_gif(img_path): +def check_and_read(img_path): if os.path.basename(img_path)[-3:] in ['gif', 'GIF']: gif = cv2.VideoCapture(img_path) ret, frame = gif.read() @@ -84,8 +84,26 @@ def check_and_read_gif(img_path): if len(frame.shape) == 2 or frame.shape[-1] == 1: frame = cv2.cvtColor(frame, cv2.COLOR_GRAY2RGB) imgvalue = frame[:, :, ::-1] - return imgvalue, True - return None, False + return imgvalue, True, False + elif os.path.basename(img_path)[-3:] in ['pdf']: + import fitz + from PIL import Image + imgs = [] + with fitz.open(img_path) as pdf: + for pg in range(0, pdf.pageCount): + page = pdf[pg] + mat = fitz.Matrix(2, 2) + pm = page.getPixmap(matrix=mat, alpha=False) + + # if width or height > 2000 pixels, don't enlarge the image + if pm.width > 2000 or pm.height > 2000: + pm = page.getPixmap(matrix=fitz.Matrix(1, 1), alpha=False) + + img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples) + img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR) + imgs.append(img) + return imgs, False, True + return None, False, False def load_vqa_bio_label_maps(label_map_path): diff --git a/ppocr/utils/visual.py b/ppocr/utils/visual.py index e0fbf06abb471c294cb268520fb99bca1a6b1d61..5bd805ea6e76be37612a142102beab492bece941 100644 --- a/ppocr/utils/visual.py +++ b/ppocr/utils/visual.py @@ -113,14 +113,11 @@ def draw_re_results(image, return np.array(img_new) -def draw_rectangle(img_path, boxes, use_xywh=False): +def draw_rectangle(img_path, boxes): + boxes = np.array(boxes) img = cv2.imread(img_path) img_show = img.copy() for box in boxes.astype(int): - if use_xywh: - x, y, w, h = box - x1, y1, x2, y2 = x - w // 2, y - h // 2, x + w // 2, y + h // 2 - else: - x1, y1, x2, y2 = box + x1, y1, x2, y2 = box cv2.rectangle(img_show, (x1, y1), (x2, y2), (255, 0, 0), 2) return img_show \ No newline at end of file diff --git a/ppstructure/README.md b/ppstructure/README.md index 72670e33575ebe444c78b15fbab4e330389a7498..f3f2d4a931d611003102da4b80bfb1b12d96cbab 100644 --- a/ppstructure/README.md +++ b/ppstructure/README.md @@ -1,120 +1,117 @@ English | [简体中文](README_ch.md) +# PP-Structure + - [1. Introduction](#1-introduction) -- [2. Update log](#2-update-log) -- [3. Features](#3-features) -- [4. Results](#4-results) - - [4.1 Layout analysis and table recognition](#41-layout-analysis-and-table-recognition) - - [4.2 DOC-VQA](#42-doc-vqa) -- [5. Quick start](#5-quick-start) -- [6. PP-Structure System](#6-pp-structure-system) - - [6.1 Layout analysis and table recognition](#61-layout-analysis-and-table-recognition) - - [6.1.1 Layout analysis](#611-layout-analysis) - - [6.1.2 Table recognition](#612-table-recognition) - - [6.2 DOC-VQA](#62-doc-vqa) -- [7. 
Model List](#7-model-list) - - [7.1 Layout analysis model](#71-layout-analysis-model) - - [7.2 OCR and table recognition model](#72-ocr-and-table-recognition-model) - - [7.3 DOC-VQA model](#73-doc-vqa-model) +- [2. Features](#2-features) +- [3. Results](#3-results) + - [3.1 Layout analysis and table recognition](#31-layout-analysis-and-table-recognition) + - [3.2 Layout Recovery](#32-layout-recovery) + - [3.3 KIE](#33-kie) +- [4. Quick start](#4-quick-start) +- [5. Model List](#5-model-list) ## 1. Introduction -PP-Structure is an OCR toolkit that can be used for document analysis and processing with complex structures, designed to help developers better complete document understanding tasks - -## 2. Update log -* 2022.02.12 DOC-VQA add LayoutLMv2 model。 -* 2021.12.07 add [DOC-VQA SER and RE tasks](vqa/README.md)。 - -## 3. Features - -The main features of PP-Structure are as follows: +PP-Structure is an intelligent document analysis system developed by the PaddleOCR team, which aims to help developers better complete tasks related to document understanding such as layout analysis and table recognition. -- Support the layout analysis of documents, divide the documents into 5 types of areas **text, title, table, image and list** (conjunction with Layout-Parser) -- Support to extract the texts from the text, title, picture and list areas (used in conjunction with PP-OCR) -- Support to extract excel files from the table areas -- Support python whl package and command line usage, easy to use -- Support custom training for layout analysis and table structure tasks -- Support Document Visual Question Answering (DOC-VQA) tasks: Semantic Entity Recognition (SER) and Relation Extraction (RE) +The pipeline of PP-Structurev2 system is shown below. The document image first passes through the image direction correction module to identify the direction of the entire image and complete the direction correction. Then, two tasks of layout information analysis and key information extraction can be completed. -## 4. Results - -### 4.1 Layout analysis and table recognition - - +- In the layout analysis task, the image first goes through the layout analysis model to divide the image into different areas such as text, table, and figure, and then analyze these areas separately. For example, the table area is sent to the form recognition module for structured recognition, and the text area is sent to the OCR engine for text recognition. Finally, the layout recovery module restores it to a word or pdf file with the same layout as the original image; +- In the key information extraction task, the OCR engine is first used to extract the text content, and then the SER(semantic entity recognition) module obtains the semantic entities in the image, and finally the RE(relationship extraction) module obtains the correspondence between the semantic entities, thereby extracting the required key information. + -The figure shows the pipeline of layout analysis + table recognition. The image is first divided into four areas of image, text, title and table by layout analysis, and then OCR detection and recognition is performed on the three areas of image, text and title, and the table is performed table recognition, where the image will also be stored for use. +More technical details: 👉 [PP-Structurev2 Technical Report](docs/PP-Structurev2_introduction.md) -### 4.2 DOC-VQA +PP-Structurev2 supports independent use or flexible collocation of each module. For example, you can use layout analysis alone or table recognition alone. 
Click the corresponding link below to get the tutorial for each independent module: -* SER -* -![](docs/vqa/result_ser/zh_val_0_ser.jpg) | ![](docs/vqa/result_ser/zh_val_42_ser.jpg) ----|--- +- [Layout Analysis](layout/README.md) +- [Table Recognition](table/README.md) +- [Key Information Extraction](kie/README.md) +- [Layout Recovery](recovery/README.md) -Different colored boxes in the figure represent different categories. For xfun dataset, there are three categories: query, answer and header: +## 2. Features -* Dark purple: header -* Light purple: query -* Army green: answer +The main features of PP-Structurev2 are as follows: +- Support layout analysis of documents in the form of images/pdfs, which can be divided into areas such as **text, titles, tables, figures, formulas, etc.**; +- Support common Chinese and English **table detection** tasks; +- Support structured table recognition, and output the final result to **Excel file**; +- Support multimodal-based Key Information Extraction (KIE) tasks - **Semantic Entity Recognition** (SER) and **Relation Extraction (RE); +- Support **layout recovery**, that is, restore the document in word or pdf format with the same layout as the original image; +- Support customized training and multiple inference deployment methods such as python whl package quick start; +- Connect with the semi-automatic data labeling tool PPOCRLabel, which supports the labeling of layout analysis, table recognition, and SER. -The corresponding category and OCR recognition results are also marked at the top left of the OCR detection box. +## 3. Results +PP-Structurev2 supports the independent use or flexible collocation of each module. For example, layout analysis can be used alone, or table recognition can be used alone. Only the visualization effects of several representative usage methods are shown here. -* RE +### 3.1 Layout analysis and table recognition -![](docs/vqa/result_re/zh_val_21_re.jpg) | ![](docs/vqa/result_re/zh_val_40_re.jpg) ----|--- +The figure shows the pipeline of layout analysis + table recognition. The image is first divided into four areas of image, text, title and table by layout analysis, and then OCR detection and recognition is performed on the three areas of image, text and title, and the table is performed table recognition, where the image will also be stored for use. + +### 3.2 Layout recovery -In the figure, the red box represents the question, the blue box represents the answer, and the question and answer are connected by green lines. The corresponding category and OCR recognition results are also marked at the top left of the OCR detection box. +The following figure shows the effect of layout recovery based on the results of layout analysis and table recognition in the previous section. + -## 5. Quick start +### 3.3 KIE -Start from [Quick Installation](./docs/quickstart.md) +* SER -## 6. PP-Structure System +Different colored boxes in the figure represent different categories. -### 6.1 Layout analysis and table recognition +
+ +
-![pipeline](docs/table/pipeline.jpg) +
+ +
-In PP-Structure, the image will be divided into 5 types of areas **text, title, image list and table**. For the first 4 types of areas, directly use PP-OCR system to complete the text detection and recognition. For the table area, after the table structuring process, the table in image is converted into an Excel file with the same table style. +
+ +
-#### 6.1.1 Layout analysis +
+ +
-Layout analysis classifies image by region, including the use of Python scripts of layout analysis tools, extraction of designated category detection boxes, performance indicators, and custom training layout analysis models. For details, please refer to [document](layout/README.md). +
+ +
-#### 6.1.2 Table recognition +* RE -Table recognition converts table images into excel documents, which include the detection and recognition of table text and the prediction of table structure and cell coordinates. For detailed instructions, please refer to [document](table/README.md) +In the figure, the red box represents `Question`, the blue box represents `Answer`, and `Question` and `Answer` are connected by green lines. -### 6.2 DOC-VQA +
+ +
-Document Visual Question Answering (DOC-VQA) if a type of Visual Question Answering (VQA), which includes Semantic Entity Recognition (SER) and Relation Extraction (RE) tasks. Based on SER task, text recognition and classification in images can be completed. Based on THE RE task, we can extract the relation of the text content in the image, such as judge the problem pair. For details, please refer to [document](vqa/README.md) +
+ +
-## 7. Model List +
+ +
-PP-Structure Series Model List (Updating) +
+ +
-### 7.1 Layout analysis model +## 4. Quick start -|model name|description|download|label_map| -| --- | --- | --- |--- | -| ppyolov2_r50vd_dcn_365e_publaynet | The layout analysis model trained on the PubLayNet dataset can divide image into 5 types of areas **text, title, table, picture, and list** | [PubLayNet](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_publaynet.tar) | {0: "Text", 1: "Title", 2: "List", 3:"Table", 4:"Figure"}| +Start from [Quick Start](./docs/quickstart_en.md). -### 7.2 OCR and table recognition model +## 5. Model List -|model name|description|model size|download| -| --- | --- | --- | --- | -|ch_PP-OCRv2_det_slim|[New] Slim quantization with distillation lightweight model, supporting Chinese, English, multilingual text detection| 3M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_slim_quant_infer.tar)| -|ch_PP-OCRv2_rec_slim|[New] Slim qunatization with distillation lightweight model, supporting Chinese, English, multilingual text recognition| 9M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_train.tar) | -|en_ppocr_mobile_v2.0_table_structure|Table structure prediction of English table scene trained on PubLayNet dataset| 18.6M |[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/table/en_ppocr_mobile_v2.0_table_structure_train.tar) | +Some tasks need to use both the structured analysis models and the OCR models. For example, the table recognition task needs to use the table recognition model for structured analysis, and the OCR model to recognize the text in the table. Please select the appropriate models according to your specific needs. -### 7.3 DOC-VQA model +For structural analysis related model downloads, please refer to: +- [PP-Structure Model Zoo](./docs/models_list_en.md) -|model name|description|model size|download| -| --- | --- | --- | --- | -|ser_LayoutXLM_xfun_zhd|SER model trained on xfun Chinese dataset based on LayoutXLM|1.4G|[inference model coming soon]() / [trained model](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh.tar) | -|re_LayoutXLM_xfun_zh|RE model trained on xfun Chinese dataset based on LayoutXLM|1.4G|[inference model coming soon]() / [trained model](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutXLM_xfun_zh.tar) | +For OCR related model downloads, please refer to: +- [PP-OCR Model Zoo](../doc/doc_en/models_list_en.md) -If you need to use other models, you can download the model in [PPOCR model_list](../doc/doc_en/models_list_en.md) and [PPStructure model_list](./docs/models_list.md) diff --git a/ppstructure/README_ch.md b/ppstructure/README_ch.md index ddacbb077937f325db0430846b8f05bfda9619cd..87a9c625b32c32e9c7fffb8ebc9b9fdf3b2130db 100644 --- a/ppstructure/README_ch.md +++ b/ppstructure/README_ch.md @@ -3,134 +3,120 @@ # PP-Structure 文档分析 - [1. 简介](#1) -- [2. 近期更新](#2) -- [3. 特性](#3) -- [4. 效果展示](#4) - - [4.1 版面分析和表格识别](#41) - - [4.2 DocVQA](#42) -- [5. 快速体验](#5) -- [6. PP-Structure 介绍](#6) - - [6.1 版面分析+表格识别](#61) - - [6.1.1 版面分析](#611) - - [6.1.2 表格识别](#612) - - [6.2 DocVQA](#62) -- [7. 模型库](#7) - - [7.1 版面分析模型](#71) - - [7.2 OCR和表格识别模型](#72) - - [7.3 DocVQA 模型](#73) +- [2. 特性](#2) +- [3. 
效果展示](#3) + - [3.1 版面分析和表格识别](#31) + - [3.2 版面恢复](#32) + - [3.3 关键信息抽取](#33) +- [4. 快速体验](#4) +- [5. 模型库](#5) ## 1. 简介 -PP-Structure是一个可用于复杂文档结构分析和处理的OCR工具包,旨在帮助开发者更好的完成文档理解相关任务。 - -## 2. 近期更新 -* 2022.02.12 DocVQA增加LayoutLMv2模型。 -* 2021.12.07 新增[DOC-VQA任务SER和RE](vqa/README.md)。 - - -## 3. 特性 - -PP-Structure的主要特性如下: -- 支持对图片形式的文档进行版面分析,可以划分**文字、标题、表格、图片以及列表**5类区域(与Layout-Parser联合使用) -- 支持文字、标题、图片以及列表区域提取为文字字段(与PP-OCR联合使用) -- 支持表格区域进行结构化分析,最终结果输出Excel文件 -- 支持python whl包和命令行两种方式,简单易用 -- 支持版面分析和表格结构化两类任务自定义训练 -- 支持文档视觉问答(Document Visual Question Answering,DocVQA)任务-语义实体识别(Semantic Entity Recognition,SER)和关系抽取(Relation Extraction,RE) - - -## 4. 效果展示 +PP-Structure是PaddleOCR团队自研的智能文档分析系统,旨在帮助开发者更好的完成版面分析、表格识别等文档理解相关任务。 - -### 4.1 版面分析和表格识别 - - - -图中展示了版面分析+表格识别的整体流程,图片先有版面分析划分为图像、文本、标题和表格四种区域,然后对图像、文本和标题三种区域进行OCR的检测识别,对表格进行表格识别,其中图像还会被存储下来以便使用。 - - -### 4.2 DOC-VQA - -* SER +PP-Structurev2系统流程图如下所示,文档图像首先经过图像矫正模块,判断整图方向并完成转正,随后可以完成版面信息分析与关键信息抽取2类任务。 +- 版面分析任务中,图像首先经过版面分析模型,将图像划分为文本、表格、图像等不同区域,随后对这些区域分别进行识别,如,将表格区域送入表格识别模块进行结构化识别,将文本区域送入OCR引擎进行文字识别,最后使用版面恢复模块将其恢复为与原始图像布局一致的word或者pdf格式的文件; +- 关键信息抽取任务中,首先使用OCR引擎提取文本内容,然后由语义实体识别模块获取图像中的语义实体,最后经关系抽取模块获取语义实体之间的对应关系,从而提取需要的关键信息。 + -![](./docs/vqa/result_ser/zh_val_0_ser.jpg) | ![](./docs/vqa/result_ser/zh_val_42_ser.jpg) ----|--- +更多技术细节:👉 [PP-Structurev2技术报告](docs/PP-Structurev2_introduction.md) -图中不同颜色的框表示不同的类别,对于XFUN数据集,有`QUESTION`, `ANSWER`, `HEADER` 3种类别 +PP-Structurev2支持各个模块独立使用或灵活搭配,如,可以单独使用版面分析,或单独使用表格识别,点击下面相应链接获取各个独立模块的使用教程: -* 深紫色:HEADER -* 浅紫色:QUESTION -* 军绿色:ANSWER +- [版面分析](layout/README_ch.md) +- [表格识别](table/README_ch.md) +- [关键信息抽取](kie/README_ch.md) +- [版面复原](recovery/README_ch.md) -在OCR检测框的左上方也标出了对应的类别和OCR识别结果。 + +## 2. 特性 -* RE +PP-Structurev2的主要特性如下: +- 支持对图片/pdf形式的文档进行版面分析,可以划分**文字、标题、表格、图片、公式等**区域; +- 支持通用的中英文**表格检测**任务; +- 支持表格区域进行结构化识别,最终结果输出**Excel文件**; +- 支持基于多模态的关键信息抽取(Key Information Extraction,KIE)任务-**语义实体识别**(Semantic Entity Recognition,SER)和**关系抽取**(Relation Extraction,RE); +- 支持**版面复原**,即恢复为与原始图像布局一致的word或者pdf格式的文件; +- 支持自定义训练及python whl包调用等多种推理部署方式,简单易用; +- 与半自动数据标注工具PPOCRLabel打通,支持版面分析、表格识别、SER三种任务的标注。 -![](./docs/vqa/result_re/zh_val_21_re.jpg) | ![](./docs/vqa/result_re/zh_val_40_re.jpg) ----|--- + +## 3. 效果展示 +PP-Structurev2支持各个模块独立使用或灵活搭配,如,可以单独使用版面分析,或单独使用表格识别,这里仅展示几种代表性使用方式的可视化效果。 + +### 3.1 版面分析和表格识别 +下图展示了版面分析+表格识别的整体流程,图片先有版面分析划分为图像、文本、标题和表格四种区域,然后对图像、文本和标题三种区域进行OCR的检测识别,对表格进行表格识别,其中图像还会被存储下来以便使用。 + -图中红色框表示问题,蓝色框表示答案,问题和答案之间使用绿色线连接。在OCR检测框的左上方也标出了对应的类别和OCR识别结果。 + +### 3.2 版面恢复 +下图展示了基于上一节版面分析和表格识别的结果进行版面恢复的效果。 + - -## 5. 快速体验 -请参考[快速使用](./docs/quickstart.md)教程。 + +### 3.3 关键信息抽取 - -## 6. PP-Structure 介绍 +* SER - -### 6.1 版面分析+表格识别 +图中不同颜色的框表示不同的类别。 -![pipeline](./docs/table/pipeline.jpg) +
+ +
-在PP-Structure中,图片会先经由Layout-Parser进行版面分析,在版面分析中,会对图片里的区域进行分类,包括**文字、标题、图片、列表和表格**5类。对于前4类区域,直接使用PP-OCR完成对应区域文字检测与识别。对于表格类区域,经过表格结构化处理后,表格图片转换为相同表格样式的Excel文件。 +
+ +
- -#### 6.1.1 版面分析 +
+ +
-版面分析对文档数据进行区域分类,其中包括版面分析工具的Python脚本使用、提取指定类别检测框、性能指标以及自定义训练版面分析模型,详细内容可以参考[文档](layout/README_ch.md)。 +
+ +
- -#### 6.1.2 表格识别 +
+ +
-表格识别将表格图片转换为excel文档,其中包含对于表格文本的检测和识别以及对于表格结构和单元格坐标的预测,详细说明参考[文档](table/README_ch.md)。 +* RE - -### 6.2 DocVQA +图中红色框表示`问题`,蓝色框表示`答案`,`问题`和`答案`之间使用绿色线连接。 -DocVQA指文档视觉问答,其中包括语义实体识别 (Semantic Entity Recognition, SER) 和关系抽取 (Relation Extraction, RE) 任务。基于 SER 任务,可以完成对图像中的文本识别与分类;基于 RE 任务,可以完成对图象中的文本内容的关系提取,如判断问题对(pair),详细说明参考[文档](vqa/README.md)。 +
+ +
- -## 7. 模型库 +
+ +
-PP-Structure系列模型列表(更新中) +
+ +
- -### 7.1 版面分析模型 +
+ +
-|模型名称|模型简介|下载地址| label_map| -| --- | --- | --- | --- | -| ppyolov2_r50vd_dcn_365e_publaynet | PubLayNet 数据集训练的版面分析模型,可以划分**文字、标题、表格、图片以及列表**5类区域 | [PubLayNet](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_publaynet.tar) | {0: "Text", 1: "Title", 2: "List", 3:"Table", 4:"Figure"}| + +## 4. 快速体验 - -### 7.2 OCR和表格识别模型 +请参考[快速使用](./docs/quickstart.md)教程。 -|模型名称|模型简介|模型大小|下载地址| -| --- | --- | --- | --- | -|ch_PP-OCRv2_det_slim|【最新】slim量化+蒸馏版超轻量模型,支持中英文、多语种文本检测| 3M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_slim_quant_infer.tar)| -|ch_PP-OCRv2_rec_slim|【最新】slim量化版超轻量模型,支持中英文、数字识别| 9M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_train.tar) | -|en_ppocr_mobile_v2.0_table_structure|PubLayNet数据集训练的英文表格场景的表格结构预测|18.6M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/table/en_ppocr_mobile_v2.0_table_structure_train.tar) | + +## 5. 模型库 - -### 7.3 DocVQA 模型 +部分任务需要同时用到结构化分析模型和OCR模型,如表格识别需要使用表格识别模型进行结构化解析,同时也要用到OCR模型对表格内的文字进行识别,请根据具体需求选择合适的模型。 -|模型名称|模型简介|模型大小|下载地址| -| --- | --- | --- | --- | -|ser_LayoutXLM_xfun_zhd|基于LayoutXLM在xfun中文数据集上训练的SER模型|1.4G|[推理模型 coming soon]() / [训练模型](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh.tar) | -|re_LayoutXLM_xfun_zh|基于LayoutXLM在xfun中文数据集上训练的RE模型|1.4G|[推理模型 coming soon]() / [训练模型](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutXLM_xfun_zh.tar) | +结构化分析相关模型下载可以参考: +- [PP-Structure 模型库](./docs/models_list.md) +OCR相关模型下载可以参考: +- [PP-OCR 模型库](../doc/doc_ch/models_list.md) -更多模型下载,可以参考 [PP-OCR model_list](../doc/doc_ch/models_list.md) and [PP-Structure model_list](./docs/models_list.md) diff --git a/ppstructure/docs/PP-Structurev2_introduction.md b/ppstructure/docs/PP-Structurev2_introduction.md new file mode 100644 index 0000000000000000000000000000000000000000..e337b563efea5b3fccbe81b14abcd50f1d36d70b --- /dev/null +++ b/ppstructure/docs/PP-Structurev2_introduction.md @@ -0,0 +1,426 @@ +# PP-Structurev2 + +## 目录 + +- [1. 背景](#1-背景) +- [2. 简介](#3-简介) +- [3. 整图方向矫正](#3-整图方向矫正) +- [4. 版面信息结构化](#4-版面信息结构化) + - [4.1 版面分析](#41-版面分析) + - [4.2 表格识别](#42-表格识别) + - [4.3 版面恢复](#43-版面恢复) +- [5. 关键信息抽取](#5-关键信息抽取) +- [6. Reference](#6-Reference) + +## 1. 背景 + +现实场景中包含大量的文档图像,它们以图片等非结构化形式存储。基于文档图像的结构化分析与信息抽取对于数据的数字化存储以及产业的数字化转型至关重要。基于该考虑,PaddleOCR自研并发布了PP-Structure智能文档分析系统,旨在帮助开发者更好的完成版面分析、表格识别、关键信息抽取等文档理解相关任务。 + +近期,PaddleOCR团队针对PP-Structurev1的版面分析、表格识别、关键信息抽取模块,进行了共计8个方面的升级,同时新增整图方向矫正、文档复原等功能,打造出一个全新的、效果更优的文档分析系统:PP-Structurev2。 + +## 2. 简介 + +PP-Structurev2在PP-Structurev1的基础上进一步改进,主要有以下3个方面升级: + + * **系统功能升级** :新增图像矫正和版面复原模块,图像转word/pdf、关键信息抽取能力全覆盖! + * **系统性能优化** : + * 版面分析:发布轻量级版面分析模型,速度提升**11倍**,平均CPU耗时仅需**41ms**! + * 表格识别:设计3大优化策略,预测耗时不变情况下,模型精度提升**6%**。 + * 关键信息抽取:设计视觉无关模型结构,语义实体识别精度提升**2.8%**,关系抽取精度提升**9.1%**。 + * **中文场景适配** :完成对版面分析与表格识别的中文场景适配,开源**开箱即用**的中文场景版面结构化模型! + +PP-Structurev2系统流程图如下所示,文档图像首先经过图像矫正模块,判断整图方向并完成转正,随后可以完成版面信息分析与关键信息抽取2类任务。版面分析任务中,图像首先经过版面分析模型,将图像划分为文本、表格、图像等不同区域,随后对这些区域分别进行识别,如,将表格区域送入表格识别模块进行结构化识别,将文本区域送入OCR引擎进行文字识别,最后使用版面恢复模块将其恢复为与原始图像布局一致的word或者pdf格式的文件;关键信息抽取任务中,首先使用OCR引擎提取文本内容,然后由语义实体识别模块获取图像中的语义实体,最后经关系抽取模块获取语义实体之间的对应关系,从而提取需要的关键信息。 + +
+ +
+ + +从算法改进思路来看,对系统中的3个关键子模块,共进行了8个方面的改进。 + +* 版面分析 + * PP-PicoDet:轻量级版面分析模型 + * FGD:兼顾全局与局部特征的模型蒸馏算法 + +* 表格识别 + * PP-LCNet: CPU友好型轻量级骨干网络 + * CSP-PAN:轻量级高低层特征融合模块 + * SLAHead:结构与位置信息对齐的特征解码模块 + +* 关键信息抽取 + * VI-LayoutXLM:视觉特征无关的多模态预训练模型结构 + * TB-YX:考虑阅读顺序的文本行排序逻辑 + * UDML:联合互学习知识蒸馏策略 + +最终,与PP-Structurev1相比: + +- 版面分析模型参数量减少95.6%,推理速度提升11倍,精度提升0.4%; +- 表格识别预测耗时不变,模型精度提升6%,端到端TEDS提升2%; +- 关键信息抽取模型速度提升2.8倍,语义实体识别模型精度提升2.8%;关系抽取模型精度提升9.1%。 + +下面对各个模块进行详细介绍。 + +## 3. 整图方向矫正 + +由于训练集一般以正方向图像为主,旋转过的文档图像直接输入模型会增加识别难度,影响识别效果。PP-Structurev2引入了整图方向矫正模块来判断含文字图像的方向,并将其进行方向调整。 + +我们直接调用PaddleClas中提供的文字图像方向分类模型-[PULC_text_image_orientation](https://github.com/PaddlePaddle/PaddleClas/blob/develop/docs/zh_CN/PULC/PULC_text_image_orientation.md),该模型部分数据集图像如下所示。不同于文本行方向分类器,文字图像方向分类模型针对整图进行方向判别。文字图像方向分类模型在验证集上精度高达99%,单张图像CPU预测耗时仅为`2.16ms`。 + +
+ +
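下面给出一个调用该文字图像方向分类模型的示意代码(仅作说明,非本次改动内容;假设已安装 paddleclas whl 包,且其按上文链接的 PaddleClas 文档提供名为 `text_image_orientation` 的 PULC 模型,图片路径仅为示例):

```python
import paddleclas

# 加载PULC文字图像方向分类模型(模型名称以PaddleClas文档为准)
model = paddleclas.PaddleClas(model_name="text_image_orientation")

# predict 返回一个生成器,每个元素为一批图像的预测结果
result = model.predict(input_data="test.jpg")
print(next(result))  # 例如包含 class_ids / label_names / scores 等字段
```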
+ +## 4. 版面信息结构化 + +### 4.1 版面分析 + +版面分析指的是对图片形式的文档进行区域划分,定位其中的关键区域,如文字、标题、表格、图片等,PP-Structurev1使用了PaddleDetection中开源的高效检测算法PP-YOLOv2完成版面分析的任务。 + +在PP-Structurev2中,我们发布基于PP-PicoDet的轻量级版面分析模型,并针对版面分析场景定制图像尺度,同时使用FGD知识蒸馏算法,进一步提升模型精度。最终CPU上`41ms`即可完成版面分析过程(仅包含模型推理时间,数据预处理耗时大约50ms左右)。在公开数据集PubLayNet 上,消融实验如下: + +| 实验序号 | 策略 | 模型存储(M) | mAP | CPU预测耗时(ms) | +|:------:|:------:|:------:|:------:|:------:| +| 1 | PP-YOLOv2(640*640) | 221 | 93.6% | 512 | +| 2 | PP-PicoDet-LCNet2.5x(640*640) | 29.7 | 92.5% |53.2| +| 3 | PP-PicoDet-LCNet2.5x(800*608) | 29.7 | 94.2% |83.1 | +| 4 | PP-PicoDet-LCNet1.0x(800*608) | 9.7 | 93.5% | 41.2| +| 5 | PP-PicoDet-LCNet1.0x(800*608) + FGD | 9.7 | 94% |41.2| + +* 测试条件 + * paddle版本:2.3.0 + * CPU:Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz,开启mkldnn,线程数为10 + +在PubLayNet数据集上,与其他方法的性能对比如下表所示。可以看到,和基于Detectron2的版面分析工具layoutparser相比,我们的模型精度高出大约5%,预测速度快约69倍。 + +| 模型 | mAP | CPU预测耗时 | +|-------------------|-----------|------------| +| layoutparser (Detectron2) | 88.98% | 2.9s | +| PP-Structurev2 (PP-PicoDet) | **94%** | 41.2ms | + +[PubLayNet](https://github.com/ibm-aur-nlp/PubLayNet)数据集是一个大型的文档图像数据集,包含Text、Title、Tale、Figure、List,共5个类别。数据集中包含335,703张训练集、11,245张验证集和11,405张测试集。训练数据与标注示例图如下所示: + +
+ +
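作为参考,下面是通过 paddleocr whl 包单独运行版面分析的示意代码(仅作说明;假设所安装版本的 `PPStructure` 接口支持 `table`、`ocr` 开关,与本仓库 quickstart 文档中的用法一致):

```python
import os
import cv2
from paddleocr import PPStructure, save_structure_res

# 关闭表格识别与OCR,仅做版面分析
engine = PPStructure(table=False, ocr=False, show_log=True)

img_path = 'ppstructure/docs/table/1.png'
img = cv2.imread(img_path)
result = engine(img)
save_structure_res(result, './output', os.path.basename(img_path).split('.')[0])

for region in result:
    # 每个区域包含类别(text/title/table/figure等)与检测框坐标
    print(region['type'], region['bbox'])
```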
+ + +#### 4.1.1 优化策略 + +**(1)轻量级版面分析模型PP-PicoDet** + +`PP-PicoDet`是PaddleDetection中提出的轻量级目标检测模型,通过使用PP-LCNet骨干网络、CSP-PAN特征融合模块、SimOTA标签分配方法等优化策略,最终在CPU与移动端具有卓越的性能。我们将PP-Structurev1中采用的PP-YOLOv2模型替换为`PP-PicoDet`,同时针对版面分析场景优化预测尺度,从针对目标检测设计的`640*640`调整为更适配文档图像的`800*608`,在`1.0x`配置下,模型精度与PP-YOLOv2相当,CPU平均预测速度可提升11倍。 + +**(1)FGD知识蒸馏** + +FGD(Focal and Global Knowledge Distillation for Detectors),是一种兼顾局部全局特征信息的模型蒸馏方法,分为Focal蒸馏和Global蒸馏2个部分。Focal蒸馏分离图像的前景和背景,让学生模型分别关注教师模型的前景和背景部分特征的关键像素;Global蒸馏部分重建不同像素之间的关系并将其从教师转移到学生,以补偿Focal蒸馏中丢失的全局信息。我们基于FGD蒸馏策略,使用教师模型PP-PicoDet-LCNet2.5x(mAP=94.2%)蒸馏学生模型PP-PicoDet-LCNet1.0x(mAP=93.5%),可将学生模型精度提升0.5%,和教师模型仅差0.2%,而预测速度比教师模型快1倍。 + +#### 4.1.2 场景适配 + +**(1)中文版面分析** + +除了英文公开数据集PubLayNet,我们也在中文场景进行了场景适配与方法验证。[CDLA](https://github.com/buptlihang/CDLA)是一个中文文档版面分析数据集,面向中文文献类(论文)场景,包含正文、标题等10个label。数据集中包含5,000张训练集和1,000张验证集。训练数据与标注示例图如下所示: + + +
+ +
+ + +在CDLA 数据集上,消融实验如下: + +| 实验序号 | 策略 | mAP | +|:------:|:------:|:------:| +| 1 | PP-YOLOv2 | 84.7% | +| 2 | PP-PicoDet-LCNet2.5x(800*608) | 87.8% | +| 3 | PP-PicoDet-LCNet1.0x(800*608) | 84.5% | +| 4 | PP-PicoDet-LCNet1.0x(800*608) + FGD | 86.8% | + + +**(2)表格版面分析** + +在实际应用中,很多场景并不关注图像中的图片、文本等版面区域,而仅需要提取文档图像中的表格,此时版面分析任务退化为一个表格检测任务,表格检测往往也是表格识别的前序任务。面向中英文文档场景,我们整理了开源领域含表格的版面分析数据集,包括TableBank、DocBank等。融合后的数据集中包含496,405张训练集与9,495张验证集图像。 + +在表格数据集上,消融实验如下: + +| 实验序号 | 策略 | mAP | +|:------:|:------:|:------:| +| 1 | PP-YOLOv2 |91.3% | +| 2 | PP-PicoDet-LCNet2.5x(800*608) | 95.9% | +| 3 | PP-PicoDet-LCNet1.0x(800*608) | 95.2% | +| 4 | PP-PicoDet-LCNet1.0x(800*608) + FGD | 95.7% | + +表格检测效果示意图如下: + +
+ +
+ +### 4.2 表格识别 + +基于深度学习的表格识别算法种类丰富,PP-Structurev1中,我们基于文本识别算法RARE研发了端到端表格识别算法TableRec-RARE,模型输出为表格结构的HTML表示,进而可以方便地转化为Excel文件。PP-Structurev2中,我们对模型结构和损失函数等5个方面进行升级,提出了 SLANet (Structure Location Alignment Network) ,模型结构如下图所示: + +
+ +
+ +在PubTabNet英文表格识别数据集上的消融实验如下: + +|策略|Acc|TEDS|推理速度(CPU+MKLDNN)|模型大小| +|---|---|---|---|---| +|TableRec-RARE| 71.73% | 93.88% |779ms |6.8M| +|+PP-LCNet| 74.71% |94.37% |778ms| 8.7M| +|+CSP-PAN| 75.68%| 94.72% |708ms| 9.3M| +|+SLAHead| 77.7%|94.85%| 766ms| 9.2M| +|+MergeToken| 76.31%| 95.89%|766ms| 9.2M| + +* 测试环境 + * paddle版本:2.3.1 + * CPU:Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz,开启mkldnn,线程数为10 + +在PubtabNet英文表格识别数据集上,和其他方法对比如下: + +|策略|Acc|TEDS|推理速度(CPU+MKLDNN)|模型大小| +|---|---|---|---|---| +|TableMaster|77.9%|96.12%|2144ms|253M| +|TableRec-RARE| 71.73% | 93.88% |779ms |6.8M| +|SLANet|76.31%| 95.89%|766ms|9.2M| + +#### 4.2.1 优化策略 + +**(1) CPU友好型轻量级骨干网络PP-LCNet** + +PP-LCNet是结合Intel-CPU端侧推理特性而设计的轻量高性能骨干网络,该方案在图像分类任务上取得了比ShuffleNetV2、MobileNetV3、GhostNet等轻量级模型更优的“精度-速度”均衡。PP-Structurev2中,我们采用PP-LCNet作为骨干网络,表格识别模型精度从71.73%提升至72.98%;同时加载通过SSLD知识蒸馏方案训练得到的图像分类模型权重作为表格识别的预训练模型,最终精度进一步提升2.95%至74.71%。 + +**(2)轻量级高低层特征融合模块CSP-PAN** + +对骨干网络提取的特征进行融合,可以有效解决尺度变化较大等复杂场景中的模型预测问题。早期,FPN模块被提出并用于特征融合,但是它的特征融合过程仅包含单向(高->低),融合不够充分。CSP-PAN基于PAN进行改进,在保证特征融合更为充分的同时,使用CSP block、深度可分离卷积等策略减小了计算量。在表格识别场景中,我们进一步将CSP-PAN的通道数从128降低至96以降低模型大小。最终表格识别模型精度提升0.97%至75.68%,预测速度提升10%。 + +**(3)结构与位置信息对齐的特征解码模块SLAHead** + +TableRec-RARE的TableAttentionHead如下图a所示,TableAttentionHead在执行完全部step的计算后拿到最终隐藏层状态表征(hiddens),随后hiddens经由SDM(Structure Decode Module)和CLDM(Cell Location Decode Module)模块生成全部的表格结构token和单元格坐标。但是这种设计忽略了单元格token和坐标之间一一对应的关系。 + +PP-Structurev2中,我们设计SLAHead模块,对单元格token和坐标之间做了对齐操作,如下图b所示。在SLAHead中,每一个step的隐藏层状态表征会分别送入SDM和CLDM来得到当前step的token和坐标,每个step的token和坐标输出分别进行concat得到表格的html表达和全部单元格的坐标。此外,考虑到表格识别模型的单元格准确率依赖于表格结构的识别准确,我们将损失函数中表格结构分支与单元格定位分支的权重比从1:1提升到8:1,并使用收敛更稳定的Smoothl1 Loss替换定位分支中的MSE Loss。最终模型精度从75.68%提高至77.7%。 + + +
+ +
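上述"结构分支与定位分支 8:1 加权、定位分支改用 Smooth L1"的做法,可以用如下示意代码表示(仅为说明思路,变量名与张量形状均为假设,并非 SLAHead 的实际实现):

```python
import paddle.nn as nn

structure_ce = nn.CrossEntropyLoss()   # 表格结构(token)分支
loc_smooth_l1 = nn.SmoothL1Loss()      # 单元格坐标分支(替换原MSE Loss)

def slanet_loss(structure_logits, structure_targets, loc_preds, loc_targets):
    # structure_logits: [B, T, num_tokens], structure_targets: [B, T] (int64)
    # loc_preds / loc_targets: [B, T, 4]
    l_struct = structure_ce(
        structure_logits.reshape([-1, structure_logits.shape[-1]]),
        structure_targets.reshape([-1]))
    l_loc = loc_smooth_l1(loc_preds, loc_targets)
    # 结构分支与定位分支按 8:1 加权
    return 8.0 * l_struct + 1.0 * l_loc
```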
+ + +**(4)其他** + +TableRec-RARE算法中,我们使用``和``两个单独的token来表示一个非跨行列单元格,这种表示方式限制了网络对于单元格数量较多表格的处理能力。 + +PP-Structurev2中,我们参考TableMaster中的token处理方法,将``和``合并为一个token-``。合并token后,验证集中token长度大于500的图片也参与模型评估,最终模型精度降低为76.31%,但是端到端TEDS提升1.04%。 + +#### 4.2.2 中文场景适配 + +除了上述模型策略的升级外,本次升级还开源了中文表格识别模型。在实际应用场景中,表格图像存在着各种各样的倾斜角度(PubTabNet数据集不存在该问题),因此在中文模型中,我们将单元格坐标回归的点数从2个(左上,右下)增加到4个(左上,右上,右下,左下)。在内部测试集上,模型升级前后指标如下: +|模型|acc| +|---|---| +|TableRec-RARE|44.3%| +|SLANet|59.35%| + +可视化结果如下,左为输入图像,右为识别的html表格 + + +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
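下面给出单独调用表格识别的示意代码(仅作说明;假设所安装的 paddleocr whl 版本中 `PPStructure` 支持 `layout=False` 开关,与本仓库表格识别推理文档中 `--layout=false` 的用法对应):

```python
import cv2
from paddleocr import PPStructure, save_structure_res

# 跳过版面分析,直接对整图做表格识别
table_engine = PPStructure(layout=False, show_log=True)

img = cv2.imread('ppstructure/docs/table/table.jpg')
result = table_engine(img)
save_structure_res(result, './output', 'table')

# 表格区域的识别结果包含HTML结构,可进一步转存为Excel
print(result[0]['res']['html'])
```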
+ + + + +### 4.3 版面恢复 + +版面恢复指的是文档图像经过OCR识别、版面分析、表格识别等方法处理后的内容可以与原始文档保持相同的排版方式,并输出到word等文档中。PP-Structurev2中,我们版面恢复系统,包含版面分析、表格识别、OCR文本检测与识别等子模块。 +下图展示了版面恢复的结果: + +
+ +
+ +## 5. 关键信息抽取 + +关键信息抽取指的是针对文档图像的文字内容,提取出用户关注的关键信息,如身份证中的姓名、住址等字段。PP-Structure中支持了基于多模态LayoutLM系列模型的语义实体识别 (Semantic Entity Recognition, SER) 以及关系抽取 (Relation Extraction, RE) 任务。PP-Structurev2中,我们对模型结构以及下游任务训练方法进行升级,提出了VI-LayoutXLM(Visual-feature Independent LayoutXLM),具体流程图如下所示。 + + +
+ +
+ + +具体优化策略包括: + +* VI-LayoutXLM:视觉特征无关的多模态预训练模型结构 +* TB-YX:考虑人类阅读顺序的文本行排序逻辑 +* UDML:联合互学习知识蒸馏策略 + +XFUND-zh数据集上,SER任务的消融实验如下所示。 + +| 实验序号 | 策略 | 模型大小(G) | 精度 | GPU预测耗时(ms) | CPU预测耗时(ms) | +|:------:|:------:|:------:|:------:|:------:|:------:| +| 1 | LayoutXLM | 1.4 | 89.50% | 59.35 | 766.16 | +| 2 | VI-LayoutXLM | 1.1 | 90.46% | 23.71 | 675.58 | +| 3 | 实验2 + TB-YX文本行排序 | 1.1 | 92.50% | 23.71 | 675.58 | +| 4 | 实验3 + UDML蒸馏 | 1.1 | 93.19% | 23.71 | 675.58 | +| 5 | 实验3 + UDML蒸馏 | 1.1 | **93.19%** | **15.49** | **675.58** | + +* 测试条件 + * paddle版本:2.3.0 + * GPU:V100,实验5的GPU预测耗时使用`trt+fp16`测试得到,环境为cuda10.2+ cudnn8.1.1 + trt7.2.3.4,其他实验的预测耗时统计中没有使用TRT。 + * CPU:Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz,开启mkldnn,线程数为10 + +在XFUND数据集上,与其他方法的效果对比如下所示。 + +| 模型 | SER Hmean | RE Hmean | +|-------------------|-----------|------------| +| LayoutLMv2-base | 85.44% | 67.77% | +| LayoutXLM-base | 89.24% | 70.73% | +| StrucTexT-large | 92.29% | **86.81%** | +| VI-LayoutXLM-base (ours) | **93.19%** | 83.92% | + + +### 5.1 优化策略 + +**(1) VI-LayoutXLM(Visual-feature Independent LayoutXLM)** + +LayoutLMv2以及LayoutXLM中引入视觉骨干网络,用于提取视觉特征,并与后续的text embedding进行联合,作为多模态的输入embedding。但是该模块为基于`ResNet_x101_64x4d`的特征提取网络,特征抽取阶段耗时严重,因此我们将其去除,同时仍然保留文本、位置以及布局等信息,最终发现针对LayoutXLM进行改进,下游SER任务精度无损,针对LayoutLMv2进行改进,下游SER任务精度仅降低`2.1%`,而模型大小减小了约`340M`。具体消融实验如下所示。 + +| 模型 | 模型大小 (G) | F-score | 精度收益 | +|-----------------|----------|---------|--------| +| LayoutLMv2 | 0.76 | 84.20% | - | +| VI-LayoutLMv2 | 0.42 | 82.10% | -2.10% | +| LayoutXLM | 1.4 | 89.50% | - | +| VI-LayouXLM | 1.1 | 90.46% | +0.96% | + +同时,基于XFUND数据集,VI-LayoutXLM在RE任务上的精度也进一步提升了`1.06%`。 + +**(2) TB-YX排序方法(Threshold-Based YX sorting algorithm)** + +文本阅读顺序对于信息抽取与文本理解等任务至关重要,传统多模态模型中,没有考虑不同OCR工具可能产生的不正确阅读顺序,而模型输入中包含位置编码,阅读顺序会直接影响预测结果,在预处理中,我们对文本行按照从上到下,从左到右(YX)的顺序进行排序,为防止文本行位置轻微干扰带来的排序结果不稳定问题,在排序的过程中,引入位置偏移阈值Th,对于Y方向距离小于Th的2个文本内容,使用x方向的位置从左到右进行排序。TB-YX排序方法伪代码如下所示。 + +```py +def order_by_tbyx(ocr_info, th=20): + """ + ocr_info: a list of dict, which contains bbox information([x1, y1, x2, y2]) + th: threshold of the position threshold + """ + res = sorted(ocr_info, key=lambda r: (r["bbox"][1], r["bbox"][0])) # sort using y1 first and then x1 + for i in range(len(res) - 1): + for j in range(i, 0, -1): + # restore the order using the + if abs(res[j + 1]["bbox"][1] - res[j]["bbox"][1]) < th and \ + (res[j + 1]["bbox"][0] < res[j]["bbox"][0]): + tmp = deepcopy(res[j]) + res[j] = deepcopy(res[j + 1]) + res[j + 1] = deepcopy(tmp) + else: + break + return res +``` + +不同排序方法的结果对比如下所示,可以看出引入偏离阈值之后,排序结果更加符合人类的阅读顺序。 + +
+ +
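上述 `order_by_tbyx` 的一个简单使用示例如下(示意代码;假设上文伪代码中的函数已定义,运行前还需补充其内部用到的 `from copy import deepcopy`):

```python
from copy import deepcopy  # 上述伪代码内部使用了deepcopy

# 同一行内的三个文本框,y 坐标存在轻微抖动,OCR 返回顺序被打乱
ocr_info = [
    {"text": "C", "bbox": [300, 102, 360, 130]},
    {"text": "A", "bbox": [10, 100, 70, 128]},
    {"text": "B", "bbox": [150, 108, 210, 136]},
]

# 纯(y, x)排序会得到 A, C, B;加入阈值 th=20 后按阅读顺序得到 A, B, C
sorted_info = order_by_tbyx(ocr_info, th=20)
print([item["text"] for item in sorted_info])  # ['A', 'B', 'C']
```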
+ + +使用该策略,最终XFUND数据集上,SER任务F1指标提升`2.06%`,RE任务F1指标提升`7.04%`。 + +**(3) 互学习蒸馏策略** + +UDML(Unified-Deep Mutual Learning)联合互学习是PP-OCRv2与PP-OCRv3中采用的对于文本识别非常有效的提升模型效果的策略。在训练时,引入2个完全相同的模型进行互学习,计算2个模型之间的互蒸馏损失函数(DML loss),同时对transformer中间层的输出结果计算距离损失函数(L2 loss)。使用该策略,最终XFUND数据集上,SER任务F1指标提升`0.6%`,RE任务F1指标提升`5.01%`。 + +最终优化后模型基于SER任务的可视化结果如下所示。 + +
+ +
+ +
+ +
+ + +RE任务的可视化结果如下所示。 + + +
+ +
+ +
+ +
+ +### 5.2 更多场景消融实验 + +我们在FUNSD数据集上,同时基于RE任务进行对本次升级策略进行验证,具体实验结果如下所示,可以看出该方案针对不同任务,在不同数据集上均有非常明显的精度收益。 + +#### 5.2.1 XFUND_zh数据集 + +**RE任务结果** + +| 实验序号 | 策略 | 模型大小(G) | F1-score | +|:------:|:------------:|:---------:|:----------:| +| 1 | LayoutXLM | 1.4 | 70.81% | +| 2 | VI-LayoutXLM | 1.1 | 71.87% | +| 3 | 实验2 + PP-OCR排序 | 1.1 | 78.91% | +| 4 | 实验3 + UDML蒸馏 | 1.1 | **83.92%** | + + +#### 5.2.2 FUNSD数据集 + +**SER任务结果** + +| 实验序号 | 策略 | F1-score | +|:------:|:------:|:------:| +| 1 | LayoutXLM | 82.28% | +| 2 | PP-Structurev2 SER | **87.79%** | + + +**RE任务结果** + +| 实验序号 | 策略 | F1-score | +|:------:|:------:|:------:| +| 1 | LayoutXLM | 53.13% | +| 2 | PP-Structurev2 SER | **74.87%** | + + +## 6. Reference +* [1] Zhong X, ShafieiBavani E, Jimeno Yepes A. Image-based table recognition: data, model, and evaluation[C]//European Conference on Computer Vision. Springer, Cham, 2020: 564-580. +* [2] Cui C, Gao T, Wei S. Yuning Du, Ruoyu Guo, Shuilong Dong, Bin Lu, Ying Zhou, Xueying Lv, Qiwen Liu, Xiaoguang Hu, Dianhai Yu, and Yanjun Ma* [J]. Pplcnet: A lightweight cpu convolutional neural network, 2021, 3. +* [3] Lin T Y, Dollár P, Girshick R, et al. Feature pyramid networks for object detection[C]//Proceedings of the IEEE conference on computer vision and pattern recognition. 2017: 2117-2125. +* [4] Yu G, Chang Q, Lv W, et al. PP-PicoDet: A Better Real-Time Object Detector on Mobile Devices[J]. arXiv preprint arXiv:2111.00902, 2021. +* [5] Bochkovskiy A, Wang C Y, Liao H Y M. Yolov4: Optimal speed and accuracy of object detection[J]. arXiv preprint arXiv:2004.10934, 2020. +* [6] Ye J, Qi X, He Y, et al. PingAn-VCGroup's Solution for ICDAR 2021 Competition on Scientific Literature Parsing Task B: Table Recognition to HTML[J]. arXiv preprint arXiv:2105.01848, 2021. +* [7] Zhong X, Tang J, Yepes A J. Publaynet: largest dataset ever for document layout analysis[C]//2019 International Conference on Document Analysis and Recognition (ICDAR). IEEE, 2019: 1015-1022. +* [8] CDLA:https://github.com/buptlihang/CDLA +* [9]Gao L, Huang Y, Déjean H, et al. ICDAR 2019 competition on table detection and recognition (cTDaR)[C]//2019 International Conference on Document Analysis and Recognition (ICDAR). IEEE, 2019: 1510-1515. +* [10] Mondal A, Lipps P, Jawahar C V. IIIT-AR-13K: a new dataset for graphical object detection in documents[C]//International Workshop on Document Analysis Systems. Springer, Cham, 2020: 216-230. +* [11] Tal ocr_tabel:https://ai.100tal.com/dataset +* [12] Li M, Cui L, Huang S, et al. Tablebank: A benchmark dataset for table detection and recognition[J]. arXiv preprint arXiv:1903.01949, 2019. +* [13]Li M, Xu Y, Cui L, et al. DocBank: A benchmark dataset for document layout analysis[J]. arXiv preprint arXiv:2006.01038, 2020. +* [14] Xu Y, Li M, Cui L, et al. Layoutlm: Pre-training of text and layout for document image understanding[C]//Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining. 2020: 1192-1200. +* [15] Xu Y, Xu Y, Lv T, et al. LayoutLMv2: Multi-modal pre-training for visually-rich document understanding[J]. arXiv preprint arXiv:2012.14740, 2020. +* [16] Xu Y, Lv T, Cui L, et al. Layoutxlm: Multimodal pre-training for multilingual visually-rich document understanding[J]. arXiv preprint arXiv:2104.08836, 2021. +* [17] Xu Y, Lv T, Cui L, et al. XFUND: A Benchmark Dataset for Multilingual Visually Rich Form Understanding[C]//Findings of the Association for Computational Linguistics: ACL 2022. 2022: 3214-3224. 
+* [18] Jaume G, Ekenel H K, Thiran J P. Funsd: A dataset for form understanding in noisy scanned documents[C]//2019 International Conference on Document Analysis and Recognition Workshops (ICDARW). IEEE, 2019, 2: 1-6. diff --git a/ppstructure/docs/imgs/0.png b/ppstructure/docs/imgs/sdmgr_result.png similarity index 100% rename from ppstructure/docs/imgs/0.png rename to ppstructure/docs/imgs/sdmgr_result.png diff --git a/ppstructure/docs/imgs/slanet_result.jpg b/ppstructure/docs/imgs/slanet_result.jpg new file mode 100644 index 0000000000000000000000000000000000000000..011857fbc2295b91a96d938f861d38b8e07421bc Binary files /dev/null and b/ppstructure/docs/imgs/slanet_result.jpg differ diff --git a/ppstructure/docs/imgs/table_ch_result1.jpg b/ppstructure/docs/imgs/table_ch_result1.jpg new file mode 100644 index 0000000000000000000000000000000000000000..c75eee40f642d437451fa16bff9cb4a3bdb4f38a Binary files /dev/null and b/ppstructure/docs/imgs/table_ch_result1.jpg differ diff --git a/ppstructure/docs/imgs/table_ch_result2.jpg b/ppstructure/docs/imgs/table_ch_result2.jpg new file mode 100644 index 0000000000000000000000000000000000000000..802871a8e1b6983f304fc73a9fd13404aa02630f Binary files /dev/null and b/ppstructure/docs/imgs/table_ch_result2.jpg differ diff --git a/ppstructure/docs/imgs/table_ch_result3.jpg b/ppstructure/docs/imgs/table_ch_result3.jpg new file mode 100644 index 0000000000000000000000000000000000000000..bdd92aa6ee7819c837fd3e2abc38cac915588a71 Binary files /dev/null and b/ppstructure/docs/imgs/table_ch_result3.jpg differ diff --git a/ppstructure/docs/inference.md b/ppstructure/docs/inference.md index 7604246da5a79b0ee2939c9fb4c91602531ec7de..516db82784ce98abba6db14c795fe7323be508e0 100644 --- a/ppstructure/docs/inference.md +++ b/ppstructure/docs/inference.md @@ -1,38 +1,41 @@ # 基于Python预测引擎推理 -- [1. Structure](#1) +- [1. 版面信息抽取](#1) - [1.1 版面分析+表格识别](#1.1) - [1.2 版面分析](#1.2) - [1.3 表格识别](#1.3) -- [2. DocVQA](#2) +- [2. 关键信息抽取](#2) -## 1. Structure +## 1. 版面信息抽取 进入`ppstructure`目录 ```bash cd ppstructure -```` +``` 下载模型 ```bash mkdir inference && cd inference -# 下载PP-OCRv2文本检测模型并解压 -wget https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_slim_quant_infer.tar && tar xf ch_PP-OCRv2_det_slim_quant_infer.tar -# 下载PP-OCRv2文本识别模型并解压 -wget https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_infer.tar && tar xf ch_PP-OCRv2_rec_slim_quant_infer.tar -# 下载超轻量级英文表格预测模型并解压 -wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar && tar xf en_ppocr_mobile_v2.0_table_structure_infer.tar +# 下载PP-Structurev2版面分析模型并解压 +wget https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_layout_infer.tar && tar xf picodet_lcnet_x1_0_layout_infer.tar +# 下载PP-OCRv3文本检测模型并解压 +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar && tar xf ch_PP-OCRv3_det_infer.tar +# 下载PP-OCRv3文本识别模型并解压 +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar && tar xf ch_PP-OCRv3_rec_infer.tar +# 下载PP-Structurev2表格识别模型并解压 +wget https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_infer.tar && tar xf ch_ppstructure_mobile_v2.0_SLANet_infer.tar cd .. 
``` ### 1.1 版面分析+表格识别 ```bash -python3 predict_system.py --det_model_dir=inference/ch_PP-OCRv2_det_slim_quant_infer \ - --rec_model_dir=inference/ch_PP-OCRv2_rec_slim_quant_infer \ - --table_model_dir=inference/en_ppocr_mobile_v2.0_table_structure_infer \ +python3 predict_system.py --det_model_dir=inference/ch_PP-OCRv3_det_infer \ + --rec_model_dir=inference/ch_PP-OCRv3_rec_infer \ + --table_model_dir=inference/ch_ppstructure_mobile_v2.0_SLANet_infer \ + --layout_model_dir=inference/picodet_lcnet_x1_0_layout_infer \ --image_dir=./docs/table/1.png \ --rec_char_dict_path=../ppocr/utils/ppocr_keys_v1.txt \ - --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt \ + --table_char_dict_path=../ppocr/utils/dict/table_structure_dict_ch.txt \ --output=../output \ --vis_font_path=../doc/fonts/simfang.ttf ``` @@ -41,19 +44,23 @@ python3 predict_system.py --det_model_dir=inference/ch_PP-OCRv2_det_slim_quant_i ### 1.2 版面分析 ```bash -python3 predict_system.py --image_dir=./docs/table/1.png --table=false --ocr=false --output=../output/ +python3 predict_system.py --layout_model_dir=inference/picodet_lcnet_x1_0_layout_infer \ + --image_dir=./docs/table/1.png \ + --output=../output \ + --table=false \ + --ocr=false ``` 运行完成后,每张图片会在`output`字段指定的目录下的`structure`目录下有一个同名目录,图片区域会被裁剪之后保存下来,图片名为表格在图片里的坐标。版面分析结果会存储在`res.txt`文件中。 ### 1.3 表格识别 ```bash -python3 predict_system.py --det_model_dir=inference/ch_PP-OCRv2_det_slim_quant_infer \ - --rec_model_dir=inference/ch_PP-OCRv2_rec_slim_quant_infer \ - --table_model_dir=inference/en_ppocr_mobile_v2.0_table_structure_infer \ +python3 predict_system.py --det_model_dir=inference/ch_PP-OCRv3_det_infer \ + --rec_model_dir=inference/ch_PP-OCRv3_rec_infer \ + --table_model_dir=inference/ch_ppstructure_mobile_v2.0_SLANet_infer \ --image_dir=./docs/table/table.jpg \ --rec_char_dict_path=../ppocr/utils/ppocr_keys_v1.txt \ - --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt \ + --table_char_dict_path=../ppocr/utils/dict/table_structure_dict_ch.txt \ --output=../output \ --vis_font_path=../doc/fonts/simfang.ttf \ --layout=false @@ -61,20 +68,22 @@ python3 predict_system.py --det_model_dir=inference/ch_PP-OCRv2_det_slim_quant_i 运行完成后,每张图片会在`output`字段指定的目录下的`structure`目录下有一个同名目录,表格会存储为一个excel,excel文件名为`[0,0,img_h,img_w]`。 -## 2. DocVQA +## 2. 关键信息抽取 ```bash cd ppstructure -# 下载模型 mkdir inference && cd inference -# 下载SER xfun 模型并解压 -wget https://paddleocr.bj.bcebos.com/pplayout/PP-Layout_v1.0_ser_pretrained.tar && tar xf PP-Layout_v1.0_ser_pretrained.tar +# 下载SER XFUND 模型并解压 +wget https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_infer.tar && tar -xf ser_vi_layoutxlm_xfund_infer.tar cd .. 
- -python3 predict_system.py --model_name_or_path=vqa/PP-Layout_v1.0_ser_pretrained/ \ - --mode=vqa \ - --image_dir=vqa/images/input/zh_val_0.jpg \ - --vis_font_path=../doc/fonts/simfang.ttf +python3 kie/predict_kie_token_ser.py \ + --kie_algorithm=LayoutXLM \ + --ser_model_dir=../inference/ser_vi_layoutxlm_xfund_infer \ + --image_dir=./docs/kie/input/zh_val_42.jpg \ + --ser_dict_path=../ppocr/utils/dict/kie_dict/xfund_class_list.txt \ + --vis_font_path=../doc/fonts/simfang.ttf \ + --ocr_order_method="tb-yx" ``` -运行完成后,每张图片会在`output`字段指定的目录下的`vqa`目录下存放可视化之后的图片,图片名和输入图片名一致。 + +运行完成后,每张图片会在`output`字段指定的目录下的`kie`目录下存放可视化之后的图片,图片名和输入图片名一致。 diff --git a/ppstructure/docs/inference_en.md b/ppstructure/docs/inference_en.md index 2a0fb30543eaa06c4ede5f82a443135c959db37d..71019ec70f80e44bc16d2b0d07b0bb93b475b7e7 100644 --- a/ppstructure/docs/inference_en.md +++ b/ppstructure/docs/inference_en.md @@ -1,13 +1,13 @@ # Python Inference -- [1. Structure](#1) +- [1. Layout Structured Analysis](#1) - [1.1 layout analysis + table recognition](#1.1) - [1.2 layout analysis](#1.2) - [1.3 table recognition](#1.3) -- [2. DocVQA](#2) +- [2. Key Information Extraction](#2) -## 1. Structure +## 1. Layout Structured Analysis Go to the `ppstructure` directory ```bash @@ -18,23 +18,26 @@ download model ```bash mkdir inference && cd inference -# Download the PP-OCRv2 text detection model and unzip it -wget https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_slim_quant_infer.tar && tar xf ch_PP-OCRv2_det_slim_quant_infer.tar -# Download the PP-OCRv2 text recognition model and unzip it -wget https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_infer.tar && tar xf ch_PP-OCRv2_rec_slim_quant_infer.tar -# Download the ultra-lightweight English table structure model and unzip it -wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar && tar xf en_ppocr_mobile_v2.0_table_structure_infer.tar +# Download the PP-Structurev2 layout analysis model and unzip it +wget https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_layout_infer.tar && tar xf picodet_lcnet_x1_0_layout_infer.tar +# Download the PP-OCRv3 text detection model and unzip it +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar && tar xf ch_PP-OCRv3_det_infer.tar +# Download the PP-OCRv3 text recognition model and unzip it +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar && tar xf ch_PP-OCRv3_rec_infer.tar +# Download the PP-Structurev2 form recognition model and unzip it +wget https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_infer.tar && tar xf ch_ppstructure_mobile_v2.0_SLANet_infer.tar cd .. 
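# Note (illustrative, not part of the original patch): after extracting the four archives above,
# the inference/ directory is expected to contain the model folders referenced by the commands below:
#   picodet_lcnet_x1_0_layout_infer/   ch_PP-OCRv3_det_infer/
#   ch_PP-OCRv3_rec_infer/             ch_ppstructure_mobile_v2.0_SLANet_infer/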
``` ### 1.1 layout analysis + table recognition ```bash -python3 predict_system.py --det_model_dir=inference/ch_PP-OCRv2_det_slim_quant_infer \ - --rec_model_dir=inference/ch_PP-OCRv2_rec_slim_quant_infer \ - --table_model_dir=inference/en_ppocr_mobile_v2.0_table_structure_infer \ +python3 predict_system.py --det_model_dir=inference/ch_PP-OCRv3_det_infer \ + --rec_model_dir=inference/ch_PP-OCRv3_rec_infer \ + --table_model_dir=inference/ch_ppstructure_mobile_v2.0_SLANet_infer \ + --layout_model_dir=inference/picodet_lcnet_x1_0_layout_infer \ --image_dir=./docs/table/1.png \ --rec_char_dict_path=../ppocr/utils/ppocr_keys_v1.txt \ - --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt \ + --table_char_dict_path=../ppocr/utils/dict/table_structure_dict_ch.txt \ --output=../output \ --vis_font_path=../doc/fonts/simfang.ttf ``` @@ -43,19 +46,23 @@ After the operation is completed, each image will have a directory with the same ### 1.2 layout analysis ```bash -python3 predict_system.py --image_dir=./docs/table/1.png --table=false --ocr=false --output=../output/ +python3 predict_system.py --layout_model_dir=inference/picodet_lcnet_x1_0_layout_infer \ + --image_dir=./docs/table/1.png \ + --output=../output \ + --table=false \ + --ocr=false ``` After the operation is completed, each image will have a directory with the same name in the `structure` directory under the directory specified by the `output` field. Each picture in image will be cropped and saved. The filename of picture area is their coordinates in the image. Layout analysis results will be stored in the `res.txt` file ### 1.3 table recognition ```bash -python3 predict_system.py --det_model_dir=inference/ch_PP-OCRv2_det_slim_quant_infer \ - --rec_model_dir=inference/ch_PP-OCRv2_rec_slim_quant_infer \ - --table_model_dir=inference/en_ppocr_mobile_v2.0_table_structure_infer \ +python3 predict_system.py --det_model_dir=inference/ch_PP-OCRv3_det_infer \ + --rec_model_dir=inference/ch_PP-OCRv3_rec_infer \ + --table_model_dir=inference/ch_ppstructure_mobile_v2.0_SLANet_infer \ --image_dir=./docs/table/table.jpg \ --rec_char_dict_path=../ppocr/utils/ppocr_keys_v1.txt \ - --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt \ + --table_char_dict_path=../ppocr/utils/dict/table_structure_dict_ch.txt \ --output=../output \ --vis_font_path=../doc/fonts/simfang.ttf \ --layout=false @@ -63,19 +70,22 @@ python3 predict_system.py --det_model_dir=inference/ch_PP-OCRv2_det_slim_quant_i After the operation is completed, each image will have a directory with the same name in the `structure` directory under the directory specified by the `output` field. Each table in the image will be stored as an excel. The filename of excel is their coordinates in the image. -## 2. DocVQA +## 2. Key Information Extraction ```bash cd ppstructure -# download model mkdir inference && cd inference -wget https://paddleocr.bj.bcebos.com/pplayout/PP-Layout_v1.0_ser_pretrained.tar && tar xf PP-Layout_v1.0_ser_pretrained.tar +# download model +wget https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_infer.tar && tar -xf ser_vi_layoutxlm_xfund_infer.tar cd .. 
- -python3 predict_system.py --model_name_or_path=vqa/PP-Layout_v1.0_ser_pretrained/ \ - --mode=vqa \ - --image_dir=vqa/images/input/zh_val_0.jpg \ - --vis_font_path=../doc/fonts/simfang.ttf +python3 kie/predict_kie_token_ser.py \ + --kie_algorithm=LayoutXLM \ + --ser_model_dir=../inference/ser_vi_layoutxlm_xfund_infer \ + --image_dir=./docs/kie/input/zh_val_42.jpg \ + --ser_dict_path=../ppocr/utils/dict/kie_dict/xfund_class_list.txt \ + --vis_font_path=../doc/fonts/simfang.ttf \ + --ocr_order_method="tb-yx" ``` -After the operation is completed, each image will store the visualized image in the `vqa` directory under the directory specified by the `output` field, and the image name is the same as the input image name. + +After the operation is completed, each image will store the visualized image in the `kie` directory under the directory specified by the `output` field, and the image name is the same as the input image name. diff --git a/ppstructure/docs/installation.md b/ppstructure/docs/installation.md deleted file mode 100644 index 155baf29de5701b58c9342cf82897b23f4ab7e45..0000000000000000000000000000000000000000 --- a/ppstructure/docs/installation.md +++ /dev/null @@ -1,34 +0,0 @@ -- [快速安装](#快速安装) - - [1. PaddlePaddle 和 PaddleOCR](#1-paddlepaddle-和-paddleocr) - - [2. 安装其他依赖](#2-安装其他依赖) - - [2.1 版面分析所需 Layout-Parser](#21-版面分析所需--layout-parser) - - [2.2 VQA所需依赖](#22--vqa所需依赖) - -# 快速安装 - -## 1. PaddlePaddle 和 PaddleOCR - -可参考[PaddleOCR安装文档](../../doc/doc_ch/installation.md) - -## 2. 安装其他依赖 - -### 2.1 版面分析所需 Layout-Parser - -Layout-Parser 可通过如下命令安装 - -```bash -pip3 install -U https://paddleocr.bj.bcebos.com/whl/layoutparser-0.0.0-py3-none-any.whl -``` -### 2.2 VQA所需依赖 -* paddleocr - -```bash -pip3 install paddleocr -``` - -* PaddleNLP -```bash -git clone https://github.com/PaddlePaddle/PaddleNLP -b develop -cd PaddleNLP -pip3 install -e . -``` diff --git a/ppstructure/docs/kie.md b/ppstructure/docs/kie.md deleted file mode 100644 index 315dd9f7bafa6b6160489eab330e8d278b2d119d..0000000000000000000000000000000000000000 --- a/ppstructure/docs/kie.md +++ /dev/null @@ -1,71 +0,0 @@ -- [关键信息提取(Key Information Extraction)](#关键信息提取key-information-extraction) - - [1. 快速使用](#1-快速使用) - - [2. 执行训练](#2-执行训练) - - [3. 执行评估](#3-执行评估) - - [4. 参考文献](#4-参考文献) - -# 关键信息提取(Key Information Extraction) - -本节介绍PaddleOCR中关键信息提取SDMGR方法的快速使用和训练方法。 - -SDMGR是一个关键信息提取算法,将每个检测到的文本区域分类为预定义的类别,如订单ID、发票号码,金额等。 - - -## 1. 快速使用 - -训练和测试的数据采用wildreceipt数据集,通过如下指令下载数据集: - -``` -wget https://paddleocr.bj.bcebos.com/ppstructure/dataset/wildreceipt.tar && tar xf wildreceipt.tar -``` - -执行预测: - -``` -cd PaddleOCR/ -wget https://paddleocr.bj.bcebos.com/dygraph_v2.1/kie/kie_vgg16.tar && tar xf kie_vgg16.tar -python3.7 tools/infer_kie.py -c configs/kie/kie_unet_sdmgr.yml -o Global.checkpoints=kie_vgg16/best_accuracy Global.infer_img=../wildreceipt/1.txt -``` - -执行预测后的结果保存在`./output/sdmgr_kie/predicts_kie.txt`文件中,可视化结果保存在`/output/sdmgr_kie/kie_results/`目录下。 - -可视化结果如下图所示: - -
- -
- -## 2. 执行训练 - -创建数据集软链到PaddleOCR/train_data目录下: -``` -cd PaddleOCR/ && mkdir train_data && cd train_data - -ln -s ../../wildreceipt ./ -``` - -训练采用的配置文件是configs/kie/kie_unet_sdmgr.yml,配置文件中默认训练数据路径是`train_data/wildreceipt`,准备好数据后,可以通过如下指令执行训练: -``` -python3.7 tools/train.py -c configs/kie/kie_unet_sdmgr.yml -o Global.save_model_dir=./output/kie/ -``` -## 3. 执行评估 - -``` -python3.7 tools/eval.py -c configs/kie/kie_unet_sdmgr.yml -o Global.checkpoints=./output/kie/best_accuracy -``` - - -## 4. 参考文献 - - - -```bibtex -@misc{sun2021spatial, - title={Spatial Dual-Modality Graph Reasoning for Key Information Extraction}, - author={Hongbin Sun and Zhanghui Kuang and Xiaoyu Yue and Chenhao Lin and Wayne Zhang}, - year={2021}, - eprint={2103.14470}, - archivePrefix={arXiv}, - primaryClass={cs.CV} -} -``` diff --git a/ppstructure/docs/vqa/input/zh_val_0.jpg b/ppstructure/docs/kie/input/zh_val_0.jpg similarity index 100% rename from ppstructure/docs/vqa/input/zh_val_0.jpg rename to ppstructure/docs/kie/input/zh_val_0.jpg diff --git a/ppstructure/docs/vqa/input/zh_val_21.jpg b/ppstructure/docs/kie/input/zh_val_21.jpg similarity index 100% rename from ppstructure/docs/vqa/input/zh_val_21.jpg rename to ppstructure/docs/kie/input/zh_val_21.jpg diff --git a/ppstructure/docs/vqa/input/zh_val_40.jpg b/ppstructure/docs/kie/input/zh_val_40.jpg similarity index 100% rename from ppstructure/docs/vqa/input/zh_val_40.jpg rename to ppstructure/docs/kie/input/zh_val_40.jpg diff --git a/ppstructure/docs/vqa/input/zh_val_42.jpg b/ppstructure/docs/kie/input/zh_val_42.jpg similarity index 100% rename from ppstructure/docs/vqa/input/zh_val_42.jpg rename to ppstructure/docs/kie/input/zh_val_42.jpg diff --git a/ppstructure/docs/vqa/result_re/zh_val_21_re.jpg b/ppstructure/docs/kie/result_re/zh_val_21_re.jpg similarity index 100% rename from ppstructure/docs/vqa/result_re/zh_val_21_re.jpg rename to ppstructure/docs/kie/result_re/zh_val_21_re.jpg diff --git a/ppstructure/docs/vqa/result_re/zh_val_40_re.jpg b/ppstructure/docs/kie/result_re/zh_val_40_re.jpg similarity index 100% rename from ppstructure/docs/vqa/result_re/zh_val_40_re.jpg rename to ppstructure/docs/kie/result_re/zh_val_40_re.jpg diff --git a/ppstructure/docs/kie/result_re/zh_val_42_re.jpg b/ppstructure/docs/kie/result_re/zh_val_42_re.jpg new file mode 100644 index 0000000000000000000000000000000000000000..49a0fad352b2b7f507d6bd73c7574f054c58a82e Binary files /dev/null and b/ppstructure/docs/kie/result_re/zh_val_42_re.jpg differ diff --git a/ppstructure/docs/kie/result_re_with_gt_ocr/zh_val_42_re.jpg b/ppstructure/docs/kie/result_re_with_gt_ocr/zh_val_42_re.jpg new file mode 100644 index 0000000000000000000000000000000000000000..03f3769eb257ed3f8444a5380d4f90b0dbf3f509 Binary files /dev/null and b/ppstructure/docs/kie/result_re_with_gt_ocr/zh_val_42_re.jpg differ diff --git a/ppstructure/docs/vqa/result_ser/zh_val_0_ser.jpg b/ppstructure/docs/kie/result_ser/zh_val_0_ser.jpg similarity index 100% rename from ppstructure/docs/vqa/result_ser/zh_val_0_ser.jpg rename to ppstructure/docs/kie/result_ser/zh_val_0_ser.jpg diff --git a/ppstructure/docs/kie/result_ser/zh_val_42_ser.jpg b/ppstructure/docs/kie/result_ser/zh_val_42_ser.jpg new file mode 100644 index 0000000000000000000000000000000000000000..d69d83569bcb05a0dc6584fcc703ad74847111db Binary files /dev/null and b/ppstructure/docs/kie/result_ser/zh_val_42_ser.jpg differ diff --git a/ppstructure/docs/kie/result_ser_with_gt_ocr/zh_val_42_ser.jpg b/ppstructure/docs/kie/result_ser_with_gt_ocr/zh_val_42_ser.jpg new 
file mode 100644 index 0000000000000000000000000000000000000000..3cd35ac41fe42ffe835d1a2a3f1de530df695e93 Binary files /dev/null and b/ppstructure/docs/kie/result_ser_with_gt_ocr/zh_val_42_ser.jpg differ diff --git a/ppstructure/docs/layout/layout.png b/ppstructure/docs/layout/layout.png new file mode 100644 index 0000000000000000000000000000000000000000..da9640e245e34659771353e328bf97da129bd622 Binary files /dev/null and b/ppstructure/docs/layout/layout.png differ diff --git a/ppstructure/docs/layout/layout_res.jpg b/ppstructure/docs/layout/layout_res.jpg new file mode 100644 index 0000000000000000000000000000000000000000..93b3a8bef3bfc9f5c80a9505239af05d526b45a7 Binary files /dev/null and b/ppstructure/docs/layout/layout_res.jpg differ diff --git a/ppstructure/docs/models_list.md b/ppstructure/docs/models_list.md index 42d44009dad1ba1b07bb410c199993c6f79f3d5d..935d12d756eec467574f9ae32d48c70a3ea054c3 100644 --- a/ppstructure/docs/models_list.md +++ b/ppstructure/docs/models_list.md @@ -4,20 +4,23 @@ - [2. OCR和表格识别模型](#2-ocr和表格识别模型) - [2.1 OCR](#21-ocr) - [2.2 表格识别模型](#22-表格识别模型) -- [3. VQA模型](#3-vqa模型) -- [4. KIE模型](#4-kie模型) +- [3. KIE模型](#3-kie模型) ## 1. 版面分析模型 -|模型名称|模型简介|下载地址|label_map| -| --- | --- | --- | --- | -| ppyolov2_r50vd_dcn_365e_publaynet | PubLayNet 数据集训练的版面分析模型,可以划分**文字、标题、表格、图片以及列表**5类区域 | [推理模型](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_publaynet.tar) / [训练模型](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_publaynet_pretrained.pdparams) |{0: "Text", 1: "Title", 2: "List", 3:"Table", 4:"Figure"}| -| ppyolov2_r50vd_dcn_365e_tableBank_word | TableBank Word 数据集训练的版面分析模型,只能检测表格 | [推理模型](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_tableBank_word.tar) | {0:"Table"}| -| ppyolov2_r50vd_dcn_365e_tableBank_latex | TableBank Latex 数据集训练的版面分析模型,只能检测表格 | [推理模型](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_tableBank_latex.tar) | {0:"Table"}| +|模型名称|模型简介|推理模型大小|下载地址|dict path| +| --- | --- | --- | --- | --- | +| picodet_lcnet_x1_0_fgd_layout | 基于PicoDet LCNet_x1_0和FGD蒸馏在PubLayNet 数据集训练的英文版面分析模型,可以划分**文字、标题、表格、图片以及列表**5类区域 | 9.7M | [推理模型](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout.pdparams) | [PubLayNet dict](../../ppocr/utils/dict/layout_dict/layout_publaynet_dict.txt) | +| ppyolov2_r50vd_dcn_365e_publaynet | 基于PP-YOLOv2在PubLayNet数据集上训练的英文版面分析模型 | 221M | [推理模型](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_publaynet.tar) / [训练模型](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_publaynet_pretrained.pdparams) | 同上 | +| picodet_lcnet_x1_0_fgd_layout_cdla | CDLA数据集训练的中文版面分析模型,可以划分为**表格、图片、图片标题、表格、表格标题、页眉、脚本、引用、公式**10类区域 | 9.7M | [推理模型](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_cdla_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_cdla.pdparams) | [CDLA dict](../../ppocr/utils/dict/layout_dict/layout_cdla_dict.txt) | +| picodet_lcnet_x1_0_fgd_layout_table | 表格数据集训练的版面分析模型,支持中英文文档表格区域的检测 | 9.7M | [推理模型](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_table_infer.tar) / 
[训练模型](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_table.pdparams) | [Table dict](../../ppocr/utils/dict/layout_dict/layout_table_dict.txt) | +| ppyolov2_r50vd_dcn_365e_tableBank_word | 基于PP-YOLOv2在TableBank Word 数据集训练的版面分析模型,支持英文文档表格区域的检测 | 221M | [推理模型](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_tableBank_word.tar) | 同上 | +| ppyolov2_r50vd_dcn_365e_tableBank_latex | 基于PP-YOLOv2在TableBank Latex数据集训练的版面分析模型,支持英文文档表格区域的检测 | 221M | [推理模型](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_tableBank_latex.tar) | 同上 | + ## 2. OCR和表格识别模型 @@ -25,8 +28,8 @@ |模型名称|模型简介|推理模型大小|下载地址| | --- | --- | --- | --- | -|en_ppocr_mobile_v2.0_table_det|PubLayNet数据集训练的英文表格场景的文字检测|4.7M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/table/en_ppocr_mobile_v2.0_table_det_train.tar) | -|en_ppocr_mobile_v2.0_table_rec|PubLayNet数据集训练的英文表格场景的文字识别|6.9M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/table/en_ppocr_mobile_v2.0_table_rec_train.tar) | +|en_ppocr_mobile_v2.0_table_det|PubTabNet数据集训练的英文表格场景的文字检测|4.7M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/table/en_ppocr_mobile_v2.0_table_det_train.tar) | +|en_ppocr_mobile_v2.0_table_rec|PubTabNet数据集训练的英文表格场景的文字识别|6.9M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/table/en_ppocr_mobile_v2.0_table_rec_train.tar) | 如需要使用其他OCR模型,可以在 [PP-OCR model_list](../../doc/doc_ch/models_list.md) 下载模型或者使用自己训练好的模型配置到 `det_model_dir`, `rec_model_dir`两个字段即可。 @@ -35,22 +38,31 @@ |模型名称|模型简介|推理模型大小|下载地址| | --- | --- | --- | --- | -|en_ppocr_mobile_v2.0_table_structure|PubTabNet数据集训练的英文表格场景的表格结构预测|18.6M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/table/en_ppocr_mobile_v2.0_table_structure_train.tar) | +|en_ppocr_mobile_v2.0_table_structure|基于TableRec-RARE在PubTabNet数据集上训练的英文表格识别模型|6.8M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/table/en_ppocr_mobile_v2.0_table_structure_train.tar) | +|en_ppstructure_mobile_v2.0_SLANet|基于SLANet在PubTabNet数据集上训练的英文表格识别模型|9.2M|[推理模型](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/en_ppstructure_mobile_v2.0_SLANet_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/en_ppstructure_mobile_v2.0_SLANet_train.tar) | +|ch_ppstructure_mobile_v2.0_SLANet|基于SLANet的中文表格识别模型|9.3M|[推理模型](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_train.tar) | -## 3. 
VQA模型 -|模型名称|模型简介|推理模型大小|下载地址| -| --- | --- | --- | --- | -|ser_LayoutXLM_xfun_zh|基于LayoutXLM在xfun中文数据集上训练的SER模型|1.4G|[推理模型](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh.tar) | -|re_LayoutXLM_xfun_zh|基于LayoutXLM在xfun中文数据集上训练的RE模型|1.4G|[推理模型 coming soon]() / [训练模型](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutXLM_xfun_zh.tar) | -|ser_LayoutLMv2_xfun_zh|基于LayoutLMv2在xfun中文数据集上训练的SER模型|778M|[推理模型](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLMv2_xfun_zh_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLMv2_xfun_zh.tar) | -|re_LayoutLMv2_xfun_zh|基于LayoutLMv2在xfun中文数据集上训练的RE模型|765M|[推理模型 coming soon]() / [训练模型](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutLMv2_xfun_zh.tar) | -|ser_LayoutLM_xfun_zh|基于LayoutLM在xfun中文数据集上训练的SER模型|430M|[推理模型](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLM_xfun_zh_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLM_xfun_zh.tar) | +## 3. KIE模型 - -## 4. KIE模型 +在XFUND_zh数据集上,不同模型的精度与V100 GPU上速度信息如下所示。 -|模型名称|模型简介|模型大小|下载地址| -| --- | --- | --- | --- | -|SDMGR|关键信息提取模型|78M|[推理模型 coming soon]() / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/kie/kie_vgg16.tar)| +|模型名称|模型简介 | 推理模型大小| 精度(hmean) | 预测耗时(ms) | 下载地址| +| --- | --- | --- |--- |--- | --- | +|ser_VI-LayoutXLM_xfund_zh|基于VI-LayoutXLM在xfund中文数据集上训练的SER模型|1.1G| 93.19% | 15.49 | [推理模型](https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_pretrained.tar) | +|re_VI-LayoutXLM_xfund_zh|基于VI-LayoutXLM在xfund中文数据集上训练的RE模型|1.1G| 83.92% | 15.49 |[推理模型 coming soon]() / [训练模型](https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/re_vi_layoutxlm_xfund_pretrained.tar) | +|ser_LayoutXLM_xfund_zh|基于LayoutXLM在xfund中文数据集上训练的SER模型|1.4G| 90.38% | 19.49 |[推理模型](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh.tar) | +|re_LayoutXLM_xfund_zh|基于LayoutXLM在xfund中文数据集上训练的RE模型|1.4G| 74.83% | 19.49 |[推理模型 coming soon]() / [训练模型](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutXLM_xfun_zh.tar) | +|ser_LayoutLMv2_xfund_zh|基于LayoutLMv2在xfund中文数据集上训练的SER模型|778M| 85.44% | 31.46 |[推理模型](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLMv2_xfun_zh_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLMv2_xfun_zh.tar) | +|re_LayoutLMv2_xfund_zh|基于LayoutLMv2在xfun中文数据集上训练的RE模型|765M| 67.77% | 31.46 |[推理模型 coming soon]() / [训练模型](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutLMv2_xfun_zh.tar) | +|ser_LayoutLM_xfund_zh|基于LayoutLM在xfund中文数据集上训练的SER模型|430M| 77.31% | - |[推理模型](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLM_xfun_zh_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLM_xfun_zh.tar) | + +* 注:上述预测耗时信息仅包含了inference模型的推理耗时,没有统计预处理与后处理耗时,测试环境为`V100 GPU + CUDA 10.2 + CUDNN 8.1.1 + TRT 7.2.3.4`。 + +在wildreceipt数据集上,SDMGR模型精度与下载地址如下所示。 + + +|模型名称|模型简介|模型大小|精度|下载地址| +| --- | --- | --- |--- | --- | +|SDMGR|关键信息提取模型|78M| 86.70% | [推理模型 coming soon]() / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/kie/kie_vgg16.tar)| diff --git a/ppstructure/docs/models_list_en.md b/ppstructure/docs/models_list_en.md index e133a0bb2a9b017207b5e92ea444aba4633a7457..291d42f995fdd7fabc293a0e4df35c2249945fd2 100644 --- a/ppstructure/docs/models_list_en.md +++ b/ppstructure/docs/models_list_en.md @@ -4,18 
+4,20 @@ - [2. OCR and Table Recognition](#2-ocr-and-table-recognition) - [2.1 OCR](#21-ocr) - [2.2 Table Recognition](#22-table-recognition) -- [3. VQA](#3-vqa) -- [4. KIE](#4-kie) - +- [3. KIE](#3-kie) + ## 1. Layout Analysis -|model name| description |download|label_map| -| --- |---------------------------------------------------------------------------------------------------------------------------------------------------------| --- | --- | -| ppyolov2_r50vd_dcn_365e_publaynet | The layout analysis model trained on the PubLayNet dataset, the model can recognition 5 types of areas such as **text, title, table, picture and list** | [inference model](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_publaynet.tar) / [trained model](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_publaynet_pretrained.pdparams) |{0: "Text", 1: "Title", 2: "List", 3:"Table", 4:"Figure"}| -| ppyolov2_r50vd_dcn_365e_tableBank_word | The layout analysis model trained on the TableBank Word dataset, the model can only detect tables | [inference model](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_tableBank_word.tar) | {0:"Table"}| -| ppyolov2_r50vd_dcn_365e_tableBank_latex | The layout analysis model trained on the TableBank Latex dataset, the model can only detect tables | [inference model](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_tableBank_latex.tar) | {0:"Table"}| +|model name| description | inference model size |download|dict path| +| --- |---------------------------------------------------------------------------------------------------------------------------------------------------------| --- | --- | --- | +| picodet_lcnet_x1_0_fgd_layout | The layout analysis English model trained on the PubLayNet dataset based on PicoDet LCNet_x1_0 and FGD . 
the model can recognize 5 types of areas such as **Text, Title, Table, Picture and List** | 9.7M | [inference model](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout.pdparams) | [PubLayNet dict](../../ppocr/utils/dict/layout_dict/layout_publaynet_dict.txt) | +| ppyolov2_r50vd_dcn_365e_publaynet | The layout analysis English model trained on the PubLayNet dataset based on PP-YOLOv2 | 221M | [inference model](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_publaynet.tar) / [trained model](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_publaynet_pretrained.pdparams) | same as above | +| picodet_lcnet_x1_0_fgd_layout_cdla | The layout analysis Chinese model trained on the CDLA dataset, the model can recognize 10 types of areas: **Text、Title、Figure、Figure caption、Table、Table caption、Header、Footer、Reference、Equation** | 9.7M | [inference model](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_cdla_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_cdla.pdparams) | [CDLA dict](../../ppocr/utils/dict/layout_dict/layout_cdla_dict.txt) | +| picodet_lcnet_x1_0_fgd_layout_table | The layout analysis model trained on the table dataset, the model can detect tables in Chinese and English documents | 9.7M | [inference model](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_table_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_table.pdparams) | [Table dict](../../ppocr/utils/dict/layout_dict/layout_table_dict.txt) | +| ppyolov2_r50vd_dcn_365e_tableBank_word | The layout analysis model trained on the TableBank Word dataset based on PP-YOLOv2, the model can detect tables in English documents | 221M | [inference model](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_tableBank_word.tar) | same as above | +| ppyolov2_r50vd_dcn_365e_tableBank_latex | The layout analysis model trained on the TableBank Latex dataset based on PP-YOLOv2, the model can detect tables in English documents | 221M | [inference model](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_tableBank_latex.tar) | same as above | ## 2. 
OCR and Table Recognition @@ -35,22 +37,30 @@ If you need to use other OCR models, you can download the model in [PP-OCR model |model| description |inference model size|download| | --- |-----------------------------------------------------------------------------| --- | --- | -|en_ppocr_mobile_v2.0_table_structure| Table structure model for English table scenes trained on PubTabNet dataset |18.6M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/table/en_ppocr_mobile_v2.0_table_structure_train.tar) | +|en_ppocr_mobile_v2.0_table_structure| English table recognition model trained on PubTabNet dataset based on TableRec-RARE |6.8M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/table/en_ppocr_mobile_v2.0_table_structure_train.tar) | +|en_ppstructure_mobile_v2.0_SLANet|English table recognition model trained on PubTabNet dataset based on SLANet|9.2M|[inference model](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/en_ppstructure_mobile_v2.0_SLANet_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/en_ppstructure_mobile_v2.0_SLANet_train.tar) | +|ch_ppstructure_mobile_v2.0_SLANet|Chinese table recognition model based on SLANet|9.3M|[inference model](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_train.tar) | -## 3. VQA - -|model| description |inference model size|download| -| --- |----------------------------------------------------------------| --- | --- | -|ser_LayoutXLM_xfun_zh| SER model trained on xfun Chinese dataset based on LayoutXLM |1.4G|[inference model](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh.tar) | -|re_LayoutXLM_xfun_zh| Re model trained on xfun Chinese dataset based on LayoutXLM |1.4G|[inference model coming soon]() / [trained model](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutXLM_xfun_zh.tar) | -|ser_LayoutLMv2_xfun_zh| SER model trained on xfun Chinese dataset based on LayoutXLMv2 |778M|[inference model](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLMv2_xfun_zh_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLMv2_xfun_zh.tar) | -|re_LayoutLMv2_xfun_zh| Re model trained on xfun Chinese dataset based on LayoutXLMv2 |765M|[inference model coming soon]() / [trained model](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutLMv2_xfun_zh.tar) | -|ser_LayoutLM_xfun_zh| SER model trained on xfun Chinese dataset based on LayoutLM |430M|[inference model](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLM_xfun_zh_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLM_xfun_zh.tar) | - - -## 4. KIE - -|model|description|model size|download| -| --- | --- | --- | --- | -|SDMGR|Key Information Extraction Model|78M|[inference model coming soon]() / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/kie/kie_vgg16.tar)| +## 3. KIE + +On XFUND_zh dataset, Accuracy and time cost of different models on V100 GPU are as follows. 
+ +|Model|Backbone|Task|Config|Hmean|Time cost(ms)|Download link| +| --- | --- | --- | --- | --- | --- |--- | +|VI-LayoutXLM| VI-LayoutXLM-base | SER | [ser_vi_layoutxlm_xfund_zh_udml.yml](../../configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh_udml.yml)|**93.19%**| 15.49| [trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_pretrained.tar)| +|LayoutXLM| LayoutXLM-base | SER | [ser_layoutxlm_xfund_zh.yml](../../configs/kie/layoutlm_series/ser_layoutxlm_xfund_zh.yml)|90.38%| 19.49 |[trained model](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh.tar)| +|LayoutLM| LayoutLM-base | SER | [ser_layoutlm_xfund_zh.yml](../../configs/kie/layoutlm_series/ser_layoutlm_xfund_zh.yml)|77.31%|-|[trained model](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLM_xfun_zh.tar)| +|LayoutLMv2| LayoutLMv2-base | SER | [ser_layoutlmv2_xfund_zh.yml](../../configs/kie/layoutlm_series/ser_layoutlmv2_xfund_zh.yml)|85.44%|31.46|[trained model](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLMv2_xfun_zh.tar)| +|VI-LayoutXLM| VI-LayoutXLM-base | RE | [re_vi_layoutxlm_xfund_zh_udml.yml](../../configs/kie/vi_layoutxlm/re_vi_layoutxlm_xfund_zh_udml.yml)|**83.92%**|15.49|[trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/re_vi_layoutxlm_xfund_pretrained.tar)| +|LayoutXLM| LayoutXLM-base | RE | [re_layoutxlm_xfund_zh.yml](../../configs/kie/layoutlm_series/re_layoutxlm_xfund_zh.yml)|74.83%|19.49|[trained model](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutXLM_xfun_zh.tar)| +|LayoutLMv2| LayoutLMv2-base | RE | [re_layoutlmv2_xfund_zh.yml](../../configs/kie/layoutlm_series/re_layoutlmv2_xfund_zh.yml)|67.77%|31.46|[trained model](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutLMv2_xfun_zh.tar)| + +* Note: The above time cost information just considers inference time without preprocess or postprocess, test environment: `V100 GPU + CUDA 10.2 + CUDNN 8.1.1 + TRT 7.2.3.4` + + +On wildreceipt dataset, the algorithm result is as follows: + +|Model|Backbone|Config|Hmean|Download link| +| --- | --- | --- | --- | --- | +|SDMGR|VGG6|[configs/kie/sdmgr/kie_unet_sdmgr.yml](../../configs/kie/sdmgr/kie_unet_sdmgr.yml)|86.7%|[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/kie/kie_vgg16.tar)| diff --git a/ppstructure/docs/ppstructurev2_pipeline.png b/ppstructure/docs/ppstructurev2_pipeline.png new file mode 100644 index 0000000000000000000000000000000000000000..b53a290a6dbc396449374cc694dd01c304325739 Binary files /dev/null and b/ppstructure/docs/ppstructurev2_pipeline.png differ diff --git a/ppstructure/docs/quickstart.md b/ppstructure/docs/quickstart.md index 31e59416247b4f0e6b6d82fb13e0d3841a113a5f..60642f78b6691c3ac2eeba99680a2af23299ddc9 100644 --- a/ppstructure/docs/quickstart.md +++ b/ppstructure/docs/quickstart.md @@ -1,80 +1,127 @@ # PP-Structure 快速开始 -- [1. 安装依赖包](#1) -- [2. 便捷使用](#2) - - [2.1 命令行使用](#21) - - [2.1.1 版面分析+表格识别](#211) - - [2.1.2 版面分析](#212) - - [2.1.3 表格识别](#213) - - [2.1.4 DocVQA](#214) - - [2.2 代码使用](#22) - - [2.2.1 版面分析+表格识别](#221) - - [2.2.2 版面分析](#222) - - [2.2.3 表格识别](#223) - - [2.2.4 DocVQA](#224) - - [2.3 返回结果说明](#23) - - [2.3.1 版面分析+表格识别](#231) - - [2.3.2 DocVQA](#232) - - [2.4 参数说明](#24) - +- [1. 准备环境](#1-准备环境) +- [2. 
便捷使用](#2-便捷使用) + - [2.1 命令行使用](#21-命令行使用) + - [2.1.1 图像方向分类+版面分析+表格识别](#211-图像方向分类版面分析表格识别) + - [2.1.2 版面分析+表格识别](#212-版面分析表格识别) + - [2.1.3 版面分析](#213-版面分析) + - [2.1.4 表格识别](#214-表格识别) + - [2.1.5 关键信息抽取](#215-关键信息抽取) + - [2.1.6 版面恢复](#216-版面恢复) + - [2.2 Python脚本使用](#22-Python脚本使用) + - [2.2.1 图像方向分类+版面分析+表格识别](#221-图像方向分类版面分析表格识别) + - [2.2.2 版面分析+表格识别](#222-版面分析表格识别) + - [2.2.3 版面分析](#223-版面分析) + - [2.2.4 表格识别](#224-表格识别) + - [2.2.5 关键信息抽取](#225-关键信息抽取) + - [2.2.6 版面恢复](#226-版面恢复) + - [2.3 返回结果说明](#23-返回结果说明) + - [2.3.1 版面分析+表格识别](#231-版面分析表格识别) + - [2.3.2 关键信息抽取](#232-关键信息抽取) + - [2.4 参数说明](#24-参数说明) +- [3. 小结](#3-小结) -## 1. 安装依赖包 +## 1. 准备环境 +### 1.1 安装PaddlePaddle +> 如果您没有基础的Python运行环境,请参考[运行环境准备](../../doc/doc_ch/environment.md)。 + +- 您的机器安装的是CUDA9或CUDA10,请运行以下命令安装 + + ```bash + python3 -m pip install paddlepaddle-gpu -i https://mirror.baidu.com/pypi/simple + ``` + +- 您的机器是CPU,请运行以下命令安装 + + ```bash + python3 -m pip install paddlepaddle -i https://mirror.baidu.com/pypi/simple + ``` + +更多的版本需求,请参照[飞桨官网安装文档](https://www.paddlepaddle.org.cn/install/quick)中的说明进行操作。 + +### 1.2 安装PaddleOCR whl包 ```bash -# 安装 paddleocr,推荐使用2.5+版本 -pip3 install "paddleocr>=2.5" -# 安装 版面分析依赖包layoutparser(如不需要版面分析功能,可跳过) -pip3 install -U https://paddleocr.bj.bcebos.com/whl/layoutparser-0.0.0-py3-none-any.whl -# 安装 DocVQA依赖包paddlenlp(如不需要DocVQA功能,可跳过) -pip install paddlenlp +# 安装 paddleocr,推荐使用2.6版本 +pip3 install "paddleocr>=2.6" + +# 安装 图像方向分类依赖包paddleclas(如不需要图像方向分类功能,可跳过) +pip3 install paddleclas>=2.4.3 +# 安装 关键信息抽取 依赖包(如不需要KIE功能,可跳过) +pip3 install -r ppstructure/kie/requirements.txt + +# 安装 版面恢复 依赖包(如不需要版面恢复功能,可跳过) +pip3 install -r ppstructure/recovery/requirements.txt ``` + ## 2. 便捷使用 ### 2.1 命令行使用 -#### 2.1.1 版面分析+表格识别 +#### 2.1.1 图像方向分类+版面分析+表格识别 ```bash -paddleocr --image_dir=PaddleOCR/ppstructure/docs/table/1.png --type=structure +paddleocr --image_dir=ppstructure/docs/table/1.png --type=structure --image_orientation=true ``` -#### 2.1.2 版面分析 +#### 2.1.2 版面分析+表格识别 ```bash -paddleocr --image_dir=PaddleOCR/ppstructure/docs/table/1.png --type=structure --table=false --ocr=false +paddleocr --image_dir=ppstructure/docs/table/1.png --type=structure ``` -#### 2.1.3 表格识别 +#### 2.1.3 版面分析 ```bash -paddleocr --image_dir=PaddleOCR/ppstructure/docs/table/table.jpg --type=structure --layout=false +paddleocr --image_dir=ppstructure/docs/table/1.png --type=structure --table=false --ocr=false ``` -#### 2.1.4 DocVQA +#### 2.1.4 表格识别 +```bash +paddleocr --image_dir=ppstructure/docs/table/table.jpg --type=structure --layout=false +``` + + -请参考:[文档视觉问答](../vqa/README.md)。 +#### 2.1.5 关键信息抽取 +关键信息抽取暂不支持通过whl包调用,详细使用教程请参考:[关键信息抽取教程](../kie/README_ch.md)。 + + + +#### 2.1.6 版面恢复 + +```bash +# 中文测试图 +paddleocr --image_dir=ppstructure/docs/table/1.png --type=structure --recovery=true +# 英文测试图 +paddleocr --image_dir=ppstructure/docs/table/1.png --type=structure --recovery=true --lang='en' +# pdf测试文件 +paddleocr --image_dir=ppstructure/recovery/UnrealText.pdf --type=structure --recovery=true --lang='en' +``` -### 2.2 代码使用 + +### 2.2 Python脚本使用 -#### 2.2.1 版面分析+表格识别 +#### 2.2.1 图像方向分类+版面分析+表格识别 ```python import os import cv2 from paddleocr import PPStructure,draw_structure_result,save_structure_res -table_engine = PPStructure(show_log=True) +table_engine = PPStructure(show_log=True, image_orientation=True) save_folder = './output' -img_path = 'PaddleOCR/ppstructure/docs/table/1.png' +img_path = 'ppstructure/docs/table/1.png' img = cv2.imread(img_path) result = table_engine(img) save_structure_res(result, 
save_folder,os.path.basename(img_path).split('.')[0]) @@ -85,7 +132,7 @@ for line in result: from PIL import Image -font_path = 'PaddleOCR/doc/fonts/simfang.ttf' # PaddleOCR下提供字体包 +font_path = 'doc/fonts/simfang.ttf' # PaddleOCR下提供字体包 image = Image.open(img_path).convert('RGB') im_show = draw_structure_result(image, result,font_path=font_path) im_show = Image.fromarray(im_show) @@ -93,7 +140,36 @@ im_show.save('result.jpg') ``` -#### 2.2.2 版面分析 +#### 2.2.2 版面分析+表格识别 + +```python +import os +import cv2 +from paddleocr import PPStructure,draw_structure_result,save_structure_res + +table_engine = PPStructure(show_log=True) + +save_folder = './output' +img_path = 'ppstructure/docs/table/1.png' +img = cv2.imread(img_path) +result = table_engine(img) +save_structure_res(result, save_folder,os.path.basename(img_path).split('.')[0]) + +for line in result: + line.pop('img') + print(line) + +from PIL import Image + +font_path = 'doc/fonts/simfang.ttf' # PaddleOCR下提供字体包 +image = Image.open(img_path).convert('RGB') +im_show = draw_structure_result(image, result,font_path=font_path) +im_show = Image.fromarray(im_show) +im_show.save('result.jpg') +``` + + +#### 2.2.3 版面分析 ```python import os @@ -103,7 +179,7 @@ from paddleocr import PPStructure,save_structure_res table_engine = PPStructure(table=False, ocr=False, show_log=True) save_folder = './output' -img_path = 'PaddleOCR/ppstructure/docs/table/1.png' +img_path = 'ppstructure/docs/table/1.png' img = cv2.imread(img_path) result = table_engine(img) save_structure_res(result, save_folder, os.path.basename(img_path).split('.')[0]) @@ -113,8 +189,9 @@ for line in result: print(line) ``` - -#### 2.2.3 表格识别 + + +#### 2.2.4 表格识别 ```python import os @@ -124,7 +201,7 @@ from paddleocr import PPStructure,save_structure_res table_engine = PPStructure(layout=False, show_log=True) save_folder = './output' -img_path = 'PaddleOCR/ppstructure/docs/table/table.jpg' +img_path = 'ppstructure/docs/table/table.jpg' img = cv2.imread(img_path) result = table_engine(img) save_structure_res(result, save_folder, os.path.basename(img_path).split('.')[0]) @@ -134,14 +211,44 @@ for line in result: print(line) ``` - -#### 2.2.4 DocVQA + +#### 2.2.5 关键信息抽取 + +关键信息抽取暂不支持通过whl包调用,详细使用教程请参考:[关键信息抽取教程](../kie/README_ch.md)。 + + + +#### 2.2.6 版面恢复 -请参考:[文档视觉问答](../vqa/README.md)。 +```python +import os +import cv2 +from paddleocr import PPStructure,save_structure_res +from paddleocr.ppstructure.recovery.recovery_to_doc import sorted_layout_boxes, convert_info_docx + +# 中文测试图 +table_engine = PPStructure(recovery=True) +# 英文测试图 +# table_engine = PPStructure(recovery=True, lang='en') + +save_folder = './output' +img_path = 'ppstructure/docs/table/1.png' +img = cv2.imread(img_path) +result = table_engine(img) +save_structure_res(result, save_folder, os.path.basename(img_path).split('.')[0]) + +for line in result: + line.pop('img') + print(line) + +h, w, _ = img.shape +res = sorted_layout_boxes(result, w) +convert_info_docx(img, res, save_folder, os.path.basename(img_path).split('.')[0]) +``` ### 2.3 返回结果说明 -PP-Structure的返回结果为一个dict组成的list,示例如下 +PP-Structure的返回结果为一个dict组成的list,示例如下: #### 2.3.1 版面分析+表格识别 @@ -154,12 +261,12 @@ PP-Structure的返回结果为一个dict组成的list,示例如下 } ] ``` -dict 里各个字段说明如下 +dict 里各个字段说明如下: -| 字段 | 说明 | -| --------------- 
|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -|type| 图片区域的类型 | -|bbox| 图片区域的在原图的坐标,分别[左上角x,左上角y,右下角x,右下角y] | +| 字段 | 说明| +| --- |---| +|type| 图片区域的类型 | +|bbox| 图片区域的在原图的坐标,分别[左上角x,左上角y,右下角x,右下角y]| |res| 图片区域的OCR或表格识别结果。
表格: 一个dict,字段说明如下
        `html`: 表格的HTML字符串
        在代码使用模式下,前向传入return_ocr_result_in_table=True可以拿到表格中每个文本的检测识别结果,对应为如下字段:
        `boxes`: 文本检测坐标
        `rec_res`: 文本识别结果。
OCR: 一个包含各个单行文字的检测坐标和识别结果的元组 | 运行完成后,每张图片会在`output`字段指定的目录下有一个同名目录,图片里的每个表格会存储为一个excel,图片区域会被裁剪之后保存下来,excel文件和图片名为表格在图片里的坐标。 @@ -173,27 +280,39 @@ dict 里各个字段说明如下 ``` -#### 2.3.2 DocVQA +#### 2.3.2 关键信息抽取 -请参考:[文档视觉问答](../vqa/README.md)。 +请参考:[关键信息抽取教程](../kie/README_ch.md)。 ### 2.4 参数说明 -| 字段 | 说明 | 默认值 | -|----------------------|----------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------| -| output | excel和识别结果保存的地址 | ./output/table | -| table_max_len | 表格结构模型预测时,图像的长边resize尺度 | 488 | -| table_model_dir | 表格结构模型 inference 模型地址 | None | -| table_char_dict_path | 表格结构模型所用字典地址 | ../ppocr/utils/dict/table_structure_dict.txt | -| layout_path_model | 版面分析模型模型地址,可以为在线地址或者本地地址,当为本地地址时,需要指定 layout_label_map, 命令行模式下可通过--layout_label_map='{0: "Text", 1: "Title", 2: "List", 3:"Table", 4:"Figure"}' 指定 | lp://PubLayNet/ppyolov2_r50vd_dcn_365e_publaynet/config | -| layout_label_map | 版面分析模型模型label映射字典 | None | -| model_name_or_path | VQA SER模型地址 | None | -| max_seq_length | VQA SER模型最大支持token长度 | 512 | -| label_map_path | VQA SER 标签文件地址 | ./vqa/labels/labels_ser.txt | -| layout | 前向中是否执行版面分析 | True | -| table | 前向中是否执行表格识别 | True | -| ocr | 对于版面分析中的非表格区域,是否执行ocr。当layout为False时会被自动设置为False | True | -| structure_version | 表格结构化模型版本,可选 PP-STRUCTURE。PP-STRUCTURE支持表格结构化模型 | PP-STRUCTURE | +| 字段 | 说明 | 默认值 | +|---|---|---| +| output | 结果保存地址 | ./output/table | +| table_max_len | 表格结构模型预测时,图像的长边resize尺度 | 488 | +| table_model_dir | 表格结构模型 inference 模型地址| None | +| table_char_dict_path | 表格结构模型所用字典地址 | ../ppocr/utils/dict/table_structure_dict.txt | +| merge_no_span_structure | 表格识别模型中,是否对'\'和'\' 进行合并 | False | +| layout_model_dir | 版面分析模型 inference 模型地址 | None | +| layout_dict_path | 版面分析模型字典| ../ppocr/utils/dict/layout_publaynet_dict.txt | +| layout_score_threshold | 版面分析模型检测框阈值| 0.5| +| layout_nms_threshold | 版面分析模型nms阈值| 0.5| +| kie_algorithm | kie模型算法| LayoutXLM| +| ser_model_dir | ser模型 inference 模型地址| None| +| ser_dict_path | ser模型字典| ../train_data/XFUND/class_list_xfun.txt| +| mode | structure or kie | structure | +| image_orientation | 前向中是否执行图像方向分类 | False | +| layout | 前向中是否执行版面分析 | True | +| table | 前向中是否执行表格识别 | True | +| ocr | 对于版面分析中的非表格区域,是否执行ocr。当layout为False时会被自动设置为False| True | +| recovery | 前向中是否执行版面恢复| False | +| save_pdf | 版面恢复导出docx文件的同时,是否导出pdf文件 | False | +| structure_version | 模型版本,可选 PP-structure和PP-structurev2 | PP-structure | 大部分参数和PaddleOCR whl包保持一致,见 [whl包文档](../../doc/doc_ch/whl.md) + + +## 3. 小结 + +通过本节内容,相信您已经熟练掌握通过PaddleOCR whl包调用PP-Structure相关功能的使用方法,您可以参考[文档教程](../../README_ch.md#文档教程),获取包括模型训练、推理部署等更详细的使用教程。 diff --git a/ppstructure/docs/quickstart_en.md b/ppstructure/docs/quickstart_en.md index 1f78b43ea3334648a37a37745737a6a26e27ece3..e0eec4b38ba57b1bebd0e711093e5dfd4773fdd9 100644 --- a/ppstructure/docs/quickstart_en.md +++ b/ppstructure/docs/quickstart_en.md @@ -1,80 +1,122 @@ # PP-Structure Quick Start -- [1. Install package](#1) -- [2. 
Use](#2) - - [2.1 Use by command line](#21) - - [2.1.1 layout analysis + table recognition](#211) - - [2.1.2 layout analysis](#212) - - [2.1.3 table recognition](#213) - - [2.1.4 DocVQA](#214) - - [2.2 Use by code](#22) - - [2.2.1 layout analysis + table recognition](#221) - - [2.2.2 layout analysis](#222) - - [2.2.3 table recognition](#223) - - [2.2.4 DocVQA](#224) - - [2.3 Result description](#23) - - [2.3.1 layout analysis + table recognition](#231) - - [2.3.2 DocVQA](#232) - - [2.4 Parameter Description](#24) +- [1. Environment Preparation](#1-environment-preparation) +- [2. Quick Use](#2-quick-use) + - [2.1 Use by command line](#21-use-by-command-line) + - [2.1.1 image orientation + layout analysis + table recognition](#211-image-orientation--layout-analysis--table-recognition) + - [2.1.2 layout analysis + table recognition](#212-layout-analysis--table-recognition) + - [2.1.3 layout analysis](#213-layout-analysis) + - [2.1.4 table recognition](#214-table-recognition) + - [2.1.5 Key Information Extraction](#215-Key-Information-Extraction) + - [2.1.6 layout recovery](#216-layout-recovery) + - [2.2 Use by python script](#22-use-by-python-script) + - [2.2.1 image orientation + layout analysis + table recognition](#221-image-orientation--layout-analysis--table-recognition) + - [2.2.2 layout analysis + table recognition](#222-layout-analysis--table-recognition) + - [2.2.3 layout analysis](#223-layout-analysis) + - [2.2.4 table recognition](#224-table-recognition) + - [2.2.5 Key Information Extraction](#225-Key-Information-Extraction) + - [2.2.6 layout recovery](#226-layout-recovery) + - [2.3 Result description](#23-result-description) + - [2.3.1 layout analysis + table recognition](#231-layout-analysis--table-recognition) + - [2.3.2 Key Information Extraction](#232-Key-Information-Extraction) + - [2.4 Parameter Description](#24-parameter-description) +- [3. Summary](#3-summary) -## 1. Install package +## 1. Environment Preparation +### 1.1 Install PaddlePaddle + +> If you do not have a Python environment, please refer to [Environment Preparation](./environment_en.md). + +- If you have CUDA 9 or CUDA 10 installed on your machine, please run the following command to install + + ```bash + python3 -m pip install paddlepaddle-gpu -i https://mirror.baidu.com/pypi/simple + ``` + +- If you have no available GPU on your machine, please run the following command to install the CPU version + + ```bash + python3 -m pip install paddlepaddle -i https://mirror.baidu.com/pypi/simple + ``` + +For more software version requirements, please refer to the instructions in [Installation Document](https://www.paddlepaddle.org.cn/install/quick) for operation. 
+ +### 1.2 Install PaddleOCR Whl Package ```bash -# Install paddleocr, version 2.5+ is recommended -pip3 install "paddleocr>=2.5" -# Install layoutparser (if you do not use the layout analysis, you can skip it) -pip3 install -U https://paddleocr.bj.bcebos.com/whl/layoutparser-0.0.0-py3-none-any.whl -# Install the DocVQA dependency package paddlenlp (if you do not use the DocVQA, you can skip it) -pip install paddlenlp +# Install paddleocr, version 2.6 is recommended +pip3 install "paddleocr>=2.6" +# Install the image direction classification dependency package paddleclas (if you do not use the image direction classification, you can skip it) +pip3 install paddleclas>=2.4.3 + +# Install the KIE dependency packages (if you do not use the KIE, you can skip it) +pip3 install -r kie/requirements.txt + +# Install the layout recovery dependency packages (if you do not use the layout recovery, you can skip it) +pip3 install -r recovery/requirements.txt ``` -## 2. Use + +## 2. Quick Use ### 2.1 Use by command line -#### 2.1.1 layout analysis + table recognition +#### 2.1.1 image orientation + layout analysis + table recognition ```bash -paddleocr --image_dir=PaddleOCR/ppstructure/docs/table/1.png --type=structure +paddleocr --image_dir=ppstructure/docs/table/1.png --type=structure --image_orientation=true ``` -#### 2.1.2 layout analysis +#### 2.1.2 layout analysis + table recognition ```bash -paddleocr --image_dir=PaddleOCR/ppstructure/docs/table/1.png --type=structure --table=false --ocr=false +paddleocr --image_dir=ppstructure/docs/table/1.png --type=structure ``` -#### 2.1.3 table recognition +#### 2.1.3 layout analysis ```bash -paddleocr --image_dir=PaddleOCR/ppstructure/docs/table/table.jpg --type=structure --layout=false +paddleocr --image_dir=ppstructure/docs/table/1.png --type=structure --table=false --ocr=false ``` -#### 2.1.4 DocVQA +#### 2.1.4 table recognition +```bash +paddleocr --image_dir=ppstructure/docs/table/table.jpg --type=structure --layout=false +``` -Please refer to: [Documentation Visual Q&A](../vqa/README.md) . + + +#### 2.1.5 Key Information Extraction + +Key information extraction does not currently support use by the whl package. For detailed usage tutorials, please refer to: [Key Information Extraction](../kie/README.md). 
+ + +#### 2.1.6 layout recovery +``` +paddleocr --image_dir=ppstructure/docs/table/1.png --type=structure --recovery=true --lang='en' +``` -### 2.2 Use by code +### 2.2 Use by python script -#### 2.2.1 layout analysis + table recognition +#### 2.2.1 image orientation + layout analysis + table recognition ```python import os import cv2 from paddleocr import PPStructure,draw_structure_result,save_structure_res -table_engine = PPStructure(show_log=True) +table_engine = PPStructure(show_log=True, image_orientation=True) save_folder = './output' -img_path = 'PaddleOCR/ppstructure/docs/table/1.png' +img_path = 'ppstructure/docs/table/1.png' img = cv2.imread(img_path) result = table_engine(img) save_structure_res(result, save_folder,os.path.basename(img_path).split('.')[0]) @@ -85,7 +127,7 @@ for line in result: from PIL import Image -font_path = 'PaddleOCR/doc/fonts/simfang.ttf' # PaddleOCR下提供字体包 +font_path = 'doc/fonts/simfang.ttf' # PaddleOCR下提供字体包 image = Image.open(img_path).convert('RGB') im_show = draw_structure_result(image, result,font_path=font_path) im_show = Image.fromarray(im_show) @@ -93,7 +135,36 @@ im_show.save('result.jpg') ``` -#### 2.2.2 layout analysis +#### 2.2.2 layout analysis + table recognition + +```python +import os +import cv2 +from paddleocr import PPStructure,draw_structure_result,save_structure_res + +table_engine = PPStructure(show_log=True) + +save_folder = './output' +img_path = 'ppstructure/docs/table/1.png' +img = cv2.imread(img_path) +result = table_engine(img) +save_structure_res(result, save_folder,os.path.basename(img_path).split('.')[0]) + +for line in result: + line.pop('img') + print(line) + +from PIL import Image + +font_path = 'doc/fonts/simfang.ttf' # font provieded in PaddleOCR +image = Image.open(img_path).convert('RGB') +im_show = draw_structure_result(image, result,font_path=font_path) +im_show = Image.fromarray(im_show) +im_show.save('result.jpg') +``` + + +#### 2.2.3 layout analysis ```python import os @@ -103,7 +174,7 @@ from paddleocr import PPStructure,save_structure_res table_engine = PPStructure(table=False, ocr=False, show_log=True) save_folder = './output' -img_path = 'PaddleOCR/ppstructure/docs/table/1.png' +img_path = 'ppstructure/docs/table/1.png' img = cv2.imread(img_path) result = table_engine(img) save_structure_res(result, save_folder, os.path.basename(img_path).split('.')[0]) @@ -113,8 +184,8 @@ for line in result: print(line) ``` - -#### 2.2.3 table recognition + +#### 2.2.4 table recognition ```python import os @@ -124,7 +195,7 @@ from paddleocr import PPStructure,save_structure_res table_engine = PPStructure(layout=False, show_log=True) save_folder = './output' -img_path = 'PaddleOCR/ppstructure/docs/table/table.jpg' +img_path = 'ppstructure/docs/table/table.jpg' img = cv2.imread(img_path) result = table_engine(img) save_structure_res(result, save_folder, os.path.basename(img_path).split('.')[0]) @@ -134,10 +205,39 @@ for line in result: print(line) ``` - -#### 2.2.4 DocVQA + +#### 2.2.5 Key Information Extraction -Please refer to: [Documentation Visual Q&A](../vqa/README.md) . +Key information extraction does not currently support use by the whl package. For detailed usage tutorials, please refer to: [Key Information Extraction](../kie/README.md). 
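As a follow-up to the table recognition example in 2.2.4, the sketch below is an illustrative addition (not part of the original guide) showing how the recognized table HTML can be pulled out of the returned regions; it assumes the result format described later in section 2.3.1:

```python
# Illustrative sketch: save the HTML recognized for each table region returned by PPStructure.
# Assumes the result format described in section 2.3.1: a list of dicts with 'type' and 'res',
# where table regions expose the reconstructed table HTML under res['html'].
import os

import cv2
from paddleocr import PPStructure

table_engine = PPStructure(layout=False, show_log=True)

img = cv2.imread('ppstructure/docs/table/table.jpg')
result = table_engine(img)

os.makedirs('./output', exist_ok=True)
for i, region in enumerate(result):
    if region['type'].lower() == 'table':  # table regions carry the recognized HTML
        with open('./output/table_{}.html'.format(i), 'w', encoding='utf-8') as f:
            f.write(region['res']['html'])
```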
+ + +#### 2.2.6 layout recovery + +```python +import os +import cv2 +from paddleocr import PPStructure,save_structure_res +from paddleocr.ppstructure.recovery.recovery_to_doc import sorted_layout_boxes, convert_info_docx + +# Chinese image +table_engine = PPStructure(recovery=True) +# English image +# table_engine = PPStructure(recovery=True, lang='en') + +save_folder = './output' +img_path = 'ppstructure/docs/table/1.png' +img = cv2.imread(img_path) +result = table_engine(img) +save_structure_res(result, save_folder, os.path.basename(img_path).split('.')[0]) + +for line in result: + line.pop('img') + print(line) + +h, w, _ = img.shape +res = sorted_layout_boxes(result, w) +convert_info_docx(img, res, save_folder, os.path.basename(img_path).split('.')[0]) +``` ### 2.3 Result description @@ -157,10 +257,10 @@ The return of PP-Structure is a list of dicts, the example is as follows: ``` Each field in dict is described as follows: -| field | description | -| --------------- |--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -|type| Type of image area. | -|bbox| The coordinates of the image area in the original image, respectively [upper left corner x, upper left corner y, lower right corner x, lower right corner y]. | +| field | description | +| --- |---| +|type| Type of image area. | +|bbox| The coordinates of the image area in the original image, respectively [upper left corner x, upper left corner y, lower right corner x, lower right corner y]. | |res| OCR or table recognition result of the image area.
table: a dict with field descriptions as follows:
        `html`: HTML string of the table.
        In the code usage mode, set return_ocr_result_in_table=True when calling, so that the detection and recognition results of each text inside the table area are also returned (see the usage sketch after the summary below), corresponding to the following fields:
        `boxes`: text detection boxes.
        `rec_res`: text recognition results.
OCR: A tuple containing the detection boxes and recognition results of each single text. | After the recognition is completed, each image will have a directory with the same name under the directory specified by the `output` field. Each table in the image will be stored as an excel, and the picture area will be cropped and saved. The filename of excel and picture is their coordinates in the image. @@ -173,26 +273,39 @@ After the recognition is completed, each image will have a directory with the sa ``` -#### 2.3.2 DocVQA +#### 2.3.2 Key Information Extraction -Please refer to: [Documentation Visual Q&A](../vqa/README.md) . +Please refer to: [Key Information Extraction](../kie/README.md) . ### 2.4 Parameter Description -| field | description | default | -|----------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------| -| output | The save path of result | ./output/table | -| table_max_len | When the table structure model predicts, the long side of the image | 488 | -| table_model_dir | the path of table structure model | None | -| table_char_dict_path | the dict path of table structure model | ../ppocr/utils/dict/table_structure_dict.txt | -| layout_path_model | The model path of the layout analysis model, which can be an online address or a local path. When it is a local path, layout_label_map needs to be set. In command line mode, use --layout_label_map='{0: "Text", 1: "Title", 2: "List", 3:"Table", 4:"Figure"}' | lp://PubLayNet/ppyolov2_r50vd_dcn_365e_publaynet/config | -| layout_label_map | Layout analysis model model label mapping dictionary path | None | -| model_name_or_path | the model path of VQA SER model | None | -| max_seq_length | the max token length of VQA SER model | 512 | -| label_map_path | the label path of VQA SER model | ./vqa/labels/labels_ser.txt | -| layout | Whether to perform layout analysis in forward | True | -| table | Whether to perform table recognition in forward | True | -| ocr | Whether to perform ocr for non-table areas in layout analysis. 
When layout is False, it will be automatically set to False | True | -| structure_version | table structure Model version number, the current model support list is as follows: PP-STRUCTURE support english table structure model | PP-STRUCTURE | +| field | description | default | +|---|---|---| +| output | result save path | ./output/table | +| table_max_len | long side of the image resize in table structure model | 488 | +| table_model_dir | Table structure model inference model path| None | +| table_char_dict_path | The dictionary path of table structure model | ../ppocr/utils/dict/table_structure_dict.txt | +| merge_no_span_structure | In the table recognition model, whether to merge '\' and '\' | False | +| layout_model_dir | Layout analysis model inference model path| None | +| layout_dict_path | The dictionary path of layout analysis model| ../ppocr/utils/dict/layout_publaynet_dict.txt | +| layout_score_threshold | The box threshold path of layout analysis model| 0.5| +| layout_nms_threshold | The nms threshold path of layout analysis model| 0.5| +| kie_algorithm | kie model algorithm| LayoutXLM| +| ser_model_dir | Ser model inference model path| None| +| ser_dict_path | The dictionary path of Ser model| ../train_data/XFUND/class_list_xfun.txt| +| mode | structure or kie | structure | +| image_orientation | Whether to perform image orientation classification in forward | False | +| layout | Whether to perform layout analysis in forward | True | +| table | Whether to perform table recognition in forward | True | +| ocr | Whether to perform ocr for non-table areas in layout analysis. When layout is False, it will be automatically set to False| True | +| recovery | Whether to perform layout recovery in forward| False | +| save_pdf | Whether to convert docx to pdf when recovery| False | +| structure_version | Structure version, optional PP-structure and PP-structurev2 | PP-structure | + Most of the parameters are consistent with the PaddleOCR whl package, see [whl package documentation](../../doc/doc_en/whl.md) + + +## 3. Summary + +Through the content in this section, you can master the use of PP-Structure related functions through PaddleOCR whl package. Please refer to [documentation tutorial](../../README.md) for more detailed usage tutorials including model training, inference and deployment, etc. 
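+
+As a closing usage sketch, the snippet below ties together the result fields from section 2.3 and the `return_ocr_result_in_table` note above. It assumes the whl-package API shown in the earlier examples (`PPStructure`, `save_structure_res`) and that `return_ocr_result_in_table` is accepted as a constructor argument; treat it as a minimal illustration rather than a reference implementation.
+
+```python
+import os
+import cv2
+from paddleocr import PPStructure, save_structure_res
+
+# return_ocr_result_in_table=True also returns the per-text `boxes`/`rec_res`
+# of every table region, as described in section 2.3
+table_engine = PPStructure(show_log=True, return_ocr_result_in_table=True)
+
+save_folder = './output'
+img_path = 'ppstructure/docs/table/1.png'
+img = cv2.imread(img_path)
+result = table_engine(img)
+save_structure_res(result, save_folder, os.path.basename(img_path).split('.')[0])
+
+for region in result:
+    region.pop('img', None)  # drop the cropped image to keep the printout small
+    print(region['type'], region['bbox'])
+    if region['type'] == 'table':
+        table_res = region['res']
+        print(table_res['html'])                    # HTML string of the table
+        print(len(table_res.get('boxes', [])), 'texts detected inside the table')
+```
+
+For non-table regions, `res` is the OCR tuple of detection boxes and recognition results described in the field table above.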
diff --git a/ppstructure/docs/recovery/UnrealText.pdf b/ppstructure/docs/recovery/UnrealText.pdf new file mode 100644 index 0000000000000000000000000000000000000000..0b5cf961af4ebf09cb96fc3f09fb9c19abec68f1 Binary files /dev/null and b/ppstructure/docs/recovery/UnrealText.pdf differ diff --git a/ppstructure/docs/recovery/recovery.jpg b/ppstructure/docs/recovery/recovery.jpg new file mode 100644 index 0000000000000000000000000000000000000000..a3817ab70eff5b380072701b70ab227ae6c8184c Binary files /dev/null and b/ppstructure/docs/recovery/recovery.jpg differ diff --git a/ppstructure/docs/recovery/recovery_ch.jpg b/ppstructure/docs/recovery/recovery_ch.jpg new file mode 100644 index 0000000000000000000000000000000000000000..df5a5063f036053673041b92a01f288b3e1d246b Binary files /dev/null and b/ppstructure/docs/recovery/recovery_ch.jpg differ diff --git a/ppstructure/docs/table/layout.jpg b/ppstructure/docs/table/layout.jpg index db7246b314556d73cd49d049b9b480887b6ef994..c5c39dac7267d8c76121ee686a5931a551903d6f 100644 Binary files a/ppstructure/docs/table/layout.jpg and b/ppstructure/docs/table/layout.jpg differ diff --git a/ppstructure/docs/table/paper-image.jpg b/ppstructure/docs/table/paper-image.jpg index db7246b314556d73cd49d049b9b480887b6ef994..c5c39dac7267d8c76121ee686a5931a551903d6f 100644 Binary files a/ppstructure/docs/table/paper-image.jpg and b/ppstructure/docs/table/paper-image.jpg differ diff --git a/ppstructure/docs/table/recovery.jpg b/ppstructure/docs/table/recovery.jpg deleted file mode 100644 index bee2e2fb3499ec4b348e2b2f1475a87c9c562190..0000000000000000000000000000000000000000 Binary files a/ppstructure/docs/table/recovery.jpg and /dev/null differ diff --git a/ppstructure/docs/vqa/result_ser/zh_val_42_ser.jpg b/ppstructure/docs/vqa/result_ser/zh_val_42_ser.jpg deleted file mode 100644 index 13bc7272e49a03115085d4a7420a7acfb92d3260..0000000000000000000000000000000000000000 Binary files a/ppstructure/docs/vqa/result_ser/zh_val_42_ser.jpg and /dev/null differ diff --git a/ppstructure/kie/README.md b/ppstructure/kie/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b3b4d47d86d0cf2871ff96951afa0007306a572b --- /dev/null +++ b/ppstructure/kie/README.md @@ -0,0 +1,259 @@ +English | [简体中文](README_ch.md) + +# Key Information Extraction (KIE) + +- [1. Introduction](#1-introduction) +- [2. Performance](#2-performance) +- [3. Visualization](#3-visualization) + - [3.1 SER](#31-ser) + - [3.2 RE](#32-re) +- [4. Usage](#4-usage) + - [4.1 Prepare for the environment](#41-prepare-for-the-environment) + - [4.2 Quick start](#42-quick-start) + - [4.3 More](#43-more) +- [5. Reference](#5-reference) +- [6. License](#6-license) + + +## 1. Introduction + +Key information extraction (KIE) refers to extracting key information from text or images. As downstream task of OCR, the key information extraction task of document image has many practical application scenarios, such as form recognition, ticket information extraction, ID card information extraction, etc. + +PP-Structure conducts research based on the LayoutXLM multi-modal, and proposes the VI-LayoutXLM, which gets rid of visual features when finetuning the downstream tasks. An textline sorting method is also utilized to fit in reading order. What's more, UDML knowledge distillation is used for higher accuracy. Finally, the accuracy and inference speed of VI-LayoutXLM surpass those of LayoutXLM. + +The main features of the key information extraction module in PP-Structure are as follows. 
+ + +- Integrate multi-modal methods such as [LayoutXLM](https://arxiv.org/pdf/2104.08836.pdf), VI-LayoutXLM, and PP-OCR inference engine. +- Supports Semantic Entity Recognition (SER) and Relation Extraction (RE) tasks based on multimodal methods. Based on the SER task, the text recognition and classification in the image can be completed; based on the RE task, the relationship extraction of the text content in the image can be completed, such as judging the problem pair (pair). +- Supports custom training for SER tasks and RE tasks. +- Supports end-to-end system prediction and evaluation of OCR+SER. +- Supports end-to-end system prediction of OCR+SER+RE. +- Support SER model export and inference using PaddleInference. + + +## 2. Performance + +We evaluate the methods on the Chinese dataset of [XFUND](https://github.com/doc-analysis/XFUND), and the performance is as follows + +|Model | Backbone | Task | Config file | Hmean | Inference time (ms) | Download link| +| --- | --- | --- | --- | --- | --- | --- | +|VI-LayoutXLM| VI-LayoutXLM-base | SER | [ser_vi_layoutxlm_xfund_zh_udml.yml](../../configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh_udml.yml)|**93.19%**| 15.49|[trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_pretrained.tar)| +|LayoutXLM| LayoutXLM-base | SER | [ser_layoutxlm_xfund_zh.yml](../../configs/kie/layoutlm_series/ser_layoutxlm_xfund_zh.yml)|90.38%| 19.49 | [trained model](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh.tar)| +|VI-LayoutXLM| VI-LayoutXLM-base | RE | [re_vi_layoutxlm_xfund_zh_udml.yml](../../configs/kie/vi_layoutxlm/re_vi_layoutxlm_xfund_zh_udml.yml)|**83.92%**| 15.49|[trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/re_vi_layoutxlm_xfund_pretrained.tar)| +|LayoutXLM| LayoutXLM-base | RE | [re_layoutxlm_xfund_zh.yml](../../configs/kie/layoutlm_series/re_layoutxlm_xfund_zh.yml)|74.83%| 19.49|[trained model](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutXLM_xfun_zh.tar)| + + +* Note:Inference environment:V100 GPU + cuda10.2 + cudnn8.1.1 + TensorRT 7.2.3.4,tested using fp16. + +For more KIE models in PaddleOCR, please refer to [KIE model zoo](../../doc/doc_en/algorithm_overview_en.md). + + +## 3. Visualization + +There are two main solutions to the key information extraction task based on VI-LayoutXLM series model. + +(1) Text detection + text recognition + semantic entity recognition (SER) + +(2) Text detection + text recognition + semantic entity recognition (SER) + relationship extraction (RE) + + +The following images are demo results of the SER and RE models. For more detailed introduction to the above solutions, please refer to [KIE Guide](./how_to_do_kie.md). + +### 3.1 SER + +Demo results for SER task are as follows. + +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ + + +**Note:** test pictures are from [xfund dataset](https://github.com/doc-analysis/XFUND), [invoice dataset](https://aistudio.baidu.com/aistudio/datasetdetail/165561) and a composite ID card dataset. + + +Boxes of different colors in the image represent different categories. + +The invoice and application form images have three categories: `request`, `answer` and `header`. The `question` and 'answer' can be used to extract the relationship. + +For the ID card image, the mdoel can be directly identify the key information such as `name`, `gender`, `nationality`, so that the subsequent relationship extraction process is not required, and the key information extraction task can be completed using only on model. + +### 3.2 RE + +Demo results for RE task are as follows. + + +
+ +
+ +
+ +
+ +
+ +
+ +Red boxes are questions, blue boxes are answers. The green lines means the two conected objects are a pair. + + +## 4. Usage + +### 4.1 Prepare for the environment + + +Use the following command to install KIE dependencies. + + +```bash +git clone https://github.com/PaddlePaddle/PaddleOCR.git +cd PaddleOCR +pip install -r requirements.txt +pip install -r ppstructure/kie/requirements.txt +# 安装PaddleOCR引擎用于预测 +pip install paddleocr -U +``` + +The visualized results of SER are saved in the `./output` folder by default. Examples of results are as follows. + + +
+ +
+ + +### 4.2 Quick start + +Here we use XFUND dataset to quickly experience the SER model and RE model. + + +#### 4.2.1 Prepare for the dataset + +```bash +mkdir train_data +cd train_data +# download and uncompress the dataset +wget https://paddleocr.bj.bcebos.com/ppstructure/dataset/XFUND.tar && tar -xf XFUND.tar +cd .. +``` + +#### 4.2.2 Predict images using the trained model + +Use the following command to download the models. + +```bash +mkdir pretrained_model +cd pretrained_model +# download and uncompress the SER trained model +wget https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_pretrained.tar && tar -xf ser_vi_layoutxlm_xfund_pretrained.tar + +# download and uncompress the RE trained model +wget https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/re_vi_layoutxlm_xfund_pretrained.tar && tar -xf re_vi_layoutxlm_xfund_pretrained.tar +``` + + +If you want to use OCR engine to obtain end-to-end prediction results, you can use the following command to predict. + +```bash +# just predict using SER trained model +python3 tools/infer_kie_token_ser.py \ + -c configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml \ + -o Architecture.Backbone.checkpoints=./pretrained_model/ser_vi_layoutxlm_xfund_pretrained/best_accuracy \ + Global.infer_img=./ppstructure/docs/kie/input/zh_val_42.jpg + +# predict using SER and RE trained model at the same time +python3 ./tools/infer_kie_token_ser_re.py \ + -c configs/kie/vi_layoutxlm/re_vi_layoutxlm_xfund_zh.yml \ + -o Architecture.Backbone.checkpoints=./pretrained_model/re_vi_layoutxlm_xfund_pretrained/best_accuracy \ + Global.infer_img=./train_data/XFUND/zh_val/image/zh_val_42.jpg \ + -c_ser configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml \ + -o_ser Architecture.Backbone.checkpoints=./pretrained_model/ser_vi_layoutxlm_xfund_pretrained/best_accuracy +``` + +The visual result images and the predicted text file will be saved in the `Global.save_res_path` directory. + + +If you want to load the text detection and recognition results collected before, you can use the following command to predict. + +```bash +# just predict using SER trained model +python3 tools/infer_kie_token_ser.py \ + -c configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml \ + -o Architecture.Backbone.checkpoints=./pretrained_model/ser_vi_layoutxlm_xfund_pretrained/best_accuracy \ + Global.infer_img=./train_data/XFUND/zh_val/val.json \ + Global.infer_mode=False + +# predict using SER and RE trained model at the same time +python3 ./tools/infer_kie_token_ser_re.py \ + -c configs/kie/vi_layoutxlm/re_vi_layoutxlm_xfund_zh.yml \ + -o Architecture.Backbone.checkpoints=./pretrained_model/re_vi_layoutxlm_xfund_pretrained/best_accuracy \ + Global.infer_img=./train_data/XFUND/zh_val/val.json \ + Global.infer_mode=False \ + -c_ser configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml \ + -o_ser Architecture.Backbone.checkpoints=./pretrained_model/ser_vi_layoutxlm_xfund_pretrained/best_accuracy +``` + +#### 4.2.3 Inference using PaddleInference + +At present, only SER model supports inference using PaddleInference. + +Firstly, download the inference SER inference model. + + +```bash +mkdir inference +cd inference +wget https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_infer.tar && tar -xf ser_vi_layoutxlm_xfund_infer.tar +``` + +Use the following command for inference. 
+ + +```bash +cd ppstructure +python3 kie/predict_kie_token_ser.py \ + --kie_algorithm=LayoutXLM \ + --ser_model_dir=../inference/ser_vi_layoutxlm_xfund_infer \ + --image_dir=./docs/kie/input/zh_val_42.jpg \ + --ser_dict_path=../train_data/XFUND/class_list_xfun.txt \ + --vis_font_path=../doc/fonts/simfang.ttf \ + --ocr_order_method="tb-yx" +``` + +The visual results and text file will be saved in directory `output`. + + +### 4.3 More + +For training, evaluation and inference tutorial for KIE models, please refer to [KIE doc](../../doc/doc_en/kie_en.md). + +For training, evaluation and inference tutorial for text detection models, please refer to [text detection doc](../../doc/doc_en/detection_en.md). + +For training, evaluation and inference tutorial for text recognition models, please refer to [text recognition doc](../../doc/doc_en/recognition_en.md). + +To complete the key information extraction task in your own scenario from data preparation to model selection, please refer to: [Guide to End-to-end KIE](./how_to_do_kie_en.md)。 + + +## 5. Reference + +- LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding, https://arxiv.org/pdf/2104.08836.pdf +- microsoft/unilm/layoutxlm, https://github.com/microsoft/unilm/tree/master/layoutxlm +- XFUND dataset, https://github.com/doc-analysis/XFUND + +## 6. License + +The content of this project itself is licensed under the [Attribution-NonCommercial-ShareAlike 4.0 International (CC BY-NC-SA 4.0)](https://creativecommons.org/licenses/by-nc-sa/4.0/) diff --git a/ppstructure/kie/README_ch.md b/ppstructure/kie/README_ch.md new file mode 100644 index 0000000000000000000000000000000000000000..cc8c60009f4cb83d349c45573a9fa03832665374 --- /dev/null +++ b/ppstructure/kie/README_ch.md @@ -0,0 +1,241 @@ +[English](README.md) | 简体中文 + +# 关键信息抽取 + +- [1. 简介](#1-简介) +- [2. 精度与性能](#2-精度与性能) +- [3. 效果演示](#3-效果演示) + - [3.1 SER](#31-ser) + - [3.2 RE](#32-re) +- [4. 使用](#4-使用) + - [4.1 准备环境](#41-准备环境) + - [4.2 快速开始](#42-快速开始) + - [4.3 更多](#43-更多) +- [5. 参考链接](#5-参考链接) +- [6. License](#6-License) + + +## 1. 简介 + +关键信息抽取 (Key Information Extraction, KIE)指的是是从文本或者图像中,抽取出关键的信息。针对文档图像的关键信息抽取任务作为OCR的下游任务,存在非常多的实际应用场景,如表单识别、车票信息抽取、身份证信息抽取等。 + +PP-Structure 基于 LayoutXLM 文档多模态系列方法进行研究与优化,设计了视觉特征无关的多模态模型结构VI-LayoutXLM,同时引入符合阅读顺序的文本行排序方法以及UDML联合互学习蒸馏方法,最终在精度与速度均超越LayoutXLM。 + +PP-Structure中关键信息抽取模块的主要特性如下: + +- 集成[LayoutXLM](https://arxiv.org/pdf/2104.08836.pdf)、VI-LayoutXLM等多模态模型以及PP-OCR预测引擎。 +- 支持基于多模态方法的语义实体识别 (Semantic Entity Recognition, SER) 以及关系抽取 (Relation Extraction, RE) 任务。基于 SER 任务,可以完成对图像中的文本识别与分类;基于 RE 任务,可以完成对图象中的文本内容的关系提取,如判断问题对(pair)。 +- 支持SER任务和RE任务的自定义训练。 +- 支持OCR+SER的端到端系统预测与评估。 +- 支持OCR+SER+RE的端到端系统预测。 +- 支持SER模型的动转静导出与基于PaddleInfernece的模型推理。 + + +## 2. 
精度与性能 + + +我们在 [XFUND](https://github.com/doc-analysis/XFUND) 的中文数据集上对算法进行了评估,SER与RE上的任务性能如下 + +|模型|骨干网络|任务|配置文件|hmean|预测耗时(ms)|下载链接| +| --- | --- | --- | --- | --- | --- | --- | +|VI-LayoutXLM| VI-LayoutXLM-base | SER | [ser_vi_layoutxlm_xfund_zh_udml.yml](../../configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh_udml.yml)|**93.19%**| 15.49|[训练模型](https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_pretrained.tar)| +|LayoutXLM| LayoutXLM-base | SER | [ser_layoutxlm_xfund_zh.yml](../../configs/kie/layoutlm_series/ser_layoutxlm_xfund_zh.yml)|90.38%| 19.49 | [训练模型](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh.tar)| +|VI-LayoutXLM| VI-LayoutXLM-base | RE | [re_vi_layoutxlm_xfund_zh_udml.yml](../../configs/kie/vi_layoutxlm/re_vi_layoutxlm_xfund_zh_udml.yml)|**83.92%**| 15.49|[训练模型](https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/re_vi_layoutxlm_xfund_pretrained.tar)| +|LayoutXLM| LayoutXLM-base | RE | [re_layoutxlm_xfund_zh.yml](../../configs/kie/layoutlm_series/re_layoutxlm_xfund_zh.yml)|74.83%| 19.49|[训练模型](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutXLM_xfun_zh.tar)| + + +* 注:预测耗时测试条件:V100 GPU + cuda10.2 + cudnn8.1.1 + TensorRT 7.2.3.4,使用FP16进行测试。 + +更多关于PaddleOCR中关键信息抽取模型的介绍,请参考[关键信息抽取模型库](../../doc/doc_ch/algorithm_overview.md)。 + + +## 3. 效果演示 + +基于多模态模型的关键信息抽取任务有2种主要的解决方案。 + +(1)文本检测 + 文本识别 + 语义实体识别(SER) +(2)文本检测 + 文本识别 + 语义实体识别(SER) + 关系抽取(RE) + +下面给出SER与RE任务的示例效果,关于上述解决方案的详细介绍,请参考[关键信息抽取全流程指南](./how_to_do_kie.md)。 + +### 3.1 SER + +对于SER任务,效果如下所示。 + +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +**注意:** 测试图片来源于[XFUND数据集](https://github.com/doc-analysis/XFUND)、[发票数据集](https://aistudio.baidu.com/aistudio/datasetdetail/165561)以及合成的身份证数据集。 + + +图中不同颜色的框表示不同的类别。 + +图中的发票以及申请表图像,有`QUESTION`, `ANSWER`, `HEADER` 3种类别,识别的`QUESTION`, `ANSWER`可以用于后续的问题与答案的关系抽取。 + +图中的身份证图像,则直接识别出其中的`姓名`、`性别`、`民族`等关键信息,这样就无需后续的关系抽取过程,一个模型即可完成关键信息抽取。 + + +### 3.2 RE + +对于RE任务,效果如下所示。 + +
+ +
+ +
+ +
+ +
+ +
+ + +红色框是问题,蓝色框是答案。绿色线条表示连接的两端为一个key-value的pair。 + +## 4. 使用 + +### 4.1 准备环境 + +使用下面的命令安装运行SER与RE关键信息抽取的依赖。 + +```bash +git clone https://github.com/PaddlePaddle/PaddleOCR.git +cd PaddleOCR +pip install -r requirements.txt +pip install -r ppstructure/kie/requirements.txt +# 安装PaddleOCR引擎用于预测 +pip install paddleocr -U +``` + +### 4.2 快速开始 + +下面XFUND数据集,快速体验SER模型与RE模型。 + +#### 4.2.1 准备数据 + +```bash +mkdir train_data +cd train_data +# 下载与解压数据 +wget https://paddleocr.bj.bcebos.com/ppstructure/dataset/XFUND.tar && tar -xf XFUND.tar +cd .. +``` + +#### 4.2.2 基于动态图的预测 + +首先下载模型。 + +```bash +mkdir pretrained_model +cd pretrained_model +# 下载并解压SER预训练模型 +wget https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_pretrained.tar && tar -xf ser_vi_layoutxlm_xfund_pretrained.tar + +# 下载并解压RE预训练模型 +wget https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/re_vi_layoutxlm_xfund_pretrained.tar && tar -xf re_vi_layoutxlm_xfund_pretrained.tar +``` + +如果希望使用OCR引擎,获取端到端的预测结果,可以使用下面的命令进行预测。 + +```bash +# 仅预测SER模型 +python3 tools/infer_kie_token_ser.py \ + -c configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml \ + -o Architecture.Backbone.checkpoints=./pretrained_model/ser_vi_layoutxlm_xfund_pretrained/best_accuracy \ + Global.infer_img=./ppstructure/docs/kie/input/zh_val_42.jpg + +# SER + RE模型串联 +python3 ./tools/infer_kie_token_ser_re.py \ + -c configs/kie/vi_layoutxlm/re_vi_layoutxlm_xfund_zh.yml \ + -o Architecture.Backbone.checkpoints=./pretrained_model/re_vi_layoutxlm_xfund_pretrained/best_accuracy \ + Global.infer_img=./train_data/XFUND/zh_val/image/zh_val_42.jpg \ + -c_ser configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml \ + -o_ser Architecture.Backbone.checkpoints=./pretrained_model/ser_vi_layoutxlm_xfund_pretrained/best_accuracy +``` + +`Global.save_res_path`目录中会保存可视化的结果图像以及预测的文本文件。 + + +如果希望加载标注好的文本检测与识别结果,仅预测可以使用下面的命令进行预测。 + +```bash +# 仅预测SER模型 +python3 tools/infer_kie_token_ser.py \ + -c configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml \ + -o Architecture.Backbone.checkpoints=./pretrained_model/ser_vi_layoutxlm_xfund_pretrained/best_accuracy \ + Global.infer_img=./train_data/XFUND/zh_val/val.json \ + Global.infer_mode=False + +# SER + RE模型串联 +python3 ./tools/infer_kie_token_ser_re.py \ + -c configs/kie/vi_layoutxlm/re_vi_layoutxlm_xfund_zh.yml \ + -o Architecture.Backbone.checkpoints=./pretrained_model/re_vi_layoutxlm_xfund_pretrained/best_accuracy \ + Global.infer_img=./train_data/XFUND/zh_val/val.json \ + Global.infer_mode=False \ + -c_ser configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml \ + -o_ser Architecture.Backbone.checkpoints=./pretrained_model/ser_vi_layoutxlm_xfund_pretrained/best_accuracy +``` + +#### 4.2.3 基于PaddleInference的预测 + +目前仅SER模型支持PaddleInference推理。 + +首先下载SER的推理模型。 + + +```bash +mkdir inference +cd inference +wget https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_infer.tar && tar -xf ser_vi_layoutxlm_xfund_infer.tar +``` + +执行下面的命令进行预测。 + +```bash +cd ppstructure +python3 kie/predict_kie_token_ser.py \ + --kie_algorithm=LayoutXLM \ + --ser_model_dir=../inference/ser_vi_layoutxlm_xfund_infer \ + --image_dir=./docs/kie/input/zh_val_42.jpg \ + --ser_dict_path=../train_data/XFUND/class_list_xfun.txt \ + --vis_font_path=../doc/fonts/simfang.ttf \ + --ocr_order_method="tb-yx" +``` + +可视化结果保存在`output`目录下。 + +### 4.3 更多 + +关于KIE模型的训练评估与推理,请参考:[关键信息抽取教程](../../doc/doc_ch/kie.md)。 + +关于文本检测模型的训练评估与推理,请参考:[文本检测教程](../../doc/doc_ch/detection.md)。 + 
+关于文本识别模型的训练评估与推理,请参考:[文本识别教程](../../doc/doc_ch/recognition.md)。 + +关于怎样在自己的场景中完成关键信息抽取任务,请参考:[关键信息抽取全流程指南](./how_to_do_kie.md)。 + + +## 5. 参考链接 + +- LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding, https://arxiv.org/pdf/2104.08836.pdf +- microsoft/unilm/layoutxlm, https://github.com/microsoft/unilm/tree/master/layoutxlm +- XFUND dataset, https://github.com/doc-analysis/XFUND + +## 6. License + +The content of this project itself is licensed under the [Attribution-NonCommercial-ShareAlike 4.0 International (CC BY-NC-SA 4.0)](https://creativecommons.org/licenses/by-nc-sa/4.0/) diff --git a/ppstructure/kie/how_to_do_kie.md b/ppstructure/kie/how_to_do_kie.md new file mode 100644 index 0000000000000000000000000000000000000000..e7ac562b1e567ac2da30becb966193ba8e16979b --- /dev/null +++ b/ppstructure/kie/how_to_do_kie.md @@ -0,0 +1,168 @@ + +# 怎样完成基于图像数据的信息抽取任务 + +- [1. 简介](#1-简介) + - [1.1 背景](#11-背景) + - [1.2 主流方法](#12-主流方法) +- [2. 关键信息抽取任务流程](#2-关键信息抽取任务流程) + - [2.1 训练OCR模型](#21-训练OCR模型) + - [2.2 训练KIE模型](#22-训练KIE模型) +- [3. 参考文献](#3-参考文献) + + +## 1. 简介 + +### 1.1 背景 + +关键信息抽取 (Key Information Extraction, KIE)指的是是从文本或者图像中,抽取出关键的信息。针对文档图像的关键信息抽取任务作为OCR的下游任务,存在非常多的实际应用场景,如表单识别、车票信息抽取、身份证信息抽取等。然而,使用人力从这些文档图像中提取或者收集关键信息耗时费力,怎样自动化融合图像中的视觉、布局、文字等特征并完成关键信息抽取是一个价值与挑战并存的问题。 + +对于特定场景的文档图像,其中的关键信息位置、版式等较为固定,因此在研究早期有很多基于模板匹配的方法进行关键信息的抽取,考虑到其流程较为简单,该方法仍然被广泛应用在目前的很多场景中。但是这种基于模板匹配的方法在应用到不同的场景中时,需要耗费大量精力去调整与适配模板,迁移成本较高。 + +文档图像中的KIE一般包含2个子任务,示意图如下图所示。 + +* (1)SER: 语义实体识别 (Semantic Entity Recognition),对每一个检测到的文本进行分类,如将其分为姓名,身份证。如下图中的黑色框和红色框。 +* (2)RE: 关系抽取 (Relation Extraction),对每一个检测到的文本进行分类,如将其分为问题 (key) 和答案 (value) 。然后对每一个问题找到对应的答案,相当于完成key-value的匹配过程。如下图中的红色框和黑色框分别代表问题和答案,黄色线代表问题和答案之间的对应关系。 + + +
+ +
+ + +### 1.2 基于深度学习的主流方法 + +一般的KIE方法基于命名实体识别(Named Entity Recognition,NER)来展开研究,但是此类方法仅使用了文本信息而忽略了位置与视觉特征信息,因此精度受限。近几年大多学者开始融合多个模态的输入信息,进行特征融合,并对多模态信息进行处理,从而提升KIE的精度。主要方法有以下几种 + +* (1)基于Grid的方法:此类方法主要关注图像层面多模态信息的融合,文本大多大多为字符粒度,对文本与结构结构信息的嵌入方式较为简单,如Chargrid[1]等算法。 +* (2)基于Token的方法:此类方法参考NLP中的BERT等方法,将位置、视觉等特征信息共同编码到多模态模型中,并且在大规模数据集上进行预训练,从而在下游任务中,仅需要少量的标注数据便可以获得很好的效果。如LayoutLM[2], LayoutLMv2[3], LayoutXLM[4], StrucText[5]等算法。 +* (3)基于GCN的方法:此类方法尝试学习图像、文字之间的结构信息,从而可以解决开集信息抽取的问题(训练集中没有见过的模板),如GCN[6]、SDMGR[7]等算法。 +* (4)基于End-to-end的方法:此类方法将现有的OCR文字识别以及KIE信息抽取2个任务放在一个统一的网络中进行共同学习,并在学习过程中相互加强。如Trie[8]等算法。 + +更多关于该系列算法的详细介绍,请参考“动手学OCR·十讲”课程的课节六部分:[文档分析理论与实践](https://aistudio.baidu.com/aistudio/education/group/info/25207)。 + +## 2. 关键信息抽取任务流程 + +PaddleOCR中实现了LayoutXLM等算法(基于Token),同时,在PP-Structurev2中,对LayoutXLM多模态预训练模型的网络结构进行简化,去除了其中的Visual backbone部分,设计了视觉无关的VI-LayoutXLM模型,同时引入符合人类阅读顺序的排序逻辑以及UDML知识蒸馏策略,最终同时提升了关键信息抽取模型的精度与推理速度。 + +下面介绍怎样基于PaddleOCR完成关键信息抽取任务。 + +在非End-to-end的KIE方法中,完成关键信息抽取,至少需要**2个步骤**:首先使用OCR模型,完成文字位置与内容的提取,然后使用KIE模型,根据图像、文字位置以及文字内容,提取出其中的关键信息。 + +### 2.1 训练OCR模型 + +#### 2.1.1 文本检测 + +**(1)数据** + +PaddleOCR中提供的模型大多数为通用模型,在进行文本检测的过程中,相邻文本行的检测一般是根据位置的远近进行区分,如上图,使用PP-OCRv3通用中英文检测模型进行文本检测时,容易将”民族“与“汉”这2个代表不同的字段检测到一起,从而增加后续KIE任务的难度。因此建议在做KIE任务的过程中,首先训练一个针对该文档数据集的检测模型。 + +在数据标注时,关键信息的标注需要隔开,比上图中的 “民族汉” 3个字相隔较近,此时需要将”民族“与”汉“标注为2个文本检测框,否则会增加后续KIE任务的难度。 + +对于下游任务,一般来说,`200~300`张的文本训练数据即可保证基本的训练效果,如果没有太多的先验知识,可以先标注 **`200~300`** 张图片,进行后续文本检测模型的训练。 + + +**(2)模型** + +在模型选择方面,推荐使用PP-OCRv3_det,关于更多关于检测模型的训练方法介绍,请参考:[OCR文本检测模型训练教程](../../doc/doc_ch/detection.md)与[PP-OCRv3 文本检测模型训练教程](../../doc/doc_ch/PPOCRv3_det_train.md)。 + +#### 2.1.2 文本识别 + +相对自然场景,文档图像中的文本内容识别难度一般相对较低(背景相对不太复杂),因此**优先建议**尝试PaddleOCR中提供的PP-OCRv3通用文本识别模型([PP-OCRv3模型库链接](../../doc/doc_ch/models_list.md))。 + +**(1)数据** + +然而,在部分文档场景中也会存在一些挑战,如身份证场景中存在着罕见字,在发票等场景中的字体比较特殊,这些问题都会增加文本识别的难度,此时如果希望保证或者进一步提升模型的精度,建议基于特定文档场景的文本识别数据集,加载PP-OCRv3模型进行微调。 + +在模型微调的过程中,建议准备至少`5000`张垂类场景的文本识别图像,可以保证基本的模型微调效果。如果希望提升模型的精度与泛化能力,可以合成更多与该场景类似的文本识别数据,从公开数据集中收集通用真实文本识别数据,一并添加到该场景的文本识别训练任务过程中。在训练过程中,建议每个epoch的真实垂类数据、合成数据、通用数据比例在`1:1:1`左右,这可以通过设置不同数据源的采样比例进行控制。如有3个训练文本文件,分别包含1W、2W、5W条数据,那么可以在配置文件中设置数据如下: + +```yml +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + label_file_list: + - ./train_data/train_list_1W.txt + - ./train_data/train_list_2W.txt + - ./train_data/train_list_5W.txt + ratio_list: [1.0, 0.5, 0.2] + ... +``` + +**(2)模型** + +在模型选择方面,推荐使用通用中英文文本识别模型PP-OCRv3_rec,关于更多关于文本识别模型的训练方法介绍,请参考:[OCR文本识别模型训练教程](../../doc/doc_ch/recognition.md)与[PP-OCRv3文本识别模型库与配置文件](../../doc/doc_ch/models_list.md)。 + +### 2.2 训练KIE模型 + +对于识别得到的文字进行关键信息抽取,有2种主要的方法。 + +(1)直接使用SER,获取关键信息的类别:如身份证场景中,将“姓名“与”张三“分别标记为`name_key`与`name_value`。最终识别得到的类别为`name_value`对应的**文本字段**即为我们所需要的关键信息。 + +(2)联合SER与RE进行使用:这种方法中,首先使用SER,获取图像文字内容中所有的key与value,然后使用RE方法,对所有的key与value进行配对,找到映射关系,从而完成关键信息的抽取。 + +#### 2.2.1 SER + +以身份证场景为例, 关键信息一般包含`姓名`、`性别`、`民族`等,我们直接将对应的字段标注为特定的类别即可,如下图所示。 + +
+ +
+ +**注意:** + +- 标注过程中,对于无关于KIE关键信息的文本内容,均需要将其标注为`other`类别,相当于背景信息。如在身份证场景中,如果我们不关注性别信息,那么可以将“性别”与“男”这2个字段的类别均标注为`other`。 +- 标注过程中,需要以**文本行**为单位进行标注,无需标注单个字符的位置信息。 + +数据量方面,一般来说,对于比较固定的场景,**50张**左右的训练图片即可达到可以接受的效果,可以使用[PPOCRLabel](../../PPOCRLabel/README_ch.md)完成KIE的标注过程。 + +模型方面,推荐使用PP-Structurev2中提出的VI-LayoutXLM模型,它基于LayoutXLM模型进行改进,去除其中的视觉特征提取模块,在精度基本无损的情况下,进一步提升了模型推理速度。更多教程请参考:[VI-LayoutXLM算法介绍](../../doc/doc_ch/algorithm_kie_vi_layoutxlm.md)与[KIE关键信息抽取使用教程](../../doc/doc_ch/kie.md)。 + + +#### 2.2.2 SER + RE + +该过程主要包含SER与RE 2个过程。SER阶段主要用于识别出文档图像中的所有key与value,RE阶段主要用于对所有的key与value进行匹配。 + +以身份证场景为例, 关键信息一般包含`姓名`、`性别`、`民族`等关键信息,在SER阶段,我们需要识别所有的question (key) 与answer (value) 。标注如下所示。每个字段的类别信息(`label`字段)可以是question、answer或者other(与待抽取的关键信息无关的字段) + +
+ +
+ + +在RE阶段,需要标注每个字段的的id与连接信息,如下图所示。 + +
+ +
+ +每个文本行字段中,需要添加`id`与`linking`字段信息,`id`记录该文本行的唯一标识,同一张图片中的不同文本内容不能重复,`linking`是一个列表,记录了不同文本之间的连接信息。如字段“出生”的id为0,字段“1996年1月11日”的id为1,那么它们均有[[0, 1]]的`linking`标注,表示该id=0与id=1的字段构成key-value的关系(姓名、性别等字段类似,此处不再一一赘述)。 + + +**注意:** + +- 标注过程中,如果value是多个字符,那么linking中可以新增一个key-value对,如`[[0, 1], [0, 2]]` + + +数据量方面,一般来说,对于比较固定的场景,**50张**左右的训练图片即可达到可以接受的效果,可以使用PPOCRLabel完成KIE的标注过程。 + +模型方面,推荐使用PP-Structurev2中提出的VI-LayoutXLM模型,它基于LayoutXLM模型进行改进,去除其中的视觉特征提取模块,在精度基本无损的情况下,进一步提升了模型推理速度。更多教程请参考:[VI-LayoutXLM算法介绍](../../doc/doc_ch/algorithm_kie_vi_layoutxlm.md)与[KIE关键信息抽取使用教程](../../doc/doc_ch/kie.md)。 + + +## 3. 参考文献 + + +[1] Katti A R, Reisswig C, Guder C, et al. Chargrid: Towards understanding 2d documents[J]. arXiv preprint arXiv:1809.08799, 2018. + +[2] Xu Y, Li M, Cui L, et al. Layoutlm: Pre-training of text and layout for document image understanding[C]//Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining. 2020: 1192-1200. + +[3] Xu Y, Xu Y, Lv T, et al. LayoutLMv2: Multi-modal pre-training for visually-rich document understanding[J]. arXiv preprint arXiv:2012.14740, 2020. + +[4]: Xu Y, Lv T, Cui L, et al. Layoutxlm: Multimodal pre-training for multilingual visually-rich document understanding[J]. arXiv preprint arXiv:2104.08836, 2021. + +[5] Li Y, Qian Y, Yu Y, et al. StrucTexT: Structured Text Understanding with Multi-Modal Transformers[C]//Proceedings of the 29th ACM International Conference on Multimedia. 2021: 1912-1920. + +[6] Liu X, Gao F, Zhang Q, et al. Graph convolution for multimodal information extraction from visually rich documents[J]. arXiv preprint arXiv:1903.11279, 2019. + +[7] Sun H, Kuang Z, Yue X, et al. Spatial Dual-Modality Graph Reasoning for Key Information Extraction[J]. arXiv preprint arXiv:2103.14470, 2021. + +[8] Zhang P, Xu Y, Cheng Z, et al. Trie: End-to-end text reading and information extraction for document understanding[C]//Proceedings of the 28th ACM International Conference on Multimedia. 2020: 1413-1422. diff --git a/ppstructure/kie/how_to_do_kie_en.md b/ppstructure/kie/how_to_do_kie_en.md new file mode 100644 index 0000000000000000000000000000000000000000..23b2394f5aa3911a1311d3bc3be8f362861d34af --- /dev/null +++ b/ppstructure/kie/how_to_do_kie_en.md @@ -0,0 +1,179 @@ + +# Key Information Extraction Pipeline + +- [1. Introduction](#1-Introduction) + - [1.1 Background](#11-Background) + - [1.2 Mainstream Deep-learning Solutions](#12-Mainstream-Deep-learning-Solutions) +- [2. KIE Pipeline](#2-KIE-Pipeline) + - [2.1 Train OCR Models](#21-Train-OCR-Models) + - [2.2 Train KIE Models](#22-Train-KIE-Models) +- [3. Reference](#3-Reference) + + +## 1. Introduction + +### 1.1 Background + +Key information extraction (KIE) refers to extracting key information from text or images. As the downstream task of OCR, KIE of document image has many practical application scenarios, such as form recognition, ticket information extraction, ID card information extraction, etc. However, it is time-consuming and laborious to extract key information from these document images by manpower. It's challengable but also valuable to combine multi-modal features (visual, layout, text, etc) together and complete KIE tasks. + +For the document images in a specific scene, the position and layout of the key information are relatively fixed. Therefore, in the early stage of the research, there are many methods based on template matching to extract the key information. This method is still widely used in many simple scenarios at present. 
However, it takes long time to adjut the template for different scenarios. + + +The KIE in the document image generally contains 2 subtasks, which is as shown follows. + +* (1) SER: semantic entity recognition, which classifies each detected textline, such as dividing it into name and ID card. As shown in the red boxes in the following figure. + +* (2) RE: relationship extraction, which matches the question and answer based on SER results. As shown in the figure below, the yellow arrows match the question and answer. + +
+ +
+ + + +### 1.2 Mainstream Deep-learning Solutions + +General KIE methods are based on Named Entity Recognition (NER), but such methods only use text information and ignore location and visual feature information, which leads to limited accuracy. In recent years, most scholars have started to combine mutil-modal features to improve the accuracy of KIE model. The main methods are as follows: + +* (1) Grid based methods. These methods mainly focus on the fusion of multi-modal information at the image level. Most texts are of character granularity. The text and structure information embedding method is simple, such as the algorithm of chargrid [1]. + +* (2) Token based methods. These methods refer to the NLP methods such as Bert, which encode the position, vision and other feature information into the multi-modal model, and conduct pre-training on large-scale datasets, so that in downstream tasks, only a small amount of annotation data is required to obtain excellent results. The representative algorithms are layoutlm [2], layoutlmv2 [3], layoutxlm [4], structext [5], etc. + +* (3) GCN based methods. These methods try to learn the structural information between images and characters, so as to solve the problem of extracting open set information (templates not seen in the training set), such as GCN [6], SDMGR [7] and other algorithms. + +* (4) End to end based methods: these methods put the existing OCR character recognition and KIE information extraction tasks into a unified network for common learning, and strengthen each other in the learning process. Such as TRIE [8]. + + +For more detailed introduction of the algorithms, please refer to Chapter 6 of [Diving into OCR](https://aistudio.baidu.com/aistudio/education/group/info/25207). + +## 2. KIE Pipeline + +Token based methods such as LayoutXLM are implemented in PaddleOCR. What's more, in PP-Structurev2, we simplify the LayoutXLM model and proposed VI-LayoutXLM, in which the visual feature extraction module is removed for speed-up. The textline sorting strategy conforming to the human reading order and UDML knowledge distillation strategy are utilized for higher model accuracy. + + +In the non end-to-end KIE method, KIE needs at least ** 2 steps**. Firstly, the OCR model is used to extract the text and its position. Secondly, the KIE model is used to extract the key information according to the image, text position and text content. + + +### 2.1 Train OCR Models + +#### 2.1.1 Text Detection + +**(1) Data** + +Most of the models provided in PaddleOCR are general models. In the process of text detection, the detection of adjacent text lines is generally based on the distance of the position. As shown in the figure above, when using PP-OCRv3 general English detection model for text detection, it is easy to detect the two fields representing different propoerties as one. Therefore, it is suggested to finetune a detection model according to your scenario firstly during the KIE task. + + +During data annotation, the different key information needs to be separated. Otherwise, it will increase the difficulty of subsequent KIE tasks. + +For downstream tasks, generally speaking, `200~300` training images can guarantee the basic training effect. If there is not too much prior knowledge, **`200~300`** images can be labeled firstly for subsequent text detection model training. + +**(2) Model** + +In terms of model selection, PP-OCRv3 detection model is recommended. 
For more information about the training methods of the detection model, please refer to: [Text detection tutorial](../../doc/doc_en/detection_en.md) and [PP-OCRv3 detection model tutorial](../../doc/doc_ch/PPOCRv3_det_train.md). + +#### 2.1.2 Text recognition + + +Compared with the natural scene, the text recognition in the document image is generally relatively easier (the background is not too complex), so **it is suggested to** try the PP-OCRv3 general text recognition model provided in PaddleOCR ([PP-OCRv3 model list](../../doc/doc_en/models_list_en.md)) + + +**(1) Data** + +However, there are also some challenges in some document scenarios, such as rare words in ID card scenarios and special fonts in invoice and other scenarios. These problems will increase the difficulty of text recognition. At this time, if you want to ensure or further improve the model accuracy, it is recommended to load PP-OCRv3 model based on the text recognition dataset of specific document scenarios for finetuning. + +In the process of model finetuning, it is recommended to prepare at least `5000` vertical scene text recognition images to ensure the basic model fine-tuning effect. If you want to improve the accuracy and generalization ability of the model, you can synthesize more text recognition images similar to the scene, collect general real text recognition data from the public data set, and add them to the text recognition training process. In the training process, it is suggested that the ratio of real data, synthetic data and general data of each epoch should be around `1:1:1`, which can be controlled by setting the sampling ratio of different data sources. If there are 3 training text files, including 10k, 20k and 50k pieces of data respectively, the data can be set in the configuration file as follows: + +```yml +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + label_file_list: + - ./train_data/train_list_10k.txt + - ./train_data/train_list_10k.txt + - ./train_data/train_list_50k.txt + ratio_list: [1.0, 0.5, 0.2] + ... +``` + +**(2) Model** + +In terms of model selection, PP-OCRv3 recognition model is recommended. For more information about the training methods of the recognition model, please refer to: [Text recognition tutorial](../../doc/doc_en/recognition_en.md) and [PP-OCRv3 model list](../../doc/doc_en/models_list_en.md). + + +### 2.2 Train KIE Models + +There are two main methods to extract the key information from the recognized texts. + +(1) Directly use SER model to obtain the key information category. For example, in the ID card scenario, we mark "name" and "Geoff Sample" as "name_key" and "name_value", respectively. The **text field** corresponding to the category "name_value" finally identified is the key information we need. + +(2) Joint use SER and RE models. For this case, we firstly use SER model to obtain all questions (keys) and questions (values) for the image text, and then use RE model to match all keys and values to find the relationship, so as to complete the extraction of key information. + +#### 2.2.1 SER + +Take the ID card scenario as an example. The key information generally includes `name`, `DOB`, etc. We can directly mark the corresponding fields as specific categories, as shown in the following figure. + +
+ +
+ +**Note:** + +- In the labeling process, text content without key information about KIE shall be labeled as`other`, which is equivalent to background information. For example, in the ID card scenario, if we do not pay attention to `DOB` information, we can mark the categories of `DOB` and `Area manager` as `other`. +- In the annotation process of, it is required to annotate the **textline** position rather than the character. + + +In terms of data, generally speaking, for relatively fixed scenes, **50** training images can achieve acceptable effects. You can refer to [PPOCRLabel](../../PPOCRLabel/README.md) for finish the labeling process. + +In terms of model, it is recommended to use the VI-layoutXLM model proposed in PP-Structurev2. It is improved based on the LayoutXLM model, removing the visual feature extraction module, and further improving the model inference speed without the significant reduction on model accuracy. For more tutorials, please refer to [VI-LayoutXLM introduction](../../doc/doc_en/algorithm_kie_vi_layoutxlm_en.md) and [KIE tutorial](../../doc/doc_en/kie_en.md). + + +#### 2.2.2 SER + RE + +The SER model is mainly used to identify all keys and values in the document image, and the RE model is mainly used to match all keys and values. + +Taking the ID card scenario as an example, the key information generally includes key information such as `name`, `DOB`, etc. in the SER stage, we need to identify all questions (keys) and answers (values). The demo annotation is as follows. All keys can be annotated as `question`, and all keys can be annotated as `answer`. + + +
+ +
+ + +In the RE stage, the ID and connection information of each field need to be marked, as shown in the following figure. + +
+ +
+ +For each textline, you need to add 'ID' and 'linking' field information. The 'ID' records the unique identifier of the textline. Different text contents in the same images cannot be repeated. The 'linking' is a list that records the connection information between different texts. If the ID of the field "name" is 0 and the ID of the field "Geoff Sample" is 1, then they all have [[0, 1]] 'linking' marks, indicating that the fields with `id=0` and `id=1` form a key value relationship (the fields such as DOB and Expires are similar, and will not be repeated here). + + +**Note:** + +-During annotation, if value is multiple textines, a key value pair can be added in linking, such as `[[0, 1], [0, 2]]`. + +In terms of data, generally speaking, for relatively fixed scenes, about **50** training images can achieve acceptable effects. + +In terms of model, it is recommended to use the VI-layoutXLM model proposed in PP-Structurev2. It is improved based on the LayoutXLM model, removing the visual feature extraction module, and further improving the model inference speed without the significant reduction on model accuracy. For more tutorials, please refer to [VI-LayoutXLM introduction](../../doc/doc_en/algorithm_kie_vi_layoutxlm_en.md) and [KIE tutorial](../../doc/doc_en/kie_en.md). + + + +## 3. Reference + + +[1] Katti A R, Reisswig C, Guder C, et al. Chargrid: Towards understanding 2d documents[J]. arXiv preprint arXiv:1809.08799, 2018. + +[2] Xu Y, Li M, Cui L, et al. Layoutlm: Pre-training of text and layout for document image understanding[C]//Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining. 2020: 1192-1200. + +[3] Xu Y, Xu Y, Lv T, et al. LayoutLMv2: Multi-modal pre-training for visually-rich document understanding[J]. arXiv preprint arXiv:2012.14740, 2020. + +[4]: Xu Y, Lv T, Cui L, et al. Layoutxlm: Multimodal pre-training for multilingual visually-rich document understanding[J]. arXiv preprint arXiv:2104.08836, 2021. + +[5] Li Y, Qian Y, Yu Y, et al. StrucTexT: Structured Text Understanding with Multi-Modal Transformers[C]//Proceedings of the 29th ACM International Conference on Multimedia. 2021: 1912-1920. + +[6] Liu X, Gao F, Zhang Q, et al. Graph convolution for multimodal information extraction from visually rich documents[J]. arXiv preprint arXiv:1903.11279, 2019. + +[7] Sun H, Kuang Z, Yue X, et al. Spatial Dual-Modality Graph Reasoning for Key Information Extraction[J]. arXiv preprint arXiv:2103.14470, 2021. + +[8] Zhang P, Xu Y, Cheng Z, et al. Trie: End-to-end text reading and information extraction for document understanding[C]//Proceedings of the 28th ACM International Conference on Multimedia. 2020: 1413-1422. 
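+
+To make the `id`/`linking` format described in section 2.2.2 concrete, the snippet below builds the annotation for one question/answer pair and prints it as a single label line. Only `id`, `linking` and the `question`/`answer` categories come from this document; the other key names (`transcription`, `points`, `label`) and the "image path + tab + JSON list" line layout are assumptions for illustration, and the file written by your labeling tool may differ.
+
+```python
+import json
+
+# Two text lines that form one key-value pair: "DOB" (question) and "1996-01-11" (answer).
+# `id` is unique for every text line in the image; `linking` stores [question_id, answer_id] pairs.
+annotations = [
+    {
+        "transcription": "DOB",                              # assumed key for the text content
+        "points": [[10, 30], [80, 30], [80, 60], [10, 60]],  # assumed key for the 4-point box
+        "label": "question",
+        "id": 0,
+        "linking": [[0, 1]],
+    },
+    {
+        "transcription": "1996-01-11",
+        "points": [[90, 30], [220, 30], [220, 60], [90, 60]],
+        "label": "answer",
+        "id": 1,
+        "linking": [[0, 1]],
+    },
+]
+
+# One line per image: "<image path>\t<JSON list of text lines>" (assumed layout)
+print("images/id_card_demo.jpg\t" + json.dumps(annotations, ensure_ascii=False))
+```
+
+If an answer spans several text lines, extra pairs such as `[[0, 1], [0, 2]]` are added to the question's `linking` list, as noted above.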
diff --git a/ppstructure/vqa/predict_vqa_token_ser.py b/ppstructure/kie/predict_kie_token_ser.py similarity index 92% rename from ppstructure/vqa/predict_vqa_token_ser.py rename to ppstructure/kie/predict_kie_token_ser.py index 3097ebcf1640eb1e4dd65f76635f21231984b0ef..48cfc528a28e0a2bdfb51d3a537f26e891ae3286 100644 --- a/ppstructure/vqa/predict_vqa_token_ser.py +++ b/ppstructure/kie/predict_kie_token_ser.py @@ -30,7 +30,7 @@ from ppocr.data import create_operators, transform from ppocr.postprocess import build_post_process from ppocr.utils.logging import get_logger from ppocr.utils.visual import draw_ser_results -from ppocr.utils.utility import get_image_file_list, check_and_read_gif +from ppocr.utils.utility import get_image_file_list, check_and_read from ppstructure.utility import parse_args from paddleocr import PaddleOCR @@ -40,14 +40,20 @@ logger = get_logger() class SerPredictor(object): def __init__(self, args): - self.ocr_engine = PaddleOCR(use_angle_cls=False, show_log=False) + self.ocr_engine = PaddleOCR( + use_angle_cls=args.use_angle_cls, + det_model_dir=args.det_model_dir, + rec_model_dir=args.rec_model_dir, + show_log=False, + use_gpu=args.use_gpu) pre_process_list = [{ 'VQATokenLabelEncode': { - 'algorithm': args.vqa_algorithm, + 'algorithm': args.kie_algorithm, 'class_path': args.ser_dict_path, 'contains_re': False, - 'ocr_engine': self.ocr_engine + 'ocr_engine': self.ocr_engine, + 'order_method': args.ocr_order_method, } }, { 'VQATokenPad': { @@ -132,7 +138,7 @@ def main(args): os.path.join(args.output, 'infer.txt'), mode='w', encoding='utf-8') as f_w: for image_file in image_file_list: - img, flag = check_and_read_gif(image_file) + img, flag, _ = check_and_read(image_file) if not flag: img = cv2.imread(image_file) img = img[:, :, ::-1] diff --git a/ppstructure/kie/requirements.txt b/ppstructure/kie/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..11fa98da1bff7a1863d8a077ca73435d15072523 --- /dev/null +++ b/ppstructure/kie/requirements.txt @@ -0,0 +1,7 @@ +sentencepiece +yacs +seqeval +pypandoc +attrdict +python_docx +https://paddleocr.bj.bcebos.com/ppstructure/whl/paddlenlp-2.3.0.dev0-py3-none-any.whl diff --git a/ppstructure/vqa/tools/eval_with_label_end2end.py b/ppstructure/kie/tools/eval_with_label_end2end.py similarity index 99% rename from ppstructure/vqa/tools/eval_with_label_end2end.py rename to ppstructure/kie/tools/eval_with_label_end2end.py index b13ffb568fd9610fee5d5a246c501ed5b90de91a..b0fd84363f450dfb7e4ef18e53adc17ef088cf18 100644 --- a/ppstructure/vqa/tools/eval_with_label_end2end.py +++ b/ppstructure/kie/tools/eval_with_label_end2end.py @@ -20,7 +20,7 @@ from shapely.geometry import Polygon import numpy as np from collections import defaultdict import operator -import Levenshtein +from rapidfuzz.distance import Levenshtein import argparse import json import copy diff --git a/ppstructure/vqa/tools/trans_funsd_label.py b/ppstructure/kie/tools/trans_funsd_label.py similarity index 100% rename from ppstructure/vqa/tools/trans_funsd_label.py rename to ppstructure/kie/tools/trans_funsd_label.py diff --git a/ppstructure/vqa/tools/trans_xfun_data.py b/ppstructure/kie/tools/trans_xfun_data.py similarity index 100% rename from ppstructure/vqa/tools/trans_xfun_data.py rename to ppstructure/kie/tools/trans_xfun_data.py diff --git a/ppstructure/layout/README.md b/ppstructure/layout/README.md index 3a4f5291763e34c8aec2c5b327d40a459bb4be1e..84b977fdd760e6de43d355b802731b5d43eb2cf5 100644 --- a/ppstructure/layout/README.md +++ 
b/ppstructure/layout/README.md @@ -1,127 +1,470 @@ English | [简体中文](README_ch.md) -- [Getting Started](#getting-started) - - [1. Install whl package](#1--install-whl-package) - - [2. Quick Start](#2-quick-start) - - [3. PostProcess](#3-postprocess) - - [4. Results](#4-results) - - [5. Training](#5-training) -# Getting Started +# Layout analysis + +- [1. Introduction](#1-Introduction) +- [2. Quick start](#2-Quick-start) +- [3. Install](#3-Install) + - [3.1 Install PaddlePaddle](#31-Install-paddlepaddle) + - [3.2 Install PaddleDetection](#32-Install-paddledetection) +- [4. Data preparation](#4-Data-preparation) + - [4.1 English data set](#41-English-data-set) + - [4.2 More datasets](#42-More-datasets) +- [5. Start training](#5-Start-training) + - [5.1 Train](#51-Train) + - [5.2 FGD Distillation training](#52-Fgd-distillation-training) +- [6. Model evaluation and prediction](#6-Model-evaluation-and-prediction) + - [6.1 Indicator evaluation](#61-Indicator-evaluation) + - [6.2 Test layout analysis results](#62-Test-layout-analysis-results) +- [7. Model export and inference](#7-Model-export-and-inference) + - [7.1 Model export](#71-Model-export) + - [7.2 Model inference](#72-Model-inference) + + +## 1. Introduction + +Layout analysis refers to the regional division of documents in the form of pictures and the positioning of key areas, such as text, title, table, picture, etc. The layout analysis algorithm is based on the lightweight model PP-picodet of [PaddleDetection]( https://github.com/PaddlePaddle/PaddleDetection ) + +
+ +
+ +## 2. Quick start +PP-Structure currently provides layout analysis models in Chinese, English and table documents. For the model link, see [models_list](../docs/models_list_en.md). The whl package is also provided for quick use, see [quickstart](../docs/quickstart_en.md) for details. + +## 3. Install + +### 3.1. Install PaddlePaddle + +- **(1) Install PaddlePaddle** -## 1. Install whl package ```bash -wget https://paddleocr.bj.bcebos.com/whl/layoutparser-0.0.0-py3-none-any.whl -pip install -U layoutparser-0.0.0-py3-none-any.whl +python3 -m pip install --upgrade pip + +# GPU Install +python3 -m pip install "paddlepaddle-gpu>=2.3" -i https://mirror.baidu.com/pypi/simple + +# CPU Install +python3 -m pip install "paddlepaddle>=2.3" -i https://mirror.baidu.com/pypi/simple ``` +For more requirements, please refer to the instructions in the [Install file](https://www.paddlepaddle.org.cn/install/quick)。 -## 2. Quick Start +### 3.2. Install PaddleDetection -Use LayoutParser to identify the layout of a document: +- **(1)Download PaddleDetection Source code** -```python -import cv2 -import layoutparser as lp -image = cv2.imread("doc/table/layout.jpg") -image = image[..., ::-1] +```bash +git clone https://github.com/PaddlePaddle/PaddleDetection.git +``` -# load model -model = lp.PaddleDetectionLayoutModel(config_path="lp://PubLayNet/ppyolov2_r50vd_dcn_365e_publaynet/config", - threshold=0.5, - label_map={0: "Text", 1: "Title", 2: "List", 3:"Table", 4:"Figure"}, - enforce_cpu=False, - enable_mkldnn=True) -# detect -layout = model.detect(image) +- **(2)Install third-party libraries** -# show result -show_img = lp.draw_box(image, layout, box_width=3, show_element_type=True) -show_img.show() +```bash +cd PaddleDetection +python3 -m pip install -r requirements.txt ``` -The following figure shows the result, with different colored detection boxes representing different categories and displaying specific categories in the upper left corner of the box with `show_element_type` +## 4. Data preparation -
- -
-`PaddleDetectionLayoutModel`parameters are described as follows: +If you want to experience the prediction process directly, you can skip data preparation and download the pre-training model. + +### 4.1. English data set + +Download document analysis data set [PubLayNet](https://developer.ibm.com/exchanges/data/all/publaynet/)(Dataset 96G),contains 5 classes:`{0: "Text", 1: "Title", 2: "List", 3:"Table", 4:"Figure"}` + +``` +# Download data +wget https://dax-cdn.cdn.appdomain.cloud/dax-publaynet/1.0.0/publaynet.tar.gz +# Decompress data +tar -xvf publaynet.tar.gz +``` + +Uncompressed **directory structure:** + +``` +|-publaynet + |- test + |- PMC1277013_00004.jpg + |- PMC1291385_00002.jpg + | ... + |- train.json + |- train + |- PMC1291385_00002.jpg + |- PMC1277013_00004.jpg + | ... + |- val.json + |- val + |- PMC538274_00004.jpg + |- PMC539300_00004.jpg + | ... +``` + +**data distribution:** + +| File or Folder | Description | num | +| :------------- | :------------- | ------- | +| `train/` | Training set pictures | 335,703 | +| `val/` | Verification set pictures | 11,245 | +| `test/` | Test set pictures | 11,405 | +| `train.json` | Training set annotation files | - | +| `val.json` | Validation set dimension files | - | + +**Data Annotation** + +The JSON file contains the annotations of all images, and the data is stored in a dictionary nested manner.Contains the following keys: -| parameter | description | default | remark | -| :------------: | :------------------------------------------------------: | :---------: | :----------------------------------------------------------: | -| config_path | model config path | None | Specify config_ path will automatically download the model (only for the first time,the model will exist and will not be downloaded again) | -| model_path | model path | None | local model path, config_ path and model_ path must be set to one, cannot be none at the same time | -| threshold | threshold of prediction score | 0.5 | \ | -| input_shape | picture size of reshape | [3,640,640] | \ | -| batch_size | testing batch size | 1 | \ | -| label_map | category mapping table | None | Setting config_ path, it can be none, and the label is automatically obtained according to the dataset name_ map, You need to specify it manually when setting model_path | -| enforce_cpu | whether to use CPU | False | False to use GPU, and True to force the use of CPU | -| enforce_mkldnn | whether mkldnn acceleration is enabled in CPU prediction | True | \ | -| thread_num | the number of CPU threads | 10 | \ | +- info,represents the dimension file info。 -The following model configurations and label maps are currently supported, which you can use by modifying '--config_path' and '--label_map' to detect different types of content: +- licenses,represents the dimension file licenses。 -| dataset | config_path | label_map | -| ------------------------------------------------------------ | ------------------------------------------------------------ | --------------------------------------------------------- | -| [TableBank](https://doc-analysis.github.io/tablebank-page/index.html) word | lp://TableBank/ppyolov2_r50vd_dcn_365e_tableBank_word/config | {0:"Table"} | -| TableBank latex | lp://TableBank/ppyolov2_r50vd_dcn_365e_tableBank_latex/config | {0:"Table"} | -| [PubLayNet](https://github.com/ibm-aur-nlp/PubLayNet) | lp://PubLayNet/ppyolov2_r50vd_dcn_365e_publaynet/config | {0: "Text", 1: "Title", 2: "List", 3:"Table", 4:"Figure"} | +- images,represents the list of image information in the 
annotation file,each element is the information of an image。The information of one of the images is as follows: -* TableBank word and TableBank latex are trained on datasets of word documents and latex documents respectively; -* Download TableBank dataset contains both word and latex。 + ``` + { + 'file_name': 'PMC4055390_00006.jpg', # file_name + 'height': 601, # image height + 'width': 792, # image width + 'id': 341427 # image id + } + ``` -## 3. PostProcess +- annotations, represents the list of annotation information of the target object in the annotation file,each element is the annotation information of a target object。The following is the annotation information of one of the target objects: -Layout parser contains multiple categories, if you only want to get the detection box for a specific category (such as the "Text" category), you can use the following code: + ``` + { -```python -# follow the above code -# filter areas for a specific text type -text_blocks = lp.Layout([b for b in layout if b.type=='Text']) -figure_blocks = lp.Layout([b for b in layout if b.type=='Figure']) + 'segmentation': # Segmentation annotation of objects + 'area': 60518.099043117836, # Area of object + 'iscrowd': 0, # iscrowd + 'image_id': 341427, # image id + 'bbox': [50.58, 490.86, 240.15, 252.16], # bbox [x1,y1,w,h] + 'category_id': 1, # category_id + 'id': 3322348 # image id + } + ``` -# text areas may be detected within the image area, delete these areas -text_blocks = lp.Layout([b for b in text_blocks \ - if not any(b.is_in(b_fig) for b_fig in figure_blocks)]) +### 4.2. More datasets -# sort text areas and assign ID -h, w = image.shape[:2] +We provide CDLA(Chinese layout analysis), TableBank(Table layout analysis)etc. data set download links,process to the JSON format of the above annotation file,that is, the training can be conducted in the same way。 -left_interval = lp.Interval(0, w/2*1.05, axis='x').put_on_canvas(image) +| dataset | 简介 | +| ------------------------------------------------------------ | ------------------------------------------------------------ | +| [cTDaR2019_cTDaR](https://cndplab-founder.github.io/cTDaR2019/) | For form detection (TRACKA) and form identification (TRACKB).Image types include historical data sets (beginning with cTDaR_t0, such as CTDAR_T00872.jpg) and modern data sets (beginning with cTDaR_t1, CTDAR_T10482.jpg). | +| [IIIT-AR-13K](http://cvit.iiit.ac.in/usodi/iiitar13k.php) | Data sets constructed by manually annotating figures or pages from publicly available annual reports, containing 5 categories:table, figure, natural image, logo, and signature. 
| +| [TableBank](https://github.com/doc-analysis/TableBank) | For table detection and recognition of large datasets, including Word and Latex document formats | +| [CDLA](https://github.com/buptlihang/CDLA) | Chinese document layout analysis data set, for Chinese literature (paper) scenarios, including 10 categories:Table, Figure, Figure caption, Table, Table caption, Header, Footer, Reference, Equation | +| [DocBank](https://github.com/doc-analysis/DocBank) | Large-scale dataset (500K document pages) constructed using weakly supervised methods for document layout analysis, containing 12 categories:Author, Caption, Date, Equation, Figure, Footer, List, Paragraph, Reference, Section, Table, Title | -left_blocks = text_blocks.filter_by(left_interval, center=True) -left_blocks.sort(key = lambda b:b.coordinates[1]) -right_blocks = [b for b in text_blocks if b not in left_blocks] -right_blocks.sort(key = lambda b:b.coordinates[1]) +## 5. Start training -# the two lists are merged and the indexes are added in order -text_blocks = lp.Layout([b.set(id = idx) for idx, b in enumerate(left_blocks + right_blocks)]) +Training scripts, evaluation scripts, and prediction scripts are provided, and the PubLayNet pre-training model is used as an example in this section. -# display result -show_img = lp.draw_box(image, text_blocks, - box_width=3, - show_element_id=True) -show_img.show() +If you do not want training and directly experience the following process of model evaluation, prediction, motion to static, and inference, you can download the provided pre-trained model (PubLayNet dataset) and skip this part. + +``` +mkdir pretrained_model +cd pretrained_model +# Download PubLayNet pre-training model(Direct experience model evaluates, predicts, and turns static) +wget https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout.pdparams +# Download the PubLaynet inference model(Direct experience model reasoning) +wget https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_infer.tar ``` -Displays results with only the "Text" category: +If the test image is Chinese, the pre-trained model of Chinese CDLA dataset can be downloaded to identify 10 types of document regions:Table, Figure, Figure caption, Table, Table caption, Header, Footer, Reference, Equation,Download the training model and inference model of Model 'picodet_lcnet_x1_0_fgd_layout_cdla' in [layout analysis model](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/ppstructure/docs/models_list.md)。If only the table area in the image is detected, you can download the pre-trained model of the table dataset, and download the training model and inference model of the 'picodet_LCnet_x1_0_FGd_layout_table' model in [Layout Analysis model](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/ppstructure/docs/models_list.md) -
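The inference archive downloaded above can later be passed straight to `deploy/python/infer.py` via `--model_dir` (see section 7.2). A minimal sketch of unpacking it, assuming you are still inside the `pretrained_model/` directory created by the commands above:

```bash
# Unpack the downloaded inference model for direct use in the inference step
tar -xf picodet_lcnet_x1_0_fgd_layout_infer.tar
cd ..
```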
- -
+### 5.1. Train + +Train: + +* Modify Profile + +If you want to train your own data set, you need to modify the data configuration and the number of categories in the configuration file. + + +Using 'configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml' as an example, the change is as follows: + +```yaml +metric: COCO +# Number of categories +num_classes: 5 + +TrainDataset: + !COCODataSet + # Modify to your own training data directory + image_dir: train + # Modify to your own training data label file + anno_path: train.json + # Modify to your own training data root directory + dataset_dir: /root/publaynet/ + data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd'] + +EvalDataset: + !COCODataSet + # Modify to your own validation data directory + image_dir: val + # Modify to your own validation data label file + anno_path: val.json + # Modify to your own validation data root + dataset_dir: /root/publaynet/ + +TestDataset: + !ImageFolder + # Modify to your own test data label file + anno_path: /root/publaynet/val.json +``` + +* Start training. During training, PP picodet pre training model will be downloaded by default. There is no need to download in advance. + +```bash +# GPU training supports single-card and multi-card training +# The training log is automatically saved to the log directory + +# Single card training +export CUDA_VISIBLE_DEVICES=0 +python3 tools/train.py \ + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + --eval + +# Multi-card training, with the -- GPUS parameter specifying the card number +export CUDA_VISIBLE_DEVICES=0,1,2,3 +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py \ + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + --eval +``` + +**Attention:**If the video memory is out during training, adjust Batch_size in TrainReader and base_LR in LearningRate. The published config is obtained by 8-card training. If the number of GPU cards is changed to 1, then the base_LR needs to be reduced by 8 times. + +After starting training normally, you will see the following log output: + +``` +[08/15 04:02:30] ppdet.utils.checkpoint INFO: Finish loading model weights: /root/.cache/paddle/weights/LCNet_x1_0_pretrained.pdparams +[08/15 04:02:46] ppdet.engine INFO: Epoch: [0] [ 0/1929] learning_rate: 0.040000 loss_vfl: 1.216707 loss_bbox: 1.142163 loss_dfl: 0.544196 loss: 2.903065 eta: 17 days, 13:50:26 batch_cost: 15.7452 data_cost: 2.9112 ips: 1.5243 images/s +[08/15 04:03:19] ppdet.engine INFO: Epoch: [0] [ 20/1929] learning_rate: 0.064000 loss_vfl: 1.180627 loss_bbox: 0.939552 loss_dfl: 0.442436 loss: 2.628206 eta: 2 days, 12:18:53 batch_cost: 1.5770 data_cost: 0.0008 ips: 15.2184 images/s +[08/15 04:03:47] ppdet.engine INFO: Epoch: [0] [ 40/1929] learning_rate: 0.088000 loss_vfl: 0.543321 loss_bbox: 1.071401 loss_dfl: 0.457817 loss: 2.057003 eta: 2 days, 0:07:03 batch_cost: 1.3190 data_cost: 0.0007 ips: 18.1954 images/s +[08/15 04:04:12] ppdet.engine INFO: Epoch: [0] [ 60/1929] learning_rate: 0.112000 loss_vfl: 0.630989 loss_bbox: 0.859183 loss_dfl: 0.384702 loss: 1.883143 eta: 1 day, 19:01:29 batch_cost: 1.2177 data_cost: 0.0006 ips: 19.7087 images/s +``` + +- `--eval` indicates that the best model is saved as `output/picodet_lcnet_x1_0_layout/best_accuracy` by default during the evaluation process 。 + +**Note that the configuration file for prediction / evaluation must be consistent with the training.** + +### 5.2. 
FGD Distillation Training + +PaddleDetection supports FGD-based [Focal and Global Knowledge Distillation for Detectors]( https://arxiv.org/abs/2111.11837v1) The training process of the target detection model of distillation, FGD distillation is divided into two parts `Focal` and `Global`. `Focal` Distillation separates the foreground and background of the image, allowing the student model to focus on the key pixels of the foreground and background features of the teacher model respectively;` Global`Distillation section reconstructs the relationships between different pixels and transfers them from the teacher to the student to compensate for the global information lost in `Focal`Distillation. + +Change the dataset and modify the data configuration and number of categories in the [TODO] configuration, referring to 4.1. Start training: + +```bash +# Single Card Training +export CUDA_VISIBLE_DEVICES=0 +python3 tools/train.py \ + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + --slim_config configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x2_5_layout.yml \ + --eval +``` + +- `-c`: Specify the model configuration file. +- `--slim_config`: Specify the compression policy profile. + +## 6. Model evaluation and prediction + +### 6.1. Indicator evaluation + + Model parameters in training are saved by default in `output/picodet_ Lcnet_ X1_ 0_ Under the layout` directory. When evaluating indicators, you need to set `weights` to point to the saved parameter file.Assessment datasets can be accessed via `configs/picodet/legacy_ Model/application/layout_ Analysis/picodet_ Lcnet_ X1_ 0_ Layout. Yml` . Modify `EvalDataset` : `img_dir`,`anno_ Path`and`dataset_dir` setting. + +```bash +# GPU evaluation, weights as weights to be measured +python3 tools/eval.py \ + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + -o weights=./output/picodet_lcnet_x1_0_layout/best_model +``` -## 4. Results +The following information will be printed out, such as mAP, AP0.5, etc. + +```py + Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.935 + Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.979 + Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.956 + Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.404 + Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.782 + Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.969 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.539 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.938 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.949 + Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.495 + Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.818 + Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.978 +[08/15 07:07:09] ppdet.engine INFO: Total sample number: 11245, averge FPS: 24.405059207157436 +[08/15 07:07:09] ppdet.engine INFO: Best test bbox ap is 0.935. 
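# Note: the "IoU=0.50:0.95" line above is the COCO-style mAP referred to in the text
# (it matches the final "Best test bbox ap" value), and the "IoU=0.50" line is AP0.5.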
+``` + +If you use the provided pre-training model for evaluation or the FGD distillation training model, replace the `weights` model path and execute the following command for evaluation: + +``` +python3 tools/eval.py \ + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + --slim_config configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x2_5_layout.yml \ + -o weights=output/picodet_lcnet_x2_5_layout/best_model +``` + +- `-c`: Specify the model configuration file. +- `--slim_config`: Specify the distillation policy profile. +- `-o weights`: Specify the model path trained by the distillation algorithm. -| Dataset | mAP | CPU time cost | GPU time cost | -| --------- | ---- | ------------- | ------------- | -| PubLayNet | 93.6 | 1713.7ms | 66.6ms | -| TableBank | 96.2 | 1968.4ms | 65.1ms | +### 6.2. Test Layout Analysis Results -**Envrionment:** -​ **CPU:** Intel(R) Xeon(R) CPU E5-2650 v4 @ 2.20GHz,24core +The profile predicted to be used must be consistent with the training, for example, if you pass `python3 tools/train'. Py-c configs/picodet/legacy_ Model/application/layout_ Analysis/picodet_ Lcnet_ X1_ 0_ Layout. Yml` completed the training process for the model. -​ **GPU:** a single NVIDIA Tesla P40 +With trained PaddleDetection model, you can use the following commands to make model predictions. -## 5. Training +```bash +python3 tools/infer.py \ + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + -o weights='output/picodet_lcnet_x1_0_layout/best_model.pdparams' \ + --infer_img='docs/images/layout.jpg' \ + --output_dir=output_dir/ \ + --draw_threshold=0.5 +``` -The above model is based on [PaddleDetection](https://github.com/PaddlePaddle/PaddleDetection). If you want to train your own layout parser model,please refer to:[train_layoutparser_model](train_layoutparser_model.md) +- `--infer_img`: Reasoning for a single picture can also be done via `--infer_ Dir`Inform all pictures in the file. +- `--output_dir`: Specify the path to save the visualization results. +- `--draw_threshold`:Specify the NMS threshold for drawing the result box. + +If you use the provided pre-training model for prediction or the FGD distillation training model, change the `weights` model path and execute the following command to make the prediction: + +``` +python3 tools/infer.py \ + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + --slim_config configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x2_5_layout.yml \ + -o weights='output/picodet_lcnet_x2_5_layout/best_model.pdparams' \ + --infer_img='docs/images/layout.jpg' \ + --output_dir=output_dir/ \ + --draw_threshold=0.5 +``` + + +## 7. Model Export and Inference + + +### 7.1 Model Export + +The inference model (the model saved by `paddle.jit.save`) is generally a solidified model saved after the model training is completed, and is mostly used to give prediction in deployment. + +The model saved during the training process is the checkpoints model, which saves the parameters of the model and is mostly used to resume training. + +Compared with the checkpoints model, the inference model will additionally save the structural information of the model. Therefore, it is easier to deploy because the model structure and model parameters are already solidified in the inference model file, and is suitable for integration with actual systems. 
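To make the "structure plus parameters" point concrete: once the export step below has produced `output_inference/picodet_lcnet_x1_0_layout/`, that directory can be loaded directly with the Paddle Inference Python API, with no training code involved. The snippet is only a sketch under that assumption; the supported end-to-end route remains the `deploy/python/infer.py` script shown in section 7.2.

```python
# Minimal sketch: load an exported layout model with the Paddle Inference API.
# Assumes the export command below has already been run.
from paddle.inference import Config, create_predictor

config = Config(
    "output_inference/picodet_lcnet_x1_0_layout/model.pdmodel",
    "output_inference/picodet_lcnet_x1_0_layout/model.pdiparams")
config.disable_gpu()  # or config.enable_use_gpu(200, 0) to run on GPU 0
predictor = create_predictor(config)

# Graph and weights are self-contained; input/output names come from the model file.
print(predictor.get_input_names(), predictor.get_output_names())
```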
+ +Layout analysis model to inference model steps are as follows: + +```bash +python3 tools/export_model.py \ + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + -o weights=output/picodet_lcnet_x1_0_layout/best_model \ + --output_dir=output_inference/ +``` + +* If no post-export processing is required, specify:`-o export.benchmark=True`(If -o already exists, delete -o here) +* If you do not need to export NMS, specify:`-o export.nms=False` + +After successful conversion, there are three files in the directory: + +``` +output_inference/picodet_lcnet_x1_0_layout/ + ├── model.pdiparams # inference Parameter file for model + ├── model.pdiparams.info # inference Model parameter information, ignorable + └── model.pdmodel # inference Model Structure File for Model +``` + +If you change the `weights` model path using the provided pre-training model to the Inference model, or using the FGD distillation training model, the model to inference model steps are as follows: + +```bash +python3 tools/export_model.py \ + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + --slim_config configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x2_5_layout.yml \ + -o weights=./output/picodet_lcnet_x2_5_layout/best_model \ + --output_dir=output_inference/ +``` + +### 7.2 Model inference + +Replace model_with the provided inference training model for inference or the FGD distillation training `model_dir`Inference model path, execute the following commands for inference: + +```bash +python3 deploy/python/infer.py \ + --model_dir=output_inference/picodet_lcnet_x1_0_layout/ \ + --image_file=docs/images/layout.jpg \ + --device=CPU +``` + +- --device:Specify the GPU or CPU device + +When model inference is complete, you will see the following log output: + +``` +------------------------------------------ +----------- Model Configuration ----------- +Model Arch: PicoDet +Transform Order: +--transform op: Resize +--transform op: NormalizeImage +--transform op: Permute +--transform op: PadStride +-------------------------------------------- +class_id:0, confidence:0.9921, left_top:[20.18,35.66],right_bottom:[341.58,600.99] +class_id:0, confidence:0.9914, left_top:[19.77,611.42],right_bottom:[341.48,901.82] +class_id:0, confidence:0.9904, left_top:[369.36,375.10],right_bottom:[691.29,600.59] +class_id:0, confidence:0.9835, left_top:[369.60,608.60],right_bottom:[691.38,736.72] +class_id:0, confidence:0.9830, left_top:[369.58,805.38],right_bottom:[690.97,901.80] +class_id:0, confidence:0.9716, left_top:[383.68,271.44],right_bottom:[688.93,335.39] +class_id:0, confidence:0.9452, left_top:[370.82,34.48],right_bottom:[688.10,63.54] +class_id:1, confidence:0.8712, left_top:[370.84,771.03],right_bottom:[519.30,789.13] +class_id:3, confidence:0.9856, left_top:[371.28,67.85],right_bottom:[685.73,267.72] +save result to: output/layout.jpg +Test iter 0 +------------------ Inference Time Info ---------------------- +total_time(ms): 2196.0, img_num: 1 +average latency time(ms): 2196.00, QPS: 0.455373 +preprocess_time(ms): 2172.50, inference_time(ms): 11.90, postprocess_time(ms): 11.60 +``` + +- Model:model structure +- Transform Order:Preprocessing operation +- class_id, confidence, left_top, right_bottom:Indicates category id, confidence level, upper left coordinate, lower right coordinate, respectively +- save result to:Save path of visual layout analysis results, default save to ./output folder +- inference time info:Inference 
time, where preprocess_time represents the preprocessing time, Inference_time represents the model prediction time, and postprocess_time represents the post-processing time + +The result of visualization layout is shown in the following figure + +
+ +
+ + + +## Citations + +``` +@inproceedings{zhong2019publaynet, + title={PubLayNet: largest dataset ever for document layout analysis}, + author={Zhong, Xu and Tang, Jianbin and Yepes, Antonio Jimeno}, + booktitle={2019 International Conference on Document Analysis and Recognition (ICDAR)}, + year={2019}, + volume={}, + number={}, + pages={1015-1022}, + doi={10.1109/ICDAR.2019.00166}, + ISSN={1520-5363}, + month={Sep.}, + organization={IEEE} +} + +@inproceedings{yang2022focal, + title={Focal and global knowledge distillation for detectors}, + author={Yang, Zhendong and Li, Zhe and Jiang, Xiaohu and Gong, Yuan and Yuan, Zehuan and Zhao, Danpei and Yuan, Chun}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={4643--4652}, + year={2022} +} +``` diff --git a/ppstructure/layout/README_ch.md b/ppstructure/layout/README_ch.md index 69419ad1eee3523d498b0d845a72133b619b3787..46d2ba74b2d5c579d4b25cf0cadac22ebc32e5b2 100644 --- a/ppstructure/layout/README_ch.md +++ b/ppstructure/layout/README_ch.md @@ -1,133 +1,469 @@ -[English](README.md) | 简体中文 +简体中文 | [English](README.md) + +# 版面分析 + +- [1. 简介](#1-简介) +- [2. 快速开始](#2-快速开始) +- [3. 安装](#3-安装) + - [3.1 安装PaddlePaddle](#31-安装paddlepaddle) + - [3.2 安装PaddleDetection](#32-安装paddledetection) +- [4. 数据准备](#4-数据准备) + - [4.1 英文数据集](#41-英文数据集) + - [4.2 更多数据集](#42-更多数据集) +- [5. 开始训练](#5-开始训练) + - [5.1 启动训练](#51-启动训练) + - [5.2 FGD蒸馏训练](#52-fgd蒸馏训练) +- [6. 模型评估与预测](#6-模型评估与预测) + - [6.1 指标评估](#61-指标评估) + - [6.2 测试版面分析结果](#62-测试版面分析结果) +- [7 模型导出与预测](#7-模型导出与预测) + - [7.1 模型导出](#71-模型导出) + - [7.2 模型推理](#72-模型推理) + +## 1. 简介 + +版面分析指的是对图片形式的文档进行区域划分,定位其中的关键区域,如文字、标题、表格、图片等。版面分析算法基于[PaddleDetection](https://github.com/PaddlePaddle/PaddleDetection)的轻量模型PP-PicoDet进行开发。 -# 版面分析使用说明 +
+ +
+ +## 2. 快速开始 + +PP-Structure目前提供了中文、英文、表格三类文档版面分析模型,模型链接见 [models_list](../docs/models_list.md#1-版面分析模型)。也提供了whl包的形式方便快速使用,详见 [quickstart](../docs/quickstart.md)。 -- [1. 安装whl包](#1) -- [2. 使用](#2) -- [3. 后处理](#3) -- [4. 指标](#4) -- [5. 训练版面分析模型](#5) +## 3. 安装 + +### 3.1. 安装PaddlePaddle + +- **(1) 安装PaddlePaddle** - -## 1. 安装whl包 ```bash -pip install -U https://paddleocr.bj.bcebos.com/whl/layoutparser-0.0.0-py3-none-any.whl +python3 -m pip install --upgrade pip + +# GPU安装 +python3 -m pip install "paddlepaddle-gpu>=2.3" -i https://mirror.baidu.com/pypi/simple + +# CPU安装 +python3 -m pip install "paddlepaddle>=2.3" -i https://mirror.baidu.com/pypi/simple ``` +更多需求,请参照[安装文档](https://www.paddlepaddle.org.cn/install/quick)中的说明进行操作。 - -## 2. 使用 +### 3.2. 安装PaddleDetection -使用layoutparser识别给定文档的布局: +- **(1)下载PaddleDetection源码** -```python -import cv2 -import layoutparser as lp -image = cv2.imread("ppstructure/docs/table/layout.jpg") -image = image[..., ::-1] +```bash +git clone https://github.com/PaddlePaddle/PaddleDetection.git +``` -# 加载模型 -model = lp.PaddleDetectionLayoutModel(config_path="lp://PubLayNet/ppyolov2_r50vd_dcn_365e_publaynet/config", - threshold=0.5, - label_map={0: "Text", 1: "Title", 2: "List", 3:"Table", 4:"Figure"}, - enforce_cpu=False, - enable_mkldnn=True) -# 检测 -layout = model.detect(image) +- **(2)安装其他依赖** -# 显示结果 -show_img = lp.draw_box(image, layout, box_width=3, show_element_type=True) -show_img.show() +```bash +cd PaddleDetection +python3 -m pip install -r requirements.txt ``` -下图展示了结果,不同颜色的检测框表示不同的类别,并通过`show_element_type`在框的左上角显示具体类别: +## 4. 数据准备 -
- -
+如果希望直接体验预测过程,可以跳过数据准备,下载我们提供的预训练模型。 -`PaddleDetectionLayoutModel`函数参数说明如下: +### 4.1. 英文数据集 -| 参数 | 含义 | 默认值 | 备注 | -| :------------: | :-------------------------: | :---------: | :----------------------------------------------------------: | -| config_path | 模型配置路径 | None | 指定config_path会自动下载模型(仅第一次,之后模型存在,不会再下载) | -| model_path | 模型路径 | None | 本地模型路径,config_path和model_path必须设置一个,不能同时为None | -| threshold | 预测得分的阈值 | 0.5 | \ | -| input_shape | reshape之后图片尺寸 | [3,640,640] | \ | -| batch_size | 测试batch size | 1 | \ | -| label_map | 类别映射表 | None | 设置config_path时,可以为None,根据数据集名称自动获取label_map,设置model_path时需要手动指定 | -| enforce_cpu | 代码是否使用CPU运行 | False | 设置为False表示使用GPU,True表示强制使用CPU | -| enforce_mkldnn | CPU预测中是否开启MKLDNN加速 | True | \ | -| thread_num | 设置CPU线程数 | 10 | \ | +下载文档分析数据集[PubLayNet](https://developer.ibm.com/exchanges/data/all/publaynet/)(数据集96G),包含5个类:`{0: "Text", 1: "Title", 2: "List", 3:"Table", 4:"Figure"}` -目前支持以下几种模型配置和label map,您可以通过修改 `--config_path`和 `--label_map`使用这些模型,从而检测不同类型的内容: +``` +# 下载数据 +wget https://dax-cdn.cdn.appdomain.cloud/dax-publaynet/1.0.0/publaynet.tar.gz +# 解压数据 +tar -xvf publaynet.tar.gz +``` -| dataset | config_path | label_map | -| ------------------------------------------------------------ | ------------------------------------------------------------ | --------------------------------------------------------- | -| [TableBank](https://doc-analysis.github.io/tablebank-page/index.html) word | lp://TableBank/ppyolov2_r50vd_dcn_365e_tableBank_word/config | {0:"Table"} | -| TableBank latex | lp://TableBank/ppyolov2_r50vd_dcn_365e_tableBank_latex/config | {0:"Table"} | -| [PubLayNet](https://github.com/ibm-aur-nlp/PubLayNet) | lp://PubLayNet/ppyolov2_r50vd_dcn_365e_publaynet/config | {0: "Text", 1: "Title", 2: "List", 3:"Table", 4:"Figure"} | +解压之后的**目录结构:** -* TableBank word和TableBank latex分别在word文档、latex文档数据集训练; -* 下载的TableBank数据集里同时包含word和latex。 +``` +|-publaynet + |- test + |- PMC1277013_00004.jpg + |- PMC1291385_00002.jpg + | ... + |- train.json + |- train + |- PMC1291385_00002.jpg + |- PMC1277013_00004.jpg + | ... + |- val.json + |- val + |- PMC538274_00004.jpg + |- PMC539300_00004.jpg + | ... +``` + +**数据分布:** + +| File or Folder | Description | num | +| :------------- | :------------- | ------- | +| `train/` | 训练集图片 | 335,703 | +| `val/` | 验证集图片 | 11,245 | +| `test/` | 测试集图片 | 11,405 | +| `train.json` | 训练集标注文件 | - | +| `val.json` | 验证集标注文件 | - | + +**标注格式:** + +json文件包含所有图像的标注,数据以字典嵌套的方式存放,包含以下key: + +- info,表示标注文件info。 - -## 3. 
后处理 +- licenses,表示标注文件licenses。 -版面分析检测包含多个类别,如果只想获取指定类别(如"Text"类别)的检测框、可以使用下述代码: +- images,表示标注文件中图像信息列表,每个元素是一张图像的信息。如下为其中一张图像的信息: -```python -# 接上面代码 -# 首先过滤特定文本类型的区域 -text_blocks = lp.Layout([b for b in layout if b.type=='Text']) -figure_blocks = lp.Layout([b for b in layout if b.type=='Figure']) + ``` + { + 'file_name': 'PMC4055390_00006.jpg', # file_name + 'height': 601, # image height + 'width': 792, # image width + 'id': 341427 # image id + } + ``` -# 因为在图像区域内可能检测到文本区域,所以只需要删除它们 -text_blocks = lp.Layout([b for b in text_blocks \ - if not any(b.is_in(b_fig) for b_fig in figure_blocks)]) +- annotations,表示标注文件中目标物体的标注信息列表,每个元素是一个目标物体的标注信息。如下为其中一个目标物体的标注信息: -# 对文本区域排序并分配id -h, w = image.shape[:2] + ``` + { -left_interval = lp.Interval(0, w/2*1.05, axis='x').put_on_canvas(image) + 'segmentation': # 物体的分割标注 + 'area': 60518.099043117836, # 物体的区域面积 + 'iscrowd': 0, # iscrowd + 'image_id': 341427, # image id + 'bbox': [50.58, 490.86, 240.15, 252.16], # bbox [x1,y1,w,h] + 'category_id': 1, # category_id + 'id': 3322348 # image id + } + ``` -left_blocks = text_blocks.filter_by(left_interval, center=True) -left_blocks.sort(key = lambda b:b.coordinates[1]) +### 4.2. 更多数据集 -right_blocks = [b for b in text_blocks if b not in left_blocks] -right_blocks.sort(key = lambda b:b.coordinates[1]) +我们提供了CDLA(中文版面分析)、TableBank(表格版面分析)等数据集的下连接,处理为上述标注文件json格式,即可以按相同方式进行训练。 -# 最终合并两个列表,并按顺序添加索引 -text_blocks = lp.Layout([b.set(id = idx) for idx, b in enumerate(left_blocks + right_blocks)]) +| dataset | 简介 | +| ------------------------------------------------------------ | ------------------------------------------------------------ | +| [cTDaR2019_cTDaR](https://cndplab-founder.github.io/cTDaR2019/) | 用于表格检测(TRACKA)和表格识别(TRACKB)。图片类型包含历史数据集(以cTDaR_t0开头,如cTDaR_t00872.jpg)和现代数据集(以cTDaR_t1开头,cTDaR_t10482.jpg)。 | +| [IIIT-AR-13K](http://cvit.iiit.ac.in/usodi/iiitar13k.php) | 手动注释公开的年度报告中的图形或页面而构建的数据集,包含5类:table, figure, natural image, logo, and signature | +| [CDLA](https://github.com/buptlihang/CDLA) | 中文文档版面分析数据集,面向中文文献类(论文)场景,包含10类:Table、Figure、Figure caption、Table、Table caption、Header、Footer、Reference、Equation | +| [TableBank](https://github.com/doc-analysis/TableBank) | 用于表格检测和识别大型数据集,包含Word和Latex2种文档格式 | +| [DocBank](https://github.com/doc-analysis/DocBank) | 使用弱监督方法构建的大规模数据集(500K文档页面),用于文档布局分析,包含12类:Author、Caption、Date、Equation、Figure、Footer、List、Paragraph、Reference、Section、Table、Title | + + +## 5. 开始训练 + +提供了训练脚本、评估脚本和预测脚本,本节将以PubLayNet预训练模型为例进行讲解。 + +如果不希望训练,直接体验后面的模型评估、预测、动转静、推理的流程,可以下载提供的预训练模型(PubLayNet数据集),并跳过本部分。 -# 显示结果 -show_img = lp.draw_box(image, text_blocks, - box_width=3, - show_element_id=True) -show_img.show() +``` +mkdir pretrained_model +cd pretrained_model +# 下载PubLayNet预训练模型(直接体验模型评估、预测、动转静) +wget https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout.pdparams +# 下载PubLaynet推理模型(直接体验模型推理) +wget https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_infer.tar ``` -显示只有"Text"类别的结果: +如果测试图片为中文,可以下载中文CDLA数据集的预训练模型,识别10类文档区域:Table、Figure、Figure caption、Table、Table caption、Header、Footer、Reference、Equation,在[版面分析模型](../docs/models_list.md)中下载`picodet_lcnet_x1_0_fgd_layout_cdla`模型的训练模型和推理模型。如果只检测图片中的表格区域,可以下载表格数据集的预训练模型,在[版面分析模型](../docs/models_list.md)中下载`picodet_lcnet_x1_0_fgd_layout_table`模型的训练模型和推理模型。 -
- -
+### 5.1. 启动训练 + +开始训练: + +* 修改配置文件 + +如果你希望训练自己的数据集,需要修改配置文件中的数据配置、类别数。 + + +以`configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml` 为例,修改的内容如下所示。 + +```yaml +metric: COCO +# 类别数 +num_classes: 5 + +TrainDataset: + !COCODataSet + # 修改为你自己的训练数据目录 + image_dir: train + # 修改为你自己的训练数据标签文件 + anno_path: train.json + # 修改为你自己的训练数据根目录 + dataset_dir: /root/publaynet/ + data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd'] + +EvalDataset: + !COCODataSet + # 修改为你自己的验证数据目录 + image_dir: val + # 修改为你自己的验证数据标签文件 + anno_path: val.json + # 修改为你自己的验证数据根目录 + dataset_dir: /root/publaynet/ + +TestDataset: + !ImageFolder + # 修改为你自己的测试数据标签文件 + anno_path: /root/publaynet/val.json +``` + +* 开始训练,在训练时,会默认下载PP-PicoDet预训练模型,这里无需预先下载。 + +```bash +# GPU训练 支持单卡,多卡训练 +# 训练日志会自动保存到 log 目录中 + +# 单卡训练 +export CUDA_VISIBLE_DEVICES=0 +python3 tools/train.py \ + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + --eval + +# 多卡训练,通过--gpus参数指定卡号 +export CUDA_VISIBLE_DEVICES=0,1,2,3 +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py \ + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + --eval +``` + +**注意:**如果训练时显存out memory,将TrainReader中batch_size调小,同时LearningRate中base_lr等比例减小。发布的config均由8卡训练得到,如果改变GPU卡数为1,那么base_lr需要减小8倍。 + +正常启动训练后,会看到以下log输出: + +``` +[08/15 04:02:30] ppdet.utils.checkpoint INFO: Finish loading model weights: /root/.cache/paddle/weights/LCNet_x1_0_pretrained.pdparams +[08/15 04:02:46] ppdet.engine INFO: Epoch: [0] [ 0/1929] learning_rate: 0.040000 loss_vfl: 1.216707 loss_bbox: 1.142163 loss_dfl: 0.544196 loss: 2.903065 eta: 17 days, 13:50:26 batch_cost: 15.7452 data_cost: 2.9112 ips: 1.5243 images/s +[08/15 04:03:19] ppdet.engine INFO: Epoch: [0] [ 20/1929] learning_rate: 0.064000 loss_vfl: 1.180627 loss_bbox: 0.939552 loss_dfl: 0.442436 loss: 2.628206 eta: 2 days, 12:18:53 batch_cost: 1.5770 data_cost: 0.0008 ips: 15.2184 images/s +[08/15 04:03:47] ppdet.engine INFO: Epoch: [0] [ 40/1929] learning_rate: 0.088000 loss_vfl: 0.543321 loss_bbox: 1.071401 loss_dfl: 0.457817 loss: 2.057003 eta: 2 days, 0:07:03 batch_cost: 1.3190 data_cost: 0.0007 ips: 18.1954 images/s +[08/15 04:04:12] ppdet.engine INFO: Epoch: [0] [ 60/1929] learning_rate: 0.112000 loss_vfl: 0.630989 loss_bbox: 0.859183 loss_dfl: 0.384702 loss: 1.883143 eta: 1 day, 19:01:29 batch_cost: 1.2177 data_cost: 0.0006 ips: 19.7087 images/s +``` + +- `--eval`表示训练的同时,进行评估, 评估过程中默认将最佳模型,保存为 `output/picodet_lcnet_x1_0_layout/best_accuracy` 。 + +**注意,预测/评估时的配置文件请务必与训练一致。** + +### 5.2. FGD蒸馏训练 + +PaddleDetection支持了基于FGD([Focal and Global Knowledge Distillation for Detectors](https://arxiv.org/abs/2111.11837v1))蒸馏的目标检测模型训练过程,FGD蒸馏分为两个部分`Focal`和`Global`。`Focal`蒸馏分离图像的前景和背景,让学生模型分别关注教师模型的前景和背景部分特征的关键像素;`Global`蒸馏部分重建不同像素之间的关系并将其从教师转移到学生,以补偿`Focal`蒸馏中丢失的全局信息。 + +更换数据集,修改【TODO】配置中的数据配置、类别数,具体可以参考4.1。启动训练: + +```bash +# 单卡训练 +export CUDA_VISIBLE_DEVICES=0 +python3 tools/train.py \ + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + --slim_config configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x2_5_layout.yml \ + --eval +``` + +- `-c`: 指定模型配置文件。 +- `--slim_config`: 指定压缩策略配置文件。 + +## 6. 模型评估与预测 - -## 4. 指标 +### 6.1. 
指标评估 -| Dataset | mAP | CPU time cost | GPU time cost | -| --------- | ---- | ------------- | ------------- | -| PubLayNet | 93.6 | 1713.7ms | 66.6ms | -| TableBank | 96.2 | 1968.4ms | 65.1ms | +训练中模型参数默认保存在`output/picodet_lcnet_x1_0_layout`目录下。在评估指标时,需要设置`weights`指向保存的参数文件。评估数据集可以通过 `configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml` 修改`EvalDataset`中的 `image_dir`、`anno_path`和`dataset_dir` 设置。 -**Envrionment:** +```bash +# GPU 评估, weights 为待测权重 +python3 tools/eval.py \ + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + -o weights=./output/picodet_lcnet_x1_0_layout/best_model +``` + +会输出以下信息,打印出mAP、AP0.5等信息。 + +```py + Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.935 + Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.979 + Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.956 + Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.404 + Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.782 + Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.969 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.539 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.938 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.949 + Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.495 + Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.818 + Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.978 +[08/15 07:07:09] ppdet.engine INFO: Total sample number: 11245, averge FPS: 24.405059207157436 +[08/15 07:07:09] ppdet.engine INFO: Best test bbox ap is 0.935. +``` + +若使用**提供的预训练模型进行评估**,或使用**FGD蒸馏训练的模型**,更换`weights`模型路径,执行如下命令进行评估: + +``` +python3 tools/eval.py \ + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + --slim_config configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x2_5_layout.yml \ + -o weights=output/picodet_lcnet_x2_5_layout/best_model +``` + +- `-c`: 指定模型配置文件。 +- `--slim_config`: 指定蒸馏策略配置文件。 +- `-o weights`: 指定蒸馏算法训好的模型路径。 + +### 6.2 测试版面分析结果 + + +预测使用的配置文件必须与训练一致,如您通过 `python3 tools/train.py -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml` 完成了模型的训练过程。 + +使用 PaddleDetection 训练好的模型,您可以使用如下命令进行模型预测。 + +```bash +python3 tools/infer.py \ + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + -o weights='output/picodet_lcnet_x1_0_layout/best_model.pdparams' \ + --infer_img='docs/images/layout.jpg' \ + --output_dir=output_dir/ \ + --draw_threshold=0.5 +``` + +- `--infer_img`: 推理单张图片,也可以通过`--infer_dir`推理文件中的所有图片。 +- `--output_dir`: 指定可视化结果保存路径。 +- `--draw_threshold`:指定绘制结果框的NMS阈值。 + +若使用**提供的预训练模型进行预测**,或使用**FGD蒸馏训练的模型**,更换`weights`模型路径,执行如下命令进行预测: + +``` +python3 tools/infer.py \ + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + --slim_config configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x2_5_layout.yml \ + -o weights='output/picodet_lcnet_x2_5_layout/best_model.pdparams' \ + --infer_img='docs/images/layout.jpg' \ + --output_dir=output_dir/ \ + --draw_threshold=0.5 +``` + + +## 7. 
模型导出与预测 + + +### 7.1 模型导出 + +inference 模型(`paddle.jit.save`保存的模型) 一般是模型训练,把模型结构和模型参数保存在文件中的固化模型,多用于预测部署场景。 训练过程中保存的模型是checkpoints模型,保存的只有模型的参数,多用于恢复训练等。 与checkpoints模型相比,inference 模型会额外保存模型的结构信息,在预测部署、加速推理上性能优越,灵活方便,适合于实际系统集成。 + +版面分析模型转inference模型步骤如下: + +```bash +python3 tools/export_model.py \ + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + -o weights=output/picodet_lcnet_x1_0_layout/best_model \ + --output_dir=output_inference/ +``` + +* 如无需导出后处理,请指定:`-o export.benchmark=True`(如果-o已出现过,此处删掉-o) +* 如无需导出NMS,请指定:`-o export.nms=False` + +转换成功后,在目录下有三个文件: + +``` +output_inference/picodet_lcnet_x1_0_layout/ + ├── model.pdiparams # inference模型的参数文件 + ├── model.pdiparams.info # inference模型的参数信息,可忽略 + └── model.pdmodel # inference模型的模型结构文件 +``` -​ **CPU:** Intel(R) Xeon(R) CPU E5-2650 v4 @ 2.20GHz,24core +若使用**提供的预训练模型转Inference模型**,或使用**FGD蒸馏训练的模型**,更换`weights`模型路径,模型转inference模型步骤如下: -​ **GPU:** a single NVIDIA Tesla P40 +```bash +python3 tools/export_model.py \ + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + --slim_config configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x2_5_layout.yml \ + -o weights=./output/picodet_lcnet_x2_5_layout/best_model \ + --output_dir=output_inference/ +``` + + + +### 7.2 模型推理 + +若使用**提供的推理训练模型推理**,或使用**FGD蒸馏训练的模型**,更换`model_dir`推理模型路径,执行如下命令进行推理: + +```bash +python3 deploy/python/infer.py \ + --model_dir=output_inference/picodet_lcnet_x1_0_layout/ \ + --image_file=docs/images/layout.jpg \ + --device=CPU +``` - -## 5. 训练版面分析模型 +- --device:指定GPU、CPU设备 -上述模型基于[PaddleDetection](https://github.com/PaddlePaddle/PaddleDetection) 训练,如果您想训练自己的版面分析模型,请参考:[train_layoutparser_model](train_layoutparser_model_ch.md) +模型推理完成,会看到以下log输出 + +``` +------------------------------------------ +----------- Model Configuration ----------- +Model Arch: PicoDet +Transform Order: +--transform op: Resize +--transform op: NormalizeImage +--transform op: Permute +--transform op: PadStride +-------------------------------------------- +class_id:0, confidence:0.9921, left_top:[20.18,35.66],right_bottom:[341.58,600.99] +class_id:0, confidence:0.9914, left_top:[19.77,611.42],right_bottom:[341.48,901.82] +class_id:0, confidence:0.9904, left_top:[369.36,375.10],right_bottom:[691.29,600.59] +class_id:0, confidence:0.9835, left_top:[369.60,608.60],right_bottom:[691.38,736.72] +class_id:0, confidence:0.9830, left_top:[369.58,805.38],right_bottom:[690.97,901.80] +class_id:0, confidence:0.9716, left_top:[383.68,271.44],right_bottom:[688.93,335.39] +class_id:0, confidence:0.9452, left_top:[370.82,34.48],right_bottom:[688.10,63.54] +class_id:1, confidence:0.8712, left_top:[370.84,771.03],right_bottom:[519.30,789.13] +class_id:3, confidence:0.9856, left_top:[371.28,67.85],right_bottom:[685.73,267.72] +save result to: output/layout.jpg +Test iter 0 +------------------ Inference Time Info ---------------------- +total_time(ms): 2196.0, img_num: 1 +average latency time(ms): 2196.00, QPS: 0.455373 +preprocess_time(ms): 2172.50, inference_time(ms): 11.90, postprocess_time(ms): 11.60 +``` + +- Model:模型结构 +- Transform Order:预处理操作 +- class_id、confidence、left_top、right_bottom:分别表示类别id、置信度、左上角坐标、右下角坐标 +- save result to:可视化版面分析结果保存路径,默认保存到`./output`文件夹 +- Inference Time Info:推理时间,其中preprocess_time表示预处理耗时,inference_time表示模型预测耗时,postprocess_time表示后处理耗时 + +可视化版面结果如下图所示 + +
+ +
+ + + +## Citations + +``` +@inproceedings{zhong2019publaynet, + title={PubLayNet: largest dataset ever for document layout analysis}, + author={Zhong, Xu and Tang, Jianbin and Yepes, Antonio Jimeno}, + booktitle={2019 International Conference on Document Analysis and Recognition (ICDAR)}, + year={2019}, + volume={}, + number={}, + pages={1015-1022}, + doi={10.1109/ICDAR.2019.00166}, + ISSN={1520-5363}, + month={Sep.}, + organization={IEEE} +} + +@inproceedings{yang2022focal, + title={Focal and global knowledge distillation for detectors}, + author={Yang, Zhendong and Li, Zhe and Jiang, Xiaohu and Gong, Yuan and Yuan, Zehuan and Zhao, Danpei and Yuan, Chun}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={4643--4652}, + year={2022} +} +``` diff --git a/ppstructure/layout/__init__.py b/ppstructure/layout/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1d11e265597c7c8e39098a228108da3bb954b892 --- /dev/null +++ b/ppstructure/layout/__init__.py @@ -0,0 +1,13 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ppstructure/layout/predict_layout.py b/ppstructure/layout/predict_layout.py new file mode 100755 index 0000000000000000000000000000000000000000..9f8c884e144654901737191141622abfaa872d24 --- /dev/null +++ b/ppstructure/layout/predict_layout.py @@ -0,0 +1,131 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
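# Example usage (a sketch only: `--image_dir` and `--layout_dict_path` are read by the
# code below, while `--layout_model_dir` is assumed from utility.create_predictor(args,
# 'layout', ...); the model and dictionary names follow files referenced elsewhere in
# this change, and the paths are placeholders):
#
#   python3 ppstructure/layout/predict_layout.py \
#       --layout_model_dir=path/to/picodet_lcnet_x1_0_fgd_layout_infer \
#       --layout_dict_path=path/to/layout_publaynet_dict.txt \
#       --image_dir=path/to/images_or_single_image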
+import os +import sys + +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(__dir__) +sys.path.insert(0, os.path.abspath(os.path.join(__dir__, '../..'))) + +os.environ["FLAGS_allocator_strategy"] = 'auto_growth' + +import cv2 +import numpy as np +import time + +import tools.infer.utility as utility +from ppocr.data import create_operators, transform +from ppocr.postprocess import build_post_process +from ppocr.utils.logging import get_logger +from ppocr.utils.utility import get_image_file_list, check_and_read +from ppstructure.utility import parse_args +from picodet_postprocess import PicoDetPostProcess + +logger = get_logger() + + +class LayoutPredictor(object): + def __init__(self, args): + pre_process_list = [{ + 'Resize': { + 'size': [800, 608] + } + }, { + 'NormalizeImage': { + 'std': [0.229, 0.224, 0.225], + 'mean': [0.485, 0.456, 0.406], + 'scale': '1./255.', + 'order': 'hwc' + } + }, { + 'ToCHWImage': None + }, { + 'KeepKeys': { + 'keep_keys': ['image'] + } + }] + postprocess_params = { + 'name': 'PicoDetPostProcess', + "layout_dict_path": args.layout_dict_path, + "score_threshold": args.layout_score_threshold, + "nms_threshold": args.layout_nms_threshold, + } + + self.preprocess_op = create_operators(pre_process_list) + self.postprocess_op = build_post_process(postprocess_params) + self.predictor, self.input_tensor, self.output_tensors, self.config = \ + utility.create_predictor(args, 'layout', logger) + + def __call__(self, img): + ori_im = img.copy() + data = {'image': img} + data = transform(data, self.preprocess_op) + img = data[0] + + if img is None: + return None, 0 + + img = np.expand_dims(img, axis=0) + img = img.copy() + + preds, elapse = 0, 1 + starttime = time.time() + + self.input_tensor.copy_from_cpu(img) + self.predictor.run() + + np_score_list, np_boxes_list = [], [] + output_names = self.predictor.get_output_names() + num_outs = int(len(output_names) / 2) + for out_idx in range(num_outs): + np_score_list.append( + self.predictor.get_output_handle(output_names[out_idx]) + .copy_to_cpu()) + np_boxes_list.append( + self.predictor.get_output_handle(output_names[ + out_idx + num_outs]).copy_to_cpu()) + preds = dict(boxes=np_score_list, boxes_num=np_boxes_list) + + post_preds = self.postprocess_op(ori_im, img, preds) + elapse = time.time() - starttime + return post_preds, elapse + + +def main(args): + image_file_list = get_image_file_list(args.image_dir) + layout_predictor = LayoutPredictor(args) + count = 0 + total_time = 0 + + repeats = 50 + for image_file in image_file_list: + img, flag, _ = check_and_read(image_file) + if not flag: + img = cv2.imread(image_file) + if img is None: + logger.info("error in loading image:{}".format(image_file)) + continue + + layout_res, elapse = layout_predictor(img) + + logger.info("result: {}".format(layout_res)) + + if count > 0: + total_time += elapse + count += 1 + logger.info("Predict time of {}: {}".format(image_file, elapse)) + + +if __name__ == "__main__": + main(parse_args()) diff --git a/ppstructure/layout/train_layoutparser_model.md b/ppstructure/layout/train_layoutparser_model.md deleted file mode 100644 index e877c9c0c901e8be8299101daa5ce6248de0a1dc..0000000000000000000000000000000000000000 --- a/ppstructure/layout/train_layoutparser_model.md +++ /dev/null @@ -1,174 +0,0 @@ -English | [简体中文](train_layoutparser_model_ch.md) -- [Training layout-parse](#training-layout-parse) - - [1. 
Installation](#1--installation) - - [1.1 Requirements](#11-requirements) - - [1.2 Install PaddleDetection](#12-install-paddledetection) - - [2. Data preparation](#2-data-preparation) - - [3. Configuration](#3-configuration) - - [4. Training](#4-training) - - [5. Prediction](#5-prediction) - - [6. Deployment](#6-deployment) - - [6.1 Export model](#61-export-model) - - [6.2 Inference](#62-inference) - -# Training layout-parse - -## 1. Installation - -### 1.1 Requirements - -- PaddlePaddle 2.1 -- OS 64 bit -- Python 3(3.5.1+/3.6/3.7/3.8/3.9),64 bit -- pip/pip3(9.0.1+), 64 bit -- CUDA >= 10.1 -- cuDNN >= 7.6 - -### 1.2 Install PaddleDetection - -```bash -# Clone PaddleDetection repository -cd -git clone https://github.com/PaddlePaddle/PaddleDetection.git - -cd PaddleDetection -# Install other dependencies -pip install -r requirements.txt -``` - -For more installation tutorials, please refer to: [Install doc](https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.1/docs/tutorials/INSTALL_cn.md) - -## 2. Data preparation - -Download the [PubLayNet](https://github.com/ibm-aur-nlp/PubLayNet) dataset - -```bash -cd PaddleDetection/dataset/ -mkdir publaynet -# execute the command,download PubLayNet -wget -O publaynet.tar.gz https://dax-cdn.cdn.appdomain.cloud/dax-publaynet/1.0.0/publaynet.tar.gz?_ga=2.104193024.1076900768.1622560733-649911202.1622560733 -# unpack -tar -xvf publaynet.tar.gz -``` - -PubLayNet directory structure after decompressing : - -| File or Folder | Description | num | -| :------------- | :----------------------------------------------- | ------- | -| `train/` | Images in the training subset | 335,703 | -| `val/` | Images in the validation subset | 11,245 | -| `test/` | Images in the testing subset | 11,405 | -| `train.json` | Annotations for training images | 1 | -| `val.json` | Annotations for validation images | 1 | -| `LICENSE.txt` | Plaintext version of the CDLA-Permissive license | 1 | -| `README.txt` | Text file with the file names and description | 1 | - -For other datasets,please refer to [the PrepareDataSet]((https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.1/docs/tutorials/PrepareDataSet.md) ) - -## 3. Configuration - -We use the `configs/ppyolo/ppyolov2_r50vd_dcn_365e_coco.yml` configuration for training,the configuration file is as follows - -```bash -_BASE_: [ - '../datasets/coco_detection.yml', - '../runtime.yml', - './_base_/ppyolov2_r50vd_dcn.yml', - './_base_/optimizer_365e.yml', - './_base_/ppyolov2_reader.yml', -] - -snapshot_epoch: 8 -weights: output/ppyolov2_r50vd_dcn_365e_coco/model_final -``` -The `ppyolov2_r50vd_dcn_365e_coco.yml` configuration depends on other configuration files, in this case: - -- coco_detection.yml:mainly explains the path of training data and verification data - -- runtime.yml:mainly describes the common parameters, such as whether to use the GPU and how many epoch to save model etc. - -- optimizer_365e.yml:mainly explains the learning rate and optimizer configuration - -- ppyolov2_r50vd_dcn.yml:mainly describes the model and the network - -- ppyolov2_reader.yml:mainly describes the configuration of data readers, such as batch size and number of concurrent loading child processes, and also includes post preprocessing, such as resize and data augmention etc. - - -Modify the preceding files, such as the dataset path and batch size etc. - -## 4. 
Training - -PaddleDetection provides single-card/multi-card training mode to meet various training needs of users: - -* GPU single card training - -```bash -export CUDA_VISIBLE_DEVICES=0 #Don't need to run this command on Windows and Mac -python tools/train.py -c configs/ppyolo/ppyolov2_r50vd_dcn_365e_coco.yml -``` - -* GPU multi-card training - -```bash -export CUDA_VISIBLE_DEVICES=0,1,2,3 -python -m paddle.distributed.launch --gpus 0,1,2,3 tools/train.py -c configs/ppyolo/ppyolov2_r50vd_dcn_365e_coco.yml --eval -``` - ---eval: training while verifying - -* Model recovery training - -During the daily training, if training is interrupted due to some reasons, you can use the -r command to resume the training: - -```bash -export CUDA_VISIBLE_DEVICES=0,1,2,3 -python -m paddle.distributed.launch --gpus 0,1,2,3 tools/train.py -c configs/ppyolo/ppyolov2_r50vd_dcn_365e_coco.yml --eval -r output/ppyolov2_r50vd_dcn_365e_coco/10000 -``` - -Note: If you encounter "`Out of memory error`" , try reducing `batch_size` in the `ppyolov2_reader.yml` file - -## 5. Prediction - -Set parameters and use PaddleDetection to predict: - -```bash -export CUDA_VISIBLE_DEVICES=0 -python tools/infer.py -c configs/ppyolo/ppyolov2_r50vd_dcn_365e_coco.yml --infer_img=images/paper-image.jpg --output_dir=infer_output/ --draw_threshold=0.5 -o weights=output/ppyolov2_r50vd_dcn_365e_coco/model_final --use_vdl=Ture -``` - -`--draw_threshold` is an optional parameter. According to the calculation of [NMS](https://ieeexplore.ieee.org/document/1699659), different threshold will produce different results, ` keep_top_k ` represent the maximum amount of output target, the default value is 10. You can set different value according to your own actual situation。 - -## 6. Deployment - -Use your trained model in Layout Parser - -### 6.1 Export model - -n the process of model training, the model file saved contains the process of forward prediction and back propagation. In the actual industrial deployment, there is no need for back propagation. Therefore, the model should be translated into the model format required by the deployment. The `tools/export_model.py` script is provided in PaddleDetection to export the model. - -The exported model name defaults to `model.*`, Layout Parser's code model is `inference.*`, So change [PaddleDetection/ppdet/engine/trainer. Py ](https://github.com/PaddlePaddle/PaddleDetection/blob/b87a1ea86fa18ce69e44a17ad1b49c1326f19ff9/ppdet/engine/trainer.py# L512) (click on the link to see the detailed line of code), change 'model' to 'inference'. 
- -Execute the script to export model: - -```bash -python tools/export_model.py -c configs/ppyolo/ppyolov2_r50vd_dcn_365e_coco.yml --output_dir=./inference -o weights=output/ppyolov2_r50vd_dcn_365e_coco/model_final.pdparams -``` - -The prediction model is exported to `inference/ppyolov2_r50vd_dcn_365e_coco` ,including:`infer_cfg.yml`(prediction not required), `inference.pdiparams`, `inference.pdiparams.info`,`inference.pdmodel` - -More model export tutorials, please refer to:[EXPORT_MODEL](https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.1/deploy/EXPORT_MODEL.md) - -### 6.2 Inference - -`model_path` represent the trained model path, and layoutparser is used to predict: - -```bash -import layoutparser as lp -model = lp.PaddleDetectionLayoutModel(model_path="inference/ppyolov2_r50vd_dcn_365e_coco", threshold=0.5,label_map={0: "Text", 1: "Title", 2: "List", 3:"Table", 4:"Figure"},enforce_cpu=True,enable_mkldnn=True) -``` - -*** - -More PaddleDetection training tutorials,please reference:[PaddleDetection Training](https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.1/docs/tutorials/GETTING_STARTED_cn.md) - -*** diff --git a/ppstructure/layout/train_layoutparser_model_ch.md b/ppstructure/layout/train_layoutparser_model_ch.md deleted file mode 100644 index a89b0f3819b52c79b86d2ada13bac23e3d1656ed..0000000000000000000000000000000000000000 --- a/ppstructure/layout/train_layoutparser_model_ch.md +++ /dev/null @@ -1,176 +0,0 @@ -[English](train_layoutparser_model.md) | 简体中文 -- [训练版面分析](#训练版面分析) - - [1. 安装](#1-安装) - - [1.1 环境要求](#11-环境要求) - - [1.2 安装PaddleDetection](#12-安装paddledetection) - - [2. 准备数据](#2-准备数据) - - [3. 配置文件改动和说明](#3-配置文件改动和说明) - - [4. PaddleDetection训练](#4-paddledetection训练) - - [5. PaddleDetection预测](#5-paddledetection预测) - - [6. 预测部署](#6-预测部署) - - [6.1 模型导出](#61-模型导出) - - [6.2 layout_parser预测](#62-layout_parser预测) - -# 训练版面分析 - -## 1. 安装 - -### 1.1 环境要求 - -- PaddlePaddle 2.1 -- OS 64 bit -- Python 3(3.5.1+/3.6/3.7/3.8/3.9),64 bit -- pip/pip3(9.0.1+), 64 bit -- CUDA >= 10.1 -- cuDNN >= 7.6 - -### 1.2 安装PaddleDetection - -```bash -# 克隆PaddleDetection仓库 -cd -git clone https://github.com/PaddlePaddle/PaddleDetection.git - -cd PaddleDetection -# 安装其他依赖 -pip install -r requirements.txt -``` - -更多安装教程,请参考: [Install doc](https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.1/docs/tutorials/INSTALL_cn.md) - -## 2. 准备数据 - -下载 [PubLayNet](https://github.com/ibm-aur-nlp/PubLayNet) 数据集: - -```bash -cd PaddleDetection/dataset/ -mkdir publaynet -# 执行命令,下载 -wget -O publaynet.tar.gz https://dax-cdn.cdn.appdomain.cloud/dax-publaynet/1.0.0/publaynet.tar.gz?_ga=2.104193024.1076900768.1622560733-649911202.1622560733 -# 解压 -tar -xvf publaynet.tar.gz -``` - -解压之后PubLayNet目录结构: - -| File or Folder | Description | num | -| :------------- | :----------------------------------------------- | ------- | -| `train/` | Images in the training subset | 335,703 | -| `val/` | Images in the validation subset | 11,245 | -| `test/` | Images in the testing subset | 11,405 | -| `train.json` | Annotations for training images | 1 | -| `val.json` | Annotations for validation images | 1 | -| `LICENSE.txt` | Plaintext version of the CDLA-Permissive license | 1 | -| `README.txt` | Text file with the file names and description | 1 | - -如果使用其它数据集,请参考[准备训练数据](https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.1/docs/tutorials/PrepareDataSet.md) - -## 3. 
配置文件改动和说明 - -我们使用 `configs/ppyolo/ppyolov2_r50vd_dcn_365e_coco.yml`配置进行训练,配置文件摘要如下: - -```bash -_BASE_: [ - '../datasets/coco_detection.yml', - '../runtime.yml', - './_base_/ppyolov2_r50vd_dcn.yml', - './_base_/optimizer_365e.yml', - './_base_/ppyolov2_reader.yml', -] - -snapshot_epoch: 8 -weights: output/ppyolov2_r50vd_dcn_365e_coco/model_final -``` -从中可以看到 `ppyolov2_r50vd_dcn_365e_coco.yml` 配置需要依赖其他的配置文件,在该例子中需要依赖: - -- coco_detection.yml:主要说明了训练数据和验证数据的路径 - -- runtime.yml:主要说明了公共的运行参数,比如是否使用GPU、每多少个epoch存储checkpoint等 - -- optimizer_365e.yml:主要说明了学习率和优化器的配置 - -- ppyolov2_r50vd_dcn.yml:主要说明模型和主干网络的情况 - -- ppyolov2_reader.yml:主要说明数据读取器配置,如batch size,并发加载子进程数等,同时包含读取后预处理操作,如resize、数据增强等等 - - -根据实际情况,修改上述文件,比如数据集路径、batch size等。 - -## 4. PaddleDetection训练 - -PaddleDetection提供了单卡/多卡训练模式,满足用户多种训练需求 - -* GPU 单卡训练 - -```bash -export CUDA_VISIBLE_DEVICES=0 #windows和Mac下不需要执行该命令 -python tools/train.py -c configs/ppyolo/ppyolov2_r50vd_dcn_365e_coco.yml -``` - -* GPU多卡训练 - -```bash -export CUDA_VISIBLE_DEVICES=0,1,2,3 -python -m paddle.distributed.launch --gpus 0,1,2,3 tools/train.py -c configs/ppyolo/ppyolov2_r50vd_dcn_365e_coco.yml --eval -``` - ---eval:表示边训练边验证 - -* 模型恢复训练 - -在日常训练过程中,有的用户由于一些原因导致训练中断,用户可以使用-r的命令恢复训练: - -```bash -export CUDA_VISIBLE_DEVICES=0,1,2,3 -python -m paddle.distributed.launch --gpus 0,1,2,3 tools/train.py -c configs/ppyolo/ppyolov2_r50vd_dcn_365e_coco.yml --eval -r output/ppyolov2_r50vd_dcn_365e_coco/10000 -``` - -注意:如果遇到 "`Out of memory error`" 问题, 尝试在 `ppyolov2_reader.yml` 文件中调小`batch_size` - -## 5. PaddleDetection预测 - -设置参数,使用PaddleDetection预测: - -```bash -export CUDA_VISIBLE_DEVICES=0 -python tools/infer.py -c configs/ppyolo/ppyolov2_r50vd_dcn_365e_coco.yml --infer_img=images/paper-image.jpg --output_dir=infer_output/ --draw_threshold=0.5 -o weights=output/ppyolov2_r50vd_dcn_365e_coco/model_final --use_vdl=Ture -``` - -`--draw_threshold` 是个可选参数. 根据 [NMS](https://ieeexplore.ieee.org/document/1699659) 的计算,不同阈值会产生不同的结果 `keep_top_k`表示设置输出目标的最大数量,默认值为100,用户可以根据自己的实际情况进行设定。 - -## 6. 
预测部署 - -在layout parser中使用自己训练好的模型。 - -### 6.1 模型导出 - -在模型训练过程中保存的模型文件是包含前向预测和反向传播的过程,在实际的工业部署则不需要反向传播,因此需要将模型进行导成部署需要的模型格式。 在PaddleDetection中提供了 `tools/export_model.py`脚本来导出模型。 - -导出模型名称默认是`model.*`,layout parser代码模型名称是`inference.*`, 所以修改[PaddleDetection/ppdet/engine/trainer.py ](https://github.com/PaddlePaddle/PaddleDetection/blob/b87a1ea86fa18ce69e44a17ad1b49c1326f19ff9/ppdet/engine/trainer.py#L512) (点开链接查看详细代码行),将`model`改为`inference`即可。 - -执行导出模型脚本: - -```bash -python tools/export_model.py -c configs/ppyolo/ppyolov2_r50vd_dcn_365e_coco.yml --output_dir=./inference -o weights=output/ppyolov2_r50vd_dcn_365e_coco/model_final.pdparams -``` - -预测模型会导出到`inference/ppyolov2_r50vd_dcn_365e_coco`目录下,分别为`infer_cfg.yml`(预测不需要), `inference.pdiparams`, `inference.pdiparams.info`,`inference.pdmodel` 。 - -更多模型导出教程,请参考:[EXPORT_MODEL](https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.1/deploy/EXPORT_MODEL.md) - -### 6.2 layout_parser预测 - -`model_path`指定训练好的模型路径,使用layout parser进行预测: - -```bash -import layoutparser as lp -model = lp.PaddleDetectionLayoutModel(model_path="inference/ppyolov2_r50vd_dcn_365e_coco", threshold=0.5,label_map={0: "Text", 1: "Title", 2: "List", 3:"Table", 4:"Figure"},enforce_cpu=True,enable_mkldnn=True) -``` - - - -*** - -更多PaddleDetection训练教程,请参考:[PaddleDetection训练](https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.1/docs/tutorials/GETTING_STARTED_cn.md) - -*** diff --git a/ppstructure/pdf2word/README.md b/ppstructure/pdf2word/README.md new file mode 100644 index 0000000000000000000000000000000000000000..564df4063e101e028afbea5c3acab8946196d31d --- /dev/null +++ b/ppstructure/pdf2word/README.md @@ -0,0 +1,28 @@ +# PDF2WORD + +PDF2WORD是PaddleOCR社区开发者[whjdark](https://github.com/whjdark) 基于PP-Structure智能文档分析模型实现的PDF转换Word应用程序,提供可直接安装的exe,方便windows用户运行 + +## 1.使用 + +### 应用程序 + +1. 下载与安装:针对Windows用户,根据[软件下载]()一节下载软件后,运行 `pdf2word.exe` 。若您下载的是lite版本,安装过程中会在线下载环境依赖、模型等必要资源,安装时间较长,请确保网络畅通。serve版本打包了相关依赖,安装时间较短,可按需下载。 + +2. 转换:由于PP-Structure根据中英文数据分别进行适配,在转换相应文件时可**根据文档语言进行相应选择**。 + +### 脚本运行 + +首次运行需要将切换路径到 `/ppstructure/pdf2word` ,然后运行代码 + +``` +python pdf2word.py +``` + +## 2.软件下载 + +如需获取已打包程序,可以扫描下方二维码,关注公众号填写问卷后,加入PaddleOCR官方交流群免费获取20G OCR学习大礼包,内含OCR场景应用集合(包含数码管、液晶屏、车牌、高精度SVTR模型等7个垂类模型)、《动手学OCR》电子书、课程回放视频、前沿论文等重磅资料 + +
+ +
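> Note: the `python pdf2word.py` entry point above launches a Qt GUI, but the conversion itself is the same PP-Structure + layout-recovery pipeline exposed by `predict_system.py`. The sketch below condenses what the `readImage()` and `Worker.predictAndSave()` functions added later in this patch do, purely as an illustration: the input file name is a placeholder, and the detection/recognition/table/layout model directories and dict paths must still be filled in the way `initPredictor()` does.

```python
import os
import cv2
import fitz  # PyMuPDF, the same library readImage() in pdf2word.py relies on
import numpy as np
from PIL import Image

from ppstructure.predict_system import StructureSystem, save_structure_res
from ppstructure.utility import parse_args
from ppstructure.recovery.recovery_to_doc import sorted_layout_boxes, convert_info_docx


def pdf_to_page_images(pdf_path):
    """Render every PDF page to a BGR ndarray, mirroring readImage() in pdf2word.py."""
    imgs = []
    with fitz.open(pdf_path) as pdf:
        for pg in range(pdf.pageCount):
            pm = pdf[pg].getPixmap(matrix=fitz.Matrix(2, 2), alpha=False)
            pil_img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples)
            imgs.append(cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2BGR))
    return imgs


args = parse_args()
args.ocr, args.recovery = True, True
# placeholder: point det/rec/table/layout model dirs and dict paths at your
# downloaded models, exactly as initPredictor() does in pdf2word.py
engine = StructureSystem(args)

output_dir, doc_name = "./output", "demo"   # placeholder output location
os.makedirs(output_dir, exist_ok=True)
all_res = []
for idx, page in enumerate(pdf_to_page_images("demo.pdf")):   # placeholder input file
    res, _ = engine(page, img_idx=idx)
    save_structure_res(res, output_dir, doc_name, idx)
    all_res += sorted_layout_boxes(res, page.shape[1])
convert_info_docx(page, all_res, output_dir, doc_name, save_pdf=False)
```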
+ diff --git a/ppstructure/pdf2word/icons/chinese.png b/ppstructure/pdf2word/icons/chinese.png new file mode 100644 index 0000000000000000000000000000000000000000..328e2fff73bd75188fa888aae45c8cb4ca844f57 Binary files /dev/null and b/ppstructure/pdf2word/icons/chinese.png differ diff --git a/ppstructure/pdf2word/icons/english.png b/ppstructure/pdf2word/icons/english.png new file mode 100644 index 0000000000000000000000000000000000000000..536c4a910ae6fc05040f4958477a34aeae891ea0 Binary files /dev/null and b/ppstructure/pdf2word/icons/english.png differ diff --git a/ppstructure/pdf2word/icons/folder-open.png b/ppstructure/pdf2word/icons/folder-open.png new file mode 100644 index 0000000000000000000000000000000000000000..ab5f55f5a4819add116113b55f717f7a21aeafdd Binary files /dev/null and b/ppstructure/pdf2word/icons/folder-open.png differ diff --git a/ppstructure/pdf2word/icons/folder-plus.png b/ppstructure/pdf2word/icons/folder-plus.png new file mode 100644 index 0000000000000000000000000000000000000000..01ce6c10b0ed3e3975edbebbc8e886f846fabe8d Binary files /dev/null and b/ppstructure/pdf2word/icons/folder-plus.png differ diff --git a/ppstructure/pdf2word/pdf2word.py b/ppstructure/pdf2word/pdf2word.py new file mode 100644 index 0000000000000000000000000000000000000000..6b394094f3b24bfaa7829541f4f9a2a48f3d493f --- /dev/null +++ b/ppstructure/pdf2word/pdf2word.py @@ -0,0 +1,441 @@ +import sys +import tarfile +import os +import time +import datetime +import functools +import cv2 +import platform +import numpy as np +from qtpy.QtWidgets import QApplication, QWidget, QPushButton, QProgressBar, \ + QGridLayout, QMessageBox, QLabel, QFileDialog +from qtpy.QtCore import Signal, QThread, QObject +from qtpy.QtGui import QImage, QPixmap, QIcon + +file = os.path.dirname(os.path.abspath(__file__)) +root = os.path.abspath(os.path.join(file, '../../')) +sys.path.append(file) +sys.path.insert(0, root) + +from ppstructure.predict_system import StructureSystem, save_structure_res +from ppstructure.utility import parse_args, draw_structure_result +from ppocr.utils.network import download_with_progressbar +from ppstructure.recovery.recovery_to_doc import sorted_layout_boxes, convert_info_docx +# from ScreenShotWidget import ScreenShotWidget + +__APPNAME__ = "pdf2word" +__VERSION__ = "0.1.1" + +URLs_EN = { + # 下载超英文轻量级PP-OCRv3模型的检测模型并解压 + "en_PP-OCRv3_det_infer": "https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar", + # 下载英文轻量级PP-OCRv3模型的识别模型并解压 + "en_PP-OCRv3_rec_infer": "https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar", + # 下载超轻量级英文表格英文模型并解压 + "en_ppstructure_mobile_v2.0_SLANet_infer": "https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/en_ppstructure_mobile_v2.0_SLANet_infer.tar", + # 英文版面分析模型 + "picodet_lcnet_x1_0_fgd_layout_infer": "https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_infer.tar", +} +DICT_EN = { + "rec_char_dict_path": "en_dict.txt", + "layout_dict_path": "layout_publaynet_dict.txt", +} + +URLs_CN = { + # 下载超中文轻量级PP-OCRv3模型的检测模型并解压 + "cn_PP-OCRv3_det_infer": "https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar", + # 下载中文轻量级PP-OCRv3模型的识别模型并解压 + "cn_PP-OCRv3_rec_infer": "https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar", + # 下载超轻量级英文表格英文模型并解压 + "cn_ppstructure_mobile_v2.0_SLANet_infer": "https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/en_ppstructure_mobile_v2.0_SLANet_infer.tar", + # 中文版面分析模型 + "picodet_lcnet_x1_0_fgd_layout_cdla_infer": 
"https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_cdla_infer.tar", +} +DICT_CN = { + "rec_char_dict_path": "ppocr_keys_v1.txt", + "layout_dict_path": "layout_cdla_dict.txt", +} + + + +def QImageToCvMat(incomingImage) -> np.array: + ''' + Converts a QImage into an opencv MAT format + ''' + + incomingImage = incomingImage.convertToFormat(QImage.Format.Format_RGBA8888) + + width = incomingImage.width() + height = incomingImage.height() + + ptr = incomingImage.bits() + ptr.setsize(height * width * 4) + arr = np.frombuffer(ptr, np.uint8).reshape((height, width, 4)) + return arr + + +def readImage(image_file) -> list: + if os.path.basename(image_file)[-3:] in ['pdf']: + import fitz + from PIL import Image + imgs = [] + with fitz.open(image_file) as pdf: + for pg in range(0, pdf.pageCount): + page = pdf[pg] + mat = fitz.Matrix(2, 2) + pm = page.getPixmap(matrix=mat, alpha=False) + + # if width or height > 2000 pixels, don't enlarge the image + if pm.width > 2000 or pm.height > 2000: + pm = page.getPixmap(matrix=fitz.Matrix(1, 1), alpha=False) + + img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples) + img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR) + imgs.append(img) + else: + img = cv2.imread(image_file, cv2.IMREAD_COLOR) + if img is not None: + imgs = [img] + + return imgs + + +class Worker(QThread): + progressBarValue = Signal(int) + endsignal = Signal() + loopFlag = True + + def __init__(self, predictors, save_pdf, vis_font_path): + super(Worker, self).__init__() + self.predictors = predictors + self.save_pdf = save_pdf + self.vis_font_path = vis_font_path + self.lang = 'EN' + self.imagePaths = [] + self.outputDir = None + self.setStackSize(1024*1024) + + def setImagePath(self, imagePaths): + self.imagePaths = imagePaths + + def setLang(self, lang): + self.lang = lang + + def setOutputDir(self, outputDir): + self.outputDir = outputDir + + def predictAndSave(self, imgs, img_name): + all_res = [] + for index, img in enumerate(imgs): + res, time_dict = self.predictors[self.lang](img) + + # save output + save_structure_res(res, self.outputDir, img_name) + draw_img = draw_structure_result(img, res, self.vis_font_path) + img_save_path = os.path.join(self.outputDir, img_name, 'show_{}.jpg'.format(index)) + if res != []: + cv2.imwrite(img_save_path, draw_img) + + # recovery + h, w, _ = img.shape + res = sorted_layout_boxes(res, w) + all_res += res + + try: + convert_info_docx(img, all_res, self.outputDir, img_name, self.save_pdf) + except Exception as ex: + print(self, + "error in layout recovery image:{}, err msg: {}".format( + img_name, ex)) + + print('result save to {}'.format(self.outputDir)) + + def run(self): + try: + findex = 0 + os.makedirs(self.outputDir, exist_ok=True) + for i, image_file in enumerate(self.imagePaths): + if self.loopFlag == True: + imgs = readImage(image_file) + if len(imgs) == 0: + continue + img_name = os.path.basename(image_file).split('.')[0] + os.makedirs(os.path.join(self.outputDir, img_name), exist_ok=True) + self.predictAndSave(imgs, img_name) + findex += 1 + self.progressBarValue.emit(findex) + else: + break + self.endsignal.emit() + self.exec() + except Exception as e: + print(e) + raise + + +class APP_Image2Doc(QWidget): + def __init__(self): + super().__init__() + self.setFixedHeight(90) + self.setFixedWidth(400) + + # settings + self.imagePaths = [] + # self.screenShotWg = ScreenShotWidget() + self.screenShot = None + self.save_pdf = False + self.output_dir = None + self.vis_font_path = os.path.join(root, + 
"doc", "fonts", "simfang.ttf") + + # ProgressBar + self.pb = QProgressBar() + self.pb.setRange(0, 100) + self.pb.setValue(0) + + # 初始化界面 + self.setupUi() + + # 下载模型 + self.downloadModels(URLs_EN) + self.downloadModels(URLs_CN) + + # 初始化模型 + predictors = { + 'EN': self.initPredictor('EN'), + 'CN': self.initPredictor('CN'), + } + + # 设置工作进程 + self._thread = Worker(predictors, self.save_pdf, self.vis_font_path) + self._thread.progressBarValue.connect(self.handleProgressBarSingal) + self._thread.endsignal.connect(self.handleEndsignalSignal) + self._thread.finished.connect(QObject.deleteLater) + self.time_start = 0 # save start time + + def setupUi(self): + self.setObjectName("MainWindow") + self.setWindowTitle(__APPNAME__ + " " + __VERSION__) + + layout = QGridLayout() + + self.openFileButton = QPushButton("打开文件") + self.openFileButton.setIcon(QIcon(QPixmap("./icons/folder-plus.png"))) + layout.addWidget(self.openFileButton, 0, 0, 1, 1) + self.openFileButton.clicked.connect(self.handleOpenFileSignal) + + # screenShotButton = QPushButton("截图识别") + # layout.addWidget(screenShotButton, 0, 1, 1, 1) + # screenShotButton.clicked.connect(self.screenShotSlot) + # screenShotButton.setEnabled(False) # temporarily disenble + + self.startCNButton = QPushButton("中文转换") + self.startCNButton.setIcon(QIcon(QPixmap("./icons/chinese.png"))) + layout.addWidget(self.startCNButton, 0, 1, 1, 1) + self.startCNButton.clicked.connect( + functools.partial(self.handleStartSignal, 'CN')) + + self.startENButton = QPushButton("英文转换") + self.startENButton.setIcon(QIcon(QPixmap("./icons/english.png"))) + layout.addWidget(self.startENButton, 0, 2, 1, 1) + self.startENButton.clicked.connect( + functools.partial(self.handleStartSignal, 'EN')) + + self.showResultButton = QPushButton("显示结果") + self.showResultButton.setIcon(QIcon(QPixmap("./icons/folder-open.png"))) + layout.addWidget(self.showResultButton, 0, 3, 1, 1) + self.showResultButton.clicked.connect(self.handleShowResultSignal) + + # ProgressBar + layout.addWidget(self.pb, 2, 0, 1, 4) + # time estimate label + self.timeEstLabel = QLabel( + ("Time Left: --")) + layout.addWidget(self.timeEstLabel, 3, 0, 1, 4) + + self.setLayout(layout) + + def downloadModels(self, URLs): + # using custom model + tar_file_name_list = [ + 'inference.pdiparams', + 'inference.pdiparams.info', + 'inference.pdmodel', + 'model.pdiparams', + 'model.pdiparams.info', + 'model.pdmodel' + ] + model_path = os.path.join(root, 'inference') + os.makedirs(model_path, exist_ok=True) + + # download and unzip models + for name in URLs.keys(): + url = URLs[name] + print("Try downloading file: {}".format(url)) + tarname = url.split('/')[-1] + tarpath = os.path.join(model_path, tarname) + if os.path.exists(tarpath): + print("File have already exist. 
skip") + else: + try: + download_with_progressbar(url, tarpath) + except Exception as e: + print("Error occurred when downloading file, error message:") + print(e) + + # unzip model tar + try: + with tarfile.open(tarpath, 'r') as tarObj: + storage_dir = os.path.join(model_path, name) + os.makedirs(storage_dir, exist_ok=True) + for member in tarObj.getmembers(): + filename = None + for tar_file_name in tar_file_name_list: + if tar_file_name in member.name: + filename = tar_file_name + if filename is None: + continue + file = tarObj.extractfile(member) + with open( + os.path.join(storage_dir, filename), + 'wb') as f: + f.write(file.read()) + except Exception as e: + print("Error occurred when unziping file, error message:") + print(e) + + def initPredictor(self, lang='EN'): + # init predictor args + args = parse_args() + args.table_max_len = 488 + args.ocr = True + args.recovery = True + args.save_pdf = self.save_pdf + args.table_char_dict_path = os.path.join(root, + "ppocr", "utils", "dict", "table_structure_dict.txt") + if lang == 'EN': + args.det_model_dir = os.path.join(root, # 此处从这里找到模型存放位置 + "inference", "en_PP-OCRv3_det_infer") + args.rec_model_dir = os.path.join(root, + "inference", "en_PP-OCRv3_rec_infer") + args.table_model_dir = os.path.join(root, + "inference", "en_ppstructure_mobile_v2.0_SLANet_infer") + args.output = os.path.join(root, "output") # 结果保存路径 + args.layout_model_dir = os.path.join(root, + "inference", "picodet_lcnet_x1_0_fgd_layout_infer") + lang_dict = DICT_EN + elif lang == 'CN': + args.det_model_dir = os.path.join(root, # 此处从这里找到模型存放位置 + "inference", "cn_PP-OCRv3_det_infer") + args.rec_model_dir = os.path.join(root, + "inference", "cn_PP-OCRv3_rec_infer") + args.table_model_dir = os.path.join(root, + "inference", "cn_ppstructure_mobile_v2.0_SLANet_infer") + args.output = os.path.join(root, "output") # 结果保存路径 + args.layout_model_dir = os.path.join(root, + "inference", "picodet_lcnet_x1_0_fgd_layout_cdla_infer") + lang_dict = DICT_CN + else: + raise ValueError("Unsupported language") + args.rec_char_dict_path = os.path.join(root, + "ppocr", "utils", + lang_dict['rec_char_dict_path']) + args.layout_dict_path = os.path.join(root, + "ppocr", "utils", "dict", "layout_dict", + lang_dict['layout_dict_path']) + # init predictor + return StructureSystem(args) + + def handleOpenFileSignal(self): + ''' + 可以多选图像文件 + ''' + selectedFiles = QFileDialog.getOpenFileNames(self, + "多文件选择", "/", "图片文件 (*.png *.jpeg *.jpg *.bmp *.pdf)")[0] + if len(selectedFiles) > 0: + self.imagePaths = selectedFiles + self.screenShot = None # discard screenshot temp image + self.pb.setRange(0, len(self.imagePaths)) + self.pb.setValue(0) + + # def screenShotSlot(self): + # ''' + # 选定图像文件和截图的转换过程只能同时进行一个 + # 截图只能同时转换一个 + # ''' + # self.screenShotWg.start() + # if self.screenShotWg.captureImage: + # self.screenShot = self.screenShotWg.captureImage + # self.imagePaths.clear() # discard openfile temp list + # self.pb.setRange(0, 1) + # self.pb.setValue(0) + + def handleStartSignal(self, lang): + if self.screenShot: # for screenShot + img_name = 'screenshot_' + time.strftime("%Y%m%d%H%M%S", time.localtime()) + image = QImageToCvMat(self.screenShot) + self.predictAndSave(image, img_name, lang) + # update Progress Bar + self.pb.setValue(1) + QMessageBox.information(self, + u'Information', "文档提取完成") + elif len(self.imagePaths) > 0 : # for image file selection + # Must set image path list and language before start + self.output_dir = os.path.join( + os.path.dirname(self.imagePaths[0]), "output") # output_dir 
shold be same as imagepath + self._thread.setOutputDir(self.output_dir) + self._thread.setImagePath(self.imagePaths) + self._thread.setLang(lang) + # disenble buttons + self.openFileButton.setEnabled(False) + self.startCNButton.setEnabled(False) + self.startENButton.setEnabled(False) + # 启动工作进程 + self._thread.start() + self.time_start = time.time() # log start time + QMessageBox.information(self, + u'Information', "开始转换") + else: + QMessageBox.warning(self, + u'Information', "请选择要识别的文件或截图") + + def handleShowResultSignal(self): + if self.output_dir is None: + return + if os.path.exists(self.output_dir): + if platform.system() == 'Windows': + os.startfile(self.output_dir) + else: + os.system('open ' + os.path.normpath(self.output_dir)) + else: + QMessageBox.information(self, + u'Information', "输出文件不存在") + + def handleProgressBarSingal(self, i): + self.pb.setValue(i) + # calculate time left of recognition + lenbar = self.pb.maximum() + avg_time = (time.time() - self.time_start) / i # Use average time to prevent time fluctuations + time_left = str(datetime.timedelta(seconds=avg_time * (lenbar - i))).split(".")[0] # Remove microseconds + self.timeEstLabel.setText(f"Time Left: {time_left}") # show time left + + def handleEndsignalSignal(self): + # enble buttons + self.openFileButton.setEnabled(True) + self.startCNButton.setEnabled(True) + self.startENButton.setEnabled(True) + QMessageBox.information(self, u'Information', "转换结束") + + +def main(): + app = QApplication(sys.argv) + + window = APP_Image2Doc() # 创建对象 + window.show() # 全屏显示窗口 + + QApplication.processEvents() + sys.exit(app.exec()) + + +if __name__ == "__main__": + main() diff --git a/ppstructure/predict_system.py b/ppstructure/predict_system.py index d6f2e24240ff783e14dbd61efdd27877f9ec39ff..71147d3af8ec666d368234270dcb0d16aaf91938 100644 --- a/ppstructure/predict_system.py +++ b/ppstructure/predict_system.py @@ -18,7 +18,7 @@ import subprocess __dir__ = os.path.dirname(os.path.abspath(__file__)) sys.path.append(__dir__) -sys.path.append(os.path.abspath(os.path.join(__dir__, '..'))) +sys.path.insert(0, os.path.abspath(os.path.join(__dir__, '../'))) os.environ["FLAGS_allocator_strategy"] = 'auto_growth' import cv2 @@ -27,14 +27,13 @@ import numpy as np import time import logging from copy import deepcopy -from attrdict import AttrDict -from ppocr.utils.utility import get_image_file_list, check_and_read_gif +from ppocr.utils.utility import get_image_file_list, check_and_read from ppocr.utils.logging import get_logger from tools.infer.predict_system import TextSystem +from ppstructure.layout.predict_layout import LayoutPredictor from ppstructure.table.predict_table import TableSystem, to_excel from ppstructure.utility import parse_args, draw_structure_result -from ppstructure.recovery.recovery_to_doc import convert_info_docx logger = get_logger() @@ -42,6 +41,14 @@ logger = get_logger() class StructureSystem(object): def __init__(self, args): self.mode = args.mode + self.recovery = args.recovery + + self.image_orientation_predictor = None + if args.image_orientation: + import paddleclas + self.image_orientation_predictor = paddleclas.PaddleClas( + model_name="text_image_orientation") + if self.mode == 'structure': if not args.show_log: logger.setLevel(logging.INFO) @@ -51,28 +58,14 @@ class StructureSystem(object): "When args.layout is false, args.ocr is automatically set to false" ) args.drop_score = 0 - # init layout and ocr model + # init model + self.layout_predictor = None self.text_system = None + self.table_system = None if 
args.layout: - import layoutparser as lp - config_path = None - model_path = None - if os.path.isdir(args.layout_path_model): - model_path = args.layout_path_model - else: - config_path = args.layout_path_model - self.table_layout = lp.PaddleDetectionLayoutModel( - config_path=config_path, - model_path=model_path, - label_map=args.layout_label_map, - threshold=0.5, - enable_mkldnn=args.enable_mkldnn, - enforce_cpu=not args.use_gpu, - thread_num=args.cpu_threads) + self.layout_predictor = LayoutPredictor(args) if args.ocr: self.text_system = TextSystem(args) - else: - self.table_layout = None if args.table: if self.text_system is not None: self.table_system = TableSystem( @@ -80,39 +73,78 @@ class StructureSystem(object): self.text_system.text_recognizer) else: self.table_system = TableSystem(args) - else: - self.table_system = None - elif self.mode == 'vqa': + elif self.mode == 'kie': raise NotImplementedError - def __call__(self, img, return_ocr_result_in_table=False): + def __call__(self, img, return_ocr_result_in_table=False, img_idx=0): + time_dict = { + 'image_orientation': 0, + 'layout': 0, + 'table': 0, + 'table_match': 0, + 'det': 0, + 'rec': 0, + 'kie': 0, + 'all': 0 + } + start = time.time() + if self.image_orientation_predictor is not None: + tic = time.time() + cls_result = self.image_orientation_predictor.predict( + input_data=img) + cls_res = next(cls_result) + angle = cls_res[0]['label_names'][0] + cv_rotate_code = { + '90': cv2.ROTATE_90_COUNTERCLOCKWISE, + '180': cv2.ROTATE_180, + '270': cv2.ROTATE_90_CLOCKWISE + } + img = cv2.rotate(img, cv_rotate_code[angle]) + toc = time.time() + time_dict['image_orientation'] = toc - tic if self.mode == 'structure': ori_im = img.copy() - if self.table_layout is not None: - layout_res = self.table_layout.detect(img[..., ::-1]) + if self.layout_predictor is not None: + layout_res, elapse = self.layout_predictor(img) + time_dict['layout'] += elapse else: h, w = ori_im.shape[:2] - layout_res = [AttrDict(coordinates=[0, 0, w, h], type='Table')] + layout_res = [dict(bbox=None, label='table')] res_list = [] for region in layout_res: res = '' - x1, y1, x2, y2 = region.coordinates - x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2) - roi_img = ori_im[y1:y2, x1:x2, :] - if region.type == 'Table': + if region['bbox'] is not None: + x1, y1, x2, y2 = region['bbox'] + x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2) + roi_img = ori_im[y1:y2, x1:x2, :] + else: + x1, y1, x2, y2 = 0, 0, w, h + roi_img = ori_im + if region['label'] == 'table': if self.table_system is not None: - res = self.table_system(roi_img, - return_ocr_result_in_table) + res, table_time_dict = self.table_system( + roi_img, return_ocr_result_in_table) + time_dict['table'] += table_time_dict['table'] + time_dict['table_match'] += table_time_dict['match'] + time_dict['det'] += table_time_dict['det'] + time_dict['rec'] += table_time_dict['rec'] else: if self.text_system is not None: - if args.recovery: + if self.recovery: wht_im = np.ones(ori_im.shape, dtype=ori_im.dtype) wht_im[y1:y2, x1:x2, :] = roi_img - filter_boxes, filter_rec_res = self.text_system(wht_im) + filter_boxes, filter_rec_res, ocr_time_dict = self.text_system( + wht_im) else: - filter_boxes, filter_rec_res = self.text_system(roi_img) - # remove style char + filter_boxes, filter_rec_res, ocr_time_dict = self.text_system( + roi_img) + time_dict['det'] += ocr_time_dict['det'] + time_dict['rec'] += ocr_time_dict['rec'] + + # remove style char, + # when using the recognition model trained on the PubtabNet dataset, + # 
it will recognize the text format in the table, such as style_token = [ '', '', '', '', '', '', '', '', '', @@ -125,7 +157,7 @@ class StructureSystem(object): for token in style_token: if token in rec_str: rec_str = rec_str.replace(token, '') - if not args.recovery: + if not self.recovery: box += [x1, y1] res.append({ 'text': rec_str, @@ -133,37 +165,43 @@ class StructureSystem(object): 'text_region': box.tolist() }) res_list.append({ - 'type': region.type, + 'type': region['label'].lower(), 'bbox': [x1, y1, x2, y2], 'img': roi_img, - 'res': res + 'res': res, + 'img_idx': img_idx }) - return res_list - elif self.mode == 'vqa': + end = time.time() + time_dict['all'] = end - start + return res_list, time_dict + elif self.mode == 'kie': raise NotImplementedError - return None + return None, None -def save_structure_res(res, save_folder, img_name): +def save_structure_res(res, save_folder, img_name, img_idx=0): excel_save_folder = os.path.join(save_folder, img_name) os.makedirs(excel_save_folder, exist_ok=True) res_cp = deepcopy(res) # save res with open( - os.path.join(excel_save_folder, 'res.txt'), 'w', + os.path.join(excel_save_folder, 'res_{}.txt'.format(img_idx)), + 'w', encoding='utf8') as f: for region in res_cp: roi_img = region.pop('img') f.write('{}\n'.format(json.dumps(region))) - if region['type'] == 'Table' and len(region[ + if region['type'].lower() == 'table' and len(region[ 'res']) > 0 and 'html' in region['res']: - excel_path = os.path.join(excel_save_folder, - '{}.xlsx'.format(region['bbox'])) + excel_path = os.path.join( + excel_save_folder, + '{}_{}.xlsx'.format(region['bbox'], img_idx)) to_excel(region['res']['html'], excel_path) - elif region['type'] == 'Figure': - img_path = os.path.join(excel_save_folder, - '{}.jpg'.format(region['bbox'])) + elif region['type'].lower() == 'figure': + img_path = os.path.join( + excel_save_folder, + '{}_{}.jpg'.format(region['bbox'], img_idx)) cv2.imwrite(img_path, roi_img) @@ -179,31 +217,50 @@ def main(args): for i, image_file in enumerate(image_file_list): logger.info("[{}/{}] {}".format(i, img_num, image_file)) - img, flag = check_and_read_gif(image_file) + img, flag_gif, flag_pdf = check_and_read(image_file) img_name = os.path.basename(image_file).split('.')[0] - if not flag: + if not flag_gif and not flag_pdf: img = cv2.imread(image_file) - if img is None: - logger.error("error in loading image:{}".format(image_file)) - continue - starttime = time.time() - res = structure_sys(img) - - if structure_sys.mode == 'structure': - save_structure_res(res, save_folder, img_name) - draw_img = draw_structure_result(img, res, args.vis_font_path) - img_save_path = os.path.join(save_folder, img_name, 'show.jpg') - elif structure_sys.mode == 'vqa': - raise NotImplementedError - # draw_img = draw_ser_results(img, res, args.vis_font_path) - # img_save_path = os.path.join(save_folder, img_name + '.jpg') - cv2.imwrite(img_save_path, draw_img) - logger.info('result save to {}'.format(img_save_path)) - if args.recovery: - convert_info_docx(img, res, save_folder, img_name) - elapse = time.time() - starttime - logger.info("Predict time : {:.3f}s".format(elapse)) + + if not flag_pdf: + if img is None: + logger.error("error in loading image:{}".format(image_file)) + continue + imgs = [img] + else: + imgs = img + + all_res = [] + for index, img in enumerate(imgs): + res, time_dict = structure_sys(img, img_idx=index) + if structure_sys.mode == 'structure' and res != []: + save_structure_res(res, save_folder, img_name, index) + draw_img = 
draw_structure_result(img, res, args.vis_font_path) + img_save_path = os.path.join(save_folder, img_name, + 'show_{}.jpg'.format(index)) + elif structure_sys.mode == 'kie': + raise NotImplementedError + # draw_img = draw_ser_results(img, res, args.vis_font_path) + # img_save_path = os.path.join(save_folder, img_name + '.jpg') + if res != []: + cv2.imwrite(img_save_path, draw_img) + logger.info('result save to {}'.format(img_save_path)) + if args.recovery and res != []: + from ppstructure.recovery.recovery_to_doc import sorted_layout_boxes, convert_info_docx + h, w, _ = img.shape + res = sorted_layout_boxes(res, w) + all_res += res + + if args.recovery and all_res != []: + try: + convert_info_docx(img, all_res, save_folder, img_name, + args.save_pdf) + except Exception as ex: + logger.error("error in layout recovery image:{}, err msg: {}". + format(image_file, ex)) + continue + logger.info("Predict time : {:.3f}s".format(time_dict['all'])) if __name__ == "__main__": diff --git a/ppstructure/recovery/README.md b/ppstructure/recovery/README.md index 883dbef3e829dfa213644b610af1ca279dac8641..011d6e12fda1b09c7a87367fb887a5c99a4ae00a 100644 --- a/ppstructure/recovery/README.md +++ b/ppstructure/recovery/README.md @@ -1,23 +1,30 @@ English | [简体中文](README_ch.md) -- [Getting Started](#getting-started) - - [1. Introduction](#1) - - [2. Install](#2) - - [2.1 Installation dependencies](#2.1) +# Layout Recovery + +- [1. Introduction](#1) +- [2. Install](#2) + - [2.1 Install PaddlePaddle](#2.1) - [2.2 Install PaddleOCR](#2.2) - - [3. Quick Start](#3) +- [3. Quick Start](#3) + - [3.1 Download models](#3.1) + - [3.2 Layout recovery](#3.2) +- [4. More](#4) -## 1. Introduction +## 1. Introduction Layout recovery means that after OCR recognition, the content is still arranged like the original document pictures, and the paragraphs are output to word document in the same order. -Layout recovery combines [layout analysis](../layout/README.md)、[table recognition](../table/README.md) to better recover images, tables, titles, etc. -The following figure shows the result: +Layout recovery combines [layout analysis](../layout/README.md)、[table recognition](../table/README.md) to better recover images, tables, titles, etc. supports input files in PDF and document image formats in Chinese and English. The following figure shows the effect of restoring the layout of English and Chinese documents: + +
+ +
- +
@@ -25,22 +32,19 @@ The following figure shows the result: -### 2.1 Install dependencies - -- **(1) Install PaddlePaddle** +### 2.1 Install PaddlePaddle ```bash python3 -m pip install --upgrade pip -# GPU installation -python3 -m pip install "paddlepaddle-gpu>=2.2" -i https://mirror.baidu.com/pypi/simple +# If you have cuda9 or cuda10 installed on your machine, please run the following command to install +python3 -m pip install "paddlepaddle-gpu" -i https://mirror.baidu.com/pypi/simple # CPU installation -python3 -m pip install "paddlepaddle>=2.2" -i https://mirror.baidu.com/pypi/simple - +python3 -m pip install "paddlepaddle" -i https://mirror.baidu.com/pypi/simple ```` -For more requirements, please refer to the instructions in [Installation Documentation](https://www.paddlepaddle.org.cn/install/quick). +For more requirements, please refer to the instructions in [Installation Documentation](https://www.paddlepaddle.org.cn/en/install/quick?docurl=/documentation/docs/en/install/pip/macos-pip_en.html). @@ -59,6 +63,8 @@ git clone https://gitee.com/paddlepaddle/PaddleOCR - **(2) Install recovery's `requirements`** +The layout restoration is exported as docx and PDF files, so python-docx and docx2pdf API need to be installed, and PyMuPDF api([requires Python >= 3.7](https://pypi.org/project/PyMuPDF/)) need to be installed to process the input files in pdf format. + ```bash python3 -m pip install -r ppstructure/recovery/requirements.txt ```` @@ -67,20 +73,87 @@ python3 -m pip install -r ppstructure/recovery/requirements.txt ## 3. Quick Start -```python +Through layout analysis, we divided the image/PDF documents into regions, located the key regions, such as text, table, picture, etc., and recorded the location, category, and regional pixel value information of each region. Different regions are processed separately, where: + +- OCR detection and recognition is performed in the text area, and the coordinates of the OCR detection box and the text content information are added on the basis of the previous information + +- The table area identifies tables and records html and text information of tables +- Save the image directly + +We can restore the test picture through the layout information, OCR detection and recognition structure, table information, and saved pictures. + +The whl package is also provided for quick use, see [quickstart](../docs/quickstart_en.md) for details. 
+ + + +### 3.1 Download models + +If input is English document, download English models: + +```bash cd PaddleOCR/ppstructure # download model mkdir inference && cd inference # Download the detection model of the ultra-lightweight English PP-OCRv3 model and unzip it -wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar && tar xf ch_PP-OCRv3_det_infer.tar +https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar && tar xf en_PP-OCRv3_det_infer.tar # Download the recognition model of the ultra-lightweight English PP-OCRv3 model and unzip it -wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar && tar xf ch_PP-OCRv3_rec_infer.tar +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar && tar xf en_PP-OCRv3_rec_infer.tar # Download the ultra-lightweight English table inch model and unzip it -wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar && tar xf en_ppocr_mobile_v2.0_table_structure_infer.tar +wget https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/en_ppstructure_mobile_v2.0_SLANet_infer.tar +tar xf en_ppstructure_mobile_v2.0_SLANet_infer.tar +# Download the layout model of publaynet dataset and unzip it +wget https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_infer.tar +tar xf picodet_lcnet_x1_0_fgd_layout_infer.tar cd .. -# run -python3 predict_system.py --det_model_dir=inference/en_PP-OCRv3_det_infer --rec_model_dir=inference/en_PP-OCRv3_rec_infer --table_model_dir=inference/en_ppocr_mobile_v2.0_table_structure_infer --rec_char_dict_path=../ppocr/utils/en_dict.txt --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt --output ./output/table --rec_image_shape=3,48,320 --vis_font_path=../doc/fonts/simfang.ttf --recovery=True --image_dir=./docs/table/1.png ``` +If input is Chinese document,download Chinese models: +[Chinese and English ultra-lightweight PP-OCRv3 model](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/README.md#pp-ocr-series-model-listupdate-on-september-8th)、[表格识别模型](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/ppstructure/docs/models_list.md#22-表格识别模型)、[版面分析模型](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/ppstructure/docs/models_list.md#1-版面分析模型) + + +### 3.2 Layout recovery + + +```bash +python3 predict_system.py \ + --image_dir=./docs/table/1.png \ + --det_model_dir=inference/en_PP-OCRv3_det_infer \ + --rec_model_dir=inference/en_PP-OCRv3_rec_infer \ + --rec_char_dict_path=../ppocr/utils/en_dict.txt \ + --table_model_dir=inference/en_ppstructure_mobile_v2.0_SLANet_infer \ + --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt \ + --layout_model_dir=inference/picodet_lcnet_x1_0_fgd_layout_infer \ + --layout_dict_path=../ppocr/utils/dict/layout_dict/layout_publaynet_dict.txt \ + --vis_font_path=../doc/fonts/simfang.ttf \ + --recovery=True \ + --save_pdf=False \ + --output=../output/ +``` + +After running, the docx of each picture will be saved in the directory specified by the output field + +Field: + +- image_dir:test file测试文件, can be picture, picture directory, pdf file, pdf file directory +- det_model_dir:OCR detection model path +- rec_model_dir:OCR recognition model path +- rec_char_dict_path:OCR recognition dict path. If the Chinese model is used, change to "../ppocr/utils/ppocr_keys_v1.txt". 
And if you trained the model on your own dataset, change to the trained dictionary +- table_model_dir:tabel recognition model path +- table_char_dict_path:tabel recognition dict path. If the Chinese model is used, no need to change +- layout_model_dir:layout analysis model path +- layout_dict_path:layout analysis dict path. If the Chinese model is used, change to "../ppocr/utils/dict/layout_dict/layout_cdla_dict.txt" +- recovery:whether to enable layout of recovery, default False +- save_pdf:when recovery file, whether to save pdf file, default False +- output:save the recovery result path + + + +## 4. More + +For training, evaluation and inference tutorial for text detection models, please refer to [text detection doc](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/doc/doc_en/detection_en.md). + +For training, evaluation and inference tutorial for text recognition models, please refer to [text recognition doc](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/doc/doc_en/recognition_en.md). + +For training, evaluation and inference tutorial for layout analysis models, please refer to [layout analysis doc](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/ppstructure/layout/README.md) -After running, the docx of each picture will be saved in the directory specified by the output field \ No newline at end of file +For training, evaluation and inference tutorial for table recognition models, please refer to [table recognition doc](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/ppstructure/table/README.md) diff --git a/ppstructure/recovery/README_ch.md b/ppstructure/recovery/README_ch.md index 5a05abffd0399387bc0d22d878e64d03d8894a79..fd2e649024ec88e2ea5c88536ccac2e259538886 100644 --- a/ppstructure/recovery/README_ch.md +++ b/ppstructure/recovery/README_ch.md @@ -1,25 +1,30 @@ [English](README.md) | 简体中文 -# 版面恢复使用说明 +# 版面恢复 - [1. 简介](#1) - [2. 安装](#2) - - [2.1 安装依赖](#2.1) + - [2.1 安装PaddlePaddle](#2.1) - [2.2 安装PaddleOCR](#2.2) - - [3. 使用](#3) + - [3.1 下载模型](#3.1) + - [3.2 版面恢复](#3.2) +- [4. 更多](#4) -## 1. 简介 +## 1. 简介 版面恢复就是在OCR识别后,内容仍然像原文档图片那样排列着,段落不变、顺序不变的输出到word文档中等。 -版面恢复结合了[版面分析](../layout/README_ch.md)、[表格识别](../table/README_ch.md)技术,从而更好地恢复图片、表格、标题等内容,下图展示了版面恢复的结果: +版面恢复结合了[版面分析](../layout/README_ch.md)、[表格识别](../table/README_ch.md)技术,从而更好地恢复图片、表格、标题等内容,支持中、英文pdf文档、文档图片格式的输入文件,下图分别展示了英文文档和中文文档版面恢复的效果:
- + +
+
+
@@ -27,29 +32,21 @@ -### 2.1 安装依赖 - -- **(1) 安装PaddlePaddle** +### 2.1 安装PaddlePaddle ```bash python3 -m pip install --upgrade pip -# GPU安装 -python3 -m pip install "paddlepaddle-gpu>=2.2" -i https://mirror.baidu.com/pypi/simple +# 您的机器安装的是CUDA9或CUDA10,请运行以下命令安装 +python3 -m pip install "paddlepaddle-gpu" -i https://mirror.baidu.com/pypi/simple -# CPU安装 -python3 -m pip install "paddlepaddle>=2.2" -i https://mirror.baidu.com/pypi/simple +# 您的机器是CPU,请运行以下命令安装 +python3 -m pip install "paddlepaddle" -i https://mirror.baidu.com/pypi/simple ``` 更多需求,请参照[安装文档](https://www.paddlepaddle.org.cn/install/quick)中的说明进行操作。 -* **(2)安装依赖** - -```bash -python3 -m pip install -r ppstructure/recovery/requirements.txt -``` - ### 2.2 安装PaddleOCR @@ -67,6 +64,8 @@ git clone https://gitee.com/paddlepaddle/PaddleOCR - **(2)安装recovery的`requirements`** +版面恢复导出为docx、pdf文件,所以需要安装python-docx、docx2pdf API,同时处理pdf格式的输入文件,需要安装PyMuPDF API([要求Python >= 3.7](https://pypi.org/project/PyMuPDF/))。 + ```bash python3 -m pip install -r ppstructure/recovery/requirements.txt ``` @@ -75,23 +74,91 @@ python3 -m pip install -r ppstructure/recovery/requirements.txt ## 3. 使用 -恢复给定文档的版面: +我们通过版面分析对图片/pdf形式的文档进行区域划分,定位其中的关键区域,如文字、表格、图片等,记录每个区域的位置、类别、区域像素值信息。对不同的区域分别处理,其中: + +- 文字区域直接进行OCR检测和识别,在之前信息基础上增加OCR检测框坐标和文本内容信息 + +- 表格区域进行表格识别,记录表格html和文字信息 +- 图片直接保存 + +我们通过版面信息、OCR检测和识别结构、表格信息、保存的图片,对测试图片进行恢复即可。 + +提供如下代码实现版面恢复,也提供了whl包的形式方便快速使用,详见 [quickstart](../docs/quickstart.md)。 + + + +### 3.1 下载模型 + +如果输入为英文文档类型,下载OCR检测和识别、版面分析、表格识别的英文模型 -```python +```bash cd PaddleOCR/ppstructure # 下载模型 mkdir inference && cd inference -# 下载超英文轻量级PP-OCRv3模型的检测模型并解压 -wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar && tar xf ch_PP-OCRv3_det_infer.tar -# 下载英文轻量级PP-OCRv3模型的识别模型并解压 -wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar && tar xf ch_PP-OCRv3_rec_infer.tar -# 下载超轻量级英文表格英寸模型并解压 -wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar && tar xf en_ppocr_mobile_v2.0_table_structure_infer.tar +# 下载英文超轻量PP-OCRv3检测模型并解压 +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar && tar xf en_PP-OCRv3_det_infer.tar +# 下载英文超轻量PP-OCRv3识别模型并解压 +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar && tar xf en_PP-OCRv3_rec_infer.tar +# 下载英文表格识别模型并解压 +wget https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/en_ppstructure_mobile_v2.0_SLANet_infer.tar +tar xf en_ppstructure_mobile_v2.0_SLANet_infer.tar +# 下载英文版面分析模型 +wget https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_infer.tar +tar xf picodet_lcnet_x1_0_fgd_layout_infer.tar cd .. 
-# 执行预测 -python3 predict_system.py --det_model_dir=inference/en_PP-OCRv3_det_infer --rec_model_dir=inference/en_PP-OCRv3_rec_infer --table_model_dir=inference/en_ppocr_mobile_v2.0_table_structure_infer --rec_char_dict_path=../ppocr/utils/en_dict.txt --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt --output ./output/table --rec_image_shape=3,48,320 --vis_font_path=../doc/fonts/simfang.ttf --recovery=True --image_dir=./docs/table/1.png ``` -运行完成后,每张图片的docx文档会保存到output字段指定的目录下 +如果输入为中文文档类型,在下述链接中下载中文模型即可: + +[PP-OCRv3中英文超轻量文本检测和识别模型](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/README_ch.md#pp-ocr%E7%B3%BB%E5%88%97%E6%A8%A1%E5%9E%8B%E5%88%97%E8%A1%A8%E6%9B%B4%E6%96%B0%E4%B8%AD)、[表格识别模型](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/ppstructure/docs/models_list.md#22-表格识别模型)、[版面分析模型](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/ppstructure/docs/models_list.md#1-版面分析模型) + + + +### 3.2 版面恢复 + +使用下载的模型恢复给定文档的版面,以英文模型为例,执行如下命令: + +```bash +python3 predict_system.py \ + --image_dir=./docs/table/1.png \ + --det_model_dir=inference/en_PP-OCRv3_det_infer \ + --rec_model_dir=inference/en_PP-OCRv3_rec_infer \ + --rec_char_dict_path=../ppocr/utils/en_dict.txt \ + --table_model_dir=inference/en_ppstructure_mobile_v2.0_SLANet_infer \ + --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt \ + --layout_model_dir=inference/picodet_lcnet_x1_0_fgd_layout_infer \ + --layout_dict_path=../ppocr/utils/dict/layout_dict/layout_publaynet_dict.txt \ + --vis_font_path=../doc/fonts/simfang.ttf \ + --recovery=True \ + --save_pdf=False \ + --output=../output/ +``` + +运行完成后,恢复版面的docx文档会保存到`output`字段指定的目录下 + +字段含义: + +- image_dir:测试文件,可以是图片、图片目录、pdf文件、pdf文件目录 +- det_model_dir:OCR检测模型路径 +- rec_model_dir:OCR识别模型路径 +- rec_char_dict_path:OCR识别字典,如果更换为中文模型,需要更改为"../ppocr/utils/ppocr_keys_v1.txt",如果您在自己的数据集上训练的模型,则更改为训练的字典的文件 +- table_model_dir:表格识别模型路径 +- table_char_dict_path:表格识别字典,如果更换为中文模型,不需要更换字典 +- layout_model_dir:版面分析模型路径 +- layout_dict_path:版面分析字典,如果更换为中文模型,需要更改为"../ppocr/utils/dict/layout_dict/layout_cdla_dict.txt" +- recovery:是否进行版面恢复,默认False +- save_pdf:进行版面恢复导出docx文档的同时,是否保存为pdf文件,默认为False +- output:版面恢复结果保存路径 + + + +## 4. 更多 + +关于OCR检测模型的训练评估与推理,请参考:[文本检测教程](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/doc/doc_ch/detection.md) + +关于OCR识别模型的训练评估与推理,请参考:[文本识别教程](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/doc/doc_ch/recognition.md) + +关于版面分析模型的训练评估与推理,请参考:[版面分析教程](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/ppstructure/layout/README_ch.md) +关于表格识别模型的训练评估与推理,请参考:[表格识别教程](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/ppstructure/table/README_ch.md) diff --git a/ppstructure/recovery/__init__.py b/ppstructure/recovery/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1d11e265597c7c8e39098a228108da3bb954b892 --- /dev/null +++ b/ppstructure/recovery/__init__.py @@ -0,0 +1,13 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
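> The `recovery_to_doc.py` diff below changes the calling convention of `convert_info_docx`: reading-order sorting via `sorted_layout_boxes` is now done by the caller, tables are rendered with the new `HtmlToDocx` parser instead of pypandoc, and a `save_pdf` flag optionally converts the docx with docx2pdf. A minimal sketch of the new convention, using a synthetic single-region result whose fields mirror what `StructureSystem` now returns (including `img_idx`); it assumes the repo root is on `PYTHONPATH`:

```python
import os
import numpy as np

from ppstructure.recovery.recovery_to_doc import sorted_layout_boxes, convert_info_docx

# one synthetic text region on a blank page, shaped like a StructureSystem result
img = np.full((800, 600, 3), 255, dtype=np.uint8)
res = [{
    'type': 'text',
    'bbox': [50, 50, 550, 120],
    'img_idx': 0,
    'res': [{'text': 'hello layout recovery', 'confidence': 0.99,
             'text_region': [[50, 50], [550, 50], [550, 120], [50, 120]]}],
}]

os.makedirs('./output', exist_ok=True)
h, w, _ = img.shape
res = sorted_layout_boxes(res, w)        # reading-order sort now happens outside convert_info_docx
convert_info_docx(img, res, './output', 'demo_page',  # writes ./output/demo_page.docx
                  save_pdf=False)        # True would additionally run docx2pdf on the result
```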
diff --git a/ppstructure/recovery/recovery_to_doc.py b/ppstructure/recovery/recovery_to_doc.py index 5278217d5b983008d357b6b1be3ab1b883a4939d..73b497d49d0961b253738eddad49c88c12c13601 100644 --- a/ppstructure/recovery/recovery_to_doc.py +++ b/ppstructure/recovery/recovery_to_doc.py @@ -12,9 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import cv2 import os -import pypandoc from copy import deepcopy from docx import Document @@ -22,21 +20,23 @@ from docx import shared from docx.enum.text import WD_ALIGN_PARAGRAPH from docx.enum.section import WD_SECTION from docx.oxml.ns import qn +from docx.enum.table import WD_TABLE_ALIGNMENT + +from ppstructure.recovery.table_process import HtmlToDocx from ppocr.utils.logging import get_logger logger = get_logger() -def convert_info_docx(img, res, save_folder, img_name): +def convert_info_docx(img, res, save_folder, img_name, save_pdf=False): doc = Document() doc.styles['Normal'].font.name = 'Times New Roman' doc.styles['Normal']._element.rPr.rFonts.set(qn('w:eastAsia'), u'宋体') doc.styles['Normal'].font.size = shared.Pt(6.5) - h, w, _ = img.shape - res = sorted_layout_boxes(res, w) flag = 1 for i, region in enumerate(res): + img_idx = region['img_idx'] if flag == 2 and region['layout'] == 'single': section = doc.add_section(WD_SECTION.CONTINUOUS) section._sectPr.xpath('./w:cols')[0].set(qn('w:num'), '1') @@ -46,10 +46,10 @@ def convert_info_docx(img, res, save_folder, img_name): section._sectPr.xpath('./w:cols')[0].set(qn('w:num'), '2') flag = 2 - if region['type'] == 'Figure': + if region['type'].lower() == 'figure': excel_save_folder = os.path.join(save_folder, img_name) img_path = os.path.join(excel_save_folder, - '{}.jpg'.format(region['bbox'])) + '{}_{}.jpg'.format(region['bbox'], img_idx)) paragraph_pic = doc.add_paragraph() paragraph_pic.alignment = WD_ALIGN_PARAGRAPH.CENTER run = paragraph_pic.add_run("") @@ -57,40 +57,38 @@ def convert_info_docx(img, res, save_folder, img_name): run.add_picture(img_path, width=shared.Inches(5)) elif flag == 2: run.add_picture(img_path, width=shared.Inches(2)) - elif region['type'] == 'Title': + elif region['type'].lower() == 'title': doc.add_heading(region['res'][0]['text']) - elif region['type'] == 'Text': + elif region['type'].lower() == 'table': + paragraph = doc.add_paragraph() + new_parser = HtmlToDocx() + new_parser.table_style = 'TableGrid' + table = new_parser.handle_table(html=region['res']['html']) + new_table = deepcopy(table) + new_table.alignment = WD_TABLE_ALIGNMENT.CENTER + paragraph.add_run().element.addnext(new_table._tbl) + + else: paragraph = doc.add_paragraph() paragraph_format = paragraph.paragraph_format for i, line in enumerate(region['res']): if i == 0: paragraph_format.first_line_indent = shared.Inches(0.25) text_run = paragraph.add_run(line['text'] + ' ') - text_run.font.size = shared.Pt(9) - elif region['type'] == 'Table': - pypandoc.convert( - source=region['res']['html'], - format='html', - to='docx', - outputfile='tmp.docx') - tmp_doc = Document('tmp.docx') - paragraph = doc.add_paragraph() - - table = tmp_doc.tables[0] - new_table = deepcopy(table) - new_table.style = doc.styles['Table Grid'] - from docx.enum.table import WD_TABLE_ALIGNMENT - new_table.alignment = WD_TABLE_ALIGNMENT.CENTER - paragraph.add_run().element.addnext(new_table._tbl) - os.remove('tmp.docx') - else: - continue + text_run.font.size = shared.Pt(10) # save to docx docx_path = os.path.join(save_folder, '{}.docx'.format(img_name)) 
doc.save(docx_path) logger.info('docx save to {}'.format(docx_path)) + # save to pdf + if save_pdf: + pdf_path = os.path.join(save_folder, '{}.pdf'.format(img_name)) + from docx2pdf import convert + convert(docx_path, pdf_path) + logger.info('pdf save to {}'.format(pdf_path)) + def sorted_layout_boxes(res, w): """ @@ -112,7 +110,7 @@ def sorted_layout_boxes(res, w): res_left = [] res_right = [] i = 0 - + while True: if i >= num_boxes: break @@ -137,7 +135,7 @@ def sorted_layout_boxes(res, w): res_left = [] res_right = [] break - elif _boxes[i]['bbox'][0] < w / 4 and _boxes[i]['bbox'][2] < 3*w / 4: + elif _boxes[i]['bbox'][0] < w / 4 and _boxes[i]['bbox'][2] < 3 * w / 4: _boxes[i]['layout'] = 'double' res_left.append(_boxes[i]) i += 1 @@ -157,4 +155,4 @@ def sorted_layout_boxes(res, w): new_res += res_left if res_right: new_res += res_right - return new_res \ No newline at end of file + return new_res diff --git a/ppstructure/recovery/requirements.txt b/ppstructure/recovery/requirements.txt index 04187baa2a72d2ac60f0a4e5ce643f882b7255fb..25e8cdbb0d58b0a243b176f563c66717d6f4c112 100644 --- a/ppstructure/recovery/requirements.txt +++ b/ppstructure/recovery/requirements.txt @@ -1,3 +1,4 @@ -opencv-contrib-python==4.4.0.46 -pypandoc -python-docx \ No newline at end of file +python-docx +docx2pdf +PyMuPDF +beautifulsoup4 \ No newline at end of file diff --git a/ppstructure/recovery/table_process.py b/ppstructure/recovery/table_process.py new file mode 100644 index 0000000000000000000000000000000000000000..243aaf8933791bf4704964d9665173fe70982f95 --- /dev/null +++ b/ppstructure/recovery/table_process.py @@ -0,0 +1,632 @@ + +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from:https://github.com/pqzx/html2docx/blob/8f6695a778c68befb302e48ac0ed5201ddbd4524/htmldocx/h2d.py + +""" +import re, argparse +import io, os +import urllib.request +from urllib.parse import urlparse +from html.parser import HTMLParser + +import docx, docx.table +from docx import Document +from docx.shared import RGBColor, Pt, Inches +from docx.enum.text import WD_COLOR, WD_ALIGN_PARAGRAPH +from docx.oxml import OxmlElement +from docx.oxml.ns import qn + +from bs4 import BeautifulSoup + +# values in inches +INDENT = 0.25 +LIST_INDENT = 0.5 +MAX_INDENT = 5.5 # To stop indents going off the page + +# Style to use with tables. By default no style is used. +DEFAULT_TABLE_STYLE = None + +# Style to use with paragraphs. By default no style is used. +DEFAULT_PARAGRAPH_STYLE = None + + +def get_filename_from_url(url): + return os.path.basename(urlparse(url).path) + +def is_url(url): + """ + Not to be used for actually validating a url, but in our use case we only + care if it's a url or a file path, and they're pretty distinguishable + """ + parts = urlparse(url) + return all([parts.scheme, parts.netloc, parts.path]) + +def fetch_image(url): + """ + Attempts to fetch an image from a url. 
+ If successful returns a bytes object, else returns None + :return: + """ + try: + with urllib.request.urlopen(url) as response: + # security flaw? + return io.BytesIO(response.read()) + except urllib.error.URLError: + return None + +def remove_last_occurence(ls, x): + ls.pop(len(ls) - ls[::-1].index(x) - 1) + +def remove_whitespace(string, leading=False, trailing=False): + """Remove white space from a string. + Args: + string(str): The string to remove white space from. + leading(bool, optional): Remove leading new lines when True. + trailing(bool, optional): Remove trailing new lines when False. + Returns: + str: The input string with new line characters removed and white space squashed. + Examples: + Single or multiple new line characters are replaced with space. + >>> remove_whitespace("abc\\ndef") + 'abc def' + >>> remove_whitespace("abc\\n\\n\\ndef") + 'abc def' + New line characters surrounded by white space are replaced with a single space. + >>> remove_whitespace("abc \\n \\n \\n def") + 'abc def' + >>> remove_whitespace("abc \\n \\n \\n def") + 'abc def' + Leading and trailing new lines are replaced with a single space. + >>> remove_whitespace("\\nabc") + ' abc' + >>> remove_whitespace(" \\n abc") + ' abc' + >>> remove_whitespace("abc\\n") + 'abc ' + >>> remove_whitespace("abc \\n ") + 'abc ' + Use ``leading=True`` to remove leading new line characters, including any surrounding + white space: + >>> remove_whitespace("\\nabc", leading=True) + 'abc' + >>> remove_whitespace(" \\n abc", leading=True) + 'abc' + Use ``trailing=True`` to remove trailing new line characters, including any surrounding + white space: + >>> remove_whitespace("abc \\n ", trailing=True) + 'abc' + """ + # Remove any leading new line characters along with any surrounding white space + if leading: + string = re.sub(r'^\s*\n+\s*', '', string) + + # Remove any trailing new line characters along with any surrounding white space + if trailing: + string = re.sub(r'\s*\n+\s*$', '', string) + + # Replace new line characters and absorb any surrounding space. + string = re.sub(r'\s*\n\s*', ' ', string) + # TODO need some way to get rid of extra spaces in e.g. text text + return re.sub(r'\s+', ' ', string) + +def delete_paragraph(paragraph): + # https://github.com/python-openxml/python-docx/issues/33#issuecomment-77661907 + p = paragraph._element + p.getparent().remove(p) + p._p = p._element = None + +font_styles = { + 'b': 'bold', + 'strong': 'bold', + 'em': 'italic', + 'i': 'italic', + 'u': 'underline', + 's': 'strike', + 'sup': 'superscript', + 'sub': 'subscript', + 'th': 'bold', +} + +font_names = { + 'code': 'Courier', + 'pre': 'Courier', +} + +styles = { + 'LIST_BULLET': 'List Bullet', + 'LIST_NUMBER': 'List Number', +} + +class HtmlToDocx(HTMLParser): + + def __init__(self): + super().__init__() + self.options = { + 'fix-html': True, + 'images': True, + 'tables': True, + 'styles': True, + } + self.table_row_selectors = [ + 'table > tr', + 'table > thead > tr', + 'table > tbody > tr', + 'table > tfoot > tr' + ] + self.table_style = DEFAULT_TABLE_STYLE + self.paragraph_style = DEFAULT_PARAGRAPH_STYLE + + def set_initial_attrs(self, document=None): + self.tags = { + 'span': [], + 'list': [], + } + if document: + self.doc = document + else: + self.doc = Document() + self.bs = self.options['fix-html'] # whether or not to clean with BeautifulSoup + self.document = self.doc + self.include_tables = True #TODO add this option back in? 
+ self.include_images = self.options['images'] + self.include_styles = self.options['styles'] + self.paragraph = None + self.skip = False + self.skip_tag = None + self.instances_to_skip = 0 + + def copy_settings_from(self, other): + """Copy settings from another instance of HtmlToDocx""" + self.table_style = other.table_style + self.paragraph_style = other.paragraph_style + + def get_cell_html(self, soup): + # Returns string of td element with opening and closing tags removed + # Cannot use find_all as it only finds element tags and does not find text which + # is not inside an element + return ' '.join([str(i) for i in soup.contents]) + + def add_styles_to_paragraph(self, style): + if 'text-align' in style: + align = style['text-align'] + if align == 'center': + self.paragraph.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.CENTER + elif align == 'right': + self.paragraph.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.RIGHT + elif align == 'justify': + self.paragraph.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY + if 'margin-left' in style: + margin = style['margin-left'] + units = re.sub(r'[0-9]+', '', margin) + margin = int(float(re.sub(r'[a-z]+', '', margin))) + if units == 'px': + self.paragraph.paragraph_format.left_indent = Inches(min(margin // 10 * INDENT, MAX_INDENT)) + # TODO handle non px units + + def add_styles_to_run(self, style): + if 'color' in style: + if 'rgb' in style['color']: + color = re.sub(r'[a-z()]+', '', style['color']) + colors = [int(x) for x in color.split(',')] + elif '#' in style['color']: + color = style['color'].lstrip('#') + colors = tuple(int(color[i:i+2], 16) for i in (0, 2, 4)) + else: + colors = [0, 0, 0] + # TODO map colors to named colors (and extended colors...) + # For now set color to black to prevent crashing + self.run.font.color.rgb = RGBColor(*colors) + + if 'background-color' in style: + if 'rgb' in style['background-color']: + color = color = re.sub(r'[a-z()]+', '', style['background-color']) + colors = [int(x) for x in color.split(',')] + elif '#' in style['background-color']: + color = style['background-color'].lstrip('#') + colors = tuple(int(color[i:i+2], 16) for i in (0, 2, 4)) + else: + colors = [0, 0, 0] + # TODO map colors to named colors (and extended colors...) + # For now set color to black to prevent crashing + self.run.font.highlight_color = WD_COLOR.GRAY_25 #TODO: map colors + + def apply_paragraph_style(self, style=None): + try: + if style: + self.paragraph.style = style + elif self.paragraph_style: + self.paragraph.style = self.paragraph_style + except KeyError as e: + raise ValueError(f"Unable to apply style {self.paragraph_style}.") from e + + def parse_dict_string(self, string, separator=';'): + new_string = string.replace(" ", '').split(separator) + string_dict = dict([x.split(':') for x in new_string if ':' in x]) + return string_dict + + def handle_li(self): + # check list stack to determine style and depth + list_depth = len(self.tags['list']) + if list_depth: + list_type = self.tags['list'][-1] + else: + list_type = 'ul' # assign unordered if no tag + + if list_type == 'ol': + list_style = styles['LIST_NUMBER'] + else: + list_style = styles['LIST_BULLET'] + + self.paragraph = self.doc.add_paragraph(style=list_style) + self.paragraph.paragraph_format.left_indent = Inches(min(list_depth * LIST_INDENT, MAX_INDENT)) + self.paragraph.paragraph_format.line_spacing = 1 + + def add_image_to_cell(self, cell, image): + # python-docx doesn't have method yet for adding images to table cells. 
For now we use this + paragraph = cell.add_paragraph() + run = paragraph.add_run() + run.add_picture(image) + + def handle_img(self, current_attrs): + if not self.include_images: + self.skip = True + self.skip_tag = 'img' + return + src = current_attrs['src'] + # fetch image + src_is_url = is_url(src) + if src_is_url: + try: + image = fetch_image(src) + except urllib.error.URLError: + image = None + else: + image = src + # add image to doc + if image: + try: + if isinstance(self.doc, docx.document.Document): + self.doc.add_picture(image) + else: + self.add_image_to_cell(self.doc, image) + except FileNotFoundError: + image = None + if not image: + if src_is_url: + self.doc.add_paragraph("" % src) + else: + # avoid exposing filepaths in document + self.doc.add_paragraph("" % get_filename_from_url(src)) + + + def handle_table(self, html): + """ + To handle nested tables, we will parse tables manually as follows: + Get table soup + Create docx table + Iterate over soup and fill docx table with new instances of this parser + Tell HTMLParser to ignore any tags until the corresponding closing table tag + """ + doc = Document() + table_soup = BeautifulSoup(html, 'html.parser') + rows, cols_len = self.get_table_dimensions(table_soup) + table = doc.add_table(len(rows), cols_len) + table.style = doc.styles['Table Grid'] + cell_row = 0 + for index, row in enumerate(rows): + cols = self.get_table_columns(row) + cell_col = 0 + for col in cols: + colspan = int(col.attrs.get('colspan', 1)) + rowspan = int(col.attrs.get('rowspan', 1)) + + cell_html = self.get_cell_html(col) + + if col.name == 'th': + cell_html = "%s" % cell_html + docx_cell = table.cell(cell_row, cell_col) + while docx_cell.text != '': # Skip the merged cell + cell_col += 1 + docx_cell = table.cell(cell_row, cell_col) + + cell_to_merge = table.cell(cell_row + rowspan - 1, cell_col + colspan - 1) + if docx_cell != cell_to_merge: + docx_cell.merge(cell_to_merge) + + child_parser = HtmlToDocx() + child_parser.copy_settings_from(self) + + child_parser.add_html_to_cell(cell_html or ' ', docx_cell) # occupy the position + + cell_col += colspan + cell_row += 1 + + # skip all tags until corresponding closing tag + self.instances_to_skip = len(table_soup.find_all('table')) + self.skip_tag = 'table' + self.skip = True + self.table = None + return table + + def handle_link(self, href, text): + # Link requires a relationship + is_external = href.startswith('http') + rel_id = self.paragraph.part.relate_to( + href, + docx.opc.constants.RELATIONSHIP_TYPE.HYPERLINK, + is_external=True # don't support anchor links for this library yet + ) + + # Create the w:hyperlink tag and add needed values + hyperlink = docx.oxml.shared.OxmlElement('w:hyperlink') + hyperlink.set(docx.oxml.shared.qn('r:id'), rel_id) + + + # Create sub-run + subrun = self.paragraph.add_run() + rPr = docx.oxml.shared.OxmlElement('w:rPr') + + # add default color + c = docx.oxml.shared.OxmlElement('w:color') + c.set(docx.oxml.shared.qn('w:val'), "0000EE") + rPr.append(c) + + # add underline + u = docx.oxml.shared.OxmlElement('w:u') + u.set(docx.oxml.shared.qn('w:val'), 'single') + rPr.append(u) + + subrun._r.append(rPr) + subrun._r.text = text + + # Add subrun to hyperlink + hyperlink.append(subrun._r) + + # Add hyperlink to run + self.paragraph._p.append(hyperlink) + + def handle_starttag(self, tag, attrs): + if self.skip: + return + if tag == 'head': + self.skip = True + self.skip_tag = tag + self.instances_to_skip = 0 + return + elif tag == 'body': + return + + current_attrs = dict(attrs) 
+ + if tag == 'span': + self.tags['span'].append(current_attrs) + return + elif tag == 'ol' or tag == 'ul': + self.tags['list'].append(tag) + return # don't apply styles for now + elif tag == 'br': + self.run.add_break() + return + + self.tags[tag] = current_attrs + if tag in ['p', 'pre']: + self.paragraph = self.doc.add_paragraph() + self.apply_paragraph_style() + + elif tag == 'li': + self.handle_li() + + elif tag == "hr": + + # This implementation was taken from: + # https://github.com/python-openxml/python-docx/issues/105#issuecomment-62806373 + + self.paragraph = self.doc.add_paragraph() + pPr = self.paragraph._p.get_or_add_pPr() + pBdr = OxmlElement('w:pBdr') + pPr.insert_element_before(pBdr, + 'w:shd', 'w:tabs', 'w:suppressAutoHyphens', 'w:kinsoku', 'w:wordWrap', + 'w:overflowPunct', 'w:topLinePunct', 'w:autoSpaceDE', 'w:autoSpaceDN', + 'w:bidi', 'w:adjustRightInd', 'w:snapToGrid', 'w:spacing', 'w:ind', + 'w:contextualSpacing', 'w:mirrorIndents', 'w:suppressOverlap', 'w:jc', + 'w:textDirection', 'w:textAlignment', 'w:textboxTightWrap', + 'w:outlineLvl', 'w:divId', 'w:cnfStyle', 'w:rPr', 'w:sectPr', + 'w:pPrChange' + ) + bottom = OxmlElement('w:bottom') + bottom.set(qn('w:val'), 'single') + bottom.set(qn('w:sz'), '6') + bottom.set(qn('w:space'), '1') + bottom.set(qn('w:color'), 'auto') + pBdr.append(bottom) + + elif re.match('h[1-9]', tag): + if isinstance(self.doc, docx.document.Document): + h_size = int(tag[1]) + self.paragraph = self.doc.add_heading(level=min(h_size, 9)) + else: + self.paragraph = self.doc.add_paragraph() + + elif tag == 'img': + self.handle_img(current_attrs) + return + + elif tag == 'table': + self.handle_table() + return + + # set new run reference point in case of leading line breaks + if tag in ['p', 'li', 'pre']: + self.run = self.paragraph.add_run() + + # add style + if not self.include_styles: + return + if 'style' in current_attrs and self.paragraph: + style = self.parse_dict_string(current_attrs['style']) + self.add_styles_to_paragraph(style) + + def handle_endtag(self, tag): + if self.skip: + if not tag == self.skip_tag: + return + + if self.instances_to_skip > 0: + self.instances_to_skip -= 1 + return + + self.skip = False + self.skip_tag = None + self.paragraph = None + + if tag == 'span': + if self.tags['span']: + self.tags['span'].pop() + return + elif tag == 'ol' or tag == 'ul': + remove_last_occurence(self.tags['list'], tag) + return + elif tag == 'table': + self.table_no += 1 + self.table = None + self.doc = self.document + self.paragraph = None + + if tag in self.tags: + self.tags.pop(tag) + # maybe set relevant reference to None? + + def handle_data(self, data): + if self.skip: + return + + # Only remove white space if we're not in a pre block. 
+ if 'pre' not in self.tags: + # remove leading and trailing whitespace in all instances + data = remove_whitespace(data, True, True) + + if not self.paragraph: + self.paragraph = self.doc.add_paragraph() + self.apply_paragraph_style() + + # There can only be one nested link in a valid html document + # You cannot have interactive content in an A tag, this includes links + # https://html.spec.whatwg.org/#interactive-content + link = self.tags.get('a') + if link: + self.handle_link(link['href'], data) + else: + # If there's a link, dont put the data directly in the run + self.run = self.paragraph.add_run(data) + spans = self.tags['span'] + for span in spans: + if 'style' in span: + style = self.parse_dict_string(span['style']) + self.add_styles_to_run(style) + + # add font style and name + for tag in self.tags: + if tag in font_styles: + font_style = font_styles[tag] + setattr(self.run.font, font_style, True) + + if tag in font_names: + font_name = font_names[tag] + self.run.font.name = font_name + + def ignore_nested_tables(self, tables_soup): + """ + Returns array containing only the highest level tables + Operates on the assumption that bs4 returns child elements immediately after + the parent element in `find_all`. If this changes in the future, this method will need to be updated + :return: + """ + new_tables = [] + nest = 0 + for table in tables_soup: + if nest: + nest -= 1 + continue + new_tables.append(table) + nest = len(table.find_all('table')) + return new_tables + + def get_table_rows(self, table_soup): + # If there's a header, body, footer or direct child tr tags, add row dimensions from there + return table_soup.select(', '.join(self.table_row_selectors), recursive=False) + + def get_table_columns(self, row): + # Get all columns for the specified row tag. + return row.find_all(['th', 'td'], recursive=False) if row else [] + + def get_table_dimensions(self, table_soup): + # Get rows for the table + rows = self.get_table_rows(table_soup) + # Table is either empty or has non-direct children between table and tr tags + # Thus the row dimensions and column dimensions are assumed to be 0 + + cols = self.get_table_columns(rows[0]) if rows else [] + # Add colspan calculation column number + col_count = 0 + for col in cols: + colspan = col.attrs.get('colspan', 1) + col_count += int(colspan) + + # return len(rows), col_count + return rows, col_count + + def get_tables(self): + if not hasattr(self, 'soup'): + self.include_tables = False + return + # find other way to do it, or require this dependency? 
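+ # keep only top-level tables here; nested tables are parsed recursively by handle_table through child HtmlToDocx instances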
+ self.tables = self.ignore_nested_tables(self.soup.find_all('table')) + self.table_no = 0 + + def run_process(self, html): + if self.bs and BeautifulSoup: + self.soup = BeautifulSoup(html, 'html.parser') + html = str(self.soup) + if self.include_tables: + self.get_tables() + self.feed(html) + + def add_html_to_document(self, html, document): + if not isinstance(html, str): + raise ValueError('First argument needs to be a %s' % str) + elif not isinstance(document, docx.document.Document) and not isinstance(document, docx.table._Cell): + raise ValueError('Second argument needs to be a %s' % docx.document.Document) + self.set_initial_attrs(document) + self.run_process(html) + + def add_html_to_cell(self, html, cell): + self.set_initial_attrs(cell) + self.run_process(html) + + def parse_html_file(self, filename_html, filename_docx=None): + with open(filename_html, 'r') as infile: + html = infile.read() + self.set_initial_attrs() + self.run_process(html) + if not filename_docx: + path, filename = os.path.split(filename_html) + filename_docx = '%s/new_docx_file_%s' % (path, filename) + self.doc.save('%s.docx' % filename_docx) + + def parse_html_string(self, html): + self.set_initial_attrs() + self.run_process(html) + return self.doc \ No newline at end of file diff --git a/ppstructure/table/README.md b/ppstructure/table/README.md index b6804c6f09b4ee3d17cd2b81e6cc6642c1c1be9a..08635516ba8301e6f98f175e5eba8c0a97b1708e 100644 --- a/ppstructure/table/README.md +++ b/ppstructure/table/README.md @@ -1,126 +1,158 @@ -- [Table Recognition](#table-recognition) - - [1. pipeline](#1-pipeline) - - [2. Performance](#2-performance) - - [3. How to use](#3-how-to-use) - - [3.1 quick start](#31-quick-start) - - [3.2 Train](#32-train) - - [3.3 Eval](#33-eval) - - [3.4 Inference](#34-inference) - +English | [简体中文](README_ch.md) # Table Recognition +- [1. pipeline](#1-pipeline) +- [2. Performance](#2-performance) +- [3. Result](#3-result) +- [4. How to use](#4-how-to-use) + - [4.1 Quick start](#41-quick-start) + - [4.2 Training, Evaluation and Inference](#42-training-evaluation-and-inference) + - [4.3 Calculate TEDS](#43-calculate-teds) +- [5. Reference](#5-reference) + + ## 1. pipeline The table recognition mainly contains three models 1. Single line text detection-DB 2. Single line text recognition-CRNN -3. Table structure and cell coordinate prediction-RARE +3. Table structure and cell coordinate prediction-SLANet The table recognition flow chart is as follows ![tableocr_pipeline](../docs/table/tableocr_pipeline_en.jpg) 1. The coordinates of single-line text is detected by DB model, and then sends it to the recognition model to get the recognition result. -2. The table structure and cell coordinates is predicted by RARE model. +2. The table structure and cell coordinates is predicted by SLANet model. 3. The recognition result of the cell is combined by the coordinates, recognition result of the single line and the coordinates of the cell. 4. The cell recognition result and the table structure together construct the html string of the table. ## 2. 
Performance We evaluated the algorithm on the PubTabNet[1] eval dataset, and the performance is as follows: +|Method|Acc|[TEDS(Tree-Edit-Distance-based Similarity)](https://github.com/ibm-aur-nlp/PubTabNet/tree/master/src)|Speed| +| --- | --- | --- | ---| +| EDD[2] |x| 88.3 |x| +| TableRec-RARE(ours) | 71.73%| 93.88% |779ms| +| SLANet(ours) | 76.31%| 95.89%|766ms| + +The performance indicators are explained as follows: +- Acc: The accuracy of the table structure in each image, a wrong token is considered an error. +- TEDS: The accuracy of the model's restoration of table information. This indicator evaluates not only the table structure, but also the text content in the table. +- Speed: The inference speed of a single image when the model runs on the CPU machine and MKL is enabled. + +## 3. Result + +![](../docs/imgs/table_ch_result1.jpg) +![](../docs/imgs/table_ch_result2.jpg) +![](../docs/imgs/table_ch_result3.jpg) + +## 4. How to use + +### 4.1 Quick start -|Method|[TEDS(Tree-Edit-Distance-based Similarity)](https://github.com/ibm-aur-nlp/PubTabNet/tree/master/src)| -| --- | --- | -| EDD[2] | 88.3 | -| Ours | 93.32 | +PP-Structure currently provides table recognition models in both Chinese and English. For the model link, see [models_list](../docs/models_list.md). The whl package is also provided for quick use, see [quickstart](../docs/quickstart_en.md) for details. -## 3. How to use +The following takes the Chinese table recognition model as an example to introduce how to recognize a table. -### 3.1 quick start +Use the following commands to quickly complete the identification of a table. ```python cd PaddleOCR/ppstructure # download model mkdir inference && cd inference -# Download the detection model of the ultra-lightweight table English OCR model and unzip it -wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_det_infer.tar && tar xf en_ppocr_mobile_v2.0_table_det_infer.tar -# Download the recognition model of the ultra-lightweight table English OCR model and unzip it -wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_rec_infer.tar && tar xf en_ppocr_mobile_v2.0_table_rec_infer.tar -# Download the ultra-lightweight English table inch model and unzip it -wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar && tar xf en_ppocr_mobile_v2.0_table_structure_infer.tar +# Download the PP-OCRv3 text detection model and unzip it +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar && tar xf ch_PP-OCRv3_det_infer.tar +# Download the PP-OCRv3 text recognition model and unzip it +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar && tar xf ch_PP-OCRv3_rec_infer.tar +# Download the PP-Structurev2 form recognition model and unzip it +wget https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_infer.tar && tar xf ch_ppstructure_mobile_v2.0_SLANet_infer.tar cd .. 
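+# the three model folders unpacked above are passed to --det_model_dir, --rec_model_dir and --table_model_dir below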
# run -python3 table/predict_table.py --det_model_dir=inference/en_ppocr_mobile_v2.0_table_det_infer --rec_model_dir=inference/en_ppocr_mobile_v2.0_table_rec_infer --table_model_dir=inference/en_ppocr_mobile_v2.0_table_structure_infer --image_dir=./docs/table/table.jpg --rec_char_dict_path=../ppocr/utils/dict/table_dict.txt --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt --det_limit_side_len=736 --det_limit_type=min --output ./output/table +python3.7 table/predict_table.py \ + --det_model_dir=inference/ch_PP-OCRv3_det_infer \ + --rec_model_dir=inference/ch_PP-OCRv3_rec_infer \ + --table_model_dir=inference/ch_ppstructure_mobile_v2.0_SLANet_infer \ + --rec_char_dict_path=../ppocr/utils/ppocr_keys_v1.txt \ + --table_char_dict_path=../ppocr/utils/dict/table_structure_dict_ch.txt \ + --image_dir=docs/table/table.jpg \ + --output=../output/table + ``` -Note: The above model is trained on the PubLayNet dataset and only supports English scanning scenarios. If you need to identify other scenarios, you need to train the model yourself and replace the three fields `det_model_dir`, `rec_model_dir`, `table_model_dir`. -After running, the excel sheet of each picture will be saved in the directory specified by the output field +After the operation is completed, the excel table of each image will be saved to the directory specified by the output field, and an html file will be produced in the directory to visually view the cell coordinates and the recognized table. -### 3.2 Train +**NOTE** +1. If you want to use the English table recognition model, you need to download the English text detection and recognition model and the English table recognition model in [models_list](../docs/models_list_en.md), and replace `table_structure_dict_ch.txt` with `table_structure_dict.txt`. +2. To use the TableRec-RARE model, you need to replace `table_structure_dict_ch.txt` with `table_structure_dict.txt`, and add parameter `--merge_no_span_structure=False` -In this chapter, we only introduce the training of the table structure model, For model training of [text detection](../../doc/doc_en/detection_en.md) and [text recognition](../../doc/doc_en/recognition_en.md), please refer to the corresponding documents +### 4.2 Training, Evaluation and Inference -* data preparation -The training data uses public data set [PubTabNet](https://arxiv.org/abs/1911.10683 ), Can be downloaded from the official [website](https://github.com/ibm-aur-nlp/PubTabNet) 。The PubTabNet data set contains about 500,000 images, as well as annotations in html format。 +The training, evaluation and inference process of the text detection model can be referred to [detection](../../doc/doc_en/detection_en.md) -* Start training -*If you are installing the cpu version of paddle, please modify the `use_gpu` field in the configuration file to false* -```shell -# single GPU training -python3 tools/train.py -c configs/table/table_mv3.yml -# multi-GPU training -# Set the GPU ID used by the '--gpus' parameter. -python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/table/table_mv3.yml -``` +The training, evaluation and inference process of the text recognition model can be referred to [recognition](../../doc/doc_en/recognition_en.md) -In the above instruction, use `-c` to select the training to use the `configs/table/table_mv3.yml` configuration file. -For a detailed explanation of the configuration file, please refer to [config](../../doc/doc_en/config_en.md). 
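+
+Besides the command line shown in the quick start above, the same table pipeline can be driven from Python through the whl package. The snippet below is a minimal sketch; it assumes the `PPStructure` and `save_structure_res` interfaces described in the quickstart linked above, and the image path and output folder are illustrative:
+
+```python
+import os
+import cv2
+from paddleocr import PPStructure, save_structure_res
+
+# skip layout analysis so the whole image is treated as a single table region
+table_engine = PPStructure(layout=False, show_log=True)
+
+img_path = 'docs/table/table.jpg'
+img = cv2.imread(img_path)
+result = table_engine(img)
+
+# writes the recognized table of each region to an .xlsx file under ./output
+save_structure_res(result, './output', os.path.basename(img_path).split('.')[0])
+```
+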
+The training, evaluation and inference process of the table recognition model can be referred to [table_recognition](../../doc/doc_en/table_recognition_en.md) -* load trained model and continue training +### 4.3 Calculate TEDS -If you expect to load trained model and continue the training again, you can specify the parameter `Global.checkpoints` as the model path to be loaded. - -```shell -python3 tools/train.py -c configs/table/table_mv3.yml -o Global.checkpoints=./your/trained/model +The table uses [TEDS(Tree-Edit-Distance-based Similarity)](https://github.com/ibm-aur-nlp/PubTabNet/tree/master/src) as the evaluation metric of the model. Before the model evaluation, the three models in the pipeline need to be exported as inference models (we have provided them), and the gt for evaluation needs to be prepared. Examples of gt are as follows: +```txt +PMC5755158_010_01.png
<html><body><table><thead><tr><td>Weaning</td><td>Week 15</td><td>Off-test</td></tr></thead><tbody><tr><td>Weaning</td></tr><tr><td>Week 15</td><td>0.17 ± 0.08</td><td>0.16 ± 0.03</td></tr><tr><td>Off-test</td><td>0.80 ± 0.24</td><td>0.19 ± 0.09</td></tr></tbody></table></body></html>
``` +Each line in gt consists of the file name and the html string of the table. The file name and the html string of the table are separated by `\t`. -**Note**: The priority of `Global.checkpoints` is higher than that of `Global.pretrain_weights`, that is, when two parameters are specified at the same time, the model specified by `Global.checkpoints` will be loaded first. If the model path specified by `Global.checkpoints` is wrong, the one specified by `Global.pretrain_weights` will be loaded. - -### 3.3 Eval - -The table uses [TEDS(Tree-Edit-Distance-based Similarity)](https://github.com/ibm-aur-nlp/PubTabNet/tree/master/src) as the evaluation metric of the model. Before the model evaluation, the three models in the pipeline need to be exported as inference models (we have provided them), and the gt for evaluation needs to be prepared. Examples of gt are as follows: -```json -{"PMC4289340_004_00.png": [ - ["", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "
", "", "", "
", "", "", "
", "", ""], - [[1, 4, 29, 13], [137, 4, 161, 13], [215, 4, 236, 13], [1, 17, 30, 27], [137, 17, 147, 27], [215, 17, 225, 27]], - [["", "F", "e", "a", "t", "u", "r", "e", ""], ["", "G", "b", "3", " ", "+", ""], ["", "G", "b", "3", " ", "-", ""], ["", "P", "a", "t", "i", "e", "n", "t", "s", ""], ["6", "2"], ["4", "5"]] -]} +You can also use the following command to generate an evaluation gt file from the annotation file: +```python +python3 ppstructure/table/convert_label2html.py --ori_gt_path /path/to/your_label_file --save_path /path/to/save_file ``` -In gt json, the key is the image name, the value is the corresponding gt, and gt is a list composed of four items, and each item is -1. HTML string list of table structure -2. The coordinates of each cell (not including the empty text in the cell) -3. The text information in each cell (not including the empty text in the cell) Use the following command to evaluate. After the evaluation is completed, the teds indicator will be output. ```python -cd PaddleOCR/ppstructure -python3 table/eval_table.py --det_model_dir=path/to/det_model_dir --rec_model_dir=path/to/rec_model_dir --table_model_dir=path/to/table_model_dir --image_dir=../doc/table/1.png --rec_char_dict_path=../ppocr/utils/dict/table_dict.txt --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt --det_limit_side_len=736 --det_limit_type=min --gt_path=path/to/gt.json +python3 table/eval_table.py \ + --det_model_dir=path/to/det_model_dir \ + --rec_model_dir=path/to/rec_model_dir \ + --table_model_dir=path/to/table_model_dir \ + --image_dir=../doc/table/1.png \ + --rec_char_dict_path=../ppocr/utils/dict/table_dict.txt \ + --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt \ + --det_limit_side_len=736 \ + --det_limit_type=min \ + --gt_path=path/to/gt.txt ``` -If the PubLatNet eval dataset is used, it will be output +Evaluate on the PubLatNet dataset using the English model + ```bash -teds: 93.32 -``` +cd PaddleOCR/ppstructure +# Download the model +mkdir inference && cd inference +# Download the text detection model trained on the PubTabNet dataset and unzip it +wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_det_infer.tar && tar xf en_ppocr_mobile_v2.0_table_det_infer.tar +# Download the text recognition model trained on the PubTabNet dataset and unzip it +wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_rec_infer.tar && tar xf en_ppocr_mobile_v2.0_table_rec_infer.tar +# Download the table recognition model trained on the PubTabNet dataset and unzip it +wget https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/en_ppstructure_mobile_v2.0_SLANet_infer.tar && tar xf en_ppstructure_mobile_v2.0_SLANet_infer.tar +cd .. 
-### 3.4 Inference +python3 table/eval_table.py \ + --det_model_dir=inference/en_ppocr_mobile_v2.0_table_det_infer \ + --rec_model_dir=inference/en_ppocr_mobile_v2.0_table_rec_infer \ + --table_model_dir=inference/en_ppstructure_mobile_v2.0_SLANet_infer \ + --image_dir=train_data/table/pubtabnet/val/ \ + --rec_char_dict_path=../ppocr/utils/dict/table_dict.txt \ + --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt \ + --det_limit_side_len=736 \ + --det_limit_type=min \ + --gt_path=path/to/gt.txt +``` -```python -cd PaddleOCR/ppstructure -python3 table/predict_table.py --det_model_dir=path/to/det_model_dir --rec_model_dir=path/to/rec_model_dir --table_model_dir=path/to/table_model_dir --image_dir=../doc/table/1.png --rec_char_dict_path=../ppocr/utils/dict/table_dict.txt --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt --det_limit_side_len=736 --det_limit_type=min --output ../output/table +output is +```bash +teds: 95.89 ``` -After running, the excel sheet of each picture will be saved in the directory specified by the output field -Reference +## 5. Reference 1. https://github.com/ibm-aur-nlp/PubTabNet 2. https://arxiv.org/pdf/1911.10683 diff --git a/ppstructure/table/README_ch.md b/ppstructure/table/README_ch.md index a0a64d6b7ebcb272e4b607975170a679abd036ab..1ef126261d9ce832cd1919a1b3991f341add998c 100644 --- a/ppstructure/table/README_ch.md +++ b/ppstructure/table/README_ch.md @@ -2,22 +2,22 @@ # 表格识别 -- [1. 表格识别 pipeline](#1) -- [2. 性能](#2) -- [3. 使用](#3) - - [3.1 快速开始](#31) - - [3.2 训练](#32) - - [3.3 评估](#33) - - [3.4 预测](#34) +- [1. 表格识别 pipeline](#1-表格识别-pipeline) +- [2. 性能](#2-性能) +- [3. 效果演示](#3-效果演示) +- [4. 使用](#4-使用) + - [4.1 快速开始](#41-快速开始) + - [4.2 模型训练、评估与推理](#42-模型训练评估与推理) + - [4.3 计算TEDS](#43-计算teds) +- [5. Reference](#5-reference) - ## 1. 表格识别 pipeline 表格识别主要包含三个模型 1. 单行文本检测-DB 2. 单行文本识别-CRNN -3. 表格结构和cell坐标预测-RARE +3. 表格结构和cell坐标预测-SLANet 具体流程图如下 @@ -26,111 +26,137 @@ 流程说明: 1. 图片由单行文字检测模型检测到单行文字的坐标,然后送入识别模型拿到识别结果。 -2. 图片由表格结构和cell坐标预测模型拿到表格的结构信息和单元格的坐标信息。 +2. 图片由SLANet模型拿到表格的结构信息和单元格的坐标信息。 3. 由单行文字的坐标、识别结果和单元格的坐标一起组合出单元格的识别结果。 4. 单元格的识别结果和表格结构一起构造表格的html字符串。 - ## 2. 性能 我们在 PubTabNet[1] 评估数据集上对算法进行了评估,性能如下 -|算法|[TEDS(Tree-Edit-Distance-based Similarity)](https://github.com/ibm-aur-nlp/PubTabNet/tree/master/src)| -| --- | --- | -| EDD[2] | 88.3 | -| Ours | 93.32 | +|算法|Acc|[TEDS(Tree-Edit-Distance-based Similarity)](https://github.com/ibm-aur-nlp/PubTabNet/tree/master/src)|Speed| +| --- | --- | --- | ---| +| EDD[2] |x| 88.3% |x| +| TableRec-RARE(ours) | 71.73%| 93.88% |779ms| +| SLANet(ours) |76.31%| 95.89%|766ms| - -## 3. 使用 +性能指标解释如下: +- Acc: 模型对每张图像里表格结构的识别准确率,错一个token就算错误。 +- TEDS: 模型对表格信息还原的准确度,此指标评价内容不仅包含表格结构,还包含表格内的文字内容。 +- Speed: 模型在CPU机器上,开启MKL的情况下,单张图片的推理速度。 - -### 3.1 快速开始 +## 3. 效果演示 +![](../docs/imgs/table_ch_result1.jpg) +![](../docs/imgs/table_ch_result2.jpg) +![](../docs/imgs/table_ch_result3.jpg) + +## 4. 
使用 + +### 4.1 快速开始 + +PP-Structure目前提供了中英文两种语言的表格识别模型,模型链接见 [models_list](../docs/models_list.md)。也提供了whl包的形式方便快速使用,详见 [quickstart](../docs/quickstart.md)。 + +下面以中文表格识别模型为例,介绍如何识别一张表格。 + +使用如下命令即可快速完成一张表格的识别。 ```python cd PaddleOCR/ppstructure # 下载模型 mkdir inference && cd inference -# 下载超轻量级表格英文OCR模型的检测模型并解压 -wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_det_infer.tar && tar xf en_ppocr_mobile_v2.0_table_det_infer.tar -# 下载超轻量级表格英文OCR模型的识别模型并解压 -wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_rec_infer.tar && tar xf en_ppocr_mobile_v2.0_table_rec_infer.tar -# 下载超轻量级英文表格英寸模型并解压 -wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar && tar xf en_ppocr_mobile_v2.0_table_structure_infer.tar +# 下载PP-OCRv3文本检测模型并解压 +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar && tar xf ch_PP-OCRv3_det_infer.tar +# 下载PP-OCRv3文本识别模型并解压 +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar && tar xf ch_PP-OCRv3_rec_infer.tar +# 下载PP-Structurev2中文表格识别模型并解压 +wget https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_infer.tar && tar xf ch_ppstructure_mobile_v2.0_SLANet_infer.tar cd .. -# 执行预测 -python3 table/predict_table.py --det_model_dir=inference/en_ppocr_mobile_v2.0_table_det_infer --rec_model_dir=inference/en_ppocr_mobile_v2.0_table_rec_infer --table_model_dir=inference/en_ppocr_mobile_v2.0_table_structure_infer --image_dir=./docs/table/table.jpg --rec_char_dict_path=../ppocr/utils/dict/table_dict.txt --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt --det_limit_side_len=736 --det_limit_type=min --output ./output/table +# 执行表格识别 +python table/predict_table.py \ + --det_model_dir=inference/ch_PP-OCRv3_det_infer \ + --rec_model_dir=inference/ch_PP-OCRv3_rec_infer \ + --table_model_dir=inference/ch_ppstructure_mobile_v2.0_SLANet_infer \ + --rec_char_dict_path=../ppocr/utils/ppocr_keys_v1.txt \ + --table_char_dict_path=../ppocr/utils/dict/table_structure_dict_ch.txt \ + --image_dir=docs/table/table.jpg \ + --output=../output/table ``` -运行完成后,每张图片的excel表格会保存到output字段指定的目录下 +运行完成后,每张图片的excel表格会保存到output字段指定的目录下,同时在该目录下回生产一个html文件,用于可视化查看单元格坐标和识别的表格。 -note: 上述模型是在 PubLayNet 数据集上训练的表格识别模型,仅支持英文扫描场景,如需识别其他场景需要自己训练模型后替换 `det_model_dir`,`rec_model_dir`,`table_model_dir`三个字段即可。 +**NOTE** +1. 如果想使用英文模型,需要在 [models_list](../docs/models_list.md) 中下载英文文字检测识别模型和英文表格识别模型,同时替换`table_structure_dict_ch.txt`为`table_structure_dict.txt`即可。 +2. 
如需使用TableRec-RARE模型,需要替换`table_structure_dict_ch.txt`为`table_structure_dict.txt`,同时参数`--merge_no_span_structure=False` - -### 3.2 训练 +### 4.2 模型训练、评估与推理 -在这一章节中,我们仅介绍表格结构模型的训练,[文字检测](../../doc/doc_ch/detection.md)和[文字识别](../../doc/doc_ch/recognition.md)的模型训练请参考对应的文档。 +文本检测模型的训练、评估和推理流程可参考 [detection](../../doc/doc_ch/detection.md) -* 数据准备 +文本识别模型的训练、评估和推理流程可参考 [recognition](../../doc/doc_ch/recognition.md) -训练数据使用公开数据集PubTabNet ([论文](https://arxiv.org/abs/1911.10683),[下载地址](https://github.com/ibm-aur-nlp/PubTabNet))。PubTabNet数据集包含约50万张表格数据的图像,以及图像对应的html格式的注释。 +表格识别模型的训练、评估和推理流程可参考 [table_recognition](../../doc/doc_ch/table_recognition.md) -* 启动训练 +### 4.3 计算TEDS -*如果您安装的是cpu版本,请将配置文件中的 `use_gpu` 字段修改为false* -```shell -# 单机单卡训练 -python3 tools/train.py -c configs/table/table_mv3.yml -# 单机多卡训练,通过 --gpus 参数设置使用的GPU ID -python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/table/table_mv3.yml -``` - -上述指令中,通过-c 选择训练使用configs/table/table_mv3.yml配置文件。有关配置文件的详细解释,请参考[链接](../../doc/doc_ch/config.md)。 - -* 断点训练 - -如果训练程序中断,如果希望加载训练中断的模型从而恢复训练,可以通过指定Global.checkpoints指定要加载的模型路径: -```shell -python3 tools/train.py -c configs/table/table_mv3.yml -o Global.checkpoints=./your/trained/model +表格使用 [TEDS(Tree-Edit-Distance-based Similarity)](https://github.com/ibm-aur-nlp/PubTabNet/tree/master/src) 作为模型的评估指标。在进行模型评估之前,需要将pipeline中的三个模型分别导出为inference模型(我们已经提供好),还需要准备评估的gt, gt示例如下: +```txt +PMC5755158_010_01.png
<html><body><table><thead><tr><td>Weaning</td><td>Week 15</td><td>Off-test</td></tr></thead><tbody><tr><td>Weaning</td></tr><tr><td>Week 15</td><td>0.17 ± 0.08</td><td>0.16 ± 0.03</td></tr><tr><td>Off-test</td><td>0.80 ± 0.24</td><td>0.19 ± 0.09</td></tr></tbody></table></body></html>
``` +gt每一行都由文件名和表格的html字符串组成,文件名和表格的html字符串之间使用`\t`分隔。 -**注意**:`Global.checkpoints`的优先级高于`Global.pretrain_weights`的优先级,即同时指定两个参数时,优先加载`Global.checkpoints`指定的模型,如果`Global.checkpoints`指定的模型路径有误,会加载`Global.pretrain_weights`指定的模型。 - - -### 3.3 评估 - -表格使用 [TEDS(Tree-Edit-Distance-based Similarity)](https://github.com/ibm-aur-nlp/PubTabNet/tree/master/src) 作为模型的评估指标。在进行模型评估之前,需要将pipeline中的三个模型分别导出为inference模型(我们已经提供好),还需要准备评估的gt, gt示例如下: -```json -{"PMC4289340_004_00.png": [ - ["", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "
", "", "", "
", "", "", "
", "", ""], - [[1, 4, 29, 13], [137, 4, 161, 13], [215, 4, 236, 13], [1, 17, 30, 27], [137, 17, 147, 27], [215, 17, 225, 27]], - [["", "F", "e", "a", "t", "u", "r", "e", ""], ["", "G", "b", "3", " ", "+", ""], ["", "G", "b", "3", " ", "-", ""], ["", "P", "a", "t", "i", "e", "n", "t", "s", ""], ["6", "2"], ["4", "5"]] -]} +也可使用如下命令,由标注文件生成评估的gt文件: +```python +python3 ppstructure/table/convert_label2html.py --ori_gt_path /path/to/your_label_file --save_path /path/to/save_file ``` -json 中,key为图片名,value为对应的gt,gt是一个由三个item组成的list,每个item分别为 -1. 表格结构的html字符串list -2. 每个cell的坐标 (不包括cell里文字为空的) -3. 每个cell里的文字信息 (不包括cell里文字为空的) 准备完成后使用如下命令进行评估,评估完成后会输出teds指标。 ```python cd PaddleOCR/ppstructure -python3 table/eval_table.py --det_model_dir=path/to/det_model_dir --rec_model_dir=path/to/rec_model_dir --table_model_dir=path/to/table_model_dir --image_dir=../doc/table/1.png --rec_char_dict_path=../ppocr/utils/dict/table_dict.txt --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt --det_limit_side_len=736 --det_limit_type=min --gt_path=path/to/gt.json -``` -如使用PubLatNet评估数据集,将会输出 -```bash -teds: 93.32 +python3 table/eval_table.py \ + --det_model_dir=path/to/det_model_dir \ + --rec_model_dir=path/to/rec_model_dir \ + --table_model_dir=path/to/table_model_dir \ + --image_dir=../doc/table/1.png \ + --rec_char_dict_path=../ppocr/utils/dict/table_dict.txt \ + --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt \ + --det_limit_side_len=736 \ + --det_limit_type=min \ + --gt_path=path/to/gt.txt ``` - -### 3.4 预测 +如使用英文表格识别模型在PubLatNet数据集上进行评估 -```python +```bash cd PaddleOCR/ppstructure -python3 table/predict_table.py --det_model_dir=path/to/det_model_dir --rec_model_dir=path/to/rec_model_dir --table_model_dir=path/to/table_model_dir --image_dir=../doc/table/1.png --rec_char_dict_path=../ppocr/utils/dict/table_dict.txt --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt --det_limit_side_len=736 --det_limit_type=min --output ../output/table +# 下载模型 +mkdir inference && cd inference +# 下载基于PubTabNet数据集训练的文本检测模型并解压 +wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_det_infer.tar && tar xf en_ppocr_mobile_v2.0_table_det_infer.tar +# 下载基于PubTabNet数据集训练的文本识别模型并解压 +wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_rec_infer.tar && tar xf en_ppocr_mobile_v2.0_table_rec_infer.tar +# 下载基于PubTabNet数据集训练的表格识别模型并解压 +wget https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/en_ppstructure_mobile_v2.0_SLANet_infer.tar && tar xf en_ppstructure_mobile_v2.0_SLANet_infer.tar +cd .. + +python3 table/eval_table.py \ + --det_model_dir=inference/en_ppocr_mobile_v2.0_table_det_infer \ + --rec_model_dir=inference/en_ppocr_mobile_v2.0_table_rec_infer \ + --table_model_dir=inference/en_ppstructure_mobile_v2.0_SLANet_infer \ + --image_dir=train_data/table/pubtabnet/val/ \ + --rec_char_dict_path=../ppocr/utils/dict/table_dict.txt \ + --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt \ + --det_limit_side_len=736 \ + --det_limit_type=min \ + --gt_path=path/to/gt.txt +``` + +将会输出 +```bash +teds: 95.89 ``` -# Reference +## 5. Reference 1. https://github.com/ibm-aur-nlp/PubTabNet 2. 
https://arxiv.org/pdf/1911.10683 diff --git a/ppstructure/table/convert_label2html.py b/ppstructure/table/convert_label2html.py new file mode 100644 index 0000000000000000000000000000000000000000..be16212ac420326a91cf8ab281a77e5990530c0e --- /dev/null +++ b/ppstructure/table/convert_label2html.py @@ -0,0 +1,102 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +conver table label to html +""" + +import json +import argparse +from tqdm import tqdm + + +def save_pred_txt(key, val, tmp_file_path): + with open(tmp_file_path, 'a+', encoding='utf-8') as f: + f.write('{}\t{}\n'.format(key, val)) + + +def skip_char(text, sp_char_list): + """ + skip empty cell + @param text: text in cell + @param sp_char_list: style char and special code + @return: + """ + for sp_char in sp_char_list: + text = text.replace(sp_char, '') + return text + + +def gen_html(img): + ''' + Formats HTML code from tokenized annotation of img + ''' + html_code = img['html']['structure']['tokens'].copy() + to_insert = [i for i, tag in enumerate(html_code) if tag in ('', '>')] + for i, cell in zip(to_insert[::-1], img['html']['cells'][::-1]): + if cell['tokens']: + text = ''.join(cell['tokens']) + # skip empty text + sp_char_list = ['', '', '\u2028', ' ', '', ''] + text_remove_style = skip_char(text, sp_char_list) + if len(text_remove_style) == 0: + continue + html_code.insert(i + 1, text) + html_code = ''.join(html_code) + html_code = '{}
'.format(html_code) + return html_code + + +def load_gt_data(gt_path): + """ + load gt + @param gt_path: + @return: + """ + data_list = {} + with open(gt_path, 'rb') as f: + lines = f.readlines() + for line in tqdm(lines): + data_line = line.decode('utf-8').strip("\n") + info = json.loads(data_line) + data_list[info['filename']] = info + return data_list + + +def convert(origin_gt_path, save_path): + """ + gen html from label file + @param origin_gt_path: + @param save_path: + @return: + """ + data_dict = load_gt_data(origin_gt_path) + for img_name, gt in tqdm(data_dict.items()): + html = gen_html(gt) + save_pred_txt(img_name, html, save_path) + print('conver finish') + + +def parse_args(): + parser = argparse.ArgumentParser(description="args for paddleserving") + parser.add_argument( + "--ori_gt_path", type=str, required=True, help="label gt path") + parser.add_argument( + "--save_path", type=str, required=True, help="path to save file") + args = parser.parse_args() + return args + + +if __name__ == '__main__': + args = parse_args() + convert(args.ori_gt_path, args.save_path) diff --git a/ppstructure/table/eval_table.py b/ppstructure/table/eval_table.py index 87b44d3d9792356ec1cdc65693392c288bf67448..4fc16b5d4c6a0143dcea149508bd6b62730092b6 100755 --- a/ppstructure/table/eval_table.py +++ b/ppstructure/table/eval_table.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,14 +11,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
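+# Each line of the gt file passed via --gt_path is '<image name>\t<table html string>' (see load_txt below)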
+ import os import sys + __dir__ = os.path.dirname(os.path.abspath(__file__)) sys.path.append(__dir__) -sys.path.append(os.path.abspath(os.path.join(__dir__, '../..'))) +sys.path.insert(0, os.path.abspath(os.path.join(__dir__, '../..'))) import cv2 -import json +import pickle +import paddle from tqdm import tqdm from ppstructure.table.table_metric import TEDS from ppstructure.table.predict_table import TableSystem @@ -33,40 +36,74 @@ def parse_args(): parser.add_argument("--gt_path", type=str) return parser.parse_args() -def main(gt_path, img_root, args): - teds = TEDS(n_jobs=16) +def load_txt(txt_path): + pred_html_dict = {} + if not os.path.exists(txt_path): + return pred_html_dict + with open(txt_path, encoding='utf-8') as f: + lines = f.readlines() + for line in lines: + line = line.strip().split('\t') + img_name, pred_html = line + pred_html_dict[img_name] = pred_html + return pred_html_dict + + +def load_result(path): + data = {} + if os.path.exists(path): + data = pickle.load(open(path, 'rb')) + return data + + +def save_result(path, data): + old_data = load_result(path) + old_data.update(data) + with open(path, 'wb') as f: + pickle.dump(old_data, f) + + +def main(gt_path, img_root, args): + os.makedirs(args.output, exist_ok=True) + # init TableSystem text_sys = TableSystem(args) - jsons_gt = json.load(open(gt_path)) # gt + # load gt and preds html result + gt_html_dict = load_txt(gt_path) + + ocr_result = load_result(os.path.join(args.output, 'ocr.pickle')) + structure_result = load_result( + os.path.join(args.output, 'structure.pickle')) + pred_htmls = [] gt_htmls = [] - for img_name in tqdm(jsons_gt): - # read image - img = cv2.imread(os.path.join(img_root,img_name)) - pred_html = text_sys(img) - pred_htmls.append(pred_html) + for img_name, gt_html in tqdm(gt_html_dict.items()): + img = cv2.imread(os.path.join(img_root, img_name)) + # run ocr and save result + if img_name not in ocr_result: + dt_boxes, rec_res, _, _ = text_sys._ocr(img) + ocr_result[img_name] = [dt_boxes, rec_res] + save_result(os.path.join(args.output, 'ocr.pickle'), ocr_result) + # run structure and save result + if img_name not in structure_result: + structure_res, _ = text_sys._structure(img) + structure_result[img_name] = structure_res + save_result( + os.path.join(args.output, 'structure.pickle'), structure_result) + dt_boxes, rec_res = ocr_result[img_name] + structure_res = structure_result[img_name] + # match ocr and structure + pred_html = text_sys.match(structure_res, dt_boxes, rec_res) - gt_structures, gt_bboxes, gt_contents = jsons_gt[img_name] - gt_html, gt = get_gt_html(gt_structures, gt_contents) + pred_htmls.append(pred_html) gt_htmls.append(gt_html) - scores = teds.batch_evaluate_html(gt_htmls, pred_htmls) - logger.info('teds:', sum(scores) / len(scores)) - -def get_gt_html(gt_structures, gt_contents): - end_html = [] - td_index = 0 - for tag in gt_structures: - if '' in tag: - if gt_contents[td_index] != []: - end_html.extend(gt_contents[td_index]) - end_html.append(tag) - td_index += 1 - else: - end_html.append(tag) - return ''.join(end_html), end_html + # compute teds + teds = TEDS(n_jobs=16) + scores = teds.batch_evaluate_html(gt_htmls, pred_htmls) + logger.info('teds: {}'.format(sum(scores) / len(scores))) if __name__ == '__main__': args = parse_args() - main(args.gt_path,args.image_dir, args) + main(args.gt_path, args.image_dir, args) diff --git a/ppstructure/table/matcher.py b/ppstructure/table/matcher.py index 
c3b56384403f5fd92a8db4b4bb378a6d55e5a76c..9c5bd2630f78527ade4fd1309f22d1731fe838a2 100755 --- a/ppstructure/table/matcher.py +++ b/ppstructure/table/matcher.py @@ -1,11 +1,29 @@ -import json +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +from ppstructure.table.table_master_match import deal_eb_token, deal_bb + + def distance(box_1, box_2): - x1, y1, x2, y2 = box_1 - x3, y3, x4, y4 = box_2 - dis = abs(x3 - x1) + abs(y3 - y1) + abs(x4- x2) + abs(y4 - y2) - dis_2 = abs(x3 - x1) + abs(y3 - y1) - dis_3 = abs(x4- x2) + abs(y4 - y2) - return dis + min(dis_2, dis_3) + x1, y1, x2, y2 = box_1 + x3, y3, x4, y4 = box_2 + dis = abs(x3 - x1) + abs(y3 - y1) + abs(x4 - x2) + abs(y4 - y2) + dis_2 = abs(x3 - x1) + abs(y3 - y1) + dis_3 = abs(x4 - x2) + abs(y4 - y2) + return dis + min(dis_2, dis_3) + def compute_iou(rec1, rec2): """ @@ -18,175 +36,157 @@ def compute_iou(rec1, rec2): # computing area of each rectangles S_rec1 = (rec1[2] - rec1[0]) * (rec1[3] - rec1[1]) S_rec2 = (rec2[2] - rec2[0]) * (rec2[3] - rec2[1]) - + # computing the sum_area sum_area = S_rec1 + S_rec2 - + # find the each edge of intersect rectangle left_line = max(rec1[1], rec2[1]) right_line = min(rec1[3], rec2[3]) top_line = max(rec1[0], rec2[0]) bottom_line = min(rec1[2], rec2[2]) - + # judge if there is an intersect if left_line >= right_line or top_line >= bottom_line: return 0.0 else: intersect = (right_line - left_line) * (bottom_line - top_line) - return (intersect / (sum_area - intersect))*1.0 - - - -def matcher_merge(ocr_bboxes, pred_bboxes): - all_dis = [] - ious = [] - matched = {} - for i, gt_box in enumerate(ocr_bboxes): - distances = [] - for j, pred_box in enumerate(pred_bboxes): - # compute l1 distence and IOU between two boxes - distances.append((distance(gt_box, pred_box), 1. 
- compute_iou(gt_box, pred_box))) - sorted_distances = distances.copy() - # select nearest cell - sorted_distances = sorted(sorted_distances, key = lambda item: (item[1], item[0])) - if distances.index(sorted_distances[0]) not in matched.keys(): - matched[distances.index(sorted_distances[0])] = [i] - else: - matched[distances.index(sorted_distances[0])].append(i) - return matched#, sum(ious) / len(ious) - -def complex_num(pred_bboxes): - complex_nums = [] - for bbox in pred_bboxes: - distances = [] - temp_ious = [] - for pred_bbox in pred_bboxes: - if bbox != pred_bbox: - distances.append(distance(bbox, pred_bbox)) - temp_ious.append(compute_iou(bbox, pred_bbox)) - complex_nums.append(temp_ious[distances.index(min(distances))]) - return sum(complex_nums) / len(complex_nums) - -def get_rows(pred_bboxes): - pre_bbox = pred_bboxes[0] - res = [] - step = 0 - for i in range(len(pred_bboxes)): - bbox = pred_bboxes[i] - if bbox[1] - pre_bbox[1] > 2 or bbox[0] - pre_bbox[0] < 0: - break - else: - res.append(bbox) - step += 1 - for i in range(step): - pred_bboxes.pop(0) - return res, pred_bboxes -def refine_rows(pred_bboxes): # 微调整行的框,使在一条水平线上 - ys_1 = [] - ys_2 = [] - for box in pred_bboxes: - ys_1.append(box[1]) - ys_2.append(box[3]) - min_y_1 = sum(ys_1) / len(ys_1) - min_y_2 = sum(ys_2) / len(ys_2) - re_boxes = [] - for box in pred_bboxes: - box[1] = min_y_1 - box[3] = min_y_2 - re_boxes.append(box) - return re_boxes - -def matcher_refine_row(gt_bboxes, pred_bboxes): - before_refine_pred_bboxes = pred_bboxes.copy() - pred_bboxes = [] - while(len(before_refine_pred_bboxes) != 0): - row_bboxes, before_refine_pred_bboxes = get_rows(before_refine_pred_bboxes) - print(row_bboxes) - pred_bboxes.extend(refine_rows(row_bboxes)) - all_dis = [] - ious = [] - matched = {} - for i, gt_box in enumerate(gt_bboxes): - distances = [] - #temp_ious = [] - for j, pred_box in enumerate(pred_bboxes): - distances.append(distance(gt_box, pred_box)) - #temp_ious.append(compute_iou(gt_box, pred_box)) - #all_dis.append(min(distances)) - #ious.append(temp_ious[distances.index(min(distances))]) - if distances.index(min(distances)) not in matched.keys(): - matched[distances.index(min(distances))] = [i] + return (intersect / (sum_area - intersect)) * 1.0 + + +class TableMatch: + def __init__(self, filter_ocr_result=False, use_master=False): + self.filter_ocr_result = filter_ocr_result + self.use_master = use_master + + def __call__(self, structure_res, dt_boxes, rec_res): + pred_structures, pred_bboxes = structure_res + if self.filter_ocr_result: + dt_boxes, rec_res = self._filter_ocr_result(pred_bboxes, dt_boxes, + rec_res) + matched_index = self.match_result(dt_boxes, pred_bboxes) + if self.use_master: + pred_html, pred = self.get_pred_html_master(pred_structures, + matched_index, rec_res) else: - matched[distances.index(min(distances))].append(i) - return matched#, sum(ious) / len(ious) - - - -#先挑选出一行,再进行匹配 -def matcher_structure_1(gt_bboxes, pred_bboxes_rows, pred_bboxes): - gt_box_index = 0 - delete_gt_bboxes = gt_bboxes.copy() - match_bboxes_ready = [] - matched = {} - while(len(delete_gt_bboxes) != 0): - row_bboxes, delete_gt_bboxes = get_rows(delete_gt_bboxes) - row_bboxes = sorted(row_bboxes, key = lambda key: key[0]) - if len(pred_bboxes_rows) > 0: - match_bboxes_ready.extend(pred_bboxes_rows.pop(0)) - print(row_bboxes) - for i, gt_box in enumerate(row_bboxes): - #print(gt_box) - pred_distances = [] - distances = [] - for pred_bbox in pred_bboxes: - pred_distances.append(distance(gt_box, pred_bbox)) - for j, 
pred_box in enumerate(match_bboxes_ready): - distances.append(distance(gt_box, pred_box)) - index = pred_distances.index(min(distances)) - #print('index', index) - if index not in matched.keys(): - matched[index] = [gt_box_index] + pred_html, pred = self.get_pred_html(pred_structures, matched_index, + rec_res) + return pred_html + + def match_result(self, dt_boxes, pred_bboxes): + matched = {} + for i, gt_box in enumerate(dt_boxes): + distances = [] + for j, pred_box in enumerate(pred_bboxes): + if len(pred_box) == 8: + pred_box = [ + np.min(pred_box[0::2]), np.min(pred_box[1::2]), + np.max(pred_box[0::2]), np.max(pred_box[1::2]) + ] + distances.append((distance(gt_box, pred_box), + 1. - compute_iou(gt_box, pred_box) + )) # compute iou and l1 distance + sorted_distances = distances.copy() + # select det box by iou and l1 distance + sorted_distances = sorted( + sorted_distances, key=lambda item: (item[1], item[0])) + if distances.index(sorted_distances[0]) not in matched.keys(): + matched[distances.index(sorted_distances[0])] = [i] else: - matched[index].append(gt_box_index) - gt_box_index += 1 - return matched - -def matcher_structure(gt_bboxes, pred_bboxes_rows, pred_bboxes): - ''' - gt_bboxes: 排序后 - pred_bboxes: - ''' - pre_bbox = gt_bboxes[0] - matched = {} - match_bboxes_ready = [] - match_bboxes_ready.extend(pred_bboxes_rows.pop(0)) - for i, gt_box in enumerate(gt_bboxes): - - pred_distances = [] - for pred_bbox in pred_bboxes: - pred_distances.append(distance(gt_box, pred_bbox)) - distances = [] - gap_pre = gt_box[1] - pre_bbox[1] - gap_pre_1 = gt_box[0] - pre_bbox[2] - #print(gap_pre, len(pred_bboxes_rows)) - if (gap_pre_1 < 0 and len(pred_bboxes_rows) > 0): - match_bboxes_ready.extend(pred_bboxes_rows.pop(0)) - if len(pred_bboxes_rows) == 1: - match_bboxes_ready.extend(pred_bboxes_rows.pop(0)) - if len(match_bboxes_ready) == 0 and len(pred_bboxes_rows) > 0: - match_bboxes_ready.extend(pred_bboxes_rows.pop(0)) - if len(match_bboxes_ready) == 0 and len(pred_bboxes_rows) == 0: - break - #print(match_bboxes_ready) - for j, pred_box in enumerate(match_bboxes_ready): - distances.append(distance(gt_box, pred_box)) - index = pred_distances.index(min(distances)) - #print(gt_box, index) - #match_bboxes_ready.pop(distances.index(min(distances))) - print(gt_box, match_bboxes_ready[distances.index(min(distances))]) - if index not in matched.keys(): - matched[index] = [i] - else: - matched[index].append(i) - pre_bbox = gt_box - return matched + matched[distances.index(sorted_distances[0])].append(i) + return matched + + def get_pred_html(self, pred_structures, matched_index, ocr_contents): + end_html = [] + td_index = 0 + for tag in pred_structures: + if '' in tag: + if '' == tag: + end_html.extend('') + if td_index in matched_index.keys(): + b_with = False + if '' in ocr_contents[matched_index[td_index][ + 0]] and len(matched_index[td_index]) > 1: + b_with = True + end_html.extend('') + for i, td_index_index in enumerate(matched_index[td_index]): + content = ocr_contents[td_index_index][0] + if len(matched_index[td_index]) > 1: + if len(content) == 0: + continue + if content[0] == ' ': + content = content[1:] + if '' in content: + content = content[3:] + if '' in content: + content = content[:-4] + if len(content) == 0: + continue + if i != len(matched_index[ + td_index]) - 1 and ' ' != content[-1]: + content += ' ' + end_html.extend(content) + if b_with: + end_html.extend('') + if '' == tag: + end_html.append('') + else: + end_html.append(tag) + td_index += 1 + else: + end_html.append(tag) + 
return ''.join(end_html), end_html + + def get_pred_html_master(self, pred_structures, matched_index, + ocr_contents): + end_html = [] + td_index = 0 + for token in pred_structures: + if '' in token: + txt = '' + b_with = False + if td_index in matched_index.keys(): + if '' in ocr_contents[matched_index[td_index][ + 0]] and len(matched_index[td_index]) > 1: + b_with = True + for i, td_index_index in enumerate(matched_index[td_index]): + content = ocr_contents[td_index_index][0] + if len(matched_index[td_index]) > 1: + if len(content) == 0: + continue + if content[0] == ' ': + content = content[1:] + if '' in content: + content = content[3:] + if '' in content: + content = content[:-4] + if len(content) == 0: + continue + if i != len(matched_index[ + td_index]) - 1 and ' ' != content[-1]: + content += ' ' + txt += content + if b_with: + txt = '{}'.format(txt) + if '' == token: + token = '{}'.format(txt) + else: + token = '{}'.format(txt) + td_index += 1 + token = deal_eb_token(token) + end_html.append(token) + html = ''.join(end_html) + html = deal_bb(html) + return html, end_html + + def _filter_ocr_result(self, pred_bboxes, dt_boxes, rec_res): + y1 = pred_bboxes[:, 1::2].min() + new_dt_boxes = [] + new_rec_res = [] + + for box, rec in zip(dt_boxes, rec_res): + if np.max(box[1::2]) < y1: + continue + new_dt_boxes.append(box) + new_rec_res.append(rec) + return new_dt_boxes, new_rec_res diff --git a/ppstructure/table/predict_structure.py b/ppstructure/table/predict_structure.py index 7a7d3169d567493b4707b63c75cec07485cf5acb..0bf100852b9e9d501dfc858d8ce0787da42a61ed 100755 --- a/ppstructure/table/predict_structure.py +++ b/ppstructure/table/predict_structure.py @@ -16,7 +16,7 @@ import sys __dir__ = os.path.dirname(os.path.abspath(__file__)) sys.path.append(__dir__) -sys.path.append(os.path.abspath(os.path.join(__dir__, '../..'))) +sys.path.insert(0, os.path.abspath(os.path.join(__dir__, '../..'))) os.environ["FLAGS_allocator_strategy"] = 'auto_growth' @@ -29,7 +29,7 @@ import tools.infer.utility as utility from ppocr.data import create_operators, transform from ppocr.postprocess import build_post_process from ppocr.utils.logging import get_logger -from ppocr.utils.utility import get_image_file_list, check_and_read_gif +from ppocr.utils.utility import get_image_file_list, check_and_read from ppocr.utils.visual import draw_rectangle from ppstructure.utility import parse_args @@ -68,17 +68,20 @@ def build_pre_process_list(args): class TableStructurer(object): def __init__(self, args): + self.use_onnx = args.use_onnx pre_process_list = build_pre_process_list(args) if args.table_algorithm not in ['TableMaster']: postprocess_params = { 'name': 'TableLabelDecode', "character_dict_path": args.table_char_dict_path, + 'merge_no_span_structure': args.merge_no_span_structure } else: postprocess_params = { 'name': 'TableMasterLabelDecode', "character_dict_path": args.table_char_dict_path, - 'box_shape': 'pad' + 'box_shape': 'pad', + 'merge_no_span_structure': args.merge_no_span_structure } self.preprocess_op = create_operators(pre_process_list) @@ -87,6 +90,7 @@ class TableStructurer(object): utility.create_predictor(args, 'table', logger) def __call__(self, img): + starttime = time.time() ori_im = img.copy() data = {'image': img} data = transform(data, self.preprocess_op) @@ -95,14 +99,17 @@ class TableStructurer(object): return None, 0 img = np.expand_dims(img, axis=0) img = img.copy() - starttime = time.time() - - self.input_tensor.copy_from_cpu(img) - self.predictor.run() - outputs = [] - for 
output_tensor in self.output_tensors: - output = output_tensor.copy_to_cpu() - outputs.append(output) + if self.use_onnx: + input_dict = {} + input_dict[self.input_tensor.name] = img + outputs = self.predictor.run(self.output_tensors, input_dict) + else: + self.input_tensor.copy_from_cpu(img) + self.predictor.run() + outputs = [] + for output_tensor in self.output_tensors: + output = output_tensor.copy_to_cpu() + outputs.append(output) preds = {} preds['structure_probs'] = outputs[1] @@ -126,13 +133,12 @@ def main(args): table_structurer = TableStructurer(args) count = 0 total_time = 0 - use_xywh = args.table_algorithm in ['TableMaster'] os.makedirs(args.output, exist_ok=True) with open( os.path.join(args.output, 'infer.txt'), mode='w', encoding='utf-8') as f_w: for image_file in image_file_list: - img, flag = check_and_read_gif(image_file) + img, flag, _ = check_and_read(image_file) if not flag: img = cv2.imread(image_file) if img is None: @@ -146,7 +152,10 @@ def main(args): f_w.write("result: {}, {}\n".format(structure_str_list, bbox_list_str)) - img = draw_rectangle(image_file, bbox_list, use_xywh) + if len(bbox_list) > 0 and len(bbox_list[0]) == 4: + img = draw_rectangle(image_file, bbox_list) + else: + img = utility.draw_boxes(img, bbox_list) img_save_path = os.path.join(args.output, os.path.basename(image_file)) cv2.imwrite(img_save_path, img) diff --git a/ppstructure/table/predict_table.py b/ppstructure/table/predict_table.py index becc6daef02e7e3e98fcccd3b87a93e725577886..aeec66deca62f648df249a5833dbfa678d2da612 100644 --- a/ppstructure/table/predict_table.py +++ b/ppstructure/table/predict_table.py @@ -18,20 +18,23 @@ import subprocess __dir__ = os.path.dirname(os.path.abspath(__file__)) sys.path.append(__dir__) -sys.path.append(os.path.abspath(os.path.join(__dir__, '..'))) -sys.path.append(os.path.abspath(os.path.join(__dir__, '../..'))) +sys.path.insert(0, os.path.abspath(os.path.join(__dir__, '..'))) +sys.path.insert(0, os.path.abspath(os.path.join(__dir__, '../..'))) os.environ["FLAGS_allocator_strategy"] = 'auto_growth' import cv2 import copy +import logging import numpy as np import time import tools.infer.predict_rec as predict_rec import tools.infer.predict_det as predict_det import tools.infer.utility as utility -from ppocr.utils.utility import get_image_file_list, check_and_read_gif +from tools.infer.predict_system import sorted_boxes +from ppocr.utils.utility import get_image_file_list, check_and_read from ppocr.utils.logging import get_logger -from ppstructure.table.matcher import distance, compute_iou +from ppstructure.table.matcher import TableMatch +from ppstructure.table.table_master_match import TableMasterMatcher from ppstructure.utility import parse_args import ppstructure.table.predict_structure as predict_strture @@ -55,11 +58,20 @@ def expand(pix, det_box, shape): class TableSystem(object): def __init__(self, args, text_detector=None, text_recognizer=None): + if not args.show_log: + logger.setLevel(logging.INFO) + self.text_detector = predict_det.TextDetector( args) if text_detector is None else text_detector self.text_recognizer = predict_rec.TextRecognizer( args) if text_recognizer is None else text_recognizer + self.table_structurer = predict_strture.TableStructurer(args) + if args.table_algorithm in ['TableMaster']: + self.match = TableMasterMatcher() + else: + self.match = TableMatch(filter_ocr_result=True) + self.benchmark = args.benchmark self.predictor, self.input_tensor, self.output_tensors, self.config = utility.create_predictor( args, 'table', 
logger) @@ -85,145 +97,72 @@ class TableSystem(object): def __call__(self, img, return_ocr_result_in_table=False): result = dict() - ori_im = img.copy() + time_dict = {'det': 0, 'rec': 0, 'table': 0, 'all': 0, 'match': 0} + start = time.time() + + structure_res, elapse = self._structure(copy.deepcopy(img)) + result['cell_bbox'] = structure_res[1].tolist() + time_dict['table'] = elapse + + dt_boxes, rec_res, det_elapse, rec_elapse = self._ocr( + copy.deepcopy(img)) + time_dict['det'] = det_elapse + time_dict['rec'] = rec_elapse + + if return_ocr_result_in_table: + result['boxes'] = dt_boxes #[x.tolist() for x in dt_boxes] + result['rec_res'] = rec_res + + tic = time.time() + pred_html = self.match(structure_res, dt_boxes, rec_res) + toc = time.time() + time_dict['match'] = toc - tic + result['html'] = pred_html + if self.benchmark: + self.autolog.times.end(stamp=True) + end = time.time() + time_dict['all'] = end - start + if self.benchmark: + self.autolog.times.stamp() + return result, time_dict + + def _structure(self, img): if self.benchmark: self.autolog.times.start() structure_res, elapse = self.table_structurer(copy.deepcopy(img)) + return structure_res, elapse + + def _ocr(self, img): + h, w = img.shape[:2] if self.benchmark: self.autolog.times.stamp() - dt_boxes, elapse = self.text_detector(copy.deepcopy(img)) + dt_boxes, det_elapse = self.text_detector(copy.deepcopy(img)) dt_boxes = sorted_boxes(dt_boxes) - if return_ocr_result_in_table: - result['boxes'] = [x.tolist() for x in dt_boxes] + r_boxes = [] for box in dt_boxes: - x_min = box[:, 0].min() - 1 - x_max = box[:, 0].max() + 1 - y_min = box[:, 1].min() - 1 - y_max = box[:, 1].max() + 1 + x_min = max(0, box[:, 0].min() - 1) + x_max = min(w, box[:, 0].max() + 1) + y_min = max(0, box[:, 1].min() - 1) + y_max = min(h, box[:, 1].max() + 1) box = [x_min, y_min, x_max, y_max] r_boxes.append(box) dt_boxes = np.array(r_boxes) logger.debug("dt_boxes num : {}, elapse : {}".format( - len(dt_boxes), elapse)) + len(dt_boxes), det_elapse)) if dt_boxes is None: return None, None + img_crop_list = [] for i in range(len(dt_boxes)): det_box = dt_boxes[i] - x0, y0, x1, y1 = expand(2, det_box, ori_im.shape) - text_rect = ori_im[int(y0):int(y1), int(x0):int(x1), :] + x0, y0, x1, y1 = expand(2, det_box, img.shape) + text_rect = img[int(y0):int(y1), int(x0):int(x1), :] img_crop_list.append(text_rect) - rec_res, elapse = self.text_recognizer(img_crop_list) + rec_res, rec_elapse = self.text_recognizer(img_crop_list) logger.debug("rec_res num : {}, elapse : {}".format( - len(rec_res), elapse)) - if self.benchmark: - self.autolog.times.stamp() - if return_ocr_result_in_table: - result['rec_res'] = rec_res - pred_html, pred = self.rebuild_table(structure_res, dt_boxes, rec_res) - result['html'] = pred_html - if self.benchmark: - self.autolog.times.end(stamp=True) - return result - - def rebuild_table(self, structure_res, dt_boxes, rec_res): - pred_structures, pred_bboxes = structure_res - dt_boxes, rec_res = self.filter_ocr_result(pred_bboxes,dt_boxes, rec_res) - matched_index = self.match_result(dt_boxes, pred_bboxes) - pred_html, pred = self.get_pred_html(pred_structures, matched_index, - rec_res) - return pred_html, pred - - def filter_ocr_result(self, pred_bboxes,dt_boxes, rec_res): - y1 = pred_bboxes[:,1::2].min() - new_dt_boxes = [] - new_rec_res = [] - - for box,rec in zip(dt_boxes, rec_res): - if np.max(box[1::2]) < y1: - continue - new_dt_boxes.append(box) - new_rec_res.append(rec) - return new_dt_boxes, new_rec_res - - - def match_result(self, 
dt_boxes, pred_bboxes): - matched = {} - for i, gt_box in enumerate(dt_boxes): - # gt_box = [np.min(gt_box[:, 0]), np.min(gt_box[:, 1]), np.max(gt_box[:, 0]), np.max(gt_box[:, 1])] - distances = [] - for j, pred_box in enumerate(pred_bboxes): - distances.append((distance(gt_box, pred_box), - 1. - compute_iou(gt_box, pred_box) - )) # 获取两两cell之间的L1距离和 1- IOU - sorted_distances = distances.copy() - # 根据距离和IOU挑选最"近"的cell - sorted_distances = sorted( - sorted_distances, key=lambda item: (item[1], item[0])) - if distances.index(sorted_distances[0]) not in matched.keys(): - matched[distances.index(sorted_distances[0])] = [i] - else: - matched[distances.index(sorted_distances[0])].append(i) - return matched - - def get_pred_html(self, pred_structures, matched_index, ocr_contents): - end_html = [] - td_index = 0 - for tag in pred_structures: - if '' in tag: - if td_index in matched_index.keys(): - b_with = False - if '' in ocr_contents[matched_index[td_index][ - 0]] and len(matched_index[td_index]) > 1: - b_with = True - end_html.extend('') - for i, td_index_index in enumerate(matched_index[td_index]): - content = ocr_contents[td_index_index][0] - if len(matched_index[td_index]) > 1: - if len(content) == 0: - continue - if content[0] == ' ': - content = content[1:] - if '' in content: - content = content[3:] - if '' in content: - content = content[:-4] - if len(content) == 0: - continue - if i != len(matched_index[ - td_index]) - 1 and ' ' != content[-1]: - content += ' ' - end_html.extend(content) - if b_with: - end_html.extend('') - - end_html.append(tag) - td_index += 1 - else: - end_html.append(tag) - return ''.join(end_html), end_html - - -def sorted_boxes(dt_boxes): - """ - Sort text boxes in order from top to bottom, left to right - args: - dt_boxes(array):detected text boxes with shape [4, 2] - return: - sorted boxes(array) with shape [4, 2] - """ - num_boxes = dt_boxes.shape[0] - sorted_boxes = sorted(dt_boxes, key=lambda x: (x[0][1], x[0][0])) - _boxes = list(sorted_boxes) - - for i in range(num_boxes - 1): - if abs(_boxes[i + 1][0][1] - _boxes[i][0][1]) < 10 and \ - (_boxes[i + 1][0][0] < _boxes[i][0][0]): - tmp = _boxes[i] - _boxes[i] = _boxes[i + 1] - _boxes[i + 1] = tmp - return _boxes + len(rec_res), rec_elapse)) + return dt_boxes, rec_res, det_elapse, rec_elapse def to_excel(html_table, excel_path): @@ -236,11 +175,26 @@ def main(args): image_file_list = image_file_list[args.process_id::args.total_process_num] os.makedirs(args.output, exist_ok=True) - text_sys = TableSystem(args) + table_sys = TableSystem(args) img_num = len(image_file_list) + + f_html = open( + os.path.join(args.output, 'show.html'), mode='w', encoding='utf-8') + f_html.write('\n\n') + f_html.write('\n') + f_html.write( + "" + ) + f_html.write("\n") + f_html.write('') + f_html.write('') + f_html.write('') + f_html.write("\n") + for i, image_file in enumerate(image_file_list): logger.info("[{}/{}] {}".format(i, img_num, image_file)) - img, flag = check_and_read_gif(image_file) + img, flag, _ = check_and_read(image_file) excel_path = os.path.join( args.output, os.path.basename(image_file).split('.')[0] + '.xlsx') if not flag: @@ -249,13 +203,35 @@ def main(args): logger.error("error in loading image:{}".format(image_file)) continue starttime = time.time() - pred_res = text_sys(img) + pred_res, _ = table_sys(img) pred_html = pred_res['html'] logger.info(pred_html) to_excel(pred_html, excel_path) logger.info('excel saved to {}'.format(excel_path)) elapse = time.time() - starttime logger.info("Predict time : 
{:.3f}s".format(elapse)) + + if len(pred_res['cell_bbox']) > 0 and len(pred_res['cell_bbox'][ + 0]) == 4: + img = predict_strture.draw_rectangle(image_file, + pred_res['cell_bbox']) + else: + img = utility.draw_boxes(img, pred_res['cell_bbox']) + img_save_path = os.path.join(args.output, os.path.basename(image_file)) + cv2.imwrite(img_save_path, img) + + f_html.write("\n") + f_html.write(f'\n') + f_html.write('
img name\n') + f_html.write('ori imagetable htmlcell box
{os.path.basename(image_file)}
\n') + f_html.write(f'
' + pred_html.replace( + '
', '').replace('
', '') + + '
\n') + f_html.write( + f'\n') + f_html.write("\n") + f_html.write("\n") + f_html.close() + if args.benchmark: text_sys.autolog.report() diff --git a/ppstructure/table/table_master_match.py b/ppstructure/table/table_master_match.py new file mode 100644 index 0000000000000000000000000000000000000000..7a7208d4a94bb357b1bbce0d664d9d6449a96874 --- /dev/null +++ b/ppstructure/table/table_master_match.py @@ -0,0 +1,953 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: +https://github.com/JiaquanYe/TableMASTER-mmocr/blob/master/table_recognition/match.py +""" + +import os +import re +import cv2 +import glob +import copy +import math +import pickle +import numpy as np + +from shapely.geometry import Polygon, MultiPoint +""" +Useful function in matching. +""" + + +def remove_empty_bboxes(bboxes): + """ + remove [0., 0., 0., 0.] in structure master bboxes. + len(bboxes.shape) must be 2. + :param bboxes: + :return: + """ + new_bboxes = [] + for bbox in bboxes: + if sum(bbox) == 0.: + continue + new_bboxes.append(bbox) + return np.array(new_bboxes) + + +def xywh2xyxy(bboxes): + if len(bboxes.shape) == 1: + new_bboxes = np.empty_like(bboxes) + new_bboxes[0] = bboxes[0] - bboxes[2] / 2 + new_bboxes[1] = bboxes[1] - bboxes[3] / 2 + new_bboxes[2] = bboxes[0] + bboxes[2] / 2 + new_bboxes[3] = bboxes[1] + bboxes[3] / 2 + return new_bboxes + elif len(bboxes.shape) == 2: + new_bboxes = np.empty_like(bboxes) + new_bboxes[:, 0] = bboxes[:, 0] - bboxes[:, 2] / 2 + new_bboxes[:, 1] = bboxes[:, 1] - bboxes[:, 3] / 2 + new_bboxes[:, 2] = bboxes[:, 0] + bboxes[:, 2] / 2 + new_bboxes[:, 3] = bboxes[:, 1] + bboxes[:, 3] / 2 + return new_bboxes + else: + raise ValueError + + +def xyxy2xywh(bboxes): + if len(bboxes.shape) == 1: + new_bboxes = np.empty_like(bboxes) + new_bboxes[0] = bboxes[0] + (bboxes[2] - bboxes[0]) / 2 + new_bboxes[1] = bboxes[1] + (bboxes[3] - bboxes[1]) / 2 + new_bboxes[2] = bboxes[2] - bboxes[0] + new_bboxes[3] = bboxes[3] - bboxes[1] + return new_bboxes + elif len(bboxes.shape) == 2: + new_bboxes = np.empty_like(bboxes) + new_bboxes[:, 0] = bboxes[:, 0] + (bboxes[:, 2] - bboxes[:, 0]) / 2 + new_bboxes[:, 1] = bboxes[:, 1] + (bboxes[:, 3] - bboxes[:, 1]) / 2 + new_bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 0] + new_bboxes[:, 3] = bboxes[:, 3] - bboxes[:, 1] + return new_bboxes + else: + raise ValueError + + +def pickle_load(path, prefix='end2end'): + if os.path.isfile(path): + data = pickle.load(open(path, 'rb')) + elif os.path.isdir(path): + data = dict() + search_path = os.path.join(path, '{}_*.pkl'.format(prefix)) + pkls = glob.glob(search_path) + for pkl in pkls: + this_data = pickle.load(open(pkl, 'rb')) + data.update(this_data) + else: + raise ValueError + return data + + +def convert_coord(xyxy): + """ + Convert two points format to four points format. 
+ :param xyxy: + :return: + """ + new_bbox = np.zeros([4, 2], dtype=np.float32) + new_bbox[0, 0], new_bbox[0, 1] = xyxy[0], xyxy[1] + new_bbox[1, 0], new_bbox[1, 1] = xyxy[2], xyxy[1] + new_bbox[2, 0], new_bbox[2, 1] = xyxy[2], xyxy[3] + new_bbox[3, 0], new_bbox[3, 1] = xyxy[0], xyxy[3] + return new_bbox + + +def cal_iou(bbox1, bbox2): + bbox1_poly = Polygon(bbox1).convex_hull + bbox2_poly = Polygon(bbox2).convex_hull + union_poly = np.concatenate((bbox1, bbox2)) + + if not bbox1_poly.intersects(bbox2_poly): + iou = 0 + else: + inter_area = bbox1_poly.intersection(bbox2_poly).area + union_area = MultiPoint(union_poly).convex_hull.area + if union_area == 0: + iou = 0 + else: + iou = float(inter_area) / union_area + return iou + + +def cal_distance(p1, p2): + delta_x = p1[0] - p2[0] + delta_y = p1[1] - p2[1] + d = math.sqrt((delta_x**2) + (delta_y**2)) + return d + + +def is_inside(center_point, corner_point): + """ + Find if center_point inside the bbox(corner_point) or not. + :param center_point: center point (x, y) + :param corner_point: corner point ((x1,y1),(x2,y2)) + :return: + """ + x_flag = False + y_flag = False + if (center_point[0] >= corner_point[0][0]) and ( + center_point[0] <= corner_point[1][0]): + x_flag = True + if (center_point[1] >= corner_point[0][1]) and ( + center_point[1] <= corner_point[1][1]): + y_flag = True + if x_flag and y_flag: + return True + else: + return False + + +def find_no_match(match_list, all_end2end_nums, type='end2end'): + """ + Find out no match end2end bbox in previous match list. + :param match_list: matching pairs. + :param all_end2end_nums: numbers of end2end_xywh + :param type: 'end2end' corresponding to idx 0, 'master' corresponding to idx 1. + :return: no match pse bbox index list + """ + if type == 'end2end': + idx = 0 + elif type == 'master': + idx = 1 + else: + raise ValueError + + no_match_indexs = [] + # m[0] is end2end index m[1] is master index + matched_bbox_indexs = [m[idx] for m in match_list] + for n in range(all_end2end_nums): + if n not in matched_bbox_indexs: + no_match_indexs.append(n) + return no_match_indexs + + +def is_abs_lower_than_threshold(this_bbox, target_bbox, threshold=3): + # only consider y axis, for grouping in row. + delta = abs(this_bbox[1] - target_bbox[1]) + if delta < threshold: + return True + else: + return False + + +def sort_line_bbox(g, bg): + """ + Sorted the bbox in the same line(group) + compare coord 'x' value, where 'y' value is closed in the same group. + :param g: index in the same group + :param bg: bbox in the same group + :return: + """ + + xs = [bg_item[0] for bg_item in bg] + xs_sorted = sorted(xs) + + g_sorted = [None] * len(xs_sorted) + bg_sorted = [None] * len(xs_sorted) + for g_item, bg_item in zip(g, bg): + idx = xs_sorted.index(bg_item[0]) + bg_sorted[idx] = bg_item + g_sorted[idx] = g_item + + return g_sorted, bg_sorted + + +def flatten(sorted_groups, sorted_bbox_groups): + idxs = [] + bboxes = [] + for group, bbox_group in zip(sorted_groups, sorted_bbox_groups): + for g, bg in zip(group, bbox_group): + idxs.append(g) + bboxes.append(bg) + return idxs, bboxes + + +def sort_bbox(end2end_xywh_bboxes, no_match_end2end_indexes): + """ + This function will group the render end2end bboxes in row. 
+ :param end2end_xywh_bboxes: + :param no_match_end2end_indexes: + :return: + """ + groups = [] + bbox_groups = [] + for index, end2end_xywh_bbox in zip(no_match_end2end_indexes, + end2end_xywh_bboxes): + this_bbox = end2end_xywh_bbox + if len(groups) == 0: + groups.append([index]) + bbox_groups.append([this_bbox]) + else: + flag = False + for g, bg in zip(groups, bbox_groups): + # this_bbox is belong to bg's row or not + if is_abs_lower_than_threshold(this_bbox, bg[0]): + g.append(index) + bg.append(this_bbox) + flag = True + break + if not flag: + # this_bbox is not belong to bg's row, create a row. + groups.append([index]) + bbox_groups.append([this_bbox]) + + # sorted bboxes in a group + tmp_groups, tmp_bbox_groups = [], [] + for g, bg in zip(groups, bbox_groups): + g_sorted, bg_sorted = sort_line_bbox(g, bg) + tmp_groups.append(g_sorted) + tmp_bbox_groups.append(bg_sorted) + + # sorted groups, sort by coord y's value. + sorted_groups = [None] * len(tmp_groups) + sorted_bbox_groups = [None] * len(tmp_bbox_groups) + ys = [bg[0][1] for bg in tmp_bbox_groups] + sorted_ys = sorted(ys) + for g, bg in zip(tmp_groups, tmp_bbox_groups): + idx = sorted_ys.index(bg[0][1]) + sorted_groups[idx] = g + sorted_bbox_groups[idx] = bg + + # flatten, get final result + end2end_sorted_idx_list, end2end_sorted_bbox_list \ + = flatten(sorted_groups, sorted_bbox_groups) + + return end2end_sorted_idx_list, end2end_sorted_bbox_list, sorted_groups, sorted_bbox_groups + + +def get_bboxes_list(end2end_result, structure_master_result): + """ + This function is use to convert end2end results and structure master results to + List of xyxy bbox format and List of xywh bbox format + :param end2end_result: bbox's format is xyxy + :param structure_master_result: bbox's format is xywh + :return: 4 kind list of bbox () + """ + # end2end + end2end_xyxy_list = [] + end2end_xywh_list = [] + for end2end_item in end2end_result: + src_bbox = end2end_item['bbox'] + end2end_xyxy_list.append(src_bbox) + xywh_bbox = xyxy2xywh(src_bbox) + end2end_xywh_list.append(xywh_bbox) + end2end_xyxy_bboxes = np.array(end2end_xyxy_list) + end2end_xywh_bboxes = np.array(end2end_xywh_list) + + # structure master + src_bboxes = structure_master_result['bbox'] + src_bboxes = remove_empty_bboxes(src_bboxes) + structure_master_xyxy_bboxes = src_bboxes + xywh_bbox = xyxy2xywh(src_bboxes) + structure_master_xywh_bboxes = xywh_bbox + + return end2end_xyxy_bboxes, end2end_xywh_bboxes, structure_master_xywh_bboxes, structure_master_xyxy_bboxes + + +def center_rule_match(end2end_xywh_bboxes, structure_master_xyxy_bboxes): + """ + Judge end2end Bbox's center point is inside structure master Bbox or not, + if end2end Bbox's center is in structure master Bbox, get matching pair. + :param end2end_xywh_bboxes: + :param structure_master_xyxy_bboxes: + :return: match pairs list, e.g. [[0,1], [1,2], ...] 
+ """ + match_pairs_list = [] + for i, end2end_xywh in enumerate(end2end_xywh_bboxes): + for j, master_xyxy in enumerate(structure_master_xyxy_bboxes): + x_end2end, y_end2end = end2end_xywh[0], end2end_xywh[1] + x_master1, y_master1, x_master2, y_master2 \ + = master_xyxy[0], master_xyxy[1], master_xyxy[2], master_xyxy[3] + center_point_end2end = (x_end2end, y_end2end) + corner_point_master = ((x_master1, y_master1), + (x_master2, y_master2)) + if is_inside(center_point_end2end, corner_point_master): + match_pairs_list.append([i, j]) + return match_pairs_list + + +def iou_rule_match(end2end_xyxy_bboxes, end2end_xyxy_indexes, + structure_master_xyxy_bboxes): + """ + Use iou to find matching list. + choose max iou value bbox as match pair. + :param end2end_xyxy_bboxes: + :param end2end_xyxy_indexes: original end2end indexes. + :param structure_master_xyxy_bboxes: + :return: match pairs list, e.g. [[0,1], [1,2], ...] + """ + match_pair_list = [] + for end2end_xyxy_index, end2end_xyxy in zip(end2end_xyxy_indexes, + end2end_xyxy_bboxes): + max_iou = 0 + max_match = [None, None] + for j, master_xyxy in enumerate(structure_master_xyxy_bboxes): + end2end_4xy = convert_coord(end2end_xyxy) + master_4xy = convert_coord(master_xyxy) + iou = cal_iou(end2end_4xy, master_4xy) + if iou > max_iou: + max_match[0], max_match[1] = end2end_xyxy_index, j + max_iou = iou + + if max_match[0] is None: + # no match + continue + match_pair_list.append(max_match) + return match_pair_list + + +def distance_rule_match(end2end_indexes, end2end_bboxes, master_indexes, + master_bboxes): + """ + Get matching between no-match end2end bboxes and no-match master bboxes. + Use min distance to match. + This rule will only run (no-match end2end nums > 0) and (no-match master nums > 0) + It will Return master_bboxes_nums match-pairs. + :param end2end_indexes: + :param end2end_bboxes: + :param master_indexes: + :param master_bboxes: + :return: match_pairs list, e.g. [[0,1], [1,2], ...] + """ + min_match_list = [] + for j, master_bbox in zip(master_indexes, master_bboxes): + min_distance = np.inf + min_match = [0, 0] # i, j + for i, end2end_bbox in zip(end2end_indexes, end2end_bboxes): + x_end2end, y_end2end = end2end_bbox[0], end2end_bbox[1] + x_master, y_master = master_bbox[0], master_bbox[1] + end2end_point = (x_end2end, y_end2end) + master_point = (x_master, y_master) + dist = cal_distance(master_point, end2end_point) + if dist < min_distance: + min_match[0], min_match[1] = i, j + min_distance = dist + min_match_list.append(min_match) + return min_match_list + + +def extra_match(no_match_end2end_indexes, master_bbox_nums): + """ + This function will create some virtual master bboxes, + and get match with the no match end2end indexes. + :param no_match_end2end_indexes: + :param master_bbox_nums: + :return: + """ + end_nums = len(no_match_end2end_indexes) + master_bbox_nums + extra_match_list = [] + for i in range(master_bbox_nums, end_nums): + end2end_index = no_match_end2end_indexes[i - master_bbox_nums] + extra_match_list.append([end2end_index, i]) + return extra_match_list + + +def get_match_dict(match_list): + """ + Convert match_list to a dict, where key is master bbox's index, value is end2end bbox index. 
+ :param match_list: + :return: + """ + match_dict = dict() + for match_pair in match_list: + end2end_index, master_index = match_pair[0], match_pair[1] + if master_index not in match_dict.keys(): + match_dict[master_index] = [end2end_index] + else: + match_dict[master_index].append(end2end_index) + return match_dict + + +def deal_successive_space(text): + """ + deal successive space character for text + 1. Replace ' '*3 with '' which is real space is text + 2. Remove ' ', which is split token, not true space + 3. Replace '' with ' ', to get real text + :param text: + :return: + """ + text = text.replace(' ' * 3, '') + text = text.replace(' ', '') + text = text.replace('', ' ') + return text + + +def reduce_repeat_bb(text_list, break_token): + """ + convert ['Local', 'government', 'unit'] to ['Local government unit'] + PS: maybe style Local is also exist, too. it can be processed like this. + :param text_list: + :param break_token: + :return: + """ + count = 0 + for text in text_list: + if text.startswith(''): + count += 1 + if count == len(text_list): + new_text_list = [] + for text in text_list: + text = text.replace('', '').replace('', '') + new_text_list.append(text) + return ['' + break_token.join(new_text_list) + ''] + else: + return text_list + + +def get_match_text_dict(match_dict, end2end_info, break_token=' '): + match_text_dict = dict() + for master_index, end2end_index_list in match_dict.items(): + text_list = [ + end2end_info[end2end_index]['text'] + for end2end_index in end2end_index_list + ] + text_list = reduce_repeat_bb(text_list, break_token) + text = break_token.join(text_list) + match_text_dict[master_index] = text + return match_text_dict + + +def merge_span_token(master_token_list): + """ + Merge the span style token (row span or col span). + :param master_token_list: + :return: + """ + new_master_token_list = [] + pointer = 0 + if master_token_list[-1] != '': + master_token_list.append('') + while master_token_list[pointer] != '': + try: + if master_token_list[pointer] == '' + '' + """ + tmp = ''.join(master_token_list[pointer:pointer + 3 + 1]) + pointer += 4 + new_master_token_list.append(tmp) + + elif master_token_list[pointer + 2].startswith( + ' colspan=') or master_token_list[ + pointer + 2].startswith(' rowspan='): + """ + example: + pattern + '' + '' + """ + tmp = ''.join(master_token_list[pointer:pointer + 4 + 1]) + pointer += 5 + new_master_token_list.append(tmp) + + else: + new_master_token_list.append(master_token_list[pointer]) + pointer += 1 + else: + new_master_token_list.append(master_token_list[pointer]) + pointer += 1 + except: + print("Break in merge...") + break + new_master_token_list.append('') + + return new_master_token_list + + +def deal_eb_token(master_token): + """ + post process with , , ... 
+ emptyBboxTokenDict = { + "[]": '', + "[' ']": '', + "['', ' ', '']": '', + "['\\u2028', '\\u2028']": '', + "['', ' ', '']": '', + "['', '']": '', + "['', ' ', '']": '', + "['', '', '', '']": '', + "['', '', ' ', '', '']": '', + "['', '']": '', + "['', ' ', '\\u2028', ' ', '\\u2028', ' ', '']": '', + } + :param master_token: + :return: + """ + master_token = master_token.replace('', '') + master_token = master_token.replace('', ' ') + master_token = master_token.replace('', ' ') + master_token = master_token.replace('', '\u2028\u2028') + master_token = master_token.replace('', ' ') + master_token = master_token.replace('', '') + master_token = master_token.replace('', ' ') + master_token = master_token.replace('', + '') + master_token = master_token.replace('', + ' ') + master_token = master_token.replace('', '') + master_token = master_token.replace('', + ' \u2028 \u2028 ') + return master_token + + +def insert_text_to_token(master_token_list, match_text_dict): + """ + Insert OCR text result to structure token. + :param master_token_list: + :param match_text_dict: + :return: + """ + master_token_list = merge_span_token(master_token_list) + merged_result_list = [] + text_count = 0 + for master_token in master_token_list: + if master_token.startswith(' len(match_text_dict) - 1: + text_count += 1 + continue + elif text_count not in match_text_dict.keys(): + text_count += 1 + continue + else: + master_token = master_token.replace( + '><', '>{}<'.format(match_text_dict[text_count])) + text_count += 1 + master_token = deal_eb_token(master_token) + merged_result_list.append(master_token) + + return ''.join(merged_result_list) + + +def deal_isolate_span(thead_part): + """ + Deal with isolate span cases in this function. + It causes by wrong prediction in structure recognition model. + eg. predict to rowspan="2">. + :param thead_part: + :return: + """ + # 1. find out isolate span tokens. + isolate_pattern = " rowspan=\"(\d)+\" colspan=\"(\d)+\">
|" \ + " colspan=\"(\d)+\" rowspan=\"(\d)+\">
|" \ + " rowspan=\"(\d)+\">
|" \ + " colspan=\"(\d)+\">" + isolate_iter = re.finditer(isolate_pattern, thead_part) + isolate_list = [i.group() for i in isolate_iter] + + # 2. find out span number, by step 1 results. + span_pattern = " rowspan=\"(\d)+\" colspan=\"(\d)+\"|" \ + " colspan=\"(\d)+\" rowspan=\"(\d)+\"|" \ + " rowspan=\"(\d)+\"|" \ + " colspan=\"(\d)+\"" + corrected_list = [] + for isolate_item in isolate_list: + span_part = re.search(span_pattern, isolate_item) + spanStr_in_isolateItem = span_part.group() + # 3. merge the span number into the span token format string. + if spanStr_in_isolateItem is not None: + corrected_item = ''.format(spanStr_in_isolateItem) + corrected_list.append(corrected_item) + else: + corrected_list.append(None) + + # 4. replace original isolated token. + for corrected_item, isolate_item in zip(corrected_list, isolate_list): + if corrected_item is not None: + thead_part = thead_part.replace(isolate_item, corrected_item) + else: + pass + return thead_part + + +def deal_duplicate_bb(thead_part): + """ + Deal duplicate or after replace. + Keep one in a token. + :param thead_part: + :return: + """ + # 1. find out in . + td_pattern = "(.+?)|" \ + "(.+?)|" \ + "(.+?)|" \ + "(.+?)|" \ + "(.*?)" + td_iter = re.finditer(td_pattern, thead_part) + td_list = [t.group() for t in td_iter] + + # 2. is multiply in or not? + new_td_list = [] + for td_item in td_list: + if td_item.count('') > 1 or td_item.count('') > 1: + # multiply in case. + # 1. remove all + td_item = td_item.replace('', '').replace('', '') + # 2. replace -> , -> . + td_item = td_item.replace('', '').replace('', + '') + new_td_list.append(td_item) + else: + new_td_list.append(td_item) + + # 3. replace original thead part. + for td_item, new_td_item in zip(td_list, new_td_list): + thead_part = thead_part.replace(td_item, new_td_item) + return thead_part + + +def deal_bb(result_token): + """ + In our opinion, always occurs in text's context. + This function will find out all tokens in and insert by manual. + :param result_token: + :return: + """ + # find out parts. + thead_pattern = '(.*?)' + if re.search(thead_pattern, result_token) is None: + return result_token + thead_part = re.search(thead_pattern, result_token).group() + origin_thead_part = copy.deepcopy(thead_part) + + # check "rowspan" or "colspan" occur in parts or not . + span_pattern = "|||" + span_iter = re.finditer(span_pattern, thead_part) + span_list = [s.group() for s in span_iter] + has_span_in_head = True if len(span_list) > 0 else False + + if not has_span_in_head: + # not include "rowspan" or "colspan" branch 1. + # 1. replace to , and to + # 2. it is possible to predict text include or by Text-line recognition, + # so we replace to , and to + thead_part = thead_part.replace('', '')\ + .replace('', '')\ + .replace('', '')\ + .replace('', '') + else: + # include "rowspan" or "colspan" branch 2. + # Firstly, we deal rowspan or colspan cases. + # 1. replace > to > + # 2. replace to + # 3. 
it is possible to predict text include or by Text-line recognition, + # so we replace to , and to + + # Secondly, deal ordinary cases like branch 1 + + # replace ">" to "" + replaced_span_list = [] + for sp in span_list: + replaced_span_list.append(sp.replace('>', '>')) + for sp, rsp in zip(span_list, replaced_span_list): + thead_part = thead_part.replace(sp, rsp) + + # replace "" to "" + thead_part = thead_part.replace('', '') + + # remove duplicated by re.sub + mb_pattern = "()+" + single_b_string = "" + thead_part = re.sub(mb_pattern, single_b_string, thead_part) + + mgb_pattern = "()+" + single_gb_string = "" + thead_part = re.sub(mgb_pattern, single_gb_string, thead_part) + + # ordinary cases like branch 1 + thead_part = thead_part.replace('', '').replace('', + '') + + # convert back to , empty cell has no . + # but space cell( ) is suitable for + thead_part = thead_part.replace('', '') + # deal with duplicated + thead_part = deal_duplicate_bb(thead_part) + # deal with isolate span tokens, which causes by wrong predict by structure prediction. + # eg.PMC5994107_011_00.png + thead_part = deal_isolate_span(thead_part) + # replace original result with new thead part. + result_token = result_token.replace(origin_thead_part, thead_part) + return result_token + + +class Matcher: + def __init__(self, end2end_file, structure_master_file): + """ + This class process the end2end results and structure recognition results. + :param end2end_file: end2end results predict by end2end inference. + :param structure_master_file: structure recognition results predict by structure master inference. + """ + self.end2end_file = end2end_file + self.structure_master_file = structure_master_file + self.end2end_results = pickle_load(end2end_file, prefix='end2end') + self.structure_master_results = pickle_load( + structure_master_file, prefix='structure') + + def match(self): + """ + Match process: + pre-process : convert end2end and structure master results to xyxy, xywh ndnarray format. + 1. Use pseBbox is inside masterBbox judge rule + 2. Use iou between pseBbox and masterBbox rule + 3. Use min distance of center point rule + :return: + """ + match_results = dict() + for idx, (file_name, + end2end_result) in enumerate(self.end2end_results.items()): + match_list = [] + if file_name not in self.structure_master_results: + continue + structure_master_result = self.structure_master_results[file_name] + end2end_xyxy_bboxes, end2end_xywh_bboxes, structure_master_xywh_bboxes, structure_master_xyxy_bboxes = \ + get_bboxes_list(end2end_result, structure_master_result) + + # rule 1: center rule + center_rule_match_list = \ + center_rule_match(end2end_xywh_bboxes, structure_master_xyxy_bboxes) + match_list.extend(center_rule_match_list) + + # rule 2: iou rule + # firstly, find not match index in previous step. + center_no_match_end2end_indexs = \ + find_no_match(match_list, len(end2end_xywh_bboxes), type='end2end') + if len(center_no_match_end2end_indexs) > 0: + center_no_match_end2end_xyxy = end2end_xyxy_bboxes[ + center_no_match_end2end_indexs] + # secondly, iou rule match + iou_rule_match_list = \ + iou_rule_match(center_no_match_end2end_xyxy, center_no_match_end2end_indexs, structure_master_xyxy_bboxes) + match_list.extend(iou_rule_match_list) + + # rule 3: distance rule + # match between no-match end2end bboxes and no-match master bboxes. + # it will return master_bboxes_nums match-pairs. + # firstly, find not match index in previous step. 
+ centerIou_no_match_end2end_indexs = \ + find_no_match(match_list, len(end2end_xywh_bboxes), type='end2end') + centerIou_no_match_master_indexs = \ + find_no_match(match_list, len(structure_master_xywh_bboxes), type='master') + if len(centerIou_no_match_master_indexs) > 0 and len( + centerIou_no_match_end2end_indexs) > 0: + centerIou_no_match_end2end_xywh = end2end_xywh_bboxes[ + centerIou_no_match_end2end_indexs] + centerIou_no_match_master_xywh = structure_master_xywh_bboxes[ + centerIou_no_match_master_indexs] + distance_match_list = distance_rule_match( + centerIou_no_match_end2end_indexs, + centerIou_no_match_end2end_xywh, + centerIou_no_match_master_indexs, + centerIou_no_match_master_xywh) + match_list.extend(distance_match_list) + + # TODO: + # The render no-match pseBbox, insert the last + # After step3 distance rule, a master bbox at least match one end2end bbox. + # But end2end bbox maybe overmuch, because numbers of master bbox will cut by max length. + # For these render end2end bboxes, we will make some virtual master bboxes, and get matching. + # The above extra insert bboxes will be further processed in "formatOutput" function. + # After this operation, it will increase TEDS score. + no_match_end2end_indexes = \ + find_no_match(match_list, len(end2end_xywh_bboxes), type='end2end') + if len(no_match_end2end_indexes) > 0: + no_match_end2end_xywh = end2end_xywh_bboxes[ + no_match_end2end_indexes] + # sort the render no-match end2end bbox in row + end2end_sorted_indexes_list, end2end_sorted_bboxes_list, sorted_groups, sorted_bboxes_groups = \ + sort_bbox(no_match_end2end_xywh, no_match_end2end_indexes) + # make virtual master bboxes, and get matching with the no-match end2end bboxes. + extra_match_list = extra_match( + end2end_sorted_indexes_list, + len(structure_master_xywh_bboxes)) + match_list_add_extra_match = copy.deepcopy(match_list) + match_list_add_extra_match.extend(extra_match_list) + else: + # no no-match end2end bboxes + match_list_add_extra_match = copy.deepcopy(match_list) + sorted_groups = [] + sorted_bboxes_groups = [] + + match_result_dict = { + 'match_list': match_list, + 'match_list_add_extra_match': match_list_add_extra_match, + 'sorted_groups': sorted_groups, + 'sorted_bboxes_groups': sorted_bboxes_groups + } + + # format output + match_result_dict = self._format(match_result_dict, file_name) + + match_results[file_name] = match_result_dict + + return match_results + + def _format(self, match_result, file_name): + """ + Extend the master token(insert virtual master token), and format matching result. + :param match_result: + :param file_name: + :return: + """ + end2end_info = self.end2end_results[file_name] + master_info = self.structure_master_results[file_name] + master_token = master_info['text'] + sorted_groups = match_result['sorted_groups'] + + # creat virtual master token + virtual_master_token_list = [] + for line_group in sorted_groups: + tmp_list = [''] + item_nums = len(line_group) + for _ in range(item_nums): + tmp_list.append('') + tmp_list.append('') + virtual_master_token_list.extend(tmp_list) + + # insert virtual master token + master_token_list = master_token.split(',') + if master_token_list[-1] == '': + # complete predict(no cut by max length) + # This situation insert virtual master token will drop TEDs score in val set. + # So we will not extend virtual token in this situation. 
+ + # fake extend virtual + master_token_list[:-1].extend(virtual_master_token_list) + + # real extend virtual + # master_token_list = master_token_list[:-1] + # master_token_list.extend(virtual_master_token_list) + # master_token_list.append('') + + elif master_token_list[-1] == '': + master_token_list.append('') + master_token_list.extend(virtual_master_token_list) + master_token_list.append('') + else: + master_token_list.extend(virtual_master_token_list) + master_token_list.append('') + + # format output + match_result.setdefault('matched_master_token_list', master_token_list) + return match_result + + def get_merge_result(self, match_results): + """ + Merge the OCR result into structure token to get final results. + :param match_results: + :return: + """ + merged_results = dict() + + # break_token is linefeed token, when one master bbox has multiply end2end bboxes. + break_token = ' ' + + for idx, (file_name, match_info) in enumerate(match_results.items()): + end2end_info = self.end2end_results[file_name] + master_token_list = match_info['matched_master_token_list'] + match_list = match_info['match_list_add_extra_match'] + + match_dict = get_match_dict(match_list) + match_text_dict = get_match_text_dict(match_dict, end2end_info, + break_token) + merged_result = insert_text_to_token(master_token_list, + match_text_dict) + merged_result = deal_bb(merged_result) + + merged_results[file_name] = merged_result + + return merged_results + + +class TableMasterMatcher(Matcher): + def __init__(self): + pass + + def __call__(self, structure_res, dt_boxes, rec_res, img_name=1): + end2end_results = {img_name: []} + for dt_box, res in zip(dt_boxes, rec_res): + d = dict( + bbox=np.array(dt_box), + text=res[0], ) + end2end_results[img_name].append(d) + + self.end2end_results = end2end_results + + structure_master_result_dict = {img_name: {}} + pred_structures, pred_bboxes = structure_res + pred_structures = ','.join(pred_structures[3:-3]) + structure_master_result_dict[img_name]['text'] = pred_structures + structure_master_result_dict[img_name]['bbox'] = pred_bboxes + self.structure_master_results = structure_master_result_dict + + # match + match_results = self.match() + merged_results = self.get_merge_result(match_results) + pred_html = merged_results[img_name] + pred_html = '' + pred_html + '
' + return pred_html diff --git a/ppstructure/table/table_metric/table_metric.py b/ppstructure/table/table_metric/table_metric.py index 9aca98ad785d4614a803fa5a277a6e4a27b3b078..923a9c0071d083de72a2a896d6f62037373d4e73 100755 --- a/ppstructure/table/table_metric/table_metric.py +++ b/ppstructure/table/table_metric/table_metric.py @@ -9,7 +9,7 @@ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # Apache 2.0 License for more details. -import distance +from rapidfuzz.distance import Levenshtein from apted import APTED, Config from apted.helpers import Tree from lxml import etree, html @@ -39,17 +39,6 @@ class TableTree(Tree): class CustomConfig(Config): - @staticmethod - def maximum(*sequences): - """Get maximum possible value - """ - return max(map(len, sequences)) - - def normalized_distance(self, *sequences): - """Get distance from 0 to 1 - """ - return float(distance.levenshtein(*sequences)) / self.maximum(*sequences) - def rename(self, node1, node2): """Compares attributes of trees""" #print(node1.tag) @@ -58,23 +47,12 @@ class CustomConfig(Config): if node1.tag == 'td': if node1.content or node2.content: #print(node1.content, ) - return self.normalized_distance(node1.content, node2.content) + return Levenshtein.normalized_distance(node1.content, node2.content) return 0. class CustomConfig_del_short(Config): - @staticmethod - def maximum(*sequences): - """Get maximum possible value - """ - return max(map(len, sequences)) - - def normalized_distance(self, *sequences): - """Get distance from 0 to 1 - """ - return float(distance.levenshtein(*sequences)) / self.maximum(*sequences) - def rename(self, node1, node2): """Compares attributes of trees""" if (node1.tag != node2.tag) or (node1.colspan != node2.colspan) or (node1.rowspan != node2.rowspan): @@ -90,21 +68,10 @@ class CustomConfig_del_short(Config): node1_content = ['####'] if len(node2_content) < 3: node2_content = ['####'] - return self.normalized_distance(node1_content, node2_content) + return Levenshtein.normalized_distance(node1_content, node2_content) return 0. class CustomConfig_del_block(Config): - @staticmethod - def maximum(*sequences): - """Get maximum possible value - """ - return max(map(len, sequences)) - - def normalized_distance(self, *sequences): - """Get distance from 0 to 1 - """ - return float(distance.levenshtein(*sequences)) / self.maximum(*sequences) - def rename(self, node1, node2): """Compares attributes of trees""" if (node1.tag != node2.tag) or (node1.colspan != node2.colspan) or (node1.rowspan != node2.rowspan): @@ -120,7 +87,7 @@ class CustomConfig_del_block(Config): while ' ' in node2_content: print(node2_content.index(' ')) node2_content.pop(node2_content.index(' ')) - return self.normalized_distance(node1_content, node2_content) + return Levenshtein.normalized_distance(node1_content, node2_content) return 0. 
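The three `CustomConfig` variants above now call `rapidfuzz.distance.Levenshtein.normalized_distance` directly, which folds the deleted `maximum()` / `normalized_distance()` helpers (previously built on `distance.levenshtein`) into a single library call. A minimal sketch of the equivalence, assuming `rapidfuzz>=2.0` as pulled in by the updated `requirements.txt`; the token lists are illustrative only:

```python
from rapidfuzz.distance import Levenshtein

# Token lists of the kind a TableTree node carries in its `content` field.
node1_content = ["Total", "revenue", "2021"]
node2_content = ["Total", "revenue", "2022"]

# Old helper: distance.levenshtein(a, b) / max(len(a), len(b))
# rapidfuzz normalises to the [0, 1] range in one call.
edit_dist = Levenshtein.distance(node1_content, node2_content)             # -> 1
norm_dist = Levenshtein.normalized_distance(node1_content, node2_content)  # -> 1/3
print(edit_dist, norm_dist)
```

Because `normalized_distance` already divides by the length of the longer sequence, no hand-rolled `maximum()` helper is needed, and the same call works for plain strings as well as token lists.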
class TEDS(object): diff --git a/ppstructure/utility.py b/ppstructure/utility.py index af0616239b167ff9ca5f6e1222015d51338d6bab..97b6d6fec0d70fe3014b0b2105dbbef6a292e4d7 100644 --- a/ppstructure/utility.py +++ b/ppstructure/utility.py @@ -27,33 +27,48 @@ def init_args(): parser.add_argument("--table_max_len", type=int, default=488) parser.add_argument("--table_algorithm", type=str, default='TableAttn') parser.add_argument("--table_model_dir", type=str) + parser.add_argument( + "--merge_no_span_structure", type=str2bool, default=True) parser.add_argument( "--table_char_dict_path", type=str, - default="../ppocr/utils/dict/table_structure_dict.txt") + default="../ppocr/utils/dict/table_structure_dict_ch.txt") # params for layout + parser.add_argument("--layout_model_dir", type=str) parser.add_argument( - "--layout_path_model", + "--layout_dict_path", type=str, - default="lp://PubLayNet/ppyolov2_r50vd_dcn_365e_publaynet/config") + default="../ppocr/utils/dict/layout_dict/layout_publaynet_dict.txt") + parser.add_argument( + "--layout_score_threshold", + type=float, + default=0.5, + help="Threshold of score.") parser.add_argument( - "--layout_label_map", - type=ast.literal_eval, - default=None, - help='label map according to ppstructure/layout/README_ch.md') - # params for vqa - parser.add_argument("--vqa_algorithm", type=str, default='LayoutXLM') + "--layout_nms_threshold", + type=float, + default=0.5, + help="Threshold of nms.") + # params for kie + parser.add_argument("--kie_algorithm", type=str, default='LayoutXLM') parser.add_argument("--ser_model_dir", type=str) parser.add_argument( "--ser_dict_path", type=str, default="../train_data/XFUND/class_list_xfun.txt") + # need to be None or tb-yx + parser.add_argument("--ocr_order_method", type=str, default=None) # params for inference parser.add_argument( "--mode", type=str, default='structure', - help='structure and vqa is supported') + help='structure and kie is supported') + parser.add_argument( + "--image_orientation", + type=bool, + default=False, + help='Whether to enable image orientation recognition') parser.add_argument( "--layout", type=str2bool, @@ -69,11 +84,18 @@ def init_args(): type=str2bool, default=True, help='In the forward, whether the non-table area is recognition by ocr') + # param for recovery parser.add_argument( "--recovery", - type=bool, + type=str2bool, default=False, help='Whether to enable layout of recovery') + parser.add_argument( + "--save_pdf", + type=str2bool, + default=False, + help='Whether to save pdf file') + return parser @@ -87,7 +109,7 @@ def draw_structure_result(image, result, font_path): image = Image.fromarray(image) boxes, txts, scores = [], [], [] for region in result: - if region['type'] == 'Table': + if region['type'] == 'table': pass else: for text_result in region['res']: diff --git a/ppstructure/vqa/README.md b/ppstructure/vqa/README.md deleted file mode 100644 index 28b794383bceccf655bdf00df5ee0c98841e2e95..0000000000000000000000000000000000000000 --- a/ppstructure/vqa/README.md +++ /dev/null @@ -1,285 +0,0 @@ -English | [简体中文](README_ch.md) - -- [1 Introduction](#1-introduction) -- [2. Performance](#2-performance) -- [3. Effect demo](#3-effect-demo) - - [3.1 SER](#31-ser) - - [3.2 RE](#32-re) -- [4. Install](#4-install) - - [4.1 Install dependencies](#41-install-dependencies) - - [5.3 RE](#53-re) -- [6. 
Reference Links](#6-reference-links) -- [License](#license) - -# Document Visual Question Answering - -## 1 Introduction - -VQA refers to visual question answering, which mainly asks and answers image content. DOC-VQA is one of the VQA tasks. DOC-VQA mainly asks questions about the text content of text images. - -The DOC-VQA algorithm in PP-Structure is developed based on the PaddleNLP natural language processing algorithm library. - -The main features are as follows: - -- Integrate [LayoutXLM](https://arxiv.org/pdf/2104.08836.pdf) model and PP-OCR prediction engine. -- Supports Semantic Entity Recognition (SER) and Relation Extraction (RE) tasks based on multimodal methods. Based on the SER task, the text recognition and classification in the image can be completed; based on the RE task, the relationship extraction of the text content in the image can be completed, such as judging the problem pair (pair). -- Supports custom training for SER tasks and RE tasks. -- Supports end-to-end system prediction and evaluation of OCR+SER. -- Supports end-to-end system prediction of OCR+SER+RE. - - -This project is an open source implementation of [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/pdf/2104.08836.pdf) on Paddle 2.2, -Included fine-tuning code on [XFUND dataset](https://github.com/doc-analysis/XFUND). - -## 2. Performance - -We evaluate the algorithm on the Chinese dataset of [XFUND](https://github.com/doc-analysis/XFUND), and the performance is as follows - -| Model | Task | hmean | Model download address | -|:---:|:---:|:---:| :---:| -| LayoutXLM | SER | 0.9038 | [link](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh.tar) | -| LayoutXLM | RE | 0.7483 | [link](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutXLM_xfun_zh.tar) | -| LayoutLMv2 | SER | 0.8544 | [link](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLMv2_xfun_zh.tar) -| LayoutLMv2 | RE | 0.6777 | [link](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutLMv2_xfun_zh.tar) | -| LayoutLM | SER | 0.7731 | [link](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLM_xfun_zh.tar) | - -## 3. Effect demo - -**Note:** The test images are from the XFUND dataset. - - -### 3.1 SER - -![](../docs/vqa/result_ser/zh_val_0_ser.jpg) | ![](../docs/vqa/result_ser/zh_val_42_ser.jpg) ----|--- - -Boxes with different colors in the figure represent different categories. For the XFUND dataset, there are 3 categories: `QUESTION`, `ANSWER`, `HEADER` - -* Dark purple: HEADER -* Light purple: QUESTION -* Army Green: ANSWER - -The corresponding categories and OCR recognition results are also marked on the upper left of the OCR detection frame. - - -### 3.2 RE - -![](../docs/vqa/result_re/zh_val_21_re.jpg) | ![](../docs/vqa/result_re/zh_val_40_re.jpg) ----|--- - - -The red box in the figure represents the question, the blue box represents the answer, and the question and the answer are connected by a green line. The corresponding categories and OCR recognition results are also marked on the upper left of the OCR detection frame. - -## 4. 
Install - -### 4.1 Install dependencies - -- **(1) Install PaddlePaddle** - -```bash -python3 -m pip install --upgrade pip - -# GPU installation -python3 -m pip install "paddlepaddle-gpu>=2.2" -i https://mirror.baidu.com/pypi/simple - -# CPU installation -python3 -m pip install "paddlepaddle>=2.2" -i https://mirror.baidu.com/pypi/simple - -```` -For more requirements, please refer to the instructions in [Installation Documentation](https://www.paddlepaddle.org.cn/install/quick). - -### 4.2 Install PaddleOCR - -- **(1) pip install PaddleOCR whl package quickly (prediction only)** - -```bash -python3 -m pip install paddleocr -```` - -- **(2) Download VQA source code (prediction + training)** - -```bash -[Recommended] git clone https://github.com/PaddlePaddle/PaddleOCR - -# If the pull cannot be successful due to network problems, you can also choose to use the hosting on the code cloud: -git clone https://gitee.com/paddlepaddle/PaddleOCR - -# Note: Code cloud hosting code may not be able to synchronize the update of this github project in real time, there is a delay of 3 to 5 days, please use the recommended method first. -```` - -- **(3) Install VQA's `requirements`** - -```bash -python3 -m pip install -r ppstructure/vqa/requirements.txt -```` - -## 5. Usage - -### 5.1 Data and Model Preparation - -If you want to experience the prediction process directly, you can download the pre-training model provided by us, skip the training process, and just predict directly. - -* Download the processed dataset - -The download address of the processed XFUND Chinese dataset: [link](https://paddleocr.bj.bcebos.com/ppstructure/dataset/XFUND.tar). - - -Download and unzip the dataset, and place the dataset in the current directory after unzipping. - -```shell -wget https://paddleocr.bj.bcebos.com/ppstructure/dataset/XFUND.tar -```` - -* Convert the dataset - -If you need to train other XFUND datasets, you can use the following commands to convert the datasets - -```bash -python3 ppstructure/vqa/tools/trans_xfun_data.py --ori_gt_path=path/to/json_path --output_path=path/to/save_path -```` - -* Download the pretrained models -```bash -mkdir pretrain && cd pretrain -#download the SER model -wget https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh.tar && tar -xvf ser_LayoutXLM_xfun_zh.tar -#download the RE model -wget https://paddleocr.bj.bcebos.com/pplayout/re_LayoutXLM_xfun_zh.tar && tar -xvf re_LayoutXLM_xfun_zh.tar -cd ../ -```` - - -### 5.2 SER - -Before starting training, you need to modify the following four fields - -1. `Train.dataset.data_dir`: point to the directory where the training set images are stored -2. `Train.dataset.label_file_list`: point to the training set label file -3. `Eval.dataset.data_dir`: refers to the directory where the validation set images are stored -4. `Eval.dataset.label_file_list`: point to the validation set label file - -* start training -```shell -CUDA_VISIBLE_DEVICES=0 python3 tools/train.py -c configs/vqa/ser/layoutxlm.yml -```` - -Finally, `precision`, `recall`, `hmean` and other indicators will be printed. -In the `./output/ser_layoutxlm/` folder will save the training log, the optimal model and the model for the latest epoch. - -* resume training - -To resume training, assign the folder path of the previously trained model to the `Architecture.Backbone.checkpoints` field. 
- -```shell -CUDA_VISIBLE_DEVICES=0 python3 tools/train.py -c configs/vqa/ser/layoutxlm.yml -o Architecture.Backbone.checkpoints=path/to/model_dir -```` - -* evaluate - -Evaluation requires assigning the folder path of the model to be evaluated to the `Architecture.Backbone.checkpoints` field. - -```shell -CUDA_VISIBLE_DEVICES=0 python3 tools/eval.py -c configs/vqa/ser/layoutxlm.yml -o Architecture.Backbone.checkpoints=path/to/model_dir -```` -Finally, `precision`, `recall`, `hmean` and other indicators will be printed - -* `OCR + SER` tandem prediction based on training engine - -Use the following command to complete the series prediction of `OCR engine + SER`, taking the SER model based on LayoutXLM as an example:: - -```shell -python3.7 tools/export_model.py -c configs/vqa/ser/layoutxlm.yml -o Architecture.Backbone.checkpoints=pretrain/ser_LayoutXLM_xfun_zh/ Global.save_inference_dir=output/ser/infer -```` - -Finally, the prediction result visualization image and the prediction result text file will be saved in the directory configured by the `config.Global.save_res_path` field. The prediction result text file is named `infer_results.txt`. - -* End-to-end evaluation of `OCR + SER` prediction system - -First use the `tools/infer_vqa_token_ser.py` script to complete the prediction of the dataset, then use the following command to evaluate. - -```shell -export CUDA_VISIBLE_DEVICES=0 -python3 tools/eval_with_label_end2end.py --gt_json_path XFUND/zh_val/xfun_normalize_val.json --pred_json_path output_res/infer_results.txt -```` -* export model - -Use the following command to complete the model export of the SER model, taking the SER model based on LayoutXLM as an example: - -```shell -python3.7 tools/export_model.py -c configs/vqa/ser/layoutxlm.yml -o Architecture.Backbone.checkpoints=pretrain/ser_LayoutXLM_xfun_zh/ Global.save_inference_dir=output/ser/infer -``` -The converted model will be stored in the directory specified by the `Global.save_inference_dir` field. - -* `OCR + SER` tandem prediction based on prediction engine - -Use the following command to complete the tandem prediction of `OCR + SER` based on the prediction engine, taking the SER model based on LayoutXLM as an example: - -```shell -cd ppstructure -CUDA_VISIBLE_DEVICES=0 python3.7 vqa/predict_vqa_token_ser.py --vqa_algorithm=LayoutXLM --ser_model_dir=../output/ser/infer --ser_dict_path=../train_data/XFUND/class_list_xfun.txt --vis_font_path=../doc/fonts/simfang.ttf --image_dir=docs/vqa/input/zh_val_42.jpg --output=output -``` -After the prediction is successful, the visualization images and results will be saved in the directory specified by the `output` field - - -### 5.3 RE - -* start training - -Before starting training, you need to modify the following four fields - -1. `Train.dataset.data_dir`: point to the directory where the training set images are stored -2. `Train.dataset.label_file_list`: point to the training set label file -3. `Eval.dataset.data_dir`: refers to the directory where the validation set images are stored -4. `Eval.dataset.label_file_list`: point to the validation set label file - -```shell -CUDA_VISIBLE_DEVICES=0 python3 tools/train.py -c configs/vqa/re/layoutxlm.yml -```` - -Finally, `precision`, `recall`, `hmean` and other indicators will be printed. -In the `./output/re_layoutxlm/` folder will save the training log, the optimal model and the model for the latest epoch. 
- -* resume training - -To resume training, assign the folder path of the previously trained model to the `Architecture.Backbone.checkpoints` field. - -```shell -CUDA_VISIBLE_DEVICES=0 python3 tools/train.py -c configs/vqa/re/layoutxlm.yml -o Architecture.Backbone.checkpoints=path/to/model_dir -```` - -* evaluate - -Evaluation requires assigning the folder path of the model to be evaluated to the `Architecture.Backbone.checkpoints` field. - -```shell -CUDA_VISIBLE_DEVICES=0 python3 tools/eval.py -c configs/vqa/re/layoutxlm.yml -o Architecture.Backbone.checkpoints=path/to/model_dir -```` -Finally, `precision`, `recall`, `hmean` and other indicators will be printed - -* Use `OCR engine + SER + RE` tandem prediction - -Use the following command to complete the series prediction of `OCR engine + SER + RE`, taking the pretrained SER and RE models as an example: -```shell -export CUDA_VISIBLE_DEVICES=0 -python3 tools/infer_vqa_token_ser_re.py -c configs/vqa/re/layoutxlm.yml -o Architecture.Backbone.checkpoints=pretrain/re_LayoutXLM_xfun_zh/Global.infer_img=ppstructure/docs/vqa/input/zh_val_21.jpg -c_ser configs/vqa/ser/layoutxlm. yml -o_ser Architecture.Backbone.checkpoints=pretrain/ser_LayoutXLM_xfun_zh/ -```` - -Finally, the prediction result visualization image and the prediction result text file will be saved in the directory configured by the `config.Global.save_res_path` field. The prediction result text file is named `infer_results.txt`. - -* export model - -cooming soon - -* `OCR + SER + RE` tandem prediction based on prediction engine - -cooming soon - -## 6. Reference Links - -- LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding, https://arxiv.org/pdf/2104.08836.pdf -- microsoft/unilm/layoutxlm, https://github.com/microsoft/unilm/tree/master/layoutxlm -- XFUND dataset, https://github.com/doc-analysis/XFUND - -## License - -The content of this project itself is licensed under the [Attribution-NonCommercial-ShareAlike 4.0 International (CC BY-NC-SA 4.0)](https://creativecommons.org/licenses/by-nc-sa/4.0/) diff --git a/ppstructure/vqa/README_ch.md b/ppstructure/vqa/README_ch.md deleted file mode 100644 index f168110ed9b2e750b3b2ee6f5ab0116daebc3e77..0000000000000000000000000000000000000000 --- a/ppstructure/vqa/README_ch.md +++ /dev/null @@ -1,283 +0,0 @@ -[English](README.md) | 简体中文 - -- [1. 简介](#1-简介) -- [2. 性能](#2-性能) -- [3. 效果演示](#3-效果演示) - - [3.1 SER](#31-ser) - - [3.2 RE](#32-re) -- [4. 安装](#4-安装) - - [4.1 安装依赖](#41-安装依赖) - - [4.2 安装PaddleOCR(包含 PP-OCR 和 VQA)](#42-安装paddleocr包含-pp-ocr-和-vqa) -- [5. 使用](#5-使用) - - [5.1 数据和预训练模型准备](#51-数据和预训练模型准备) - - [5.2 SER](#52-ser) - - [5.3 RE](#53-re) -- [6. 参考链接](#6-参考链接) -- [License](#license) - -# 文档视觉问答(DOC-VQA) - -## 1. 简介 - -VQA指视觉问答,主要针对图像内容进行提问和回答,DOC-VQA是VQA任务中的一种,DOC-VQA主要针对文本图像的文字内容提出问题。 - -PP-Structure 里的 DOC-VQA算法基于PaddleNLP自然语言处理算法库进行开发。 - -主要特性如下: - -- 集成[LayoutXLM](https://arxiv.org/pdf/2104.08836.pdf)模型以及PP-OCR预测引擎。 -- 支持基于多模态方法的语义实体识别 (Semantic Entity Recognition, SER) 以及关系抽取 (Relation Extraction, RE) 任务。基于 SER 任务,可以完成对图像中的文本识别与分类;基于 RE 任务,可以完成对图象中的文本内容的关系提取,如判断问题对(pair)。 -- 支持SER任务和RE任务的自定义训练。 -- 支持OCR+SER的端到端系统预测与评估。 -- 支持OCR+SER+RE的端到端系统预测。 - -本项目是 [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/pdf/2104.08836.pdf) 在 Paddle 2.2上的开源实现, -包含了在 [XFUND数据集](https://github.com/doc-analysis/XFUND) 上的微调代码。 - -## 2. 
性能 - -我们在 [XFUND](https://github.com/doc-analysis/XFUND) 的中文数据集上对算法进行了评估,性能如下 - -| 模型 | 任务 | hmean | 模型下载地址 | -|:---:|:---:|:---:| :---:| -| LayoutXLM | SER | 0.9038 | [链接](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh.tar) | -| LayoutXLM | RE | 0.7483 | [链接](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutXLM_xfun_zh.tar) | -| LayoutLMv2 | SER | 0.8544 | [链接](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLMv2_xfun_zh.tar) -| LayoutLMv2 | RE | 0.6777 | [链接](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutLMv2_xfun_zh.tar) | -| LayoutLM | SER | 0.7731 | [链接](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLM_xfun_zh.tar) | - -## 3. 效果演示 - -**注意:** 测试图片来源于XFUND数据集。 - -### 3.1 SER - -![](../docs/vqa/result_ser/zh_val_0_ser.jpg) | ![](../docs/vqa/result_ser/zh_val_42_ser.jpg) ----|--- - -图中不同颜色的框表示不同的类别,对于XFUND数据集,有`QUESTION`, `ANSWER`, `HEADER` 3种类别 - -* 深紫色:HEADER -* 浅紫色:QUESTION -* 军绿色:ANSWER - -在OCR检测框的左上方也标出了对应的类别和OCR识别结果。 - -### 3.2 RE - -![](../docs/vqa/result_re/zh_val_21_re.jpg) | ![](../docs/vqa/result_re/zh_val_40_re.jpg) ----|--- - - -图中红色框表示问题,蓝色框表示答案,问题和答案之间使用绿色线连接。在OCR检测框的左上方也标出了对应的类别和OCR识别结果。 - -## 4. 安装 - -### 4.1 安装依赖 - -- **(1) 安装PaddlePaddle** - -```bash -python3 -m pip install --upgrade pip - -# GPU安装 -python3 -m pip install "paddlepaddle-gpu>=2.2" -i https://mirror.baidu.com/pypi/simple - -# CPU安装 -python3 -m pip install "paddlepaddle>=2.2" -i https://mirror.baidu.com/pypi/simple - -``` -更多需求,请参照[安装文档](https://www.paddlepaddle.org.cn/install/quick)中的说明进行操作。 - -### 4.2 安装PaddleOCR(包含 PP-OCR 和 VQA) - -- **(1)pip快速安装PaddleOCR whl包(仅预测)** - -```bash -python3 -m pip install paddleocr -``` - -- **(2)下载VQA源码(预测+训练)** - -```bash -【推荐】git clone https://github.com/PaddlePaddle/PaddleOCR - -# 如果因为网络问题无法pull成功,也可选择使用码云上的托管: -git clone https://gitee.com/paddlepaddle/PaddleOCR - -# 注:码云托管代码可能无法实时同步本github项目更新,存在3~5天延时,请优先使用推荐方式。 -``` - -- **(3)安装VQA的`requirements`** - -```bash -python3 -m pip install -r ppstructure/vqa/requirements.txt -``` - -## 5. 使用 - -### 5.1 数据和预训练模型准备 - -如果希望直接体验预测过程,可以下载我们提供的预训练模型,跳过训练过程,直接预测即可。 - -* 下载处理好的数据集 - -处理好的XFUND中文数据集下载地址:[链接](https://paddleocr.bj.bcebos.com/ppstructure/dataset/XFUND.tar)。 - - -下载并解压该数据集,解压后将数据集放置在当前目录下。 - -```shell -wget https://paddleocr.bj.bcebos.com/ppstructure/dataset/XFUND.tar -``` - -* 转换数据集 - -若需进行其他XFUND数据集的训练,可使用下面的命令进行数据集的转换 - -```bash -python3 ppstructure/vqa/tools/trans_xfun_data.py --ori_gt_path=path/to/json_path --output_path=path/to/save_path -``` - -* 下载预训练模型 -```bash -mkdir pretrain && cd pretrain -#下载SER模型 -wget https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh.tar && tar -xvf ser_LayoutXLM_xfun_zh.tar -#下载RE模型 -wget https://paddleocr.bj.bcebos.com/pplayout/re_LayoutXLM_xfun_zh.tar && tar -xvf re_LayoutXLM_xfun_zh.tar -cd ../ -``` - -### 5.2 SER - -启动训练之前,需要修改下面的四个字段 - -1. `Train.dataset.data_dir`:指向训练集图片存放目录 -2. `Train.dataset.label_file_list`:指向训练集标注文件 -3. `Eval.dataset.data_dir`:指指向验证集图片存放目录 -4. 
`Eval.dataset.label_file_list`:指向验证集标注文件 - -* 启动训练 -```shell -CUDA_VISIBLE_DEVICES=0 python3 tools/train.py -c configs/vqa/ser/layoutxlm.yml -``` - -最终会打印出`precision`, `recall`, `hmean`等指标。 -在`./output/ser_layoutxlm/`文件夹中会保存训练日志,最优的模型和最新epoch的模型。 - -* 恢复训练 - -恢复训练需要将之前训练好的模型所在文件夹路径赋值给 `Architecture.Backbone.checkpoints` 字段。 - -```shell -CUDA_VISIBLE_DEVICES=0 python3 tools/train.py -c configs/vqa/ser/layoutxlm.yml -o Architecture.Backbone.checkpoints=path/to/model_dir -``` - -* 评估 - -评估需要将待评估的模型所在文件夹路径赋值给 `Architecture.Backbone.checkpoints` 字段。 - -```shell -CUDA_VISIBLE_DEVICES=0 python3 tools/eval.py -c configs/vqa/ser/layoutxlm.yml -o Architecture.Backbone.checkpoints=path/to/model_dir -``` -最终会打印出`precision`, `recall`, `hmean`等指标 - -* 基于训练引擎的`OCR + SER`串联预测 - -使用如下命令即可完成基于训练引擎的`OCR + SER`的串联预测, 以基于LayoutXLM的SER模型为例: -```shell -CUDA_VISIBLE_DEVICES=0 python3 tools/infer_vqa_token_ser.py -c configs/vqa/ser/layoutxlm.yml -o Architecture.Backbone.checkpoints=pretrain/ser_LayoutXLM_xfun_zh/ Global.infer_img=doc/vqa/input/zh_val_42.jpg -``` - -最终会在`config.Global.save_res_path`字段所配置的目录下保存预测结果可视化图像以及预测结果文本文件,预测结果文本文件名为`infer_results.txt`。 - -* 对`OCR + SER`预测系统进行端到端评估 - -首先使用 `tools/infer_vqa_token_ser.py` 脚本完成数据集的预测,然后使用下面的命令进行评估。 - -```shell -export CUDA_VISIBLE_DEVICES=0 -python3 tools/eval_with_label_end2end.py --gt_json_path XFUND/zh_val/xfun_normalize_val.json --pred_json_path output_res/infer_results.txt -``` -* 模型导出 - -使用如下命令即可完成SER模型的模型导出, 以基于LayoutXLM的SER模型为例: - -```shell -python3.7 tools/export_model.py -c configs/vqa/ser/layoutxlm.yml -o Architecture.Backbone.checkpoints=pretrain/ser_LayoutXLM_xfun_zh/ Global.save_inference_dir=output/ser/infer -``` -转换后的模型会存放在`Global.save_inference_dir`字段指定的目录下。 - -* 基于预测引擎的`OCR + SER`串联预测 - -使用如下命令即可完成基于预测引擎的`OCR + SER`的串联预测, 以基于LayoutXLM的SER模型为例: - -```shell -cd ppstructure -CUDA_VISIBLE_DEVICES=0 python3.7 vqa/predict_vqa_token_ser.py --vqa_algorithm=LayoutXLM --ser_model_dir=../output/ser/infer --ser_dict_path=../train_data/XFUND/class_list_xfun.txt --vis_font_path=../doc/fonts/simfang.ttf --image_dir=docs/vqa/input/zh_val_42.jpg --output=output -``` -预测成功后,可视化图片和结果会保存在`output`字段指定的目录下 - -### 5.3 RE - -* 启动训练 - -启动训练之前,需要修改下面的四个字段 - -1. `Train.dataset.data_dir`:指向训练集图片存放目录 -2. `Train.dataset.label_file_list`:指向训练集标注文件 -3. `Eval.dataset.data_dir`:指指向验证集图片存放目录 -4. 
`Eval.dataset.label_file_list`:指向验证集标注文件 - -```shell -CUDA_VISIBLE_DEVICES=0 python3 tools/train.py -c configs/vqa/re/layoutxlm.yml -``` - -最终会打印出`precision`, `recall`, `hmean`等指标。 -在`./output/re_layoutxlm/`文件夹中会保存训练日志,最优的模型和最新epoch的模型。 - -* 恢复训练 - -恢复训练需要将之前训练好的模型所在文件夹路径赋值给 `Architecture.Backbone.checkpoints` 字段。 - -```shell -CUDA_VISIBLE_DEVICES=0 python3 tools/train.py -c configs/vqa/re/layoutxlm.yml -o Architecture.Backbone.checkpoints=path/to/model_dir -``` - -* 评估 - -评估需要将待评估的模型所在文件夹路径赋值给 `Architecture.Backbone.checkpoints` 字段。 - -```shell -CUDA_VISIBLE_DEVICES=0 python3 tools/eval.py -c configs/vqa/re/layoutxlm.yml -o Architecture.Backbone.checkpoints=path/to/model_dir -``` -最终会打印出`precision`, `recall`, `hmean`等指标 - -* 基于训练引擎的`OCR + SER + RE`串联预测 - -使用如下命令即可完成基于训练引擎的`OCR + SER + RE`串联预测, 以基于LayoutXLMSER和RE模型为例: -```shell -export CUDA_VISIBLE_DEVICES=0 -python3 tools/infer_vqa_token_ser_re.py -c configs/vqa/re/layoutxlm.yml -o Architecture.Backbone.checkpoints=pretrain/re_LayoutXLM_xfun_zh/ Global.infer_img=ppstructure/docs/vqa/input/zh_val_21.jpg -c_ser configs/vqa/ser/layoutxlm.yml -o_ser Architecture.Backbone.checkpoints=pretrain/ser_LayoutXLM_xfun_zh/ -``` - -最终会在`config.Global.save_res_path`字段所配置的目录下保存预测结果可视化图像以及预测结果文本文件,预测结果文本文件名为`infer_results.txt`。 - -* 模型导出 - -cooming soon - -* 基于预测引擎的`OCR + SER + RE`串联预测 - -cooming soon - -## 6. 参考链接 - -- LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding, https://arxiv.org/pdf/2104.08836.pdf -- microsoft/unilm/layoutxlm, https://github.com/microsoft/unilm/tree/master/layoutxlm -- XFUND dataset, https://github.com/doc-analysis/XFUND - -## License - -The content of this project itself is licensed under the [Attribution-NonCommercial-ShareAlike 4.0 International (CC BY-NC-SA 4.0)](https://creativecommons.org/licenses/by-nc-sa/4.0/) diff --git a/ppstructure/vqa/requirements.txt b/ppstructure/vqa/requirements.txt deleted file mode 100644 index fcd882274c4402ba2a1d34f20ee6e2befa157121..0000000000000000000000000000000000000000 --- a/ppstructure/vqa/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -sentencepiece -yacs -seqeval -paddlenlp>=2.2.1 -pypandoc -attrdict -python_docx \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index b15176db3eb42c381c1612f404fd15c6b020b3dc..43cd8c1b082768ebad44a5cf58fc31980ebfe891 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,15 +1,17 @@ shapely scikit-image -imgaug==0.4.0 +imgaug pyclipper lmdb tqdm numpy visualdl -python-Levenshtein -opencv-contrib-python==4.4.0.46 +rapidfuzz +opencv-python +opencv-contrib-python cython lxml premailer openpyxl attrdict +Polygon3 diff --git a/test_tipc/benchmark_train.sh b/test_tipc/benchmark_train.sh index 1dcb0129e767e6c35adfad36aa5dce2fbd84a2fd..25fda8f97f0bfdefbd6922b13a0ffef3f40c3de9 100644 --- a/test_tipc/benchmark_train.sh +++ b/test_tipc/benchmark_train.sh @@ -1,12 +1,6 @@ #!/bin/bash source test_tipc/common_func.sh -# set env -python=python -export str_tmp=$(echo `pip list|grep paddlepaddle-gpu|awk -F ' ' '{print $2}'`) -export frame_version=${str_tmp%%.post*} -export frame_commit=$(echo `${python} -c "import paddle;print(paddle.version.commit)"`) - # run benchmark sh # Usage: # bash run_benchmark_train.sh config.txt params @@ -86,6 +80,13 @@ dataline=`cat $FILENAME` IFS=$'\n' lines=(${dataline}) model_name=$(func_parser_value "${lines[1]}") +python_name=$(func_parser_value "${lines[2]}") + +# set env +python=${python_name} +export str_tmp=$(echo `pip list|grep paddlepaddle-gpu|awk -F ' ' '{print 
$2}'`) +export frame_version=${str_tmp%%.post*} +export frame_commit=$(echo `${python} -c "import paddle;print(paddle.version.commit)"`) # 获取benchmark_params所在的行数 line_num=`grep -n "train_benchmark_params" $FILENAME | cut -d ":" -f 1` diff --git a/test_tipc/common_func.sh b/test_tipc/common_func.sh index f7d8a1e04adee9d32332eda8cb5913bbaf168481..1bbf829165323b76341461b297b71102462d83af 100644 --- a/test_tipc/common_func.sh +++ b/test_tipc/common_func.sh @@ -58,10 +58,11 @@ function status_check(){ run_command=$2 run_log=$3 model_name=$4 + log_path=$5 if [ $last_status -eq 0 ]; then - echo -e "\033[33m Run successfully with command - ${model_name} - ${run_command}! \033[0m" | tee -a ${run_log} + echo -e "\033[33m Run successfully with command - ${model_name} - ${run_command} - ${log_path} \033[0m" | tee -a ${run_log} else - echo -e "\033[33m Run failed with command - ${model_name} - ${run_command}! \033[0m" | tee -a ${run_log} + echo -e "\033[33m Run failed with command - ${model_name} - ${run_command} - ${log_path} \033[0m" | tee -a ${run_log} fi } diff --git a/test_tipc/configs/ch_PP-OCRv2_rec/train_infer_python.txt b/test_tipc/configs/ch_PP-OCRv2_rec/train_infer_python.txt index a96b87dede1e1b4c7b3ed59c4bd9c0470402e7e2..6d20b2df7420371ce964cf8fd5cb29726c000d1d 100644 --- a/test_tipc/configs/ch_PP-OCRv2_rec/train_infer_python.txt +++ b/test_tipc/configs/ch_PP-OCRv2_rec/train_infer_python.txt @@ -41,7 +41,7 @@ inference:tools/infer/predict_rec.py --use_gpu:True|False --enable_mkldnn:False --cpu_threads:6 ---rec_batch_num:1|6 +--rec_batch_num:1 --use_tensorrt:False --precision:fp32 --rec_model_dir: diff --git a/test_tipc/configs/ch_PP-OCRv3_rec/train_infer_python.txt b/test_tipc/configs/ch_PP-OCRv3_rec/train_infer_python.txt index 59fc1bd4160ec77edb0b781c8ffa9845c6a3d5c7..fee08b08ede0f61ae4f57fd42dba303301798a3e 100644 --- a/test_tipc/configs/ch_PP-OCRv3_rec/train_infer_python.txt +++ b/test_tipc/configs/ch_PP-OCRv3_rec/train_infer_python.txt @@ -41,7 +41,7 @@ inference:tools/infer/predict_rec.py --rec_image_shape="3,48,320" --use_gpu:True|False --enable_mkldnn:False --cpu_threads:6 ---rec_batch_num:1|6 +--rec_batch_num:1 --use_tensorrt:False --precision:fp32 --rec_model_dir: diff --git a/test_tipc/configs/ch_ppocr_mobile_v2_0_det/train_infer_python.txt b/test_tipc/configs/ch_ppocr_mobile_v2_0_det/train_infer_python.txt index f3aa9d0f8218a24b11e3d0d079ae79a07d3e5874..4112e6498c6316e211ad69a69bdb531ec7a105b2 100644 --- a/test_tipc/configs/ch_ppocr_mobile_v2_0_det/train_infer_python.txt +++ b/test_tipc/configs/ch_ppocr_mobile_v2_0_det/train_infer_python.txt @@ -13,7 +13,7 @@ train_infer_img_dir:./train_data/icdar2015/text_localization/ch4_test_images/ null:null ## trainer:norm_train -norm_train:tools/train.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml -o Global.pretrained_model=./pretrain_models/MobileNetV3_large_x0_5_pretrained Global.print_batch_step=1 Train.loader.shuffle=false +norm_train:tools/train.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml -o Global.pretrained_model=./pretrain_models/MobileNetV3_large_x0_5_pretrained Global.print_batch_step=2 Train.loader.shuffle=false pact_train:null fpgm_train:null distill_train:null diff --git a/test_tipc/configs/ch_ppocr_mobile_v2_0_rec/train_infer_python.txt b/test_tipc/configs/ch_ppocr_mobile_v2_0_rec/train_infer_python.txt index 40f397948936beba0a3a4bdce9aa4a9953ec9d0f..dc490cdc60c2c012549e6fd00c13ec18676ede20 100644 --- a/test_tipc/configs/ch_ppocr_mobile_v2_0_rec/train_infer_python.txt +++ 
b/test_tipc/configs/ch_ppocr_mobile_v2_0_rec/train_infer_python.txt @@ -41,7 +41,7 @@ inference:tools/infer/predict_rec.py --use_gpu:True|False --enable_mkldnn:False --cpu_threads:6 ---rec_batch_num:1|6 +--rec_batch_num:1 --use_tensorrt:False --precision:fp32 --rec_model_dir: diff --git a/test_tipc/configs/ch_ppocr_server_v2_0_rec/train_infer_python.txt b/test_tipc/configs/ch_ppocr_server_v2_0_rec/train_infer_python.txt index b9a1ae4984c30a08d75b73b884ceb97658eb11c7..85741f98c3fd645a64d8820a046030f1bb7e03c7 100644 --- a/test_tipc/configs/ch_ppocr_server_v2_0_rec/train_infer_python.txt +++ b/test_tipc/configs/ch_ppocr_server_v2_0_rec/train_infer_python.txt @@ -41,7 +41,7 @@ inference:tools/infer/predict_rec.py --use_gpu:True|False --enable_mkldnn:False --cpu_threads:6 ---rec_batch_num:1|6 +--rec_batch_num:1 --use_tensorrt:False --precision:fp32 --rec_model_dir: diff --git a/test_tipc/configs/det_r18_vd_db_v2_0/train_infer_python.txt b/test_tipc/configs/det_r18_ct/train_infer_python.txt similarity index 72% rename from test_tipc/configs/det_r18_vd_db_v2_0/train_infer_python.txt rename to test_tipc/configs/det_r18_ct/train_infer_python.txt index df88c0e5434511fb48deac699e8f67fc535765d3..5933fdbeed762a73324fbfb5a4113a390926e7ea 100644 --- a/test_tipc/configs/det_r18_vd_db_v2_0/train_infer_python.txt +++ b/test_tipc/configs/det_r18_ct/train_infer_python.txt @@ -1,5 +1,5 @@ ===========================train_params=========================== -model_name:det_r18_db_v2_0 +model_name:det_r18_ct python:python3.7 gpu_list:0|0,1 Global.use_gpu:True|True @@ -9,11 +9,11 @@ Global.save_model_dir:./output/ Train.loader.batch_size_per_card:lite_train_lite_infer=2|whole_train_lite_infer=4 Global.pretrained_model:null train_model_name:latest -train_infer_img_dir:./train_data/icdar2015/text_localization/ch4_test_images/ +train_infer_img_dir:./train_data/total_text/test/rgb/ null:null ## trainer:norm_train -norm_train:tools/train.py -c configs/det/det_res18_db_v2.0.yml -o +norm_train:tools/train.py -c configs/det/det_r18_vd_ct.yml -o Global.print_batch_step=1 Train.loader.shuffle=false quant_export:null fpgm_export:null distill_train:null @@ -21,21 +21,21 @@ null:null null:null ## ===========================eval_params=========================== -eval:null +eval:tools/eval.py -c configs/det/det_r18_vd_ct.yml -o null:null ## ===========================infer_params=========================== Global.save_inference_dir:./output/ Global.checkpoints: -norm_export:null +norm_export:tools/export_model.py -c configs/det/det_r18_vd_ct.yml -o quant_export:null fpgm_export:null distill_export:null export1:null export2:null ## -train_model:null -infer_export:null +train_model:./inference/det_r18_vd_ct/best_accuracy +infer_export:tools/export_model.py -c configs/det/det_r18_vd_ct.yml -o infer_quant:False inference:tools/infer/predict_det.py --use_gpu:True|False @@ -50,9 +50,4 @@ inference:tools/infer/predict_det.py --benchmark:True null:null ===========================infer_benchmark_params========================== -random_infer_input:[{float32,[3,640,640]}];[{float32,[3,960,960]}] -===========================train_benchmark_params========================== -batch_size:8|16 -fp_items:fp32|fp16 -epoch:15 ---profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile +random_infer_input:[{float32,[3,640,640]}];[{float32,[3,960,960]}] \ No newline at end of file diff --git a/test_tipc/configs/det_r50_dcn_fce_ctw_v2_0/det_r50_vd_dcn_fce_ctw.yml 
b/test_tipc/configs/det_r50_dcn_fce_ctw_v2_0/det_r50_vd_dcn_fce_ctw.yml index 3a513b8f38cd5abf800c86f8fbeda789cb3d056a..29f6f32a58739e181d0c0f54d62021e3754a324a 100644 --- a/test_tipc/configs/det_r50_dcn_fce_ctw_v2_0/det_r50_vd_dcn_fce_ctw.yml +++ b/test_tipc/configs/det_r50_dcn_fce_ctw_v2_0/det_r50_vd_dcn_fce_ctw.yml @@ -8,7 +8,7 @@ Global: # evaluation is run every 835 iterations eval_batch_step: [0, 4000] cal_metric_during_train: False - pretrained_model: ./pretrain_models/ResNet50_vd_ssld_pretrained + pretrained_model: pretrain_models/det_r50_dcn_fce_ctw_v2.0_train/best_accuracy.pdparams checkpoints: save_inference_dir: use_visualdl: False diff --git a/test_tipc/configs/en_table_structure/model_linux_gpu_normal_normal_infer_cpp_linux_gpu_cpu.txt b/test_tipc/configs/en_table_structure/model_linux_gpu_normal_normal_infer_cpp_linux_gpu_cpu.txt new file mode 100644 index 0000000000000000000000000000000000000000..ad002a334e3b351b0fa2aa641906f4aa753071c9 --- /dev/null +++ b/test_tipc/configs/en_table_structure/model_linux_gpu_normal_normal_infer_cpp_linux_gpu_cpu.txt @@ -0,0 +1,20 @@ +===========================cpp_infer_params=========================== +model_name:en_table_structure +use_opencv:True +infer_model:./inference/en_ppocr_mobile_v2.0_table_structure_infer/ +infer_quant:False +inference:./deploy/cpp_infer/build/ppocr --rec_img_h=32 --det_model_dir=./inference/en_ppocr_mobile_v2.0_table_det_infer --rec_model_dir=./inference/en_ppocr_mobile_v2.0_table_rec_infer --rec_char_dict_path=./ppocr/utils/dict/table_dict.txt --table_char_dict_path=./ppocr/utils/dict/table_structure_dict.txt --limit_side_len=736 --limit_type=min --output=./output/table --merge_no_span_structure=False --type=structure --table=True +--use_gpu:True|False +--enable_mkldnn:False +--cpu_threads:6 +--rec_batch_num:6 +--use_tensorrt:False +--precision:fp32 +--table_model_dir: +--image_dir:./ppstructure/docs/table/table.jpg +null:null +--benchmark:True +--det:True +--rec:True +--cls:False +--use_angle_cls:False \ No newline at end of file diff --git a/test_tipc/configs/en_table_structure/table_mv3.yml b/test_tipc/configs/en_table_structure/table_mv3.yml index 5d8e84c95c477a639130a342c6c72345e97701da..edcbe2c3b00e8d8a56ad8dd9f208e283b511b86e 100755 --- a/test_tipc/configs/en_table_structure/table_mv3.yml +++ b/test_tipc/configs/en_table_structure/table_mv3.yml @@ -4,7 +4,7 @@ Global: log_smooth_window: 20 print_batch_step: 5 save_model_dir: ./output/table_mv3/ - save_epoch_step: 3 + save_epoch_step: 400 # evaluation is run every 400 iterations after the 0th iteration eval_batch_step: [0, 40000] cal_metric_during_train: True @@ -17,10 +17,9 @@ Global: # for data or label process character_dict_path: ppocr/utils/dict/table_structure_dict.txt character_type: en - max_text_length: 800 + max_text_length: &max_text_length 500 + box_format: &box_format 'xyxy' # 'xywh', 'xyxy', 'xyxyxyxy' infer_mode: False - process_total_num: 0 - process_cut_num: 0 Optimizer: name: Adam @@ -39,12 +38,14 @@ Architecture: Backbone: name: MobileNetV3 scale: 1.0 - model_name: large + model_name: small + disable_se: true Head: name: TableAttentionHead hidden_size: 256 loc_type: 2 - max_text_length: 800 + max_text_length: *max_text_length + loc_reg_num: &loc_reg_num 4 Loss: name: TableAttentionLoss @@ -72,6 +73,8 @@ Train: learn_empty_box: False merge_no_span_structure: False replace_empty_cell_token: False + loc_reg_num: *loc_reg_num + max_text_length: *max_text_length - TableBoxEncode: - ResizeTableImage: max_len: 488 @@ -104,6 +107,8 @@ Eval: 
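A note on the `&anchor`/`*alias` pairs that the updated table configs above (and SLANet.yml later in this patch) now rely on: values such as `max_text_length`, `box_format` and `loc_reg_num` are declared once in `Global`/`Head` and reused by the label-encoding transforms, so the fields cannot drift apart. A minimal sketch of how standard YAML resolves them, assuming PyYAML is installed; the snippet is a trimmed illustration, not the real table_mv3.yml, and the Eval-side transform hunk continues right below.

```python
# Trimmed illustration of the &anchor / *alias reuse in the updated table configs.
# Assumes PyYAML; the structure below only mimics table_mv3.yml, it is not the real file.
import yaml

snippet = """
Global:
  max_text_length: &max_text_length 500
  box_format: &box_format 'xyxy'     # 'xywh', 'xyxy', 'xyxyxyxy'
Architecture:
  Head:
    max_text_length: *max_text_length
    loc_reg_num: &loc_reg_num 4
Train:
  TableLabelEncode:
    loc_reg_num: *loc_reg_num
    max_text_length: *max_text_length
"""

cfg = yaml.safe_load(snippet)
assert cfg["Architecture"]["Head"]["max_text_length"] == 500   # alias resolves to the anchored value
assert cfg["Train"]["TableLabelEncode"]["loc_reg_num"] == 4
print(cfg["Global"]["box_format"])                             # xyxy
```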
learn_empty_box: False merge_no_span_structure: False replace_empty_cell_token: False + loc_reg_num: *loc_reg_num + max_text_length: *max_text_length - TableBoxEncode: - ResizeTableImage: max_len: 488 diff --git a/test_tipc/configs/en_table_structure/train_infer_python.txt b/test_tipc/configs/en_table_structure/train_infer_python.txt index 633b6185d976ac61408283025bd4ba305187317d..3fd5dc9f60a9621026d488e5654cd7e1421e8b65 100644 --- a/test_tipc/configs/en_table_structure/train_infer_python.txt +++ b/test_tipc/configs/en_table_structure/train_infer_python.txt @@ -54,6 +54,6 @@ random_infer_input:[{float32,[3,488,488]}] ===========================train_benchmark_params========================== batch_size:32 fp_items:fp32|fp16 -epoch:1 +epoch:2 --profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile flags:FLAGS_eager_delete_tensor_gb=0.0;FLAGS_fraction_of_gpu_memory_to_use=0.98;FLAGS_conv_workspace_size_limit=4096 diff --git a/test_tipc/configs/layoutxlm_ser/ser_layoutxlm_xfund_zh.yml b/test_tipc/configs/layoutxlm_ser/ser_layoutxlm_xfund_zh.yml new file mode 100644 index 0000000000000000000000000000000000000000..d2be152f0bae7d87129904d87c56c6d777a1f338 --- /dev/null +++ b/test_tipc/configs/layoutxlm_ser/ser_layoutxlm_xfund_zh.yml @@ -0,0 +1,122 @@ +Global: + use_gpu: True + epoch_num: &epoch_num 200 + log_smooth_window: 10 + print_batch_step: 10 + save_model_dir: ./output/ser_layoutxlm_xfund_zh + save_epoch_step: 2000 + # evaluation is run every 10 iterations after the 0th iteration + eval_batch_step: [ 0, 187 ] + cal_metric_during_train: False + save_inference_dir: + use_visualdl: False + seed: 2022 + infer_img: ppstructure/docs/kie/input/zh_val_42.jpg + save_res_path: ./output/ser_layoutxlm_xfund_zh/res + +Architecture: + model_type: kie + algorithm: &algorithm "LayoutXLM" + Transform: + Backbone: + name: LayoutXLMForSer + pretrained: True + checkpoints: + num_classes: &num_classes 7 + +Loss: + name: VQASerTokenLayoutLMLoss + num_classes: *num_classes + key: "backbone_out" + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + lr: + name: Linear + learning_rate: 0.00005 + epochs: *epoch_num + warmup_epoch: 2 + regularizer: + name: L2 + factor: 0.00000 + +PostProcess: + name: VQASerTokenLayoutLMPostProcess + class_path: &class_path train_data/XFUND/class_list_xfun.txt + +Metric: + name: VQASerTokenMetric + main_indicator: hmean + +Train: + dataset: + name: SimpleDataSet + data_dir: train_data/XFUND/zh_train/image + label_file_list: + - train_data/XFUND/zh_train/train.json + ratio_list: [ 1.0 ] + transforms: + - DecodeImage: # load image + img_mode: RGB + channel_first: False + - VQATokenLabelEncode: # Class handling label + contains_re: False + algorithm: *algorithm + class_path: *class_path + - VQATokenPad: + max_seq_len: &max_seq_len 512 + return_attention_mask: True + - VQASerTokenChunk: + max_seq_len: *max_seq_len + - Resize: + size: [224,224] + - NormalizeImage: + scale: 1 + mean: [ 123.675, 116.28, 103.53 ] + std: [ 58.395, 57.12, 57.375 ] + order: 'hwc' + - ToCHWImage: + - KeepKeys: + keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'labels'] # dataloader will return list in this order + loader: + shuffle: True + drop_last: False + batch_size_per_card: 8 + num_workers: 4 + +Eval: + dataset: + name: SimpleDataSet + data_dir: train_data/XFUND/zh_val/image + label_file_list: + - train_data/XFUND/zh_val/val.json + transforms: + - DecodeImage: # load image + img_mode: RGB + channel_first: False + - 
VQATokenLabelEncode: # Class handling label + contains_re: False + algorithm: *algorithm + class_path: *class_path + - VQATokenPad: + max_seq_len: *max_seq_len + return_attention_mask: True + - VQASerTokenChunk: + max_seq_len: *max_seq_len + - Resize: + size: [224,224] + - NormalizeImage: + scale: 1 + mean: [ 123.675, 116.28, 103.53 ] + std: [ 58.395, 57.12, 57.375 ] + order: 'hwc' + - ToCHWImage: + - KeepKeys: + keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'labels'] # dataloader will return list in this order + loader: + shuffle: False + drop_last: False + batch_size_per_card: 8 + num_workers: 4 diff --git a/test_tipc/configs/layoutxlm_ser/train_infer_python.txt b/test_tipc/configs/layoutxlm_ser/train_infer_python.txt index 6d05d413e106eee873b026d60fb4320c61f833c4..d07daa9a1429ec5cd1955ec64ded122a9d1a723d 100644 --- a/test_tipc/configs/layoutxlm_ser/train_infer_python.txt +++ b/test_tipc/configs/layoutxlm_ser/train_infer_python.txt @@ -9,11 +9,11 @@ Global.save_model_dir:./output/ Train.loader.batch_size_per_card:lite_train_lite_infer=4|whole_train_whole_infer=8 Architecture.Backbone.checkpoints:null train_model_name:latest -train_infer_img_dir:ppstructure/docs/vqa/input/zh_val_42.jpg +train_infer_img_dir:ppstructure/docs/kie/input/zh_val_42.jpg null:null ## trainer:norm_train -norm_train:tools/train.py -c configs/vqa/ser/layoutxlm_xfund_zh.yml -o Global.print_batch_step=1 Global.eval_batch_step=[1000,1000] Train.loader.shuffle=false +norm_train:tools/train.py -c test_tipc/configs/layoutxlm_ser/ser_layoutxlm_xfund_zh.yml -o Global.print_batch_step=1 Global.eval_batch_step=[1000,1000] Train.loader.shuffle=false pact_train:null fpgm_train:null distill_train:null @@ -27,7 +27,7 @@ null:null ===========================infer_params=========================== Global.save_inference_dir:./output/ Architecture.Backbone.checkpoints: -norm_export:tools/export_model.py -c configs/vqa/ser/layoutxlm_xfund_zh.yml -o +norm_export:tools/export_model.py -c test_tipc/configs/layoutxlm_ser/ser_layoutxlm_xfund_zh.yml -o quant_export: fpgm_export: distill_export:null @@ -37,7 +37,7 @@ export2:null infer_model:null infer_export:null infer_quant:False -inference:ppstructure/vqa/predict_vqa_token_ser.py --vqa_algorithm=LayoutXLM --ser_dict_path=train_data/XFUND/class_list_xfun.txt --output=output +inference:ppstructure/kie/predict_kie_token_ser.py --kie_algorithm=LayoutXLM --ser_dict_path=train_data/XFUND/class_list_xfun.txt --output=output --use_gpu:True|False --enable_mkldnn:False --cpu_threads:6 @@ -45,14 +45,14 @@ inference:ppstructure/vqa/predict_vqa_token_ser.py --vqa_algorithm=LayoutXLM - --use_tensorrt:False --precision:fp32 --ser_model_dir: ---image_dir:./ppstructure/docs/vqa/input/zh_val_42.jpg +--image_dir:./ppstructure/docs/kie/input/zh_val_42.jpg null:null --benchmark:False null:null ===========================infer_benchmark_params========================== random_infer_input:[{float32,[3,224,224]}] ===========================train_benchmark_params========================== -batch_size:4 +batch_size:8 fp_items:fp32|fp16 epoch:3 --profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile diff --git a/test_tipc/configs/layoutxlm_ser/train_pact_infer_python.txt b/test_tipc/configs/layoutxlm_ser/train_pact_infer_python.txt new file mode 100644 index 0000000000000000000000000000000000000000..fbf2a880269fba4596908def0980cb778a9281e3 --- /dev/null +++ b/test_tipc/configs/layoutxlm_ser/train_pact_infer_python.txt @@ -0,0 +1,53 @@ 
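The train_pact_infer_python.txt whose contents follow uses the same layout as every other TIPC config in this patch: one `key:value` field per line, indexed by line number from the shell side (see the `func_parser_value "${lines[1]}"` calls added to benchmark_train.sh earlier in this diff). A rough Python sketch of that format, for orientation only; `parse_tipc_config` and the example path are hypothetical, and the real parsing lives in test_tipc/common_func.sh.

```python
# Rough sketch of the key:value layout used by the TIPC *_infer_python.txt configs.
# Illustration only: the real parsing is done line-by-line in test_tipc/common_func.sh,
# and duplicate keys such as "null" simply overwrite each other in this dict version.
from pathlib import Path

def parse_tipc_config(path):  # hypothetical helper
    params = {}
    for raw in Path(path).read_text(encoding="utf-8").splitlines():
        line = raw.strip()
        if not line or line.startswith("=") or line == "##":
            continue  # skips "=====...train_params=====" banners and "##" separators
        key, _, value = line.partition(":")
        params[key] = value
    return params

# Hypothetical usage:
# cfg = parse_tipc_config("test_tipc/configs/layoutxlm_ser/train_pact_infer_python.txt")
# print(cfg["model_name"], cfg["trainer"], cfg["inference"])
```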
+===========================train_params=========================== +model_name:layoutxlm_ser_PACT +python:python3.7 +gpu_list:0|0,1 +Global.use_gpu:True|True +Global.auto_cast:fp32 +Global.epoch_num:lite_train_lite_infer=1|whole_train_whole_infer=17 +Global.save_model_dir:./output/ +Train.loader.batch_size_per_card:lite_train_lite_infer=4|whole_train_whole_infer=8 +Architecture.Backbone.checkpoints:pretrain_models/ser_LayoutXLM_xfun_zh +train_model_name:latest +train_infer_img_dir:ppstructure/docs/kie/input/zh_val_42.jpg +null:null +## +trainer:pact_train +norm_train:null +pact_train:deploy/slim/quantization/quant.py -c test_tipc/configs/layoutxlm_ser/ser_layoutxlm_xfund_zh.yml -o +fpgm_train:null +distill_train:null +null:null +null:null +## +===========================eval_params=========================== +eval:null +null:null +## +===========================infer_params=========================== +Global.save_inference_dir:./output/ +Architecture.Backbone.checkpoints: +norm_export:null +quant_export:deploy/slim/quantization/export_model.py -c test_tipc/configs/layoutxlm_ser/ser_layoutxlm_xfund_zh.yml -o +fpgm_export: null +distill_export:null +export1:null +export2:null +## +infer_model:null +infer_export:null +infer_quant:False +inference:ppstructure/kie/predict_kie_token_ser.py --kie_algorithm=LayoutXLM --ser_dict_path=train_data/XFUND/class_list_xfun.txt --output=output +--use_gpu:True|False +--enable_mkldnn:False +--cpu_threads:6 +--rec_batch_num:1 +--use_tensorrt:False +--precision:fp32 +--ser_model_dir: +--image_dir:./ppstructure/docs/kie/input/zh_val_42.jpg +null:null +--benchmark:False +null:null +===========================infer_benchmark_params========================== +random_infer_input:[{float32,[3,224,224]}] diff --git a/test_tipc/configs/layoutxlm_ser/train_ptq_infer_python.txt b/test_tipc/configs/layoutxlm_ser/train_ptq_infer_python.txt new file mode 100644 index 0000000000000000000000000000000000000000..47e1e7026bd6bb113b05d70c2bfc7f90879bd485 --- /dev/null +++ b/test_tipc/configs/layoutxlm_ser/train_ptq_infer_python.txt @@ -0,0 +1,21 @@ +===========================train_params=========================== +model_name:layoutxlm_ser_KL +python:python3.7 +Global.pretrained_model: +Global.save_inference_dir:null +infer_model:./inference/ser_LayoutXLM_xfun_zh_infer/ +infer_export:deploy/slim/quantization/quant_kl.py -c test_tipc/configs/layoutxlm_ser/ser_layoutxlm_xfund_zh.yml -o Train.loader.batch_size_per_card=1 Eval.loader.batch_size_per_card=1 +infer_quant:True +inference:ppstructure/kie/predict_kie_token_ser.py --kie_algorithm=LayoutXLM --ser_dict_path=./train_data/XFUND/class_list_xfun.txt +--use_gpu:True|False +--enable_mkldnn:False +--cpu_threads:6 +--rec_batch_num:1 +--use_tensorrt:False +--precision:int8 +--ser_model_dir: +--image_dir:./ppstructure/docs/kie/input/zh_val_42.jpg +null:null +--benchmark:False +null:null +null:null diff --git a/test_tipc/configs/rec_mtb_nrtr/train_infer_python.txt b/test_tipc/configs/rec_mtb_nrtr/train_infer_python.txt index fed8ba26753bb770e062f751a9ba1e8e35fc6843..4a8fda0fea76da41a0a13b61f35d96a4d230d488 100644 --- a/test_tipc/configs/rec_mtb_nrtr/train_infer_python.txt +++ b/test_tipc/configs/rec_mtb_nrtr/train_infer_python.txt @@ -41,7 +41,7 @@ inference:tools/infer/predict_rec.py --rec_char_dict_path=./ppocr/utils/EN_symbo --use_gpu:True|False --enable_mkldnn:False --cpu_threads:6 ---rec_batch_num:1|6 +--rec_batch_num:1 --use_tensorrt:False --precision:fp32 --rec_model_dir: diff --git 
a/test_tipc/configs/rec_mv3_none_bilstm_ctc_v2_0/train_infer_python.txt b/test_tipc/configs/rec_mv3_none_bilstm_ctc_v2_0/train_infer_python.txt index db89b4c78d72d1853096d6b44b73a7ca61792dfe..22c29c9b233ac908741accd7eb85fb3832fb0c0f 100644 --- a/test_tipc/configs/rec_mv3_none_bilstm_ctc_v2_0/train_infer_python.txt +++ b/test_tipc/configs/rec_mv3_none_bilstm_ctc_v2_0/train_infer_python.txt @@ -41,7 +41,7 @@ inference:tools/infer/predict_rec.py --rec_char_dict_path=./ppocr/utils/ic15_dic --use_gpu:True|False --enable_mkldnn:False --cpu_threads:6 ---rec_batch_num:1|6 +--rec_batch_num:1 --use_tensorrt:False --precision:fp32 --rec_model_dir: diff --git a/test_tipc/configs/rec_mv3_none_none_ctc_v2_0/train_infer_python.txt b/test_tipc/configs/rec_mv3_none_none_ctc_v2_0/train_infer_python.txt index 003e91ff3d95e62d4353d7c4545e780ecd2f9708..d91c55e8852eee2cc7913235308f6d1f31e1f2e9 100644 --- a/test_tipc/configs/rec_mv3_none_none_ctc_v2_0/train_infer_python.txt +++ b/test_tipc/configs/rec_mv3_none_none_ctc_v2_0/train_infer_python.txt @@ -41,7 +41,7 @@ inference:tools/infer/predict_rec.py --rec_char_dict_path=./ppocr/utils/ic15_dic --use_gpu:True|False --enable_mkldnn:False --cpu_threads:6 ---rec_batch_num:1|6 +--rec_batch_num:1 --use_tensorrt:False --precision:fp32 --rec_model_dir: diff --git a/test_tipc/configs/rec_mv3_tps_bilstm_att_v2_0/train_infer_python.txt b/test_tipc/configs/rec_mv3_tps_bilstm_att_v2_0/train_infer_python.txt index c7b416c83323863a905929a2effcb1d3ad856422..77dc79cdae8bf4843ad17282885b46a33e64ce53 100644 --- a/test_tipc/configs/rec_mv3_tps_bilstm_att_v2_0/train_infer_python.txt +++ b/test_tipc/configs/rec_mv3_tps_bilstm_att_v2_0/train_infer_python.txt @@ -41,7 +41,7 @@ inference:tools/infer/predict_rec.py --rec_char_dict_path=./ppocr/utils/ic15_dic --use_gpu:True|False --enable_mkldnn:False --cpu_threads:6 ---rec_batch_num:1|6 +--rec_batch_num:1 --use_tensorrt:False --precision:fp32 --rec_model_dir: diff --git a/test_tipc/configs/rec_mv3_tps_bilstm_ctc_v2_0/train_infer_python.txt b/test_tipc/configs/rec_mv3_tps_bilstm_ctc_v2_0/train_infer_python.txt index 0c6e2d1da7f163521e8859bd8c96436b2a6bac64..f38c8d8d67bae84232749e60952a5c73871f9a88 100644 --- a/test_tipc/configs/rec_mv3_tps_bilstm_ctc_v2_0/train_infer_python.txt +++ b/test_tipc/configs/rec_mv3_tps_bilstm_ctc_v2_0/train_infer_python.txt @@ -41,7 +41,7 @@ inference:tools/infer/predict_rec.py --rec_char_dict_path=./ppocr/utils/ic15_dic --use_gpu:True|False --enable_mkldnn:False --cpu_threads:6 ---rec_batch_num:1|6 +--rec_batch_num:1 --use_tensorrt:False --precision:fp32 --rec_model_dir: diff --git a/test_tipc/configs/rec_r31_robustscanner/rec_r31_robustscanner.yml b/test_tipc/configs/rec_r31_robustscanner/rec_r31_robustscanner.yml new file mode 100644 index 0000000000000000000000000000000000000000..b5466d4478be27d6fd152ee467f7f25731c8dce0 --- /dev/null +++ b/test_tipc/configs/rec_r31_robustscanner/rec_r31_robustscanner.yml @@ -0,0 +1,111 @@ +Global: + use_gpu: true + epoch_num: 5 + log_smooth_window: 20 + print_batch_step: 20 + save_model_dir: ./output/rec/rec_r31_robustscanner/ + save_epoch_step: 1 + # evaluation is run every 2000 iterations + eval_batch_step: [0, 2000] + cal_metric_during_train: True + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: ./inference/rec_inference + # for data or label process + character_dict_path: ppocr/utils/dict90.txt + max_text_length: &max_text_length 40 + infer_mode: False + use_space_char: False + rm_symbol: True + save_res_path: 
./output/rec/predicts_robustscanner.txt + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Piecewise + decay_epochs: [3, 4] + values: [0.001, 0.0001, 0.00001] + regularizer: + name: 'L2' + factor: 0 + +Architecture: + model_type: rec + algorithm: RobustScanner + Transform: + Backbone: + name: ResNet31 + init_type: KaimingNormal + Head: + name: RobustScannerHead + enc_outchannles: 128 + hybrid_dec_rnn_layers: 2 + hybrid_dec_dropout: 0 + position_dec_rnn_layers: 2 + start_idx: 91 + mask: True + padding_idx: 92 + encode_value: False + max_text_length: *max_text_length + +Loss: + name: SARLoss + +PostProcess: + name: SARLabelDecode + +Metric: + name: RecMetric + is_filter: True + + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ic15_data/ + label_file_list: ["./train_data/ic15_data/rec_gt_train.txt"] + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - SARLabelEncode: # Class handling label + - RobustScannerRecResizeImg: + image_shape: [3, 48, 48, 160] # h:48 w:[48,160] + width_downsample_ratio: 0.25 + max_text_length: *max_text_length + - KeepKeys: + keep_keys: ['image', 'label', 'valid_ratio', 'word_positons'] # dataloader will return list in this order + loader: + shuffle: True + batch_size_per_card: 16 + drop_last: True + num_workers: 0 + use_shared_memory: False + +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ic15_data + label_file_list: ["./train_data/ic15_data/rec_gt_test.txt"] + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - SARLabelEncode: # Class handling label + - RobustScannerRecResizeImg: + image_shape: [3, 48, 48, 160] + max_text_length: *max_text_length + width_downsample_ratio: 0.25 + - KeepKeys: + keep_keys: ['image', 'label', 'valid_ratio', 'word_positons'] # dataloader will return list in this order + loader: + shuffle: False + drop_last: False + batch_size_per_card: 16 + num_workers: 0 + use_shared_memory: False + diff --git a/test_tipc/configs/rec_r31_robustscanner/train_infer_python.txt b/test_tipc/configs/rec_r31_robustscanner/train_infer_python.txt new file mode 100644 index 0000000000000000000000000000000000000000..1bf8dc0b6c5ba707d572bc0ad44818d5a51c8800 --- /dev/null +++ b/test_tipc/configs/rec_r31_robustscanner/train_infer_python.txt @@ -0,0 +1,54 @@ +===========================train_params=========================== +model_name:rec_r31_robustscanner +python:python3.7 +gpu_list:0|0,1 +Global.use_gpu:True|True +Global.auto_cast:null +Global.epoch_num:lite_train_lite_infer=2|whole_train_whole_infer=5 +Global.save_model_dir:./output/ +Train.loader.batch_size_per_card:lite_train_lite_infer=16|whole_train_whole_infer=64 +Global.pretrained_model:null +train_model_name:latest +train_infer_img_dir:./inference/rec_inference +null:null +## +trainer:norm_train +norm_train:tools/train.py -c test_tipc/configs/rec_r31_robustscanner/rec_r31_robustscanner.yml -o +pact_train:null +fpgm_train:null +distill_train:null +null:null +null:null +## +===========================eval_params=========================== +eval:tools/eval.py -c test_tipc/configs/rec_r31_robustscanner/rec_r31_robustscanner.yml -o +null:null +## +===========================infer_params=========================== +Global.save_inference_dir:./output/ +Global.checkpoints: +norm_export:tools/export_model.py -c test_tipc/configs/rec_r31_robustscanner/rec_r31_robustscanner.yml -o +quant_export:null +fpgm_export:null +distill_export:null +export1:null +export2:null +## 
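Several hunks in this patch shrink `--rec_batch_num:1|6` to `--rec_batch_num:1`. The `a|b` fields in these configs (also `gpu_list:0|0,1`, `--use_gpu:True|False`, `fp_items:fp32|fp16`) appear to enumerate alternative settings that the harness iterates over, so dropping `|6` removes one whole inference pass per model. A hedged sketch of that expansion, under the assumption that `|` really does separate run variants; the inference fields of the RobustScanner config continue right after this aside.

```python
# Hedged sketch: expanding 'a|b' style TIPC fields into individual runs.
# Assumption: '|' separates alternative values tried by the shell harness;
# the real loop lives in the test_tipc scripts, not here.
from itertools import product

def expand_variants(fields):
    keys = list(fields)
    choices = [str(fields[k]).split("|") for k in keys]
    for combo in product(*choices):
        yield dict(zip(keys, combo))

runs = list(expand_variants({
    "--use_gpu": "True|False",
    "--rec_batch_num": "1",      # was "1|6" before this patch
    "--precision": "fp32",
}))
print(len(runs))  # 2 runs: GPU and CPU, each with batch size 1
```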
+train_model:./inference/rec_r31_robustscanner/best_accuracy +infer_export:tools/export_model.py -c test_tipc/configs/rec_r31_robustscanner/rec_r31_robustscanner.yml -o +infer_quant:False +inference:tools/infer/predict_rec.py --rec_char_dict_path=./ppocr/utils/dict90.txt --rec_image_shape="3,48,48,160" --use_space_char=False --rec_algorithm="RobustScanner" +--use_gpu:True|False +--enable_mkldnn:False +--cpu_threads:6 +--rec_batch_num:1 +--use_tensorrt:False +--precision:fp32 +--rec_model_dir: +--image_dir:./inference/rec_inference +--save_log_path:./test/output/ +--benchmark:True +null:null +===========================infer_benchmark_params========================== +random_infer_input:[{float32,[3,48,160]}] + diff --git a/test_tipc/configs/rec_r31_sar/train_infer_python.txt b/test_tipc/configs/rec_r31_sar/train_infer_python.txt index 03ec54abb65ac41d3b5ad4f6e2fdcf7abb34c344..4acc6223e3b65211d62f2f128150e1c76f286674 100644 --- a/test_tipc/configs/rec_r31_sar/train_infer_python.txt +++ b/test_tipc/configs/rec_r31_sar/train_infer_python.txt @@ -41,7 +41,7 @@ inference:tools/infer/predict_rec.py --rec_char_dict_path=./ppocr/utils/dict90.t --use_gpu:True --enable_mkldnn:False --cpu_threads:6 ---rec_batch_num:1|6 +--rec_batch_num:1 --use_tensorrt:False --precision:fp32 --rec_model_dir: diff --git a/test_tipc/configs/rec_r32_gaspin_bilstm_att/train_infer_python.txt b/test_tipc/configs/rec_r32_gaspin_bilstm_att/train_infer_python.txt index 115dfd661abc64db9e14c629f79099be7b6ff0e0..ac378b36046d532a887056183de9c7788f628b76 100644 --- a/test_tipc/configs/rec_r32_gaspin_bilstm_att/train_infer_python.txt +++ b/test_tipc/configs/rec_r32_gaspin_bilstm_att/train_infer_python.txt @@ -41,7 +41,7 @@ inference:tools/infer/predict_rec.py --rec_char_dict_path=./ppocr/utils/dict/spi --use_gpu:True|False --enable_mkldnn:False --cpu_threads:6 ---rec_batch_num:1|6 +--rec_batch_num:1 --use_tensorrt:False --precision:fp32 --rec_model_dir: diff --git a/test_tipc/configs/rec_r34_vd_none_bilstm_ctc_v2_0/train_infer_python.txt b/test_tipc/configs/rec_r34_vd_none_bilstm_ctc_v2_0/train_infer_python.txt index 07a6190b0ef09da5cd20b9dd8ea922544c578710..b53efbd6ba5db36813733f6682bde1cfd614c6ee 100644 --- a/test_tipc/configs/rec_r34_vd_none_bilstm_ctc_v2_0/train_infer_python.txt +++ b/test_tipc/configs/rec_r34_vd_none_bilstm_ctc_v2_0/train_infer_python.txt @@ -41,7 +41,7 @@ inference:tools/infer/predict_rec.py --rec_char_dict_path=./ppocr/utils/ic15_dic --use_gpu:True|False --enable_mkldnn:False --cpu_threads:6 ---rec_batch_num:1|6 +--rec_batch_num:1 --use_tensorrt:False --precision:fp32 --rec_model_dir: diff --git a/test_tipc/configs/rec_r34_vd_none_none_ctc_v2_0/train_infer_python.txt b/test_tipc/configs/rec_r34_vd_none_none_ctc_v2_0/train_infer_python.txt index 145793aa472d8330daf9321f44692a03e7ef6354..7d953968b8a9d3f62f7c6fb48ed65bd9743d5ba3 100644 --- a/test_tipc/configs/rec_r34_vd_none_none_ctc_v2_0/train_infer_python.txt +++ b/test_tipc/configs/rec_r34_vd_none_none_ctc_v2_0/train_infer_python.txt @@ -41,7 +41,7 @@ inference:tools/infer/predict_rec.py --rec_char_dict_path=./ppocr/utils/ic15_dic --use_gpu:True|False --enable_mkldnn:False --cpu_threads:6 ---rec_batch_num:1|6 +--rec_batch_num:1 --use_tensorrt:False --precision:fp32 --rec_model_dir: diff --git a/test_tipc/configs/rec_r34_vd_tps_bilstm_att_v2_0/train_infer_python.txt b/test_tipc/configs/rec_r34_vd_tps_bilstm_att_v2_0/train_infer_python.txt index 759518a4a11a17e076401bb8dd193617c9f10530..0910ff840e350333a26de9b959229b6f8d39c19e 100644 --- 
a/test_tipc/configs/rec_r34_vd_tps_bilstm_att_v2_0/train_infer_python.txt +++ b/test_tipc/configs/rec_r34_vd_tps_bilstm_att_v2_0/train_infer_python.txt @@ -41,7 +41,7 @@ inference:tools/infer/predict_rec.py --rec_char_dict_path=./ppocr/utils/ic15_dic --use_gpu:True|False --enable_mkldnn:False --cpu_threads:6 ---rec_batch_num:1|6 +--rec_batch_num:1 --use_tensorrt:False --precision:fp32 --rec_model_dir: diff --git a/test_tipc/configs/rec_r34_vd_tps_bilstm_ctc_v2_0/train_infer_python.txt b/test_tipc/configs/rec_r34_vd_tps_bilstm_ctc_v2_0/train_infer_python.txt index ecc898341ce14dfed0de4290b798dd70078ae2da..33144e622e5fbb399e6dd274196812e2d44dc0fd 100644 --- a/test_tipc/configs/rec_r34_vd_tps_bilstm_ctc_v2_0/train_infer_python.txt +++ b/test_tipc/configs/rec_r34_vd_tps_bilstm_ctc_v2_0/train_infer_python.txt @@ -41,7 +41,7 @@ inference:tools/infer/predict_rec.py --rec_char_dict_path=./ppocr/utils/ic15_dic --use_gpu:True|False --enable_mkldnn:False --cpu_threads:6 ---rec_batch_num:1|6 +--rec_batch_num:1 --use_tensorrt:False --precision:fp32 --rec_model_dir: diff --git a/test_tipc/configs/rec_r45_abinet/train_infer_python.txt b/test_tipc/configs/rec_r45_abinet/train_infer_python.txt index ecab1bcbbde11fc6d14357b6715033704c2c3316..04fc188649c77c62b43307cb2fff2249f28bddae 100644 --- a/test_tipc/configs/rec_r45_abinet/train_infer_python.txt +++ b/test_tipc/configs/rec_r45_abinet/train_infer_python.txt @@ -41,7 +41,7 @@ inference:tools/infer/predict_rec.py --rec_char_dict_path=./ppocr/utils/ic15_dic --use_gpu:True|False --enable_mkldnn:False --cpu_threads:6 ---rec_batch_num:1|6 +--rec_batch_num:1 --use_tensorrt:False --precision:fp32 --rec_model_dir: diff --git a/test_tipc/configs/rec_r45_visionlan/rec_r45_visionlan.yml b/test_tipc/configs/rec_r45_visionlan/rec_r45_visionlan.yml new file mode 100644 index 0000000000000000000000000000000000000000..860e4f53043138e7434d71a816fdf051048be6f7 --- /dev/null +++ b/test_tipc/configs/rec_r45_visionlan/rec_r45_visionlan.yml @@ -0,0 +1,108 @@ +Global: + use_gpu: true + epoch_num: 8 + log_smooth_window: 200 + print_batch_step: 200 + save_model_dir: ./output/rec/r45_visionlan + save_epoch_step: 1 + # evaluation is run every 2000 iterations + eval_batch_step: [0, 2000] + cal_metric_during_train: True + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: doc/imgs_words/en/word_2.png + # for data or label process + character_dict_path: + max_text_length: &max_text_length 25 + training_step: &training_step LA + infer_mode: False + use_space_char: False + save_res_path: ./output/rec/predicts_visionlan.txt + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + clip_norm: 20.0 + group_lr: true + training_step: *training_step + lr: + name: Piecewise + decay_epochs: [6] + values: [0.0001, 0.00001] + regularizer: + name: 'L2' + factor: 0 + +Architecture: + model_type: rec + algorithm: VisionLAN + Transform: + Backbone: + name: ResNet45 + strides: [2, 2, 2, 1, 1] + Head: + name: VLHead + n_layers: 3 + n_position: 256 + n_dim: 512 + max_text_length: *max_text_length + training_step: *training_step + +Loss: + name: VLLoss + mode: *training_step + weight_res: 0.5 + weight_mas: 0.5 + +PostProcess: + name: VLLabelDecode + +Metric: + name: RecMetric + is_filter: true + + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ic15_data/ + label_file_list: ["./train_data/ic15_data/rec_gt_train.txt"] + transforms: + - DecodeImage: # load image + img_mode: RGB + channel_first: False + - ABINetRecAug: + - VLLabelEncode: # Class 
handling label + - VLRecResizeImg: + image_shape: [3, 64, 256] + - KeepKeys: + keep_keys: ['image', 'label', 'label_res', 'label_sub', 'label_id', 'length'] # dataloader will return list in this order + loader: + shuffle: True + batch_size_per_card: 220 + drop_last: True + num_workers: 4 + +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ic15_data + label_file_list: ["./train_data/ic15_data/rec_gt_test.txt"] + transforms: + - DecodeImage: # load image + img_mode: RGB + channel_first: False + - VLLabelEncode: # Class handling label + - VLRecResizeImg: + image_shape: [3, 64, 256] + - KeepKeys: + keep_keys: ['image', 'label', 'label_res', 'label_sub', 'label_id', 'length'] # dataloader will return list in this order + loader: + shuffle: False + drop_last: False + batch_size_per_card: 64 + num_workers: 4 + diff --git a/test_tipc/configs/rec_r45_visionlan/train_infer_python.txt b/test_tipc/configs/rec_r45_visionlan/train_infer_python.txt new file mode 100644 index 0000000000000000000000000000000000000000..79618edafa794a683e085fb1b8050358342e1f77 --- /dev/null +++ b/test_tipc/configs/rec_r45_visionlan/train_infer_python.txt @@ -0,0 +1,53 @@ +===========================train_params=========================== +model_name:rec_r45_visionlan +python:python3.7 +gpu_list:0|0,1 +Global.use_gpu:True|True +Global.auto_cast:null +Global.epoch_num:lite_train_lite_infer=2|whole_train_whole_infer=300 +Global.save_model_dir:./output/ +Train.loader.batch_size_per_card:lite_train_lite_infer=32|whole_train_whole_infer=64 +Global.pretrained_model:null +train_model_name:latest +train_infer_img_dir:./inference/rec_inference +null:null +## +trainer:norm_train +norm_train:tools/train.py -c test_tipc/configs/rec_r45_visionlan/rec_r45_visionlan.yml -o +pact_train:null +fpgm_train:null +distill_train:null +null:null +null:null +## +===========================eval_params=========================== +eval:tools/eval.py -c test_tipc/configs/rec_r45_visionlan/rec_r45_visionlan.yml -o +null:null +## +===========================infer_params=========================== +Global.save_inference_dir:./output/ +Global.checkpoints: +norm_export:tools/export_model.py -c test_tipc/configs/rec_r45_visionlan/rec_r45_visionlan.yml -o +quant_export:null +fpgm_export:null +distill_export:null +export1:null +export2:null +## +train_model:./inference/rec_r45_visionlan_train/best_accuracy +infer_export:tools/export_model.py -c test_tipc/configs/rec_r45_visionlan/rec_r45_visionlan.yml -o +infer_quant:False +inference:tools/infer/predict_rec.py --rec_char_dict_path=./ppocr/utils/ic15_dict.txt --rec_image_shape="3,64,256" --rec_algorithm="VisionLAN" --use_space_char=False +--use_gpu:True|False +--enable_mkldnn:False +--cpu_threads:6 +--rec_batch_num:1 +--use_tensorrt:False +--precision:fp32 +--rec_model_dir: +--image_dir:./inference/rec_inference +--save_log_path:./test/output/ +--benchmark:True +null:null +===========================infer_benchmark_params========================== +random_infer_input:[{float32,[3,64,256]}] diff --git a/test_tipc/configs/rec_r50_fpn_vd_none_srn/train_infer_python.txt b/test_tipc/configs/rec_r50_fpn_vd_none_srn/train_infer_python.txt index b5a5286010a5830dc23031b3e0885247fb6ae53f..c1cfd1fcd930c6992982feeb3c118dbc5a56f226 100644 --- a/test_tipc/configs/rec_r50_fpn_vd_none_srn/train_infer_python.txt +++ b/test_tipc/configs/rec_r50_fpn_vd_none_srn/train_infer_python.txt @@ -41,7 +41,7 @@ inference:tools/infer/predict_rec.py --rec_char_dict_path=./ppocr/utils/ic15_dic --use_gpu:True|False 
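The `random_infer_input:[{float32,[3,64,256]}]` line closing the VisionLAN config above packs a dtype and a tensor shape, presumably so the benchmark can synthesize dummy inputs without real data. A speculative sketch of reading that field, assuming NumPy; the real handling is inside the TIPC harness and may differ.

```python
# Speculative sketch: turning a random_infer_input spec into dummy tensors.
# Assumption only; the authoritative behaviour is in the test_tipc harness.
import re
import numpy as np

def parse_random_infer_input(spec):
    # spec example: "[{float32,[3,640,640]}];[{float32,[3,960,960]}]"
    tensors = []
    for dtype, dims in re.findall(r"\{(\w+),\[([\d,]+)\]\}", spec):
        shape = tuple(int(d) for d in dims.split(","))
        tensors.append(np.random.rand(*shape).astype(dtype))
    return tensors

dummy = parse_random_infer_input("[{float32,[3,64,256]}]")
print(dummy[0].shape, dummy[0].dtype)  # (3, 64, 256) float32
```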
--enable_mkldnn:False --cpu_threads:6 ---rec_batch_num:1|6 +--rec_batch_num:1 --use_tensorrt:False --precision:fp32 --rec_model_dir: diff --git a/test_tipc/configs/rec_svtrnet/train_infer_python.txt b/test_tipc/configs/rec_svtrnet/train_infer_python.txt index a7e4a24063b2e248f2ab92d5efd257a2837c0a34..5508c0411cfdc7102ccec7a00c59c2a5e1a54998 100644 --- a/test_tipc/configs/rec_svtrnet/train_infer_python.txt +++ b/test_tipc/configs/rec_svtrnet/train_infer_python.txt @@ -41,7 +41,7 @@ inference:tools/infer/predict_rec.py --rec_char_dict_path=./ppocr/utils/ic15_dic --use_gpu:True|False --enable_mkldnn:False --cpu_threads:6 ---rec_batch_num:1|6 +--rec_batch_num:1 --use_tensorrt:False --precision:fp32 --rec_model_dir: diff --git a/test_tipc/configs/rec_vitstr_none_ce/train_infer_python.txt b/test_tipc/configs/rec_vitstr_none_ce/train_infer_python.txt index 04c5742ea2ddaf01e782d8b39c21bcbcfa0a7ce7..187c11544998626af556e3eeef5f958fbe42fea0 100644 --- a/test_tipc/configs/rec_vitstr_none_ce/train_infer_python.txt +++ b/test_tipc/configs/rec_vitstr_none_ce/train_infer_python.txt @@ -41,7 +41,7 @@ inference:tools/infer/predict_rec.py --rec_char_dict_path=./ppocr/utils/EN_symbo --use_gpu:True|False --enable_mkldnn:False --cpu_threads:6 ---rec_batch_num:1|6 +--rec_batch_num:1 --use_tensorrt:False --precision:fp32 --rec_model_dir: diff --git a/test_tipc/configs/slanet/SLANet.yml b/test_tipc/configs/slanet/SLANet.yml new file mode 100644 index 0000000000000000000000000000000000000000..4ebfdd20f7356e004ed9cec24fe27fc7607aeb70 --- /dev/null +++ b/test_tipc/configs/slanet/SLANet.yml @@ -0,0 +1,143 @@ +Global: + use_gpu: true + epoch_num: 100 + log_smooth_window: 20 + print_batch_step: 20 + save_model_dir: ./output/SLANet + save_epoch_step: 400 + # evaluation is run every 1000 iterations after the 0th iteration + eval_batch_step: [0, 1000] + cal_metric_during_train: True + pretrained_model: + checkpoints: + save_inference_dir: ./output/SLANet/infer + use_visualdl: False + infer_img: doc/table/table.jpg + # for data or label process + character_dict_path: ppocr/utils/dict/table_structure_dict.txt + character_type: en + max_text_length: &max_text_length 500 + box_format: &box_format 'xyxy' # 'xywh', 'xyxy', 'xyxyxyxy' + infer_mode: False + use_sync_bn: True + save_res_path: 'output/infer' + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + clip_norm: 5.0 + lr: + name: Piecewise + learning_rate: 0.001 + decay_epochs : [40, 50] + values : [0.001, 0.0001, 0.00005] + regularizer: + name: 'L2' + factor: 0.00000 + +Architecture: + model_type: table + algorithm: SLANet + Backbone: + name: PPLCNet + scale: 1.0 + pretrained: true + use_ssld: true + Neck: + name: CSPPAN + out_channels: 96 + Head: + name: SLAHead + hidden_size: 256 + max_text_length: *max_text_length + loc_reg_num: &loc_reg_num 4 + +Loss: + name: SLALoss + structure_weight: 1.0 + loc_weight: 2.0 + loc_loss: smooth_l1 + +PostProcess: + name: TableLabelDecode + merge_no_span_structure: &merge_no_span_structure True + +Metric: + name: TableMetric + main_indicator: acc + compute_bbox_metric: False + loc_reg_num: *loc_reg_num + box_format: *box_format + +Train: + dataset: + name: PubTabDataSet + data_dir: ./train_data/pubtabnet/train/ + label_file_list: [./train_data/pubtabnet/train.jsonl] + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - TableLabelEncode: + learn_empty_box: False + merge_no_span_structure: *merge_no_span_structure + replace_empty_cell_token: False + loc_reg_num: *loc_reg_num + max_text_length: 
*max_text_length + - TableBoxEncode: + in_box_format: *box_format + out_box_format: *box_format + - ResizeTableImage: + max_len: 488 + - NormalizeImage: + scale: 1./255. + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + - PaddingTableImage: + size: [488, 488] + - ToCHWImage: + - KeepKeys: + keep_keys: [ 'image', 'structure', 'bboxes', 'bbox_masks', 'shape' ] + loader: + shuffle: True + batch_size_per_card: 48 + drop_last: True + num_workers: 1 + +Eval: + dataset: + name: PubTabDataSet + data_dir: ./train_data/pubtabnet/test/ + label_file_list: [./train_data/pubtabnet/test.jsonl] + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - TableLabelEncode: + learn_empty_box: False + merge_no_span_structure: *merge_no_span_structure + replace_empty_cell_token: False + loc_reg_num: *loc_reg_num + max_text_length: *max_text_length + - TableBoxEncode: + in_box_format: *box_format + out_box_format: *box_format + - ResizeTableImage: + max_len: 488 + - NormalizeImage: + scale: 1./255. + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + - PaddingTableImage: + size: [488, 488] + - ToCHWImage: + - KeepKeys: + keep_keys: [ 'image', 'structure', 'bboxes', 'bbox_masks', 'shape' ] + loader: + shuffle: False + drop_last: False + batch_size_per_card: 48 + num_workers: 1 diff --git a/test_tipc/configs/slanet/model_linux_gpu_normal_normal_infer_cpp_linux_gpu_cpu.txt b/test_tipc/configs/slanet/model_linux_gpu_normal_normal_infer_cpp_linux_gpu_cpu.txt new file mode 100644 index 0000000000000000000000000000000000000000..1b4226706b067f65361fd3e79bcbc52e1cf70ad0 --- /dev/null +++ b/test_tipc/configs/slanet/model_linux_gpu_normal_normal_infer_cpp_linux_gpu_cpu.txt @@ -0,0 +1,20 @@ +===========================cpp_infer_params=========================== +model_name:slanet +use_opencv:True +infer_model:./inference/ch_ppstructure_mobile_v2.0_SLANet_infer/ +infer_quant:False +inference:./deploy/cpp_infer/build/ppocr --det_model_dir=./inference/ch_PP-OCRv3_det_infer --rec_model_dir=./inference/ch_PP-OCRv3_rec_infer --output=./output/table --type=structure --table=True --rec_char_dict_path=./ppocr/utils/ppocr_keys_v1.txt --table_char_dict_path=./ppocr/utils/dict/table_structure_dict_ch.txt +--use_gpu:True|False +--enable_mkldnn:False +--cpu_threads:6 +--rec_batch_num:6 +--use_tensorrt:False +--precision:fp32 +--table_model_dir: +--image_dir:./ppstructure/docs/table/table.jpg +null:null +--benchmark:True +--det:True +--rec:True +--cls:False +--use_angle_cls:False \ No newline at end of file diff --git a/test_tipc/configs/slanet/train_infer_python.txt b/test_tipc/configs/slanet/train_infer_python.txt new file mode 100644 index 0000000000000000000000000000000000000000..05264360ac95d08ba11157372a9badef23afdc70 --- /dev/null +++ b/test_tipc/configs/slanet/train_infer_python.txt @@ -0,0 +1,59 @@ +===========================train_params=========================== +model_name:slanet +python:python3.7 +gpu_list:0|0,1 +Global.use_gpu:True|True +Global.auto_cast:fp32 +Global.epoch_num:lite_train_lite_infer=3|whole_train_whole_infer=50 +Global.save_model_dir:./output/ +Train.loader.batch_size_per_card:lite_train_lite_infer=16|whole_train_whole_infer=128 +Global.pretrained_model:./pretrain_models/en_ppstructure_mobile_v2.0_SLANet_train/best_accuracy +train_model_name:latest +train_infer_img_dir:./ppstructure/docs/table/table.jpg +null:null +## +trainer:norm_train +norm_train:tools/train.py -c test_tipc/configs/slanet/SLANet.yml -o Global.print_batch_step=1 
Train.loader.shuffle=false +pact_train:null +fpgm_train:null +distill_train:null +null:null +null:null +## +===========================eval_params=========================== +eval:null +null:null +## +===========================infer_params=========================== +Global.save_inference_dir:./output/ +Global.checkpoints: +norm_export:tools/export_model.py -c test_tipc/configs/slanet/SLANet.yml -o +quant_export: +fpgm_export: +distill_export:null +export1:null +export2:null +## +infer_model:./inference/en_ppstructure_mobile_v2.0_SLANet_train +infer_export:null +infer_quant:False +inference:ppstructure/table/predict_table.py --det_model_dir=./inference/en_ppocr_mobile_v2.0_table_det_infer --rec_model_dir=./inference/en_ppocr_mobile_v2.0_table_rec_infer --rec_char_dict_path=./ppocr/utils/dict/table_dict.txt --table_char_dict_path=./ppocr/utils/dict/table_structure_dict.txt --image_dir=./ppstructure/docs/table/table.jpg --det_limit_side_len=736 --det_limit_type=min --output ./output/table +--use_gpu:True|False +--enable_mkldnn:False +--cpu_threads:6 +--rec_batch_num:1 +--use_tensorrt:False +--precision:fp32 +--table_model_dir: +--image_dir:./ppstructure/docs/table/table.jpg +null:null +--benchmark:False +null:null +===========================infer_benchmark_params========================== +random_infer_input:[{float32,[3,488,488]}] +===========================train_benchmark_params========================== +batch_size:32 +fp_items:fp32|fp16 +epoch:2 +--profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile +flags:FLAGS_eager_delete_tensor_gb=0.0;FLAGS_fraction_of_gpu_memory_to_use=0.98;FLAGS_conv_workspace_size_limit=4096 diff --git a/test_tipc/configs/table_master/table_master.yml b/test_tipc/configs/table_master/table_master.yml index c519b5b8f464d8843888155387b74a8416821f2f..27f81683b9b7e9475bdfa4ad4862166f4cf9c14d 100644 --- a/test_tipc/configs/table_master/table_master.yml +++ b/test_tipc/configs/table_master/table_master.yml @@ -16,8 +16,6 @@ Global: character_dict_path: ppocr/utils/dict/table_master_structure_dict.txt infer_mode: false max_text_length: 500 - process_total_num: 0 - process_cut_num: 0 Optimizer: @@ -86,7 +84,7 @@ Train: - PaddingTableImage: size: [480, 480] - TableBoxEncode: - use_xywh: True + box_format: 'xywh' - NormalizeImage: scale: 1./255. mean: [0.5, 0.5, 0.5] @@ -120,7 +118,7 @@ Eval: - PaddingTableImage: size: [480, 480] - TableBoxEncode: - use_xywh: True + box_format: 'xywh' - NormalizeImage: scale: 1./255. 
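table_master.yml above swaps the old `use_xywh: True` switch for the new `box_format: 'xywh'` option, matching the `# 'xywh', 'xyxy', 'xyxyxyxy'` choices listed in table_mv3.yml and SLANet.yml. A small sketch of the two four-value formats, under the assumption that 'xyxy' means corner coordinates and 'xywh' means center plus size; TableBoxEncode in the repo is the authoritative definition, and the rest of the table_master.yml Eval hunk continues below.

```python
# Hedged sketch of the box formats named in these configs ('xyxy' vs 'xywh').
# Assumption: 'xyxy' = (x_min, y_min, x_max, y_max), 'xywh' = (x_center, y_center, w, h);
# check TableBoxEncode in the repository for the exact convention used.
def xyxy_to_xywh(box):
    x1, y1, x2, y2 = box
    return [(x1 + x2) / 2.0, (y1 + y2) / 2.0, x2 - x1, y2 - y1]

def xywh_to_xyxy(box):
    cx, cy, w, h = box
    return [cx - w / 2.0, cy - h / 2.0, cx + w / 2.0, cy + h / 2.0]

print(xyxy_to_xywh([10, 20, 50, 80]))  # [30.0, 50.0, 40.0, 60.0]
```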
mean: [0.5, 0.5, 0.5] diff --git a/test_tipc/configs/table_master/train_infer_python.txt b/test_tipc/configs/table_master/train_infer_python.txt index 56b8e636026939ae8cd700308690010e1300d8f6..c3a871731a36fb5434db111cfd68b6eab7ba3f99 100644 --- a/test_tipc/configs/table_master/train_infer_python.txt +++ b/test_tipc/configs/table_master/train_infer_python.txt @@ -37,8 +37,8 @@ export2:null infer_model:null infer_export:null infer_quant:False -inference:ppstructure/table/predict_structure.py --table_char_dict_path=./ppocr/utils/dict/table_master_structure_dict.txt --image_dir=./ppstructure/docs/table/table.jpg --output ./output/table --table_algorithm=TableMaster --table_max_len=480 ---use_gpu:True|False +inference:ppstructure/table/predict_structure.py --table_char_dict_path=./ppocr/utils/dict/table_master_structure_dict.txt --output ./output/table --table_algorithm=TableMaster --table_max_len=480 +--use_gpu:True --enable_mkldnn:False --cpu_threads:6 --rec_batch_num:1 diff --git a/test_tipc/configs/vi_layoutxlm_ser/train_infer_python.txt b/test_tipc/configs/vi_layoutxlm_ser/train_infer_python.txt new file mode 100644 index 0000000000000000000000000000000000000000..adad78bb76e34635a632ef7c1b55e212bc4b636a --- /dev/null +++ b/test_tipc/configs/vi_layoutxlm_ser/train_infer_python.txt @@ -0,0 +1,59 @@ +===========================train_params=========================== +model_name:vi_layoutxlm_ser +python:python3.7 +gpu_list:0|0,1 +Global.use_gpu:True|True +Global.auto_cast:fp32 +Global.epoch_num:lite_train_lite_infer=1|whole_train_whole_infer=17 +Global.save_model_dir:./output/ +Train.loader.batch_size_per_card:lite_train_lite_infer=4|whole_train_whole_infer=8 +Architecture.Backbone.checkpoints:null +train_model_name:latest +train_infer_img_dir:ppstructure/docs/kie/input/zh_val_42.jpg +null:null +## +trainer:norm_train +norm_train:tools/train.py -c ./configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml -o Global.print_batch_step=1 Global.eval_batch_step=[1000,1000] Train.loader.shuffle=false +pact_train:null +fpgm_train:null +distill_train:null +null:null +null:null +## +===========================eval_params=========================== +eval:null +null:null +## +===========================infer_params=========================== +Global.save_inference_dir:./output/ +Architecture.Backbone.checkpoints: +norm_export:tools/export_model.py -c ./configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml -o +quant_export: +fpgm_export: +distill_export:null +export1:null +export2:null +## +infer_model:null +infer_export:null +infer_quant:False +inference:ppstructure/kie/predict_kie_token_ser.py --kie_algorithm=LayoutXLM --ser_dict_path=train_data/XFUND/class_list_xfun.txt --output=output --ocr_order_method=tb-yx +--use_gpu:True|False +--enable_mkldnn:False +--cpu_threads:6 +--rec_batch_num:1 +--use_tensorrt:False +--precision:fp32 +--ser_model_dir: +--image_dir:./ppstructure/docs/kie/input/zh_val_42.jpg +null:null +--benchmark:False +null:null +===========================infer_benchmark_params========================== +random_infer_input:[{float32,[3,224,224]}] +===========================train_benchmark_params========================== +batch_size:4 +fp_items:fp32|fp16 +epoch:3 +--profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile +flags:FLAGS_eager_delete_tensor_gb=0.0;FLAGS_fraction_of_gpu_memory_to_use=0.98 diff --git a/test_tipc/docs/jeston_test_train_inference_python.md b/test_tipc/docs/jeston_test_train_inference_python.md index 
b25175ed0071dd3728ae22c7588ca20535af0505..22fc21c1cb615fa3e9cb0eb12441db80968a23ed 100644 --- a/test_tipc/docs/jeston_test_train_inference_python.md +++ b/test_tipc/docs/jeston_test_train_inference_python.md @@ -24,12 +24,7 @@ Jetson端基础训练预测功能测试的主程序为`test_inference_inference. ``` - 安装autolog(规范化日志输出工具) ``` - git clone https://github.com/LDOUBLEV/AutoLog - cd AutoLog - pip install -r requirements.txt - python setup.py bdist_wheel - pip install ./dist/auto_log-1.0.0-py3-none-any.whl - cd ../ + pip install https://paddleocr.bj.bcebos.com/libs/auto_log-1.2.0-py3-none-any.whl ``` - 安装PaddleSlim (可选) ``` diff --git a/test_tipc/docs/mac_test_train_inference_python.md b/test_tipc/docs/mac_test_train_inference_python.md index c37291a8fc9b239564adce8f556565f51f2a9475..759ea516430183a1b949ed5b69e24cceac8b6125 100644 --- a/test_tipc/docs/mac_test_train_inference_python.md +++ b/test_tipc/docs/mac_test_train_inference_python.md @@ -1,6 +1,6 @@ # Mac端基础训练预测功能测试 -Mac端基础训练预测功能测试的主程序为`test_train_inference_python.sh`,可以测试基于Python的模型CPU训练,包括裁剪、量化、蒸馏训练,以及评估、CPU推理等基本功能。 +Mac端基础训练预测功能测试的主程序为`test_train_inference_python.sh`,可以测试基于Python的模型CPU训练,包括裁剪、PACT在线量化、蒸馏训练,以及评估、CPU推理等基本功能。 注:Mac端测试用法同linux端测试方法类似,但是无需测试需要在GPU上运行的测试。 @@ -10,7 +10,7 @@ Mac端基础训练预测功能测试的主程序为`test_train_inference_python. | 算法名称 | 模型名称 | 单机单卡(CPU) | 单机多卡 | 多机多卡 | 模型压缩(CPU) | | :---- | :---- | :---- | :---- | :---- | :---- | -| DB | ch_ppocr_mobile_v2.0_det| 正常训练 | - | - | 正常训练:FPGM裁剪、PACT量化
离线量化(无需训练) | +| DB | ch_ppocr_mobile_v2.0_det| 正常训练 | - | - | 正常训练:FPGM裁剪、PACT量化 | - 预测相关:基于训练是否使用量化,可以将训练产出的模型可以分为`正常模型`和`量化模型`,这两类模型对应的预测功能汇总如下, @@ -26,19 +26,14 @@ Mac端基础训练预测功能测试的主程序为`test_train_inference_python. Mac端无GPU,环境准备只需要Python环境即可,安装PaddlePaddle等依赖参考下述文档。 ### 2.1 安装依赖 -- 安装PaddlePaddle >= 2.0 +- 安装PaddlePaddle >= 2.3 - 安装PaddleOCR依赖 ``` pip install -r ../requirements.txt ``` - 安装autolog(规范化日志输出工具) ``` - git clone https://github.com/LDOUBLEV/AutoLog - cd AutoLog - pip install -r requirements.txt - python setup.py bdist_wheel - pip install ./dist/auto_log-1.0.0-py3-none-any.whl - cd ../ + pip install https://paddleocr.bj.bcebos.com/libs/auto_log-1.2.0-py3-none-any.whl ``` - 安装PaddleSlim (可选) ``` @@ -49,53 +44,46 @@ Mac端无GPU,环境准备只需要Python环境即可,安装PaddlePaddle等 ### 2.2 功能测试 -先运行`prepare.sh`准备数据和模型,然后运行`test_train_inference_python.sh`进行测试,最终在```test_tipc/output```目录下生成`python_infer_*.log`格式的日志文件。 +先运行`prepare.sh`准备数据和模型,然后运行`test_train_inference_python.sh`进行测试,最终在```test_tipc/output```目录下生成`,model_name/lite_train_lite_infer/*.log`格式的日志文件。 -`test_train_inference_python.sh`包含5种运行模式,每种模式的运行数据不同,分别用于测试速度和精度,分别是: +`test_train_inference_python.sh`包含基础链条的4种运行模式,每种模式的运行数据不同,分别用于测试速度和精度,分别是: - 模式1:lite_train_lite_infer,使用少量数据训练,用于快速验证训练到预测的走通流程,不验证精度和速度; ```shell # 同linux端运行不同的是,Mac端测试使用新的配置文件mac_ppocr_det_mobile_params.txt, # 配置文件中默认去掉了GPU和mkldnn相关的测试链条 -bash test_tipc/prepare.sh ./test_tipc/configs/ch_ppocr_mobile_v2.0_det/train_mac_cpu_normal_normal_infer_python_mac_cpu.txt 'lite_train_lite_infer' -bash test_tipc/test_train_inference_python.sh ./test_tipc/configs/ch_ppocr_mobile_v2.0_det/train_mac_cpu_normal_normal_infer_python_mac_cpu.txt 'lite_train_lite_infer' +bash test_tipc/prepare.sh ./test_tipc/configs/ch_ppocr_mobile_v2_0_det/train_mac_cpu_normal_normal_infer_python_mac_cpu.txt 'lite_train_lite_infer' +bash test_tipc/test_train_inference_python.sh ./test_tipc/configs/ch_ppocr_mobile_v2_0_det/train_mac_cpu_normal_normal_infer_python_mac_cpu.txt 'lite_train_lite_infer' ``` - 模式2:lite_train_whole_infer,使用少量数据训练,一定量数据预测,用于验证训练后的模型执行预测,预测速度是否合理; ```shell -bash test_tipc/prepare.sh ./test_tipc/configs/ch_ppocr_mobile_v2.0_det/train_mac_cpu_normal_normal_infer_python_mac_cpu.txt 'lite_train_whole_infer' -bash test_tipc/test_train_inference_python.sh ./test_tipc/configs/ch_ppocr_mobile_v2.0_det/train_mac_cpu_normal_normal_infer_python_mac_cpu.txt 'lite_train_whole_infer' +bash test_tipc/prepare.sh ./test_tipc/configs/ch_ppocr_mobile_v2_0_det/train_mac_cpu_normal_normal_infer_python_mac_cpu.txt 'lite_train_whole_infer' +bash test_tipc/test_train_inference_python.sh ./test_tipc/configs/ch_ppocr_mobile_v2_0_det/train_mac_cpu_normal_normal_infer_python_mac_cpu.txt 'lite_train_whole_infer' ``` - 模式3:whole_infer,不训练,全量数据预测,走通开源模型评估、动转静,检查inference model预测时间和精度; ```shell -bash test_tipc/prepare.sh ./test_tipc/configs/ch_ppocr_mobile_v2.0_det/train_mac_cpu_normal_normal_infer_python_mac_cpu.txt 'whole_infer' +bash test_tipc/prepare.sh ./test_tipc/configs/ch_ppocr_mobile_v2_0_det/train_mac_cpu_normal_normal_infer_python_mac_cpu.txt 'whole_infer' # 用法1: -bash test_tipc/test_train_inference_python.sh ./test_tipc/configs/ch_ppocr_mobile_v2.0_det/train_mac_cpu_normal_normal_infer_python_mac_cpu.txt 'whole_infer' +bash test_tipc/test_train_inference_python.sh ./test_tipc/configs/ch_ppocr_mobile_v2_0_det/train_mac_cpu_normal_normal_infer_python_mac_cpu.txt 'whole_infer' # 用法2: 指定GPU卡预测,第三个传入参数为GPU卡号 -bash test_tipc/test_train_inference_python.sh 
./test_tipc/configs/ch_ppocr_mobile_v2.0_det/train_mac_cpu_normal_normal_infer_python_mac_cpu.txt 'whole_infer' '1' +bash test_tipc/test_train_inference_python.sh ./test_tipc/configs/ch_ppocr_mobile_v2_0_det/train_mac_cpu_normal_normal_infer_python_mac_cpu.txt 'whole_infer' '1' ``` - 模式4:whole_train_whole_infer,CE: 全量数据训练,全量数据预测,验证模型训练精度,预测精度,预测速度;(Mac端不建议运行此模式) ```shell -bash test_tipc/prepare.sh ./test_tipc/configs/ch_ppocr_mobile_v2.0_det/train_mac_cpu_normal_normal_infer_python_mac_cpu.txt 'whole_train_whole_infer' -bash test_tipc/test_train_inference_python.sh ./test_tipc/configs/ch_ppocr_mobile_v2.0_det/train_mac_cpu_normal_normal_infer_python_mac_cpu.txt 'whole_train_whole_infer' +bash test_tipc/prepare.sh ./test_tipc/configs/ch_ppocr_mobile_v2_0_det/train_mac_cpu_normal_normal_infer_python_mac_cpu.txt 'whole_train_whole_infer' +bash test_tipc/test_train_inference_python.sh ./test_tipc/configs/ch_ppocr_mobile_v2_0_det/train_mac_cpu_normal_normal_infer_python_mac_cpu.txt 'whole_train_whole_infer' ``` -- 模式5:klquant_whole_infer,测试离线量化; -```shell -bash test_tipc/prepare.sh ./test_tipc/configs/ch_ppocr_mobile_v2.0_det_KL/model_linux_gpu_normal_normal_infer_python_mac_cpu.txt 'klquant_whole_infer' -bash test_tipc/test_train_inference_python.sh ./test_tipc/configs/ch_ppocr_mobile_v2.0_det_KL/model_linux_gpu_normal_normal_infer_python_mac_cpu.txt 'klquant_whole_infer' -``` - 运行相应指令后,在`test_tipc/output`文件夹下自动会保存运行日志。如`lite_train_lite_infer`模式下,会运行训练+inference的链条,因此,在`test_tipc/output`文件夹有以下文件: ``` -test_tipc/output/ +test_tipc/output/model_name/lite_train_lite_infer/ |- results_python.log # 运行指令状态的日志 |- norm_train_gpus_-1_autocast_null/ # CPU上正常训练的训练日志和模型保存文件夹 -|- pact_train_gpus_-1_autocast_null/ # CPU上量化训练的训练日志和模型保存文件夹 ...... -|- python_infer_cpu_usemkldnn_False_threads_1_batchsize_1.log # CPU上关闭Mkldnn线程数设置为1,测试batch_size=1条件下的预测运行日志 +|- python_infer_cpu_usemkldnn_False_threads_1_precision_fp32_batchsize_1.log # CPU上关闭Mkldnn线程数设置为1,测试batch_size=1条件下的fp32精度预测运行日志 ...... 
``` diff --git a/test_tipc/docs/test_inference_cpp.md b/test_tipc/docs/test_inference_cpp.md index e662f4bacc0b69bd605a79dac0e36c99daac87d5..5d8aeda6c401b48892de1006c2a024447823defa 100644 --- a/test_tipc/docs/test_inference_cpp.md +++ b/test_tipc/docs/test_inference_cpp.md @@ -17,15 +17,15 @@ C++预测功能测试的主程序为`test_inference_cpp.sh`,可以测试基于 运行环境配置请参考[文档](./install.md)的内容配置TIPC的运行环境。 ### 2.1 功能测试 -先运行`prepare.sh`准备数据和模型,然后运行`test_inference_cpp.sh`进行测试,最终在```test_tipc/output```目录下生成`cpp_infer_*.log`后缀的日志文件。 +先运行`prepare.sh`准备数据和模型,然后运行`test_inference_cpp.sh`进行测试,最终在```test_tipc/output/{model_name}/cpp_infer```目录下生成`cpp_infer_*.log`后缀的日志文件。 ```shell -bash test_tipc/prepare.sh ./test_tipc/configs/ch_ppocr_mobile_v2.0_det/model_linux_gpu_normal_normal_infer_cpp_linux_gpu_cpu.txt "cpp_infer" +bash test_tipc/prepare.sh ./test_tipc/configs/ch_PP-OCRv2_rec/model_linux_gpu_normal_normal_infer_cpp_linux_gpu_cpu.txt "cpp_infer" # 用法1: -bash test_tipc/test_inference_cpp.sh test_tipc/configs/ch_ppocr_mobile_v2.0_det/model_linux_gpu_normal_normal_infer_cpp_linux_gpu_cpu.txt +bash test_tipc/test_inference_cpp.sh test_tipc/configs/ch_PP-OCRv2_rec/model_linux_gpu_normal_normal_infer_cpp_linux_gpu_cpu.txt # 用法2: 指定GPU卡预测,第三个传入参数为GPU卡号 -bash test_tipc/test_inference_cpp.sh test_tipc/configs/ch_ppocr_mobile_v2.0_det/model_linux_gpu_normal_normal_infer_cpp_linux_gpu_cpu.txt '1' +bash test_tipc/test_inference_cpp.sh test_tipc/configs/ch_PP-OCRv2_rec/model_linux_gpu_normal_normal_infer_cpp_linux_gpu_cpu.txt '1' ``` 运行预测指令后,在`test_tipc/output`文件夹下自动会保存运行日志,包括以下文件: @@ -33,23 +33,21 @@ bash test_tipc/test_inference_cpp.sh test_tipc/configs/ch_ppocr_mobile_v2.0_det/ ```shell test_tipc/output/ |- results_cpp.log # 运行指令状态的日志 -|- cpp_infer_cpu_usemkldnn_False_threads_1_precision_fp32_batchsize_1.log # CPU上不开启Mkldnn,线程数设置为1,测试batch_size=1条件下的预测运行日志 -|- cpp_infer_cpu_usemkldnn_False_threads_6_precision_fp32_batchsize_1.log # CPU上不开启Mkldnn,线程数设置为6,测试batch_size=1条件下的预测运行日志 -|- cpp_infer_gpu_usetrt_False_precision_fp32_batchsize_1.log # GPU上不开启TensorRT,测试batch_size=1的fp32精度预测日志 -|- cpp_infer_gpu_usetrt_True_precision_fp16_batchsize_1.log # GPU上开启TensorRT,测试batch_size=1的fp16精度预测日志 +|- cpp_infer_cpu_usemkldnn_False_threads_6_precision_fp32_batchsize_6.log # CPU上不开启Mkldnn,线程数设置为6,测试batch_size=6条件下的预测运行日志 +|- cpp_infer_gpu_usetrt_False_precision_fp32_batchsize_6.log # GPU上不开启TensorRT,测试batch_size=6的fp32精度预测日志 ...... ``` 其中results_cpp.log中包含了每条指令的运行状态,如果运行成功会输出: ``` -Run successfully with command - ./deploy/cpp_infer/build/ppocr det --use_gpu=False --enable_mkldnn=False --cpu_threads=6 --det_model_dir=./inference/ch_ppocr_mobile_v2.0_det_infer/ --rec_batch_num=1 --image_dir=./inference/ch_det_data_50/all-sum-510/ --benchmar k=True > ./test_tipc/output/cpp_infer_cpu_usemkldnn_False_threads_6_precision_fp32_batchsize_1.log 2>&1 ! -Run successfully with command - ./deploy/cpp_infer/build/ppocr det --use_gpu=True --use_tensorrt=False --precision=fp32 --det_model_dir=./inference/ch_ppocr_mobile_v2.0_det_infer/ --rec_batch_num=1 --image_dir=./inference/ch_det_data_50/all-sum-510/ --benchmark =True > ./test_tipc/output/cpp_infer_gpu_usetrt_False_precision_fp32_batchsize_1.log 2>&1 ! 
+[33m Run successfully with command - ch_PP-OCRv2_rec - ./deploy/cpp_infer/build/ppocr --rec_char_dict_path=./ppocr/utils/ppocr_keys_v1.txt --rec_img_h=32 --use_gpu=True --use_tensorrt=False --precision=fp32 --rec_model_dir=./inference/ch_PP-OCRv2_rec_infer/ --rec_batch_num=6 --image_dir=./inference/rec_inference/ --benchmark=True --det=False --rec=True --cls=False --use_angle_cls=False > ./test_tipc/output/ch_PP-OCRv2_rec/cpp_infer/cpp_infer_gpu_usetrt_False_precision_fp32_batchsize_6.log 2>&1 !  + Run successfully with command - ch_PP-OCRv2_rec - ./deploy/cpp_infer/build/ppocr --rec_char_dict_path=./ppocr/utils/ppocr_keys_v1.txt --rec_img_h=32 --use_gpu=False --enable_mkldnn=False --cpu_threads=6 --rec_model_dir=./inference/ch_PP-OCRv2_rec_infer/ --rec_batch_num=6 --image_dir=./inference/rec_inference/ --benchmark=True --det=False --rec=True --cls=False --use_angle_cls=False > ./test_tipc/output/ch_PP-OCRv2_rec/cpp_infer/cpp_infer_cpu_usemkldnn_False_threads_6_precision_fp32_batchsize_6.log 2>&1 !  ...... ``` 如果运行失败,会输出: ``` -Run failed with command - ./deploy/cpp_infer/build/ppocr det --use_gpu=True --use_tensorrt=True --precision=fp32 --det_model_dir=./inference/ch_ppocr_mobile_v2.0_det_infer/ --rec_batch_num=1 --image_dir=./inference/ch_det_data_50/all-sum-510/ --benchmark=True > ./test_tipc/output/cpp_infer_gpu_usetrt_True_precision_fp32_batchsize_1.log 2>&1 ! -Run failed with command - ./deploy/cpp_infer/build/ppocr det --use_gpu=True --use_tensorrt=True --precision=fp16 --det_model_dir=./inference/ch_ppocr_mobile_v2.0_det_infer/ --rec_batch_num=1 --image_dir=./inference/ch_det_data_50/all-sum-510/ --benchmark=True > ./test_tipc/output/cpp_infer_gpu_usetrt_True_precision_fp16_batchsize_1.log 2>&1 ! +Run failed with command - ch_PP-OCRv2_rec - ./deploy/cpp_infer/build/ppocr --rec_char_dict_path=./ppocr/utils/ppocr_keys_v1.txt --rec_img_h=32 --use_gpu=True --use_tensorrt=False --precision=fp32 --rec_model_dir=./inference/ch_PP-OCRv2_rec_infer/ --rec_batch_num=6 --image_dir=./inference/rec_inference/ --benchmark=True --det=False --rec=True --cls=False --use_angle_cls=False > ./test_tipc/output/ch_PP-OCRv2_rec/cpp_infer/cpp_infer_gpu_usetrt_False_precision_fp32_batchsize_6.log 2>&1 ! +Run failed with command - ch_PP-OCRv2_rec - ./deploy/cpp_infer/build/ppocr --rec_char_dict_path=./ppocr/utils/ppocr_keys_v1.txt --rec_img_h=32 --use_gpu=False --enable_mkldnn=False --cpu_threads=6 --rec_model_dir=./inference/ch_PP-OCRv2_rec_infer/ --rec_batch_num=6 --image_dir=./inference/rec_inference/ --benchmark=True --det=False --rec=True --cls=False --use_angle_cls=False > ./test_tipc/output/ch_PP-OCRv2_rec/cpp_infer/cpp_infer_cpu_usemkldnn_False_threads_6_precision_fp32_batchsize_6.log 2>&1 ! ...... ``` 可以很方便的根据results_cpp.log中的内容判定哪一个指令运行错误。 diff --git a/test_tipc/docs/test_paddle2onnx.md b/test_tipc/docs/test_paddle2onnx.md index df2734771e9252a40811c42ead03abbff1b7a1a3..299621d01122995434646351edfd524a0aa3206a 100644 --- a/test_tipc/docs/test_paddle2onnx.md +++ b/test_tipc/docs/test_paddle2onnx.md @@ -15,29 +15,30 @@ PaddleServing预测功能测试的主程序为`test_paddle2onnx.sh`,可以测 ## 2. 
测试流程 ### 2.1 功能测试 -先运行`prepare.sh`准备数据和模型,然后运行`test_paddle2onnx.sh`进行测试,最终在```test_tipc/output```目录下生成`paddle2onnx_infer_*.log`后缀的日志文件。 +先运行`prepare.sh`准备数据和模型,然后运行`test_paddle2onnx.sh`进行测试,最终在```test_tipc/output/{model_name}/paddle2onnx```目录下生成`paddle2onnx_infer_*.log`后缀的日志文件。 ```shell -bash test_tipc/prepare.sh ./test_tipc/configs/ppocr_det_mobile/model_linux_gpu_normal_normal_paddle2onnx_python_linux_cpu.txt "paddle2onnx_infer" +bash test_tipc/prepare.sh ./test_tipc/configs/ch_PP-OCRv2_det/model_linux_gpu_normal_normal_paddle2onnx_python_linux_cpu.txt "paddle2onnx_infer" # 用法: -bash test_tipc/test_paddle2onnx.sh ./test_tipc/configs/ppocr_det_mobile/model_linux_gpu_normal_normal_paddle2onnx_python_linux_cpu.txt +bash test_tipc/test_paddle2onnx.sh ./test_tipc/configs/ch_PP-OCRv2_det/model_linux_gpu_normal_normal_paddle2onnx_python_linux_cpu.txt ``` #### 运行结果 -各测试的运行情况会打印在 `test_tipc/output/results_paddle2onnx.log` 中: +各测试的运行情况会打印在 `test_tipc/output/{model_name}/paddle2onnx/results_paddle2onnx.log` 中: 运行成功时会输出: ``` -Run successfully with command - paddle2onnx --model_dir=./inference/ch_ppocr_mobile_v2.0_det_infer/ --model_filename=inference.pdmodel --params_filename=inference.pdiparams --save_file=./inference/det_mobile_onnx/model.onnx --opset_version=10 --enable_onnx_checker=True! -Run successfully with command - python test_tipc/onnx_inference/predict_det.py --use_gpu=False --image_dir=./inference/ch_det_data_50/all-sum-510/ --det_model_dir=./inference/det_mobile_onnx/model.onnx 2>&1 ! +Run successfully with command - ch_PP-OCRv2_det - paddle2onnx --model_dir=./inference/ch_PP-OCRv2_det_infer/ --model_filename=inference.pdmodel --params_filename=inference.pdiparams --save_file=./inference/det_v2_onnx/model.onnx --opset_version=10 --enable_onnx_checker=True! +Run successfully with command - ch_PP-OCRv2_det - python3.7 tools/infer/predict_det.py --use_gpu=True --image_dir=./inference/ch_det_data_50/all-sum-510/ --det_model_dir=./inference/det_v2_onnx/model.onnx --use_onnx=True > ./test_tipc/output/ch_PP-OCRv2_det/paddle2onnx/paddle2onnx_infer_gpu.log 2>&1 ! +Run successfully with command - ch_PP-OCRv2_det - python3.7 tools/infer/predict_det.py --use_gpu=False --image_dir=./inference/ch_det_data_50/all-sum-510/ --det_model_dir=./inference/det_v2_onnx/model.onnx --use_onnx=True > ./test_tipc/output/ch_PP-OCRv2_det/paddle2onnx/paddle2onnx_infer_cpu.log 2>&1 ! ``` 运行失败时会输出: ``` -Run failed with command - paddle2onnx --model_dir=./inference/ch_ppocr_mobile_v2.0_det_infer/ --model_filename=inference.pdmodel --params_filename=inference.pdiparams --save_file=./inference/det_mobile_onnx/model.onnx --opset_version=10 --enable_onnx_checker=True! +Run failed with command - ch_PP-OCRv2_det - paddle2onnx --model_dir=./inference/ch_PP-OCRv2_det_infer/ --model_filename=inference.pdmodel --params_filename=inference.pdiparams --save_file=./inference/det_v2_onnx/model.onnx --opset_version=10 --enable_onnx_checker=True! ... ``` diff --git a/test_tipc/docs/test_ptq_inference_python.md b/test_tipc/docs/test_ptq_inference_python.md new file mode 100644 index 0000000000000000000000000000000000000000..7887c0b5c93decac61f56d8c8b92018f40c78b32 --- /dev/null +++ b/test_tipc/docs/test_ptq_inference_python.md @@ -0,0 +1,51 @@ +# Linux GPU/CPU KL离线量化训练推理测试 + +Linux GPU/CPU KL离线量化训练推理测试的主程序为`test_ptq_inference_python.sh`,可以测试基于Python的模型训练、评估、推理等基本功能。 + +## 1. 
测试结论汇总 +- 训练相关: + +| 算法名称 | 模型名称 | 单机单卡 | +| :----: | :----: | :----: | +| | model_name | KL离线量化训练 | + +- 推理相关: + +| 算法名称 | 模型名称 | device_CPU | device_GPU | batchsize | +| :----: | :----: | :----: | :----: | :----: | +| | model_name | 支持 | 支持 | 1 | + +## 2. 测试流程 + +### 2.1 准备数据和模型 + +先运行`prepare.sh`准备数据和模型,然后运行`test_ptq_inference_python.sh`进行测试,最终在```test_tipc/output/{model_name}/whole_infer```目录下生成`python_infer_*.log`后缀的日志文件。 + +```shell +bash test_tipc/prepare.sh ./test_tipc/configs/ch_PP-OCRv2_det/train_ptq_infer_python.txt "whole_infer" + +# 用法: +bash test_tipc/test_ptq_inference_python.sh ./test_tipc/configs/ch_PP-OCRv2_det/train_ptq_infer_python.txt "whole_infer" +``` + +#### 运行结果 + +各测试的运行情况会打印在 `test_tipc/output/{model_name}/paddle2onnx/results_paddle2onnx.log` 中: +运行成功时会输出: + +``` +Run successfully with command - ch_PP-OCRv2_det_KL - python3.7 deploy/slim/quantization/quant_kl.py -c configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml -o Global.pretrained_model=./inference/ch_PP-OCRv2_det_infer/ Global.save_inference_dir=./inference/ch_PP-OCRv2_det_infer/_klquant > ./test_tipc/output/ch_PP-OCRv2_det_KL/whole_infer/whole_infer_export_0.log 2>&1 ! +Run successfully with command - ch_PP-OCRv2_det_KL - python3.7 tools/infer/predict_det.py --use_gpu=False --enable_mkldnn=False --cpu_threads=6 --det_model_dir=./inference/ch_PP-OCRv2_det_infer/_klquant --rec_batch_num=1 --image_dir=./inference/ch_det_data_50/all-sum-510/ --precision=int8 > ./test_tipc/output/ch_PP-OCRv2_det_KL/whole_infer/python_infer_cpu_usemkldnn_False_threads_6_precision_int8_batchsize_1.log 2>&1 ! +Run successfully with command - ch_PP-OCRv2_det_KL - python3.7 tools/infer/predict_det.py --use_gpu=True --use_tensorrt=False --precision=int8 --det_model_dir=./inference/ch_PP-OCRv2_det_infer/_klquant --rec_batch_num=1 --image_dir=./inference/ch_det_data_50/all-sum-510/ > ./test_tipc/output/ch_PP-OCRv2_det_KL/whole_infer/python_infer_gpu_usetrt_False_precision_int8_batchsize_1.log 2>&1 ! +``` + +运行失败时会输出: + +``` +Run failed with command - ch_PP-OCRv2_det_KL - python3.7 deploy/slim/quantization/quant_kl.py -c configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml -o Global.pretrained_model=./inference/ch_PP-OCRv2_det_infer/ Global.save_inference_dir=./inference/ch_PP-OCRv2_det_infer/_klquant > ./test_tipc/output/ch_PP-OCRv2_det_KL/whole_infer/whole_infer_export_0.log 2>&1 ! +... +``` + +## 3. 
更多教程 + +本文档为功能测试用,更详细的量化使用教程请参考:[量化](../../deploy/slim/quantization/README.md) diff --git a/test_tipc/docs/test_serving.md b/test_tipc/docs/test_serving.md index 71f01c0d5ff47004d70baa17b404c10714a6fb64..ef38888784b600233fe85afe3c1064caf12173d4 100644 --- a/test_tipc/docs/test_serving.md +++ b/test_tipc/docs/test_serving.md @@ -18,71 +18,44 @@ PaddleServing预测功能测试的主程序为`test_serving_infer_python.sh`和` ### 2.1 功能测试 **python serving** -先运行`prepare.sh`准备数据和模型,然后运行`test_serving_infer_python.sh`进行测试,最终在```test_tipc/output```目录下生成`serving_infer_python*.log`后缀的日志文件。 +先运行`prepare.sh`准备数据和模型,然后运行`test_serving_infer_python.sh`进行测试,最终在```test_tipc/output/{model_name}/serving_infer/python```目录下生成`python_*.log`后缀的日志文件。 ```shell -bash test_tipc/prepare.sh ./test_tipc/configs/ch_ppocr_mobile_v2.0_det/model_linux_gpu_normal_normal_serving_python_linux_gpu_cpu.txt "serving_infer" +bash test_tipc/prepare.sh ./test_tipc/configs/ch_PP-OCRv2/model_linux_gpu_normal_normal_serving_python_linux_gpu_cpu.txt "serving_infer" # 用法: -bash test_tipc/test_serving_infer_python.sh ./test_tipc/configs/ch_ppocr_mobile_v2.0_det/model_linux_gpu_normal_normal_serving_python_linux_gpu_cpu.txt "serving_infer" +bash test_tipc/test_serving_infer_python.sh ./test_tipc/configs/ch_PP-OCRv2/model_linux_gpu_normal_normal_serving_python_linux_gpu_cpu.txt "serving_infer" ``` **cpp serving** -先运行`prepare.sh`准备数据和模型,然后运行`test_serving_infer_cpp.sh`进行测试,最终在```test_tipc/output```目录下生成`serving_infer_cpp*.log`后缀的日志文件。 +先运行`prepare.sh`准备数据和模型,然后运行`test_serving_infer_cpp.sh`进行测试,最终在```test_tipc/output/{model_name}/serving_infer/cpp```目录下生成`cpp_*.log`后缀的日志文件。 ```shell -bash test_tipc/prepare.sh ./test_tipc/configs/ch_ppocr_mobile_v2.0/model_linux_gpu_normal_normal_serving_python_linux_gpu_cpu.txt "serving_infer" +bash test_tipc/prepare.sh ./test_tipc/configs/ch_PP-OCRv2/model_linux_gpu_normal_normal_serving_cpp_linux_gpu_cpu.txt "serving_infer" # 用法: -bash test_tipc/test_serving_infer_cpp.sh ./test_tipc/configs/ch_ppocr_mobile_v2.0/model_linux_gpu_normal_normal_serving_cpp_linux_gpu_cpu.txt "serving_infer" +bash test_tipc/test_serving_infer_cpp.sh ./test_tipc/configs/ch_PP-OCRv2/model_linux_gpu_normal_normal_serving_cpp_linux_gpu_cpu.txt "serving_infer" ``` #### 运行结果 -各测试的运行情况会打印在 `test_tipc/output/results_serving.log` 中: +各测试的运行情况会打印在 `test_tipc/output/{model_name}/serving_infer/python(cpp)/results_python(cpp)_serving.log` 中: 运行成功时会输出: ``` -Run successfully with command - python3.7 pipeline_http_client.py --image_dir=../../doc/imgs > ../../tests/output/server_infer_cpu_usemkldnn_True_threads_1_batchsize_1.log 2>&1 ! -Run successfully with command - xxxxx +Run successfully with command - ch_PP-OCRv2_rec - nohup python3.7 web_service_rec.py --config=config.yml --opt op.rec.concurrency="1" op.det.local_service_conf.devices= op.det.local_service_conf.use_mkldnn=False op.det.local_service_conf.thread_num=6 op.rec.local_service_conf.model_config=ppocr_rec_v2_serving > ./test_tipc/output/ch_PP-OCRv2_rec/serving_infer/python/python_server_cpu_usemkldnn_False_threads_6.log 2>&1 &! +Run successfully with command - ch_PP-OCRv2_rec - python3.7 pipeline_http_client.py --det=False --image_dir=../../inference/rec_inference > ./test_tipc/output/ch_PP-OCRv2_rec/serving_infer/python/python_client_cpu_pipeline_http_usemkldnn_False_threads_6_batchsize_1.log 2>&1 ! ... ``` 运行失败时会输出: ``` -Run failed with command - python3.7 pipeline_http_client.py --image_dir=../../doc/imgs > ../../tests/output/server_infer_cpu_usemkldnn_True_threads_1_batchsize_1.log 2>&1 ! 
-Run failed with command - python3.7 pipeline_http_client.py --image_dir=../../doc/imgs > ../../tests/output/server_infer_cpu_usemkldnn_True_threads_6_batchsize_1.log 2>&1 ! -Run failed with command - xxxxx +Run failed with command - ch_PP-OCRv2_rec - nohup python3.7 web_service_rec.py --config=config.yml --opt op.rec.concurrency="1" op.det.local_service_conf.devices= op.det.local_service_conf.use_mkldnn=False op.det.local_service_conf.thread_num=6 op.rec.local_service_conf.model_config=ppocr_rec_v2_serving > ./test_tipc/output/ch_PP-OCRv2_rec/serving_infer/python/python_server_cpu_usemkldnn_False_threads_6.log 2>&1 &! +Run failed with command - ch_PP-OCRv2_rec - python3.7 pipeline_http_client.py --det=False --image_dir=../../inference/rec_inference > ./test_tipc/output/ch_PP-OCRv2_rec/serving_infer/python/python_client_cpu_pipeline_http_usemkldnn_False_threads_6_batchsize_1.log 2>&1 ! ... ``` -详细的预测结果会存在 test_tipc/output/ 文件夹下,例如`server_infer_gpu_usetrt_True_precision_fp16_batchsize_1.log`中会返回检测框的坐标: - -``` -{'err_no': 0, 'err_msg': '', 'key': ['dt_boxes'], 'value': ['[[[ 78. 642.]\n [409. 640.]\n [409. 657.]\n -[ 78. 659.]]\n\n [[ 75. 614.]\n [211. 614.]\n [211. 635.]\n [ 75. 635.]]\n\n -[[103. 554.]\n [135. 554.]\n [135. 575.]\n [103. 575.]]\n\n [[ 75. 531.]\n -[347. 531.]\n [347. 549.]\n [ 75. 549.] ]\n\n [[ 76. 503.]\n [309. 498.]\n -[309. 521.]\n [ 76. 526.]]\n\n [[163. 462.]\n [317. 462.]\n [317. 493.]\n -[163. 493.]]\n\n [[324. 431.]\n [414. 431.]\n [414. 452.]\n [324. 452.]]\n\n -[[ 76. 412.]\n [208. 408.]\n [209. 424.]\n [ 76. 428.]]\n\n [[307. 409.]\n -[428. 409.]\n [428. 426.]\n [307 . 426.]]\n\n [[ 74. 385.]\n [217. 382.]\n -[217. 400.]\n [ 74. 403.]]\n\n [[308. 381.]\n [427. 380.]\n [427. 400.]\n -[308. 401.]]\n\n [[ 74. 363.]\n [195. 362.]\n [195. 378.]\n [ 74. 379.]]\n\n -[[303. 359.]\n [423. 357.]\n [423. 375.]\n [303. 377.]]\n\n [[ 70. 336.]\n -[239. 334.]\n [239. 354.]\ n [ 70. 356.]]\n\n [[ 70. 312.]\n [204. 310.]\n -[204. 327.]\n [ 70. 330.]]\n\n [[303. 308.]\n [419. 306.]\n [419. 326.]\n -[303. 328.]]\n\n [[113. 2 72.]\n [246. 270.]\n [247. 299.]\n [113. 301.]]\n\n - [[361. 269.]\n [384. 269.]\n [384. 296.]\n [361. 296.]]\n\n [[ 70. 250.]\n - [243. 246.]\n [243. 265.]\n [ 70. 269.]]\n\n [[ 65. 221.]\n [187. 220.]\n -[187. 240.]\n [ 65. 241.]]\n\n [[337. 216.]\n [382. 216.]\n [382. 240.]\n -[337. 240.]]\n\n [ [ 65. 196.]\n [247. 193.]\n [247. 213.]\n [ 65. 216.]]\n\n -[[296. 197.]\n [423. 191.]\n [424. 209.]\n [296. 215.]]\n\n [[ 65. 167.]\n [244. 167.]\n -[244. 186.]\n [ 65. 186.]]\n\n [[ 67. 139.]\n [290. 139.]\n [290. 159.]\n [ 67. 159.]]\n\n -[[ 68. 113.]\n [410. 113.]\n [410. 128.]\n [ 68. 129.] ]\n\n [[277. 87.]\n [416. 87.]\n -[416. 108.]\n [277. 108.]]\n\n [[ 79. 28.]\n [132. 28.]\n [132. 62.]\n [ 79. 62.]]\n\n -[[163. 17.]\n [410. 14.]\n [410. 50.]\n [163. 53.]]]']} -``` +详细的预测结果会存在 test_tipc/output/{model_name}/serving_infer/python(cpp)/ 文件夹下 ## 3. 
更多教程 diff --git a/test_tipc/docs/test_train_inference_python.md b/test_tipc/docs/test_train_inference_python.md index 99de9400797493f429f8176a9b6b374a76df4872..d1dbd8ee47a4dc7fb4c0bb3d26a920aab1c7ff72 100644 --- a/test_tipc/docs/test_train_inference_python.md +++ b/test_tipc/docs/test_train_inference_python.md @@ -1,6 +1,6 @@ # Linux端基础训练预测功能测试 -Linux端基础训练预测功能测试的主程序为`test_train_inference_python.sh`,可以测试基于Python的模型训练、评估、推理等基本功能,包括裁剪、量化、蒸馏。 +Linux端基础训练预测功能测试的主程序为`test_train_inference_python.sh`,可以测试基于Python的模型训练、评估、推理等基本功能,包括PACT在线量化。 - Mac端基础训练预测功能测试参考[链接](./mac_test_train_inference_python.md) - Windows端基础训练预测功能测试参考[链接](./win_test_train_inference_python.md) @@ -11,13 +11,14 @@ Linux端基础训练预测功能测试的主程序为`test_train_inference_pytho | 算法名称 | 模型名称 | 单机单卡 | 单机多卡 | 多机多卡 | 模型压缩(单机多卡) | | :---- | :---- | :---- | :---- | :---- | :---- | -| DB | ch_ppocr_mobile_v2.0_det| 正常训练
混合精度 | 正常训练 <br> 混合精度 | 正常训练 <br> 混合精度 | 正常训练:FPGM裁剪、PACT量化 <br> 离线量化(无需训练) |
-| DB | ch_ppocr_server_v2.0_det| 正常训练 <br> 混合精度 | 正常训练 <br> 混合精度 | 正常训练 <br> 混合精度 | 正常训练:FPGM裁剪、PACT量化 <br> 离线量化(无需训练) |
-| CRNN | ch_ppocr_mobile_v2.0_rec| 正常训练 <br> 混合精度 | 正常训练 <br> 混合精度 | 正常训练 <br> 混合精度 | 正常训练:PACT量化 <br> 离线量化(无需训练) |
-| CRNN | ch_ppocr_server_v2.0_rec| 正常训练 <br> 混合精度 | 正常训练 <br> 混合精度 | 正常训练 <br> 混合精度 | 正常训练:PACT量化 <br> 离线量化(无需训练) |
-|PP-OCR| ch_ppocr_mobile_v2.0| 正常训练 <br> 混合精度 | 正常训练 <br> 混合精度 | 正常训练 <br> 混合精度 | - |
-|PP-OCR| ch_ppocr_server_v2.0| 正常训练 <br> 混合精度 | 正常训练 <br> 混合精度 | 正常训练 <br> 混合精度 | - |
+| DB | ch_ppocr_mobile_v2_0_det| 正常训练 <br> 混合精度 | 正常训练 <br> 混合精度 | 正常训练 <br> 混合精度 | 正常训练:FPGM裁剪、PACT量化 |
+| DB | ch_ppocr_server_v2_0_det| 正常训练 <br> 混合精度 | 正常训练 <br> 混合精度 | 正常训练 <br> 混合精度 | 正常训练:FPGM裁剪、PACT量化 |
+| CRNN | ch_ppocr_mobile_v2_0_rec| 正常训练 <br> 混合精度 | 正常训练 <br> 混合精度 | 正常训练 <br> 混合精度 | 正常训练:PACT量化 |
+| CRNN | ch_ppocr_server_v2_0_rec| 正常训练 <br> 混合精度 | 正常训练 <br> 混合精度 | 正常训练 <br> 混合精度 | 正常训练:PACT量化 |
+|PP-OCR| ch_ppocr_mobile_v2_0| 正常训练 <br> 混合精度 | 正常训练 <br> 混合精度 | 正常训练 <br> 混合精度 | - |
+|PP-OCR| ch_ppocr_server_v2_0| 正常训练 <br> 混合精度 | 正常训练 <br> 混合精度 | 正常训练 <br> 混合精度 | - |
|PP-OCRv2| ch_PP-OCRv2 | 正常训练 <br> 混合精度 | 正常训练 <br> 混合精度 | 正常训练 <br> 混合精度 | - |
+|PP-OCRv3| ch_PP-OCRv3 | 正常训练 <br> 混合精度 | 正常训练 <br> 混合精度 | 正常训练 <br>
混合精度 | - | - 预测相关:基于训练是否使用量化,可以将训练产出的模型可以分为`正常模型`和`量化模型`,这两类模型对应的预测功能汇总如下, @@ -35,19 +36,14 @@ Linux端基础训练预测功能测试的主程序为`test_train_inference_pytho 运行环境配置请参考[文档](./install.md)的内容配置TIPC的运行环境。 ### 2.1 安装依赖 -- 安装PaddlePaddle >= 2.0 +- 安装PaddlePaddle >= 2.3 - 安装PaddleOCR依赖 ``` pip3 install -r ../requirements.txt ``` - 安装autolog(规范化日志输出工具) ``` - git clone https://github.com/LDOUBLEV/AutoLog - cd AutoLog - pip3 install -r requirements.txt - python3 setup.py bdist_wheel - pip3 install ./dist/auto_log-1.0.0-py3-none-any.whl - cd ../ + pip3 install https://paddleocr.bj.bcebos.com/libs/auto_log-1.2.0-py3-none-any.whl ``` - 安装PaddleSlim (可选) ``` @@ -57,60 +53,57 @@ Linux端基础训练预测功能测试的主程序为`test_train_inference_pytho ### 2.2 功能测试 -先运行`prepare.sh`准备数据和模型,然后运行`test_train_inference_python.sh`进行测试,最终在```test_tipc/output```目录下生成`python_infer_*.log`格式的日志文件。 +#### 2.2.1 基础训练推理链条 +先运行`prepare.sh`准备数据和模型,然后运行`test_train_inference_python.sh`进行测试,最终在```test_tipc/output```目录下生成`,model_name/lite_train_lite_infer/*.log`格式的日志文件。 -`test_train_inference_python.sh`包含5种运行模式,每种模式的运行数据不同,分别用于测试速度和精度,分别是: +`test_train_inference_python.sh`包含基础链条的4种运行模式,每种模式的运行数据不同,分别用于测试速度和精度,分别是: - 模式1:lite_train_lite_infer,使用少量数据训练,用于快速验证训练到预测的走通流程,不验证精度和速度; ```shell -bash test_tipc/prepare.sh ./test_tipc/configs/ch_ppocr_mobile_v2.0_det/train_infer_python.txt 'lite_train_lite_infer' -bash test_tipc/test_train_inference_python.sh ./test_tipc/configs/ch_ppocr_mobile_v2.0_det/train_infer_python.txt 'lite_train_lite_infer' +bash test_tipc/prepare.sh ./test_tipc/configs/ch_ppocr_mobile_v2_0_det/train_infer_python.txt 'lite_train_lite_infer' +bash test_tipc/test_train_inference_python.sh ./test_tipc/configs/ch_ppocr_mobile_v2_0_det/train_infer_python.txt 'lite_train_lite_infer' ``` - 模式2:lite_train_whole_infer,使用少量数据训练,一定量数据预测,用于验证训练后的模型执行预测,预测速度是否合理; ```shell -bash test_tipc/prepare.sh ./test_tipc/configs/ch_ppocr_mobile_v2.0_det/train_infer_python.txt 'lite_train_whole_infer' -bash test_tipc/test_train_inference_python.sh ../test_tipc/configs/ch_ppocr_mobile_v2.0_det/train_infer_python.txt 'lite_train_whole_infer' +bash test_tipc/prepare.sh ./test_tipc/configs/ch_ppocr_mobile_v2_0_det/train_infer_python.txt 'lite_train_whole_infer' +bash test_tipc/test_train_inference_python.sh ../test_tipc/configs/ch_ppocr_mobile_v2_0_det/train_infer_python.txt 'lite_train_whole_infer' ``` - 模式3:whole_infer,不训练,全量数据预测,走通开源模型评估、动转静,检查inference model预测时间和精度; ```shell -bash test_tipc/prepare.sh ./test_tipc/configs/ch_ppocr_mobile_v2.0_det/train_infer_python.txt 'whole_infer' +bash test_tipc/prepare.sh ./test_tipc/configs/ch_ppocr_mobile_v2_0_det/train_infer_python.txt 'whole_infer' # 用法1: -bash test_tipc/test_train_inference_python.sh ../test_tipc/configs/ch_ppocr_mobile_v2.0_det/train_infer_python.txt 'whole_infer' +bash test_tipc/test_train_inference_python.sh ../test_tipc/configs/ch_ppocr_mobile_v2_0_det/train_infer_python.txt 'whole_infer' # 用法2: 指定GPU卡预测,第三个传入参数为GPU卡号 -bash test_tipc/test_train_inference_python.sh ./test_tipc/configs/ch_ppocr_mobile_v2.0_det/train_infer_python.txt 'whole_infer' '1' +bash test_tipc/test_train_inference_python.sh ./test_tipc/configs/ch_ppocr_mobile_v2_0_det/train_infer_python.txt 'whole_infer' '1' ``` - 模式4:whole_train_whole_infer,CE: 全量数据训练,全量数据预测,验证模型训练精度,预测精度,预测速度; ```shell -bash test_tipc/prepare.sh ./test_tipc/configs/ch_ppocr_mobile_v2.0_det/train_infer_python.txt 'whole_train_whole_infer' -bash test_tipc/test_train_inference_python.sh ./test_tipc/configs/ch_ppocr_mobile_v2.0_det/train_infer_python.txt 
'whole_train_whole_infer' +bash test_tipc/prepare.sh ./test_tipc/configs/ch_ppocr_mobile_v2_0_det/train_infer_python.txt 'whole_train_whole_infer' +bash test_tipc/test_train_inference_python.sh ./test_tipc/configs/ch_ppocr_mobile_v2_0_det/train_infer_python.txt 'whole_train_whole_infer' ``` -- 模式5:klquant_whole_infer,测试离线量化; -```shell -bash test_tipc/prepare.sh ./test_tipc/configs/ch_ppocr_mobile_v2.0_det_KL/model_linux_gpu_normal_normal_infer_python_linux_gpu_cpu.txt 'klquant_whole_infer' -bash test_tipc/test_train_inference_python.sh ./test_tipc/configs/ch_ppocr_mobile_v2.0_det_KL/model_linux_gpu_normal_normal_infer_python_linux_gpu_cpu.txt 'klquant_whole_infer' -``` - 运行相应指令后,在`test_tipc/output`文件夹下自动会保存运行日志。如'lite_train_lite_infer'模式下,会运行训练+inference的链条,因此,在`test_tipc/output`文件夹有以下文件: ``` -test_tipc/output/ +test_tipc/output/model_name/lite_train_lite_infer/ |- results_python.log # 运行指令状态的日志 -|- norm_train_gpus_0_autocast_null/ # GPU 0号卡上正常训练的训练日志和模型保存文件夹 -|- pact_train_gpus_0_autocast_null/ # GPU 0号卡上量化训练的训练日志和模型保存文件夹 +|- norm_train_gpus_0_autocast_null/ # GPU 0号卡上正常单机单卡训练的训练日志和模型保存文件夹 +|- norm_train_gpus_0,1_autocast_null/ # GPU 0,1号卡上正常单机多卡训练的训练日志和模型保存文件夹 ...... -|- python_infer_cpu_usemkldnn_True_threads_1_batchsize_1.log # CPU上开启Mkldnn线程数设置为1,测试batch_size=1条件下的预测运行日志 -|- python_infer_gpu_usetrt_True_precision_fp16_batchsize_1.log # GPU上开启TensorRT,测试batch_size=1的半精度预测日志 +|- python_infer_cpu_usemkldnn_False_threads_6_precision_fp32_batchsize_1.log # CPU上关闭Mkldnn线程数设置为6,测试batch_size=1条件下的fp32精度预测运行日志 +|- python_infer_gpu_usetrt_False_precision_fp32_batchsize_1.log # GPU上关闭TensorRT,测试batch_size=1的fp32精度预测日志 ...... ``` 其中`results_python.log`中包含了每条指令的运行状态,如果运行成功会输出: ``` -Run successfully with command - python3.7 tools/train.py -c tests/configs/det_mv3_db.yml -o Global.pretrained_model=./pretrain_models/MobileNetV3_large_x0_5_pretrained Global.use_gpu=True Global.save_model_dir=./tests/output/norm_train_gpus_0_autocast_null Global.epoch_num=1 Train.loader.batch_size_per_card=2 ! -Run successfully with command - python3.7 tools/export_model.py -c tests/configs/det_mv3_db.yml -o Global.pretrained_model=./tests/output/norm_train_gpus_0_autocast_null/latest Global.save_inference_dir=./tests/output/norm_train_gpus_0_autocast_null! +[33m Run successfully with command - ch_ppocr_mobile_v2_0_det - python3.7 tools/train.py -c configs/det/ch_ppocr_v2_0/ch_det_mv3_db_v2_0.yml -o Global.pretrained_model=./pretrain_models/MobileNetV3_large_x0_5_pretrained Global.use_gpu=True Global.save_model_dir=./test_tipc/output/ch_ppocr_mobile_v2_0_det/lite_train_lite_infer/norm_train_gpus_0_autocast_null Global.epoch_num=100 Train.loader.batch_size_per_card=2 !  + Run successfully with command - ch_ppocr_mobile_v2_0_det - python3.7 tools/export_model.py -c configs/det/ch_ppocr_v2_0/ch_det_mv3_db_v2_0.yml -o Global.checkpoints=./test_tipc/output/ch_ppocr_mobile_v2_0_det/lite_train_lite_infer/norm_train_gpus_0_autocast_null/latest Global.save_inference_dir=./test_tipc/output/ch_ppocr_mobile_v2_0_det/lite_train_lite_infer/norm_train_gpus_0_autocast_null > ./test_tipc/output/ch_ppocr_mobile_v2_0_det/lite_train_lite_infer/norm_train_gpus_0_autocast_null_nodes_1_export.log 2>&1 !  
+ Run successfully with command - ch_ppocr_mobile_v2_0_det - python3.7 tools/infer/predict_det.py --use_gpu=True --use_tensorrt=False --precision=fp32 --det_model_dir=./test_tipc/output/ch_ppocr_mobile_v2_0_det/lite_train_lite_infer/norm_train_gpus_0_autocast_null --rec_batch_num=1 --image_dir=./train_data/icdar2015/text_localization/ch4_test_images/ --benchmark=True > ./test_tipc/output/ch_ppocr_mobile_v2_0_det/lite_train_lite_infer/python_infer_gpu_usetrt_False_precision_fp32_batchsize_1.log 2>&1 !  + Run successfully with command - ch_ppocr_mobile_v2_0_det - python3.7 tools/infer/predict_det.py --use_gpu=False --enable_mkldnn=False --cpu_threads=6 --det_model_dir=./test_tipc/output/ch_ppocr_mobile_v2_0_det/lite_train_lite_infer/norm_train_gpus_0_autocast_null --rec_batch_num=1 --image_dir=./train_data/icdar2015/text_localization/ch4_test_images/ --benchmark=True --precision=fp32 > ./test_tipc/output/ch_ppocr_mobile_v2_0_det/lite_train_lite_infer/python_infer_cpu_usemkldnn_False_threads_6_precision_fp32_batchsize_1.log 2>&1 !  ...... ``` 如果运行失败,会输出: @@ -121,6 +114,22 @@ Run failed with command - python3.7 tools/export_model.py -c tests/configs/det_m ``` 可以很方便的根据`results_python.log`中的内容判定哪一个指令运行错误。 +#### 2.2.2 PACT在线量化链条 +此外,`test_train_inference_python.sh`还包含PACT在线量化模式,命令如下: +以ch_PP-OCRv2_det为例,如需测试其他模型更换配置即可。 + +```shell +bash test_tipc/prepare.sh ./test_tipc/configs/ch_PP-OCRv2_det/train_pact_infer_python.txt 'lite_train_lite_infer' +bash test_tipc/test_train_inference_python.sh ./test_tipc/configs/ch_PP-OCRv2_det/train_pact_infer_python.txt 'lite_train_lite_infer' +``` +#### 2.2.3 混合精度训练链条 +此外,`test_train_inference_python.sh`还包含混合精度训练模式,命令如下: +以ch_PP-OCRv2_det为例,如需测试其他模型更换配置即可。 + +```shell +bash test_tipc/prepare.sh ./test_tipc/configs/ch_PP-OCRv2_det/train_linux_gpu_normal_amp_infer_python_linux_gpu_cpu.txt 'lite_train_lite_infer' +bash test_tipc/test_train_inference_python.sh ./test_tipc/configs/ch_PP-OCRv2_det/train_linux_gpu_normal_amp_infer_python_linux_gpu_cpu.txt 'lite_train_lite_infer' +``` ### 2.3 精度测试 diff --git a/test_tipc/docs/win_test_train_inference_python.md b/test_tipc/docs/win_test_train_inference_python.md index 6e3ce93bb3123133075b9d65c64850a87de5f828..d631c38873867ef1fa6e9a03582df26b59e309a5 100644 --- a/test_tipc/docs/win_test_train_inference_python.md +++ b/test_tipc/docs/win_test_train_inference_python.md @@ -8,7 +8,7 @@ Windows端基础训练预测功能测试的主程序为`test_train_inference_pyt | 算法名称 | 模型名称 | 单机单卡 | 单机多卡 | 多机多卡 | 模型压缩(单机多卡) | | :---- | :---- | :---- | :---- | :---- | :---- | -| DB | ch_ppocr_mobile_v2.0_det| 正常训练
混合精度 | - | - | 正常训练:FPGM裁剪、PACT量化 <br> 离线量化(无需训练) |
+| DB | ch_ppocr_mobile_v2_0_det| 正常训练 <br>
混合精度 | - | - | 正常训练:FPGM裁剪、PACT量化 | - 预测相关:基于训练是否使用量化,可以将训练产出的模型可以分为`正常模型`和`量化模型`,这两类模型对应的预测功能汇总如下: @@ -29,19 +29,14 @@ Windows端基础训练预测功能测试的主程序为`test_train_inference_pyt ### 2.1 安装依赖 -- 安装PaddlePaddle >= 2.0 +- 安装PaddlePaddle >= 2.3 - 安装PaddleOCR依赖 ``` pip install -r ../requirements.txt ``` - 安装autolog(规范化日志输出工具) ``` - git clone https://github.com/LDOUBLEV/AutoLog - cd AutoLog - pip install -r requirements.txt - python setup.py bdist_wheel - pip install ./dist/auto_log-1.0.0-py3-none-any.whl - cd ../ + pip install https://paddleocr.bj.bcebos.com/libs/auto_log-1.2.0-py3-none-any.whl ``` - 安装PaddleSlim (可选) ``` @@ -51,54 +46,46 @@ Windows端基础训练预测功能测试的主程序为`test_train_inference_pyt ### 2.2 功能测试 -先运行`prepare.sh`准备数据和模型,然后运行`test_train_inference_python.sh`进行测试,最终在```test_tipc/output```目录下生成`python_infer_*.log`格式的日志文件。 +先运行`prepare.sh`准备数据和模型,然后运行`test_train_inference_python.sh`进行测试,最终在```test_tipc/output```目录下生成`,model_name/lite_train_lite_infer/*.log`格式的日志文件。 -`test_train_inference_python.sh`包含5种运行模式,每种模式的运行数据不同,分别用于测试速度和精度,分别是: +`test_train_inference_python.sh`包含基础链条的4种运行模式,每种模式的运行数据不同,分别用于测试速度和精度,分别是: - 模式1:lite_train_lite_infer,使用少量数据训练,用于快速验证训练到预测的走通流程,不验证精度和速度; ```shell -bash test_tipc/prepare.sh ./test_tipc/configs/ch_ppocr_mobile_v2.0_det/train_windows_gpu_normal_normal_infer_python_windows_cpu_gpu.txt 'lite_train_lite_infer' -bash test_tipc/test_train_inference_python.sh ./test_tipc/configs/ch_ppocr_mobile_v2.0_det/train_windows_gpu_normal_normal_infer_python_windows_cpu_gpu.txt 'lite_train_lite_infer' +bash test_tipc/prepare.sh ./test_tipc/configs/ch_ppocr_mobile_v2_0_det/train_windows_gpu_normal_normal_infer_python_windows_cpu_gpu.txt 'lite_train_lite_infer' +bash test_tipc/test_train_inference_python.sh ./test_tipc/configs/ch_ppocr_mobile_v2_0_det/train_windows_gpu_normal_normal_infer_python_windows_cpu_gpu.txt 'lite_train_lite_infer' ``` - 模式2:lite_train_whole_infer,使用少量数据训练,一定量数据预测,用于验证训练后的模型执行预测,预测速度是否合理; ```shell -bash test_tipc/prepare.sh ./test_tipc/configs/ch_ppocr_mobile_v2.0_det/train_windows_gpu_normal_normal_infer_python_windows_cpu_gpu.txt 'lite_train_whole_infer' -bash test_tipc/test_train_inference_python.sh ./test_tipc/configs/ch_ppocr_mobile_v2.0_det/train_windows_gpu_normal_normal_infer_python_windows_cpu_gpu.txt 'lite_train_whole_infer' +bash test_tipc/prepare.sh ./test_tipc/configs/ch_ppocr_mobile_v2_0_det/train_windows_gpu_normal_normal_infer_python_windows_cpu_gpu.txt 'lite_train_whole_infer' +bash test_tipc/test_train_inference_python.sh ./test_tipc/configs/ch_ppocr_mobile_v2_0_det/train_windows_gpu_normal_normal_infer_python_windows_cpu_gpu.txt 'lite_train_whole_infer' ``` - 模式3:whole_infer,不训练,全量数据预测,走通开源模型评估、动转静,检查inference model预测时间和精度; ```shell -bash test_tipc/prepare.sh ./test_tipc/configs/ch_ppocr_mobile_v2.0_det/train_windows_gpu_normal_normal_infer_python_windows_cpu_gpu.txt 'whole_infer' +bash test_tipc/prepare.sh ./test_tipc/configs/ch_ppocr_mobile_v2_0_det/train_windows_gpu_normal_normal_infer_python_windows_cpu_gpu.txt 'whole_infer' # 用法1: -bash test_tipc/test_train_inference_python.sh ./test_tipc/configs/ch_ppocr_mobile_v2.0_det/train_windows_gpu_normal_normal_infer_python_windows_cpu_gpu.txt 'whole_infer' +bash test_tipc/test_train_inference_python.sh ./test_tipc/configs/ch_ppocr_mobile_v2_0_det/train_windows_gpu_normal_normal_infer_python_windows_cpu_gpu.txt 'whole_infer' # 用法2: 指定GPU卡预测,第三个传入参数为GPU卡号 -bash test_tipc/test_train_inference_python.sh 
./test_tipc/configs/ch_ppocr_mobile_v2.0_det/train_windows_gpu_normal_normal_infer_python_windows_cpu_gpu.txt 'whole_infer' '1' +bash test_tipc/test_train_inference_python.sh ./test_tipc/configs/ch_ppocr_mobile_v2_0_det/train_windows_gpu_normal_normal_infer_python_windows_cpu_gpu.txt 'whole_infer' '1' ``` - 模式4:whole_train_whole_infer,CE: 全量数据训练,全量数据预测,验证模型训练精度,预测精度,预测速度; ```shell -bash test_tipc/prepare.sh ./test_tipc/configs/ch_ppocr_mobile_v2.0_det/train_windows_gpu_normal_normal_infer_python_windows_cpu_gpu.txt 'whole_train_whole_infer' -bash test_tipc/test_train_inference_python.sh ./test_tipc/configs/ch_ppocr_mobile_v2.0_det/train_windows_gpu_normal_normal_infer_python_windows_cpu_gpu.txt 'whole_train_whole_infer' +bash test_tipc/prepare.sh ./test_tipc/configs/ch_ppocr_mobile_v2_0_det/train_windows_gpu_normal_normal_infer_python_windows_cpu_gpu.txt 'whole_train_whole_infer' +bash test_tipc/test_train_inference_python.sh ./test_tipc/configs/ch_ppocr_mobile_v2_0_det/train_windows_gpu_normal_normal_infer_python_windows_cpu_gpu.txt 'whole_train_whole_infer' ``` -- 模式5:klquant_whole_infer,测试离线量化; -```shell -bash test_tipc/prepare.sh ./test_tipc/configs/ch_ppocr_mobile_v2.0_det_KL/model_linux_gpu_normal_normal_infer_python_windows_gpu_cpu.txt 'klquant_whole_infer' -bash test_tipc/test_train_inference_python.sh ./test_tipc/configs/ch_ppocr_mobile_v2.0_det_KL/model_linux_gpu_normal_normal_infer_python_windows_gpu_cpu.txt 'klquant_whole_infer' -``` - - 运行相应指令后,在`test_tipc/output`文件夹下自动会保存运行日志。如'lite_train_lite_infer'模式下,会运行训练+inference的链条,因此,在`test_tipc/output`文件夹有以下文件: ``` -test_tipc/output/ +test_tipc/output/model_name/lite_train_lite_infer/ |- results_python.log # 运行指令状态的日志 |- norm_train_gpus_0_autocast_null/ # GPU 0号卡上正常训练的训练日志和模型保存文件夹 -|- pact_train_gpus_0_autocast_null/ # GPU 0号卡上量化训练的训练日志和模型保存文件夹 ...... -|- python_infer_cpu_usemkldnn_True_threads_1_batchsize_1.log # CPU上开启Mkldnn线程数设置为1,测试batch_size=1条件下的预测运行日志 -|- python_infer_gpu_usetrt_True_precision_fp16_batchsize_1.log # GPU上开启TensorRT,测试batch_size=1的半精度预测日志 +|- python_infer_cpu_usemkldnn_False_threads_6_precision_fp32_batchsize_1.log # CPU上关闭Mkldnn线程数设置为6,测试batch_size=1条件下的fp32精度预测运行日志 +|- python_infer_gpu_usetrt_False_precision_fp32_batchsize_1.log # GPU上关闭TensorRT,测试batch_size=1的fp32精度预测日志 ...... 
``` diff --git a/test_tipc/prepare.sh b/test_tipc/prepare.sh index 76543f39e4952b40368cdd392acc430dda8fcd9b..a4ba31928bba4a00a560461392f7011244af5e0c 100644 --- a/test_tipc/prepare.sh +++ b/test_tipc/prepare.sh @@ -21,7 +21,10 @@ model_name=$(func_parser_value "${lines[1]}") trainer_list=$(func_parser_value "${lines[14]}") if [ ${MODE} = "benchmark_train" ];then - pip install -r requirements.txt + python_name_list=$(func_parser_value "${lines[2]}") + array=(${python_name_list}) + python_name=${array[0]} + ${python_name} -m pip install -r requirements.txt if [[ ${model_name} =~ "ch_ppocr_mobile_v2_0_det" || ${model_name} =~ "det_mv3_db_v2_0" ]];then wget -nc -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/pretrained/MobileNetV3_large_x0_5_pretrained.pdparams --no-check-certificate rm -rf ./train_data/icdar2015 @@ -29,6 +32,13 @@ if [ ${MODE} = "benchmark_train" ];then cd ./train_data/ && tar xf icdar2015_benckmark.tar ln -s ./icdar2015_benckmark ./icdar2015 cd ../ + if [[ ${model_name} =~ "ch_ppocr_mobile_v2_0_det" ]];then + # expand gt.txt 2 times + cd ./train_data/icdar2015/text_localization + for i in `seq 2`;do cp train_icdar2015_label.txt dup$i.txt;done + cat dup* > train_icdar2015_label.txt && rm -rf dup* + cd ../../../ + fi fi if [[ ${model_name} =~ "ch_ppocr_server_v2_0_det" || ${model_name} =~ "ch_PP-OCRv3_det" ]];then rm -rf ./train_data/icdar2015 @@ -97,6 +107,15 @@ if [ ${MODE} = "benchmark_train" ];then ln -s ./pubtabnet_benckmark ./pubtabnet cd ../ fi + if [[ ${model_name} == "slanet" ]];then + wget -nc -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/en_ppstructure_mobile_v2.0_SLANet_train.tar --no-check-certificate + cd ./pretrain_models/ && tar xf en_ppstructure_mobile_v2.0_SLANet_train.tar && cd ../ + rm -rf ./train_data/pubtabnet + wget -nc -P ./train_data/ https://paddleocr.bj.bcebos.com/dataset/pubtabnet_benckmark.tar --no-check-certificate + cd ./train_data/ && tar xf pubtabnet_benckmark.tar + ln -s ./pubtabnet_benckmark ./pubtabnet + cd ../ + fi if [[ ${model_name} == "det_r50_dcn_fce_ctw_v2_0" ]]; then wget -nc -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/contribution/det_r50_dcn_fce_ctw_v2.0_train.tar --no-check-certificate cd ./pretrain_models/ && tar xf det_r50_dcn_fce_ctw_v2.0_train.tar && cd ../ @@ -106,9 +125,9 @@ if [ ${MODE} = "benchmark_train" ];then ln -s ./icdar2015_benckmark ./icdar2015 cd ../ fi - if [ ${model_name} == "layoutxlm_ser" ]; then - pip install -r ppstructure/vqa/requirements.txt - pip install paddlenlp\>=2.3.5 --force-reinstall -i https://mirrors.aliyun.com/pypi/simple/ + if [ ${model_name} == "layoutxlm_ser" ] || [ ${model_name} == "vi_layoutxlm_ser" ]; then + ${python_name} -m pip install -r ppstructure/kie/requirements.txt + ${python_name} -m pip install opencv-python -U wget -nc -P ./train_data/ https://paddleocr.bj.bcebos.com/ppstructure/dataset/XFUND.tar --no-check-certificate cd ./train_data/ && tar xf XFUND.tar # expand gt.txt 10 times @@ -122,6 +141,11 @@ if [ ${MODE} = "benchmark_train" ];then fi if [ ${MODE} = "lite_train_lite_infer" ];then + python_name_list=$(func_parser_value "${lines[2]}") + array=(${python_name_list}) + python_name=${array[0]} + ${python_name} -m pip install -r requirements.txt + ${python_name} -m pip install https://paddleocr.bj.bcebos.com/libs/auto_log-1.2.0-py3-none-any.whl # pretrain lite train data wget -nc -P ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV3_large_x0_5_pretrained.pdparams --no-check-certificate wget -nc 
-P ./pretrain_models/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_mv3_db_v2.0_train.tar --no-check-certificate @@ -140,6 +164,13 @@ if [ ${MODE} = "lite_train_lite_infer" ];then wget -nc -P ./inference/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_rec_infer.tar --no-check-certificate cd ./inference/ && tar xf en_ppocr_mobile_v2.0_table_det_infer.tar && tar xf en_ppocr_mobile_v2.0_table_rec_infer.tar && cd ../ fi + if [ ${model_name} == "slanet" ];then + wget -nc -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/en_ppstructure_mobile_v2.0_SLANet_train.tar --no-check-certificate + cd ./pretrain_models/ && tar xf en_ppstructure_mobile_v2.0_SLANet_train.tar && cd ../ + wget -nc -P ./inference/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_det_infer.tar --no-check-certificate + wget -nc -P ./inference/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_rec_infer.tar --no-check-certificate + cd ./inference/ && tar xf en_ppocr_mobile_v2.0_table_det_infer.tar && tar xf en_ppocr_mobile_v2.0_table_rec_infer.tar && cd ../ + fi if [[ ${model_name} =~ "det_r50_db_plusplus" ]];then wget -nc -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/dygraph_v2.1/en_det/ResNet50_dcn_asf_synthtext_pretrained.pdparams --no-check-certificate fi @@ -161,6 +192,8 @@ if [ ${MODE} = "lite_train_lite_infer" ];then ln -s ./icdar2015_lite ./icdar2015 wget -nc -P ./ic15_data/ https://paddleocr.bj.bcebos.com/dataset/rec_gt_train_lite.txt --no-check-certificate wget -nc -P ./ic15_data/ https://paddleocr.bj.bcebos.com/dataset/rec_gt_test_lite.txt --no-check-certificate + mv ic15_data/rec_gt_train_lite.txt ic15_data/rec_gt_train.txt + mv ic15_data/rec_gt_test_lite.txt ic15_data/rec_gt_test.txt cd ../ cd ./inference && tar xf rec_inference.tar && cd ../ if [ ${model_name} == "ch_PP-OCRv2_det" ] || [ ${model_name} == "ch_PP-OCRv2_det_PACT" ]; then @@ -203,6 +236,10 @@ if [ ${MODE} = "lite_train_lite_infer" ];then if [ ${model_name} == "ch_ppocr_mobile_v2_0_rec_FPGM" ]; then wget -nc -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_train.tar --no-check-certificate cd ./pretrain_models/ && tar xf ch_ppocr_mobile_v2.0_rec_train.tar && cd ../ + ${python_name} -m pip install paddleslim + fi + if [ ${model_name} == "ch_ppocr_mobile_v2_0_det_FPGM" ]; then + ${python_name} -m pip install paddleslim fi if [ ${model_name} == "det_mv3_east_v2_0" ]; then wget -nc -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_mv3_east_v2.0_train.tar --no-check-certificate @@ -221,11 +258,26 @@ if [ ${MODE} = "lite_train_lite_infer" ];then cd ./pretrain_models/ && tar xf rec_r32_gaspin_bilstm_att_train.tar && cd ../ fi if [ ${model_name} == "layoutxlm_ser" ]; then - pip install -r ppstructure/vqa/requirements.txt - pip install paddlenlp\>=2.3.5 --force-reinstall -i https://mirrors.aliyun.com/pypi/simple/ + ${python_name} -m pip install -r ppstructure/kie/requirements.txt + ${python_name} -m pip install opencv-python -U wget -nc -P ./train_data/ https://paddleocr.bj.bcebos.com/ppstructure/dataset/XFUND.tar --no-check-certificate cd ./train_data/ && tar xf XFUND.tar cd ../ + + wget -nc -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh.tar --no-check-certificate + cd ./pretrain_models/ && tar xf ser_LayoutXLM_xfun_zh.tar && cd ../ + fi + if [ ${model_name} == "vi_layoutxlm_ser" ]; then + ${python_name} -m pip install -r 
ppstructure/kie/requirements.txt + ${python_name} -m pip install opencv-python -U + wget -nc -P ./train_data/ https://paddleocr.bj.bcebos.com/ppstructure/dataset/XFUND.tar --no-check-certificate + cd ./train_data/ && tar xf XFUND.tar + cd ../ + fi + if [ ${model_name} == "det_r18_ct" ]; then + wget -nc -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/pretrained/ResNet18_vd_pretrained.pdparams --no-check-certificate + wget -nc -P ./train_data/ https://paddleocr.bj.bcebos.com/dataset/ct_tipc/total_text_lite2.tar --no-check-certificate + cd ./train_data && tar xf total_text_lite2.tar && ln -s total_text_lite2 total_text && cd ../ fi elif [ ${MODE} = "whole_train_whole_infer" ];then @@ -295,9 +347,18 @@ elif [ ${MODE} = "lite_train_whole_infer" ];then cd ./inference/ && tar xf en_ppocr_mobile_v2.0_table_det_infer.tar && tar xf en_ppocr_mobile_v2.0_table_rec_infer.tar && cd ../ fi elif [ ${MODE} = "whole_infer" ];then + python_name_list=$(func_parser_value "${lines[2]}") + array=(${python_name_list}) + python_name=${array[0]} + ${python_name} -m pip install paddleslim + ${python_name} -m pip install -r requirements.txt wget -nc -P ./inference https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/ch_det_data_50.tar --no-check-certificate wget -nc -P ./inference/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/rec_inference.tar --no-check-certificate cd ./inference && tar xf rec_inference.tar && tar xf ch_det_data_50.tar && cd ../ + wget -nc -P ./train_data/ https://paddleocr.bj.bcebos.com/ppstructure/dataset/XFUND.tar --no-check-certificate + cd ./train_data/ && tar xf XFUND.tar && cd ../ + head -n 2 train_data/XFUND/zh_val/val.json > train_data/XFUND/zh_val/val_lite.json + mv train_data/XFUND/zh_val/val_lite.json train_data/XFUND/zh_val/val.json if [ ${model_name} = "ch_ppocr_mobile_v2_0_det" ]; then eval_model_name="ch_ppocr_mobile_v2.0_det_train" rm -rf ./train_data/icdar2015 @@ -463,6 +524,12 @@ elif [ ${MODE} = "whole_infer" ];then wget -nc -P ./inference/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_rec_infer.tar --no-check-certificate cd ./inference/ && tar xf en_ppocr_mobile_v2.0_table_structure_infer.tar && tar xf en_ppocr_mobile_v2.0_table_det_infer.tar && tar xf en_ppocr_mobile_v2.0_table_rec_infer.tar && cd ../ fi + if [[ ${model_name} =~ "layoutxlm_ser" ]]; then + ${python_name} -m pip install -r ppstructure/kie/requirements.txt + ${python_name} -m pip install opencv-python -U + wget -nc -P ./inference/ https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh_infer.tar --no-check-certificate + cd ./inference/ && tar xf ser_LayoutXLM_xfun_zh_infer.tar & cd ../ + fi fi if [[ ${model_name} =~ "KL" ]]; then @@ -515,6 +582,12 @@ if [[ ${model_name} =~ "KL" ]]; then cd ./inference/ && tar xf en_ppocr_mobile_v2.0_table_structure_infer.tar && tar xf en_ppocr_mobile_v2.0_table_det_infer.tar && tar xf en_ppocr_mobile_v2.0_table_rec_infer.tar && cd ../ cd ./train_data/ && tar xf pubtabnet.tar && cd ../ fi + if [[ ${model_name} =~ "layoutxlm_ser_KL" ]]; then + wget -nc -P ./train_data/ https://paddleocr.bj.bcebos.com/ppstructure/dataset/XFUND.tar --no-check-certificate + cd ./train_data/ && tar xf XFUND.tar && cd ../ + wget -nc -P ./inference/ https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh_infer.tar --no-check-certificate + cd ./inference/ && tar xf ser_LayoutXLM_xfun_zh_infer.tar & cd ../ + fi fi if [ ${MODE} = "cpp_infer" ];then @@ -619,6 +692,12 @@ if [ ${MODE} = "cpp_infer" ];then wget -nc -P ./inference 
https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar --no-check-certificate cd ./inference && tar xf ch_PP-OCRv3_det_infer.tar && tar xf ch_PP-OCRv3_rec_infer.tar && tar xf ch_det_data_50.tar && cd ../ fi + elif [ ${model_name} = "en_table_structure_KL" ];then + wget -nc -P ./inference/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar --no-check-certificate + wget -nc -P ./inference/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_det_infer.tar --no-check-certificate + wget -nc -P ./inference/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_rec_infer.tar --no-check-certificate + cd ./inference/ && tar xf en_ppocr_mobile_v2.0_table_structure_infer.tar && tar xf en_ppocr_mobile_v2.0_table_det_infer.tar && tar xf en_ppocr_mobile_v2.0_table_rec_infer.tar && cd ../ + fi fi if [ ${MODE} = "serving_infer" ];then @@ -630,6 +709,7 @@ if [ ${MODE} = "serving_infer" ];then ${python_name} -m pip install paddle-serving-server-gpu ${python_name} -m pip install paddle_serving_client ${python_name} -m pip install paddle-serving-app + ${python_name} -m pip install https://paddleocr.bj.bcebos.com/libs/auto_log-1.2.0-py3-none-any.whl # wget model if [ ${model_name} == "ch_ppocr_mobile_v2_0_det_KL" ] || [ ${model_name} == "ch_ppocr_mobile_v2.0_rec_KL" ] ; then wget -nc -P ./inference https://paddleocr.bj.bcebos.com/tipc_fake_model/ch_ppocr_mobile_v2.0_det_klquant_infer.tar --no-check-certificate @@ -681,8 +761,7 @@ fi if [ ${MODE} = "paddle2onnx_infer" ];then # prepare serving env python_name=$(func_parser_value "${lines[2]}") - ${python_name} -m pip install paddle2onnx - ${python_name} -m pip install onnxruntime + ${python_name} -m pip install paddle2onnx onnxruntime onnx # wget model if [[ ${model_name} =~ "ch_ppocr_mobile_v2_0" ]]; then wget -nc -P ./inference https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar --no-check-certificate diff --git a/test_tipc/readme.md b/test_tipc/readme.md index f9e9d89e4198c1ad5fabdf58775c6f7b6d190322..1442ee1c86a7c1319446a0eb22c08287e1ce689a 100644 --- a/test_tipc/readme.md +++ b/test_tipc/readme.md @@ -54,6 +54,7 @@ | NRTR |rec_mtb_nrtr | 识别 | 支持 | 多机多卡
混合精度 | - | - |
| SAR |rec_r31_sar | 识别 | 支持 | 多机多卡 <br> 混合精度 | - | - |
| SPIN |rec_r32_gaspin_bilstm_att | 识别 | 支持 | 多机多卡 <br> 混合精度 | - | - |
+| RobustScanner |rec_r31_robustscanner | 识别 | 支持 | 多机多卡 <br> 混合精度 | - | - |
| PGNet |rec_r34_vd_none_none_ctc_v2.0 | 端到端| 支持 | 多机多卡 <br> 混合精度 | - | - |
| TableMaster |table_structure_tablemaster_train | 表格识别| 支持 | 多机多卡 <br>
混合精度 | - | - | diff --git a/test_tipc/test_inference_cpp.sh b/test_tipc/test_inference_cpp.sh index c0c7c18a38a46b00c839757e303049135a508691..aadaa8b0773632885138806861fc851ede503f3d 100644 --- a/test_tipc/test_inference_cpp.sh +++ b/test_tipc/test_inference_cpp.sh @@ -84,7 +84,7 @@ function func_cpp_inference(){ eval $command last_status=${PIPESTATUS[0]} eval "cat ${_save_log_path}" - status_check $last_status "${command}" "${status_log}" "${model_name}" + status_check $last_status "${command}" "${status_log}" "${model_name}" "${_save_log_path}" done done done @@ -117,7 +117,7 @@ function func_cpp_inference(){ eval $command last_status=${PIPESTATUS[0]} eval "cat ${_save_log_path}" - status_check $last_status "${command}" "${status_log}" "${model_name}" + status_check $last_status "${command}" "${status_log}" "${model_name}" "${_save_log_path}" done done diff --git a/test_tipc/test_inference_python.sh b/test_tipc/test_inference_python.sh index 2a31a468f0d54d1979e82c8f0da98cac6f4edcec..e9908df1f6049f9d38524dc6598499ddd2b58af8 100644 --- a/test_tipc/test_inference_python.sh +++ b/test_tipc/test_inference_python.sh @@ -88,7 +88,7 @@ function func_inference(){ eval $command last_status=${PIPESTATUS[0]} eval "cat ${_save_log_path}" - status_check $last_status "${command}" "${status_log}" "${model_name}" + status_check $last_status "${command}" "${status_log}" "${model_name}" "${_save_log_path}" done done done @@ -119,7 +119,7 @@ function func_inference(){ eval $command last_status=${PIPESTATUS[0]} eval "cat ${_save_log_path}" - status_check $last_status "${command}" "${status_log}" "${model_name}" + status_check $last_status "${command}" "${status_log}" "${model_name}" "${_save_log_path}" done done @@ -146,14 +146,15 @@ if [ ${MODE} = "whole_infer" ]; then for infer_model in ${infer_model_dir_list[*]}; do # run export if [ ${infer_run_exports[Count]} != "null" ];then + _save_log_path="${_log_path}/python_infer_gpu_usetrt_${use_trt}_precision_${precision}_batchsize_${batch_size}_infermodel_${infer_model}.log" save_infer_dir=$(dirname $infer_model) set_export_weight=$(func_set_params "${export_weight}" "${infer_model}") set_save_infer_key=$(func_set_params "${save_infer_key}" "${save_infer_dir}") - export_cmd="${python} ${infer_run_exports[Count]} ${set_export_weight} ${set_save_infer_key}" + export_cmd="${python} ${infer_run_exports[Count]} ${set_export_weight} ${set_save_infer_key} > ${_save_log_path} 2>&1 " echo ${infer_run_exports[Count]} eval $export_cmd status_export=$? 
- status_check $status_export "${export_cmd}" "${status_log}" "${model_name}" + status_check $status_export "${export_cmd}" "${status_log}" "${model_name}" "${_save_log_path}" else save_infer_dir=${infer_model} fi diff --git a/test_tipc/test_paddle2onnx.sh b/test_tipc/test_paddle2onnx.sh index 78d79d0b8eaac782f98c1e883d091a001443f41a..04bfb590f7c6e64cf136d3feef8594994cb86877 100644 --- a/test_tipc/test_paddle2onnx.sh +++ b/test_tipc/test_paddle2onnx.sh @@ -63,10 +63,10 @@ function func_paddle2onnx(){ set_opset_version=$(func_set_params "${opset_version_key}" "${opset_version_value}") set_enable_onnx_checker=$(func_set_params "${enable_onnx_checker_key}" "${enable_onnx_checker_value}") trans_det_log="${LOG_PATH}/trans_model_det.log" - trans_model_cmd="${padlle2onnx_cmd} ${set_dirname} ${set_model_filename} ${set_params_filename} ${set_save_model} ${set_opset_version} ${set_enable_onnx_checker} > ${trans_det_log} 2>&1 " + trans_model_cmd="${padlle2onnx_cmd} ${set_dirname} ${set_model_filename} ${set_params_filename} ${set_save_model} ${set_opset_version} ${set_enable_onnx_checker} --enable_dev_version=False > ${trans_det_log} 2>&1 " eval $trans_model_cmd last_status=${PIPESTATUS[0]} - status_check $last_status "${trans_model_cmd}" "${status_log}" "${model_name}" + status_check $last_status "${trans_model_cmd}" "${status_log}" "${model_name}" "${trans_det_log}" # trans rec set_dirname=$(func_set_params "--model_dir" "${rec_infer_model_dir_value}") set_model_filename=$(func_set_params "${model_filename_key}" "${model_filename_value}") @@ -75,10 +75,10 @@ function func_paddle2onnx(){ set_opset_version=$(func_set_params "${opset_version_key}" "${opset_version_value}") set_enable_onnx_checker=$(func_set_params "${enable_onnx_checker_key}" "${enable_onnx_checker_value}") trans_rec_log="${LOG_PATH}/trans_model_rec.log" - trans_model_cmd="${padlle2onnx_cmd} ${set_dirname} ${set_model_filename} ${set_params_filename} ${set_save_model} ${set_opset_version} ${set_enable_onnx_checker} > ${trans_rec_log} 2>&1 " + trans_model_cmd="${padlle2onnx_cmd} ${set_dirname} ${set_model_filename} ${set_params_filename} ${set_save_model} ${set_opset_version} ${set_enable_onnx_checker} --enable_dev_version=False > ${trans_rec_log} 2>&1 " eval $trans_model_cmd last_status=${PIPESTATUS[0]} - status_check $last_status "${trans_model_cmd}" "${status_log}" "${model_name}" + status_check $last_status "${trans_model_cmd}" "${status_log}" "${model_name}" "${trans_rec_log}" elif [[ ${model_name} =~ "det" ]]; then # trans det set_dirname=$(func_set_params "--model_dir" "${det_infer_model_dir_value}") @@ -88,10 +88,10 @@ function func_paddle2onnx(){ set_opset_version=$(func_set_params "${opset_version_key}" "${opset_version_value}") set_enable_onnx_checker=$(func_set_params "${enable_onnx_checker_key}" "${enable_onnx_checker_value}") trans_det_log="${LOG_PATH}/trans_model_det.log" - trans_model_cmd="${padlle2onnx_cmd} ${set_dirname} ${set_model_filename} ${set_params_filename} ${set_save_model} ${set_opset_version} ${set_enable_onnx_checker} > ${trans_det_log} 2>&1 " + trans_model_cmd="${padlle2onnx_cmd} ${set_dirname} ${set_model_filename} ${set_params_filename} ${set_save_model} ${set_opset_version} ${set_enable_onnx_checker} --enable_dev_version=False > ${trans_det_log} 2>&1 " eval $trans_model_cmd last_status=${PIPESTATUS[0]} - status_check $last_status "${trans_model_cmd}" "${status_log}" "${model_name}" + status_check $last_status "${trans_model_cmd}" "${status_log}" "${model_name}" "${trans_det_log}" elif [[ ${model_name} 
=~ "rec" ]]; then # trans rec set_dirname=$(func_set_params "--model_dir" "${rec_infer_model_dir_value}") @@ -101,10 +101,10 @@ function func_paddle2onnx(){ set_opset_version=$(func_set_params "${opset_version_key}" "${opset_version_value}") set_enable_onnx_checker=$(func_set_params "${enable_onnx_checker_key}" "${enable_onnx_checker_value}") trans_rec_log="${LOG_PATH}/trans_model_rec.log" - trans_model_cmd="${padlle2onnx_cmd} ${set_dirname} ${set_model_filename} ${set_params_filename} ${set_save_model} ${set_opset_version} ${set_enable_onnx_checker} > ${trans_rec_log} 2>&1 " + trans_model_cmd="${padlle2onnx_cmd} ${set_dirname} ${set_model_filename} ${set_params_filename} ${set_save_model} ${set_opset_version} ${set_enable_onnx_checker} --enable_dev_version=False > ${trans_rec_log} 2>&1 " eval $trans_model_cmd last_status=${PIPESTATUS[0]} - status_check $last_status "${trans_model_cmd}" "${status_log}" "${model_name}" + status_check $last_status "${trans_model_cmd}" "${status_log}" "${model_name}" "${trans_rec_log}" fi # python inference @@ -127,7 +127,7 @@ function func_paddle2onnx(){ eval $infer_model_cmd last_status=${PIPESTATUS[0]} eval "cat ${_save_log_path}" - status_check $last_status "${infer_model_cmd}" "${status_log}" "${model_name}" + status_check $last_status "${infer_model_cmd}" "${status_log}" "${model_name}" "${_save_log_path}" elif [ ${use_gpu} = "True" ] || [ ${use_gpu} = "gpu" ]; then _save_log_path="${LOG_PATH}/paddle2onnx_infer_gpu.log" set_gpu=$(func_set_params "${use_gpu_key}" "${use_gpu}") @@ -146,7 +146,7 @@ function func_paddle2onnx(){ eval $infer_model_cmd last_status=${PIPESTATUS[0]} eval "cat ${_save_log_path}" - status_check $last_status "${infer_model_cmd}" "${status_log}" "${model_name}" + status_check $last_status "${infer_model_cmd}" "${status_log}" "${model_name}" "${_save_log_path}" else echo "Does not support hardware other than CPU and GPU Currently!" fi @@ -158,4 +158,4 @@ echo "################### run test ###################" export Count=0 IFS="|" -func_paddle2onnx \ No newline at end of file +func_paddle2onnx diff --git a/test_tipc/test_ptq_inference_python.sh b/test_tipc/test_ptq_inference_python.sh index e2939fd5e638ad0f6b4c44422a6fec6459903d1c..caf3d506029ee066aa5abebc25b739439b6e9d75 100644 --- a/test_tipc/test_ptq_inference_python.sh +++ b/test_tipc/test_ptq_inference_python.sh @@ -84,7 +84,7 @@ function func_inference(){ eval $command last_status=${PIPESTATUS[0]} eval "cat ${_save_log_path}" - status_check $last_status "${command}" "${status_log}" "${model_name}" + status_check $last_status "${command}" "${status_log}" "${model_name}" "${_save_log_path}" done done done @@ -109,7 +109,7 @@ function func_inference(){ eval $command last_status=${PIPESTATUS[0]} eval "cat ${_save_log_path}" - status_check $last_status "${command}" "${status_log}" "${model_name}" + status_check $last_status "${command}" "${status_log}" "${model_name}" "${_save_log_path}" done done @@ -145,7 +145,7 @@ if [ ${MODE} = "whole_infer" ]; then echo $export_cmd eval $export_cmd status_export=$? 
- status_check $status_export "${export_cmd}" "${status_log}" "${model_name}" + status_check $status_export "${export_cmd}" "${status_log}" "${model_name}" "${export_log_path}" else save_infer_dir=${infer_model} fi diff --git a/test_tipc/test_serving_infer_cpp.sh b/test_tipc/test_serving_infer_cpp.sh index 0be6a45adf3105f088a96336dddfbe9ac612f19b..10ddecf3fa26805fef7bc6ae10d78ee5e741cd27 100644 --- a/test_tipc/test_serving_infer_cpp.sh +++ b/test_tipc/test_serving_infer_cpp.sh @@ -83,7 +83,7 @@ function func_serving(){ trans_model_cmd="${python_list[0]} ${trans_model_py} ${set_dirname} ${set_model_filename} ${set_params_filename} ${set_serving_server} ${set_serving_client} > ${trans_rec_log} 2>&1 " eval $trans_model_cmd last_status=${PIPESTATUS[0]} - status_check $last_status "${trans_model_cmd}" "${status_log}" "${model_name}" + status_check $last_status "${trans_model_cmd}" "${status_log}" "${model_name}" "${trans_rec_log}" set_image_dir=$(func_set_params "${image_dir_key}" "${image_dir_value}") python_list=(${python_list}) cd ${serving_dir_value} @@ -95,14 +95,14 @@ function func_serving(){ web_service_cpp_cmd="nohup ${python_list[0]} ${web_service_py} --model ${det_server_value} ${rec_server_value} ${op_key} ${op_value} ${port_key} ${port_value} > ${server_log_path} 2>&1 &" eval $web_service_cpp_cmd last_status=${PIPESTATUS[0]} - status_check $last_status "${web_service_cpp_cmd}" "${status_log}" "${model_name}" + status_check $last_status "${web_service_cpp_cmd}" "${status_log}" "${model_name}" "${server_log_path}" sleep 5s _save_log_path="${LOG_PATH}/cpp_client_cpu.log" cpp_client_cmd="${python_list[0]} ${cpp_client_py} ${det_client_value} ${rec_client_value} > ${_save_log_path} 2>&1" eval $cpp_client_cmd last_status=${PIPESTATUS[0]} eval "cat ${_save_log_path}" - status_check $last_status "${cpp_client_cmd}" "${status_log}" "${model_name}" + status_check $last_status "${cpp_client_cmd}" "${status_log}" "${model_name}" "${_save_log_path}" ps ux | grep -i ${port_value} | awk '{print $2}' | xargs kill -s 9 else server_log_path="${LOG_PATH}/cpp_server_gpu.log" @@ -114,7 +114,7 @@ function func_serving(){ eval $cpp_client_cmd last_status=${PIPESTATUS[0]} eval "cat ${_save_log_path}" - status_check $last_status "${cpp_client_cmd}" "${status_log}" "${model_name}" + status_check $last_status "${cpp_client_cmd}" "${status_log}" "${model_name}" "${_save_log_path}" ps ux | grep -i ${port_value} | awk '{print $2}' | xargs kill -s 9 fi done diff --git a/test_tipc/test_serving_infer_python.sh b/test_tipc/test_serving_infer_python.sh index 4b7dfcf785a3c8459cce95d55744dbcd4f97027a..c7d305d5d2dcd2ea1bf5a7c3254eea4231d59879 100644 --- a/test_tipc/test_serving_infer_python.sh +++ b/test_tipc/test_serving_infer_python.sh @@ -126,19 +126,19 @@ function func_serving(){ web_service_cmd="nohup ${python} ${web_service_py} ${web_use_gpu_key}="" ${web_use_mkldnn_key}=${use_mkldnn} ${set_cpu_threads} ${set_det_model_config} ${set_rec_model_config} > ${server_log_path} 2>&1 &" eval $web_service_cmd last_status=${PIPESTATUS[0]} - status_check $last_status "${web_service_cmd}" "${status_log}" "${model_name}" + status_check $last_status "${web_service_cmd}" "${status_log}" "${model_name}" "${server_log_path}" elif [[ ${model_name} =~ "det" ]]; then set_det_model_config=$(func_set_params "${det_server_key}" "${det_server_value}") web_service_cmd="nohup ${python} ${web_service_py} ${web_use_gpu_key}="" ${web_use_mkldnn_key}=${use_mkldnn} ${set_cpu_threads} ${set_det_model_config} > ${server_log_path} 2>&1 &" eval 
$web_service_cmd last_status=${PIPESTATUS[0]} - status_check $last_status "${web_service_cmd}" "${status_log}" "${model_name}" + status_check $last_status "${web_service_cmd}" "${status_log}" "${model_name}" "${server_log_path}" elif [[ ${model_name} =~ "rec" ]]; then set_rec_model_config=$(func_set_params "${rec_server_key}" "${rec_server_value}") web_service_cmd="nohup ${python} ${web_service_py} ${web_use_gpu_key}="" ${web_use_mkldnn_key}=${use_mkldnn} ${set_cpu_threads} ${set_rec_model_config} > ${server_log_path} 2>&1 &" eval $web_service_cmd last_status=${PIPESTATUS[0]} - status_check $last_status "${web_service_cmd}" "${status_log}" "${model_name}" + status_check $last_status "${web_service_cmd}" "${status_log}" "${model_name}" "${server_log_path}" fi sleep 2s for pipeline in ${pipeline_py[*]}; do @@ -147,7 +147,7 @@ function func_serving(){ eval $pipeline_cmd last_status=${PIPESTATUS[0]} eval "cat ${_save_log_path}" - status_check $last_status "${pipeline_cmd}" "${status_log}" "${model_name}" + status_check $last_status "${pipeline_cmd}" "${status_log}" "${model_name}" "${_save_log_path}" sleep 2s done ps ux | grep -E 'web_service' | awk '{print $2}' | xargs kill -s 9 @@ -177,19 +177,19 @@ function func_serving(){ web_service_cmd="nohup ${python} ${web_service_py} ${set_tensorrt} ${set_precision} ${set_det_model_config} ${set_rec_model_config} > ${server_log_path} 2>&1 &" eval $web_service_cmd last_status=${PIPESTATUS[0]} - status_check $last_status "${web_service_cmd}" "${status_log}" "${model_name}" + status_check $last_status "${web_service_cmd}" "${status_log}" "${model_name}" "${server_log_path}" elif [[ ${model_name} =~ "det" ]]; then set_det_model_config=$(func_set_params "${det_server_key}" "${det_server_value}") web_service_cmd="nohup ${python} ${web_service_py} ${set_tensorrt} ${set_precision} ${set_det_model_config} > ${server_log_path} 2>&1 &" eval $web_service_cmd last_status=${PIPESTATUS[0]} - status_check $last_status "${web_service_cmd}" "${status_log}" "${model_name}" + status_check $last_status "${web_service_cmd}" "${status_log}" "${model_name}" "${server_log_path}" elif [[ ${model_name} =~ "rec" ]]; then set_rec_model_config=$(func_set_params "${rec_server_key}" "${rec_server_value}") web_service_cmd="nohup ${python} ${web_service_py} ${set_tensorrt} ${set_precision} ${set_rec_model_config} > ${server_log_path} 2>&1 &" eval $web_service_cmd last_status=${PIPESTATUS[0]} - status_check $last_status "${web_service_cmd}" "${status_log}" "${model_name}" + status_check $last_status "${web_service_cmd}" "${status_log}" "${model_name}" "${server_log_path}" fi sleep 2s for pipeline in ${pipeline_py[*]}; do @@ -198,7 +198,7 @@ function func_serving(){ eval $pipeline_cmd last_status=${PIPESTATUS[0]} eval "cat ${_save_log_path}" - status_check $last_status "${pipeline_cmd}" "${status_log}" "${model_name}" + status_check $last_status "${pipeline_cmd}" "${status_log}" "${model_name}" "${_save_log_path}" sleep 2s done ps ux | grep -E 'web_service' | awk '{print $2}' | xargs kill -s 9 diff --git a/test_tipc/test_train_inference_python.sh b/test_tipc/test_train_inference_python.sh index 545cdbba2051c8123ef7f70f2aeb4b4b5a57b7c5..e182fa57f060c81af012a5da89b892bde02b4a2b 100644 --- a/test_tipc/test_train_inference_python.sh +++ b/test_tipc/test_train_inference_python.sh @@ -133,7 +133,7 @@ function func_inference(){ eval $command last_status=${PIPESTATUS[0]} eval "cat ${_save_log_path}" - status_check $last_status "${command}" "${status_log}" "${model_name}" + status_check 
$last_status "${command}" "${status_log}" "${model_name}" "${_save_log_path}" done done done @@ -164,7 +164,7 @@ function func_inference(){ eval $command last_status=${PIPESTATUS[0]} eval "cat ${_save_log_path}" - status_check $last_status "${command}" "${status_log}" "${model_name}" + status_check $last_status "${command}" "${status_log}" "${model_name}" "${_save_log_path}" done done @@ -201,7 +201,7 @@ if [ ${MODE} = "whole_infer" ]; then echo $export_cmd eval $export_cmd status_export=$? - status_check $status_export "${export_cmd}" "${status_log}" "${model_name}" + status_check $status_export "${export_cmd}" "${status_log}" "${model_name}" "${export_log_path}" else save_infer_dir=${infer_model} fi @@ -298,7 +298,7 @@ else # run train eval $cmd eval "cat ${save_log}/train.log >> ${save_log}.log" - status_check $? "${cmd}" "${status_log}" "${model_name}" + status_check $? "${cmd}" "${status_log}" "${model_name}" "${save_log}.log" set_eval_pretrain=$(func_set_params "${pretrain_model_key}" "${save_log}/${train_model_name}") @@ -309,7 +309,7 @@ else eval_log_path="${LOG_PATH}/${trainer}_gpus_${gpu}_autocast_${autocast}_nodes_${nodes}_eval.log" eval_cmd="${python} ${eval_py} ${set_eval_pretrain} ${set_use_gpu} ${set_eval_params1} > ${eval_log_path} 2>&1 " eval $eval_cmd - status_check $? "${eval_cmd}" "${status_log}" "${model_name}" + status_check $? "${eval_cmd}" "${status_log}" "${model_name}" "${eval_log_path}" fi # run export model if [ ${run_export} != "null" ]; then @@ -320,7 +320,7 @@ else set_save_infer_key=$(func_set_params "${save_infer_key}" "${save_infer_path}") export_cmd="${python} ${run_export} ${set_export_weight} ${set_save_infer_key} > ${export_log_path} 2>&1 " eval $export_cmd - status_check $? "${export_cmd}" "${status_log}" "${model_name}" + status_check $? 
"${export_cmd}" "${status_log}" "${model_name}" "${export_log_path}" #run inference eval $env diff --git a/test_tipc/test_train_inference_python_npu.sh b/test_tipc/test_train_inference_python_npu.sh new file mode 100644 index 0000000000000000000000000000000000000000..bab70fc78ee902515c0fccb57d9215d86f2a6589 --- /dev/null +++ b/test_tipc/test_train_inference_python_npu.sh @@ -0,0 +1,52 @@ +#!/bin/bash +source test_tipc/common_func.sh + +function readlinkf() { + perl -MCwd -e 'print Cwd::abs_path shift' "$1"; +} + +function func_parser_config() { + strs=$1 + IFS=" " + array=(${strs}) + tmp=${array[2]} + echo ${tmp} +} + +BASEDIR=$(dirname "$0") +REPO_ROOT_PATH=$(readlinkf ${BASEDIR}/../) + +FILENAME=$1 + +# disable mkldnn on non x86_64 env +arch=$(uname -i) +if [ $arch != 'x86_64' ]; then + sed -i 's/--enable_mkldnn:True|False/--enable_mkldnn:False/g' $FILENAME + sed -i 's/--enable_mkldnn:True/--enable_mkldnn:False/g' $FILENAME +fi + +# change gpu to npu in tipc txt configs +sed -i 's/use_gpu/use_npu/g' $FILENAME +# disable benchmark as AutoLog required nvidia-smi command +sed -i 's/--benchmark:True/--benchmark:False/g' $FILENAME +dataline=`cat $FILENAME` + +# parser params +IFS=$'\n' +lines=(${dataline}) + +# replace training config file +grep -n 'tools/.*yml' $FILENAME | cut -d ":" -f 1 \ +| while read line_num ; do + train_cmd=$(func_parser_value "${lines[line_num-1]}") + trainer_config=$(func_parser_config ${train_cmd}) + sed -i 's/use_gpu/use_npu/g' "$REPO_ROOT_PATH/$trainer_config" +done + +# change gpu to npu in execution script +sed -i 's/\"gpu\"/\"npu\"/g' test_tipc/test_train_inference_python.sh + +# pass parameters to test_train_inference_python.sh +cmd='bash test_tipc/test_train_inference_python.sh ${FILENAME} $2' +echo -e '\033[1;32m Started to run command: ${cmd}! \033[0m' +eval $cmd diff --git a/test_tipc/test_train_inference_python_xpu.sh b/test_tipc/test_train_inference_python_xpu.sh new file mode 100644 index 0000000000000000000000000000000000000000..7c6dc1e52a67caf9c858b2f8b6561b3919134b0b --- /dev/null +++ b/test_tipc/test_train_inference_python_xpu.sh @@ -0,0 +1,52 @@ +#!/bin/bash +source test_tipc/common_func.sh + +function readlinkf() { + perl -MCwd -e 'print Cwd::abs_path shift' "$1"; +} + +function func_parser_config() { + strs=$1 + IFS=" " + array=(${strs}) + tmp=${array[2]} + echo ${tmp} +} + +BASEDIR=$(dirname "$0") +REPO_ROOT_PATH=$(readlinkf ${BASEDIR}/../) + +FILENAME=$1 + +# disable mkldnn on non x86_64 env +arch=$(uname -i) +if [ $arch != 'x86_64' ]; then + sed -i 's/--enable_mkldnn:True|False/--enable_mkldnn:False/g' $FILENAME + sed -i 's/--enable_mkldnn:True/--enable_mkldnn:False/g' $FILENAME +fi + +# change gpu to xpu in tipc txt configs +sed -i 's/use_gpu/use_xpu/g' $FILENAME +# disable benchmark as AutoLog required nvidia-smi command +sed -i 's/--benchmark:True/--benchmark:False/g' $FILENAME +dataline=`cat $FILENAME` + +# parser params +IFS=$'\n' +lines=(${dataline}) + +# replace training config file +grep -n 'tools/.*yml' $FILENAME | cut -d ":" -f 1 \ +| while read line_num ; do + train_cmd=$(func_parser_value "${lines[line_num-1]}") + trainer_config=$(func_parser_config ${train_cmd}) + sed -i 's/use_gpu/use_xpu/g' "$REPO_ROOT_PATH/$trainer_config" +done + +# change gpu to xpu in execution script +sed -i 's/\"gpu\"/\"xpu\"/g' test_tipc/test_train_inference_python.sh + +# pass parameters to test_train_inference_python.sh +cmd='bash test_tipc/test_train_inference_python.sh ${FILENAME} $2' +echo -e '\033[1;32m Started to run command: ${cmd}! 
\033[0m' +eval $cmd diff --git a/tools/eval.py b/tools/eval.py index 2fc53488efa2c4c475d31af47f69b3560e6cc69a..3d1d3813d33e251ec83a9729383fe772bc4cc225 100755 --- a/tools/eval.py +++ b/tools/eval.py @@ -23,6 +23,7 @@ __dir__ = os.path.dirname(os.path.abspath(__file__)) sys.path.insert(0, __dir__) sys.path.insert(0, os.path.abspath(os.path.join(__dir__, '..'))) +import paddle from ppocr.data import build_dataloader from ppocr.modeling.architectures import build_model from ppocr.postprocess import build_post_process @@ -73,7 +74,7 @@ def main(): config['Architecture']["Head"]['out_channels'] = char_num model = build_model(config['Architecture']) - extra_input_models = ["SRN", "NRTR", "SAR", "SEED", "SVTR", "VisionLAN"] + extra_input_models = ["SRN", "NRTR", "SAR", "SEED", "SVTR", "VisionLAN", "RobustScanner"] extra_input = False if config['Architecture']['algorithm'] == 'Distillation': for key in config['Architecture']["Models"]: @@ -86,6 +87,30 @@ def main(): else: model_type = None + # build metric + eval_class = build_metric(config['Metric']) + # amp + use_amp = config["Global"].get("use_amp", False) + amp_level = config["Global"].get("amp_level", 'O2') + amp_custom_black_list = config['Global'].get('amp_custom_black_list',[]) + if use_amp: + AMP_RELATED_FLAGS_SETTING = { + 'FLAGS_cudnn_batchnorm_spatial_persistent': 1, + 'FLAGS_max_inplace_grad_add': 8, + } + paddle.fluid.set_flags(AMP_RELATED_FLAGS_SETTING) + scale_loss = config["Global"].get("scale_loss", 1.0) + use_dynamic_loss_scaling = config["Global"].get( + "use_dynamic_loss_scaling", False) + scaler = paddle.amp.GradScaler( + init_loss_scaling=scale_loss, + use_dynamic_loss_scaling=use_dynamic_loss_scaling) + if amp_level == "O2": + model = paddle.amp.decorate( + models=model, level=amp_level, master_weight=True) + else: + scaler = None + best_model_dict = load_model( config, model, model_type=config['Architecture']["model_type"]) if len(best_model_dict): @@ -93,11 +118,9 @@ def main(): for k, v in best_model_dict.items(): logger.info('{}:{}'.format(k, v)) - # build metric - eval_class = build_metric(config['Metric']) # start eval metric = program.eval(model, valid_dataloader, post_process_class, - eval_class, model_type, extra_input) + eval_class, model_type, extra_input, scaler, amp_level, amp_custom_black_list) logger.info('metric eval ***************') for k, v in metric.items(): logger.info('{}:{}'.format(k, v)) diff --git a/tools/export_model.py b/tools/export_model.py index 78932c987d8bc57216ef3586c2bdc0cdbd6a9037..193988cc1b62a6c4536a8d2ec640e3e5fc81a79c 100755 --- a/tools/export_model.py +++ b/tools/export_model.py @@ -58,6 +58,8 @@ def export_single_model(model, other_shape = [ paddle.static.InputSpec( shape=[None, 3, 48, 160], dtype="float32"), + [paddle.static.InputSpec( + shape=[None], dtype="float32")] ] model = to_static(model, input_spec=other_shape) elif arch_config["algorithm"] == "SVTR": @@ -78,6 +80,12 @@ def export_single_model(model, shape=[None, 3, 64, 512], dtype="float32"), ] model = to_static(model, input_spec=other_shape) + elif arch_config["model_type"] == "sr": + other_shape = [ + paddle.static.InputSpec( + shape=[None, 3, 16, 64], dtype="float32") + ] + model = to_static(model, input_spec=other_shape) elif arch_config["algorithm"] == "ViTSTR": other_shape = [ paddle.static.InputSpec( @@ -103,6 +111,22 @@ def export_single_model(model, shape=[None, 3, 64, 256], dtype="float32"), ] model = to_static(model, input_spec=other_shape) + elif arch_config["algorithm"] == "RobustScanner": + max_text_length = 
arch_config["Head"]["max_text_length"] + other_shape = [ + paddle.static.InputSpec( + shape=[None, 3, 48, 160], dtype="float32"), + + [ + paddle.static.InputSpec( + shape=[None, ], + dtype="float32"), + paddle.static.InputSpec( + shape=[None, max_text_length], + dtype="int64") + ] + ] + model = to_static(model, input_spec=other_shape) elif arch_config["algorithm"] in ["LayoutLM", "LayoutLMv2", "LayoutXLM"]: input_spec = [ paddle.static.InputSpec( @@ -116,13 +140,13 @@ def export_single_model(model, paddle.static.InputSpec( shape=[None, 3, 224, 224], dtype="int64"), # image ] - if arch_config["algorithm"] == "LayoutLM": + if model.backbone.use_visual_backbone is False: input_spec.pop(4) model = to_static(model, input_spec=[input_spec]) else: infer_shape = [3, -1, -1] if arch_config["model_type"] == "rec": - infer_shape = [3, 48, -1] # for rec model, H must be 32 + infer_shape = [3, 32, -1] # for rec model, H must be 32 if "Transform" in arch_config and arch_config[ "Transform"] is not None and arch_config["Transform"][ "name"] == "TPS": @@ -134,6 +158,8 @@ def export_single_model(model, infer_shape = [3, 488, 488] if arch_config["algorithm"] == "TableMaster": infer_shape = [3, 480, 480] + if arch_config["algorithm"] == "SLANet": + infer_shape = [3, -1, -1] model = to_static( model, input_spec=[ @@ -195,6 +221,9 @@ def main(): else: # base rec model config["Architecture"]["Head"]["out_channels"] = char_num + # for sr algorithm + if config["Architecture"]["model_type"] == "sr": + config['Architecture']["Transform"]['infer_mode'] = True model = build_model(config["Architecture"]) load_model(config, model, model_type=config['Architecture']["model_type"]) model.eval() @@ -223,4 +252,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/tools/infer/predict_cls.py b/tools/infer/predict_cls.py index ed2f47c04de6f4ab6a874db052e953a1ce4e0b76..d2b7108ca35666acfa53e785686fd7b9dfc21ed5 100755 --- a/tools/infer/predict_cls.py +++ b/tools/infer/predict_cls.py @@ -30,7 +30,7 @@ import traceback import tools.infer.utility as utility from ppocr.postprocess import build_post_process from ppocr.utils.logging import get_logger -from ppocr.utils.utility import get_image_file_list, check_and_read_gif +from ppocr.utils.utility import get_image_file_list, check_and_read logger = get_logger() @@ -128,7 +128,7 @@ def main(args): valid_image_file_list = [] img_list = [] for image_file in image_file_list: - img, flag = check_and_read_gif(image_file) + img, flag, _ = check_and_read(image_file) if not flag: img = cv2.imread(image_file) if img is None: diff --git a/tools/infer/predict_det.py b/tools/infer/predict_det.py index 394a48948b1f284bd405532769b76eeb298668bd..00fa2e9b7fafd949c59a0eebd43f2f88ae717320 100755 --- a/tools/infer/predict_det.py +++ b/tools/infer/predict_det.py @@ -27,7 +27,7 @@ import sys import tools.infer.utility as utility from ppocr.utils.logging import get_logger -from ppocr.utils.utility import get_image_file_list, check_and_read_gif +from ppocr.utils.utility import get_image_file_list, check_and_read from ppocr.data import create_operators, transform from ppocr.postprocess import build_post_process import json @@ -127,6 +127,9 @@ class TextDetector(object): postprocess_params["beta"] = args.beta postprocess_params["fourier_degree"] = args.fourier_degree postprocess_params["box_type"] = args.det_fce_box_type + elif self.det_algorithm == "CT": + pre_process_list[0] = {'ScaleAlignedShort': {'short_size': 640}} + postprocess_params['name'] = 
'CTPostProcess' else: logger.info("unknown det_algorithm:{}".format(self.det_algorithm)) sys.exit(0) @@ -253,6 +256,9 @@ class TextDetector(object): elif self.det_algorithm == 'FCE': for i, output in enumerate(outputs): preds['level_{}'.format(i)] = output + elif self.det_algorithm == "CT": + preds['maps'] = outputs[0] + preds['score'] = outputs[1] else: raise NotImplementedError @@ -260,7 +266,7 @@ class TextDetector(object): post_result = self.postprocess_op(preds, shape_list) dt_boxes = post_result[0]['points'] if (self.det_algorithm == "SAST" and self.det_sast_polygon) or ( - self.det_algorithm in ["PSE", "FCE"] and + self.det_algorithm in ["PSE", "FCE", "CT"] and self.postprocess_op.box_type == 'poly'): dt_boxes = self.filter_tag_det_res_only_clip(dt_boxes, ori_im.shape) else: @@ -289,7 +295,7 @@ if __name__ == "__main__": os.makedirs(draw_img_save) save_results = [] for image_file in image_file_list: - img, flag = check_and_read_gif(image_file) + img, flag, _ = check_and_read(image_file) if not flag: img = cv2.imread(image_file) if img is None: diff --git a/tools/infer/predict_e2e.py b/tools/infer/predict_e2e.py index fb2859f0c7e0d3aa0b87dbe11123dfc88f4b4e8e..de315d701c7172ded4d30e48e79abee367f42239 100755 --- a/tools/infer/predict_e2e.py +++ b/tools/infer/predict_e2e.py @@ -27,7 +27,7 @@ import sys import tools.infer.utility as utility from ppocr.utils.logging import get_logger -from ppocr.utils.utility import get_image_file_list, check_and_read_gif +from ppocr.utils.utility import get_image_file_list, check_and_read from ppocr.data import create_operators, transform from ppocr.postprocess import build_post_process @@ -148,7 +148,7 @@ if __name__ == "__main__": if not os.path.exists(draw_img_save): os.makedirs(draw_img_save) for image_file in image_file_list: - img, flag = check_and_read_gif(image_file) + img, flag, _ = check_and_read(image_file) if not flag: img = cv2.imread(image_file) if img is None: diff --git a/tools/infer/predict_rec.py b/tools/infer/predict_rec.py index 4e4150c515fc2d0ee4eb7e635cb8c81a467e748f..176e2c68e2c9b2e08f9b56378c45a57733faf8cd 100755 --- a/tools/infer/predict_rec.py +++ b/tools/infer/predict_rec.py @@ -30,7 +30,7 @@ import paddle import tools.infer.utility as utility from ppocr.postprocess import build_post_process from ppocr.utils.logging import get_logger -from ppocr.utils.utility import get_image_file_list, check_and_read_gif +from ppocr.utils.utility import get_image_file_list, check_and_read logger = get_logger() @@ -93,6 +93,13 @@ class TextRecognizer(object): "character_dict_path": args.rec_char_dict_path, "use_space_char": args.use_space_char } + elif self.rec_algorithm == "RobustScanner": + postprocess_params = { + 'name': 'SARLabelDecode', + "character_dict_path": args.rec_char_dict_path, + "use_space_char": args.use_space_char, + "rm_symbol": True + } self.postprocess_op = build_post_process(postprocess_params) self.predictor, self.input_tensor, self.output_tensors, self.config = \ utility.create_predictor(args, 'rec', logger) @@ -342,7 +349,14 @@ class TextRecognizer(object): for beg_img_no in range(0, img_num, batch_num): end_img_no = min(img_num, beg_img_no + batch_num) norm_img_batch = [] - imgC, imgH, imgW = self.rec_image_shape + if self.rec_algorithm == "SRN": + encoder_word_pos_list = [] + gsrm_word_pos_list = [] + gsrm_slf_attn_bias1_list = [] + gsrm_slf_attn_bias2_list = [] + if self.rec_algorithm == "SAR": + valid_ratios = [] + imgC, imgH, imgW = self.rec_image_shape[:3] max_wh_ratio = imgW / imgH # max_wh_ratio = 0 for ino in 
range(beg_img_no, end_img_no): @@ -350,22 +364,16 @@ class TextRecognizer(object): wh_ratio = w * 1.0 / h max_wh_ratio = max(max_wh_ratio, wh_ratio) for ino in range(beg_img_no, end_img_no): - if self.rec_algorithm == "SAR": norm_img, _, _, valid_ratio = self.resize_norm_img_sar( img_list[indices[ino]], self.rec_image_shape) norm_img = norm_img[np.newaxis, :] valid_ratio = np.expand_dims(valid_ratio, axis=0) - valid_ratios = [] valid_ratios.append(valid_ratio) norm_img_batch.append(norm_img) elif self.rec_algorithm == "SRN": norm_img = self.process_image_srn( img_list[indices[ino]], self.rec_image_shape, 8, 25) - encoder_word_pos_list = [] - gsrm_word_pos_list = [] - gsrm_slf_attn_bias1_list = [] - gsrm_slf_attn_bias2_list = [] encoder_word_pos_list.append(norm_img[1]) gsrm_word_pos_list.append(norm_img[2]) gsrm_slf_attn_bias1_list.append(norm_img[3]) @@ -390,6 +398,20 @@ class TextRecognizer(object): img_list[indices[ino]], self.rec_image_shape) norm_img = norm_img[np.newaxis, :] norm_img_batch.append(norm_img) + elif self.rec_algorithm == "RobustScanner": + norm_img, _, _, valid_ratio = self.resize_norm_img_sar( + img_list[indices[ino]], + self.rec_image_shape, + width_downsample_ratio=0.25) + norm_img = norm_img[np.newaxis, :] + valid_ratio = np.expand_dims(valid_ratio, axis=0) + valid_ratios = [] + valid_ratios.append(valid_ratio) + norm_img_batch.append(norm_img) + word_positions_list = [] + word_positions = np.array(range(0, 40)).astype('int64') + word_positions = np.expand_dims(word_positions, axis=0) + word_positions_list.append(word_positions) else: norm_img = self.resize_norm_img(img_list[indices[ino]], max_wh_ratio) @@ -439,8 +461,34 @@ class TextRecognizer(object): valid_ratios = np.concatenate(valid_ratios) inputs = [ norm_img_batch, - valid_ratios, + np.array( + [valid_ratios], dtype=np.float32), ] + if self.use_onnx: + input_dict = {} + input_dict[self.input_tensor.name] = norm_img_batch + outputs = self.predictor.run(self.output_tensors, + input_dict) + preds = outputs[0] + else: + input_names = self.predictor.get_input_names() + for i in range(len(input_names)): + input_tensor = self.predictor.get_input_handle( + input_names[i]) + input_tensor.copy_from_cpu(inputs[i]) + self.predictor.run() + outputs = [] + for output_tensor in self.output_tensors: + output = output_tensor.copy_to_cpu() + outputs.append(output) + if self.benchmark: + self.autolog.times.stamp() + preds = outputs[0] + elif self.rec_algorithm == "RobustScanner": + valid_ratios = np.concatenate(valid_ratios) + word_positions_list = np.concatenate(word_positions_list) + inputs = [norm_img_batch, valid_ratios, word_positions_list] + if self.use_onnx: input_dict = {} input_dict[self.input_tensor.name] = norm_img_batch @@ -506,7 +554,7 @@ def main(args): res = text_recognizer([img] * int(args.rec_batch_num)) for image_file in image_file_list: - img, flag = check_and_read_gif(image_file) + img, flag, _ = check_and_read(image_file) if not flag: img = cv2.imread(image_file) if img is None: diff --git a/tools/infer/predict_sr.py b/tools/infer/predict_sr.py new file mode 100755 index 0000000000000000000000000000000000000000..ca99f6819f4b207ecc0f0d1383fe1d26d07fbf50 --- /dev/null +++ b/tools/infer/predict_sr.py @@ -0,0 +1,155 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import sys +from PIL import Image +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.insert(0, __dir__) +sys.path.insert(0, os.path.abspath(os.path.join(__dir__, '../..'))) + +os.environ["FLAGS_allocator_strategy"] = 'auto_growth' + +import cv2 +import numpy as np +import math +import time +import traceback +import paddle + +import tools.infer.utility as utility +from ppocr.postprocess import build_post_process +from ppocr.utils.logging import get_logger +from ppocr.utils.utility import get_image_file_list, check_and_read + +logger = get_logger() + + +class TextSR(object): + def __init__(self, args): + self.sr_image_shape = [int(v) for v in args.sr_image_shape.split(",")] + self.sr_batch_num = args.sr_batch_num + + self.predictor, self.input_tensor, self.output_tensors, self.config = \ + utility.create_predictor(args, 'sr', logger) + self.benchmark = args.benchmark + if args.benchmark: + import auto_log + pid = os.getpid() + gpu_id = utility.get_infer_gpuid() + self.autolog = auto_log.AutoLogger( + model_name="sr", + model_precision=args.precision, + batch_size=args.sr_batch_num, + data_shape="dynamic", + save_path=None, #args.save_log_path, + inference_config=self.config, + pids=pid, + process_name=None, + gpu_ids=gpu_id if args.use_gpu else None, + time_keys=[ + 'preprocess_time', 'inference_time', 'postprocess_time' + ], + warmup=0, + logger=logger) + + def resize_norm_img(self, img): + imgC, imgH, imgW = self.sr_image_shape + img = img.resize((imgW // 2, imgH // 2), Image.BICUBIC) + img_numpy = np.array(img).astype("float32") + img_numpy = img_numpy.transpose((2, 0, 1)) / 255 + return img_numpy + + def __call__(self, img_list): + img_num = len(img_list) + batch_num = self.sr_batch_num + st = time.time() + st = time.time() + all_result = [] * img_num + if self.benchmark: + self.autolog.times.start() + for beg_img_no in range(0, img_num, batch_num): + end_img_no = min(img_num, beg_img_no + batch_num) + norm_img_batch = [] + imgC, imgH, imgW = self.sr_image_shape + for ino in range(beg_img_no, end_img_no): + norm_img = self.resize_norm_img(img_list[ino]) + norm_img = norm_img[np.newaxis, :] + norm_img_batch.append(norm_img) + + norm_img_batch = np.concatenate(norm_img_batch) + norm_img_batch = norm_img_batch.copy() + if self.benchmark: + self.autolog.times.stamp() + self.input_tensor.copy_from_cpu(norm_img_batch) + self.predictor.run() + outputs = [] + for output_tensor in self.output_tensors: + output = output_tensor.copy_to_cpu() + outputs.append(output) + if len(outputs) != 1: + preds = outputs + else: + preds = outputs[0] + all_result.append(outputs) + if self.benchmark: + self.autolog.times.end(stamp=True) + return all_result, time.time() - st + + +def main(args): + image_file_list = get_image_file_list(args.image_dir) + text_recognizer = TextSR(args) + valid_image_file_list = [] + img_list = [] + + # warmup 2 times + if args.warmup: + img = np.random.uniform(0, 255, [16, 64, 3]).astype(np.uint8) + for i in range(2): + res = text_recognizer([img] * int(args.sr_batch_num)) + + for image_file in image_file_list: + img, flag, _ = 
check_and_read(image_file) + if not flag: + img = Image.open(image_file).convert("RGB") + if img is None: + logger.info("error in loading image:{}".format(image_file)) + continue + valid_image_file_list.append(image_file) + img_list.append(img) + try: + preds, _ = text_recognizer(img_list) + for beg_no in range(len(preds)): + sr_img = preds[beg_no][1] + lr_img = preds[beg_no][0] + for i in (range(sr_img.shape[0])): + fm_sr = (sr_img[i] * 255).transpose(1, 2, 0).astype(np.uint8) + fm_lr = (lr_img[i] * 255).transpose(1, 2, 0).astype(np.uint8) + img_name_pure = os.path.split(valid_image_file_list[ + beg_no * args.sr_batch_num + i])[-1] + cv2.imwrite("infer_result/sr_{}".format(img_name_pure), + fm_sr[:, :, ::-1]) + logger.info("The visualized image saved in infer_result/sr_{}". + format(img_name_pure)) + + except Exception as E: + logger.info(traceback.format_exc()) + logger.info(E) + exit() + if args.benchmark: + text_recognizer.autolog.report() + + +if __name__ == "__main__": + main(utility.parse_args()) diff --git a/tools/infer/predict_system.py b/tools/infer/predict_system.py index 625d365f45c578d051974d7174e26246e9bc2442..e0f2c41fa2aba23491efee920afbd76db1ec84e0 100755 --- a/tools/infer/predict_system.py +++ b/tools/infer/predict_system.py @@ -32,7 +32,7 @@ import tools.infer.utility as utility import tools.infer.predict_rec as predict_rec import tools.infer.predict_det as predict_det import tools.infer.predict_cls as predict_cls -from ppocr.utils.utility import get_image_file_list, check_and_read_gif +from ppocr.utils.utility import get_image_file_list, check_and_read from ppocr.utils.logging import get_logger from tools.infer.utility import draw_ocr_box_txt, get_rotate_crop_image logger = get_logger() @@ -65,9 +65,11 @@ class TextSystem(object): self.crop_image_res_index += bbox_num def __call__(self, img, cls=True): + time_dict = {'det': 0, 'rec': 0, 'csl': 0, 'all': 0} + start = time.time() ori_im = img.copy() dt_boxes, elapse = self.text_detector(img) - + time_dict['det'] = elapse logger.debug("dt_boxes num : {}, elapse : {}".format( len(dt_boxes), elapse)) if dt_boxes is None: @@ -83,10 +85,12 @@ class TextSystem(object): if self.use_angle_cls and cls: img_crop_list, angle_list, elapse = self.text_classifier( img_crop_list) + time_dict['cls'] = elapse logger.debug("cls num : {}, elapse : {}".format( len(img_crop_list), elapse)) rec_res, elapse = self.text_recognizer(img_crop_list) + time_dict['rec'] = elapse logger.debug("rec_res num : {}, elapse : {}".format( len(rec_res), elapse)) if self.args.save_crop_res: @@ -98,7 +102,9 @@ class TextSystem(object): if score >= self.drop_score: filter_boxes.append(box) filter_rec_res.append(rec_result) - return filter_boxes, filter_rec_res + end = time.time() + time_dict['all'] = end - start + return filter_boxes, filter_rec_res, time_dict def sorted_boxes(dt_boxes): @@ -114,11 +120,14 @@ def sorted_boxes(dt_boxes): _boxes = list(sorted_boxes) for i in range(num_boxes - 1): - if abs(_boxes[i + 1][0][1] - _boxes[i][0][1]) < 10 and \ - (_boxes[i + 1][0][0] < _boxes[i][0][0]): - tmp = _boxes[i] - _boxes[i] = _boxes[i + 1] - _boxes[i + 1] = tmp + for j in range(i, 0, -1): + if abs(_boxes[j + 1][0][1] - _boxes[j][0][1]) < 10 and \ + (_boxes[j + 1][0][0] < _boxes[j][0][0]): + tmp = _boxes[j] + _boxes[j] = _boxes[j + 1] + _boxes[j + 1] = tmp + else: + break return _boxes @@ -133,9 +142,11 @@ def main(args): os.makedirs(draw_img_save_dir, exist_ok=True) save_results = [] - logger.info("In PP-OCRv3, rec_image_shape parameter defaults to '3, 48, 320', " 
- "if you are using recognition model with PP-OCRv2 or an older version, please set --rec_image_shape='3,32,320") - + logger.info( + "In PP-OCRv3, rec_image_shape parameter defaults to '3, 48, 320', " + "if you are using recognition model with PP-OCRv2 or an older version, please set --rec_image_shape='3,32,320" + ) + # warm up 10 times if args.warmup: img = np.random.uniform(0, 255, [640, 640, 3]).astype(np.uint8) @@ -148,14 +159,14 @@ def main(args): count = 0 for idx, image_file in enumerate(image_file_list): - img, flag = check_and_read_gif(image_file) + img, flag, _ = check_and_read(image_file) if not flag: img = cv2.imread(image_file) if img is None: logger.debug("error in loading image:{}".format(image_file)) continue starttime = time.time() - dt_boxes, rec_res = text_sys(img) + dt_boxes, rec_res, time_dict = text_sys(img) elapse = time.time() - starttime total_time += elapse @@ -198,7 +209,10 @@ def main(args): text_sys.text_detector.autolog.report() text_sys.text_recognizer.autolog.report() - with open(os.path.join(draw_img_save_dir, "system_results.txt"), 'w', encoding='utf-8') as f: + with open( + os.path.join(draw_img_save_dir, "system_results.txt"), + 'w', + encoding='utf-8') as f: f.writelines(save_results) diff --git a/tools/infer/utility.py b/tools/infer/utility.py index 9345106e774cfbcf0e87a7cf5d8b6cdabb4cf490..07b2172cd3c6a624d4b1026163dcb811edebde02 100644 --- a/tools/infer/utility.py +++ b/tools/infer/utility.py @@ -23,6 +23,7 @@ from PIL import Image, ImageDraw, ImageFont import math from paddle import inference import time +import random from ppocr.utils.logging import get_logger @@ -35,6 +36,7 @@ def init_args(): # params for prediction engine parser.add_argument("--use_gpu", type=str2bool, default=True) parser.add_argument("--use_xpu", type=str2bool, default=False) + parser.add_argument("--use_npu", type=str2bool, default=False) parser.add_argument("--ir_optim", type=str2bool, default=True) parser.add_argument("--use_tensorrt", type=str2bool, default=False) parser.add_argument("--min_subgraph_size", type=int, default=15) @@ -121,6 +123,11 @@ def init_args(): parser.add_argument("--use_pdserving", type=str2bool, default=False) parser.add_argument("--warmup", type=str2bool, default=False) + # SR parmas + parser.add_argument("--sr_model_dir", type=str) + parser.add_argument("--sr_image_shape", type=str, default="3, 32, 128") + parser.add_argument("--sr_batch_num", type=int, default=1) + # parser.add_argument( "--draw_img_save_dir", type=str, default="./inference_results") @@ -156,6 +163,10 @@ def create_predictor(args, mode, logger): model_dir = args.table_model_dir elif mode == 'ser': model_dir = args.ser_model_dir + elif mode == "sr": + model_dir = args.sr_model_dir + elif mode == 'layout': + model_dir = args.layout_model_dir else: model_dir = args.e2e_model_dir @@ -172,14 +183,21 @@ def create_predictor(args, mode, logger): return sess, sess.get_inputs()[0], None, None else: - model_file_path = model_dir + "/inference.pdmodel" - params_file_path = model_dir + "/inference.pdiparams" + file_names = ['model', 'inference'] + for file_name in file_names: + model_file_path = '{}/{}.pdmodel'.format(model_dir, file_name) + params_file_path = '{}/{}.pdiparams'.format(model_dir, file_name) + if os.path.exists(model_file_path) and os.path.exists( + params_file_path): + break if not os.path.exists(model_file_path): - raise ValueError("not find model file path {}".format( - model_file_path)) + raise ValueError( + "not find model.pdmodel or inference.pdmodel in {}".format( + 
model_dir)) if not os.path.exists(params_file_path): - raise ValueError("not find params file path {}".format( - params_file_path)) + raise ValueError( + "not find model.pdiparams or inference.pdiparams in {}".format( + model_dir)) config = inference.Config(model_file_path, params_file_path) @@ -205,116 +223,45 @@ def create_predictor(args, mode, logger): workspace_size=1 << 30, precision_mode=precision, max_batch_size=args.max_batch_size, - min_subgraph_size=args.min_subgraph_size, # skip the minmum trt subgraph + min_subgraph_size=args. + min_subgraph_size, # skip the minmum trt subgraph use_calib_mode=False) - - # collect shape - if args.shape_info_filename is not None: - if not os.path.exists(args.shape_info_filename): - config.collect_shape_range_info(args.shape_info_filename) - logger.info(f"collect dynamic shape info into : {args.shape_info_filename}") + + # collect shape + trt_shape_f = f"{os.path.dirname(args.shape_info_filename)}/{mode}_{os.path.basename(args.shape_info_filename)}" + if trt_shape_f is not None: + if not os.path.exists(trt_shape_f): + config.collect_shape_range_info(trt_shape_f) + logger.info( + f"collect dynamic shape info into : {trt_shape_f}" + ) + else: + logger.info( + f"dynamic shape info file( {trt_shape_f} ) already exists, not need to generate again." + ) + config.enable_tuned_tensorrt_dynamic_shape(trt_shape_f, True) else: - logger.info(f"dynamic shape info file( {args.shape_info_filename} ) already exists, not need to generate again.") - config.enable_tuned_tensorrt_dynamic_shape(args.shape_info_filename, True) - - use_dynamic_shape = True - if mode == "det": - min_input_shape = { - "x": [1, 3, 50, 50], - "conv2d_92.tmp_0": [1, 120, 20, 20], - "conv2d_91.tmp_0": [1, 24, 10, 10], - "conv2d_59.tmp_0": [1, 96, 20, 20], - "nearest_interp_v2_1.tmp_0": [1, 256, 10, 10], - "nearest_interp_v2_2.tmp_0": [1, 256, 20, 20], - "conv2d_124.tmp_0": [1, 256, 20, 20], - "nearest_interp_v2_3.tmp_0": [1, 64, 20, 20], - "nearest_interp_v2_4.tmp_0": [1, 64, 20, 20], - "nearest_interp_v2_5.tmp_0": [1, 64, 20, 20], - "elementwise_add_7": [1, 56, 2, 2], - "nearest_interp_v2_0.tmp_0": [1, 256, 2, 2] - } - max_input_shape = { - "x": [1, 3, 1536, 1536], - "conv2d_92.tmp_0": [1, 120, 400, 400], - "conv2d_91.tmp_0": [1, 24, 200, 200], - "conv2d_59.tmp_0": [1, 96, 400, 400], - "nearest_interp_v2_1.tmp_0": [1, 256, 200, 200], - "conv2d_124.tmp_0": [1, 256, 400, 400], - "nearest_interp_v2_2.tmp_0": [1, 256, 400, 400], - "nearest_interp_v2_3.tmp_0": [1, 64, 400, 400], - "nearest_interp_v2_4.tmp_0": [1, 64, 400, 400], - "nearest_interp_v2_5.tmp_0": [1, 64, 400, 400], - "elementwise_add_7": [1, 56, 400, 400], - "nearest_interp_v2_0.tmp_0": [1, 256, 400, 400] - } - opt_input_shape = { - "x": [1, 3, 640, 640], - "conv2d_92.tmp_0": [1, 120, 160, 160], - "conv2d_91.tmp_0": [1, 24, 80, 80], - "conv2d_59.tmp_0": [1, 96, 160, 160], - "nearest_interp_v2_1.tmp_0": [1, 256, 80, 80], - "nearest_interp_v2_2.tmp_0": [1, 256, 160, 160], - "conv2d_124.tmp_0": [1, 256, 160, 160], - "nearest_interp_v2_3.tmp_0": [1, 64, 160, 160], - "nearest_interp_v2_4.tmp_0": [1, 64, 160, 160], - "nearest_interp_v2_5.tmp_0": [1, 64, 160, 160], - "elementwise_add_7": [1, 56, 40, 40], - "nearest_interp_v2_0.tmp_0": [1, 256, 40, 40] - } - min_pact_shape = { - "nearest_interp_v2_26.tmp_0": [1, 256, 20, 20], - "nearest_interp_v2_27.tmp_0": [1, 64, 20, 20], - "nearest_interp_v2_28.tmp_0": [1, 64, 20, 20], - "nearest_interp_v2_29.tmp_0": [1, 64, 20, 20] - } - max_pact_shape = { - "nearest_interp_v2_26.tmp_0": [1, 256, 400, 
400], - "nearest_interp_v2_27.tmp_0": [1, 64, 400, 400], - "nearest_interp_v2_28.tmp_0": [1, 64, 400, 400], - "nearest_interp_v2_29.tmp_0": [1, 64, 400, 400] - } - opt_pact_shape = { - "nearest_interp_v2_26.tmp_0": [1, 256, 160, 160], - "nearest_interp_v2_27.tmp_0": [1, 64, 160, 160], - "nearest_interp_v2_28.tmp_0": [1, 64, 160, 160], - "nearest_interp_v2_29.tmp_0": [1, 64, 160, 160] - } - min_input_shape.update(min_pact_shape) - max_input_shape.update(max_pact_shape) - opt_input_shape.update(opt_pact_shape) - elif mode == "rec": - if args.rec_algorithm not in ["CRNN", "SVTR_LCNet"]: - use_dynamic_shape = False - imgH = int(args.rec_image_shape.split(',')[-2]) - min_input_shape = {"x": [1, 3, imgH, 10]} - max_input_shape = {"x": [args.rec_batch_num, 3, imgH, 2304]} - opt_input_shape = {"x": [args.rec_batch_num, 3, imgH, 320]} - config.exp_disable_tensorrt_ops(["transpose2"]) - elif mode == "cls": - min_input_shape = {"x": [1, 3, 48, 10]} - max_input_shape = {"x": [args.rec_batch_num, 3, 48, 1024]} - opt_input_shape = {"x": [args.rec_batch_num, 3, 48, 320]} - else: - use_dynamic_shape = False - if use_dynamic_shape: - config.set_trt_dynamic_shape_info( - min_input_shape, max_input_shape, opt_input_shape) + logger.info( + f"when using tensorrt, dynamic shape is a suggested option, you can use '--shape_info_filename=shape.txt' for offline dygnamic shape tuning" + ) + elif args.use_npu: + config.enable_npu() elif args.use_xpu: config.enable_xpu(10 * 1024 * 1024) else: config.disable_gpu() - if hasattr(args, "cpu_threads"): - config.set_cpu_math_library_num_threads(args.cpu_threads) - else: - # default cpu threads as 10 - config.set_cpu_math_library_num_threads(10) if args.enable_mkldnn: # cache 10 different shapes for mkldnn to avoid memory leak config.set_mkldnn_cache_capacity(10) config.enable_mkldnn() if args.precision == "fp16": config.enable_mkldnn_bfloat16() + if hasattr(args, "cpu_threads"): + config.set_cpu_math_library_num_threads(args.cpu_threads) + else: + # default cpu threads as 10 + config.set_cpu_math_library_num_threads(10) # enable memory optim config.enable_memory_optim() config.disable_glog_info() @@ -453,56 +400,81 @@ def draw_ocr(image, def draw_ocr_box_txt(image, boxes, - txts, + txts=None, scores=None, drop_score=0.5, - font_path="./doc/simfang.ttf"): + font_path="./doc/fonts/simfang.ttf"): h, w = image.height, image.width img_left = image.copy() - img_right = Image.new('RGB', (w, h), (255, 255, 255)) - - import random - + img_right = np.ones((h, w, 3), dtype=np.uint8) * 255 random.seed(0) + draw_left = ImageDraw.Draw(img_left) - draw_right = ImageDraw.Draw(img_right) + if txts is None or len(txts) != len(boxes): + txts = [None] * len(boxes) for idx, (box, txt) in enumerate(zip(boxes, txts)): if scores is not None and scores[idx] < drop_score: continue color = (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255)) draw_left.polygon(box, fill=color) - draw_right.polygon( - [ - box[0][0], box[0][1], box[1][0], box[1][1], box[2][0], - box[2][1], box[3][0], box[3][1] - ], - outline=color) - box_height = math.sqrt((box[0][0] - box[3][0])**2 + (box[0][1] - box[3][ - 1])**2) - box_width = math.sqrt((box[0][0] - box[1][0])**2 + (box[0][1] - box[1][ - 1])**2) - if box_height > 2 * box_width: - font_size = max(int(box_width * 0.9), 10) - font = ImageFont.truetype(font_path, font_size, encoding="utf-8") - cur_y = box[0][1] - for c in txt: - char_size = font.getsize(c) - draw_right.text( - (box[0][0] + 3, cur_y), c, fill=(0, 0, 0), font=font) - cur_y += 
char_size[1] - else: - font_size = max(int(box_height * 0.8), 10) - font = ImageFont.truetype(font_path, font_size, encoding="utf-8") - draw_right.text( - [box[0][0], box[0][1]], txt, fill=(0, 0, 0), font=font) + img_right_text = draw_box_txt_fine((w, h), box, txt, font_path) + pts = np.array(box, np.int32).reshape((-1, 1, 2)) + cv2.polylines(img_right_text, [pts], True, color, 1) + img_right = cv2.bitwise_and(img_right, img_right_text) img_left = Image.blend(image, img_left, 0.5) img_show = Image.new('RGB', (w * 2, h), (255, 255, 255)) img_show.paste(img_left, (0, 0, w, h)) - img_show.paste(img_right, (w, 0, w * 2, h)) + img_show.paste(Image.fromarray(img_right), (w, 0, w * 2, h)) return np.array(img_show) +def draw_box_txt_fine(img_size, box, txt, font_path="./doc/fonts/simfang.ttf"): + box_height = int( + math.sqrt((box[0][0] - box[3][0])**2 + (box[0][1] - box[3][1])**2)) + box_width = int( + math.sqrt((box[0][0] - box[1][0])**2 + (box[0][1] - box[1][1])**2)) + + if box_height > 2 * box_width and box_height > 30: + img_text = Image.new('RGB', (box_height, box_width), (255, 255, 255)) + draw_text = ImageDraw.Draw(img_text) + if txt: + font = create_font(txt, (box_height, box_width), font_path) + draw_text.text([0, 0], txt, fill=(0, 0, 0), font=font) + img_text = img_text.transpose(Image.ROTATE_270) + else: + img_text = Image.new('RGB', (box_width, box_height), (255, 255, 255)) + draw_text = ImageDraw.Draw(img_text) + if txt: + font = create_font(txt, (box_width, box_height), font_path) + draw_text.text([0, 0], txt, fill=(0, 0, 0), font=font) + + pts1 = np.float32( + [[0, 0], [box_width, 0], [box_width, box_height], [0, box_height]]) + pts2 = np.array(box, dtype=np.float32) + M = cv2.getPerspectiveTransform(pts1, pts2) + + img_text = np.array(img_text, dtype=np.uint8) + img_right_text = cv2.warpPerspective( + img_text, + M, + img_size, + flags=cv2.INTER_NEAREST, + borderMode=cv2.BORDER_CONSTANT, + borderValue=(255, 255, 255)) + return img_right_text + + +def create_font(txt, sz, font_path="./doc/fonts/simfang.ttf"): + font_size = int(sz[1] * 0.99) + font = ImageFont.truetype(font_path, font_size, encoding="utf-8") + length = font.getsize(txt)[0] + if length > sz[0]: + font_size = int(font_size * sz[0] / length) + font = ImageFont.truetype(font_path, font_size, encoding="utf-8") + return font + + def str_count(s): """ Count the number of Chinese characters, @@ -606,7 +578,7 @@ def text_visual(texts, def base64_to_cv2(b64str): import base64 data = base64.b64decode(b64str.encode('utf8')) - data = np.fromstring(data, np.uint8) + data = np.frombuffer(data, np.uint8) data = cv2.imdecode(data, cv2.IMREAD_COLOR) return data diff --git a/tools/infer_e2e.py b/tools/infer_e2e.py index d3e6b28fca0a3ff32ea940747712d6c71aa290fd..37fdcbaadc2984c9cf4fb105b7122db31b99be30 100755 --- a/tools/infer_e2e.py +++ b/tools/infer_e2e.py @@ -37,6 +37,46 @@ from ppocr.postprocess import build_post_process from ppocr.utils.save_load import load_model from ppocr.utils.utility import get_image_file_list import tools.program as program +from PIL import Image, ImageDraw, ImageFont +import math + + +def draw_e2e_res_for_chinese(image, + boxes, + txts, + config, + img_name, + font_path="./doc/simfang.ttf"): + h, w = image.height, image.width + img_left = image.copy() + img_right = Image.new('RGB', (w, h), (255, 255, 255)) + + import random + + random.seed(0) + draw_left = ImageDraw.Draw(img_left) + draw_right = ImageDraw.Draw(img_right) + for idx, (box, txt) in enumerate(zip(boxes, txts)): + box = np.array(box) + box = 
[tuple(x) for x in box] + color = (random.randint(0, 255), random.randint(0, 255), + random.randint(0, 255)) + draw_left.polygon(box, fill=color) + draw_right.polygon(box, outline=color) + font = ImageFont.truetype(font_path, 15, encoding="utf-8") + draw_right.text([box[0][0], box[0][1]], txt, fill=(0, 0, 0), font=font) + img_left = Image.blend(image, img_left, 0.5) + img_show = Image.new('RGB', (w * 2, h), (255, 255, 255)) + img_show.paste(img_left, (0, 0, w, h)) + img_show.paste(img_right, (w, 0, w * 2, h)) + + save_e2e_path = os.path.dirname(config['Global'][ + 'save_res_path']) + "/e2e_results/" + if not os.path.exists(save_e2e_path): + os.makedirs(save_e2e_path) + save_path = os.path.join(save_e2e_path, os.path.basename(img_name)) + cv2.imwrite(save_path, np.array(img_show)[:, :, ::-1]) + logger.info("The e2e Image saved in {}".format(save_path)) def draw_e2e_res(dt_boxes, strs, config, img, img_name): @@ -113,7 +153,19 @@ def main(): otstr = file + "\t" + json.dumps(dt_boxes_json) + "\n" fout.write(otstr.encode()) src_img = cv2.imread(file) - draw_e2e_res(points, strs, config, src_img, file) + if global_config['infer_visual_type'] == 'EN': + draw_e2e_res(points, strs, config, src_img, file) + elif global_config['infer_visual_type'] == 'CN': + src_img = Image.fromarray( + cv2.cvtColor(src_img, cv2.COLOR_BGR2RGB)) + draw_e2e_res_for_chinese( + src_img, + points, + strs, + config, + file, + font_path="./doc/fonts/simfang.ttf") + logger.info("success!") diff --git a/tools/infer_kie.py b/tools/infer_kie.py index 346e2e0aeeee695ab49577b6b13dcc058150df1a..9375434cc887b08dfa746420a6c73c58c6e04797 100755 --- a/tools/infer_kie.py +++ b/tools/infer_kie.py @@ -88,6 +88,29 @@ def draw_kie_result(batch, node, idx_to_cls, count): cv2.imwrite(save_path, vis_img) logger.info("The Kie Image saved in {}".format(save_path)) +def write_kie_result(fout, node, data): + """ + Write infer result to output file, sorted by the predict label of each line. + The format keeps the same as the input with additional score attribute. 
+ """ + import json + label = data['label'] + annotations = json.loads(label) + max_value, max_idx = paddle.max(node, -1), paddle.argmax(node, -1) + node_pred_label = max_idx.numpy().tolist() + node_pred_score = max_value.numpy().tolist() + res = [] + for i, label in enumerate(node_pred_label): + pred_score = '{:.2f}'.format(node_pred_score[i]) + pred_res = { + 'label': label, + 'transcription': annotations[i]['transcription'], + 'score': pred_score, + 'points': annotations[i]['points'], + } + res.append(pred_res) + res.sort(key=lambda x: x['label']) + fout.writelines([json.dumps(res, ensure_ascii=False) + '\n']) def main(): global_config = config['Global'] @@ -114,7 +137,7 @@ def main(): warmup_times = 0 count_t = [] - with open(save_res_path, "wb") as fout: + with open(save_res_path, "w") as fout: with open(config['Global']['infer_img'], "rb") as f: lines = f.readlines() for index, data_line in enumerate(lines): @@ -139,6 +162,8 @@ def main(): node = F.softmax(node, -1) count_t.append(time.time() - st) draw_kie_result(batch, node, idx_to_cls, index) + write_kie_result(fout, node, data) + fout.close() logger.info("success!") logger.info("It took {} s for predict {} images.".format( np.sum(count_t), len(count_t))) diff --git a/tools/infer_vqa_token_ser.py b/tools/infer_kie_token_ser.py similarity index 97% rename from tools/infer_vqa_token_ser.py rename to tools/infer_kie_token_ser.py index 0173a554cace31e20ab47dbe36d132a4dbb2127b..2fc5749b9c10b9c89bc16e561fbe9c5ce58eb13c 100755 --- a/tools/infer_vqa_token_ser.py +++ b/tools/infer_kie_token_ser.py @@ -75,6 +75,8 @@ class SerPredictor(object): self.ocr_engine = PaddleOCR( use_angle_cls=False, show_log=False, + rec_model_dir=global_config.get("kie_rec_model_dir", None), + det_model_dir=global_config.get("kie_det_model_dir", None), use_gpu=global_config['use_gpu']) # create data ops @@ -104,8 +106,6 @@ class SerPredictor(object): batch = transform(data, self.ops) batch = to_tensor(batch) preds = self.model(batch) - if self.algorithm in ['LayoutLMv2', 'LayoutXLM']: - preds = preds[0] post_result = self.post_process_class( preds, segment_offset_ids=batch[6], ocr_infos=batch[7]) diff --git a/tools/infer_vqa_token_ser_re.py b/tools/infer_kie_token_ser_re.py similarity index 97% rename from tools/infer_vqa_token_ser_re.py rename to tools/infer_kie_token_ser_re.py index 51378bdaeb03d4ec6d7684de80625c5029963745..3ee696f28470a16205be628b3aeb586ef7a9c6a6 100755 --- a/tools/infer_vqa_token_ser_re.py +++ b/tools/infer_kie_token_ser_re.py @@ -39,7 +39,7 @@ from ppocr.utils.visual import draw_re_results from ppocr.utils.logging import get_logger from ppocr.utils.utility import get_image_file_list, load_vqa_bio_label_maps, print_dict from tools.program import ArgsParser, load_config, merge_config -from tools.infer_vqa_token_ser import SerPredictor +from tools.infer_kie_token_ser import SerPredictor class ReArgsParser(ArgsParser): @@ -205,9 +205,7 @@ if __name__ == '__main__': result = ser_re_engine(data) result = result[0] fout.write(img_path + "\t" + json.dumps( - { - "ser_result": result, - }, ensure_ascii=False) + "\n") + result, ensure_ascii=False) + "\n") img_res = draw_re_results(img_path, result) cv2.imwrite(save_img_path, img_res) diff --git a/tools/infer_rec.py b/tools/infer_rec.py index 182694e6cda12ead0e263bb94a7d6483a6f7f212..14b14544eb11e9fb0a0c2cdf92aff9d7cb4b5ba7 100755 --- a/tools/infer_rec.py +++ b/tools/infer_rec.py @@ -96,6 +96,8 @@ def main(): ] elif config['Architecture']['algorithm'] == "SAR": op[op_name]['keep_keys'] = ['image', 
'valid_ratio'] + elif config['Architecture']['algorithm'] == "RobustScanner": + op[op_name]['keep_keys'] = ['image', 'valid_ratio', 'word_positons'] else: op[op_name]['keep_keys'] = ['image'] transforms.append(op) @@ -131,12 +133,20 @@ def main(): if config['Architecture']['algorithm'] == "SAR": valid_ratio = np.expand_dims(batch[-1], axis=0) img_metas = [paddle.to_tensor(valid_ratio)] + if config['Architecture']['algorithm'] == "RobustScanner": + valid_ratio = np.expand_dims(batch[1], axis=0) + word_positons = np.expand_dims(batch[2], axis=0) + img_metas = [paddle.to_tensor(valid_ratio), + paddle.to_tensor(word_positons), + ] images = np.expand_dims(batch[0], axis=0) images = paddle.to_tensor(images) if config['Architecture']['algorithm'] == "SRN": preds = model(images, others) elif config['Architecture']['algorithm'] == "SAR": preds = model(images, img_metas) + elif config['Architecture']['algorithm'] == "RobustScanner": + preds = model(images, img_metas) else: preds = model(images) post_result = post_process_class(preds) diff --git a/tools/infer_sr.py b/tools/infer_sr.py new file mode 100755 index 0000000000000000000000000000000000000000..df4334f3427e57b9062dd819aa16c110fd771e8c --- /dev/null +++ b/tools/infer_sr.py @@ -0,0 +1,100 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +import os +import sys +import json +from PIL import Image +import cv2 + +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.insert(0, __dir__) +sys.path.insert(0, os.path.abspath(os.path.join(__dir__, '..'))) + +os.environ["FLAGS_allocator_strategy"] = 'auto_growth' + +import paddle + +from ppocr.data import create_operators, transform +from ppocr.modeling.architectures import build_model +from ppocr.postprocess import build_post_process +from ppocr.utils.save_load import load_model +from ppocr.utils.utility import get_image_file_list +import tools.program as program + + +def main(): + global_config = config['Global'] + + # build post process + post_process_class = build_post_process(config['PostProcess'], + global_config) + + # sr transform + config['Architecture']["Transform"]['infer_mode'] = True + + model = build_model(config['Architecture']) + + load_model(config, model) + + # create data ops + transforms = [] + for op in config['Eval']['dataset']['transforms']: + op_name = list(op)[0] + if 'Label' in op_name: + continue + elif op_name in ['SRResize']: + op[op_name]['infer_mode'] = True + elif op_name == 'KeepKeys': + op[op_name]['keep_keys'] = ['img_lr'] + transforms.append(op) + global_config['infer_mode'] = True + ops = create_operators(transforms, global_config) + + save_visual_path = config['Global'].get('save_visual', "infer_result/") + if not os.path.exists(save_visual_path): + os.makedirs(save_visual_path) + + model.eval() + for file in get_image_file_list(config['Global']['infer_img']): + logger.info("infer_img: {}".format(file)) + img = Image.open(file).convert("RGB") + data = {'image_lr': img} + batch = transform(data, ops) + images = np.expand_dims(batch[0], axis=0) + images = paddle.to_tensor(images) + + preds = model(images) + sr_img = preds["sr_img"][0] + lr_img = preds["lr_img"][0] + fm_sr = (sr_img.numpy() * 255).transpose(1, 2, 0).astype(np.uint8) + fm_lr = (lr_img.numpy() * 255).transpose(1, 2, 0).astype(np.uint8) + img_name_pure = os.path.split(file)[-1] + cv2.imwrite("{}/sr_{}".format(save_visual_path, img_name_pure), + fm_sr[:, :, ::-1]) + logger.info("The visualized image is saved in {}/sr_{}".format( + save_visual_path, img_name_pure)) + + logger.info("success!") + + +if __name__ == '__main__': + config, device, logger, vdl_writer = program.preprocess() + main() diff --git a/tools/infer_table.py b/tools/infer_table.py index 6c02dd8640c9345c267e56d6e5a0c14bde121b7e..6dde5d67d061f4d0593928759db34bb9b22cde0d 100644 --- a/tools/infer_table.py +++ b/tools/infer_table.py @@ -37,6 +37,7 @@ from ppocr.postprocess import build_post_process from ppocr.utils.save_load import load_model from ppocr.utils.utility import get_image_file_list from ppocr.utils.visual import draw_rectangle +from tools.infer.utility import draw_boxes import tools.program as program import cv2 @@ -56,7 +57,6 @@ def main(config, device, logger, vdl_writer): model = build_model(config['Architecture']) algorithm = config['Architecture']['algorithm'] - use_xywh = algorithm in ['TableMaster'] load_model(config, model) @@ -106,9 +106,13 @@ def main(config, device, logger, vdl_writer): f_w.write("result: {}, {}\n".format(structure_str_list, bbox_list_str)) - img = draw_rectangle(file, bbox_list, use_xywh) + if len(bbox_list) > 0 and len(bbox_list[0]) == 4: + img = draw_rectangle(file, bbox_list) + else: + img = 
draw_boxes(cv2.imread(file), bbox_list) cv2.imwrite( os.path.join(save_res_path, os.path.basename(file)), img) + logger.info('save result to {}'.format(save_res_path)) logger.info("success!") diff --git a/tools/program.py b/tools/program.py index d799a7e656ccea1d9b7476d56edb9fe7dcf7efe4..9117d51b95b343c46982f212d4e5faa069b7b44a 100755 --- a/tools/program.py +++ b/tools/program.py @@ -25,6 +25,8 @@ import datetime import paddle import paddle.distributed as dist from tqdm import tqdm +import cv2 +import numpy as np from argparse import ArgumentParser, RawDescriptionHelpFormatter from ppocr.utils.stats import TrainingStats @@ -112,7 +114,7 @@ def merge_config(config, opts): return config -def check_device(use_gpu, use_xpu=False): +def check_device(use_gpu, use_xpu=False, use_npu=False): """ Log error and exit when set use_gpu=true in paddlepaddle cpu version. @@ -132,24 +134,8 @@ def check_device(use_gpu, use_xpu=False): if use_xpu and not paddle.device.is_compiled_with_xpu(): print(err.format("use_xpu", "xpu", "xpu", "use_xpu")) sys.exit(1) - except Exception as e: - pass - - -def check_xpu(use_xpu): - """ - Log error and exit when set use_xpu=true in paddlepaddle - cpu/gpu version. - """ - err = "Config use_xpu cannot be set as true while you are " \ - "using paddlepaddle cpu/gpu version ! \nPlease try: \n" \ - "\t1. Install paddlepaddle-xpu to run model on XPU \n" \ - "\t2. Set use_xpu as false in config file to run " \ - "model on CPU/GPU" - - try: - if use_xpu and not paddle.is_compiled_with_xpu(): - print(err) + if use_npu and not paddle.device.is_compiled_with_npu(): + print(err.format("use_npu", "npu", "npu", "use_npu")) sys.exit(1) except Exception as e: pass @@ -160,18 +146,18 @@ def to_float32(preds): for k in preds: if isinstance(preds[k], dict) or isinstance(preds[k], list): preds[k] = to_float32(preds[k]) - else: - preds[k] = paddle.to_tensor(preds[k], dtype='float32') + elif isinstance(preds[k], paddle.Tensor): + preds[k] = preds[k].astype(paddle.float32) elif isinstance(preds, list): for k in range(len(preds)): if isinstance(preds[k], dict): preds[k] = to_float32(preds[k]) elif isinstance(preds[k], list): preds[k] = to_float32(preds[k]) - else: - preds[k] = paddle.to_tensor(preds[k], dtype='float32') - else: - preds = paddle.to_tensor(preds, dtype='float32') + elif isinstance(preds[k], paddle.Tensor): + preds[k] = preds[k].astype(paddle.float32) + elif isinstance(preds, paddle.Tensor): + preds = preds.astype(paddle.float32) return preds @@ -188,7 +174,9 @@ def train(config, pre_best_model_dict, logger, log_writer=None, - scaler=None): + scaler=None, + amp_level='O2', + amp_custom_black_list=[]): cal_metric_during_train = config['Global'].get('cal_metric_during_train', False) calc_epoch_interval = config['Global'].get('calc_epoch_interval', 1) @@ -228,7 +216,8 @@ def train(config, use_srn = config['Architecture']['algorithm'] == "SRN" extra_input_models = [ - "SRN", "NRTR", "SAR", "SEED", "SVTR", "SPIN", "VisionLAN" + "SRN", "NRTR", "SAR", "SEED", "SVTR", "SPIN", "VisionLAN", + "RobustScanner" ] extra_input = False if config['Architecture']['algorithm'] == 'Distillation': @@ -262,6 +251,7 @@ def train(config, config, 'Train', device, logger, seed=epoch) max_iter = len(train_dataloader) - 1 if platform.system( ) == "Windows" else len(train_dataloader) + for idx, batch in enumerate(train_dataloader): profiler.add_profiler_step(profiler_options) train_reader_cost += time.time() - reader_start @@ -273,10 +263,12 @@ def train(config, model_average = True # use amp if scaler: - with 
paddle.amp.auto_cast(level='O2'): + with paddle.amp.auto_cast( + level=amp_level, + custom_black_list=amp_custom_black_list): if model_type == 'table' or extra_input: preds = model(images, data=batch[1:]) - elif model_type in ["kie", 'vqa']: + elif model_type in ["kie"]: preds = model(batch) else: preds = model(images) @@ -289,7 +281,7 @@ def train(config, else: if model_type == 'table' or extra_input: preds = model(images, data=batch[1:]) - elif model_type in ["kie", 'vqa']: + elif model_type in ["kie", 'sr']: preds = model(batch) else: preds = model(images) @@ -297,11 +289,12 @@ def train(config, avg_loss = loss['loss'] avg_loss.backward() optimizer.step() + optimizer.clear_grad() if cal_metric_during_train and epoch % calc_epoch_interval == 0: # only rec and cls need batch = [item.numpy() for item in batch] - if model_type in ['kie']: + if model_type in ['kie', 'sr']: eval_class(preds, batch) elif model_type in ['table']: post_result = post_process_class(preds, batch) @@ -347,8 +340,8 @@ def train(config, len(train_dataloader) - idx - 1) * eta_meter.avg eta_sec_format = str(datetime.timedelta(seconds=int(eta_sec))) strs = 'epoch: [{}/{}], global_step: {}, {}, avg_reader_cost: ' \ - '{:.5f} s, avg_batch_cost: {:.5f} s, avg_samples: {}, ' \ - 'ips: {:.5f} samples/s, eta: {}'.format( + '{:.5f} s, avg_batch_cost: {:.5f} s, avg_samples: {}, ' \ + 'ips: {:.5f} samples/s, eta: {}'.format( epoch, epoch_num, global_step, logs, train_reader_cost / print_batch_step, train_batch_cost / print_batch_step, @@ -376,7 +369,10 @@ def train(config, post_process_class, eval_class, model_type, - extra_input=extra_input) + extra_input=extra_input, + scaler=scaler, + amp_level=amp_level, + amp_custom_black_list=amp_custom_black_list) cur_metric_str = 'cur metric, {}'.format(', '.join( ['{}: {}'.format(k, v) for k, v in cur_metric.items()])) logger.info(cur_metric_str) @@ -466,7 +462,10 @@ def eval(model, post_process_class, eval_class, model_type=None, - extra_input=False): + extra_input=False, + scaler=None, + amp_level='O2', + amp_custom_black_list=[]): model.eval() with paddle.no_grad(): total_frame = 0.0 @@ -478,17 +477,41 @@ def eval(model, leave=True) max_iter = len(valid_dataloader) - 1 if platform.system( ) == "Windows" else len(valid_dataloader) + sum_images = 0 for idx, batch in enumerate(valid_dataloader): if idx >= max_iter: break images = batch[0] start = time.time() - if model_type == 'table' or extra_input: - preds = model(images, data=batch[1:]) - elif model_type in ["kie", 'vqa']: - preds = model(batch) + + # use amp + if scaler: + with paddle.amp.auto_cast( + level=amp_level, + custom_black_list=amp_custom_black_list): + if model_type == 'table' or extra_input: + preds = model(images, data=batch[1:]) + elif model_type in ["kie"]: + preds = model(batch) + elif model_type in ['sr']: + preds = model(batch) + sr_img = preds["sr_img"] + lr_img = preds["lr_img"] + else: + preds = model(images) + preds = to_float32(preds) else: - preds = model(images) + if model_type == 'table' or extra_input: + preds = model(images, data=batch[1:]) + elif model_type in ["kie"]: + preds = model(batch) + elif model_type in ['sr']: + preds = model(batch) + sr_img = preds["sr_img"] + lr_img = preds["lr_img"] + else: + preds = model(images) + batch_numpy = [] for item in batch: if isinstance(item, paddle.Tensor): @@ -498,17 +521,21 @@ def eval(model, # Obtain usable results from post-processing methods total_time += time.time() - start # Evaluate the results of the current batch - if model_type in ['kie']: + if 
model_type in ['table', 'kie']: + if post_process_class is None: + eval_class(preds, batch_numpy) + else: + post_result = post_process_class(preds, batch_numpy) + eval_class(post_result, batch_numpy) + elif model_type in ['sr']: eval_class(preds, batch_numpy) - elif model_type in ['table', 'vqa']: - post_result = post_process_class(preds, batch_numpy) - eval_class(post_result, batch_numpy) else: post_result = post_process_class(preds, batch_numpy[1]) eval_class(post_result, batch_numpy) pbar.update(1) total_frame += len(images) + sum_images += 1 # Get final metric,eg. acc or hmean metric = eval_class.get_metric() @@ -588,29 +615,27 @@ def preprocess(is_train=False): logger = get_logger(log_file=log_file) # check if set use_gpu=True in paddlepaddle cpu version - use_gpu = config['Global']['use_gpu'] + use_gpu = config['Global'].get('use_gpu', False) use_xpu = config['Global'].get('use_xpu', False) - - # check if set use_xpu=True in paddlepaddle cpu/gpu version - use_xpu = False - if 'use_xpu' in config['Global']: - use_xpu = config['Global']['use_xpu'] - check_xpu(use_xpu) + use_npu = config['Global'].get('use_npu', False) alg = config['Architecture']['algorithm'] assert alg in [ 'EAST', 'DB', 'SAST', 'Rosetta', 'CRNN', 'STARNet', 'RARE', 'SRN', 'CLS', 'PGNet', 'Distillation', 'NRTR', 'TableAttn', 'SAR', 'PSE', 'SEED', 'SDMGR', 'LayoutXLM', 'LayoutLM', 'LayoutLMv2', 'PREN', 'FCE', - 'SVTR', 'ViTSTR', 'ABINet', 'DB++', 'TableMaster', 'SPIN', 'VisionLAN' + 'SVTR', 'ViTSTR', 'ABINet', 'DB++', 'TableMaster', 'SPIN', 'VisionLAN', + 'Gestalt', 'SLANet', 'RobustScanner', 'CT' ] if use_xpu: device = 'xpu:{0}'.format(os.getenv('FLAGS_selected_xpus', 0)) + elif use_npu: + device = 'npu:{0}'.format(os.getenv('FLAGS_selected_npus', 0)) else: device = 'gpu:{}'.format(dist.ParallelEnv() .dev_id) if use_gpu else 'cpu' - check_device(use_gpu, use_xpu) + check_device(use_gpu, use_xpu, use_npu) device = paddle.set_device(device) diff --git a/tools/train.py b/tools/train.py index dc8cae8a63744bb9bd486d9899680dbde9da1697..970a52624af7b2831d88956f857cd4271086bcca 100755 --- a/tools/train.py +++ b/tools/train.py @@ -119,6 +119,12 @@ def main(config, device, logger, vdl_writer): config['Loss']['ignore_index'] = char_num - 1 model = build_model(config['Architecture']) + + use_sync_bn = config["Global"].get("use_sync_bn", False) + if use_sync_bn: + model = paddle.nn.SyncBatchNorm.convert_sync_batchnorm(model) + logger.info('convert_sync_batchnorm') + model = apply_to_static(model, config, logger) # build loss @@ -133,15 +139,15 @@ def main(config, device, logger, vdl_writer): # build metric eval_class = build_metric(config['Metric']) - # load pretrain model - pre_best_model_dict = load_model(config, model, optimizer, - config['Architecture']["model_type"]) + logger.info('train dataloader has {} iters'.format(len(train_dataloader))) if valid_dataloader is not None: logger.info('valid dataloader has {} iters'.format( len(valid_dataloader))) use_amp = config["Global"].get("use_amp", False) + amp_level = config["Global"].get("amp_level", 'O2') + amp_custom_black_list = config['Global'].get('amp_custom_black_list', []) if use_amp: AMP_RELATED_FLAGS_SETTING = { 'FLAGS_cudnn_batchnorm_spatial_persistent': 1, @@ -154,17 +160,26 @@ def main(config, device, logger, vdl_writer): scaler = paddle.amp.GradScaler( init_loss_scaling=scale_loss, use_dynamic_loss_scaling=use_dynamic_loss_scaling) - model, optimizer = paddle.amp.decorate( - models=model, optimizers=optimizer, level='O2', master_weight=True) + if amp_level == "O2": + 
model, optimizer = paddle.amp.decorate( + models=model, + optimizers=optimizer, + level=amp_level, + master_weight=True) else: scaler = None + # load pretrain model + pre_best_model_dict = load_model(config, model, optimizer, + config['Architecture']["model_type"]) + if config['Global']['distributed']: model = paddle.DataParallel(model) # start train program.train(config, train_dataloader, valid_dataloader, device, model, loss_class, optimizer, lr_scheduler, post_process_class, - eval_class, pre_best_model_dict, logger, vdl_writer, scaler) + eval_class, pre_best_model_dict, logger, vdl_writer, scaler, + amp_level, amp_custom_black_list) def test_reader(config, device, logger): diff --git a/train.sh b/train.sh index 4225470cb9f545b874e5f806af22405895e8f6c7..6fa04ea3febe8982016a35d83f119c0a483e3bb8 100644 --- a/train.sh +++ b/train.sh @@ -1,2 +1,2 @@ # recommended paddle.__version__ == 2.0.0 -python3 -m paddle.distributed.launch --log_dir=./debug/ --gpus '0,1,2,3,4,5,6,7' tools/train.py -c configs/rec/rec_mv3_none_bilstm_ctc.yml +python3 -m paddle.distributed.launch --log_dir=./debug/ --gpus '0,1,2,3,4,5,6,7' tools/train.py -c configs/rec/rec_mv3_none_bilstm_ctc.yml \ No newline at end of file
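Note on the KIE output format added in tools/infer_kie.py: write_kie_result() emits one line per input image, where each line is a JSON list of {label, transcription, score, points} dicts sorted by predicted label. Below is a minimal sketch of how such a file could be read back; the file path and helper name are illustrative only and not part of the patch.

# Illustrative reader for the per-image results written by write_kie_result().
# The path below is an example; use the Global.save_res_path value from your config.
import json

def load_kie_results(save_res_path):
    results = []
    with open(save_res_path, "r", encoding="utf-8") as fin:
        for line in fin:
            line = line.strip()
            if not line:
                continue
            # each line is a JSON list of dicts: label, transcription, score, points
            results.append(json.loads(line))
    return results

if __name__ == "__main__":
    for image_preds in load_kie_results("./output/kie/predicts_kie.txt"):
        for pred in image_preds:
            print(pred["label"], pred["score"], pred["transcription"])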
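Note on the AMP changes in tools/program.py and tools/train.py: training and evaluation now read amp_level and amp_custom_black_list from the Global config, decorate the model/optimizer only for level O2, and wrap the forward pass in paddle.amp.auto_cast. The following is a condensed sketch of that pattern, not the literal patched code; cfg, model, optimizer and images are placeholders, and the default values passed to .get() are illustrative.

# Condensed sketch of the AMP setup mirrored from tools/train.py / tools/program.py.
import paddle

def setup_amp(cfg, model, optimizer):
    use_amp = cfg["Global"].get("use_amp", False)
    amp_level = cfg["Global"].get("amp_level", "O2")
    amp_custom_black_list = cfg["Global"].get("amp_custom_black_list", [])
    scaler = None
    if use_amp:
        scaler = paddle.amp.GradScaler(
            init_loss_scaling=cfg["Global"].get("scale_loss", 1024.0),
            use_dynamic_loss_scaling=cfg["Global"].get(
                "use_dynamic_loss_scaling", True))
        if amp_level == "O2":
            # only O2 needs the decorated (master-weight) model/optimizer
            model, optimizer = paddle.amp.decorate(
                models=model,
                optimizers=optimizer,
                level=amp_level,
                master_weight=True)
    return model, optimizer, scaler, amp_level, amp_custom_black_list

def forward_with_amp(model, images, scaler, amp_level, amp_custom_black_list):
    # run the forward pass under auto_cast only when a scaler was created
    if scaler:
        with paddle.amp.auto_cast(
                level=amp_level, custom_black_list=amp_custom_black_list):
            preds = model(images)
    else:
        preds = model(images)
    return preds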