Commit bea99752 authored by andyjpaddle

Merge branch 'dygraph' of https://github.com/PaddlePaddle/PaddleOCR into dygraph

This diff is collapsed.
try:
    from PyQt5.QtGui import *
    from PyQt5.QtCore import *
    from PyQt5.QtWidgets import *
except ImportError:
    # PyQt4 keeps its widget classes in QtGui, so there is no QtWidgets module to import.
    from PyQt4.QtGui import *
    from PyQt4.QtCore import *

from libs.utils import newIcon

import time
import datetime
import json
import cv2
import numpy as np

BB = QDialogButtonBox  # shorthand for the button-box class used below
class DataPartitionDialog(QDialog):
    def __init__(self, parent=None):
        super().__init__()
        # Stored under a separate name so QWidget.parent() is not shadowed.
        self.parent_widget = parent
        self.title = 'DATA PARTITION'

        self.train_ratio = 70
        self.val_ratio = 15
        self.test_ratio = 15

        self.initUI()

    def initUI(self):
        self.setWindowTitle(self.title)
        self.setWindowModality(Qt.ApplicationModal)

        self.flag_accept = True

        if self.parent_widget.lang == 'ch':
            msg = "导出JSON前请保存所有图像的标注且关闭EXCEL!"
        else:
            msg = "Please save all the annotations and close the EXCEL before exporting JSON!"

        info_msg = QLabel(msg, self)
        info_msg.setWordWrap(True)
        info_msg.setStyleSheet("color: red")
        info_msg.setFont(QFont('Arial', 12))

        train_lbl = QLabel('Train split: ', self)
        train_lbl.setFont(QFont('Arial', 15))
        val_lbl = QLabel('Valid split: ', self)
        val_lbl.setFont(QFont('Arial', 15))
        test_lbl = QLabel('Test split: ', self)
        test_lbl.setFont(QFont('Arial', 15))

        self.train_input = QLineEdit(self)
        self.train_input.setFont(QFont('Arial', 15))
        self.val_input = QLineEdit(self)
        self.val_input.setFont(QFont('Arial', 15))
        self.test_input = QLineEdit(self)
        self.test_input.setFont(QFont('Arial', 15))

        self.train_input.setText(str(self.train_ratio))
        self.val_input.setText(str(self.val_ratio))
        self.test_input.setText(str(self.test_ratio))

        # Restrict all three fields to integer percentages in [0, 100].
        validator = QIntValidator(0, 100)
        self.train_input.setValidator(validator)
        self.val_input.setValidator(validator)
        self.test_input.setValidator(validator)

        gridlayout = QGridLayout()
        gridlayout.addWidget(info_msg, 0, 0, 1, 2)
        gridlayout.addWidget(train_lbl, 1, 0)
        gridlayout.addWidget(val_lbl, 2, 0)
        gridlayout.addWidget(test_lbl, 3, 0)
        gridlayout.addWidget(self.train_input, 1, 1)
        gridlayout.addWidget(self.val_input, 2, 1)
        gridlayout.addWidget(self.test_input, 3, 1)

        bb = BB(BB.Ok | BB.Cancel, Qt.Horizontal, self)
        bb.button(BB.Ok).setIcon(newIcon('done'))
        bb.button(BB.Cancel).setIcon(newIcon('undo'))
        bb.accepted.connect(self.validate)
        bb.rejected.connect(self.cancel)
        gridlayout.addWidget(bb, 4, 0, 1, 2)

        self.setLayout(gridlayout)
        self.show()

    def validate(self):
        self.flag_accept = True
        self.accept()

    def cancel(self):
        self.flag_accept = False
        self.reject()

    def getStatus(self):
        return self.flag_accept

    def getDataPartition(self):
        # Ratios are taken at face value; the sum is not checked here.
        self.train_ratio = int(self.train_input.text())
        self.val_ratio = int(self.val_input.text())
        self.test_ratio = int(self.test_input.text())
        return self.train_ratio, self.val_ratio, self.test_ratio

    def closeEvent(self, event):
        # Treat closing the window the same as pressing Cancel.
        self.flag_accept = False
        self.reject()
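
# Example (illustrative, not part of the original file): how a caller might run
# the dialog. It assumes a parent window exposing a `lang` attribute ('ch' or
# 'en'), such as PPOCRLabel's main window.
#
#     dialog = DataPartitionDialog(parent=main_window)
#     if dialog.exec_() and dialog.getStatus():
#         train, val, test = dialog.getDataPartition()
#         # The dialog does not enforce train + val + test == 100;
#         # callers should verify the sum themselves.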
@@ -161,6 +161,77 @@ def get_rotate_crop_image(img, points):
print(e)
def boxPad(box, imgShape, pad: int) -> np.ndarray:
    """
    Pad a quadrilateral box outward by `pad` pixels on each side, clipping
    the result to the image bounds. The four points are expected in
    clockwise order starting from the top-left corner.
    """
    box = np.array(box, dtype=np.int32)
    box[0][0], box[0][1] = box[0][0] - pad, box[0][1] - pad  # top-left
    box[1][0], box[1][1] = box[1][0] + pad, box[1][1] - pad  # top-right
    box[2][0], box[2][1] = box[2][0] + pad, box[2][1] + pad  # bottom-right
    box[3][0], box[3][1] = box[3][0] - pad, box[3][1] + pad  # bottom-left
    h, w, _ = imgShape
    box[:, 0] = np.clip(box[:, 0], 0, w)
    box[:, 1] = np.clip(box[:, 1], 0, h)
    return box
def OBB2HBB(obb) -> np.ndarray:
    """
    Convert an Oriented Bounding Box (4 corner points) to a Horizontal
    Bounding Box in [x_min, y_min, x_max, y_max] form.
    """
    obb = np.asarray(obb)
    hbb = np.zeros(4, dtype=np.int32)
    hbb[0] = min(obb[:, 0])
    hbb[1] = min(obb[:, 1])
    hbb[2] = max(obb[:, 0])
    hbb[3] = max(obb[:, 1])
    return hbb
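
# Example (illustrative): padding a quadrilateral by 5 px on a 100x200 image
# and converting the result to a horizontal box:
#
#     img_shape = (100, 200, 3)                       # h, w, c
#     quad = [[10, 10], [50, 12], [48, 40], [8, 38]]  # clockwise from top-left
#     padded = boxPad(quad, img_shape, pad=5)         # grown by 5 px, clipped to bounds
#     OBB2HBB(padded)                                 # -> array([ 3,  5, 55, 45])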
def expand_list(merged, html_list):
    '''
    Fill blanks according to merged cells: mark every covered cell with None
    and record the colspan/rowspan on the region's top-left cell.
    '''
    sr, er, sc, ec = merged  # start row, end row, start col, end col
    for i in range(sr, er):
        for j in range(sc, ec):
            html_list[i][j] = None
    html_list[sr][sc] = ''
    if ec - sc > 1:
        html_list[sr][sc] += " colspan={}".format(ec - sc)
    if er - sr > 1:
        html_list[sr][sc] += " rowspan={}".format(er - sr)
    return html_list
def convert_token(html_list):
    '''
    Convert the expanded cell grid to PubTabNet-style structure tokens.
    '''
    token_list = ["<tbody>"]
    # final html list:
    for row in html_list:
        token_list.append("<tr>")
        for col in row:
            if col is None:
                # covered by a merged cell, already accounted for
                continue
            elif col == 'td':
                token_list.extend(["<td>", "</td>"])
            else:
                token_list.append("<td")
                # `col` looks like " colspan=2", " rowspan=3" or
                # " colspan=2 rowspan=3", so keep only the first
                # whitespace-separated token after each split.
                if 'colspan' in col:
                    n = col.split('colspan=')[1].split()[0]
                    token_list.append(" colspan=\"{}\"".format(n))
                if 'rowspan' in col:
                    n = col.split('rowspan=')[1].split()[0]
                    token_list.append(" rowspan=\"{}\"".format(n))
                token_list.extend([">", "</td>"])
        token_list.append("</tr>")
    token_list.append("</tbody>")
    return token_list
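
# Example (illustrative): a 2x2 grid whose first row is one merged cell.
#
#     grid = [['td', 'td'], ['td', 'td']]
#     grid = expand_list((0, 1, 0, 2), grid)  # rows [0, 1), cols [0, 2) merged
#     print(''.join(convert_token(grid)))
#     # -> <tbody><tr><td colspan="2"></td></tr><tr><td></td><td></td></tr></tbody>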
def stepsInfo(lang='en'):
if lang == 'ch':
msg = "1. 安装与运行:使用上述命令安装与运行程序。\n" \
......
@@ -84,7 +84,7 @@ mhelp=Help
iconList=Icon List
detectionBoxposition=Detection box position
recognitionResult=Recognition result
creatPolygon=Create PolygonBox
rotateLeft=Left turn 90 degrees
rotateRight=Right turn 90 degrees
drawSquares=Draw Squares
@@ -110,3 +110,6 @@ lockBoxDetail=Lock selected box/Unlock all box
keyListTitle=Key List
keyDialogTip=Enter object label
keyChange=Change Box Key
TableRecognition=Table Recognition
cellreRecognition=Cell Re-Recognition
exportJSON=Export JSON (PubTabNet)
@@ -84,7 +84,7 @@ mhelp=帮助
iconList=缩略图
detectionBoxposition=检测框位置
recognitionResult=识别结果
creatPolygon=多边形标注
drawSquares=正方形标注
rotateLeft=图片左旋转90度
rotateRight=图片右旋转90度
@@ -109,4 +109,7 @@ lockBox=锁定框/解除锁定框
lockBoxDetail=若当前没有框处于锁定状态则锁定选中的框,若存在锁定框则解除所有锁定框的锁定状态
keyListTitle=关键词列表
keyDialogTip=请输入类型名称
keyChange=更改Box关键字类别
TableRecognition=表格识别
cellreRecognition=单元格重识别
exportJSON=导出表格JSON标注
\ No newline at end of file
@@ -11,28 +11,28 @@
<a name="1"></a>
## 1. Introduction

PP-OCRv3 is a further upgrade of PP-OCRv2. The overall framework keeps the same pipeline as PP-OCRv2, with optimizations to the detection and recognition models. The detection module is still optimized on the basis of the DB algorithm, while the recognition module no longer uses CRNN and switches to [SVTR](https://arxiv.org/abs/2205.00159), a text recognition algorithm newly accepted by IJCAI 2022, adapted here for industrial use. The PP-OCRv3 system diagram is shown below (strategies newly added in PP-OCRv3 are in the pink boxes):
<div align="center">
<img src="../ppocrv3_framework.png" width="800">
</div>
In terms of algorithmic improvements, PP-OCRv3 upgrades the detection and recognition models in 9 aspects in total:
- Detection module:
    - LK-PAN: a PAN structure with a large receptive field;
    - DML: a mutual-learning distillation strategy for the teacher model;
    - RSE-FPN: an FPN structure with a residual attention mechanism;
- Recognition module:
    - SVTR_LCNet: a lightweight text recognition network;
    - GTC: an Attention-guided CTC training strategy;
    - TextConAug: a data augmentation strategy that mines textual context information;
    - TextRotNet: a self-supervised pre-trained model;
    - UDML: a unified deep mutual learning strategy;
    - UIM: an unlabeled-data mining scheme.
In terms of results, at comparable speed, accuracy improves substantially across scenarios:

- Chinese scenarios: more than 5% improvement over the PP-OCRv2 Chinese model;
@@ -43,9 +43,7 @@
<a name="2"></a>
## 2. Detection Optimization

The PP-OCRv3 detection model upgrades the [CML](https://arxiv.org/pdf/2109.03144.pdf) (Collaborative Mutual Learning) text detection distillation strategy of PP-OCRv2. As shown in the figure below, the core idea of CML combines ① standard teacher-guided student distillation with ② DML mutual learning among the student networks, so that the students learn from each other while the teacher provides guidance. PP-OCRv3 further optimizes the teacher and student models separately: for the teacher model, LK-PAN, a PAN structure with a large receptive field, is proposed and the DML (Deep Mutual Learning) distillation strategy is introduced; for the student model, RSE-FPN, an FPN structure with a residual attention mechanism, is proposed.
<div align="center">
<img src=".././ppocr_v3/ppocrv3_det_cml.png" width="800">
</div>

@@ -65,27 +63,25 @@
Test environment: Intel Gold 6148 CPU, with MKLDNN acceleration enabled at inference time.
**(1) LK-PAN: a PAN structure with a large receptive field**

LK-PAN (Large Kernel PAN) is a lightweight [PAN](https://arxiv.org/pdf/1803.01534.pdf) structure with a larger receptive field. Its core change is to enlarge the convolution kernels in the path-augmentation part of PAN from `3*3` to `9*9`. Larger kernels increase the receptive field covered by each position of the feature map, making it easier to detect text in large fonts and text with extreme aspect ratios. With LK-PAN, the hmean of the teacher model improves from 83.2% to 85.0%.
<div align="center">
<img src="../ppocr_v3/LKPAN.png" width="1000">
</div>
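
To make the kernel-size change concrete, below is a minimal Paddle sketch of a path-augmentation convolution block with a `9*9` kernel. The class and argument names are illustrative, not taken from the PaddleOCR source.

```python
import paddle.nn as nn

class LargeKernelConvBlock(nn.Layer):
    """Sketch of the LK-PAN idea: swap the usual 3x3 path-augmentation
    convolution for a 9x9 one; padding of kernel_size // 2 keeps the
    spatial size unchanged."""
    def __init__(self, in_ch: int, out_ch: int, kernel_size: int = 9):
        super().__init__()
        self.conv = nn.Conv2D(in_ch, out_ch, kernel_size=kernel_size,
                              padding=kernel_size // 2)
        self.bn = nn.BatchNorm2D(out_ch)
        self.act = nn.ReLU()

    def forward(self, x):
        return self.act(self.bn(self.conv(x)))
```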
**(2) DML: a mutual-learning strategy for the teacher model**

[DML](https://arxiv.org/abs/1706.00384) (Deep Mutual Learning) is a distillation method in which two models of identical structure learn from each other, as shown in the figure below; it effectively improves the accuracy of the text detection model. With DML, the teacher model's hmean rises from 85% to 86%. Updating the teacher in PP-OCRv2's CML to this stronger teacher further raises the student model's hmean from 83.2% to 84.3%.
<div align="center">
<img src="../ppocr_v3/teacher_dml.png" width="800">
</div>
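
For reference, a symmetric mutual-learning loss between two peer models can be sketched as below. The classification-style softmax formulation is an assumption made for clarity; the DB detector actually supervises probability maps.

```python
import paddle
import paddle.nn.functional as F

def dml_loss(logits_a, logits_b):
    """Symmetric KL divergence between two peers' predictions: each model
    is pushed toward the other's output distribution."""
    p_a, p_b = F.softmax(logits_a, axis=-1), F.softmax(logits_b, axis=-1)
    log_a, log_b = F.log_softmax(logits_a, axis=-1), F.log_softmax(logits_b, axis=-1)
    kl_ab = (p_a * (log_a - log_b)).sum(axis=-1).mean()
    kl_ba = (p_b * (log_b - log_a)).sum(axis=-1).mean()
    return 0.5 * (kl_ab + kl_ba)
```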
**(3) RSE-FPN: an FPN structure with a residual attention mechanism**

RSE-FPN (Residual Squeeze-and-Excitation FPN) introduces a residual structure and channel attention by replacing the convolution layers in the FPN with RSEConv layers, further strengthening the representation of the feature maps. Because the FPN channel count in the PP-OCRv2 detection model is very small (only 96), directly substituting SE blocks for the FPN convolutions would suppress the features of some channels and hurt accuracy; the residual connection in RSEConv alleviates this problem and improves detection quality. Updating the FPN of the CML student model in PP-OCRv2 to RSE-FPN further raises the student's hmean from 84.3% to 85.4%. A minimal sketch of an RSEConv-style layer follows; the figure below it shows the full structure.
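
The sketch below uses Paddle layers; the names are assumptions, and only the residual-SE pattern is intended to match the description above.

```python
import paddle.nn as nn

class RSEConv(nn.Layer):
    """Illustrative residual SE convolution: a 3x3 conv followed by channel
    attention, with an identity branch so that attention can only re-weight
    channels, never fully suppress them."""
    def __init__(self, in_ch: int, out_ch: int, reduction: int = 4):
        super().__init__()
        self.conv = nn.Conv2D(in_ch, out_ch, kernel_size=3, padding=1)
        self.se = nn.Sequential(
            nn.AdaptiveAvgPool2D(1),
            nn.Conv2D(out_ch, out_ch // reduction, kernel_size=1),
            nn.ReLU(),
            nn.Conv2D(out_ch // reduction, out_ch, kernel_size=1),
            nn.Sigmoid(),
        )

    def forward(self, x):
        y = self.conv(x)
        # The residual branch keeps suppressed channels alive.
        return y + y * self.se(y)
```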
<div align="center">
<img src=".././ppocr_v3/RSEFPN.png" width="1000">
</div>

@@ -95,15 +91,12 @@
<a name="3"></a>
## 3. Recognition Optimization

The recognition module of PP-OCRv3 is optimized on the basis of the text recognition algorithm [SVTR](https://arxiv.org/abs/2205.00159). SVTR abandons the RNN structure; by introducing Transformer blocks, it mines the contextual information of text-line images more effectively and thus improves recognition. Directly replacing the PP-OCRv2 recognizer with SVTR_Tiny raises accuracy from 74.8% to 80.1% (+5.3%), but inference becomes nearly 11 times slower, taking almost 100 ms per text line on CPU. Therefore, as shown in the figure below, PP-OCRv3 applies the following 6 optimization strategies to accelerate the recognition model.
<div align="center">
<img src="../ppocr_v3/v3_rec_pipeline.png" width=800>
</div>
With the strategies above, the PP-OCRv3 recognition model improves accuracy by a further 4.6% over PP-OCRv2 at comparable speed. The ablation study is as follows:
| ID | Strategy | Model Size | Accuracy | Inference Time (CPU + MKLDNN) |
@@ -118,12 +111,13 @@
| 08 | + UDML | 12M | 78.4% | 7.6ms |
| 09 | + UIM | 12M | 79.4% | 7.6ms |
Note: when measuring speed, the input image size is (3, 32, 320) for experiments 01-03 and (3, 48, 320) for 04-08. In real inference the images are variable-length inputs, so speed varies. Test environment: Intel Gold 6148 CPU, with MKLDNN acceleration enabled at inference time.
**(1) SVTR_LCNet: a lightweight text recognition network**

SVTR_LCNet is a lightweight text recognition network designed for the text recognition task that fuses the Transformer-based [SVTR](https://arxiv.org/abs/2205.00159) network with the lightweight CNN network [PP-LCNet](https://arxiv.org/abs/2109.15099). With this network, inference is 20% faster than the PP-OCRv2 recognizer, though accuracy is slightly lower because no distillation is used. In addition, raising the normalized input height from 32 to 48 makes inference slightly slower but markedly more accurate, reaching 73.98% (+2.08%), close to the PP-OCRv2 recognizer trained with distillation. A hedged sketch of the hybrid idea follows.
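
Below is a minimal Paddle sketch of the CNN-plus-Transformer hybrid idea; the layer layout and names are illustrative and do not reproduce the actual SVTR_LCNet architecture.

```python
import paddle
import paddle.nn as nn

class HybridRecBackbone(nn.Layer):
    """Sketch in the spirit of SVTR_LCNet: a small conv stem (standing in
    for PP-LCNet stages) followed by a global-mixing Transformer encoder
    layer over the width axis."""
    def __init__(self, in_ch: int = 3, dim: int = 64):
        super().__init__()
        self.stem = nn.Sequential(
            nn.Conv2D(in_ch, dim, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),
            nn.Conv2D(dim, dim, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),
        )
        self.pool = nn.AdaptiveAvgPool2D((1, None))  # collapse height, keep width
        self.mixer = nn.TransformerEncoderLayer(d_model=dim, nhead=4,
                                                dim_feedforward=dim * 2)

    def forward(self, x):                  # x: (N, 3, 48, W)
        feat = self.stem(x)                # (N, dim, 12, W/4)
        feat = self.pool(feat).squeeze(2)  # (N, dim, W/4)
        seq = feat.transpose([0, 2, 1])    # (N, W/4, dim)
        return self.mixer(seq)             # contextualized sequence features
```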
The SVTR_Tiny network architecture is shown below:
<div align="center">
<img src="../ppocr_v3/svtr_tiny.png" width=800>
</div>

@@ -159,34 +153,39 @@
Note: when measuring speed, the input image size for experiments 01-05 is (3, 32, 320); PP-OCRv2-baseline denotes the model trained without distillation.
**(2) GTC: an Attention-guided CTC training strategy**

[GTC](https://arxiv.org/pdf/2002.01276.pdf) (Guided Training of CTC) uses an Attention module and its loss to guide the training of the CTC loss, fusing multiple representations of text features; it is an effective strategy for improving text recognition. With it, the Attention module is removed entirely at prediction time, adding no inference cost, and the recognizer's accuracy further improves to 75.8% (+1.82%). The training flow is shown below:
<div align="center">
<img src="../ppocr_v3/GTC.png" width=800>
</div>
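
A hedged sketch of the training-time objective: the attention head contributes only a loss term and is discarded at inference. Argument names, shapes, and the weighting factor are assumptions; `paddle.nn.CTCLoss` expects log-probabilities and lengths in its documented shapes.

```python
import paddle
import paddle.nn as nn

def gtc_loss(ctc_log_probs, labels, input_lengths, label_lengths,
             attn_logits, attn_labels, alpha: float = 1.0):
    """GTC-style joint objective: a CTC branch (kept for inference) plus an
    auxiliary attention branch (dropped after training)."""
    ctc = nn.CTCLoss(blank=0)(ctc_log_probs, labels, input_lengths, label_lengths)
    num_classes = attn_logits.shape[-1]
    ce = nn.CrossEntropyLoss()(attn_logits.reshape([-1, num_classes]),
                               attn_labels.reshape([-1]))
    return ctc + alpha * ce
```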
**(3) TextConAug: a data augmentation strategy that mines textual context information**

TextConAug is a data augmentation strategy for mining textual context information. The idea comes from the paper [ConCLR](https://www.cse.cuhk.edu.hk/~byu/papers/C139-AAAI2022-ConCLR.pdf), in which the authors propose ConAug, concatenating 2 different images within a batch to form new images for self-supervised contrastive learning. PP-OCRv3 applies this method to supervised learning, designing the TextConAug augmentation, which enriches the contextual information of the training data and increases its diversity. With this strategy, the recognizer's accuracy further improves to 76.3% (+0.5%). TextConAug is illustrated below:
<div align="center">
<img src="../ppocr_v3/recconaug.png" width=800>
</div>
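
The batch-level concatenation can be sketched with plain NumPy as below; the pairing strategy and the equal-height assumption are simplifications.

```python
import random
import numpy as np

def text_concat_aug(images, labels):
    """TextConAug-style sketch: horizontally concatenate random pairs of
    text-line images (assumed equal height) and join their label strings,
    enriching per-sample context."""
    idx = list(range(len(images)))
    random.shuffle(idx)
    new_images, new_labels = [], []
    for a, b in zip(idx[0::2], idx[1::2]):
        new_images.append(np.concatenate([images[a], images[b]], axis=1))
        new_labels.append(labels[a] + labels[b])
    return new_images, new_labels
```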
**(4) TextRotNet: a self-supervised pre-trained model**

TextRotNet is a pre-trained model trained on a large amount of unlabeled text-line data in a self-supervised manner, following the paper [STR-Fewer-Labels](https://github.com/ku21fan/STR-Fewer-Labels). It is used to initialize the weights of SVTR_LCNet, helping the recognition model converge to a better optimum. With this strategy, the recognizer's accuracy further improves to 76.9% (+0.6%). The TextRotNet training flow is shown below:
<div align="center">
<img src="../ppocr_v3/SSL.png" width="500">
</div>
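
The rotation pretext task can be sketched as below; the 4-way 90-degree setup is an assumption for illustration (the actual recipe also adapts image sizes and augmentations to text recognition).

```python
import random
import numpy as np

def make_rotation_pretext(image: np.ndarray):
    """Rotate a text-line crop by a random multiple of 90 degrees and return
    (rotated_image, class_id); a backbone trained to predict class_id from
    the image learns transferable features without labels."""
    k = random.randint(0, 3)  # 0, 90, 180 or 270 degrees
    return np.rot90(image, k=k, axes=(0, 1)), k
```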
**(5) UDML: a unified mutual learning strategy**

UDML (Unified Deep Mutual Learning) is a joint mutual-learning strategy already adopted in PP-OCRv2 that is very effective for text recognition. In PP-OCRv3, for the two models built from the SVTR_LCNet and Attention structures, supervision is applied simultaneously to the PP-LCNet feature maps, the outputs of the SVTR module, and the outputs of the Attention module between the two models. With this strategy, the recognizer's accuracy further improves to 78.4% (+1.5%).
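
A minimal sketch of the three-point mutual supervision, reusing the symmetric-KL `dml_loss` sketched in the detection section; applying an L2 term to the feature maps is an assumption.

```python
import paddle.nn.functional as F

def udml_loss(feat_a, feat_b, svtr_a, svtr_b, attn_a, attn_b):
    """UDML-style sketch: mutual supervision between two peer recognizers at
    the PP-LCNet feature maps, the SVTR outputs, and the Attention outputs."""
    return (F.mse_loss(feat_a, feat_b)
            + dml_loss(svtr_a, svtr_b)
            + dml_loss(attn_a, attn_b))
```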
**(6) UIM: an unlabeled-data mining scheme**

UIM (Unlabeled Images Mining) is a very simple unlabeled-data mining scheme. The core idea is to use a large, high-accuracy text recognition model to predict on unlabeled data, obtain pseudo-labels, and keep the samples with high prediction confidence as training data for the small model. Concretely, the PP-OCRv2 detection model and the SVTR_Tiny recognition model were used to detect and recognize Baidu's open-sourced 400k-image [LSVT weakly-annotated dataset](https://ai.baidu.com/broad/introduction?dataset=lsvt), and the text lines with a recognition score above 0.95, about 810k in total, were added to the training data. With this strategy, the recognizer's accuracy further improves to 79.4% (+1%).
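
The mining loop itself is simple; the sketch below assumes a `predict(image) -> (text, score)` interface for the large model and uses the 0.95 score threshold mentioned above.

```python
def mine_unlabeled(images, predict, conf_thresh: float = 0.95):
    """UIM-style sketch: pseudo-label unlabeled text lines with a strong but
    slow recognizer and keep only high-confidence samples for training."""
    mined = []
    for img in images:
        text, score = predict(img)
        if score > conf_thresh:
            mined.append((img, text))
    return mined
```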
<div align="center">
<img src="../ppocr_v3/UIM.png" width="500">
......
@@ -38,8 +38,9 @@
#### PP-OCRv3
Building on PP-OCRv2, PP-OCRv3 upgrades the detection and recognition models in 9 aspects in total:
- The PP-OCRv3 detection model upgrades the CML collaborative mutual learning text detection distillation strategy of PP-OCRv2, further optimizing the teacher and student models separately. For the teacher model, LK-PAN, a PAN structure with a large receptive field, is proposed and the DML distillation strategy is introduced; for the student model, RSE-FPN, an FPN structure with a residual attention mechanism, is proposed.
- The PP-OCRv3 recognition module is optimized on the basis of the text recognition algorithm [SVTR](https://arxiv.org/abs/2205.00159). SVTR abandons the RNN structure; by introducing Transformer blocks, it mines the contextual information of text-line images more effectively and improves recognition. PP-OCRv3 accelerates the model and improves accuracy through 6 strategies: the lightweight text recognition network SVTR_LCNet, Attention-loss-guided CTC training, the context-mining data augmentation strategy TextConAug, the self-supervised pre-trained model TextRotNet, the unified mutual learning strategy UDML, and the unlabeled-data mining scheme UIM.
The PP-OCRv3 pipeline is as follows:
@@ -47,6 +48,9 @@
<img src="../ppocrv3_framework.png" width="800">
</div>
For more details, please refer to the PP-OCRv3 [technical report](./PP-OCRv3_introduction.md).
<a name="2"></a>
## 2. Features
......
@@ -17,6 +17,7 @@ English | [简体中文](../doc_ch/ppocr_introduction.md)
PP-OCR is a self-developed practical ultra-lightweight OCR system, which is slimmed and optimized based on the reimplemented [academic algorithms](algorithm_en.md), considering the balance between **accuracy** and **speed**.
#### PP-OCR
PP-OCR is a two-stage OCR system, in which the text detection algorithm is [DB](algorithm_det_db_en.md), and the text recognition algorithm is [CRNN](algorithm_rec_crnn_en.md). Besides, a [text direction classifier](angle_class_en.md) is added between the detection and recognition modules to deal with text in different directions.
PP-OCR pipeline is as follows:
@@ -28,11 +29,16 @@ PP-OCR pipeline is as follows:
The PP-OCR system is under continuous optimization. So far, PP-OCR, PP-OCRv2, and PP-OCRv3 have been released:
PP-OCR adopts 19 effective strategies from 8 aspects, including backbone network selection and adjustment, prediction head design, data augmentation, learning-rate transformation strategy, regularization parameter selection, pre-trained model use, and automatic model tailoring and quantization, to optimize and slim down the models of each module (as shown in the green boxes above). The final result is an ultra-lightweight Chinese and English OCR model with an overall size of 3.5M, plus a 2.8M English and digit OCR model. For more details, please refer to the PP-OCR technical report (https://arxiv.org/abs/2009.09941).
#### PP-OCRv2

On the basis of PP-OCR, PP-OCRv2 is further optimized in five aspects. The detection model adopts the CML (Collaborative Mutual Learning) knowledge distillation strategy and the CopyPaste data expansion strategy. The recognition model adopts the LCNet lightweight backbone network, the U-DML knowledge distillation strategy, and an enhanced CTC loss function (as shown in the red boxes above), which further improve inference speed and prediction quality. For more details, please refer to the PP-OCRv2 technical report (https://arxiv.org/abs/2109.03144).
#### PP-OCRv3

PP-OCRv3 upgrades the detection and recognition models in 9 aspects based on PP-OCRv2:
- The PP-OCRv3 detector upgrades the CML (Collaborative Mutual Learning) text detection distillation strategy proposed in PP-OCRv2, further optimizing the teacher and student models respectively. For the teacher model, a PAN structure with a large receptive field named LK-PAN is proposed and the DML distillation strategy is adopted; for the student model, an FPN structure with a residual attention mechanism named RSE-FPN is proposed.
- The PP-OCRv3 recognizer is optimized based on the text recognition algorithm [SVTR](https://arxiv.org/abs/2205.00159). SVTR no longer adopts an RNN; by introducing a Transformer structure, it can mine the context information of text-line images more effectively and thus improve text recognition. PP-OCRv3 adopts the lightweight text recognition network SVTR_LCNet, guided training of CTC by an attention loss, the data augmentation strategy TextConAug, a better pre-trained model obtained by self-supervised TextRotNet, UDML (Unified Deep Mutual Learning), and UIM (Unlabeled Images Mining) to accelerate the model and improve its accuracy.
PP-OCRv3 pipeline is as follows:
@@ -40,6 +46,8 @@ PP-OCRv3 pipeline is as follows:
<img src="../ppocrv3_framework.png" width="800">
</div>
For more details, please refer to [PP-OCRv3 technical report](./PP-OCRv3_introduction_en.md).
<a name="2"></a>
## 2. Features
......