提交 fcc70660 编写于 作者: qq_25193841's avatar qq_25193841

Merge remote-tracking branch 'origin/dygraph' into dygraph

......@@ -21,12 +21,13 @@ import os.path
import platform
import subprocess
import sys
import xlrd
from functools import partial
from PyQt5.QtCore import QSize, Qt, QPoint, QByteArray, QTimer, QFileInfo, QPointF, QProcess
from PyQt5.QtGui import QImage, QCursor, QPixmap, QImageReader
from PyQt5.QtWidgets import QMainWindow, QListWidget, QVBoxLayout, QToolButton, QHBoxLayout, QDockWidget, QWidget, \
QSlider, QGraphicsOpacityEffect, QMessageBox, QListView, QScrollArea, QWidgetAction, QApplication, QLabel, \
QSlider, QGraphicsOpacityEffect, QMessageBox, QListView, QScrollArea, QWidgetAction, QApplication, QLabel, QGridLayout, \
QFileDialog, QListWidgetItem, QComboBox, QDialog
__dir__ = os.path.dirname(os.path.abspath(__file__))
......@@ -36,7 +37,7 @@ sys.path.append(os.path.abspath(os.path.join(__dir__, '../..')))
sys.path.append(os.path.abspath(os.path.join(__dir__, '../PaddleOCR')))
sys.path.append("..")
from paddleocr import PaddleOCR
from paddleocr import PaddleOCR, PPStructure
from libs.constants import *
from libs.utils import *
from libs.labelColor import label_colormap
......@@ -100,9 +101,15 @@ class MainWindow(QMainWindow):
use_gpu=gpu,
lang=lang,
show_log=False)
self.table_ocr = PPStructure(use_pdserving=False,
use_gpu=gpu,
lang=lang,
layout=False,
show_log=False)
if os.path.exists('./data/paddle.png'):
result = self.ocr.ocr('./data/paddle.png', cls=True, det=True)
result = self.table_ocr('./data/paddle.png', return_ocr_result_in_table=True)
# For loading all image under a directory
self.mImgList = []
......@@ -196,16 +203,25 @@ class MainWindow(QMainWindow):
self.reRecogButton.setIcon(newIcon('reRec', 30))
self.reRecogButton.setToolButtonStyle(Qt.ToolButtonTextBesideIcon)
self.tableRecButton = QToolButton()
self.tableRecButton.setToolButtonStyle(Qt.ToolButtonTextBesideIcon)
self.newButton = QToolButton()
self.newButton.setToolButtonStyle(Qt.ToolButtonTextBesideIcon)
self.createpolyButton = QToolButton()
self.createpolyButton.setToolButtonStyle(Qt.ToolButtonTextBesideIcon)
self.SaveButton = QToolButton()
self.SaveButton.setToolButtonStyle(Qt.ToolButtonTextBesideIcon)
self.DelButton = QToolButton()
self.DelButton.setToolButtonStyle(Qt.ToolButtonTextBesideIcon)
leftTopToolBox = QHBoxLayout()
leftTopToolBox.addWidget(self.newButton)
leftTopToolBox.addWidget(self.reRecogButton)
leftTopToolBox = QGridLayout()
leftTopToolBox.addWidget(self.newButton, 0, 0, 1, 1)
leftTopToolBox.addWidget(self.createpolyButton, 0, 1, 1, 1)
leftTopToolBox.addWidget(self.reRecogButton, 1, 0, 1, 1)
leftTopToolBox.addWidget(self.tableRecButton, 1, 1, 1, 1)
leftTopToolBoxContainer = QWidget()
leftTopToolBoxContainer.setLayout(leftTopToolBox)
listLayout.addWidget(leftTopToolBoxContainer)
......@@ -446,13 +462,22 @@ class MainWindow(QMainWindow):
'Ctrl+R', 'reRec', getStr('singleRe'), enabled=False)
createpoly = action(getStr('creatPolygon'), self.createPolygon,
'q', 'new', getStr('creatPolygon'), enabled=True)
'q', 'new', getStr('creatPolygon'), enabled=False)
tableRec = action(getStr('TableRecognition'), self.TableRecognition,
'', 'Auto', getStr('TableRecognition'), enabled=False)
cellreRec = action(getStr('cellreRecognition'), self.cellreRecognition,
'', 'reRec', getStr('cellreRecognition'), enabled=False)
saveRec = action(getStr('saveRec'), self.saveRecResult,
'', 'save', getStr('saveRec'), enabled=False)
saveLabel = action(getStr('saveLabel'), self.saveLabelFile, #
'Ctrl+S', 'save', getStr('saveLabel'), enabled=False)
exportJSON = action(getStr('exportJSON'), self.exportJSON,
'', 'save', getStr('exportJSON'), enabled=False)
undoLastPoint = action(getStr("undoLastPoint"), self.canvas.undoLastPoint,
'Ctrl+Z', "undo", getStr("undoLastPoint"), enabled=False)
......@@ -474,10 +499,12 @@ class MainWindow(QMainWindow):
self.editButton.setDefaultAction(edit)
self.newButton.setDefaultAction(create)
self.createpolyButton.setDefaultAction(createpoly)
self.DelButton.setDefaultAction(deleteImg)
self.SaveButton.setDefaultAction(save)
self.AutoRecognition.setDefaultAction(AutoRec)
self.reRecogButton.setDefaultAction(reRec)
self.tableRecButton.setDefaultAction(tableRec)
# self.preButton.setDefaultAction(openPrevImg)
# self.nextButton.setDefaultAction(openNextImg)
......@@ -523,25 +550,25 @@ class MainWindow(QMainWindow):
# Store actions for further handling.
self.actions = struct(save=save, resetAll=resetAll, deleteImg=deleteImg,
lineColor=color1, create=create, delete=delete, edit=edit, copy=copy,
saveRec=saveRec, singleRere=singleRere, AutoRec=AutoRec, reRec=reRec,
lineColor=color1, create=create, createpoly=createpoly, tableRec=tableRec, delete=delete, edit=edit, copy=copy,
saveRec=saveRec, singleRere=singleRere, AutoRec=AutoRec, reRec=reRec, cellreRec=cellreRec,
createMode=createMode, editMode=editMode,
shapeLineColor=shapeLineColor, shapeFillColor=shapeFillColor,
zoom=zoom, zoomIn=zoomIn, zoomOut=zoomOut, zoomOrg=zoomOrg,
fitWindow=fitWindow, fitWidth=fitWidth,
zoomActions=zoomActions, saveLabel=saveLabel, change_cls=change_cls,
undo=undo, undoLastPoint=undoLastPoint, open_dataset_dir=open_dataset_dir,
rotateLeft=rotateLeft, rotateRight=rotateRight, lock=lock,
fileMenuActions=(opendir, open_dataset_dir, saveLabel, resetAll, quit),
rotateLeft=rotateLeft, rotateRight=rotateRight, lock=lock, exportJSON=exportJSON,
fileMenuActions=(opendir, open_dataset_dir, saveLabel, exportJSON, resetAll, quit),
beginner=(), advanced=(),
editMenu=(createpoly, edit, copy, delete, singleRere, None, undo, undoLastPoint,
editMenu=(createpoly, edit, copy, delete, singleRere, cellreRec, None, undo, undoLastPoint,
None, rotateLeft, rotateRight, None, color1, self.drawSquaresOption, lock,
None, change_cls),
beginnerContext=(
create, edit, copy, delete, singleRere, rotateLeft, rotateRight, lock, change_cls),
create, createpoly, edit, copy, delete, singleRere, cellreRec, rotateLeft, rotateRight, lock, change_cls),
advancedContext=(createMode, editMode, edit, copy,
delete, shapeLineColor, shapeFillColor),
onLoadActive=(create, createMode, editMode),
onLoadActive=(create, createpoly, createMode, editMode),
onShapesPresent=(hideAll, showAll))
# menus
......@@ -574,7 +601,7 @@ class MainWindow(QMainWindow):
self.autoSaveOption.triggered.connect(self.autoSaveFunc)
addActions(self.menus.file,
(opendir, open_dataset_dir, None, saveLabel, saveRec, self.autoSaveOption, None, resetAll, deleteImg,
(opendir, open_dataset_dir, None, saveLabel, saveRec, exportJSON, self.autoSaveOption, None, resetAll, deleteImg,
quit))
addActions(self.menus.help, (showKeys, showSteps, showInfo))
......@@ -585,7 +612,7 @@ class MainWindow(QMainWindow):
zoomIn, zoomOut, zoomOrg, None,
fitWindow, fitWidth))
addActions(self.menus.autolabel, (AutoRec, reRec, alcm, None, help))
addActions(self.menus.autolabel, (AutoRec, reRec, cellreRec, alcm, None, help))
self.menus.file.aboutToShow.connect(self.updateFileMenu)
......@@ -695,6 +722,7 @@ class MainWindow(QMainWindow):
self.dirty = False
self.actions.save.setEnabled(False)
self.actions.create.setEnabled(True)
self.actions.createpoly.setEnabled(True)
def toggleActions(self, value=True):
"""Enable/Disable widgets which depend on an opened image."""
......@@ -780,6 +808,7 @@ class MainWindow(QMainWindow):
assert self.beginner()
self.canvas.setEditing(False)
self.actions.create.setEnabled(False)
self.actions.createpoly.setEnabled(False)
self.canvas.fourpoint = False
def createPolygon(self):
......@@ -787,10 +816,10 @@ class MainWindow(QMainWindow):
self.canvas.setEditing(False)
self.canvas.fourpoint = True
self.actions.create.setEnabled(False)
self.actions.createpoly.setEnabled(False)
self.actions.undoLastPoint.setEnabled(True)
def rotateImg(self, filename, k, _value):
self.actions.rotateRight.setEnabled(_value)
pix = cv2.imread(filename)
pix = np.rot90(pix, k)
......@@ -831,6 +860,7 @@ class MainWindow(QMainWindow):
self.canvas.setEditing(True)
self.canvas.restoreCursor()
self.actions.create.setEnabled(True)
self.actions.createpoly.setEnabled(True)
def toggleDrawMode(self, edit=True):
self.canvas.setEditing(edit)
......@@ -987,11 +1017,21 @@ class MainWindow(QMainWindow):
if len(self.canvas.selectedShapes) == 1 and self.keyList.count() > 0:
selected_key_item_row = self.keyList.findItemsByLabel(self.canvas.selectedShapes[0].key_cls,
get_row=True)
if isinstance(selected_key_item_row, list) and len(selected_key_item_row) == 0:
key_text = self.canvas.selectedShapes[0].key_cls
item = self.keyList.createItemFromLabel(key_text)
self.keyList.addItem(item)
rgb = self._get_rgb_by_label(key_text, self.kie_mode)
self.keyList.setItemLabel(item, key_text, rgb)
selected_key_item_row = self.keyList.findItemsByLabel(self.canvas.selectedShapes[0].key_cls,
get_row=True)
self.keyList.setCurrentRow(selected_key_item_row)
self._noSelectionSlot = False
n_selected = len(selected_shapes)
self.actions.singleRere.setEnabled(n_selected)
self.actions.cellreRec.setEnabled(n_selected)
self.actions.delete.setEnabled(n_selected)
self.actions.copy.setEnabled(n_selected)
self.actions.edit.setEnabled(n_selected == 1)
......@@ -1216,6 +1256,7 @@ class MainWindow(QMainWindow):
if self.beginner(): # Switch to edit mode.
self.canvas.setEditing(True)
self.actions.create.setEnabled(True)
self.actions.createpoly.setEnabled(True)
self.actions.undoLastPoint.setEnabled(False)
self.actions.undo.setEnabled(True)
else:
......@@ -1654,8 +1695,10 @@ class MainWindow(QMainWindow):
self.haveAutoReced = False
self.AutoRecognition.setEnabled(True)
self.reRecogButton.setEnabled(True)
self.tableRecButton.setEnabled(True)
self.actions.AutoRec.setEnabled(True)
self.actions.reRec.setEnabled(True)
self.actions.tableRec.setEnabled(True)
self.actions.open_dataset_dir.setEnabled(True)
self.actions.rotateLeft.setEnabled(True)
self.actions.rotateRight.setEnabled(True)
......@@ -1755,6 +1798,7 @@ class MainWindow(QMainWindow):
self.openNextImg()
self.actions.saveRec.setEnabled(True)
self.actions.saveLabel.setEnabled(True)
self.actions.exportJSON.setEnabled(True)
elif mode == 'Auto':
if annotationFilePath and self.saveLabels(annotationFilePath, mode=mode):
......@@ -2081,6 +2125,280 @@ class MainWindow(QMainWindow):
self.singleLabel(shape)
self.setDirty()
def TableRecognition(self):
'''
Table Recegnition
'''
from paddleocr.ppstructure.table.predict_table import to_excel
import time
start = time.time()
img = cv2.imread(self.filePath)
res = self.table_ocr(img, return_ocr_result_in_table=True)
TableRec_excel_dir = self.lastOpenDir + '/tableRec_excel_output/'
os.makedirs(TableRec_excel_dir, exist_ok=True)
filename, _ = os.path.splitext(os.path.basename(self.filePath))
excel_path = TableRec_excel_dir + '{}.xlsx'.format(filename)
if res is None:
msg = 'Can not recognise the table in ' + self.filePath + '. Please change manually'
QMessageBox.information(self, "Information", msg)
to_excel('', excel_path) # create an empty excel
return
# save res
# ONLY SUPPORT ONE TABLE in one image
hasTable = False
for region in res:
if region['type'] == 'Table':
if region['res']['boxes'] is None:
msg = 'Can not recognise the detection box in ' + self.filePath + '. Please change manually'
QMessageBox.information(self, "Information", msg)
to_excel('', excel_path) # create an empty excel
return
hasTable = True
# save table ocr result on PPOCRLabel
# clear all old annotaions before saving result
self.itemsToShapes.clear()
self.shapesToItems.clear()
self.itemsToShapesbox.clear() # ADD
self.shapesToItemsbox.clear()
self.labelList.clear()
self.BoxList.clear()
self.result_dic = []
self.result_dic_locked = []
shapes = []
result_len = len(region['res']['boxes'])
for i in range(result_len):
bbox = np.array(region['res']['boxes'][i])
rec_text = region['res']['rec_res'][i][0]
# polys to rectangles
x1, y1 = np.min(bbox[:, 0]), np.min(bbox[:, 1])
x2, y2 = np.max(bbox[:, 0]), np.max(bbox[:, 1])
rext_bbox = [[x1, y1], [x2, y1], [x2, y2], [x1, y2]]
# save bbox to shape
shape = Shape(label=rec_text, line_color=DEFAULT_LINE_COLOR, key_cls=None)
for point in rext_bbox:
x, y = point
# Ensure the labels are within the bounds of the image.
# If not, fix them.
x, y, snapped = self.canvas.snapPointToCanvas(x, y)
shape.addPoint(QPointF(x, y))
shape.difficult = False
# shape.locked = False
shape.close()
self.addLabel(shape)
shapes.append(shape)
self.setDirty()
self.canvas.loadShapes(shapes)
# save HTML result to excel
try:
to_excel(region['res']['html'], excel_path)
except:
print('Can not save excel file, maybe Permission denied (.xlsx is being occupied)')
break
if not hasTable:
msg = 'Can not recognise the table in ' + self.filePath + '. Please change manually'
QMessageBox.information(self, "Information", msg)
to_excel('', excel_path) # create an empty excel
return
# automatically open excel annotation file
if platform.system() == 'Windows':
try:
import win32com.client
except:
print("CANNOT OPEN .xlsx. It could be one of the following reasons: " \
"Only support Windows | No python win32com")
try:
xl = win32com.client.Dispatch("Excel.Application")
xl.Visible = True
xl.Workbooks.Open(excel_path)
# excelEx = "You need to show the excel executable at this point"
# subprocess.Popen([excelEx, excel_path])
# os.startfile(excel_path)
except:
print("CANNOT OPEN .xlsx. It could be the following reasons: " \
".xlsx is not existed")
else:
os.system('open ' + os.path.normpath(excel_path))
print('time cost: ', time.time() - start)
def cellreRecognition(self):
'''
re-recognise text in a cell
'''
img = cv2.imread(self.filePath)
for shape in self.canvas.selectedShapes:
box = [[int(p.x()), int(p.y())] for p in shape.points]
if len(box) > 4:
box = self.gen_quad_from_poly(np.array(box))
assert len(box) == 4
# pad around bbox for better text recognition accuracy
_box = boxPad(box, img.shape, 6)
img_crop = get_rotate_crop_image(img, np.array(_box, np.float32))
if img_crop is None:
msg = 'Can not recognise the detection box in ' + self.filePath + '. Please change manually'
QMessageBox.information(self, "Information", msg)
return
# merge the text result in the cell
texts = ''
probs = 0. # the probability of the cell is avgerage prob of every text box in the cell
bboxes = self.ocr.ocr(img_crop, det=True, rec=False, cls=False)
if len(bboxes) > 0:
bboxes.reverse() # top row text at first
for _bbox in bboxes:
patch = get_rotate_crop_image(img_crop, np.array(_bbox, np.float32))
rec_res = self.ocr.ocr(patch, det=False, rec=True, cls=False)
text = rec_res[0][0]
if text != '':
texts += text + (' ' if text[0].isalpha() else '') # add space between english word
probs += rec_res[0][1]
probs = probs / len(bboxes)
result = [(texts.strip(), probs)]
if result[0][0] != '':
result.insert(0, box)
print('result in reRec is ', result)
if result[1][0] == shape.label:
print('label no change')
else:
shape.label = result[1][0]
else:
print('Can not recognise the box')
if self.noLabelText == shape.label:
print('label no change')
else:
shape.label = self.noLabelText
self.singleLabel(shape)
self.setDirty()
def exportJSON(self):
'''
export PPLabel and CSV to JSON (PubTabNet)
'''
import pandas as pd
from libs.dataPartitionDialog import DataPartitionDialog
# data partition user input
partitionDialog = DataPartitionDialog(parent=self)
partitionDialog.exec()
if partitionDialog.getStatus() == False:
return
# automatically save annotations
self.saveFilestate()
self.savePPlabel(mode='auto')
# load box annotations
labeldict = {}
if not os.path.exists(self.PPlabelpath):
msg = 'ERROR, Can not find Label.txt'
QMessageBox.information(self, "Information", msg)
return
else:
with open(self.PPlabelpath, 'r', encoding='utf-8') as f:
data = f.readlines()
for each in data:
file, label = each.split('\t')
if label:
label = label.replace('false', 'False')
label = label.replace('true', 'True')
labeldict[file] = eval(label)
else:
labeldict[file] = []
# if len(labeldict) != len(csv_paths):
# msg = 'ERROR, box label and excel label are not in the same number\n' + \
# 'box label: ' + str(len(labeldict)) + '\n' + \
# 'excel label: ' + str(len(csv_paths)) + '\n' + \
# 'Please check the label.txt and tableRec_excel_output\n'
# QMessageBox.information(self, "Information", msg)
# return
train_split, val_split, test_split = partitionDialog.getDataPartition()
# check validate
if train_split + val_split + test_split > 100:
msg = "The sum of training, validation and testing data should be less than 100%"
QMessageBox.information(self, "Information", msg)
return
print(train_split, val_split, test_split)
train_split, val_split, test_split = float(train_split) / 100., float(val_split) / 100., float(test_split) / 100.
train_id = int(len(labeldict) * train_split)
val_id = int(len(labeldict) * (train_split + val_split))
print('Data partition: train:', train_id,
'validation:', val_id - train_id,
'test:', len(labeldict) - val_id)
TableRec_excel_dir = os.path.join(self.lastOpenDir, 'tableRec_excel_output')
json_results = []
imgid = 0
for image_path in labeldict.keys():
# load csv annotations
filename, _ = os.path.splitext(os.path.basename(image_path))
csv_path = os.path.join(TableRec_excel_dir, filename + '.xlsx')
if not os.path.exists(csv_path):
msg = 'ERROR, Can not find ' + csv_path
QMessageBox.information(self, "Information", msg)
return
# read xlsx file, convert to HTML
# xd = pd.ExcelFile(csv_path)
# df = xd.parse()
# structure = df.to_html(index = False)
excel = xlrd.open_workbook(csv_path)
sheet0 = excel.sheet_by_index(0) # only sheet 0
merged_cells = sheet0.merged_cells # (0,1,1,3) start row, end row, start col, end col
html_list = [['td'] * sheet0.ncols for i in range(sheet0.nrows)]
for merged in merged_cells:
html_list = expand_list(merged, html_list)
token_list = convert_token(html_list)
# load box annotations
cells = []
for anno in labeldict[image_path]:
tokens = list(anno['transcription'])
obb = anno['points']
hbb = OBB2HBB(np.array(obb)).tolist()
cells.append({'tokens': tokens, 'bbox': hbb})
# data split
if imgid < train_id:
split = 'train'
elif imgid < val_id:
split = 'val'
else:
split = 'test'
# save dict
html = {'structure': {'tokens': token_list}, 'cell': cells}
json_results.append({'filename': os.path.basename(image_path), 'split': split, 'imgid': imgid, 'html': html})
imgid += 1
# save json
with open("{}/annotation.json".format(self.lastOpenDir), "w", encoding='utf-8') as fid:
fid.write(json.dumps(json_results, ensure_ascii=False))
msg = 'JSON sucessfully saved in {}/annotation.json'.format(self.lastOpenDir)
QMessageBox.information(self, "Information", msg)
def autolcm(self):
vbox = QVBoxLayout()
hbox = QHBoxLayout()
......@@ -2120,6 +2438,12 @@ class MainWindow(QMainWindow):
del self.ocr
self.ocr = PaddleOCR(use_pdserving=False, use_angle_cls=True, det=True, cls=True, use_gpu=False,
lang=lg_idx[self.comboBox.currentText()])
del self.table_ocr
self.table_ocr = PPStructure(use_pdserving=False,
use_gpu=False,
lang=lg_idx[self.comboBox.currentText()],
layout=False,
show_log=False)
self.dialog.close()
def cancel(self):
......@@ -2138,6 +2462,7 @@ class MainWindow(QMainWindow):
self.fileStatedict[file] = 1
self.actions.saveLabel.setEnabled(True)
self.actions.saveRec.setEnabled(True)
self.actions.exportJSON.setEnabled(True)
def saveFilestate(self):
with open(self.fileStatepath, 'w', encoding='utf-8') as f:
......
try:
from PyQt5.QtGui import *
from PyQt5.QtCore import *
from PyQt5.QtWidgets import *
except ImportError:
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from libs.utils import newIcon
import time
import datetime
import json
import cv2
import numpy as np
BB = QDialogButtonBox
class DataPartitionDialog(QDialog):
def __init__(self, parent=None):
super().__init__()
self.parnet = parent
self.title = 'DATA PARTITION'
self.train_ratio = 70
self.val_ratio = 15
self.test_ratio = 15
self.initUI()
def initUI(self):
self.setWindowTitle(self.title)
self.setWindowModality(Qt.ApplicationModal)
self.flag_accept = True
if self.parnet.lang == 'ch':
msg = "导出JSON前请保存所有图像的标注且关闭EXCEL!"
else:
msg = "Please save all the annotations and close the EXCEL before exporting JSON!"
info_msg = QLabel(msg, self)
info_msg.setWordWrap(True)
info_msg.setStyleSheet("color: red")
info_msg.setFont(QFont('Arial', 12))
train_lbl = QLabel('Train split: ', self)
train_lbl.setFont(QFont('Arial', 15))
val_lbl = QLabel('Valid split: ', self)
val_lbl.setFont(QFont('Arial', 15))
test_lbl = QLabel('Test split: ', self)
test_lbl.setFont(QFont('Arial', 15))
self.train_input = QLineEdit(self)
self.train_input.setFont(QFont('Arial', 15))
self.val_input = QLineEdit(self)
self.val_input.setFont(QFont('Arial', 15))
self.test_input = QLineEdit(self)
self.test_input.setFont(QFont('Arial', 15))
self.train_input.setText(str(self.train_ratio))
self.val_input.setText(str(self.val_ratio))
self.test_input.setText(str(self.test_ratio))
validator = QIntValidator(0, 100)
self.train_input.setValidator(validator)
self.val_input.setValidator(validator)
self.test_input.setValidator(validator)
gridlayout = QGridLayout()
gridlayout.addWidget(info_msg, 0, 0, 1, 2)
gridlayout.addWidget(train_lbl, 1, 0)
gridlayout.addWidget(val_lbl, 2, 0)
gridlayout.addWidget(test_lbl, 3, 0)
gridlayout.addWidget(self.train_input, 1, 1)
gridlayout.addWidget(self.val_input, 2, 1)
gridlayout.addWidget(self.test_input, 3, 1)
bb = BB(BB.Ok | BB.Cancel, Qt.Horizontal, self)
bb.button(BB.Ok).setIcon(newIcon('done'))
bb.button(BB.Cancel).setIcon(newIcon('undo'))
bb.accepted.connect(self.validate)
bb.rejected.connect(self.cancel)
gridlayout.addWidget(bb, 4, 0, 1, 2)
self.setLayout(gridlayout)
self.show()
def validate(self):
self.flag_accept = True
self.accept()
def cancel(self):
self.flag_accept = False
self.reject()
def getStatus(self):
return self.flag_accept
def getDataPartition(self):
self.train_ratio = int(self.train_input.text())
self.val_ratio = int(self.val_input.text())
self.test_ratio = int(self.test_input.text())
return self.train_ratio, self.val_ratio, self.test_ratio
def closeEvent(self, event):
self.flag_accept = False
self.reject()
......@@ -161,6 +161,77 @@ def get_rotate_crop_image(img, points):
print(e)
def boxPad(box, imgShape, pad : int) -> np.array:
"""
Pad a box with [pad] pixels on each side.
"""
box = np.array(box, dtype=np.int32)
box[0][0], box[0][1] = box[0][0] - pad, box[0][1] - pad
box[1][0], box[1][1] = box[1][0] + pad, box[1][1] - pad
box[2][0], box[2][1] = box[2][0] + pad, box[2][1] + pad
box[3][0], box[3][1] = box[3][0] - pad, box[3][1] + pad
h, w, _ = imgShape
box[:,0] = np.clip(box[:,0], 0, w)
box[:,1] = np.clip(box[:,1], 0, h)
return box
def OBB2HBB(obb) -> np.array:
"""
Convert Oriented Bounding Box to Horizontal Bounding Box.
"""
hbb = np.zeros(4, dtype=np.int32)
hbb[0] = min(obb[:, 0])
hbb[1] = min(obb[:, 1])
hbb[2] = max(obb[:, 0])
hbb[3] = max(obb[:, 1])
return hbb
def expand_list(merged, html_list):
'''
Fill blanks according to merged cells
'''
sr, er, sc, ec = merged
for i in range(sr, er):
for j in range(sc, ec):
html_list[i][j] = None
html_list[sr][sc] = ''
if ec - sc > 1:
html_list[sr][sc] += " colspan={}".format(ec - sc)
if er - sr > 1:
html_list[sr][sc] += " rowspan={}".format(er - sr)
return html_list
def convert_token(html_list):
'''
Convert raw html to label format
'''
token_list = ["<tbody>"]
# final html list:
for row in html_list:
token_list.append("<tr>")
for col in row:
if col == None:
continue
elif col == 'td':
token_list.extend(["<td>", "</td>"])
else:
token_list.append("<td")
if 'colspan' in col:
_, n = col.split('colspan=')
token_list.append(" colspan=\"{}\"".format(n))
if 'rowspan' in col:
_, n = col.split('rowspan=')
token_list.append(" rowspan=\"{}\"".format(n))
token_list.extend([">", "</td>"])
token_list.append("</tr>")
token_list.append("</tbody>")
return token_list
def stepsInfo(lang='en'):
if lang == 'ch':
msg = "1. 安装与运行:使用上述命令安装与运行程序。\n" \
......
......@@ -84,7 +84,7 @@ mhelp=Help
iconList=Icon List
detectionBoxposition=Detection box position
recognitionResult=Recognition result
creatPolygon=Create Quadrilateral
creatPolygon=Create PolygonBox
rotateLeft=Left turn 90 degrees
rotateRight=Right turn 90 degrees
drawSquares=Draw Squares
......@@ -110,3 +110,6 @@ lockBoxDetail=Lock selected box/Unlock all box
keyListTitle=Key List
keyDialogTip=Enter object label
keyChange=Change Box Key
TableRecognition=Table Recognition
cellreRecognition=Cell Re-Recognition
exportJSON=export JSON(PubTabNet)
......@@ -84,7 +84,7 @@ mhelp=帮助
iconList=缩略图
detectionBoxposition=检测框位置
recognitionResult=识别结果
creatPolygon=四点标注
creatPolygon=多边形标注
drawSquares=正方形标注
rotateLeft=图片左旋转90度
rotateRight=图片右旋转90度
......@@ -109,4 +109,7 @@ lockBox=锁定框/解除锁定框
lockBoxDetail=若当前没有框处于锁定状态则锁定选中的框,若存在锁定框则解除所有锁定框的锁定状态
keyListTitle=关键词列表
keyDialogTip=请输入类型名称
keyChange=更改Box关键字类别
\ No newline at end of file
keyChange=更改Box关键字类别
TableRecognition=表格识别
cellreRecognition=单元格重识别
exportJSON=导出表格JSON标注
\ No newline at end of file
......@@ -19,12 +19,9 @@ PaddleOCR aims to create multilingual, awesome, leading, and practical OCR tools
**Recent updates**
- 2021.12.21 OCR open source online course starts. The lesson starts at 8:30 every night and lasts for ten days. Free registration: https://aistudio.baidu.com/aistudio/course/introduce/25207
- 2021.12.21 release PaddleOCR v2.4, release 1 text detection algorithm (PSENet), 3 text recognition algorithms (NRTR、SEED、SAR), 1 key information extraction algorithm (SDMGR, [tutorial](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.4/ppstructure/docs/kie.md)) and 3 DocVQA algorithms (LayoutLM, LayoutLMv2, LayoutXLM, [tutorial](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.4/ppstructure/vqa)).
- PaddleOCR R&D team would like to share the key points of PP-OCRv2, at 20:15 pm on September 8th, [Course Address](https://aistudio.baidu.com/aistudio/education/group/info/6758).
- 2021.9.7 release PaddleOCR v2.3, [PP-OCRv2](#PP-OCRv2) is proposed. The inference speed of PP-OCRv2 is 220% higher than that of PP-OCR server in CPU device. The F-score of PP-OCRv2 is 7% higher than that of PP-OCR mobile.
- 2021.8.3 released PaddleOCR v2.2, add a new structured documents analysis toolkit, i.e., [PP-Structure](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.2/ppstructure/README.md), support layout analysis and table recognition (One-key to export chart images to Excel files).
- 2021.4.8 release end-to-end text recognition algorithm [PGNet](https://www.aaai.org/AAAI21Papers/AAAI-2885.WangP.pdf) which is published in AAAI 2021. Find tutorial [here](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/doc/doc_en/pgnet_en.md);release multi language recognition [models](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/doc/doc_en/multi_languages_en.md), support more than 80 languages recognition; especically, the performance of [English recognition model](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/doc/doc_en/models_list_en.md#English) is Optimized.
- 2021.12.21 release PaddleOCR v2.4, release 1 text detection algorithm (PSENet), 3 text recognition algorithms (NRTR、SEED、SAR), 1 key information extraction algorithm (SDMGR, [tutorial](./ppstructure/docs/kie_en.md)) and 3 DocVQA algorithms (LayoutLM, LayoutLMv2, LayoutXLM, [tutorial](./ppstructure/vqa)).
- 2021.9.7 release PaddleOCR v2.3, [PP-OCRv2](./doc/doc_en/ppocr_introduction_en.md#pp-ocrv2) is proposed. The inference speed of PP-OCRv2 is 220% higher than that of PP-OCR server in CPU device. The F-score of PP-OCRv2 is 7% higher than that of PP-OCR mobile.
- 2021.8.3 released PaddleOCR v2.2, add a new structured documents analysis toolkit, i.e., [PP-Structure](./ppstructure/README.md), support layout analysis and table recognition (One-key to export chart images to Excel files).
- [more](./doc/doc_en/update_en.md)
......@@ -81,7 +78,6 @@ PaddleOCR support a variety of cutting-edge algorithms related to OCR, and devel
## Tutorials
- [Environment Preparation](./doc/doc_en/environment_en.md)
- [Quick Start](./doc/doc_en/quickstart_en.md)
- [PP-OCR 🔥](./doc/doc_en/ppocr_introduction_en.md)
- [Quick Start](./doc/doc_en/quickstart_en.md)
- [Model Zoo](./doc/doc_en/models_en.md)
......
......@@ -27,10 +27,9 @@ PaddleOCR旨在打造一套丰富、领先、且实用的OCR工具库,助力
## 近期更新
- 2021.12.21《动手学OCR · 十讲》课程开讲,12月21日起每晚八点半线上授课![免费报名地址](https://aistudio.baidu.com/aistudio/course/introduce/25207)
- 2021.12.21 发布PaddleOCR v2.4。OCR算法新增1种文本检测算法(PSENet),3种文本识别算法(NRTR、SEED、SAR);文档结构化算法新增1种关键信息提取算法(SDMGR,[文档](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.4/ppstructure/docs/kie.md)),3种DocVQA算法(LayoutLM、LayoutLMv2,LayoutXLM,[文档](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.4/ppstructure/vqa))。
- 2021.9.7 发布PaddleOCR v2.3与[PP-OCRv2](#PP-OCRv2),CPU推理速度相比于PP-OCR server提升220%;效果相比于PP-OCR mobile 提升7%。
- 2021.8.3 发布PaddleOCR v2.2,新增文档结构分析[PP-Structure](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.2/ppstructure/README_ch.md)工具包,支持版面分析与表格识别(含Excel导出)。
- 2021.12.21 发布PaddleOCR v2.4。OCR算法新增1种文本检测算法(PSENet),3种文本识别算法(NRTR、SEED、SAR);文档结构化算法新增1种关键信息提取算法(SDMGR,[文档](./ppstructure/docs/kie.md)),3种DocVQA算法(LayoutLM、LayoutLMv2,LayoutXLM,[文档](./ppstructure/vqa))。
- 2021.9.7 发布PaddleOCR v2.3与[PP-OCRv2](./doc/doc_ch/ppocr_introduction.md#pp-ocrv2),CPU推理速度相比于PP-OCR server提升220%;效果相比于PP-OCR mobile 提升7%。
- 2021.8.3 发布PaddleOCR v2.2,新增文档结构分析[PP-Structure](./ppstructure/README_ch.md)工具包,支持版面分析与表格识别(含Excel导出)。
> [更多](./doc/doc_ch/update.md)
......@@ -83,7 +82,6 @@ PaddleOCR旨在打造一套丰富、领先、且实用的OCR工具库,助力
## 文档教程
- [运行环境准备](./doc/doc_ch/environment.md)
- [快速开始(中英文/多语言/文档分析)](./doc/doc_ch/quickstart.md)
- [PP-OCR文本检测识别🔥](./doc/doc_ch/ppocr_introduction.md)
- [快速开始](./doc/doc_ch/quickstart.md)
- [模型库](./doc/doc_ch/models_list.md)
......
......@@ -129,7 +129,7 @@ Loss:
key: head_out
multi_head: True
- DistillationSARLoss:
weight: 0.5
weight: 1.0
model_name_list: ["Student", "Teacher"]
key: head_out
multi_head: True
......
......@@ -166,6 +166,10 @@ void CRNNRecognizer::LoadModel(const std::string &model_dir) {
config.SetCpuMathLibraryNumThreads(this->cpu_math_library_num_threads_);
}
// get pass_builder object
auto pass_builder = config.pass_builder();
// delete "matmul_transpose_reshape_fuse_pass"
pass_builder->DeletePass("matmul_transpose_reshape_fuse_pass");
config.SwitchUseFeedFetchOps(false);
// true for multiple input
config.SwitchSpecifyInputNames(true);
......
......@@ -36,8 +36,8 @@ op:
#det模型路径
model_config: ./ppocr_det_v3_serving
#Fetch结果列表,以client_config中fetch_var的alias_name为准
fetch_list: ["sigmoid_0.tmp_0"]
#Fetch结果列表,以client_config中fetch_var的alias_name为准,不设置默认取全部输出变量
#fetch_list: ["sigmoid_0.tmp_0"]
#计算硬件ID,当devices为""或不写时为CPU预测;当devices为"0", "0,1,2"时为GPU预测,表示使用的GPU卡
devices: "0"
......@@ -62,8 +62,8 @@ op:
#rec模型路径
model_config: ./ppocr_rec_v3_serving
#Fetch结果列表,以client_config中fetch_var的alias_name为准
fetch_list: ["softmax_5.tmp_0"]
#Fetch结果列表,以client_config中fetch_var的alias_name为准, 不设置默认取全部输出变量
#fetch_list:
#计算硬件ID,当devices为""或不写时为CPU预测;当devices为"0", "0,1,2"时为GPU预测,表示使用的GPU卡
devices: "0"
......
......@@ -393,7 +393,7 @@ class OCRReader(object):
return norm_img_batch[0]
def postprocess(self, outputs, with_score=False):
preds = outputs["softmax_5.tmp_0"]
preds = list(outputs.values())[0]
try:
preds = preds.numpy()
except:
......@@ -404,8 +404,11 @@ class OCRReader(object):
preds_idx, preds_prob, is_remove_duplicate=True)
return text
from argparse import ArgumentParser,RawDescriptionHelpFormatter
from argparse import ArgumentParser, RawDescriptionHelpFormatter
import yaml
class ArgsParser(ArgumentParser):
def __init__(self):
super(ArgsParser, self).__init__(
......@@ -441,16 +444,16 @@ class ArgsParser(ArgumentParser):
s = s.strip()
k, v = s.split('=')
v = self._parse_helper(v)
print(k,v, type(v))
print(k, v, type(v))
cur = config
parent = cur
for kk in k.split("."):
if kk not in cur:
cur[kk] = {}
parent = cur
cur = cur[kk]
cur[kk] = {}
parent = cur
cur = cur[kk]
else:
parent = cur
cur = cur[kk]
parent = cur
cur = cur[kk]
parent[k.split(".")[-1]] = v
return config
\ No newline at end of file
return config
......@@ -56,7 +56,7 @@ class DetOp(Op):
return {"x": det_img[np.newaxis, :].copy()}, False, None, ""
def postprocess(self, input_dicts, fetch_dict, data_id, log_id):
det_out = fetch_dict["sigmoid_0.tmp_0"]
det_out = list(fetch_dict.values())[0]
ratio_list = [
float(self.new_h) / self.ori_h, float(self.new_w) / self.ori_w
]
......
......@@ -55,7 +55,7 @@ class DetOp(Op):
return {"x": det_img[np.newaxis, :].copy()}, False, None, ""
def postprocess(self, input_dicts, fetch_dict, data_id, log_id):
det_out = fetch_dict["sigmoid_0.tmp_0"]
det_out = list(fetch_dict.values())[0]
ratio_list = [
float(self.new_h) / self.ori_h, float(self.new_w) / self.ori_w
]
......
......@@ -8,123 +8,215 @@
- [4. 端到端评估](#4)
<a name="1"></a>
## 1. 简介
PP-OCRv3在PP-OCRv2的基础上进一步升级。检测模型仍然基于DB算法,优化策略采用了带残差注意力机制的FPN结构RSEFPN、增大感受野的PAN结构LKPAN、基于DML训练的更优的教师模型;识别模型将base模型从CRNN替换成了IJCAI 2022论文[SVTR](),并采用SVTR轻量化、带指导训练CTC、数据增广策略RecConAug、自监督训练的更好的预训练模型、无标签数据的使用进行模型加速和效果提升。更多细节请参考PP-OCRv3[技术报告](./PP-OCRv3_introduction.md)。
PP-OCRv3系统pipeline如下:
PP-OCRv3在PP-OCRv2的基础上进一步升级。整体的框架图保持了与PP-OCRv2相同的pipeline,针对检测模型和识别模型进行了优化。其中,检测模块仍基于DB算法优化,而识别模块不再采用CRNN,换成了IJCAI 2022最新收录的文本识别算法[SVTR](https://arxiv.org/abs/2205.00159),并对其进行产业适配。PP-OCRv3系统框图如下所示(粉色框中为PP-OCRv3新增策略):
<div align="center">
<img src="../ppocrv3_framework.png" width="800">
</div>
<a name="2"></a>
## 2. 检测优化
PP-OCRv3采用PP-OCRv2的[CML](https://arxiv.org/pdf/2109.03144.pdf)蒸馏策略,在蒸馏的student模型、teacher模型精度提升,CML蒸馏策略上分别做了优化。
从算法改进思路上看,分别针对检测和识别模型,进行了共9个方面的改进:
- 检测模块:
- LK-PAN:大感受野的PAN结构;
- DML:教师模型互学习策略;
- RSE-FPN:残差注意力机制的FPN结构;
- 在蒸馏student模型精度提升方面,提出了基于残差结构的通道注意力模块RSEFPN(Residual Squeeze-and-Excitation FPN),用于提升student模型精度和召回。
RSEFPN的网络结构如下图所示,RSEFPN在PP-OCRv2的FPN基础上,将FPN中的卷积层更换为了通道注意力结构的RSEConv层。
- 识别模块:
- SVTR_LCNet:轻量级文本识别网络;
- GTC:Attention指导CTC训练策略;
- TextConAug:挖掘文字上下文信息的数据增广策略;
- TextRotNet:自监督的预训练模型;
- UDML:联合互学习策略;
- UIM:无标注数据挖掘方案。
从效果上看,速度可比情况下,多种场景精度均有大幅提升:
- 中文场景,相对于PP-OCRv2中文模型提升超5%;
- 英文数字场景,相比于PP-OCRv2英文模型提升11%;
- 多语言场景,优化80+语种识别效果,平均准确率提升超5%。
<a name="2"></a>
## 2. 检测优化
PP-OCRv3检测模型是对PP-OCRv2中的[CML](https://arxiv.org/pdf/2109.03144.pdf)(Collaborative Mutual Learning) 协同互学习文本检测蒸馏策略进行了升级。如下图所示,CML的核心思想结合了①传统的Teacher指导Student的标准蒸馏与 ②Students网络之间的DML互学习,可以让Students网络互学习的同时,Teacher网络予以指导。PP-OCRv3分别针对教师模型和学生模型进行进一步效果优化。其中,在对教师模型优化时,提出了大感受野的PAN结构LK-PAN和引入了DML(Deep Mutual Learning)蒸馏策略;在对学生模型优化时,提出了残差注意力机制的FPN结构RSE-FPN。
<div align="center">
<img src=".././ppocr_v3/RSEFPN.png" width="800">
<img src=".././ppocr_v3/ppocrv3_det_cml.png" width="800">
</div>
RSEFPN将PP-OCR检测模型的精度hmean从81.3%提升到84.5%。模型大小从3M变为3.6M。
消融实验如下:
*注:PP-OCRv2的FPN通道数仅为96和24,如果直接用SE模块代替FPN的卷积会导致精度下降,RSEConv引入残差结构可以防止训练中包含重要特征的通道被抑制。*
|序号|策略|模型大小|hmean|速度(cpu + mkldnn)|
|-|-|-|-|-|
|baseline teacher|PP-OCR server|49M|83.2%|171ms|
|teacher1|DB-R50-LK-PAN|124M|85.0%|396ms|
|teacher2|DB-R50-LK-PAN-DML|124M|86.0%|396ms|
|baseline student|PP-OCRv2|3M|83.2%|117ms|
|student0|DB-MV3-RSE-FPN|3.6M|84.5%|124ms|
|student1|DB-MV3-CML(teacher2)|3M|84.3%|117ms|
|student2|DB-MV3-RSE-FPN-CML(teacher2)|3.6M|85.4%|124ms|
- 在蒸馏的teacher模型精度提升方面,提出了LKPAN结构替换PP-OCRv2的FPN结构,并且使用ResNet50作为Backbone,更大的模型带来更多的精度提升。另外,对teacher模型使用[DML](https://arxiv.org/abs/1706.00384)蒸馏策略进一步提升teacher模型的精度。最终teacher的模型指标相比ppocr_server_v2.0从83.2%提升到了86.0%
测试环境: Intel Gold 6148 CPU,预测时开启MKLDNN加速
*注:[PP-OCRv2的FPN结构](https://github.com/PaddlePaddle/PaddleOCR/blob/77acb3bfe51c8a46c684527f73cd218cefedb4a3/ppocr/modeling/necks/db_fpn.py#L107)对DB算法FPN结构做了轻量级设计*
**(1)LK-PAN:大感受野的PAN结构**
LKPAN的网络结构如下图所示:
LK-PAN (Large Kernel PAN) 是一个具有更大感受野的轻量级[PAN](https://arxiv.org/pdf/1803.01534.pdf)结构,核心是将PAN结构的path augmentation中卷积核从`3*3`改为`9*9`。通过增大卷积核,提升特征图每个位置覆盖的感受野,更容易检测大字体的文字以及极端长宽比的文字。使用LK-PAN结构,可以将教师模型的hmean从83.2%提升到85.0%。
<div align="center">
<img src="../ppocr_v3/LKPAN.png" width="800">
<img src="../ppocr_v3/LKPAN.png" width="1000">
</div>
LKPAN(Large Kernel PAN)是一个具有更大感受野的轻量级[PAN](https://arxiv.org/pdf/1803.01534.pdf)结构。在LKPAN的path augmentation中,使用kernel size为`9*9`的卷积;更大的kernel size意味着更大的感受野,更容易检测大字体的文字以及极端长宽比的文字。LKPAN将PP-OCR检测模型的精度hmean从81.3%提升到84.9%。
**(2)DML:教师模型互学习策略**
*注:LKPAN相比RSEFPN有更多的精度提升,但是考虑到模型大小和预测速度等因素,在student模型中使用RSEFPN。*
[DML](https://arxiv.org/abs/1706.00384) (Deep Mutual Learning)互学习蒸馏方法,如下图所示,通过两个结构相同的模型互相学习,可以有效提升文本检测模型的精度。教师模型采用DML策略,hmean从85%提升到86%。将PP-OCRv2中CML的教师模型更新为上述更高精度的教师模型,学生模型的hmean可以进一步从83.2%提升到84.3%。
采用上述策略,PP-OCRv3相比PP-OCRv2,hmean指标从83.3%提升到85.4%;预测速度从平均117ms/image变为124ms/image。
<div align="center">
<img src="../ppocr_v3/teacher_dml.png" width="800">
</div>
3. PP-OCRv3检测模型消融实验
**(3)RSE-FPN:残差注意力机制的FPN结构**
|序号|策略|模型大小|hmean|Intel Gold 6148CPU+mkldnn预测耗时|
|-|-|-|-|-|
|0|PP-OCR|3M|81.3%|117ms|
|1|PP-OCRV2|3M|83.3%|117ms|
|2|0 + RESFPN|3.6M|84.5%|124ms|
|3|0 + LKPAN|4.6M|84.9%|156ms|
|4|ppocr_server_v2.0 |124M|83.2%||171ms|
|5|teacher + DML + LKPAN|124M|86.0%|396ms|
|6|0 + 2 + 5 + CML|3.6M|85.4%|124ms|
RSE-FPN(Residual Squeeze-and-Excitation FPN)如下图所示,引入残差结构和通道注意力结构,将FPN中的卷积层更换为通道注意力结构的RSEConv层,进一步提升特征图的表征能力。考虑到PP-OCRv2的检测模型中FPN通道数非常小,仅为96,如果直接用SEblock代替FPN中卷积会导致某些通道的特征被抑制,精度会下降。RSEConv引入残差结构会缓解上述问题,提升文本检测效果。进一步将PP-OCRv2中CML的学生模型的FPN结构更新为RSE-FPN,学生模型的hmean可以进一步从84.3%提升到85.4%。
<div align="center">
<img src=".././ppocr_v3/RSEFPN.png" width="1000">
</div>
<a name="3"></a>
## 3. 识别优化
[SVTR](https://arxiv.org/abs/2205.00159) 证明了强大的单视觉模型(无需序列模型)即可高效准确完成文本识别任务,在中英文数据上均有优秀的表现。经过实验验证,SVTR_Tiny在自建的 [中文数据集上](https://arxiv.org/abs/2109.03144) ,识别精度可以提升10.7%,网络结构如下所示:
PP-OCRv3的识别模块是基于文本识别算法[SVTR](https://arxiv.org/abs/2205.00159)优化。SVTR不再采用RNN结构,通过引入Transformers结构更加有效地挖掘文本行图像的上下文信息,从而提升文本识别能力。直接将PP-OCRv2的识别模型,替换成SVTR_Tiny,识别准确率从74.8%提升到80.1%(+5.3%),但是预测速度慢了将近11倍,CPU上预测一条文本行,将近100ms。因此,如下图所示,PP-OCRv3采用如下6个优化策略进行识别模型加速。
<img src="../ppocr_v3/svtr_tiny.jpg" width=800>
由于 MKLDNN 加速库支持的模型结构有限,SVTR 在CPU+MKLDNN上相比PP-OCRv2慢了10倍。
PP-OCRv3 期望在提升模型精度的同时,不带来额外的推理耗时。通过分析发现,SVTR_Tiny结构的主要耗时模块为Mixing Block,因此我们对 SVTR_Tiny 的结构进行了一系列优化(详细速度数据请参考下方消融实验表格):
<div align="center">
<img src="../ppocr_v3/v3_rec_pipeline.png" width=800>
</div>
1. 将SVTR网络前半部分替换为PP-LCNet的前三个stage,保留4个 Global Mixing Block ,精度为76%,加速69%,网络结构如下所示:
<img src="../ppocr_v3/svtr_g4.png" width=800>
2. 将4个 Global Attenntion Block 减小到2个,精度为72.9%,加速69%,网络结构如下所示:
<img src="../ppocr_v3/svtr_g2.png" width=800>
3. 实验发现 Global Attention 的预测速度与输入其特征的shape有关,因此后移Global Mixing Block的位置到池化层之后,精度下降为71.9%,速度超越 CNN-base 的PP-OCRv2 22%,网络结构如下所示:
<img src="../ppocr_v3/ppocr_v3.png" width=800>
基于上述策略,PP-OCRv3识别模型相比PP-OCRv2,在速度可比的情况下,精度进一步提升4.6%。 具体消融实验如下所示:
为了提升模型精度同时不引入额外推理成本,PP-OCRv3参考GTC策略,使用Attention监督CTC训练,预测时完全去除Attention模块,在推理阶段不增加任何耗时, 精度提升3.8%,训练流程如下所示:
<img src="../ppocr_v3/GTC.png" width=800>
| ID | 策略 | 模型大小 | 精度 | 预测耗时(CPU + MKLDNN)|
|-----|-----|--------|----| --- |
| 01 | PP-OCRv2 | 8M | 74.8% | 8.54ms |
| 02 | SVTR_Tiny | 21M | 80.1% | 97ms |
| 03 | SVTR_LCNet(h32) | 12M | 71.9% | 6.6ms |
| 04 | SVTR_LCNet(h48) | 12M | 73.98% | 7.6ms |
| 05 | + GTC | 12M | 75.8% | 7.6ms |
| 06 | + TextConAug | 12M | 76.3% | 7.6ms |
| 07 | + TextRotNet | 12M | 76.9% | 7.6ms |
| 08 | + UDML | 12M | 78.4% | 7.6ms |
| 09 | + UIM | 12M | 79.4% | 7.6ms |
在训练策略方面,PP-OCRv3参考 [SSL](https://github.com/ku21fan/STR-Fewer-Labels) 设计了文本方向任务,训练了适用于文本识别的预训练模型,加速模型收敛过程,精度提升了0.6%; 使用UDML蒸馏策略,进一步提升精度1.5%,训练流程所示:
注: 测试速度时,实验01-03输入图片尺寸均为(3,32,320),04-08输入图片尺寸均为(3,48,320)。在实际预测时,图像为变长输入,速度会有所变化。测试环境: Intel Gold 6148 CPU,预测时开启MKLDNN加速。
<img src="../ppocr_v3/SSL.png" width="300"> <img src="../ppocr_v3/UDML.png" width="500">
**(1)SVTR_LCNet:轻量级文本识别网络**
SVTR_LCNet是针对文本识别任务,将基于Transformer的[SVTR](https://arxiv.org/abs/2205.00159)网络和轻量级CNN网络[PP-LCNet](https://arxiv.org/abs/2109.15099) 融合的一种轻量级文本识别网络。使用该网络,预测速度优于PP-OCRv2的识别模型20%,但是由于没有采用蒸馏策略,该识别模型效果略差。此外,进一步将输入图片规范化高度从32提升到48,预测速度稍微变慢,但是模型效果大幅提升,识别准确率达到73.98%(+2.08%),接近PP-OCRv2采用蒸馏策略的识别模型效果。
数据增强方面
SVTR_Tiny 网络结构如下所示
1. 基于 [ConCLR](https://www.cse.cuhk.edu.hk/~byu/papers/C139-AAAI2022-ConCLR.pdf) 中的ConAug方法,设计了 RecConAug 数据增强方法,增强数据多样性,精度提升0.5%,增强可视化效果如下所示:
<img src="../ppocr_v3/recconaug.png" width=800>
<div align="center">
<img src="../ppocr_v3/svtr_tiny.png" width=800>
</div>
2. 使用训练好的 SVTR_large 预测 120W 的 lsvt 无标注数据,取出其中得分大于0.95的数据,共得到81W识别数据加入到PP-OCRv3的训练数据中,精度提升1%。
总体来讲PP-OCRv3识别从网络结构、训练策略、数据增强三个方向做了进一步优化:
由于 MKLDNN 加速库支持的模型结构有限,SVTR 在 CPU+MKLDNN 上相比 PP-OCRv2 慢了10倍。PP-OCRv3 期望在提升模型精度的同时,不带来额外的推理耗时。通过分析发现,SVTR_Tiny 结构的主要耗时模块为 Mixing Block,因此我们对 SVTR_Tiny 的结构进行了一系列优化(详细速度数据请参考下方消融实验表格):
- 网络结构上:考虑[SVTR](https://arxiv.org/abs/2205.00159) 在中英文效果上的优越性,采用SVTR_Tiny作为base,选取Global Mixing Block和卷积组合提取特征,并将Global Mixing Block位置后移进行加速; 参考 [GTC](https://arxiv.org/pdf/2002.01276.pdf) 策略,使用注意力机制模块指导CTC训练,定位和识别字符,提升不规则文本的识别精度。
- 训练策略上:参考 [SSL](https://github.com/ku21fan/STR-Fewer-Labels) 设计了方向分类前序任务,获取更优预训练模型,加速模型收敛过程,提升精度; 使用UDML蒸馏策略、监督attention、ctc两个分支得到更优模型。
- 数据增强上:基于 [ConCLR](https://www.cse.cuhk.edu.hk/~byu/papers/C139-AAAI2022-ConCLR.pdf) 中的ConAug方法,改进得到 RecConAug 数据增广方法,支持随机结合任意多张图片,提升训练数据的上下文信息丰富度,增强模型鲁棒性;使用 SVTR_large 预测无标签数据,向训练集中补充81w高质量真实数据。
基于上述策略,PP-OCRv3识别模型相比PP-OCRv2,在速度可比的情况下,精度进一步提升4.5%。 具体消融实验如下所示:
1. 将 SVTR 网络前半部分替换为 PP-LCNet 的前三个stage,保留4个 Global Mixing Block ,精度为76%,加速69%,网络结构如下所示:
<div align="center">
<img src="../ppocr_v3/svtr_g4.png" width=800>
</div>
2. 将4个 Global Mixing Block 减小到2个,精度为72.9%,加速69%,网络结构如下所示:
<div align="center">
<img src="../ppocr_v3/svtr_g2.png" width=800>
</div>
3. 实验发现 Global Mixing Block 的预测速度与输入其特征的shape有关,因此后移 Global Mixing Block 的位置到池化层之后,精度下降为71.9%,速度超越基于CNN结构的PP-OCRv2-baseline 22%,网络结构如下所示:
<div align="center">
<img src="../ppocr_v3/LCNet_SVTR.png" width=800>
</div>
实验细节
具体消融实验如下所示
| id | 策略 | 模型大小 | 精度 | 速度(cpu + mkldnn)|
| ID | 策略 | 模型大小 | 精度 | 速度(CPU + MKLDNN)|
|-----|-----|--------|----| --- |
| 01 | PP-OCRv2 | 8M | 69.3% | 8.54ms |
| 01 | PP-OCRv2-baseline | 8M | 69.3% | 8.54ms |
| 02 | SVTR_Tiny | 21M | 80.1% | 97ms |
| 03 | LCNet_SVTR_G4 | 9.2M | 76% | 30ms |
| 04 | LCNet_SVTR_G2 | 13M | 72.98% | 9.37ms |
| 05 | PP-OCRv3 | 12M | 71.9% | 6.6ms |
| 06 | + large input_shape | 12M | 73.98% | 7.6ms |
| 06 | + GTC | 12M | 75.8% | 7.6ms |
| 07 | + RecConAug | 12M | 76.3% | 7.6ms |
| 08 | + SSL pretrain | 12M | 76.9% | 7.6ms |
| 09 | + UDML | 12M | 78.4% | 7.6ms |
| 10 | + unlabeled data | 12M | 79.4% | 7.6ms |
注: 测试速度时,实验01-05输入图片尺寸均为(3,32,320),06-10输入图片尺寸均为(3,48,320)
| 03 | SVTR_LCNet(G4) | 9.2M | 76% | 30ms |
| 04 | SVTR_LCNet(G2) | 13M | 72.98% | 9.37ms |
| 05 | SVTR_LCNet(h32) | 12M | 71.9% | 6.6ms |
| 06 | SVTR_LCNet(h48) | 12M | 73.98% | 7.6ms |
注: 测试速度时,01-05输入图片尺寸均为(3,32,320); PP-OCRv2-baseline 代表没有借助蒸馏方法训练得到的模型
**(2)GTC:Attention指导CTC训练策略**
[GTC](https://arxiv.org/pdf/2002.01276.pdf)(Guided Training of CTC),利用Attention模块以及损失,指导CTC损失训练,融合多种文本特征的表达,是一种有效的提升文本识别的策略。使用该策略,预测时完全去除 Attention 模块,在推理阶段不增加任何耗时,识别模型的准确率进一步提升到75.8%(+1.82%)。训练流程如下所示:
<div align="center">
<img src="../ppocr_v3/GTC.png" width=800>
</div>
**(3)TextConAug:挖掘文字上下文信息的数据增广策略**
TextConAug是一种挖掘文字上下文信息的数据增广策略,主要思想来源于论文[ConCLR](https://www.cse.cuhk.edu.hk/~byu/papers/C139-AAAI2022-ConCLR.pdf),作者提出ConAug数据增广,在一个batch内对2张不同的图像进行联结,组成新的图像并进行自监督对比学习。PP-OCRv3将此方法应用到有监督的学习任务中,设计了TextConAug数据增强方法,可以丰富训练数据上下文信息,提升训练数据多样性。使用该策略,识别模型的准确率进一步提升到76.3%(+0.5%)。TextConAug示意图如下所示:
<div align="center">
<img src="../ppocr_v3/recconaug.png" width=800>
</div>
**(4)TextRotNet:自监督的预训练模型**
TextRotNet是使用大量无标注的文本行数据,通过自监督方式训练的预训练模型,参考于论文[STR-Fewer-Labels](https://github.com/ku21fan/STR-Fewer-Labels)。该模型可以初始化SVTR_LCNet的初始权重,从而帮助文本识别模型收敛到更佳位置。使用该策略,识别模型的准确率进一步提升到76.9%(+0.6%)。TextRotNet训练流程如下图所示:
<div align="center">
<img src="../ppocr_v3/SSL.png" width="500">
</div>
**(5)UDML:联合互学习策略**
UDML(Unified-Deep Mutual Learning)联合互学习是PP-OCRv2中就采用的对于文本识别非常有效的提升模型效果的策略。在PP-OCRv3中,针对两个不同的SVTR_LCNet和Attention结构,对他们之间的PP-LCNet的特征图、SVTR模块的输出和Attention模块的输出同时进行监督训练。使用该策略,识别模型的准确率进一步提升到78.4%(+1.5%)。
**(6)UIM:无标注数据挖掘方案**
UIM(Unlabeled Images Mining)是一种非常简单的无标注数据挖掘方案。核心思想是利用高精度的文本识别大模型对无标注数据进行预测,获取伪标签,并且选择预测置信度高的样本作为训练数据,用于训练小模型。使用该策略,识别模型的准确率进一步提升到79.4%(+1%)。
<div align="center">
<img src="../ppocr_v3/UIM.png" width="500">
</div>
<a name="4"></a>
## 4. 端到端评估
经过以上优化,最终PP-OCRv3在速度可比情况下,中文场景端到端Hmean指标相比于PP-OCRv2提升5%,效果大幅提升。具体指标如下表所示:
| Model | Hmean | Model Size (M) | Time Cost (CPU, ms) | Time Cost (T4 GPU, ms) |
|-----|-----|--------|----| --- |
| PP-OCR mobile | 50.3% | 8.1 | 356 | 116 |
| PP-OCR server | 57.0% | 155.1 | 1056 | 200 |
| PP-OCRv2 | 57.6% | 11.6 | 330 | 111 |
| PP-OCRv3 | 62.9% | 15.6 | 331 | 86.64 |
测试环境:CPU型号为Intel Gold 6148,CPU预测时开启MKLDNN加速。
除了更新中文模型,本次升级也同步优化了英文数字模型,端到端效果提升11%,如下表所示:
| Model | Recall | Precision | Hmean |
|-----|-----|--------|----|
| PP-OCR_en | 38.99% | 45.91% | 42.17% |
| PP-OCRv3_en | 50.95% | 55.53% | 53.14% |
同时,也对已支持的80余种语言识别模型进行了升级更新,在有评估集的四种语系识别准确率平均提升5%以上,如下表所示:
| Model | 拉丁语系 | 阿拉伯语系 | 日语 | 韩语 |
|-----|-----|--------|----| --- |
| PP-OCR_mul | 69.6% | 40.5% | 38.5% | 55.4% |
| PP-OCRv3_mul | 75.2%| 45.37% | 45.8% | 60.1% |
......@@ -18,13 +18,13 @@
- [3. 文本方向分类模型](#3-文本方向分类模型)
- [4. Paddle-Lite 模型](#4-paddle-lite-模型)
PaddleOCR提供的可下载模型包括`推理模型``训练模型``预训练模型``slim模型`,模型区别说明如下:
PaddleOCR提供的可下载模型包括`推理模型``训练模型``预训练模型``nb模型`,模型区别说明如下:
|模型类型|模型格式|简介|
|--- | --- | --- |
|推理模型|inference.pdmodel、inference.pdiparams|用于预测引擎推理,[详情](./inference.md)|
|训练模型、预训练模型|\*.pdparams、\*.pdopt、\*.states |训练过程中保存的模型的参数、优化器状态和训练中间信息,多用于模型指标评估和恢复训练|
|slim模型|\*.nb|经过飞桨模型压缩工具PaddleSlim压缩后的模型,适用于移动端/IoT端等端侧部署场景(需使用飞桨Paddle Lite部署)。|
|nb模型|\*.nb|经过飞桨Paddle-Lite工具优化后的模型,适用于移动端/IoT端等端侧部署场景(需使用飞桨Paddle Lite部署)。|
各个模型的关系如下面的示意图所示。
......@@ -41,7 +41,7 @@ PaddleOCR提供的可下载模型包括`推理模型`、`训练模型`、`预训
|模型名称|模型简介|配置文件|推理模型大小|下载地址|
| --- | --- | --- | --- | --- |
|ch_PP-OCRv3_det_slim|【最新】slim量化+蒸馏版超轻量模型,支持中英文、多语种文本检测|[ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml)| 1.1M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_infer.tar) / [训练模型(coming soon)](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_distill_train.tar) / [slim模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_infer.nb)|
|ch_PP-OCRv3_det_slim|【最新】slim量化+蒸馏版超轻量模型,支持中英文、多语种文本检测|[ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml)| 1.1M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_distill_train.tar) / [nb模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_infer.nb)|
|ch_PP-OCRv3_det| 【最新】原始超轻量模型,支持中英文、多语种文本检测 |[ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml)| 3.8M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar)|
|ch_PP-OCRv2_det_slim| slim量化+蒸馏版超轻量模型,支持中英文、多语种文本检测|[ch_PP-OCRv2_det_cml.yml](../../configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml)| 3M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_slim_quant_infer.tar)|
|ch_PP-OCRv2_det| 原始超轻量模型,支持中英文、多语种文本检测|[ch_PP-OCRv2_det_cml.yml](../../configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml)|3M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_distill_train.tar)|
......@@ -55,8 +55,8 @@ PaddleOCR提供的可下载模型包括`推理模型`、`训练模型`、`预训
|模型名称|模型简介|配置文件|推理模型大小|下载地址|
| --- | --- | --- | --- | --- |
|en_PP-OCRv3_det_slim |【最新】slim量化版超轻量模型,支持英文、数字检测 | [ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml) | 1.1M |[推理模型(coming soon)](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_slim_infer.tar) / [训练模型(coming soon)](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_slim_distill_train.tar) / [slim模型(coming soon)](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_slim_infer.nb) |
|ch_PP-OCRv3_det |【最新】原始超轻量模型,支持英文、数字检测|[ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml)| 3.8M | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_distill_train.tar) |
|en_PP-OCRv3_det_slim |【最新】slim量化版超轻量模型,支持英文、数字检测 | [ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml) | 1.1M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_slim_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_slim_distill_train.tar) / [nb模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_slim_infer.nb) |
|en_PP-OCRv3_det |【最新】原始超轻量模型,支持英文、数字检测|[ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml)| 3.8M | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_distill_train.tar) |
* 注:英文检测模型与中文检测模型结构完全相同,只有训练数据不同,在此仅提供相同的配置文件。
......@@ -66,7 +66,7 @@ PaddleOCR提供的可下载模型包括`推理模型`、`训练模型`、`预训
|模型名称|模型简介|配置文件|推理模型大小|下载地址|
| --- | --- | --- | --- | --- |
| ml_PP-OCRv3_det_slim |【最新】slim量化版超轻量模型,支持多语言检测 | [ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml) | 1.1M |[推理模型(coming soon)](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/Multilingual_PP-OCRv3_det_slim_infer.tar) / [训练模型(coming soon)](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/Multilingual_PP-OCRv3_det_slim_distill_train.tar) / [slim模型(coming soon)](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/Multilingual_PP-OCRv3_det_slim_infer.nb) |
| ml_PP-OCRv3_det_slim |【最新】slim量化版超轻量模型,支持多语言检测 | [ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml) | 1.1M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/Multilingual_PP-OCRv3_det_slim_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/Multilingual_PP-OCRv3_det_slim_distill_train.tar) / [nb模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/Multilingual_PP-OCRv3_det_slim_infer.nb) |
| ml_PP-OCRv3_det |【最新】原始超轻量模型,支持多语言检测 | [ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml)| 3.8M | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/Multilingual_PP-OCRv3_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/Multilingual_PP-OCRv3_det_distill_train.tar) |
* 注:多语言检测模型与中文检测模型结构完全相同,只有训练数据不同,在此仅提供相同的配置文件。
......@@ -81,7 +81,7 @@ PaddleOCR提供的可下载模型包括`推理模型`、`训练模型`、`预训
|模型名称|模型简介|配置文件|推理模型大小|下载地址|
| --- | --- | --- | --- | --- |
|ch_PP-OCRv3_rec_slim |【最新】slim量化版超轻量模型,支持中英文、数字识别|[ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml)| 4.9M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_train.tar) / [slim模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.nb) |
|ch_PP-OCRv3_rec_slim |【最新】slim量化版超轻量模型,支持中英文、数字识别|[ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml)| 4.9M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_train.tar) / [nb模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.nb) |
|ch_PP-OCRv3_rec|【最新】原始超轻量模型,支持中英文、数字识别|[ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml)| 12.4M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_train.tar) |
|ch_PP-OCRv2_rec_slim| slim量化版超轻量模型,支持中英文、数字识别|[ch_PP-OCRv2_rec.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec.yml)| 9M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_train.tar) |
|ch_PP-OCRv2_rec| 原始超轻量模型,支持中英文、数字识别|[ch_PP-OCRv2_rec_distillation.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec_distillation.yml)|8.5M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_train.tar) |
......@@ -96,8 +96,8 @@ PaddleOCR提供的可下载模型包括`推理模型`、`训练模型`、`预训
|模型名称|模型简介|配置文件|推理模型大小|下载地址|
| --- | --- | --- | --- | --- |
|en_PP-OCRv3_rec_slim |【最新】slim量化版超轻量模型,支持英文、数字识别 | [en_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml)| - |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_slim_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_slim_train.tar) / [slim模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_slim_infer.nb) |
|ch_PP-OCRv3_rec |【最新】原始超轻量模型,支持英文、数字识别|[en_PP-OCRv3_rec.yml](../../configs/rec/en_PP-OCRv3/en_PP-OCRv3_rec.yml)| 9.6M | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_train.tar) |
|en_PP-OCRv3_rec_slim |【最新】slim量化版超轻量模型,支持英文、数字识别 | [en_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml)| 3.2M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_slim_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_slim_train.tar) / [nb模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_slim_infer.nb) |
|en_PP-OCRv3_rec |【最新】原始超轻量模型,支持英文、数字识别|[en_PP-OCRv3_rec.yml](../../configs/rec/en_PP-OCRv3/en_PP-OCRv3_rec.yml)| 9.6M | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_train.tar) |
|en_number_mobile_slim_v2.0_rec|slim裁剪量化版超轻量模型,支持英文、数字识别|[rec_en_number_lite_train.yml](../../configs/rec/multi_language/rec_en_number_lite_train.yml)| 2.7M | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/en_number_mobile_v2.0_rec_slim_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/en_number_mobile_v2.0_rec_slim_train.tar) |
|en_number_mobile_v2.0_rec|原始超轻量模型,支持英文、数字识别|[rec_en_number_lite_train.yml](../../configs/rec/multi_language/rec_en_number_lite_train.yml)|2.6M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/en_number_mobile_v2.0_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/en_number_mobile_v2.0_rec_train.tar) |
......@@ -107,17 +107,16 @@ PaddleOCR提供的可下载模型包括`推理模型`、`训练模型`、`预训
|模型名称|字典文件|模型简介|配置文件|推理模型大小|下载地址|
| --- | --- | --- | --- |--- | --- |
| korean_PP-OCRv3_rec | ppocr/utils/dict/korean_dict.txt |韩文识别|[korean_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/korean_PP-OCRv3_rec.yml)|11M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/korean_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/korean_PP-OCRv3_rec_train.tar) |
| japan_PP-OCRv3_rec | ppocr/utils/dict/japan_dict.txt |日文识别|[japan_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/japan_PP-OCRv3_rec.yml)|11M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/japan_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/japan_PP-OCRv3_rec_train.tar) |
| chinese_cht_PP-OCRv3_rec | ppocr/utils/dict/chinese_cht_dict.txt | 中文繁体识别|[chinese_cht_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/chinese_cht_PP-OCRv3_rec.yml)|12M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/chinese_cht_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/chinese_cht_PP-OCRv3_rec_train.tar) |
| te_PP-OCRv3_rec | ppocr/utils/dict/te_dict.txt | 泰卢固文识别|[te_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/te_PP-OCRv3_rec.yml)|9.6M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/te_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/te_PP-OCRv3_rec_train.tar) |
| ka_PP-OCRv3_rec | ppocr/utils/dict/ka_dict.txt |卡纳达文识别|[ka_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/ka_PP-OCRv3_rec.yml)|9.9M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/ka_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/ka_PP-OCRv3_rec_train.tar) |
| ta_PP-OCRv3_rec | ppocr/utils/dict/ta_dict.txt |泰米尔文识别|[ta_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/ta_PP-OCRv3_rec.yml)|9.6M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/ta_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/ta_PP-OCRv3_rec_train.tar) |
| latin_PP-OCRv3_rec | ppocr/utils/dict/latin_dict.txt | 拉丁文识别 | [latin_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/latin_PP-OCRv3_rec.yml) |9.6M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/latin_ppocr_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/latin_ppocr_PP-OCRv3_rec_train.tar) |
| arabic_PP-OCRv3_rec | ppocr/utils/dict/arabic_dict.txt | 阿拉伯字母 | [arabic_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/rec_arabic_lite_train.yml) |9.6M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/arabic_ppocr_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/arabic_ppocr_PP-OCRv3_rec_train.tar) |
| cyrillic_PP-OCRv3_rec | ppocr/utils/dict/cyrillic_dict.txt | 斯拉夫字母 | [cyrillic_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/cyrillic_PP-OCRv3_rec.yml) |9.6M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/cyrillic_ppocr_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/cyrillic_ppocr_PP-OCRv3_rec_train.tar) |
| devanagari_PP-OCRv3_rec | ppocr/utils/dict/devanagari_dict.txt |梵文字母 | [devanagari_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/devanagari_PP-OCRv3_rec.yml) |9.6M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/devanagari_ppocr_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/devanagari_ppocr_PP-OCRv3_rec_train.tar) |
| korean_PP-OCRv3_rec | ppocr/utils/dict/korean_dict.txt |韩文识别|[korean_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/korean_PP-OCRv3_rec.yml)|11M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/korean_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/korean_PP-OCRv3_rec_train.tar) |
| japan_PP-OCRv3_rec | ppocr/utils/dict/japan_dict.txt |日文识别|[japan_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/japan_PP-OCRv3_rec.yml)|11M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/japan_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/japan_PP-OCRv3_rec_train.tar) |
| chinese_cht_PP-OCRv3_rec | ppocr/utils/dict/chinese_cht_dict.txt | 中文繁体识别|[chinese_cht_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/chinese_cht_PP-OCRv3_rec.yml)|12M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/chinese_cht_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/chinese_cht_PP-OCRv3_rec_train.tar) |
| te_PP-OCRv3_rec | ppocr/utils/dict/te_dict.txt | 泰卢固文识别|[te_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/te_PP-OCRv3_rec.yml)|9.6M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/te_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/te_PP-OCRv3_rec_train.tar) |
| ka_PP-OCRv3_rec | ppocr/utils/dict/ka_dict.txt |卡纳达文识别|[ka_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/ka_PP-OCRv3_rec.yml)|9.9M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/ka_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/ka_PP-OCRv3_rec_train.tar) |
| ta_PP-OCRv3_rec | ppocr/utils/dict/ta_dict.txt |泰米尔文识别|[ta_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/ta_PP-OCRv3_rec.yml)|9.6M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/ta_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/ta_PP-OCRv3_rec_train.tar) |
| latin_PP-OCRv3_rec | ppocr/utils/dict/latin_dict.txt | 拉丁文识别 | [latin_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/latin_PP-OCRv3_rec.yml) |9.7M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/latin_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/latin_PP-OCRv3_rec_train.tar) |
| arabic_PP-OCRv3_rec | ppocr/utils/dict/arabic_dict.txt | 阿拉伯字母 | [arabic_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/rec_arabic_lite_train.yml) |9.6M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/arabic_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/arabic_PP-OCRv3_rec_train.tar) |
| cyrillic_PP-OCRv3_rec | ppocr/utils/dict/cyrillic_dict.txt | 斯拉夫字母 | [cyrillic_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/cyrillic_PP-OCRv3_rec.yml) |9.6M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/cyrillic_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/cyrillic_PP-OCRv3_rec_train.tar) |
| devanagari_PP-OCRv3_rec | ppocr/utils/dict/devanagari_dict.txt |梵文字母 | [devanagari_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/devanagari_PP-OCRv3_rec.yml) |9.9M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/devanagari_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/devanagari_PP-OCRv3_rec_train.tar) |
更多支持语种请参考: [多语言模型](./multi_languages.md)
......@@ -127,13 +126,18 @@ PaddleOCR提供的可下载模型包括`推理模型`、`训练模型`、`预训
|模型名称|模型简介|配置文件|推理模型大小|下载地址|
| --- | --- | --- | --- | --- |
|ch_ppocr_mobile_slim_v2.0_cls|slim量化版模型,对检测到的文本行文字角度分类|[cls_mv3.yml](../../configs/cls/cls_mv3.yml)| 2.1M |[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_slim_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_slim_infer.tar) |
|ch_ppocr_mobile_slim_v2.0_cls|slim量化版模型,对检测到的文本行文字角度分类|[cls_mv3.yml](../../configs/cls/cls_mv3.yml)| 2.1M |[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_slim_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_slim_infer.tar) / [nb模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_ppocr_mobile_v2.0_cls_infer_opt.nb) |
|ch_ppocr_mobile_v2.0_cls|原始分类器模型,对检测到的文本行文字角度分类|[cls_mv3.yml](../../configs/cls/cls_mv3.yml)|1.38M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) |
<a name="Paddle-Lite模型"></a>
## 4. Paddle-Lite 模型
Paddle-Lite 是一个高性能、轻量级、灵活性强且易于扩展的深度学习推理框架,它可以对inference模型进一步优化,得到适用于移动端/IoT端等端侧部署场景的`nb模型`。一般建议基于量化模型进行转换,因为可以将模型以INT8形式进行存储与推理,从而进一步减小模型大小,提升模型速度。
本节主要列出PP-OCRv2以及更早版本的检测与识别nb模型,最新版本的nb模型可以直接从上面的模型列表中获得。
|模型版本|模型简介|模型大小|检测模型|文本方向分类模型|识别模型|Paddle-Lite版本|
|---|---|---|---|---|---|---|
|PP-OCRv2|蒸馏版超轻量中文OCR移动端模型|11M|[下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_PP-OCRv2_det_infer_opt.nb)|[下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_ppocr_mobile_v2.0_cls_infer_opt.nb)|[下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_PP-OCRv2_rec_infer_opt.nb)|v2.10|
......
......@@ -38,8 +38,9 @@ PP-OCRv2在PP-OCR的基础上,进一步在5个方面重点优化,检测模
#### PP-OCRv3
PP-OCRv3在PP-OCRv2的基础上进一步升级。检测模型仍然基于DB算法,优化策略采用了带残差注意力机制的FPN结构RSEFPN、增大感受野的PAN结构LKPAN、基于DML训练的更优的教师模型;识别模型将base模型从CRNN替换成了IJCAI 2022论文[SVTR](https://arxiv.org/abs/2205.00159),并采用SVTR轻量化、带指导训练CTC、数据增广策略RecConAug、自监督训练的更好的预训练模型、无标签数据的使用进行模型加速和效果提升。更多细节请参考PP-OCRv3[技术报告](./PP-OCRv3_introduction.md)
PP-OCRv3在PP-OCRv2的基础上,针对检测模型和识别模型,进行了共计9个方面的升级:
- PP-OCRv3检测模型对PP-OCRv2中的CML协同互学习文本检测蒸馏策略进行了升级,分别针对教师模型和学生模型进行进一步效果优化。其中,在对教师模型优化时,提出了大感受野的PAN结构LK-PAN和引入了DML蒸馏策略;在对学生模型优化时,提出了残差注意力机制的FPN结构RSE-FPN。
- PP-OCRv3的识别模块是基于文本识别算法[SVTR](https://arxiv.org/abs/2205.00159)优化。SVTR不再采用RNN结构,通过引入Transformers结构更加有效地挖掘文本行图像的上下文信息,从而提升文本识别能力。PP-OCRv3通过轻量级文本识别网络SVTR_LCNet、Attention损失指导CTC损失训练策略、挖掘文字上下文信息的数据增广策略TextConAug、TextRotNet自监督预训练模型、UDML联合互学习策略、UIM无标注数据挖掘方案,6个方面进行模型加速和效果提升。
PP-OCRv3系统pipeline如下:
......@@ -47,6 +48,9 @@ PP-OCRv3系统pipeline如下:
<img src="../ppocrv3_framework.png" width="800">
</div>
更多细节请参考PP-OCRv3[技术报告](./PP-OCRv3_introduction.md)
<a name="2"></a>
## 2. 特性
......
......@@ -59,15 +59,13 @@ cd /path/to/ppocr_img
如果不使用提供的测试图片,可以将下方`--image_dir`参数替换为相应的测试图片路径。
**注意** whl包默认使用`PP-OCRv3`模型,识别模型使用的输入shape为`3,48,320`, 因此如果使用识别功能,需要添加参数`--rec_image_shape 3,48,320`,如果不使用默认的`PP-OCRv3`模型,则无需设置该参数。
<a name="211"></a>
#### 2.1.1 中英文模型
* 检测+方向分类器+识别全流程:`--use_angle_cls true`设置使用方向分类器识别180度旋转文字,`--use_gpu false`设置不使用GPU
```bash
paddleocr --image_dir ./imgs/11.jpg --use_angle_cls true --use_gpu false --rec_image_shape 3,48,320
paddleocr --image_dir ./imgs/11.jpg --use_angle_cls true --use_gpu false
```
结果是一个list,每个item包含了文本框,文字和识别置信度
......@@ -94,7 +92,7 @@ cd /path/to/ppocr_img
- 单独使用识别:设置`--det``false`
```bash
paddleocr --image_dir ./imgs_words/ch/word_1.jpg --det false --rec_image_shape 3,48,320
paddleocr --image_dir ./imgs_words/ch/word_1.jpg --det false
```
结果是一个list,每个item只包含识别结果和识别置信度
......@@ -104,16 +102,16 @@ cd /path/to/ppocr_img
```
如需使用2.0模型,请指定参数`--version PP-OCR`,paddleocr默认使用PP-OCRv3模型(`--versioin PP-OCRv3`)。更多whl包使用可参考[whl包文档](./whl.md)
如需使用2.0模型,请指定参数`--ocr_version PP-OCR`,paddleocr默认使用PP-OCRv3模型(`--ocr_version PP-OCRv3`)。更多whl包使用可参考[whl包文档](./whl.md)
<a name="212"></a>
#### 2.1.2 多语言模型
Paddleocr目前支持80个语种,可以通过修改`--lang`参数进行切换,对于英文模型,指定`--lang=en`, PP-OCRv3目前只支持中文和英文模型,其他多语言模型会陆续更新
PaddleOCR目前支持80个语种,可以通过修改`--lang`参数进行切换,对于英文模型,指定`--lang=en`
``` bash
paddleocr --image_dir ./imgs_en/254.jpg --lang=en --rec_image_shape 3,48,320
paddleocr --image_dir ./imgs_en/254.jpg --lang=en
```
<div align="center">
......
# 更新
- 2022.5.7 添加对[Weights & Biases](https://docs.wandb.ai/)训练日志记录工具的支持。
- 2021.12.21 《OCR十讲》课程开讲,12月21日起每晚八点半线上授课! 【免费】报名地址:https://aistudio.baidu.com/aistudio/course/introduce/25207
- 2021.12.21 发布PaddleOCR v2.4。OCR算法新增1种文本检测算法(PSENet),3种文本识别算法(NRTR、SEED、SAR);文档结构化算法新增1种关键信息提取算法(SDMGR),3种DocVQA算法(LayoutLM、LayoutLMv2,LayoutXLM)。
- 2021.9.7 发布PaddleOCR v2.3,发布[PP-OCRv2](#PP-OCRv2),CPU推理速度相比于PP-OCR server提升220%;效果相比于PP-OCR mobile 提升7%。
......
......@@ -199,12 +199,10 @@ for line in result:
paddleocr -h
```
**注意** whl包默认使用`PP-OCRv3`模型,识别模型使用的输入shape为`3,48,320`, 因此如果使用识别功能,需要添加参数`--rec_image_shape 3,48,320`,如果不使用默认的`PP-OCRv3`模型,则无需设置该参数。
* 检测+方向分类器+识别全流程
```bash
paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --use_angle_cls true --rec_image_shape 3,48,320
paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --use_angle_cls true
```
结果是一个list,每个item包含了文本框,文字和识别置信度
......@@ -217,7 +215,7 @@ paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --use_angle_cls true --rec_image
* 检测+识别
```bash
paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --rec_image_shape 3,48,320
paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg
```
结果是一个list,每个item包含了文本框,文字和识别置信度
......@@ -230,7 +228,7 @@ paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --rec_image_shape 3,48,320
* 方向分类器+识别
```bash
paddleocr --image_dir PaddleOCR/doc/imgs_words/ch/word_1.jpg --use_angle_cls true --det false --rec_image_shape 3,48,320
paddleocr --image_dir PaddleOCR/doc/imgs_words/ch/word_1.jpg --use_angle_cls true --det false
```
结果是一个list,每个item只包含识别结果和识别置信度
......@@ -256,7 +254,7 @@ paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --rec false
* 单独执行识别
```bash
paddleocr --image_dir PaddleOCR/doc/imgs_words/ch/word_1.jpg --det false --rec_image_shape 3,48,320
paddleocr --image_dir PaddleOCR/doc/imgs_words/ch/word_1.jpg --det false
```
结果是一个list,每个item只包含识别结果和识别置信度
......@@ -416,4 +414,4 @@ im_show.save('result.jpg')
| cls | 前向时是否启动分类 (命令行模式下使用use_angle_cls控制前向是否启动分类) | FALSE |
| show_log | 是否打印logger信息 | FALSE |
| type | 执行ocr或者表格结构化, 值可选['ocr','structure'] | ocr |
| ocr_version | OCR模型版本,可选PP-OCRv3, PP-OCRv2, PP-OCR。PP-OCRv3 目前仅支持中、英文的检测和识别模型,方向分类器模型;PP-OCRv2 目前仅支持中文的检测和识别模型;PP-OCR支持中文的检测,识别,多语种识别,方向分类器等模型 | PP-OCRv3 |
| ocr_version | OCR模型版本,可选PP-OCRv3, PP-OCRv2, PP-OCR。PP-OCRv3 支持中、英文的检测、识别、多语种识别,方向分类器等模型;PP-OCRv2 目前仅支持中文的检测和识别模型;PP-OCR支持中文的检测,识别,多语种识别,方向分类器等模型 | PP-OCRv3 |
......@@ -36,6 +36,7 @@ Take rec_chinese_lite_train_v2.0.yml as an example
| pretrained_model | Set the path of the pre-trained model | ./pretrain_models/CRNN/best_accuracy | \ |
| checkpoints | set model parameter path | None | Used to load parameters after interruption to continue training|
| use_visualdl | Set whether to enable visualdl for visual log display | False | [Tutorial](https://www.paddlepaddle.org.cn/paddle/visualdl) |
| use_wandb | Set whether to enable W&B for visual log display | False | [Documentation](https://docs.wandb.ai/)
| infer_img | Set inference image path or folder path | ./infer_img | \||
| character_dict_path | Set dictionary path | ./ppocr/utils/ppocr_keys_v1.txt | If the character_dict_path is None, model can only recognize number and lower letters |
| max_text_length | Set the maximum length of text | 25 | \ |
......@@ -66,7 +67,7 @@ In PaddleOCR, the network is divided into four stages: Transform, Backbone, Neck
| :---------------------: | :---------------------: | :--------------: | :--------------------: |
| model_type | Network Type | rec | Currently support`rec`,`det`,`cls` |
| algorithm | Model name | CRNN | See [algorithm_overview](./algorithm_overview_en.md) for the support list |
| **Transform** | Set the transformation method | - | Currently only recognition algorithms are supported, see [ppocr/modeling/transforms](../../ppocr/modeling/transforms) for details |
| **Transform** | Set the transformation method | - | Currently only recognition algorithms are supported, see [ppocr/modeling/transform](../../ppocr/modeling/transforms) for details |
| name | Transformation class name | TPS | Currently supports `TPS` |
| num_fiducial | Number of TPS control points | 20 | Ten on the top and bottom |
| loc_lr | Localization network learning rate | 0.1 | \ |
......@@ -130,6 +131,17 @@ In PaddleOCR, the network is divided into four stages: Transform, Backbone, Neck
| drop_last | Whether to discard the last incomplete mini-batch because the number of samples in the data set cannot be divisible by batch_size | True | \ |
| num_workers | The number of sub-processes used to load data, if it is 0, the sub-process is not started, and the data is loaded in the main process | 8 | \ |
### Weights & Biases ([W&B](../../ppocr/utils/loggers/wandb_logger.py))
| Parameter | Use | Defaults | Note |
| :---------------------: | :---------------------: | :--------------: | :--------------------: |
| project | Project to which the run is to be logged | uncategorized | \
| name | Alias/Name of the run | Randomly generated by wandb | \
| id | ID of the run | Randomly generated by wandb | \
| entity | User or team to which the run is being logged | The logged in user | \
| save_dir | local directory in which all the models and other data is saved | wandb | \
| config | model configuration | None | \
<a name="3-multilingual-config-file-generation"></a>
## 3. Multilingual Config File Generation
......@@ -233,4 +245,4 @@ For more supported languages, please refer to : [Multi-language model](https://g
The multi-language model training method is the same as the Chinese model. The training data set is 100w synthetic data. A small amount of fonts and test data can be downloaded using the following two methods.
* [Baidu Netdisk](https://pan.baidu.com/s/1bS_u207Rm7YbY33wOECKDA),Extraction code:frgi.
* [Google drive](https://drive.google.com/file/d/18cSWX7wXSy4G0tbKJ0d9PuIaiwRLHpjA/view)
* [Google drive](https://drive.google.com/file/d/18cSWX7wXSy4G0tbKJ0d9PuIaiwRLHpjA/view)
\ No newline at end of file
## Logging metrics and models
PaddleOCR comes with two metric logging tools integrated directly into the training API: [VisualDL](https://readthedocs.org/projects/visualdl/) and [Weights & Biases](https://docs.wandb.ai/).
### VisualDL
VisualDL is a visualization analysis tool of PaddlePaddle. The integration allows all training metrics to be logged to a VisualDL dashboard. To use it, add the following line to the `Global` section of the config yaml file -
```
Global:
use_visualdl: True
```
To see the visualizations run the following command in your terminal
```shell
visualdl --logdir <save_model_dir>
```
Now open `localhost:8040` in your browser of choice!
### Weights & Biases
W&B is a MLOps tool that can be used for experiment tracking, dataset/model versioning, visualizing results and collaborating with colleagues. A W&B logger is integrated directly into PaddleOCR and to use it, first you need to install the `wandb` sdk and login to your wandb account.
```shell
pip install wandb
wandb login
```
If you do not have a wandb account, you can make one [here](https://wandb.ai/site).
To visualize and track your model training add the following flag to your config yaml file under the `Global` section -
```
Global:
use_wandb: True
```
To add more arguments to the `WandbLogger` listed [here](./config_en.md) add the header `wandb` to the yaml file and add the arguments under it -
```
wandb:
project: my_project
entity: my_team
```
These config variables from the yaml file are used to instantiate the `WandbLogger` object with the project name, entity name (the logged in user by default), directory to store metadata (`./wandb` by default) and more. During the training process, the `log_metrics` function is called to log training and evaluation metrics at the training and evaluation steps respectively from the rank 0 process only.
At every model saving step, the WandbLogger, logs the model using the `log_model` function along with relavant metadata and tags showing the epoch in which the model is saved, the model is best or not and so on.
All the logging mentioned above is integrated into the `program.train` function and will generate dashboards like this -
![W&B Dashboard](../imgs_en/wandb_metrics.png)
![W&B Models](../imgs_en/wandb_models.png)
For more advanced usage to log images, audios, videos or any other form of data, you can use `WandbLogger().run.log`. More examples on how to log different kinds of data are available [here](https://docs.wandb.ai/examples).
To view the dashboard, the link to the dashboard is printed to the console at the beginning and end of every training job and you can also access it by logging into your W&B account on your browser.
### Using Multiple Loggers
Both VisualDL and W&B can also be used simultaneously by just setting both the aforementioned flags to True.
\ No newline at end of file
......@@ -16,13 +16,13 @@
- [3. Text Angle Classification Model](#3-text-angle-classification-model)
- [4. Paddle-Lite Model](#4-paddle-lite-model)
The downloadable models provided by PaddleOCR include `inference model`, `trained model`, `pre-trained model` and `slim model`. The differences between the models are as follows:
The downloadable models provided by PaddleOCR include `inference model`, `trained model`, `pre-trained model` and `nb model`. The differences between the models are as follows:
|model type|model format|description|
|--- | --- | --- |
|inference model|inference.pdmodel、inference.pdiparams|Used for inference based on Paddle inference engine,[detail](./inference_en.md)|
|trained model, pre-trained model|\*.pdparams、\*.pdopt、\*.states |The checkpoints model saved in the training process, which stores the parameters of the model, mostly used for model evaluation and continuous training.|
|slim model|\*.nb| Model compressed by PaddleSlim (a model compression tool using PaddlePaddle), which is suitable for mobile-side deployment scenarios (Paddle-Lite is needed for slim model deployment). |
|nb model|\*.nb| Model optimized by Paddle-Lite, which is suitable for mobile-side deployment scenarios (Paddle-Lite is needed for nb model deployment). |
Relationship of the above models is as follows.
......@@ -37,7 +37,7 @@ Relationship of the above models is as follows.
|model name|description|config|model size|download|
| --- | --- | --- | --- | --- |
|ch_PP-OCRv3_det_slim| [New] slim quantization with distillation lightweight model, supporting Chinese, English, multilingual text detection |[ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml)| 1.1M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_infer.tar) / [trained model (coming soon)](https://paddleocr.bj.bcebos.com/PP-OCRv3/ch/ch_PP-OCRv3_det_slim_distill_train.tar) / [slim model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_infer.nb)|
|ch_PP-OCRv3_det_slim| [New] slim quantization with distillation lightweight model, supporting Chinese, English, multilingual text detection |[ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml)| 1.1M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/ch/ch_PP-OCRv3_det_slim_distill_train.tar) / [nb model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_infer.nb)|
|ch_PP-OCRv3_det| [New] Original lightweight model, supporting Chinese, English, multilingual text detection |[ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml)| 3.8M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar)|
|ch_PP-OCRv2_det_slim| [New] slim quantization with distillation lightweight model, supporting Chinese, English, multilingual text detection|[ch_PP-OCRv2_det_cml.yml](../../configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml)| 3M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_slim_quant_infer.tar)|
|ch_PP-OCRv2_det| [New] Original lightweight model, supporting Chinese, English, multilingual text detection|[ch_PP-OCRv2_det_cml.yml](../../configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml)|3M|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_distill_train.tar)|
......@@ -51,7 +51,7 @@ Relationship of the above models is as follows.
|model name|description|config|model size|download|
| --- | --- | --- | --- | --- |
|en_PP-OCRv3_det_slim | [New] Slim qunatization with distillation lightweight detection model, supporting English | [ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml) | 1.1M |[inference model(coming soon)](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_slim_infer.tar) / [trained model (coming soon)](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_slim_distill_train.tar) / [slim model(coming soon)](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_slim_infer.nb) |
|en_PP-OCRv3_det_slim | [New] Slim qunatization with distillation lightweight detection model, supporting English | [ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml) | 1.1M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_slim_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_slim_distill_train.tar) / [nb model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_slim_infer.nb) |
|ch_PP-OCRv3_det | [New] Original lightweight detection model, supporting English |[ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml)| 3.8M | [inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_distill_train.tar) |
* Note: English configuration file is same as Chinese except training data, here we only provide one configuration file.
......@@ -62,7 +62,7 @@ Relationship of the above models is as follows.
|model name|description|config|model size|download|
| --- | --- | --- | --- | --- |
| ml_PP-OCRv3_det_slim | [New] Slim qunatization with distillation lightweight detection model, supporting English | [ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml) | 1.1M | [inference model(coming soon)](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/Multilingual_PP-OCRv3_det_slim_infer.tar) / [trained model (coming soon)](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/Multilingual_PP-OCRv3_det_slim_distill_train.tar) / [slim model(coming soon)](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/Multilingual_PP-OCRv3_det_slim_infer.nb) |
| ml_PP-OCRv3_det_slim | [New] Slim qunatization with distillation lightweight detection model, supporting English | [ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml) | 1.1M | [inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/Multilingual_PP-OCRv3_det_slim_infer.tar) / [trained model ](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/Multilingual_PP-OCRv3_det_slim_distill_train.tar) / [nb model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/Multilingual_PP-OCRv3_det_slim_infer.nb) |
| ml_PP-OCRv3_det |[New] Original lightweight detection model, supporting English | [ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml)| 3.8M | [inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/Multilingual_PP-OCRv3_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/Multilingual_PP-OCRv3_det_distill_train.tar) |
* Note: English configuration file is same as Chinese except training data, here we only provide one configuration file.
......@@ -75,7 +75,7 @@ Relationship of the above models is as follows.
|model name|description|config|model size|download|
| --- | --- | --- | --- | --- |
|ch_PP-OCRv3_rec_slim | [New] Slim qunatization with distillation lightweight model, supporting Chinese, English text recognition |[ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml)| 4.9M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/ch/ch_PP-OCRv3_rec_slim_train.tar) / [slim model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.nb) |
|ch_PP-OCRv3_rec_slim | [New] Slim qunatization with distillation lightweight model, supporting Chinese, English text recognition |[ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml)| 4.9M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/ch/ch_PP-OCRv3_rec_slim_train.tar) / [nb model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.nb) |
|ch_PP-OCRv3_rec| [New] Original lightweight model, supporting Chinese, English, multilingual text recognition |[ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml)| 12.4M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_train.tar) |
|ch_PP-OCRv2_rec_slim| Slim qunatization with distillation lightweight model, supporting Chinese, English text recognition|[ch_PP-OCRv2_rec.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec.yml)| 9M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_train.tar) |
|ch_PP-OCRv2_rec| Original lightweight model, supporting Chinese, English, multilingual text recognition |[ch_PP-OCRv2_rec_distillation.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec_distillation.yml)|8.5M|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_train.tar) |
......@@ -91,8 +91,8 @@ Relationship of the above models is as follows.
|model name|description|config|model size|download|
| --- | --- | --- | --- | --- |
|en_PP-OCRv3_rec_slim | [New] Slim qunatization with distillation lightweight model, supporting english, English text recognition |[en_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/en_PP-OCRv3_rec_distillation.yml)| 4.9M |[inference model(coming soon)](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_slim_infer.tar) / [trained model (coming soon)](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_slim_train.tar) / [slim model(coming soon)](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_slim_infer.nb) |
|en_PP-OCRv3_rec| [New] Original lightweight model, supporting english, English, multilingual text recognition |[en_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/en_PP-OCRv3_rec_distillation.yml)| 12.4M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_train.tar) |
|en_PP-OCRv3_rec_slim | [New] Slim qunatization with distillation lightweight model, supporting english, English text recognition |[en_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml)| 3.2M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_slim_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_slim_train.tar) / [nb model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_slim_infer.nb) |
|en_PP-OCRv3_rec| [New] Original lightweight model, supporting english, English, multilingual text recognition |[en_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml)| 9.6M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_train.tar) |
|en_number_mobile_slim_v2.0_rec|Slim pruned and quantized lightweight model, supporting English and number recognition|[rec_en_number_lite_train.yml](../../configs/rec/multi_language/rec_en_number_lite_train.yml)| 2.7M | [inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/en_number_mobile_v2.0_rec_slim_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/en_number_mobile_v2.0_rec_slim_train.tar) |
|en_number_mobile_v2.0_rec|Original lightweight model, supporting English and number recognition|[rec_en_number_lite_train.yml](../../configs/rec/multi_language/rec_en_number_lite_train.yml)|2.6M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/en_number_mobile_v2.0_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/en_number_mobile_v2.0_rec_train.tar) |
......@@ -122,11 +122,16 @@ For more supported languages, please refer to : [Multi-language model](./multi_l
|model name|description|config|model size|download|
| --- | --- | --- | --- | --- |
|ch_ppocr_mobile_slim_v2.0_cls|Slim quantized model for text angle classification|[cls_mv3.yml](../../configs/cls/cls_mv3.yml)| 2.1M | [inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_slim_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_slim_train.tar) |
|ch_ppocr_mobile_slim_v2.0_cls|Slim quantized model for text angle classification|[cls_mv3.yml](../../configs/cls/cls_mv3.yml)| 2.1M | [inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_slim_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_slim_train.tar) / [nb model](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_ppocr_mobile_v2.0_cls_infer_opt.nb) |
|ch_ppocr_mobile_v2.0_cls|Original model for text angle classification|[cls_mv3.yml](../../configs/cls/cls_mv3.yml)|1.38M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) |
<a name="Paddle-Lite"></a>
## 4. Paddle-Lite Model
Paddle Lite is an updated version of Paddle-Mobile, an open-open source deep learning framework designed to make it easy to perform inference on mobile, embeded, and IoT devices. It can further optimize the inference model and generate `nb model` used for edge devices. It's suggested to optimize the quantization model using Paddle-Lite because `INT8` format is used for the model storage and inference.
This chapter lists OCR nb models with PP-OCRv2 or earlier versions. You can access to the latest nb models from the above tables.
|Version|Introduction|Model size|Detection model|Text Direction model|Recognition model|Paddle-Lite branch|
|---|---|---|---|---|---|---|
|PP-OCRv2|extra-lightweight chinese OCR optimized model|11M|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_PP-OCRv2_det_infer_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_ppocr_mobile_v2.0_cls_infer_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_PP-OCRv2_rec_infer_opt.nb)|v2.10|
......
......@@ -17,6 +17,7 @@ English | [简体中文](../doc_ch/ppocr_introduction.md)
PP-OCR is a self-developed practical ultra-lightweight OCR system, which is slimed and optimized based on the reimplemented [academic algorithms](algorithm_en.md), considering the balance between **accuracy** and **speed**.
#### PP-OCR
PP-OCR is a two-stage OCR system, in which the text detection algorithm is [DB](algorithm_det_db_en.md), and the text recognition algorithm is [CRNN](algorithm_rec_crnn_en.md). Besides, a [text direction classifier](angle_class_en.md) is added between the detection and recognition modules to deal with text in different directions.
PP-OCR pipeline is as follows:
......@@ -28,11 +29,16 @@ PP-OCR pipeline is as follows:
PP-OCR system is in continuous optimization. At present, PP-OCR and PP-OCRv2 have been released:
[1] PP-OCR adopts 19 effective strategies from 8 aspects including backbone network selection and adjustment, prediction head design, data augmentation, learning rate transformation strategy, regularization parameter selection, pre-training model use, and automatic model tailoring and quantization to optimize and slim down the models of each module (as shown in the green box above). The final results are an ultra-lightweight Chinese and English OCR model with an overall size of 3.5M and a 2.8M English digital OCR model. For more details, please refer to the PP-OCR technical article (https://arxiv.org/abs/2009.09941).
PP-OCR adopts 19 effective strategies from 8 aspects including backbone network selection and adjustment, prediction head design, data augmentation, learning rate transformation strategy, regularization parameter selection, pre-training model use, and automatic model tailoring and quantization to optimize and slim down the models of each module (as shown in the green box above). The final results are an ultra-lightweight Chinese and English OCR model with an overall size of 3.5M and a 2.8M English digital OCR model. For more details, please refer to the PP-OCR technical article (https://arxiv.org/abs/2009.09941).
[2] On the basis of PP-OCR, PP-OCRv2 is further optimized in five aspects. The detection model adopts CML(Collaborative Mutual Learning) knowledge distillation strategy and CopyPaste data expansion strategy. The recognition model adopts LCNet lightweight backbone network, U-DML knowledge distillation strategy and enhanced CTC loss function improvement (as shown in the red box above), which further improves the inference speed and prediction effect. For more details, please refer to the technical report of PP-OCRv2 (https://arxiv.org/abs/2109.03144).
#### PP-OCRv2
On the basis of PP-OCR, PP-OCRv2 is further optimized in five aspects. The detection model adopts CML(Collaborative Mutual Learning) knowledge distillation strategy and CopyPaste data expansion strategy. The recognition model adopts LCNet lightweight backbone network, U-DML knowledge distillation strategy and enhanced CTC loss function improvement (as shown in the red box above), which further improves the inference speed and prediction effect. For more details, please refer to the technical report of PP-OCRv2 (https://arxiv.org/abs/2109.03144).
[3] PP-OCRv3 is further upgraded on the basis of PP-OCRv2. The detection model is still based on DB algorithm, and the optimization strategies include a newly proposed FPN structure with residual attention mechanism named with RSEFPN, a PAN structure with enlarged receptive field named with LKPAN, and better teacher model based on DML training; The recognition model replaces the base model from CRNN with IJCAI 2022 paper [SVTR](https://arxiv.org/abs/2205.00159), and adopts lightweight SVTR, guided training of CTC, data augmentation strategy RecConAug, better pre-trained model by self-supervised training, and the use of unlabeled data to accelerate the model and improve the effect. For more details, please refer to PP-OCRv3 [technical report](./PP-OCRv3_introduction_en.md).
#### PP-OCRv3
PP-OCRv3 upgraded the detection model and recognition model in 9 aspects based on PP-OCRv2:
- PP-OCRv3 detector upgrades the CML(Collaborative Mutual Learning) text detection strategy proposed in PP-OCRv2, and further optimizes the effect of teacher model and student model respectively. In the optimization of teacher model, a pan module with large receptive field named LK-PAN is proposed and the DML distillation strategy is adopted; In the optimization of student model, a FPN module with residual attention mechanism named RSE-FPN is proposed.
- PP-OCRv3 recognizer is optimized based on text recognition algorithm [SVTR](https://arxiv.org/abs/2205.00159). SVTR no longer adopts RNN by introducing transformers structure, which can mine the context information of text line image more effectively, so as to improve the ability of text recognition. PP-OCRv3 adopts lightweight text recognition network SVTR_LCNet, guided training of CTC loss by attention loss, data augmentation strategy TextConAug, better pre-trained model by self-supervised TextRotNet, UDML(Unified Deep Mutual Learning), and UIM (Unlabeled Images Mining) to accelerate the model and improve the effect.
PP-OCRv3 pipeline is as follows:
......@@ -40,6 +46,8 @@ PP-OCRv3 pipeline is as follows:
<img src="../ppocrv3_framework.png" width="800">
</div>
For more details, please refer to [PP-OCRv3 technical report](./PP-OCRv3_introduction_en.md).
<a name="2"></a>
## 2. Features
......
- [PaddleOCR Quick Start](#paddleocr-quick-start)
- [1. Installation](#1-installation)
# PaddleOCR Quick Start
**Note:** This tutorial mainly introduces the usage of PP-OCR series models, please refer to [PP-Structure Quick Start](../../ppstructure/docs/quickstart_en.md) for the quick use of document analysis related functions.
- [1. Installation](#1-installation)
- [1.1 Install PaddlePaddle](#11-install-paddlepaddle)
- [1.2 Install PaddleOCR Whl Package](#12-install-paddleocr-whl-package)
- [2. Easy-to-Use](#2-easy-to-use)
- [2. Easy-to-Use](#2-easy-to-use)
- [2.1 Use by Command Line](#21-use-by-command-line)
- [2.1.1 Chinese and English Model](#211-chinese-and-english-model)
- [2.1.2 Multi-language Model](#212-multi-language-model)
- [2.1.3 Layout Analysis](#213-layout-analysis)
- [2.2 Use by Code](#22-use-by-code)
- [2.2.1 Chinese & English Model and Multilingual Model](#221-chinese--english-model-and-multilingual-model)
- [2.2.2 Layout Analysis](#222-layout-analysis)
- [3. Summary](#3-summary)
- [3. Summary](#3-summary)
# PaddleOCR Quick Start
<a name="1nstallation"></a>
......@@ -73,8 +73,6 @@ cd /path/to/ppocr_img
If you do not use the provided test image, you can replace the following `--image_dir` parameter with the corresponding test image path
**Note**: The whl package uses the `PP-OCRv3` model by default, and the input shape used by the recognition model is `3,48,320`, so if you use the recognition function, you need to add the parameter `--rec_image_shape 3,48,320`, if you do not use the default `PP- OCRv3` model, you do not need to set this parameter.
<a name="211-english-and-chinese-model"></a>
#### 2.1.1 Chinese and English Model
......@@ -82,7 +80,7 @@ If you do not use the provided test image, you can replace the following `--imag
* Detection, direction classification and recognition: set the parameter`--use_gpu false` to disable the gpu device
```bash
paddleocr --image_dir ./imgs_en/img_12.jpg --use_angle_cls true --lang en --use_gpu false --rec_image_shape 3,48,320
paddleocr --image_dir ./imgs_en/img_12.jpg --use_angle_cls true --lang en --use_gpu false
```
Output will be a list, each item contains bounding box, text and recognition confidence
......@@ -112,7 +110,7 @@ If you do not use the provided test image, you can replace the following `--imag
* Only recognition: set `--det` to `false`
```bash
paddleocr --image_dir ./imgs_words_en/word_10.png --det false --lang en --rec_image_shape 3,48,320
paddleocr --image_dir ./imgs_words_en/word_10.png --det false --lang en
```
Output will be a list, each item contains text and recognition confidence
......@@ -121,15 +119,15 @@ If you do not use the provided test image, you can replace the following `--imag
['PAIN', 0.9934559464454651]
```
If you need to use the 2.0 model, please specify the parameter `--version PP-OCR`, paddleocr uses the PP-OCRv3 model by default(`--versioin PP-OCRv3`). More whl package usage can be found in [whl package](./whl_en.md)
If you need to use the 2.0 model, please specify the parameter `--ocr_version PP-OCR`, paddleocr uses the PP-OCRv3 model by default(`--ocr_version PP-OCRv3`). More whl package usage can be found in [whl package](./whl_en.md)
<a name="212-multi-language-model"></a>
#### 2.1.2 Multi-language Model
Paddleocr currently supports 80 languages, which can be switched by modifying the `--lang` parameter. PP-OCRv3 currently only supports Chinese and English models, and other multilingual models will be updated one after another.
PaddleOCR currently supports 80 languages, which can be switched by modifying the `--lang` parameter.
``` bash
paddleocr --image_dir ./doc/imgs_en/254.jpg --lang=en --rec_image_shape 3,48,320
paddleocr --image_dir ./doc/imgs_en/254.jpg --lang=en
```
<div align="center">
......@@ -154,48 +152,7 @@ Commonly used multilingual abbreviations include
| Chinese Traditional | chinese_cht | | Italian | it | | Russian | ru |
A list of all languages and their corresponding abbreviations can be found in [Multi-Language Model Tutorial](./multi_languages_en.md)
<a name="213-layoutAnalysis"></a>
#### 2.1.3 Layout Analysis
Layout analysis refers to the division of 5 types of areas of the document, including text, title, list, picture and table. For the first three types of regions, directly use the OCR model to complete the text detection and recognition of the corresponding regions, and save the results in txt. For the table area, after the table structuring process, the table picture is converted into an Excel file of the same table style. The picture area will be individually cropped into an image.
To use the layout analysis function of PaddleOCR, you need to specify `--type=structure`
```bash
paddleocr --image_dir=../doc/table/1.png --type=structure
```
- **Results Format**
The returned results of PP-Structure is a list composed of a dict, an example is as follows
```shell
[
{ 'type': 'Text',
'bbox': [34, 432, 345, 462],
'res': ([[36.0, 437.0, 341.0, 437.0, 341.0, 446.0, 36.0, 447.0], [41.0, 454.0, 125.0, 453.0, 125.0, 459.0, 41.0, 460.0]],
[('Tigure-6. The performance of CNN and IPT models using difforen', 0.90060663), ('Tent ', 0.465441)])
}
]
```
The description of each field in dict is as follows
| Parameter | Description |
| --------- | ------------------------------------------------------------ |
| type | Type of image area |
| bbox | The coordinates of the image area in the original image, respectively [left upper x, left upper y, right bottom x, right bottom y] |
| res | OCR or table recognition result of image area。<br> Table: HTML string of the table; <br> OCR: A tuple containing the detection coordinates and recognition results of each single line of text |
- **Parameter Description:**
| Parameter | Description | Default value |
| --------------- | ------------------------------------------------------------ | -------------------------------------------- |
| output | The path where excel and recognition results are saved | ./output/table |
| table_max_len | The long side of the image is resized in table structure model | 488 |
| table_model_dir | inference model path of table structure model | None |
| table_char_dict_path | dict path of table structure model | ../ppocr/utils/dict/table_structure_dict.txt |
<a name="22-use-by-code"></a>
......@@ -243,40 +200,12 @@ Visualization of results
<div align="center">
<img src="../imgs_results/whl/12_det_rec.jpg" width="800">
</div>
<a name="222-layoutAnalysis"></a>
#### 2.2.2 Layout Analysis
```python
import os
import cv2
from paddleocr import PPStructure,draw_structure_result,save_structure_res
table_engine = PPStructure(show_log=True)
save_folder = './output/table'
img_path = './table/1.png'
img = cv2.imread(img_path)
result = table_engine(img)
save_structure_res(result, save_folder,os.path.basename(img_path).split('.')[0])
for line in result:
line.pop('img')
print(line)
from PIL import Image
font_path = './fonts/simfang.ttf'
image = Image.open(img_path).convert('RGB')
im_show = draw_structure_result(image, result,font_path=font_path)
im_show = Image.fromarray(im_show)
im_show.save('result.jpg')
```
<a name="3"></a>
## 3. Summary
In this section, you have mastered the use of PaddleOCR whl packages and obtained results.
In this section, you have mastered the use of PaddleOCR whl package.
PaddleOCR is a rich and practical OCR tool library that opens up the whole process of data, model training, compression and inference deployment, so in the [next section](./paddleOCR_overview_en.md) we will first introduce you to the overview of PaddleOCR, and then clone the PaddleOCR project to start the application journey of PaddleOCR.
PaddleOCR is a rich and practical OCR tool library that get through the whole process of data production, model training, compression, inference and deployment, please refer to the [tutorials](../../README.md#tutorials) to start the journey of PaddleOCR.
# RECENT UPDATES
- 2022.5.7 Add support for metric and model logging during training to [Weights & Biases](https://docs.wandb.ai/).
- 2021.12.21 OCR open source online course starts. The lesson starts at 8:30 every night and lasts for ten days. Free registration: https://aistudio.baidu.com/aistudio/course/introduce/25207
- 2021.12.21 release PaddleOCR v2.4, release 1 text detection algorithm (PSENet), 3 text recognition algorithms (NRTR、SEED、SAR), 1 key information extraction algorithm (SDMGR) and 3 DocVQA algorithms (LayoutLM、LayoutLMv2,LayoutXLM).
- 2021.9.7 release PaddleOCR v2.3, [PP-OCRv2](#PP-OCRv2) is proposed. The CPU inference speed of PP-OCRv2 is 220% higher than that of PP-OCR server. The F-score of PP-OCRv2 is 7% higher than that of PP-OCR mobile.
......
......@@ -172,11 +172,9 @@ show help information
paddleocr -h
```
**Note**: The whl package uses the `PP-OCRv3` model by default, and the input shape used by the recognition model is `3,48,320`, so if you use the recognition function, you need to add the parameter `--rec_image_shape 3,48,320`, if you do not use the default `PP- OCRv3` model, you do not need to set this parameter.
* detection classification and recognition
```bash
paddleocr --image_dir PaddleOCR/doc/imgs_en/img_12.jpg --use_angle_cls true --lang en --rec_image_shape 3,48,320
paddleocr --image_dir PaddleOCR/doc/imgs_en/img_12.jpg --use_angle_cls true --lang en
```
Output will be a list, each item contains bounding box, text and recognition confidence
......@@ -189,7 +187,7 @@ Output will be a list, each item contains bounding box, text and recognition con
* detection and recognition
```bash
paddleocr --image_dir PaddleOCR/doc/imgs_en/img_12.jpg --lang en --rec_image_shape 3,48,320
paddleocr --image_dir PaddleOCR/doc/imgs_en/img_12.jpg --lang en
```
Output will be a list, each item contains bounding box, text and recognition confidence
......@@ -202,7 +200,7 @@ Output will be a list, each item contains bounding box, text and recognition con
* classification and recognition
```bash
paddleocr --image_dir PaddleOCR/doc/imgs_words_en/word_10.png --use_angle_cls true --det false --lang en --rec_image_shape 3,48,320
paddleocr --image_dir PaddleOCR/doc/imgs_words_en/word_10.png --use_angle_cls true --det false --lang en
```
Output will be a list, each item contains text and recognition confidence
......@@ -225,7 +223,7 @@ Output will be a list, each item only contains bounding box
* only recognition
```bash
paddleocr --image_dir PaddleOCR/doc/imgs_words_en/word_10.png --det false --lang en --rec_image_shape 3,48,320
paddleocr --image_dir PaddleOCR/doc/imgs_words_en/word_10.png --det false --lang en
```
Output will be a list, each item contains text and recognition confidence
......@@ -368,4 +366,4 @@ im_show.save('result.jpg')
| cls | Enable classification when `ppocr.ocr` func exec((Use use_angle_cls in command line mode to control whether to start classification in the forward direction) | FALSE |
| show_log | Whether to print log| FALSE |
| type | Perform ocr or table structuring, the value is selected in ['ocr','structure'] | ocr |
| ocr_version | OCR Model version number, the current model support list is as follows: PP-OCRv3 support Chinese and English detection and recognition model and direction classifier model, PP-OCRv2 support Chinese detection and recognition model, PP-OCR support Chinese detection, recognition and direction classifier, multilingual recognition model | PP-OCRv3 |
| ocr_version | OCR Model version number, the current model support list is as follows: PP-OCRv3 supports Chinese and English detection, recognition, multilingual recognition, direction classifier models, PP-OCRv2 support Chinese detection and recognition model, PP-OCR support Chinese detection, recognition and direction classifier, multilingual recognition model | PP-OCRv3 |
doc/imgs_words/arabic/ar_1.jpg

4.7 KB | W: | H:

doc/imgs_words/arabic/ar_1.jpg

46.9 KB | W: | H:

doc/imgs_words/arabic/ar_1.jpg
doc/imgs_words/arabic/ar_1.jpg
doc/imgs_words/arabic/ar_1.jpg
doc/imgs_words/arabic/ar_1.jpg
  • 2-up
  • Swipe
  • Onion skin
doc/imgs_words/arabic/ar_2.jpg

3.6 KB | W: | H:

doc/imgs_words/arabic/ar_2.jpg

13.1 KB | W: | H:

doc/imgs_words/arabic/ar_2.jpg
doc/imgs_words/arabic/ar_2.jpg
doc/imgs_words/arabic/ar_2.jpg
doc/imgs_words/arabic/ar_2.jpg
  • 2-up
  • Swipe
  • Onion skin
doc/ppocr_v3/GTC.png

415.0 KB | W: | H:

doc/ppocr_v3/GTC.png

238.2 KB | W: | H:

doc/ppocr_v3/GTC.png
doc/ppocr_v3/GTC.png
doc/ppocr_v3/GTC.png
doc/ppocr_v3/GTC.png
  • 2-up
  • Swipe
  • Onion skin
doc/ppocr_v3/LKPAN.png

130.2 KB | W: | H:

doc/ppocr_v3/LKPAN.png

92.5 KB | W: | H:

doc/ppocr_v3/LKPAN.png
doc/ppocr_v3/LKPAN.png
doc/ppocr_v3/LKPAN.png
doc/ppocr_v3/LKPAN.png
  • 2-up
  • Swipe
  • Onion skin
doc/ppocr_v3/RSEFPN.png

125.7 KB | W: | H:

doc/ppocr_v3/RSEFPN.png

97.6 KB | W: | H:

doc/ppocr_v3/RSEFPN.png
doc/ppocr_v3/RSEFPN.png
doc/ppocr_v3/RSEFPN.png
doc/ppocr_v3/RSEFPN.png
  • 2-up
  • Swipe
  • Onion skin
doc/ppocr_v3/svtr_g2.png

323.3 KB | W: | H:

doc/ppocr_v3/svtr_g2.png

187.3 KB | W: | H:

doc/ppocr_v3/svtr_g2.png
doc/ppocr_v3/svtr_g2.png
doc/ppocr_v3/svtr_g2.png
doc/ppocr_v3/svtr_g2.png
  • 2-up
  • Swipe
  • Onion skin
doc/ppocr_v3/svtr_g4.png

549.7 KB | W: | H:

doc/ppocr_v3/svtr_g4.png

213.6 KB | W: | H:

doc/ppocr_v3/svtr_g4.png
doc/ppocr_v3/svtr_g4.png
doc/ppocr_v3/svtr_g4.png
doc/ppocr_v3/svtr_g4.png
  • 2-up
  • Swipe
  • Onion skin
doc/ppocr_v3/svtr_tiny.png

585.6 KB | W: | H:

doc/ppocr_v3/svtr_tiny.png

725.7 KB | W: | H:

doc/ppocr_v3/svtr_tiny.png
doc/ppocr_v3/svtr_tiny.png
doc/ppocr_v3/svtr_tiny.png
doc/ppocr_v3/svtr_tiny.png
  • 2-up
  • Swipe
  • Onion skin
doc/ppocrv3_framework.png

957.2 KB | W: | H:

doc/ppocrv3_framework.png

1.1 MB | W: | H:

doc/ppocrv3_framework.png
doc/ppocrv3_framework.png
doc/ppocrv3_framework.png
doc/ppocrv3_framework.png
  • 2-up
  • Swipe
  • Onion skin
......@@ -67,6 +67,10 @@ MODEL_URLS = {
'url':
'https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar',
},
'ml': {
'url':
'https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/Multilingual_PP-OCRv3_det_infer.tar'
}
},
'rec': {
'ch': {
......@@ -79,6 +83,56 @@ MODEL_URLS = {
'https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar',
'dict_path': './ppocr/utils/en_dict.txt'
},
'korean': {
'url':
'https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/korean_PP-OCRv3_rec_infer.tar',
'dict_path': './ppocr/utils/dict/korean_dict.txt'
},
'japan': {
'url':
'https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/japan_PP-OCRv3_rec_infer.tar',
'dict_path': './ppocr/utils/dict/japan_dict.txt'
},
'chinese_cht': {
'url':
'https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/chinese_cht_PP-OCRv3_rec_infer.tar',
'dict_path': './ppocr/utils/dict/chinese_cht_dict.txt'
},
'ta': {
'url':
'https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/ta_PP-OCRv3_rec_infer.tar',
'dict_path': './ppocr/utils/dict/ta_dict.txt'
},
'te': {
'url':
'https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/te_PP-OCRv3_rec_infer.tar',
'dict_path': './ppocr/utils/dict/te_dict.txt'
},
'ka': {
'url':
'https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/ka_PP-OCRv3_rec_infer.tar',
'dict_path': './ppocr/utils/dict/ka_dict.txt'
},
'latin': {
'url':
'https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/latin_PP-OCRv3_rec_infer.tar',
'dict_path': './ppocr/utils/dict/latin_dict.txt'
},
'arabic': {
'url':
'https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/arabic_PP-OCRv3_rec_infer.tar',
'dict_path': './ppocr/utils/dict/arabic_dict.txt'
},
'cyrillic': {
'url':
'https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/cyrillic_PP-OCRv3_rec_infer.tar',
'dict_path': './ppocr/utils/dict/cyrillic_dict.txt'
},
'devanagari': {
'url':
'https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/devanagari_PP-OCRv3_rec_infer.tar',
'dict_path': './ppocr/utils/dict/devanagari_dict.txt'
},
},
'cls': {
'ch': {
......@@ -259,7 +313,7 @@ def parse_lang(lang):
'af', 'az', 'bs', 'cs', 'cy', 'da', 'de', 'es', 'et', 'fr', 'ga', 'hr',
'hu', 'id', 'is', 'it', 'ku', 'la', 'lt', 'lv', 'mi', 'ms', 'mt', 'nl',
'no', 'oc', 'pi', 'pl', 'pt', 'ro', 'rs_latin', 'sk', 'sl', 'sq', 'sv',
'sw', 'tl', 'tr', 'uz', 'vi'
'sw', 'tl', 'tr', 'uz', 'vi', 'french', 'german'
]
arabic_lang = ['ar', 'fa', 'ug', 'ur']
cyrillic_lang = [
......@@ -285,8 +339,10 @@ def parse_lang(lang):
det_lang = "ch"
elif lang == 'structure':
det_lang = 'structure'
else:
elif lang in ["en", "latin"]:
det_lang = "en"
else:
det_lang = "ml"
return lang, det_lang
......@@ -356,6 +412,10 @@ class PaddleOCR(predict_system.TextSystem):
params.cls_model_dir, cls_url = confirm_model_dir_url(
params.cls_model_dir,
os.path.join(BASE_DIR, 'whl', 'cls'), cls_model_config['url'])
if params.ocr_version == 'PP-OCRv3':
params.rec_image_shape = "3, 48, 320"
else:
params.rec_image_shape = "3, 32, 320"
# download model
maybe_download(params.det_model_dir, det_url)
maybe_download(params.rec_model_dir, rec_url)
......
......@@ -99,8 +99,8 @@ class SAREncoder(nn.Layer):
if valid_ratios is not None:
valid_hf = []
T = holistic_feat.shape[1]
for i, valid_ratio in enumerate(valid_ratios):
valid_step = min(T, math.ceil(T * valid_ratio)) - 1
for i in range(len(valid_ratios)):
valid_step = min(T, math.ceil(T * valid_ratios[i])) - 1
valid_hf.append(holistic_feat[i, valid_step, :])
valid_hf = paddle.stack(valid_hf, axis=0)
else:
......@@ -252,8 +252,8 @@ class ParallelSARDecoder(BaseDecoder):
if valid_ratios is not None:
# cal mask of attention weight
for i, valid_ratio in enumerate(valid_ratios):
valid_width = min(w, math.ceil(w * valid_ratio))
for i in range(len(valid_ratios)):
valid_width = min(w, math.ceil(w * valid_ratios[i]))
if valid_width < w:
attn_weight[i, :, :, valid_width:, :] = float('-inf')
......
from .vdl_logger import VDLLogger
from .wandb_logger import WandbLogger
from .loggers import Loggers
import os
from abc import ABC, abstractmethod
class BaseLogger(ABC):
def __init__(self, save_dir):
self.save_dir = save_dir
os.makedirs(self.save_dir, exist_ok=True)
@abstractmethod
def log_metrics(self, metrics, prefix=None):
pass
@abstractmethod
def close(self):
pass
\ No newline at end of file
from .wandb_logger import WandbLogger
class Loggers(object):
def __init__(self, loggers):
super().__init__()
self.loggers = loggers
def log_metrics(self, metrics, prefix=None, step=None):
for logger in self.loggers:
logger.log_metrics(metrics, prefix=prefix, step=step)
def log_model(self, is_best, prefix, metadata=None):
for logger in self.loggers:
logger.log_model(is_best=is_best, prefix=prefix, metadata=metadata)
def close(self):
for logger in self.loggers:
logger.close()
\ No newline at end of file
from .base_logger import BaseLogger
from visualdl import LogWriter
class VDLLogger(BaseLogger):
def __init__(self, save_dir):
super().__init__(save_dir)
self.vdl_writer = LogWriter(logdir=save_dir)
def log_metrics(self, metrics, prefix=None, step=None):
if not prefix:
prefix = ""
updated_metrics = {prefix + "/" + k: v for k, v in metrics.items()}
for k, v in updated_metrics.items():
self.vdl_writer.add_scalar(k, v, step)
def log_model(self, is_best, prefix, metadata=None):
pass
def close(self):
self.vdl_writer.close()
\ No newline at end of file
import os
from .base_logger import BaseLogger
class WandbLogger(BaseLogger):
def __init__(self,
project=None,
name=None,
id=None,
entity=None,
save_dir=None,
config=None,
**kwargs):
try:
import wandb
self.wandb = wandb
except ModuleNotFoundError:
raise ModuleNotFoundError(
"Please install wandb using `pip install wandb`"
)
self.project = project
self.name = name
self.id = id
self.save_dir = save_dir
self.config = config
self.kwargs = kwargs
self.entity = entity
self._run = None
self._wandb_init = dict(
project=self.project,
name=self.name,
id=self.id,
entity=self.entity,
dir=self.save_dir,
resume="allow"
)
self._wandb_init.update(**kwargs)
_ = self.run
if self.config:
self.run.config.update(self.config)
@property
def run(self):
if self._run is None:
if self.wandb.run is not None:
logger.info(
"There is a wandb run already in progress "
"and newly created instances of `WandbLogger` will reuse"
" this run. If this is not desired, call `wandb.finish()`"
"before instantiating `WandbLogger`."
)
self._run = self.wandb.run
else:
self._run = self.wandb.init(**self._wandb_init)
return self._run
def log_metrics(self, metrics, prefix=None, step=None):
if not prefix:
prefix = ""
updated_metrics = {prefix.lower() + "/" + k: v for k, v in metrics.items()}
self.run.log(updated_metrics, step=step)
def log_model(self, is_best, prefix, metadata=None):
model_path = os.path.join(self.save_dir, prefix + '.pdparams')
artifact = self.wandb.Artifact('model-{}'.format(self.run.id), type='model', metadata=metadata)
artifact.add_file(model_path, name="model_ckpt.pdparams")
aliases = [prefix]
if is_best:
aliases.append("best")
self.run.log_artifact(artifact, aliases=aliases)
def close(self):
self.run.finish()
\ No newline at end of file
......@@ -31,6 +31,7 @@ from ppocr.utils.stats import TrainingStats
from ppocr.utils.save_load import save_model
from ppocr.utils.utility import print_dict, AverageMeter
from ppocr.utils.logging import get_logger
from ppocr.utils.loggers import VDLLogger, WandbLogger, Loggers
from ppocr.utils import profiler
from ppocr.data import build_dataloader
......@@ -161,7 +162,7 @@ def train(config,
eval_class,
pre_best_model_dict,
logger,
vdl_writer=None,
log_writer=None,
scaler=None):
cal_metric_during_train = config['Global'].get('cal_metric_during_train',
False)
......@@ -300,10 +301,8 @@ def train(config,
stats['lr'] = lr
train_stats.update(stats)
if vdl_writer is not None and dist.get_rank() == 0:
for k, v in train_stats.get().items():
vdl_writer.add_scalar('TRAIN/{}'.format(k), v, global_step)
vdl_writer.add_scalar('TRAIN/lr', lr, global_step)
if log_writer is not None and dist.get_rank() == 0:
log_writer.log_metrics(metrics=train_stats.get(), prefix="TRAIN", step=global_step)
if dist.get_rank() == 0 and (
(global_step > 0 and global_step % print_batch_step == 0) or
......@@ -349,11 +348,9 @@ def train(config,
logger.info(cur_metric_str)
# logger metric
if vdl_writer is not None:
for k, v in cur_metric.items():
if isinstance(v, (float, int)):
vdl_writer.add_scalar('EVAL/{}'.format(k),
cur_metric[k], global_step)
if log_writer is not None:
log_writer.log_metrics(metrics=cur_metric, prefix="EVAL", step=global_step)
if cur_metric[main_indicator] >= best_model_dict[
main_indicator]:
best_model_dict.update(cur_metric)
......@@ -374,10 +371,12 @@ def train(config,
]))
logger.info(best_str)
# logger best metric
if vdl_writer is not None:
vdl_writer.add_scalar('EVAL/best_{}'.format(main_indicator),
best_model_dict[main_indicator],
global_step)
if log_writer is not None:
log_writer.log_metrics(metrics={
"best_{}".format(main_indicator): best_model_dict[main_indicator]
}, prefix="EVAL", step=global_step)
log_writer.log_model(is_best=True, prefix="best_accuracy", metadata=best_model_dict)
reader_start = time.time()
if dist.get_rank() == 0:
......@@ -392,6 +391,10 @@ def train(config,
best_model_dict=best_model_dict,
epoch=epoch,
global_step=global_step)
if log_writer is not None:
log_writer.log_model(is_best=False, prefix="latest")
if dist.get_rank() == 0 and epoch > 0 and epoch % save_epoch_step == 0:
save_model(
model,
......@@ -404,11 +407,14 @@ def train(config,
best_model_dict=best_model_dict,
epoch=epoch,
global_step=global_step)
if log_writer is not None:
log_writer.log_model(is_best=False, prefix='iter_epoch_{}'.format(epoch))
best_str = 'best metric, {}'.format(', '.join(
['{}: {}'.format(k, v) for k, v in best_model_dict.items()]))
logger.info(best_str)
if dist.get_rank() == 0 and vdl_writer is not None:
vdl_writer.close()
if dist.get_rank() == 0 and log_writer is not None:
log_writer.close()
return
......@@ -565,15 +571,32 @@ def preprocess(is_train=False):
config['Global']['distributed'] = dist.get_world_size() != 1
if config['Global']['use_visualdl'] and dist.get_rank() == 0:
from visualdl import LogWriter
loggers = []
if 'use_visualdl' in config['Global'] and config['Global']['use_visualdl']:
save_model_dir = config['Global']['save_model_dir']
vdl_writer_path = '{}/vdl/'.format(save_model_dir)
os.makedirs(vdl_writer_path, exist_ok=True)
vdl_writer = LogWriter(logdir=vdl_writer_path)
log_writer = VDLLogger(save_model_dir)
loggers.append(log_writer)
if ('use_wandb' in config['Global'] and config['Global']['use_wandb']) or 'wandb' in config:
save_dir = config['Global']['save_model_dir']
wandb_writer_path = "{}/wandb".format(save_dir)
if "wandb" in config:
wandb_params = config['wandb']
else:
wandb_params = dict()
wandb_params.update({'save_dir': save_model_dir})
log_writer = WandbLogger(**wandb_params, config=config)
loggers.append(log_writer)
else:
vdl_writer = None
log_writer = None
print_dict(config, logger)
if loggers:
log_writer = Loggers(loggers)
else:
log_writer = None
logger.info('train with paddle {} and device {}'.format(paddle.__version__,
device))
return config, device, logger, vdl_writer
return config, device, logger, log_writer
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册