diff --git a/PPOCRLabel/PPOCRLabel.py b/PPOCRLabel/PPOCRLabel.py index 7c7802a73ed32680142f8119b10a0393d1fab9cc..1b902484e2352fda27efa787be39ddb52150f1c9 100644 --- a/PPOCRLabel/PPOCRLabel.py +++ b/PPOCRLabel/PPOCRLabel.py @@ -21,12 +21,13 @@ import os.path import platform import subprocess import sys +import xlrd from functools import partial from PyQt5.QtCore import QSize, Qt, QPoint, QByteArray, QTimer, QFileInfo, QPointF, QProcess from PyQt5.QtGui import QImage, QCursor, QPixmap, QImageReader from PyQt5.QtWidgets import QMainWindow, QListWidget, QVBoxLayout, QToolButton, QHBoxLayout, QDockWidget, QWidget, \ - QSlider, QGraphicsOpacityEffect, QMessageBox, QListView, QScrollArea, QWidgetAction, QApplication, QLabel, \ + QSlider, QGraphicsOpacityEffect, QMessageBox, QListView, QScrollArea, QWidgetAction, QApplication, QLabel, QGridLayout, \ QFileDialog, QListWidgetItem, QComboBox, QDialog __dir__ = os.path.dirname(os.path.abspath(__file__)) @@ -36,7 +37,7 @@ sys.path.append(os.path.abspath(os.path.join(__dir__, '../..'))) sys.path.append(os.path.abspath(os.path.join(__dir__, '../PaddleOCR'))) sys.path.append("..") -from paddleocr import PaddleOCR +from paddleocr import PaddleOCR, PPStructure from libs.constants import * from libs.utils import * from libs.labelColor import label_colormap @@ -100,9 +101,15 @@ class MainWindow(QMainWindow): use_gpu=gpu, lang=lang, show_log=False) + self.table_ocr = PPStructure(use_pdserving=False, + use_gpu=gpu, + lang=lang, + layout=False, + show_log=False) if os.path.exists('./data/paddle.png'): result = self.ocr.ocr('./data/paddle.png', cls=True, det=True) + result = self.table_ocr('./data/paddle.png', return_ocr_result_in_table=True) # For loading all image under a directory self.mImgList = [] @@ -196,16 +203,25 @@ class MainWindow(QMainWindow): self.reRecogButton.setIcon(newIcon('reRec', 30)) self.reRecogButton.setToolButtonStyle(Qt.ToolButtonTextBesideIcon) + self.tableRecButton = QToolButton() + self.tableRecButton.setToolButtonStyle(Qt.ToolButtonTextBesideIcon) + self.newButton = QToolButton() self.newButton.setToolButtonStyle(Qt.ToolButtonTextBesideIcon) + self.createpolyButton = QToolButton() + self.createpolyButton.setToolButtonStyle(Qt.ToolButtonTextBesideIcon) + self.SaveButton = QToolButton() self.SaveButton.setToolButtonStyle(Qt.ToolButtonTextBesideIcon) self.DelButton = QToolButton() self.DelButton.setToolButtonStyle(Qt.ToolButtonTextBesideIcon) - leftTopToolBox = QHBoxLayout() - leftTopToolBox.addWidget(self.newButton) - leftTopToolBox.addWidget(self.reRecogButton) + leftTopToolBox = QGridLayout() + leftTopToolBox.addWidget(self.newButton, 0, 0, 1, 1) + leftTopToolBox.addWidget(self.createpolyButton, 0, 1, 1, 1) + leftTopToolBox.addWidget(self.reRecogButton, 1, 0, 1, 1) + leftTopToolBox.addWidget(self.tableRecButton, 1, 1, 1, 1) + leftTopToolBoxContainer = QWidget() leftTopToolBoxContainer.setLayout(leftTopToolBox) listLayout.addWidget(leftTopToolBoxContainer) @@ -446,13 +462,22 @@ class MainWindow(QMainWindow): 'Ctrl+R', 'reRec', getStr('singleRe'), enabled=False) createpoly = action(getStr('creatPolygon'), self.createPolygon, - 'q', 'new', getStr('creatPolygon'), enabled=True) + 'q', 'new', getStr('creatPolygon'), enabled=False) + + tableRec = action(getStr('TableRecognition'), self.TableRecognition, + '', 'Auto', getStr('TableRecognition'), enabled=False) + + cellreRec = action(getStr('cellreRecognition'), self.cellreRecognition, + '', 'reRec', getStr('cellreRecognition'), enabled=False) saveRec = action(getStr('saveRec'), 
self.saveRecResult, '', 'save', getStr('saveRec'), enabled=False) saveLabel = action(getStr('saveLabel'), self.saveLabelFile, # 'Ctrl+S', 'save', getStr('saveLabel'), enabled=False) + + exportJSON = action(getStr('exportJSON'), self.exportJSON, + '', 'save', getStr('exportJSON'), enabled=False) undoLastPoint = action(getStr("undoLastPoint"), self.canvas.undoLastPoint, 'Ctrl+Z', "undo", getStr("undoLastPoint"), enabled=False) @@ -474,10 +499,12 @@ class MainWindow(QMainWindow): self.editButton.setDefaultAction(edit) self.newButton.setDefaultAction(create) + self.createpolyButton.setDefaultAction(createpoly) self.DelButton.setDefaultAction(deleteImg) self.SaveButton.setDefaultAction(save) self.AutoRecognition.setDefaultAction(AutoRec) self.reRecogButton.setDefaultAction(reRec) + self.tableRecButton.setDefaultAction(tableRec) # self.preButton.setDefaultAction(openPrevImg) # self.nextButton.setDefaultAction(openNextImg) @@ -523,25 +550,25 @@ class MainWindow(QMainWindow): # Store actions for further handling. self.actions = struct(save=save, resetAll=resetAll, deleteImg=deleteImg, - lineColor=color1, create=create, delete=delete, edit=edit, copy=copy, - saveRec=saveRec, singleRere=singleRere, AutoRec=AutoRec, reRec=reRec, + lineColor=color1, create=create, createpoly=createpoly, tableRec=tableRec, delete=delete, edit=edit, copy=copy, + saveRec=saveRec, singleRere=singleRere, AutoRec=AutoRec, reRec=reRec, cellreRec=cellreRec, createMode=createMode, editMode=editMode, shapeLineColor=shapeLineColor, shapeFillColor=shapeFillColor, zoom=zoom, zoomIn=zoomIn, zoomOut=zoomOut, zoomOrg=zoomOrg, fitWindow=fitWindow, fitWidth=fitWidth, zoomActions=zoomActions, saveLabel=saveLabel, change_cls=change_cls, undo=undo, undoLastPoint=undoLastPoint, open_dataset_dir=open_dataset_dir, - rotateLeft=rotateLeft, rotateRight=rotateRight, lock=lock, - fileMenuActions=(opendir, open_dataset_dir, saveLabel, resetAll, quit), + rotateLeft=rotateLeft, rotateRight=rotateRight, lock=lock, exportJSON=exportJSON, + fileMenuActions=(opendir, open_dataset_dir, saveLabel, exportJSON, resetAll, quit), beginner=(), advanced=(), - editMenu=(createpoly, edit, copy, delete, singleRere, None, undo, undoLastPoint, + editMenu=(createpoly, edit, copy, delete, singleRere, cellreRec, None, undo, undoLastPoint, None, rotateLeft, rotateRight, None, color1, self.drawSquaresOption, lock, None, change_cls), beginnerContext=( - create, edit, copy, delete, singleRere, rotateLeft, rotateRight, lock, change_cls), + create, createpoly, edit, copy, delete, singleRere, cellreRec, rotateLeft, rotateRight, lock, change_cls), advancedContext=(createMode, editMode, edit, copy, delete, shapeLineColor, shapeFillColor), - onLoadActive=(create, createMode, editMode), + onLoadActive=(create, createpoly, createMode, editMode), onShapesPresent=(hideAll, showAll)) # menus @@ -574,7 +601,7 @@ class MainWindow(QMainWindow): self.autoSaveOption.triggered.connect(self.autoSaveFunc) addActions(self.menus.file, - (opendir, open_dataset_dir, None, saveLabel, saveRec, self.autoSaveOption, None, resetAll, deleteImg, + (opendir, open_dataset_dir, None, saveLabel, saveRec, exportJSON, self.autoSaveOption, None, resetAll, deleteImg, quit)) addActions(self.menus.help, (showKeys, showSteps, showInfo)) @@ -585,7 +612,7 @@ class MainWindow(QMainWindow): zoomIn, zoomOut, zoomOrg, None, fitWindow, fitWidth)) - addActions(self.menus.autolabel, (AutoRec, reRec, alcm, None, help)) + addActions(self.menus.autolabel, (AutoRec, reRec, cellreRec, alcm, None, help)) 
self.menus.file.aboutToShow.connect(self.updateFileMenu) @@ -695,6 +722,7 @@ class MainWindow(QMainWindow): self.dirty = False self.actions.save.setEnabled(False) self.actions.create.setEnabled(True) + self.actions.createpoly.setEnabled(True) def toggleActions(self, value=True): """Enable/Disable widgets which depend on an opened image.""" @@ -780,6 +808,7 @@ class MainWindow(QMainWindow): assert self.beginner() self.canvas.setEditing(False) self.actions.create.setEnabled(False) + self.actions.createpoly.setEnabled(False) self.canvas.fourpoint = False def createPolygon(self): @@ -787,10 +816,10 @@ class MainWindow(QMainWindow): self.canvas.setEditing(False) self.canvas.fourpoint = True self.actions.create.setEnabled(False) + self.actions.createpoly.setEnabled(False) self.actions.undoLastPoint.setEnabled(True) def rotateImg(self, filename, k, _value): - self.actions.rotateRight.setEnabled(_value) pix = cv2.imread(filename) pix = np.rot90(pix, k) @@ -831,6 +860,7 @@ class MainWindow(QMainWindow): self.canvas.setEditing(True) self.canvas.restoreCursor() self.actions.create.setEnabled(True) + self.actions.createpoly.setEnabled(True) def toggleDrawMode(self, edit=True): self.canvas.setEditing(edit) @@ -987,11 +1017,21 @@ class MainWindow(QMainWindow): if len(self.canvas.selectedShapes) == 1 and self.keyList.count() > 0: selected_key_item_row = self.keyList.findItemsByLabel(self.canvas.selectedShapes[0].key_cls, get_row=True) + if isinstance(selected_key_item_row, list) and len(selected_key_item_row) == 0: + key_text = self.canvas.selectedShapes[0].key_cls + item = self.keyList.createItemFromLabel(key_text) + self.keyList.addItem(item) + rgb = self._get_rgb_by_label(key_text, self.kie_mode) + self.keyList.setItemLabel(item, key_text, rgb) + selected_key_item_row = self.keyList.findItemsByLabel(self.canvas.selectedShapes[0].key_cls, + get_row=True) + self.keyList.setCurrentRow(selected_key_item_row) self._noSelectionSlot = False n_selected = len(selected_shapes) self.actions.singleRere.setEnabled(n_selected) + self.actions.cellreRec.setEnabled(n_selected) self.actions.delete.setEnabled(n_selected) self.actions.copy.setEnabled(n_selected) self.actions.edit.setEnabled(n_selected == 1) @@ -1216,6 +1256,7 @@ class MainWindow(QMainWindow): if self.beginner(): # Switch to edit mode. 
self.canvas.setEditing(True)
             self.actions.create.setEnabled(True)
+            self.actions.createpoly.setEnabled(True)
             self.actions.undoLastPoint.setEnabled(False)
             self.actions.undo.setEnabled(True)
         else:
@@ -1654,8 +1695,10 @@ class MainWindow(QMainWindow):
         self.haveAutoReced = False
         self.AutoRecognition.setEnabled(True)
         self.reRecogButton.setEnabled(True)
+        self.tableRecButton.setEnabled(True)
         self.actions.AutoRec.setEnabled(True)
         self.actions.reRec.setEnabled(True)
+        self.actions.tableRec.setEnabled(True)
         self.actions.open_dataset_dir.setEnabled(True)
         self.actions.rotateLeft.setEnabled(True)
         self.actions.rotateRight.setEnabled(True)
@@ -1755,6 +1798,7 @@ class MainWindow(QMainWindow):
                 self.openNextImg()
             self.actions.saveRec.setEnabled(True)
             self.actions.saveLabel.setEnabled(True)
+            self.actions.exportJSON.setEnabled(True)

         elif mode == 'Auto':
             if annotationFilePath and self.saveLabels(annotationFilePath, mode=mode):
@@ -2081,6 +2125,280 @@ class MainWindow(QMainWindow):
             self.singleLabel(shape)
             self.setDirty()

+    def TableRecognition(self):
+        '''
+            Table Recognition
+        '''
+        from paddleocr.ppstructure.table.predict_table import to_excel
+
+        import time
+
+        start = time.time()
+        img = cv2.imread(self.filePath)
+        res = self.table_ocr(img, return_ocr_result_in_table=True)
+
+        TableRec_excel_dir = self.lastOpenDir + '/tableRec_excel_output/'
+        os.makedirs(TableRec_excel_dir, exist_ok=True)
+        filename, _ = os.path.splitext(os.path.basename(self.filePath))
+
+        excel_path = TableRec_excel_dir + '{}.xlsx'.format(filename)
+
+        if res is None:
+            msg = 'Cannot recognise the table in ' + self.filePath + '. Please annotate it manually'
+            QMessageBox.information(self, "Information", msg)
+            to_excel('', excel_path)  # create an empty excel
+            return
+
+        # save res
+        # ONLY SUPPORT ONE TABLE in one image
+        hasTable = False
+        for region in res:
+            if region['type'] == 'Table':
+                if region['res']['boxes'] is None:
+                    msg = 'Cannot recognise the detection boxes in ' + self.filePath + '. Please annotate them manually'
+                    QMessageBox.information(self, "Information", msg)
+                    to_excel('', excel_path)  # create an empty excel
+                    return
+                hasTable = True
+                # save table ocr result on PPOCRLabel
+                # clear all old annotations before saving the result
+                self.itemsToShapes.clear()
+                self.shapesToItems.clear()
+                self.itemsToShapesbox.clear()  # ADD
+                self.shapesToItemsbox.clear()
+                self.labelList.clear()
+                self.BoxList.clear()
+                self.result_dic = []
+                self.result_dic_locked = []
+
+                shapes = []
+                result_len = len(region['res']['boxes'])
+                for i in range(result_len):
+                    bbox = np.array(region['res']['boxes'][i])
+                    rec_text = region['res']['rec_res'][i][0]
+
+                    # convert polygons to axis-aligned rectangles
+                    x1, y1 = np.min(bbox[:, 0]), np.min(bbox[:, 1])
+                    x2, y2 = np.max(bbox[:, 0]), np.max(bbox[:, 1])
+                    rect_bbox = [[x1, y1], [x2, y1], [x2, y2], [x1, y2]]
+
+                    # save bbox to shape
+                    shape = Shape(label=rec_text, line_color=DEFAULT_LINE_COLOR, key_cls=None)
+                    for point in rect_bbox:
+                        x, y = point
+                        # Ensure the labels are within the bounds of the image.
+                        # If not, fix them.
+                        x, y, snapped = self.canvas.snapPointToCanvas(x, y)
+                        shape.addPoint(QPointF(x, y))
+                    shape.difficult = False
+                    # shape.locked = False
+                    shape.close()
+                    self.addLabel(shape)
+                    shapes.append(shape)
+                self.setDirty()
+                self.canvas.loadShapes(shapes)
+
+                # save the HTML result to excel
+                try:
+                    to_excel(region['res']['html'], excel_path)
+                except:
+                    print('Cannot save the excel file, maybe permission denied (the .xlsx file is in use)')
+                break
+
+        if not hasTable:
+            msg = 'Cannot recognise the table in ' + self.filePath + '. Please annotate it manually'
+            QMessageBox.information(self, "Information", msg)
+            to_excel('', excel_path)  # create an empty excel
+            return
+
+        # automatically open the excel annotation file
+        if platform.system() == 'Windows':
+            try:
+                import win32com.client
+            except:
+                print("CANNOT OPEN .xlsx. Possible reason: "
+                      "the python win32com package is not installed (Windows only)")
+
+            try:
+                xl = win32com.client.Dispatch("Excel.Application")
+                xl.Visible = True
+                xl.Workbooks.Open(excel_path)
+                # excelEx = "You need to show the excel executable at this point"
+                # subprocess.Popen([excelEx, excel_path])
+
+                # os.startfile(excel_path)
+            except:
+                print("CANNOT OPEN .xlsx. Possible reason: "
+                      "the .xlsx file does not exist")
+        else:
+            # `open` works on macOS; Linux users may need xdg-open instead
+            os.system('open ' + os.path.normpath(excel_path))
+
+        print('time cost: ', time.time() - start)
+
+    def cellreRecognition(self):
+        '''
+            re-recognise the text in a cell
+        '''
+        img = cv2.imread(self.filePath)
+        for shape in self.canvas.selectedShapes:
+            box = [[int(p.x()), int(p.y())] for p in shape.points]
+
+            if len(box) > 4:
+                box = self.gen_quad_from_poly(np.array(box))
+            assert len(box) == 4
+
+            # pad around the bbox for better text recognition accuracy
+            _box = boxPad(box, img.shape, 6)
+            img_crop = get_rotate_crop_image(img, np.array(_box, np.float32))
+            if img_crop is None:
+                msg = 'Cannot recognise the detection box in ' + self.filePath + '. Please annotate it manually'
+                QMessageBox.information(self, "Information", msg)
+                return
+
+            # merge the text results in the cell
+            texts = ''
+            probs = 0.
+            # the probability of the cell is the average prob of every text box in the cell
+            bboxes = self.ocr.ocr(img_crop, det=True, rec=False, cls=False)
+            if len(bboxes) > 0:
+                bboxes.reverse()  # top row text first
+                for _bbox in bboxes:
+                    patch = get_rotate_crop_image(img_crop, np.array(_bbox, np.float32))
+                    rec_res = self.ocr.ocr(patch, det=False, rec=True, cls=False)
+                    text = rec_res[0][0]
+                    if text != '':
+                        texts += text + (' ' if text[0].isalpha() else '')  # add a space between English words
+                        probs += rec_res[0][1]
+                probs = probs / len(bboxes)
+                result = [(texts.strip(), probs)]
+
+                if result[0][0] != '':
+                    result.insert(0, box)
+                    print('result in reRec is ', result)
+                    if result[1][0] == shape.label:
+                        print('label no change')
+                    else:
+                        shape.label = result[1][0]
+                else:
+                    print('Cannot recognise the box')
+                    if self.noLabelText == shape.label:
+                        print('label no change')
+                    else:
+                        shape.label = self.noLabelText
+            self.singleLabel(shape)
+            self.setDirty()
+
+    def exportJSON(self):
+        '''
+            export PPOCRLabel annotations and excel results to JSON (PubTabNet)
+        '''
+        import pandas as pd
+        from libs.dataPartitionDialog import DataPartitionDialog
+
+        # ask the user for the data partition
+        partitionDialog = DataPartitionDialog(parent=self)
+        partitionDialog.exec()
+        if not partitionDialog.getStatus():
+            return
+
+        # automatically save annotations
+        self.saveFilestate()
+        self.savePPlabel(mode='auto')
+
+        # load box annotations
+        labeldict = {}
+        if not os.path.exists(self.PPlabelpath):
+            msg = 'ERROR, cannot find Label.txt'
+            QMessageBox.information(self, "Information", msg)
+            return
+        else:
+            with open(self.PPlabelpath, 'r', encoding='utf-8') as f:
+                data = f.readlines()
+                for each in data:
+                    file, label = each.split('\t')
+                    if label:
+                        label = label.replace('false', 'False')
+                        label = label.replace('true', 'True')
+                        labeldict[file] = eval(label)
+                    else:
+                        labeldict[file] = []
+
+        # if len(labeldict) != len(csv_paths):
+        #     msg = 'ERROR, box label and excel label are not in the same number\n' + \
+        #           'box label: ' + str(len(labeldict)) + '\n' + \
+        #           'excel label: ' + str(len(csv_paths)) + '\n' + \
+        #           'Please check the label.txt and tableRec_excel_output\n'
+        #     QMessageBox.information(self, "Information", msg)
+        #     return
+        train_split, val_split, test_split = partitionDialog.getDataPartition()
+        # validate the split
+        if train_split + val_split + test_split > 100:
+            msg = "The sum of the training, validation and testing splits must not exceed 100%"
+            QMessageBox.information(self, "Information", msg)
+            return
+        print(train_split, val_split, test_split)
+        train_split, val_split, test_split = float(train_split) / 100., float(val_split) / 100., float(test_split) / 100.
+        train_id = int(len(labeldict) * train_split)
+        val_id = int(len(labeldict) * (train_split + val_split))
+        print('Data partition: train:', train_id,
+              'validation:', val_id - train_id,
+              'test:', len(labeldict) - val_id)
+
+        TableRec_excel_dir = os.path.join(self.lastOpenDir, 'tableRec_excel_output')
+        json_results = []
+        imgid = 0
+        for image_path in labeldict.keys():
+            # load excel annotations
+            filename, _ = os.path.splitext(os.path.basename(image_path))
+            csv_path = os.path.join(TableRec_excel_dir, filename + '.xlsx')
+            if not os.path.exists(csv_path):
+                msg = 'ERROR, cannot find ' + csv_path
+                QMessageBox.information(self, "Information", msg)
+                return
+
+            # read the xlsx file and convert it to HTML
+            # xd = pd.ExcelFile(csv_path)
+            # df = xd.parse()
+            # structure = df.to_html(index = False)
+            excel = xlrd.open_workbook(csv_path)
+            sheet0 = excel.sheet_by_index(0)  # only sheet 0
+            merged_cells = sheet0.merged_cells  # (0,1,1,3): start row, end row, start col, end col
+
+            html_list = [['td'] * sheet0.ncols for i in range(sheet0.nrows)]
+
+            for merged in merged_cells:
+                html_list = expand_list(merged, html_list)
+
+            token_list = convert_token(html_list)
+
+            # load box annotations
+            cells = []
+            for anno in labeldict[image_path]:
+                tokens = list(anno['transcription'])
+                obb = anno['points']
+                hbb = OBB2HBB(np.array(obb)).tolist()
+                cells.append({'tokens': tokens, 'bbox': hbb})
+
+            # data split
+            if imgid < train_id:
+                split = 'train'
+            elif imgid < val_id:
+                split = 'val'
+            else:
+                split = 'test'
+
+            # save dict
+            html = {'structure': {'tokens': token_list}, 'cell': cells}
+            json_results.append({'filename': os.path.basename(image_path), 'split': split, 'imgid': imgid, 'html': html})
+            imgid += 1
+
+        # save json
+        with open("{}/annotation.json".format(self.lastOpenDir), "w", encoding='utf-8') as fid:
+            fid.write(json.dumps(json_results, ensure_ascii=False))
+
+        msg = 'JSON successfully saved in {}/annotation.json'.format(self.lastOpenDir)
+        QMessageBox.information(self, "Information", msg)
+
     def autolcm(self):
         vbox = QVBoxLayout()
         hbox = QHBoxLayout()
@@ -2120,6 +2438,12 @@ class MainWindow(QMainWindow):
         del self.ocr
         self.ocr = PaddleOCR(use_pdserving=False, use_angle_cls=True, det=True, cls=True, use_gpu=False,
                              lang=lg_idx[self.comboBox.currentText()])
+        del self.table_ocr
+        self.table_ocr = PPStructure(use_pdserving=False,
+                                     use_gpu=False,
+                                     lang=lg_idx[self.comboBox.currentText()],
+                                     layout=False,
+                                     show_log=False)
         self.dialog.close()

     def cancel(self):
@@ -2138,6 +2462,7 @@ class MainWindow(QMainWindow):
             self.fileStatedict[file] = 1
             self.actions.saveLabel.setEnabled(True)
             self.actions.saveRec.setEnabled(True)
+            self.actions.exportJSON.setEnabled(True)

     def saveFilestate(self):
         with open(self.fileStatepath, 'w', encoding='utf-8') as f:
diff --git a/PPOCRLabel/libs/dataPartitionDialog.py b/PPOCRLabel/libs/dataPartitionDialog.py
new file mode 100644
index 0000000000000000000000000000000000000000..33bd491552fe773bd07020d82f7ea9bab76e7557
--- /dev/null
+++ b/PPOCRLabel/libs/dataPartitionDialog.py
@@ -0,0 +1,113 @@
+try:
+    from PyQt5.QtGui import *
+    from PyQt5.QtCore import *
+    from PyQt5.QtWidgets import *
+except ImportError:
+    from PyQt4.QtGui import *
+    from PyQt4.QtCore import *
+
+from libs.utils import newIcon
+
+import time
+import datetime
+import json
+import cv2
+import numpy as np
+
+
+BB = QDialogButtonBox
+
+class DataPartitionDialog(QDialog):
+    def __init__(self, parent=None):
+        super().__init__()
+        self.parent = parent
+        self.title = 'DATA PARTITION'
+
+        self.train_ratio = 70
+        self.val_ratio = 15
+        self.test_ratio = 15
+
+        self.initUI()
+
+    def initUI(self):
+        self.setWindowTitle(self.title)
+        self.setWindowModality(Qt.ApplicationModal)
+
+        self.flag_accept = True
+
+        if self.parent.lang == 'ch':
+            msg = "导出JSON前请保存所有图像的标注且关闭EXCEL!"
+        else:
+            msg = "Please save all the annotations and close Excel before exporting the JSON!"
+
+        info_msg = QLabel(msg, self)
+        info_msg.setWordWrap(True)
+        info_msg.setStyleSheet("color: red")
+        info_msg.setFont(QFont('Arial', 12))
+
+        train_lbl = QLabel('Train split: ', self)
+        train_lbl.setFont(QFont('Arial', 15))
+        val_lbl = QLabel('Valid split: ', self)
+        val_lbl.setFont(QFont('Arial', 15))
+        test_lbl = QLabel('Test split: ', self)
+        test_lbl.setFont(QFont('Arial', 15))
+
+        self.train_input = QLineEdit(self)
+        self.train_input.setFont(QFont('Arial', 15))
+        self.val_input = QLineEdit(self)
+        self.val_input.setFont(QFont('Arial', 15))
+        self.test_input = QLineEdit(self)
+        self.test_input.setFont(QFont('Arial', 15))
+
+        self.train_input.setText(str(self.train_ratio))
+        self.val_input.setText(str(self.val_ratio))
+        self.test_input.setText(str(self.test_ratio))
+
+        validator = QIntValidator(0, 100)
+        self.train_input.setValidator(validator)
+        self.val_input.setValidator(validator)
+        self.test_input.setValidator(validator)
+
+        gridlayout = QGridLayout()
+        gridlayout.addWidget(info_msg, 0, 0, 1, 2)
+        gridlayout.addWidget(train_lbl, 1, 0)
+        gridlayout.addWidget(val_lbl, 2, 0)
+        gridlayout.addWidget(test_lbl, 3, 0)
+        gridlayout.addWidget(self.train_input, 1, 1)
+        gridlayout.addWidget(self.val_input, 2, 1)
+        gridlayout.addWidget(self.test_input, 3, 1)
+
+        bb = BB(BB.Ok | BB.Cancel, Qt.Horizontal, self)
+        bb.button(BB.Ok).setIcon(newIcon('done'))
+        bb.button(BB.Cancel).setIcon(newIcon('undo'))
+        bb.accepted.connect(self.validate)
+        bb.rejected.connect(self.cancel)
+        gridlayout.addWidget(bb, 4, 0, 1, 2)
+
+        self.setLayout(gridlayout)
+
+        self.show()
+
+    def validate(self):
+        self.flag_accept = True
+        self.accept()
+
+    def cancel(self):
+        self.flag_accept = False
+        self.reject()
+
+    def getStatus(self):
+        return self.flag_accept
+
+    def getDataPartition(self):
+        self.train_ratio = int(self.train_input.text())
+        self.val_ratio = int(self.val_input.text())
+        self.test_ratio = int(self.test_input.text())
+
+        return self.train_ratio, self.val_ratio, self.test_ratio
+
+    def closeEvent(self, event):
+        self.flag_accept = False
+        self.reject()
diff --git a/PPOCRLabel/libs/utils.py b/PPOCRLabel/libs/utils.py
index 2510520caa8048d7787d7c8f65df2885d76026f7..bf54700488e285da8a89b2648a17e0e1a7341b60 100644
--- a/PPOCRLabel/libs/utils.py
+++ b/PPOCRLabel/libs/utils.py
@@ -161,6 +161,77 @@ def get_rotate_crop_image(img, points):
         print(e)


+def boxPad(box, imgShape, pad: int) -> np.ndarray:
+    """
+    Pad a box with [pad] pixels on each side.
+    """
+    box = np.array(box, dtype=np.int32)
+    box[0][0], box[0][1] = box[0][0] - pad, box[0][1] - pad
+    box[1][0], box[1][1] = box[1][0] + pad, box[1][1] - pad
+    box[2][0], box[2][1] = box[2][0] + pad, box[2][1] + pad
+    box[3][0], box[3][1] = box[3][0] - pad, box[3][1] + pad
+    h, w, _ = imgShape
+    box[:, 0] = np.clip(box[:, 0], 0, w)
+    box[:, 1] = np.clip(box[:, 1], 0, h)
+    return box
+
+
+def OBB2HBB(obb) -> np.ndarray:
+    """
+    Convert an Oriented Bounding Box to a Horizontal Bounding Box.
+    """
+    hbb = np.zeros(4, dtype=np.int32)
+    hbb[0] = min(obb[:, 0])
+    hbb[1] = min(obb[:, 1])
+    hbb[2] = max(obb[:, 0])
+    hbb[3] = max(obb[:, 1])
+    return hbb
+
+
+def expand_list(merged, html_list):
+    '''
+    Fill the blanks according to the merged cells
+    '''
+    sr, er, sc, ec = merged
+    for i in range(sr, er):
+        for j in range(sc, ec):
+            html_list[i][j] = None
+    html_list[sr][sc] = ''
+    if ec - sc > 1:
+        html_list[sr][sc] += " colspan={}".format(ec - sc)
+    if er - sr > 1:
+        html_list[sr][sc] += " rowspan={}".format(er - sr)
+    return html_list
+
+
+def convert_token(html_list):
+    '''
+    Convert the raw html list to the PubTabNet token format
+    '''
+    token_list = ["<tbody>"]
+    # final html list:
+    for row in html_list:
+        token_list.append("<tr>")
+        for col in row:
+            if col is None:
+                continue
+            elif col == 'td':
+                token_list.extend(["<td>", "</td>"])
+            else:
+                # cell carrying span attributes, e.g. " colspan=2 rowspan=3"
+                token_list.append("<td")
+                for attr in col.strip().split():
+                    key, n = attr.split('=')
+                    token_list.append(" {}=\"{}\"".format(key, n))
+                token_list.extend([">", "</td>"])
+        token_list.append("</tr>")
+    token_list.append("</tbody>")
+
+    return token_list
+
+
 def stepsInfo(lang='en'):
     if lang == 'ch':
         msg = "1. 安装与运行:使用上述命令安装与运行程序。\n" \
diff --git a/PPOCRLabel/resources/strings/strings-en.properties b/PPOCRLabel/resources/strings/strings-en.properties
index 3c4eda65a32e1048405041667ba61bdb639bfd7b..7ba9af4c33ebe8130b5ce529e01270361c5f11d6 100644
--- a/PPOCRLabel/resources/strings/strings-en.properties
+++ b/PPOCRLabel/resources/strings/strings-en.properties
@@ -84,7 +84,7 @@ mhelp=Help
 iconList=Icon List
 detectionBoxposition=Detection box position
 recognitionResult=Recognition result
-creatPolygon=Create Quadrilateral
+creatPolygon=Create PolygonBox
 rotateLeft=Left turn 90 degrees
 rotateRight=Right turn 90 degrees
 drawSquares=Draw Squares
@@ -110,3 +110,6 @@ lockBoxDetail=Lock selected box/Unlock all box
 keyListTitle=Key List
 keyDialogTip=Enter object label
 keyChange=Change Box Key
+TableRecognition=Table Recognition
+cellreRecognition=Cell Re-Recognition
+exportJSON=Export JSON (PubTabNet)
diff --git a/PPOCRLabel/resources/strings/strings-zh-CN.properties b/PPOCRLabel/resources/strings/strings-zh-CN.properties
index a7c30368b87354cbae81b2cdead8ad31b2a8c1eb..308974ef29fac367bccd5f0e97926b2087a9f8df 100644
--- a/PPOCRLabel/resources/strings/strings-zh-CN.properties
+++ b/PPOCRLabel/resources/strings/strings-zh-CN.properties
@@ -84,7 +84,7 @@ mhelp=帮助
 iconList=缩略图
 detectionBoxposition=检测框位置
 recognitionResult=识别结果
-creatPolygon=四点标注
+creatPolygon=多边形标注
 drawSquares=正方形标注
 rotateLeft=图片左旋转90度
 rotateRight=图片右旋转90度
@@ -109,4 +109,7 @@ lockBox=锁定框/解除锁定框
 lockBoxDetail=若当前没有框处于锁定状态则锁定选中的框,若存在锁定框则解除所有锁定框的锁定状态
 keyListTitle=关键词列表
 keyDialogTip=请输入类型名称
-keyChange=更改Box关键字类别
\ No newline at end of file
+keyChange=更改Box关键字类别
+TableRecognition=表格识别
+cellreRecognition=单元格重识别
+exportJSON=导出表格JSON标注
\ No newline at end of file
diff --git a/README.md b/README.md
index 259ccb5aa02352ca2a2b81bf81d858cec2b47081..835b1e2509ebca6f6d0dd71a53a7ec02a147efcf 100644
--- a/README.md
+++ b/README.md
@@ -19,12 +19,9 @@ PaddleOCR aims to create multilingual, awesome, leading, and practical OCR tools

 **Recent updates**

-- 2021.12.21 OCR open source online course starts. The lesson starts at 8:30 every night and lasts for ten days.
Free registration: https://aistudio.baidu.com/aistudio/course/introduce/25207 -- 2021.12.21 release PaddleOCR v2.4, release 1 text detection algorithm (PSENet), 3 text recognition algorithms (NRTR、SEED、SAR), 1 key information extraction algorithm (SDMGR, [tutorial](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.4/ppstructure/docs/kie.md)) and 3 DocVQA algorithms (LayoutLM, LayoutLMv2, LayoutXLM, [tutorial](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.4/ppstructure/vqa)). -- PaddleOCR R&D team would like to share the key points of PP-OCRv2, at 20:15 pm on September 8th, [Course Address](https://aistudio.baidu.com/aistudio/education/group/info/6758). -- 2021.9.7 release PaddleOCR v2.3, [PP-OCRv2](#PP-OCRv2) is proposed. The inference speed of PP-OCRv2 is 220% higher than that of PP-OCR server in CPU device. The F-score of PP-OCRv2 is 7% higher than that of PP-OCR mobile. -- 2021.8.3 released PaddleOCR v2.2, add a new structured documents analysis toolkit, i.e., [PP-Structure](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.2/ppstructure/README.md), support layout analysis and table recognition (One-key to export chart images to Excel files). -- 2021.4.8 release end-to-end text recognition algorithm [PGNet](https://www.aaai.org/AAAI21Papers/AAAI-2885.WangP.pdf) which is published in AAAI 2021. Find tutorial [here](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/doc/doc_en/pgnet_en.md);release multi language recognition [models](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/doc/doc_en/multi_languages_en.md), support more than 80 languages recognition; especically, the performance of [English recognition model](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/doc/doc_en/models_list_en.md#English) is Optimized. +- 2021.12.21 release PaddleOCR v2.4, release 1 text detection algorithm (PSENet), 3 text recognition algorithms (NRTR、SEED、SAR), 1 key information extraction algorithm (SDMGR, [tutorial](./ppstructure/docs/kie_en.md)) and 3 DocVQA algorithms (LayoutLM, LayoutLMv2, LayoutXLM, [tutorial](./ppstructure/vqa)). +- 2021.9.7 release PaddleOCR v2.3, [PP-OCRv2](./doc/doc_en/ppocr_introduction_en.md#pp-ocrv2) is proposed. The inference speed of PP-OCRv2 is 220% higher than that of PP-OCR server in CPU device. The F-score of PP-OCRv2 is 7% higher than that of PP-OCR mobile. +- 2021.8.3 released PaddleOCR v2.2, add a new structured documents analysis toolkit, i.e., [PP-Structure](./ppstructure/README.md), support layout analysis and table recognition (One-key to export chart images to Excel files). 
- [more](./doc/doc_en/update_en.md) @@ -81,7 +78,6 @@ PaddleOCR support a variety of cutting-edge algorithms related to OCR, and devel ## Tutorials - [Environment Preparation](./doc/doc_en/environment_en.md) -- [Quick Start](./doc/doc_en/quickstart_en.md) - [PP-OCR 🔥](./doc/doc_en/ppocr_introduction_en.md) - [Quick Start](./doc/doc_en/quickstart_en.md) - [Model Zoo](./doc/doc_en/models_en.md) diff --git a/README_ch.md b/README_ch.md index c040853074b3b6f99895ee984ed9828140fa5713..1988bfb016d2a0d5bd343dd7e93d1e168773a25a 100755 --- a/README_ch.md +++ b/README_ch.md @@ -27,10 +27,9 @@ PaddleOCR旨在打造一套丰富、领先、且实用的OCR工具库,助力 ## 近期更新 -- 2021.12.21《动手学OCR · 十讲》课程开讲,12月21日起每晚八点半线上授课![免费报名地址](https://aistudio.baidu.com/aistudio/course/introduce/25207)。 -- 2021.12.21 发布PaddleOCR v2.4。OCR算法新增1种文本检测算法(PSENet),3种文本识别算法(NRTR、SEED、SAR);文档结构化算法新增1种关键信息提取算法(SDMGR,[文档](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.4/ppstructure/docs/kie.md)),3种DocVQA算法(LayoutLM、LayoutLMv2,LayoutXLM,[文档](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.4/ppstructure/vqa))。 -- 2021.9.7 发布PaddleOCR v2.3与[PP-OCRv2](#PP-OCRv2),CPU推理速度相比于PP-OCR server提升220%;效果相比于PP-OCR mobile 提升7%。 -- 2021.8.3 发布PaddleOCR v2.2,新增文档结构分析[PP-Structure](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.2/ppstructure/README_ch.md)工具包,支持版面分析与表格识别(含Excel导出)。 +- 2021.12.21 发布PaddleOCR v2.4。OCR算法新增1种文本检测算法(PSENet),3种文本识别算法(NRTR、SEED、SAR);文档结构化算法新增1种关键信息提取算法(SDMGR,[文档](./ppstructure/docs/kie.md)),3种DocVQA算法(LayoutLM、LayoutLMv2,LayoutXLM,[文档](./ppstructure/vqa))。 +- 2021.9.7 发布PaddleOCR v2.3与[PP-OCRv2](./doc/doc_ch/ppocr_introduction.md#pp-ocrv2),CPU推理速度相比于PP-OCR server提升220%;效果相比于PP-OCR mobile 提升7%。 +- 2021.8.3 发布PaddleOCR v2.2,新增文档结构分析[PP-Structure](./ppstructure/README_ch.md)工具包,支持版面分析与表格识别(含Excel导出)。 > [更多](./doc/doc_ch/update.md) @@ -83,7 +82,6 @@ PaddleOCR旨在打造一套丰富、领先、且实用的OCR工具库,助力 ## 文档教程 - [运行环境准备](./doc/doc_ch/environment.md) -- [快速开始(中英文/多语言/文档分析)](./doc/doc_ch/quickstart.md) - [PP-OCR文本检测识别🔥](./doc/doc_ch/ppocr_introduction.md) - [快速开始](./doc/doc_ch/quickstart.md) - [模型库](./doc/doc_ch/models_list.md) diff --git a/configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml b/configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml index 773a3649d8378cb39373b5b90837f17f9ecba335..e7cbae59a14af73639e1a74a14021b9b2ef60057 100644 --- a/configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml +++ b/configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml @@ -129,7 +129,7 @@ Loss: key: head_out multi_head: True - DistillationSARLoss: - weight: 0.5 + weight: 1.0 model_name_list: ["Student", "Teacher"] key: head_out multi_head: True diff --git a/deploy/cpp_infer/src/ocr_rec.cpp b/deploy/cpp_infer/src/ocr_rec.cpp index f0adfcf0270b4a1b8b50025dc094eb8f05319819..f69f37b8f51ecec5925d556f2b3e169bb0e80715 100644 --- a/deploy/cpp_infer/src/ocr_rec.cpp +++ b/deploy/cpp_infer/src/ocr_rec.cpp @@ -166,6 +166,10 @@ void CRNNRecognizer::LoadModel(const std::string &model_dir) { config.SetCpuMathLibraryNumThreads(this->cpu_math_library_num_threads_); } + // get pass_builder object + auto pass_builder = config.pass_builder(); + // delete "matmul_transpose_reshape_fuse_pass" + pass_builder->DeletePass("matmul_transpose_reshape_fuse_pass"); config.SwitchUseFeedFetchOps(false); // true for multiple input config.SwitchSpecifyInputNames(true); diff --git a/deploy/pdserving/config.yml b/deploy/pdserving/config.yml index 19cd9325ee8b241fd591678b9ba6452de9bec025..8014cbd362461ead5d065f96a50eb3031a60fa67 100644 --- a/deploy/pdserving/config.yml +++ 
b/deploy/pdserving/config.yml @@ -36,8 +36,8 @@ op: #det模型路径 model_config: ./ppocr_det_v3_serving - #Fetch结果列表,以client_config中fetch_var的alias_name为准 - fetch_list: ["sigmoid_0.tmp_0"] + #Fetch结果列表,以client_config中fetch_var的alias_name为准,不设置默认取全部输出变量 + #fetch_list: ["sigmoid_0.tmp_0"] #计算硬件ID,当devices为""或不写时为CPU预测;当devices为"0", "0,1,2"时为GPU预测,表示使用的GPU卡 devices: "0" @@ -62,8 +62,8 @@ op: #rec模型路径 model_config: ./ppocr_rec_v3_serving - #Fetch结果列表,以client_config中fetch_var的alias_name为准 - fetch_list: ["softmax_5.tmp_0"] + #Fetch结果列表,以client_config中fetch_var的alias_name为准, 不设置默认取全部输出变量 + #fetch_list: #计算硬件ID,当devices为""或不写时为CPU预测;当devices为"0", "0,1,2"时为GPU预测,表示使用的GPU卡 devices: "0" diff --git a/deploy/pdserving/ocr_reader.py b/deploy/pdserving/ocr_reader.py index 6a2d57b679d69ab11ac6f0fd74c47a342b391545..75f0f3d5c3aea488f82ec01a72e20310663d565b 100644 --- a/deploy/pdserving/ocr_reader.py +++ b/deploy/pdserving/ocr_reader.py @@ -393,7 +393,7 @@ class OCRReader(object): return norm_img_batch[0] def postprocess(self, outputs, with_score=False): - preds = outputs["softmax_5.tmp_0"] + preds = list(outputs.values())[0] try: preds = preds.numpy() except: @@ -404,8 +404,11 @@ class OCRReader(object): preds_idx, preds_prob, is_remove_duplicate=True) return text -from argparse import ArgumentParser,RawDescriptionHelpFormatter + +from argparse import ArgumentParser, RawDescriptionHelpFormatter import yaml + + class ArgsParser(ArgumentParser): def __init__(self): super(ArgsParser, self).__init__( @@ -441,16 +444,16 @@ class ArgsParser(ArgumentParser): s = s.strip() k, v = s.split('=') v = self._parse_helper(v) - print(k,v, type(v)) + print(k, v, type(v)) cur = config parent = cur for kk in k.split("."): if kk not in cur: - cur[kk] = {} - parent = cur - cur = cur[kk] + cur[kk] = {} + parent = cur + cur = cur[kk] else: - parent = cur - cur = cur[kk] + parent = cur + cur = cur[kk] parent[k.split(".")[-1]] = v - return config \ No newline at end of file + return config diff --git a/deploy/pdserving/web_service.py b/deploy/pdserving/web_service.py index 98e2dfba2f5abd3fc36bf3743b23f7eb7be3b9c4..f05806ce030238144568a3ca137798a9132027e4 100644 --- a/deploy/pdserving/web_service.py +++ b/deploy/pdserving/web_service.py @@ -56,7 +56,7 @@ class DetOp(Op): return {"x": det_img[np.newaxis, :].copy()}, False, None, "" def postprocess(self, input_dicts, fetch_dict, data_id, log_id): - det_out = fetch_dict["sigmoid_0.tmp_0"] + det_out = list(fetch_dict.values())[0] ratio_list = [ float(self.new_h) / self.ori_h, float(self.new_w) / self.ori_w ] diff --git a/deploy/pdserving/web_service_det.py b/deploy/pdserving/web_service_det.py index 7584608a9fed4bea93caa5c814c0450566696d56..4a62ab861d8338194da826cdcea2d42de189c994 100644 --- a/deploy/pdserving/web_service_det.py +++ b/deploy/pdserving/web_service_det.py @@ -55,7 +55,7 @@ class DetOp(Op): return {"x": det_img[np.newaxis, :].copy()}, False, None, "" def postprocess(self, input_dicts, fetch_dict, data_id, log_id): - det_out = fetch_dict["sigmoid_0.tmp_0"] + det_out = list(fetch_dict.values())[0] ratio_list = [ float(self.new_h) / self.ori_h, float(self.new_w) / self.ori_w ] diff --git a/doc/doc_ch/PP-OCRv3_introduction.md b/doc/doc_ch/PP-OCRv3_introduction.md index dc0271f294cf43a26477dbc974b77297e04122ac..8ca9344a4505f5369cbe8f550b66c76edeedc2ea 100644 --- a/doc/doc_ch/PP-OCRv3_introduction.md +++ b/doc/doc_ch/PP-OCRv3_introduction.md @@ -8,123 +8,215 @@ - [4. 端到端评估](#4) - ## 1. 
简介 -PP-OCRv3在PP-OCRv2的基础上进一步升级。检测模型仍然基于DB算法,优化策略采用了带残差注意力机制的FPN结构RSEFPN、增大感受野的PAN结构LKPAN、基于DML训练的更优的教师模型;识别模型将base模型从CRNN替换成了IJCAI 2022论文[SVTR](),并采用SVTR轻量化、带指导训练CTC、数据增广策略RecConAug、自监督训练的更好的预训练模型、无标签数据的使用进行模型加速和效果提升。更多细节请参考PP-OCRv3[技术报告](./PP-OCRv3_introduction.md)。 - -PP-OCRv3系统pipeline如下: +PP-OCRv3在PP-OCRv2的基础上进一步升级。整体的框架图保持了与PP-OCRv2相同的pipeline,针对检测模型和识别模型进行了优化。其中,检测模块仍基于DB算法优化,而识别模块不再采用CRNN,换成了IJCAI 2022最新收录的文本识别算法[SVTR](https://arxiv.org/abs/2205.00159),并对其进行产业适配。PP-OCRv3系统框图如下所示(粉色框中为PP-OCRv3新增策略):
- -## 2. 检测优化 -PP-OCRv3采用PP-OCRv2的[CML](https://arxiv.org/pdf/2109.03144.pdf)蒸馏策略,在蒸馏的student模型、teacher模型精度提升,CML蒸馏策略上分别做了优化。 +从算法改进思路上看,分别针对检测和识别模型,进行了共9个方面的改进: + +- 检测模块: + - LK-PAN:大感受野的PAN结构; + - DML:教师模型互学习策略; + - RSE-FPN:残差注意力机制的FPN结构; -- 在蒸馏student模型精度提升方面,提出了基于残差结构的通道注意力模块RSEFPN(Residual Squeeze-and-Excitation FPN),用于提升student模型精度和召回。 -RSEFPN的网络结构如下图所示,RSEFPN在PP-OCRv2的FPN基础上,将FPN中的卷积层更换为了通道注意力结构的RSEConv层。 +- 识别模块: + - SVTR_LCNet:轻量级文本识别网络; + - GTC:Attention指导CTC训练策略; + - TextConAug:挖掘文字上下文信息的数据增广策略; + - TextRotNet:自监督的预训练模型; + - UDML:联合互学习策略; + - UIM:无标注数据挖掘方案。 + +从效果上看,速度可比情况下,多种场景精度均有大幅提升: +- 中文场景,相对于PP-OCRv2中文模型提升超5%; +- 英文数字场景,相比于PP-OCRv2英文模型提升11%; +- 多语言场景,优化80+语种识别效果,平均准确率提升超5%。 + + + +## 2. 检测优化 + +PP-OCRv3检测模型是对PP-OCRv2中的[CML](https://arxiv.org/pdf/2109.03144.pdf)(Collaborative Mutual Learning) 协同互学习文本检测蒸馏策略进行了升级。如下图所示,CML的核心思想结合了①传统的Teacher指导Student的标准蒸馏与 ②Students网络之间的DML互学习,可以让Students网络互学习的同时,Teacher网络予以指导。PP-OCRv3分别针对教师模型和学生模型进行进一步效果优化。其中,在对教师模型优化时,提出了大感受野的PAN结构LK-PAN和引入了DML(Deep Mutual Learning)蒸馏策略;在对学生模型优化时,提出了残差注意力机制的FPN结构RSE-FPN。
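为便于理解,CML中单个学生模型的训练目标可以用一段与框架无关的Python伪代码概括(仅为示意:函数名与权重均为假设,并非PaddleOCR官方实现):

```python
# CML 思想示意:学生网络同时受三路监督(假设 gt_loss / dml_loss / distill_loss 已定义)
def cml_student_loss(pred_s, pred_peer, pred_teacher, gt,
                     gt_loss, dml_loss, distill_loss,
                     w_dml=1.0, w_distill=1.0):
    return (gt_loss(pred_s, gt)                                # 1. 真值标准监督
            + w_dml * dml_loss(pred_s, pred_peer)              # 2. 学生间 DML 互学习
            + w_distill * distill_loss(pred_s, pred_teacher))  # 3. 教师蒸馏指导
```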
-RSEFPN将PP-OCR检测模型的精度hmean从81.3%提升到84.5%。模型大小从3M变为3.6M。 +消融实验如下: -*注:PP-OCRv2的FPN通道数仅为96和24,如果直接用SE模块代替FPN的卷积会导致精度下降,RSEConv引入残差结构可以防止训练中包含重要特征的通道被抑制。* +|序号|策略|模型大小|hmean|速度(cpu + mkldnn)| +|-|-|-|-|-| +|baseline teacher|PP-OCR server|49M|83.2%|171ms| +|teacher1|DB-R50-LK-PAN|124M|85.0%|396ms| +|teacher2|DB-R50-LK-PAN-DML|124M|86.0%|396ms| +|baseline student|PP-OCRv2|3M|83.2%|117ms| +|student0|DB-MV3-RSE-FPN|3.6M|84.5%|124ms| +|student1|DB-MV3-CML(teacher2)|3M|84.3%|117ms| +|student2|DB-MV3-RSE-FPN-CML(teacher2)|3.6M|85.4%|124ms| -- 在蒸馏的teacher模型精度提升方面,提出了LKPAN结构替换PP-OCRv2的FPN结构,并且使用ResNet50作为Backbone,更大的模型带来更多的精度提升。另外,对teacher模型使用[DML](https://arxiv.org/abs/1706.00384)蒸馏策略进一步提升teacher模型的精度。最终teacher的模型指标相比ppocr_server_v2.0从83.2%提升到了86.0%。 +测试环境: Intel Gold 6148 CPU,预测时开启MKLDNN加速。 -*注:[PP-OCRv2的FPN结构](https://github.com/PaddlePaddle/PaddleOCR/blob/77acb3bfe51c8a46c684527f73cd218cefedb4a3/ppocr/modeling/necks/db_fpn.py#L107)对DB算法FPN结构做了轻量级设计* +**(1)LK-PAN:大感受野的PAN结构** -LKPAN的网络结构如下图所示: +LK-PAN (Large Kernel PAN) 是一个具有更大感受野的轻量级[PAN](https://arxiv.org/pdf/1803.01534.pdf)结构,核心是将PAN结构的path augmentation中卷积核从`3*3`改为`9*9`。通过增大卷积核,提升特征图每个位置覆盖的感受野,更容易检测大字体的文字以及极端长宽比的文字。使用LK-PAN结构,可以将教师模型的hmean从83.2%提升到85.0%。
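LK-PAN的核心改动可以用几行Paddle代码示意如下(简化写法,通道数沿用上文FPN的96,非官方实现):

```python
import paddle
import paddle.nn as nn

# path augmentation 中的卷积:3x3(PP-OCRv2 风格)改为 9x9(LK-PAN 风格)
conv_3x3 = nn.Conv2D(96, 96, kernel_size=3, padding=1)  # 感受野较小
conv_9x9 = nn.Conv2D(96, 96, kernel_size=9, padding=4)  # 大核,感受野更大

x = paddle.rand([1, 96, 160, 160])
assert conv_3x3(x).shape == conv_9x9(x).shape  # 输出尺寸一致,仅感受野不同
```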
-LKPAN(Large Kernel PAN)是一个具有更大感受野的轻量级[PAN](https://arxiv.org/pdf/1803.01534.pdf)结构。在LKPAN的path augmentation中,使用kernel size为`9*9`的卷积;更大的kernel size意味着更大的感受野,更容易检测大字体的文字以及极端长宽比的文字。LKPAN将PP-OCR检测模型的精度hmean从81.3%提升到84.9%。 +**(2)DML:教师模型互学习策略** -*注:LKPAN相比RSEFPN有更多的精度提升,但是考虑到模型大小和预测速度等因素,在student模型中使用RSEFPN。* +[DML](https://arxiv.org/abs/1706.00384) (Deep Mutual Learning)互学习蒸馏方法,如下图所示,通过两个结构相同的模型互相学习,可以有效提升文本检测模型的精度。教师模型采用DML策略,hmean从85%提升到86%。将PP-OCRv2中CML的教师模型更新为上述更高精度的教师模型,学生模型的hmean可以进一步从83.2%提升到84.3%。 -采用上述策略,PP-OCRv3相比PP-OCRv2,hmean指标从83.3%提升到85.4%;预测速度从平均117ms/image变为124ms/image。 +
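DML互学习损失可以用对称KL散度近似示意(简化版,省略温度系数等细节,非官方实现):

```python
import paddle.nn.functional as F

def dml_loss(logits_a, logits_b):
    # 两个同构模型互为师生:各自向对方的软预测分布看齐
    pa = F.softmax(logits_a, axis=-1)
    pb = F.softmax(logits_b, axis=-1)
    loss_ab = F.kl_div(F.log_softmax(logits_a, axis=-1), pb)
    loss_ba = F.kl_div(F.log_softmax(logits_b, axis=-1), pa)
    return 0.5 * (loss_ab + loss_ba)
```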
-3. PP-OCRv3检测模型消融实验 +**(3)RSE-FPN:残差注意力机制的FPN结构** -|序号|策略|模型大小|hmean|Intel Gold 6148CPU+mkldnn预测耗时| -|-|-|-|-|-| -|0|PP-OCR|3M|81.3%|117ms| -|1|PP-OCRV2|3M|83.3%|117ms| -|2|0 + RESFPN|3.6M|84.5%|124ms| -|3|0 + LKPAN|4.6M|84.9%|156ms| -|4|ppocr_server_v2.0 |124M|83.2%||171ms| -|5|teacher + DML + LKPAN|124M|86.0%|396ms| -|6|0 + 2 + 5 + CML|3.6M|85.4%|124ms| +RSE-FPN(Residual Squeeze-and-Excitation FPN)如下图所示,引入残差结构和通道注意力结构,将FPN中的卷积层更换为通道注意力结构的RSEConv层,进一步提升特征图的表征能力。考虑到PP-OCRv2的检测模型中FPN通道数非常小,仅为96,如果直接用SEblock代替FPN中卷积会导致某些通道的特征被抑制,精度会下降。RSEConv引入残差结构会缓解上述问题,提升文本检测效果。进一步将PP-OCRv2中CML的学生模型的FPN结构更新为RSE-FPN,学生模型的hmean可以进一步从84.3%提升到85.4%。 +
## 3. 识别优化 -[SVTR](https://arxiv.org/abs/2205.00159) 证明了强大的单视觉模型(无需序列模型)即可高效准确完成文本识别任务,在中英文数据上均有优秀的表现。经过实验验证,SVTR_Tiny在自建的 [中文数据集上](https://arxiv.org/abs/2109.03144) ,识别精度可以提升10.7%,网络结构如下所示: +PP-OCRv3的识别模块是基于文本识别算法[SVTR](https://arxiv.org/abs/2205.00159)优化。SVTR不再采用RNN结构,通过引入Transformers结构更加有效地挖掘文本行图像的上下文信息,从而提升文本识别能力。直接将PP-OCRv2的识别模型,替换成SVTR_Tiny,识别准确率从74.8%提升到80.1%(+5.3%),但是预测速度慢了将近11倍,CPU上预测一条文本行,将近100ms。因此,如下图所示,PP-OCRv3采用如下6个优化策略进行识别模型加速。 - - -由于 MKLDNN 加速库支持的模型结构有限,SVTR 在CPU+MKLDNN上相比PP-OCRv2慢了10倍。 - -PP-OCRv3 期望在提升模型精度的同时,不带来额外的推理耗时。通过分析发现,SVTR_Tiny结构的主要耗时模块为Mixing Block,因此我们对 SVTR_Tiny 的结构进行了一系列优化(详细速度数据请参考下方消融实验表格): +
-1. 将SVTR网络前半部分替换为PP-LCNet的前三个stage,保留4个 Global Mixing Block ,精度为76%,加速69%,网络结构如下所示: - -2. 将4个 Global Attenntion Block 减小到2个,精度为72.9%,加速69%,网络结构如下所示: - -3. 实验发现 Global Attention 的预测速度与输入其特征的shape有关,因此后移Global Mixing Block的位置到池化层之后,精度下降为71.9%,速度超越 CNN-base 的PP-OCRv2 22%,网络结构如下所示: - +基于上述策略,PP-OCRv3识别模型相比PP-OCRv2,在速度可比的情况下,精度进一步提升4.6%。 具体消融实验如下所示: -为了提升模型精度同时不引入额外推理成本,PP-OCRv3参考GTC策略,使用Attention监督CTC训练,预测时完全去除Attention模块,在推理阶段不增加任何耗时, 精度提升3.8%,训练流程如下所示: - +| ID | 策略 | 模型大小 | 精度 | 预测耗时(CPU + MKLDNN)| +|-----|-----|--------|----| --- | +| 01 | PP-OCRv2 | 8M | 74.8% | 8.54ms | +| 02 | SVTR_Tiny | 21M | 80.1% | 97ms | +| 03 | SVTR_LCNet(h32) | 12M | 71.9% | 6.6ms | +| 04 | SVTR_LCNet(h48) | 12M | 73.98% | 7.6ms | +| 05 | + GTC | 12M | 75.8% | 7.6ms | +| 06 | + TextConAug | 12M | 76.3% | 7.6ms | +| 07 | + TextRotNet | 12M | 76.9% | 7.6ms | +| 08 | + UDML | 12M | 78.4% | 7.6ms | +| 09 | + UIM | 12M | 79.4% | 7.6ms | -在训练策略方面,PP-OCRv3参考 [SSL](https://github.com/ku21fan/STR-Fewer-Labels) 设计了文本方向任务,训练了适用于文本识别的预训练模型,加速模型收敛过程,精度提升了0.6%; 使用UDML蒸馏策略,进一步提升精度1.5%,训练流程所示: +注: 测试速度时,实验01-03输入图片尺寸均为(3,32,320),04-08输入图片尺寸均为(3,48,320)。在实际预测时,图像为变长输入,速度会有所变化。测试环境: Intel Gold 6148 CPU,预测时开启MKLDNN加速。 - +**(1)SVTR_LCNet:轻量级文本识别网络** +SVTR_LCNet是针对文本识别任务,将基于Transformer的[SVTR](https://arxiv.org/abs/2205.00159)网络和轻量级CNN网络[PP-LCNet](https://arxiv.org/abs/2109.15099) 融合的一种轻量级文本识别网络。使用该网络,预测速度优于PP-OCRv2的识别模型20%,但是由于没有采用蒸馏策略,该识别模型效果略差。此外,进一步将输入图片规范化高度从32提升到48,预测速度稍微变慢,但是模型效果大幅提升,识别准确率达到73.98%(+2.08%),接近PP-OCRv2采用蒸馏策略的识别模型效果。 -数据增强方面: +SVTR_Tiny 网络结构如下所示: -1. 基于 [ConCLR](https://www.cse.cuhk.edu.hk/~byu/papers/C139-AAAI2022-ConCLR.pdf) 中的ConAug方法,设计了 RecConAug 数据增强方法,增强数据多样性,精度提升0.5%,增强可视化效果如下所示: - +
-2. 使用训练好的 SVTR_large 预测 120W 的 lsvt 无标注数据,取出其中得分大于0.95的数据,共得到81W识别数据加入到PP-OCRv3的训练数据中,精度提升1%。 -总体来讲PP-OCRv3识别从网络结构、训练策略、数据增强三个方向做了进一步优化: +由于 MKLDNN 加速库支持的模型结构有限,SVTR 在 CPU+MKLDNN 上相比 PP-OCRv2 慢了10倍。PP-OCRv3 期望在提升模型精度的同时,不带来额外的推理耗时。通过分析发现,SVTR_Tiny 结构的主要耗时模块为 Mixing Block,因此我们对 SVTR_Tiny 的结构进行了一系列优化(详细速度数据请参考下方消融实验表格): -- 网络结构上:考虑[SVTR](https://arxiv.org/abs/2205.00159) 在中英文效果上的优越性,采用SVTR_Tiny作为base,选取Global Mixing Block和卷积组合提取特征,并将Global Mixing Block位置后移进行加速; 参考 [GTC](https://arxiv.org/pdf/2002.01276.pdf) 策略,使用注意力机制模块指导CTC训练,定位和识别字符,提升不规则文本的识别精度。 -- 训练策略上:参考 [SSL](https://github.com/ku21fan/STR-Fewer-Labels) 设计了方向分类前序任务,获取更优预训练模型,加速模型收敛过程,提升精度; 使用UDML蒸馏策略、监督attention、ctc两个分支得到更优模型。 -- 数据增强上:基于 [ConCLR](https://www.cse.cuhk.edu.hk/~byu/papers/C139-AAAI2022-ConCLR.pdf) 中的ConAug方法,改进得到 RecConAug 数据增广方法,支持随机结合任意多张图片,提升训练数据的上下文信息丰富度,增强模型鲁棒性;使用 SVTR_large 预测无标签数据,向训练集中补充81w高质量真实数据。 -基于上述策略,PP-OCRv3识别模型相比PP-OCRv2,在速度可比的情况下,精度进一步提升4.5%。 具体消融实验如下所示: +1. 将 SVTR 网络前半部分替换为 PP-LCNet 的前三个stage,保留4个 Global Mixing Block ,精度为76%,加速69%,网络结构如下所示: +
+2. 将4个 Global Mixing Block 减小到2个,精度为72.9%,加速69%,网络结构如下所示: +
+3. 实验发现 Global Mixing Block 的预测速度与输入其特征的shape有关,因此后移 Global Mixing Block 的位置到池化层之后,精度下降为71.9%,速度超越基于CNN结构的PP-OCRv2-baseline 22%,网络结构如下所示: +
-实验细节: +具体消融实验如下所示: -| id | 策略 | 模型大小 | 精度 | 速度(cpu + mkldnn)| +| ID | 策略 | 模型大小 | 精度 | 速度(CPU + MKLDNN)| |-----|-----|--------|----| --- | -| 01 | PP-OCRv2 | 8M | 69.3% | 8.54ms | +| 01 | PP-OCRv2-baseline | 8M | 69.3% | 8.54ms | | 02 | SVTR_Tiny | 21M | 80.1% | 97ms | -| 03 | LCNet_SVTR_G4 | 9.2M | 76% | 30ms | -| 04 | LCNet_SVTR_G2 | 13M | 72.98% | 9.37ms | -| 05 | PP-OCRv3 | 12M | 71.9% | 6.6ms | -| 06 | + large input_shape | 12M | 73.98% | 7.6ms | -| 06 | + GTC | 12M | 75.8% | 7.6ms | -| 07 | + RecConAug | 12M | 76.3% | 7.6ms | -| 08 | + SSL pretrain | 12M | 76.9% | 7.6ms | -| 09 | + UDML | 12M | 78.4% | 7.6ms | -| 10 | + unlabeled data | 12M | 79.4% | 7.6ms | - -注: 测试速度时,实验01-05输入图片尺寸均为(3,32,320),06-10输入图片尺寸均为(3,48,320) +| 03 | SVTR_LCNet(G4) | 9.2M | 76% | 30ms | +| 04 | SVTR_LCNet(G2) | 13M | 72.98% | 9.37ms | +| 05 | SVTR_LCNet(h32) | 12M | 71.9% | 6.6ms | +| 06 | SVTR_LCNet(h48) | 12M | 73.98% | 7.6ms | + +注: 测试速度时,01-05输入图片尺寸均为(3,32,320); PP-OCRv2-baseline 代表没有借助蒸馏方法训练得到的模型 + +**(2)GTC:Attention指导CTC训练策略** + +[GTC](https://arxiv.org/pdf/2002.01276.pdf)(Guided Training of CTC),利用Attention模块以及损失,指导CTC损失训练,融合多种文本特征的表达,是一种有效的提升文本识别的策略。使用该策略,预测时完全去除 Attention 模块,在推理阶段不增加任何耗时,识别模型的准确率进一步提升到75.8%(+1.82%)。训练流程如下所示: +
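GTC的训练目标可以概括为“CTC损失 + Attention分支损失”的加权和,推理时只保留CTC分支,下面用与具体网络无关的伪代码示意(lam为假设超参,非官方实现):

```python
def gtc_loss(ctc_logits, attn_logits, labels, ctc_loss_fn, attn_loss_fn, lam=1.0):
    # 训练:Attention 分支仅作“指导”,与 CTC 分支一同反传
    return ctc_loss_fn(ctc_logits, labels) + lam * attn_loss_fn(attn_logits, labels)

def gtc_infer(backbone, ctc_head, img):
    # 推理:完全去除 Attention 分支,不增加任何耗时
    return ctc_head(backbone(img))
```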
+ +**(3)TextConAug:挖掘文字上下文信息的数据增广策略** + +TextConAug是一种挖掘文字上下文信息的数据增广策略,主要思想来源于论文[ConCLR](https://www.cse.cuhk.edu.hk/~byu/papers/C139-AAAI2022-ConCLR.pdf),作者提出ConAug数据增广,在一个batch内对2张不同的图像进行联结,组成新的图像并进行自监督对比学习。PP-OCRv3将此方法应用到有监督的学习任务中,设计了TextConAug数据增强方法,可以丰富训练数据上下文信息,提升训练数据多样性。使用该策略,识别模型的准确率进一步提升到76.3%(+0.5%)。TextConAug示意图如下所示: + +
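TextConAug的拼接操作可以用几行numpy代码示意(简化版:只演示两张等高文本行图像的水平拼接与标签连接,未包含缩放、采样概率等细节):

```python
import numpy as np

def text_con_aug(img_a, label_a, img_b, label_b):
    assert img_a.shape[0] == img_b.shape[0]           # 两张文本行图像需等高
    new_img = np.concatenate([img_a, img_b], axis=1)  # 沿宽度方向拼接
    return new_img, label_a + label_b                 # 标签同步连接
```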
+ + +**(4)TextRotNet:自监督的预训练模型** + +TextRotNet是使用大量无标注的文本行数据,通过自监督方式训练的预训练模型,参考于论文[STR-Fewer-Labels](https://github.com/ku21fan/STR-Fewer-Labels)。该模型可以初始化SVTR_LCNet的初始权重,从而帮助文本识别模型收敛到更佳位置。使用该策略,识别模型的准确率进一步提升到76.9%(+0.6%)。TextRotNet训练流程如下图所示: + +
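其自监督任务可以用如下伪代码示意(简化描述:对无标注文本行随机旋转并预测旋转类别,训练好的骨干权重用于初始化SVTR_LCNet,非官方实现):

```python
import random
import numpy as np

def make_rotation_sample(img):
    k = random.randint(0, 3)           # 0/90/180/270 度四分类
    return np.rot90(img, k).copy(), k  # (旋转后的图像, 自监督伪标签)
```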
+ + +**(5)UDML:联合互学习策略** + +UDML(Unified-Deep Mutual Learning)联合互学习是PP-OCRv2中就采用的对于文本识别非常有效的提升模型效果的策略。在PP-OCRv3中,针对两个不同的SVTR_LCNet和Attention结构,对他们之间的PP-LCNet的特征图、SVTR模块的输出和Attention模块的输出同时进行监督训练。使用该策略,识别模型的准确率进一步提升到78.4%(+1.5%)。 + + +**(6)UIM:无标注数据挖掘方案** + +UIM(Unlabeled Images Mining)是一种非常简单的无标注数据挖掘方案。核心思想是利用高精度的文本识别大模型对无标注数据进行预测,获取伪标签,并且选择预测置信度高的样本作为训练数据,用于训练小模型。使用该策略,识别模型的准确率进一步提升到79.4%(+1%)。 + +
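UIM的挖掘流程非常简单,可以用几行伪代码示意(big_model_ocr为假设的大模型识别接口;阈值0.95沿用旧版文档中“得分大于0.95”的取值,仅为示例):

```python
def mine_unlabeled(images, big_model_ocr, conf_thresh=0.95):
    mined = []
    for img in images:
        text, score = big_model_ocr(img)  # 大模型预测 (文本, 置信度)
        if score >= conf_thresh:          # 只保留高置信度伪标签
            mined.append((img, text))
    return mined                          # 作为小模型的附加训练数据
```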
+ ## 4. 端到端评估 + +经过以上优化,最终PP-OCRv3在速度可比情况下,中文场景端到端Hmean指标相比于PP-OCRv2提升5%,效果大幅提升。具体指标如下表所示: + +| Model | Hmean | Model Size (M) | Time Cost (CPU, ms) | Time Cost (T4 GPU, ms) | +|-----|-----|--------|----| --- | +| PP-OCR mobile | 50.3% | 8.1 | 356 | 116 | +| PP-OCR server | 57.0% | 155.1 | 1056 | 200 | +| PP-OCRv2 | 57.6% | 11.6 | 330 | 111 | +| PP-OCRv3 | 62.9% | 15.6 | 331 | 86.64 | + +测试环境:CPU型号为Intel Gold 6148,CPU预测时开启MKLDNN加速。 + + +除了更新中文模型,本次升级也同步优化了英文数字模型,端到端效果提升11%,如下表所示: + +| Model | Recall | Precision | Hmean | +|-----|-----|--------|----| +| PP-OCR_en | 38.99% | 45.91% | 42.17% | +| PP-OCRv3_en | 50.95% | 55.53% | 53.14% | + +同时,也对已支持的80余种语言识别模型进行了升级更新,在有评估集的四种语系识别准确率平均提升5%以上,如下表所示: + +| Model | 拉丁语系 | 阿拉伯语系 | 日语 | 韩语 | +|-----|-----|--------|----| --- | +| PP-OCR_mul | 69.6% | 40.5% | 38.5% | 55.4% | +| PP-OCRv3_mul | 75.2%| 45.37% | 45.8% | 60.1% | diff --git a/doc/doc_ch/models_list.md b/doc/doc_ch/models_list.md index a8d99b51f17fa0912de77418e24d08ab10774c2c..2012381af5a1cfe53771903e0ab99bab0b7cbc08 100644 --- a/doc/doc_ch/models_list.md +++ b/doc/doc_ch/models_list.md @@ -18,13 +18,13 @@ - [3. 文本方向分类模型](#3-文本方向分类模型) - [4. Paddle-Lite 模型](#4-paddle-lite-模型) -PaddleOCR提供的可下载模型包括`推理模型`、`训练模型`、`预训练模型`、`slim模型`,模型区别说明如下: +PaddleOCR提供的可下载模型包括`推理模型`、`训练模型`、`预训练模型`、`nb模型`,模型区别说明如下: |模型类型|模型格式|简介| |--- | --- | --- | |推理模型|inference.pdmodel、inference.pdiparams|用于预测引擎推理,[详情](./inference.md)| |训练模型、预训练模型|\*.pdparams、\*.pdopt、\*.states |训练过程中保存的模型的参数、优化器状态和训练中间信息,多用于模型指标评估和恢复训练| -|slim模型|\*.nb|经过飞桨模型压缩工具PaddleSlim压缩后的模型,适用于移动端/IoT端等端侧部署场景(需使用飞桨Paddle Lite部署)。| +|nb模型|\*.nb|经过飞桨Paddle-Lite工具优化后的模型,适用于移动端/IoT端等端侧部署场景(需使用飞桨Paddle Lite部署)。| 各个模型的关系如下面的示意图所示。 @@ -41,7 +41,7 @@ PaddleOCR提供的可下载模型包括`推理模型`、`训练模型`、`预训 |模型名称|模型简介|配置文件|推理模型大小|下载地址| | --- | --- | --- | --- | --- | -|ch_PP-OCRv3_det_slim|【最新】slim量化+蒸馏版超轻量模型,支持中英文、多语种文本检测|[ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml)| 1.1M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_infer.tar) / [训练模型(coming soon)](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_distill_train.tar) / [slim模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_infer.nb)| +|ch_PP-OCRv3_det_slim|【最新】slim量化+蒸馏版超轻量模型,支持中英文、多语种文本检测|[ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml)| 1.1M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_distill_train.tar) / [nb模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_infer.nb)| |ch_PP-OCRv3_det| 【最新】原始超轻量模型,支持中英文、多语种文本检测 |[ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml)| 3.8M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar)| |ch_PP-OCRv2_det_slim| slim量化+蒸馏版超轻量模型,支持中英文、多语种文本检测|[ch_PP-OCRv2_det_cml.yml](../../configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml)| 3M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_slim_quant_infer.tar)| |ch_PP-OCRv2_det| 原始超轻量模型,支持中英文、多语种文本检测|[ch_PP-OCRv2_det_cml.yml](../../configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml)|3M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_distill_train.tar)| @@ -55,8 +55,8 @@ 
PaddleOCR提供的可下载模型包括`推理模型`、`训练模型`、`预训 |模型名称|模型简介|配置文件|推理模型大小|下载地址| | --- | --- | --- | --- | --- | -|en_PP-OCRv3_det_slim |【最新】slim量化版超轻量模型,支持英文、数字检测 | [ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml) | 1.1M |[推理模型(coming soon)](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_slim_infer.tar) / [训练模型(coming soon)](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_slim_distill_train.tar) / [slim模型(coming soon)](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_slim_infer.nb) | -|ch_PP-OCRv3_det |【最新】原始超轻量模型,支持英文、数字检测|[ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml)| 3.8M | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_distill_train.tar) | +|en_PP-OCRv3_det_slim |【最新】slim量化版超轻量模型,支持英文、数字检测 | [ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml) | 1.1M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_slim_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_slim_distill_train.tar) / [nb模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_slim_infer.nb) | +|en_PP-OCRv3_det |【最新】原始超轻量模型,支持英文、数字检测|[ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml)| 3.8M | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_distill_train.tar) | * 注:英文检测模型与中文检测模型结构完全相同,只有训练数据不同,在此仅提供相同的配置文件。 @@ -66,7 +66,7 @@ PaddleOCR提供的可下载模型包括`推理模型`、`训练模型`、`预训 |模型名称|模型简介|配置文件|推理模型大小|下载地址| | --- | --- | --- | --- | --- | -| ml_PP-OCRv3_det_slim |【最新】slim量化版超轻量模型,支持多语言检测 | [ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml) | 1.1M |[推理模型(coming soon)](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/Multilingual_PP-OCRv3_det_slim_infer.tar) / [训练模型(coming soon)](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/Multilingual_PP-OCRv3_det_slim_distill_train.tar) / [slim模型(coming soon)](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/Multilingual_PP-OCRv3_det_slim_infer.nb) | +| ml_PP-OCRv3_det_slim |【最新】slim量化版超轻量模型,支持多语言检测 | [ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml) | 1.1M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/Multilingual_PP-OCRv3_det_slim_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/Multilingual_PP-OCRv3_det_slim_distill_train.tar) / [nb模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/Multilingual_PP-OCRv3_det_slim_infer.nb) | | ml_PP-OCRv3_det |【最新】原始超轻量模型,支持多语言检测 | [ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml)| 3.8M | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/Multilingual_PP-OCRv3_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/Multilingual_PP-OCRv3_det_distill_train.tar) | * 注:多语言检测模型与中文检测模型结构完全相同,只有训练数据不同,在此仅提供相同的配置文件。 @@ -81,7 +81,7 @@ PaddleOCR提供的可下载模型包括`推理模型`、`训练模型`、`预训 |模型名称|模型简介|配置文件|推理模型大小|下载地址| | --- | --- | --- | --- | --- | -|ch_PP-OCRv3_rec_slim |【最新】slim量化版超轻量模型,支持中英文、数字识别|[ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml)| 4.9M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.tar) / 
[训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_train.tar) / [slim模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.nb) | +|ch_PP-OCRv3_rec_slim |【最新】slim量化版超轻量模型,支持中英文、数字识别|[ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml)| 4.9M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_train.tar) / [nb模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.nb) | |ch_PP-OCRv3_rec|【最新】原始超轻量模型,支持中英文、数字识别|[ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml)| 12.4M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_train.tar) | |ch_PP-OCRv2_rec_slim| slim量化版超轻量模型,支持中英文、数字识别|[ch_PP-OCRv2_rec.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec.yml)| 9M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_train.tar) | |ch_PP-OCRv2_rec| 原始超轻量模型,支持中英文、数字识别|[ch_PP-OCRv2_rec_distillation.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec_distillation.yml)|8.5M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_train.tar) | @@ -96,8 +96,8 @@ PaddleOCR提供的可下载模型包括`推理模型`、`训练模型`、`预训 |模型名称|模型简介|配置文件|推理模型大小|下载地址| | --- | --- | --- | --- | --- | -|en_PP-OCRv3_rec_slim |【最新】slim量化版超轻量模型,支持英文、数字识别 | [en_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml)| - |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_slim_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_slim_train.tar) / [slim模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_slim_infer.nb) | -|ch_PP-OCRv3_rec |【最新】原始超轻量模型,支持英文、数字识别|[en_PP-OCRv3_rec.yml](../../configs/rec/en_PP-OCRv3/en_PP-OCRv3_rec.yml)| 9.6M | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_train.tar) | +|en_PP-OCRv3_rec_slim |【最新】slim量化版超轻量模型,支持英文、数字识别 | [en_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml)| 3.2M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_slim_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_slim_train.tar) / [nb模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_slim_infer.nb) | +|en_PP-OCRv3_rec |【最新】原始超轻量模型,支持英文、数字识别|[en_PP-OCRv3_rec.yml](../../configs/rec/en_PP-OCRv3/en_PP-OCRv3_rec.yml)| 9.6M | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_train.tar) | |en_number_mobile_slim_v2.0_rec|slim裁剪量化版超轻量模型,支持英文、数字识别|[rec_en_number_lite_train.yml](../../configs/rec/multi_language/rec_en_number_lite_train.yml)| 2.7M | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/en_number_mobile_v2.0_rec_slim_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/en_number_mobile_v2.0_rec_slim_train.tar) | 
|en_number_mobile_v2.0_rec|原始超轻量模型,支持英文、数字识别|[rec_en_number_lite_train.yml](../../configs/rec/multi_language/rec_en_number_lite_train.yml)|2.6M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/en_number_mobile_v2.0_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/en_number_mobile_v2.0_rec_train.tar) | @@ -107,17 +107,16 @@ PaddleOCR提供的可下载模型包括`推理模型`、`训练模型`、`预训 |模型名称|字典文件|模型简介|配置文件|推理模型大小|下载地址| | --- | --- | --- | --- |--- | --- | -| korean_PP-OCRv3_rec | ppocr/utils/dict/korean_dict.txt |韩文识别|[korean_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/korean_PP-OCRv3_rec.yml)|11M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/korean_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/korean_PP-OCRv3_rec_train.tar) | -| japan_PP-OCRv3_rec | ppocr/utils/dict/japan_dict.txt |日文识别|[japan_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/japan_PP-OCRv3_rec.yml)|11M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/japan_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/japan_PP-OCRv3_rec_train.tar) | -| chinese_cht_PP-OCRv3_rec | ppocr/utils/dict/chinese_cht_dict.txt | 中文繁体识别|[chinese_cht_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/chinese_cht_PP-OCRv3_rec.yml)|12M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/chinese_cht_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/chinese_cht_PP-OCRv3_rec_train.tar) | -| te_PP-OCRv3_rec | ppocr/utils/dict/te_dict.txt | 泰卢固文识别|[te_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/te_PP-OCRv3_rec.yml)|9.6M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/te_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/te_PP-OCRv3_rec_train.tar) | -| ka_PP-OCRv3_rec | ppocr/utils/dict/ka_dict.txt |卡纳达文识别|[ka_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/ka_PP-OCRv3_rec.yml)|9.9M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/ka_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/ka_PP-OCRv3_rec_train.tar) | -| ta_PP-OCRv3_rec | ppocr/utils/dict/ta_dict.txt |泰米尔文识别|[ta_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/ta_PP-OCRv3_rec.yml)|9.6M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/ta_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/ta_PP-OCRv3_rec_train.tar) | -| latin_PP-OCRv3_rec | ppocr/utils/dict/latin_dict.txt | 拉丁文识别 | [latin_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/latin_PP-OCRv3_rec.yml) |9.6M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/latin_ppocr_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/latin_ppocr_PP-OCRv3_rec_train.tar) | -| arabic_PP-OCRv3_rec | ppocr/utils/dict/arabic_dict.txt | 阿拉伯字母 | [arabic_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/rec_arabic_lite_train.yml) |9.6M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/arabic_ppocr_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/arabic_ppocr_PP-OCRv3_rec_train.tar) | -| cyrillic_PP-OCRv3_rec | ppocr/utils/dict/cyrillic_dict.txt | 斯拉夫字母 | [cyrillic_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/cyrillic_PP-OCRv3_rec.yml) 
|9.6M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/cyrillic_ppocr_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/cyrillic_ppocr_PP-OCRv3_rec_train.tar) | -| devanagari_PP-OCRv3_rec | ppocr/utils/dict/devanagari_dict.txt |梵文字母 | [devanagari_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/devanagari_PP-OCRv3_rec.yml) |9.6M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/devanagari_ppocr_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/devanagari_ppocr_PP-OCRv3_rec_train.tar) | - +| korean_PP-OCRv3_rec | ppocr/utils/dict/korean_dict.txt |韩文识别|[korean_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/korean_PP-OCRv3_rec.yml)|11M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/korean_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/korean_PP-OCRv3_rec_train.tar) | +| japan_PP-OCRv3_rec | ppocr/utils/dict/japan_dict.txt |日文识别|[japan_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/japan_PP-OCRv3_rec.yml)|11M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/japan_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/japan_PP-OCRv3_rec_train.tar) | +| chinese_cht_PP-OCRv3_rec | ppocr/utils/dict/chinese_cht_dict.txt | 中文繁体识别|[chinese_cht_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/chinese_cht_PP-OCRv3_rec.yml)|12M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/chinese_cht_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/chinese_cht_PP-OCRv3_rec_train.tar) | +| te_PP-OCRv3_rec | ppocr/utils/dict/te_dict.txt | 泰卢固文识别|[te_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/te_PP-OCRv3_rec.yml)|9.6M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/te_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/te_PP-OCRv3_rec_train.tar) | +| ka_PP-OCRv3_rec | ppocr/utils/dict/ka_dict.txt |卡纳达文识别|[ka_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/ka_PP-OCRv3_rec.yml)|9.9M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/ka_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/ka_PP-OCRv3_rec_train.tar) | +| ta_PP-OCRv3_rec | ppocr/utils/dict/ta_dict.txt |泰米尔文识别|[ta_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/ta_PP-OCRv3_rec.yml)|9.6M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/ta_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/ta_PP-OCRv3_rec_train.tar) | +| latin_PP-OCRv3_rec | ppocr/utils/dict/latin_dict.txt | 拉丁文识别 | [latin_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/latin_PP-OCRv3_rec.yml) |9.7M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/latin_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/latin_PP-OCRv3_rec_train.tar) | +| arabic_PP-OCRv3_rec | ppocr/utils/dict/arabic_dict.txt | 阿拉伯字母 | [arabic_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/rec_arabic_lite_train.yml) |9.6M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/arabic_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/arabic_PP-OCRv3_rec_train.tar) | +| cyrillic_PP-OCRv3_rec | ppocr/utils/dict/cyrillic_dict.txt | 斯拉夫字母 | [cyrillic_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/cyrillic_PP-OCRv3_rec.yml) 
|9.6M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/cyrillic_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/cyrillic_PP-OCRv3_rec_train.tar) | +| devanagari_PP-OCRv3_rec | ppocr/utils/dict/devanagari_dict.txt |梵文字母 | [devanagari_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/devanagari_PP-OCRv3_rec.yml) |9.9M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/devanagari_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/devanagari_PP-OCRv3_rec_train.tar) | 更多支持语种请参考: [多语言模型](./multi_languages.md) @@ -127,13 +126,18 @@ PaddleOCR提供的可下载模型包括`推理模型`、`训练模型`、`预训 |模型名称|模型简介|配置文件|推理模型大小|下载地址| | --- | --- | --- | --- | --- | -|ch_ppocr_mobile_slim_v2.0_cls|slim量化版模型,对检测到的文本行文字角度分类|[cls_mv3.yml](../../configs/cls/cls_mv3.yml)| 2.1M |[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_slim_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_slim_infer.tar) | +|ch_ppocr_mobile_slim_v2.0_cls|slim量化版模型,对检测到的文本行文字角度分类|[cls_mv3.yml](../../configs/cls/cls_mv3.yml)| 2.1M |[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_slim_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_slim_infer.tar) / [nb模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_ppocr_mobile_v2.0_cls_infer_opt.nb) | |ch_ppocr_mobile_v2.0_cls|原始分类器模型,对检测到的文本行文字角度分类|[cls_mv3.yml](../../configs/cls/cls_mv3.yml)|1.38M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | ## 4. Paddle-Lite 模型 +Paddle-Lite 是一个高性能、轻量级、灵活性强且易于扩展的深度学习推理框架,它可以对inference模型进一步优化,得到适用于移动端/IoT端等端侧部署场景的`nb模型`。一般建议基于量化模型进行转换,因为可以将模型以INT8形式进行存储与推理,从而进一步减小模型大小,提升模型速度。 + +本节主要列出PP-OCRv2以及更早版本的检测与识别nb模型,最新版本的nb模型可以直接从上面的模型列表中获得。 + + |模型版本|模型简介|模型大小|检测模型|文本方向分类模型|识别模型|Paddle-Lite版本| |---|---|---|---|---|---|---| |PP-OCRv2|蒸馏版超轻量中文OCR移动端模型|11M|[下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_PP-OCRv2_det_infer_opt.nb)|[下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_ppocr_mobile_v2.0_cls_infer_opt.nb)|[下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_PP-OCRv2_rec_infer_opt.nb)|v2.10| diff --git a/doc/doc_ch/ppocr_introduction.md b/doc/doc_ch/ppocr_introduction.md index 6527c5803b3135bda922b5478ebe9ddbbb9ae0d9..14f95f1cd65da249d58da39c5228cb6d4bcb045e 100644 --- a/doc/doc_ch/ppocr_introduction.md +++ b/doc/doc_ch/ppocr_introduction.md @@ -38,8 +38,9 @@ PP-OCRv2在PP-OCR的基础上,进一步在5个方面重点优化,检测模 #### PP-OCRv3 -PP-OCRv3在PP-OCRv2的基础上进一步升级。检测模型仍然基于DB算法,优化策略采用了带残差注意力机制的FPN结构RSEFPN、增大感受野的PAN结构LKPAN、基于DML训练的更优的教师模型;识别模型将base模型从CRNN替换成了IJCAI 2022论文[SVTR](https://arxiv.org/abs/2205.00159),并采用SVTR轻量化、带指导训练CTC、数据增广策略RecConAug、自监督训练的更好的预训练模型、无标签数据的使用进行模型加速和效果提升。更多细节请参考PP-OCRv3[技术报告](./PP-OCRv3_introduction.md)。 - +PP-OCRv3在PP-OCRv2的基础上,针对检测模型和识别模型,进行了共计9个方面的升级: +- PP-OCRv3检测模型对PP-OCRv2中的CML协同互学习文本检测蒸馏策略进行了升级,分别针对教师模型和学生模型进行进一步效果优化。其中,在对教师模型优化时,提出了大感受野的PAN结构LK-PAN和引入了DML蒸馏策略;在对学生模型优化时,提出了残差注意力机制的FPN结构RSE-FPN。 +- PP-OCRv3的识别模块是基于文本识别算法[SVTR](https://arxiv.org/abs/2205.00159)优化。SVTR不再采用RNN结构,通过引入Transformers结构更加有效地挖掘文本行图像的上下文信息,从而提升文本识别能力。PP-OCRv3通过轻量级文本识别网络SVTR_LCNet、Attention损失指导CTC损失训练策略、挖掘文字上下文信息的数据增广策略TextConAug、TextRotNet自监督预训练模型、UDML联合互学习策略、UIM无标注数据挖掘方案,6个方面进行模型加速和效果提升。 PP-OCRv3系统pipeline如下: @@ -47,6 +48,9 @@ PP-OCRv3系统pipeline如下: +更多细节请参考PP-OCRv3[技术报告](./PP-OCRv3_introduction.md)。 + + ## 2. 
特性 diff --git a/doc/doc_ch/quickstart.md b/doc/doc_ch/quickstart.md index 6301755de8e41e497b83d54c897b2b939d758cdc..29ca48fa838be4a60f08d31d5031180b951e33bc 100644 --- a/doc/doc_ch/quickstart.md +++ b/doc/doc_ch/quickstart.md @@ -59,15 +59,13 @@ cd /path/to/ppocr_img 如果不使用提供的测试图片,可以将下方`--image_dir`参数替换为相应的测试图片路径。 -**注意** whl包默认使用`PP-OCRv3`模型,识别模型使用的输入shape为`3,48,320`, 因此如果使用识别功能,需要添加参数`--rec_image_shape 3,48,320`,如果不使用默认的`PP-OCRv3`模型,则无需设置该参数。 - #### 2.1.1 中英文模型 * 检测+方向分类器+识别全流程:`--use_angle_cls true`设置使用方向分类器识别180度旋转文字,`--use_gpu false`设置不使用GPU ```bash - paddleocr --image_dir ./imgs/11.jpg --use_angle_cls true --use_gpu false --rec_image_shape 3,48,320 + paddleocr --image_dir ./imgs/11.jpg --use_angle_cls true --use_gpu false ``` 结果是一个list,每个item包含了文本框,文字和识别置信度 @@ -94,7 +92,7 @@ cd /path/to/ppocr_img - 单独使用识别:设置`--det`为`false` ```bash - paddleocr --image_dir ./imgs_words/ch/word_1.jpg --det false --rec_image_shape 3,48,320 + paddleocr --image_dir ./imgs_words/ch/word_1.jpg --det false ``` 结果是一个list,每个item只包含识别结果和识别置信度 @@ -104,16 +102,16 @@ cd /path/to/ppocr_img ``` -如需使用2.0模型,请指定参数`--version PP-OCR`,paddleocr默认使用PP-OCRv3模型(`--versioin PP-OCRv3`)。更多whl包使用可参考[whl包文档](./whl.md) +如需使用2.0模型,请指定参数`--ocr_version PP-OCR`,paddleocr默认使用PP-OCRv3模型(`--ocr_version PP-OCRv3`)。更多whl包使用可参考[whl包文档](./whl.md) #### 2.1.2 多语言模型 -Paddleocr目前支持80个语种,可以通过修改`--lang`参数进行切换,对于英文模型,指定`--lang=en`, PP-OCRv3目前只支持中文和英文模型,其他多语言模型会陆续更新。 +PaddleOCR目前支持80个语种,可以通过修改`--lang`参数进行切换,对于英文模型,指定`--lang=en`。 ``` bash -paddleocr --image_dir ./imgs_en/254.jpg --lang=en --rec_image_shape 3,48,320 +paddleocr --image_dir ./imgs_en/254.jpg --lang=en ```
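For readers who prefer the Python API over the CLI, the flags in the quickstart examples above map directly onto the whl package's constructor. A minimal sketch under that assumption — the image path is just the test image already used in the CLI examples, and any local image works:

```python
from paddleocr import PaddleOCR

# use_angle_cls enables the 180-degree text direction classifier,
# and use_gpu=False mirrors the CLI flag of the same name.
ocr = PaddleOCR(use_angle_cls=True, lang='ch', use_gpu=False)

# Run detection + classification + recognition on one image.
result = ocr.ocr('./imgs/11.jpg', cls=True)
for box, (text, score) in result:
    print(box, text, score)
```

Each item of `result` pairs a text-box polygon with the recognized string and its confidence, matching the list format printed by the CLI.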
diff --git a/doc/doc_ch/update.md b/doc/doc_ch/update.md index 9071e673910f8d87762dc8f9dd097d444f36e624..69dde041d3b4d6af3b248b9392a7946dca61858e 100644 --- a/doc/doc_ch/update.md +++ b/doc/doc_ch/update.md @@ -1,4 +1,5 @@ # 更新 +- 2022.5.7 添加对[Weights & Biases](https://docs.wandb.ai/)训练日志记录工具的支持。 - 2021.12.21 《OCR十讲》课程开讲,12月21日起每晚八点半线上授课! 【免费】报名地址:https://aistudio.baidu.com/aistudio/course/introduce/25207 - 2021.12.21 发布PaddleOCR v2.4。OCR算法新增1种文本检测算法(PSENet),3种文本识别算法(NRTR、SEED、SAR);文档结构化算法新增1种关键信息提取算法(SDMGR),3种DocVQA算法(LayoutLM、LayoutLMv2,LayoutXLM)。 - 2021.9.7 发布PaddleOCR v2.3,发布[PP-OCRv2](#PP-OCRv2),CPU推理速度相比于PP-OCR server提升220%;效果相比于PP-OCR mobile 提升7%。 diff --git a/doc/doc_ch/whl.md b/doc/doc_ch/whl.md index d57f2ac3255a78b630e2ea4189ab182d5c7f71ba..511e0421f1e249e340f2002a900b59633e31880e 100644 --- a/doc/doc_ch/whl.md +++ b/doc/doc_ch/whl.md @@ -199,12 +199,10 @@ for line in result: paddleocr -h ``` -**注意** whl包默认使用`PP-OCRv3`模型,识别模型使用的输入shape为`3,48,320`, 因此如果使用识别功能,需要添加参数`--rec_image_shape 3,48,320`,如果不使用默认的`PP-OCRv3`模型,则无需设置该参数。 - * 检测+方向分类器+识别全流程 ```bash -paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --use_angle_cls true --rec_image_shape 3,48,320 +paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --use_angle_cls true ``` 结果是一个list,每个item包含了文本框,文字和识别置信度 @@ -217,7 +215,7 @@ paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --use_angle_cls true --rec_image * 检测+识别 ```bash -paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --rec_image_shape 3,48,320 +paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg ``` 结果是一个list,每个item包含了文本框,文字和识别置信度 @@ -230,7 +228,7 @@ paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --rec_image_shape 3,48,320 * 方向分类器+识别 ```bash -paddleocr --image_dir PaddleOCR/doc/imgs_words/ch/word_1.jpg --use_angle_cls true --det false --rec_image_shape 3,48,320 +paddleocr --image_dir PaddleOCR/doc/imgs_words/ch/word_1.jpg --use_angle_cls true --det false ``` 结果是一个list,每个item只包含识别结果和识别置信度 @@ -256,7 +254,7 @@ paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --rec false * 单独执行识别 ```bash -paddleocr --image_dir PaddleOCR/doc/imgs_words/ch/word_1.jpg --det false --rec_image_shape 3,48,320 +paddleocr --image_dir PaddleOCR/doc/imgs_words/ch/word_1.jpg --det false ``` 结果是一个list,每个item只包含识别结果和识别置信度 @@ -416,4 +414,4 @@ im_show.save('result.jpg') | cls | 前向时是否启动分类 (命令行模式下使用use_angle_cls控制前向是否启动分类) | FALSE | | show_log | 是否打印logger信息 | FALSE | | type | 执行ocr或者表格结构化, 值可选['ocr','structure'] | ocr | -| ocr_version | OCR模型版本,可选PP-OCRv3, PP-OCRv2, PP-OCR。PP-OCRv3 目前仅支持中、英文的检测和识别模型,方向分类器模型;PP-OCRv2 目前仅支持中文的检测和识别模型;PP-OCR支持中文的检测,识别,多语种识别,方向分类器等模型 | PP-OCRv3 | +| ocr_version | OCR模型版本,可选PP-OCRv3, PP-OCRv2, PP-OCR。PP-OCRv3 支持中、英文的检测、识别、多语种识别,方向分类器等模型;PP-OCRv2 目前仅支持中文的检测和识别模型;PP-OCR支持中文的检测,识别,多语种识别,方向分类器等模型 | PP-OCRv3 | diff --git a/doc/doc_en/config_en.md b/doc/doc_en/config_en.md index 68c2b5f0c14f0c9b09d854f5a8b33ca86cc4bdf7..d467a7f918ed57eb80754483715f3671fd2552c7 100644 --- a/doc/doc_en/config_en.md +++ b/doc/doc_en/config_en.md @@ -36,6 +36,7 @@ Take rec_chinese_lite_train_v2.0.yml as an example | pretrained_model | Set the path of the pre-trained model | ./pretrain_models/CRNN/best_accuracy | \ | | checkpoints | set model parameter path | None | Used to load parameters after interruption to continue training| | use_visualdl | Set whether to enable visualdl for visual log display | False | [Tutorial](https://www.paddlepaddle.org.cn/paddle/visualdl) | +| use_wandb | Set whether to enable W&B for visual log display | False | [Documentation](https://docs.wandb.ai/) | infer_img | Set inference image path or 
folder path | ./infer_img | \|| | character_dict_path | Set dictionary path | ./ppocr/utils/ppocr_keys_v1.txt | If the character_dict_path is None, model can only recognize number and lower letters | | max_text_length | Set the maximum length of text | 25 | \ | @@ -66,7 +67,7 @@ In PaddleOCR, the network is divided into four stages: Transform, Backbone, Neck | :---------------------: | :---------------------: | :--------------: | :--------------------: | | model_type | Network Type | rec | Currently support`rec`,`det`,`cls` | | algorithm | Model name | CRNN | See [algorithm_overview](./algorithm_overview_en.md) for the support list | -| **Transform** | Set the transformation method | - | Currently only recognition algorithms are supported, see [ppocr/modeling/transforms](../../ppocr/modeling/transforms) for details | +| **Transform** | Set the transformation method | - | Currently only recognition algorithms are supported, see [ppocr/modeling/transform](../../ppocr/modeling/transforms) for details | | name | Transformation class name | TPS | Currently supports `TPS` | | num_fiducial | Number of TPS control points | 20 | Ten on the top and bottom | | loc_lr | Localization network learning rate | 0.1 | \ | @@ -130,6 +131,17 @@ In PaddleOCR, the network is divided into four stages: Transform, Backbone, Neck | drop_last | Whether to discard the last incomplete mini-batch because the number of samples in the data set cannot be divisible by batch_size | True | \ | | num_workers | The number of sub-processes used to load data, if it is 0, the sub-process is not started, and the data is loaded in the main process | 8 | \ | +### Weights & Biases ([W&B](../../ppocr/utils/loggers/wandb_logger.py)) +| Parameter | Use | Defaults | Note | +| :---------------------: | :---------------------: | :--------------: | :--------------------: | +| project | Project to which the run is to be logged | uncategorized | \ | +| name | Alias/Name of the run | Randomly generated by wandb | \ | +| id | ID of the run | Randomly generated by wandb | \ | +| entity | User or team to which the run is being logged | The logged-in user | \ | +| save_dir | Local directory in which all the models and other data are saved | wandb | \ | +| config | Model configuration | None | \ | + + ## 3. Multilingual Config File Generation @@ -233,4 +245,4 @@ For more supported languages, please refer to : [Multi-language model](https://g The multi-language model training method is the same as the Chinese model. The training data set is 100w synthetic data. A small amount of fonts and test data can be downloaded using the following two methods. * [Baidu Netdisk](https://pan.baidu.com/s/1bS_u207Rm7YbY33wOECKDA),Extraction code:frgi. -* [Google drive](https://drive.google.com/file/d/18cSWX7wXSy4G0tbKJ0d9PuIaiwRLHpjA/view) +* [Google drive](https://drive.google.com/file/d/18cSWX7wXSy4G0tbKJ0d9PuIaiwRLHpjA/view) \ No newline at end of file diff --git a/doc/doc_en/logging_en.md b/doc/doc_en/logging_en.md new file mode 100644 index 0000000000000000000000000000000000000000..d00ab8bd561c1bb7e489642298e74180e0c66886 --- /dev/null +++ b/doc/doc_en/logging_en.md @@ -0,0 +1,61 @@ +## Logging metrics and models + +PaddleOCR comes with two metric logging tools integrated directly into the training API: [VisualDL](https://readthedocs.org/projects/visualdl/) and [Weights & Biases](https://docs.wandb.ai/). + +### VisualDL +VisualDL is a visualization analysis tool of PaddlePaddle. The integration allows all training metrics to be logged to a VisualDL dashboard.
To use it, add the following line to the `Global` section of the config yaml file - + +``` +Global: + use_visualdl: True +``` + +To see the visualizations, run the following command in your terminal + +```shell +visualdl --logdir <save_dir> +``` + +Now open `localhost:8040` in your browser of choice! + +### Weights & Biases +W&B is an MLOps tool that can be used for experiment tracking, dataset/model versioning, visualizing results and collaborating with colleagues. A W&B logger is integrated directly into PaddleOCR; to use it, first install the `wandb` sdk and log in to your wandb account. + +```shell +pip install wandb +wandb login +``` + +If you do not have a wandb account, you can make one [here](https://wandb.ai/site). + +To visualize and track your model training, add the following flag to your config yaml file under the `Global` section - + +``` +Global: + use_wandb: True +``` + +To add more arguments to the `WandbLogger` (listed [here](./config_en.md)), add the header `wandb` to the yaml file and add the arguments under it - + +``` +wandb: + project: my_project + entity: my_team +``` + +These config variables from the yaml file are used to instantiate the `WandbLogger` object with the project name, entity name (the logged-in user by default), directory to store metadata (`./wandb` by default) and more. During the training process, the `log_metrics` function is called to log training and evaluation metrics at the training and evaluation steps respectively, from the rank 0 process only. + +At every model saving step, the WandbLogger logs the model using the `log_model` function, along with relevant metadata and tags showing the epoch in which the model was saved, whether the model is the best so far, and so on. + +All the logging mentioned above is integrated into the `program.train` function and will generate dashboards like this - + +![W&B Dashboard](../imgs_en/wandb_metrics.png) + +![W&B Models](../imgs_en/wandb_models.png) + +For more advanced usage to log images, audio, video or any other form of data, you can use `WandbLogger().run.log`. More examples on how to log different kinds of data are available [here](https://docs.wandb.ai/examples). + +A link to the dashboard is printed to the console at the beginning and end of every training job, and you can also access it by logging into your W&B account in your browser. + +### Using Multiple Loggers +Both VisualDL and W&B can also be used simultaneously by just setting both the aforementioned flags to True. \ No newline at end of file diff --git a/doc/doc_en/models_list_en.md b/doc/doc_en/models_list_en.md index a61667b8d66a72d265c5ea9d3dbb9a2bff51de61..15a7fdb94e303297f7be681f297a5e52613a268a 100644 --- a/doc/doc_en/models_list_en.md +++ b/doc/doc_en/models_list_en.md @@ -16,13 +16,13 @@ - [3. Text Angle Classification Model](#3-text-angle-classification-model) - [4. Paddle-Lite Model](#4-paddle-lite-model) -The downloadable models provided by PaddleOCR include `inference model`, `trained model`, `pre-trained model` and `slim model`. The differences between the models are as follows: +The downloadable models provided by PaddleOCR include `inference model`, `trained model`, `pre-trained model` and `nb model`.
The differences between the models are as follows: |model type|model format|description| |--- | --- | --- | |inference model|inference.pdmodel、inference.pdiparams|Used for inference based on Paddle inference engine,[detail](./inference_en.md)| |trained model, pre-trained model|\*.pdparams、\*.pdopt、\*.states |The checkpoints model saved in the training process, which stores the parameters of the model, mostly used for model evaluation and continuous training.| -|slim model|\*.nb| Model compressed by PaddleSlim (a model compression tool using PaddlePaddle), which is suitable for mobile-side deployment scenarios (Paddle-Lite is needed for slim model deployment). | +|nb model|\*.nb| Model optimized by Paddle-Lite, which is suitable for mobile-side deployment scenarios (Paddle-Lite is needed for nb model deployment). | Relationship of the above models is as follows. @@ -37,7 +37,7 @@ Relationship of the above models is as follows. |model name|description|config|model size|download| | --- | --- | --- | --- | --- | -|ch_PP-OCRv3_det_slim| [New] slim quantization with distillation lightweight model, supporting Chinese, English, multilingual text detection |[ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml)| 1.1M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_infer.tar) / [trained model (coming soon)](https://paddleocr.bj.bcebos.com/PP-OCRv3/ch/ch_PP-OCRv3_det_slim_distill_train.tar) / [slim model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_infer.nb)| +|ch_PP-OCRv3_det_slim| [New] slim quantization with distillation lightweight model, supporting Chinese, English, multilingual text detection |[ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml)| 1.1M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/ch/ch_PP-OCRv3_det_slim_distill_train.tar) / [nb model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_infer.nb)| |ch_PP-OCRv3_det| [New] Original lightweight model, supporting Chinese, English, multilingual text detection |[ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml)| 3.8M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar)| |ch_PP-OCRv2_det_slim| [New] slim quantization with distillation lightweight model, supporting Chinese, English, multilingual text detection|[ch_PP-OCRv2_det_cml.yml](../../configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml)| 3M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_slim_quant_infer.tar)| |ch_PP-OCRv2_det| [New] Original lightweight model, supporting Chinese, English, multilingual text detection|[ch_PP-OCRv2_det_cml.yml](../../configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml)|3M|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_distill_train.tar)| @@ -51,7 +51,7 @@ Relationship of the above models is as follows. 
|model name|description|config|model size|download| | --- | --- | --- | --- | --- | -|en_PP-OCRv3_det_slim | [New] Slim qunatization with distillation lightweight detection model, supporting English | [ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml) | 1.1M |[inference model(coming soon)](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_slim_infer.tar) / [trained model (coming soon)](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_slim_distill_train.tar) / [slim model(coming soon)](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_slim_infer.nb) | +|en_PP-OCRv3_det_slim | [New] Slim quantization with distillation lightweight detection model, supporting English | [ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml) | 1.1M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_slim_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_slim_distill_train.tar) / [nb model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_slim_infer.nb) | |ch_PP-OCRv3_det | [New] Original lightweight detection model, supporting English |[ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml)| 3.8M | [inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_distill_train.tar) | * Note: English configuration file is same as Chinese except training data, here we only provide one configuration file. @@ -62,7 +62,7 @@ Relationship of the above models is as follows. |model name|description|config|model size|download| | --- | --- | --- | --- | --- | -| ml_PP-OCRv3_det_slim | [New] Slim qunatization with distillation lightweight detection model, supporting English | [ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml) | 1.1M | [inference model(coming soon)](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/Multilingual_PP-OCRv3_det_slim_infer.tar) / [trained model (coming soon)](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/Multilingual_PP-OCRv3_det_slim_distill_train.tar) / [slim model(coming soon)](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/Multilingual_PP-OCRv3_det_slim_infer.nb) | +| ml_PP-OCRv3_det_slim | [New] Slim quantization with distillation lightweight detection model, supporting multilingual text detection | [ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml) | 1.1M | [inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/Multilingual_PP-OCRv3_det_slim_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/Multilingual_PP-OCRv3_det_slim_distill_train.tar) / [nb model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/Multilingual_PP-OCRv3_det_slim_infer.nb) | | ml_PP-OCRv3_det |[New] Original lightweight detection model, supporting English | [ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml)| 3.8M | [inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/Multilingual_PP-OCRv3_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/Multilingual_PP-OCRv3_det_distill_train.tar) | * Note: English configuration file is same as Chinese except training data, here we only provide one configuration file. @@ -75,7 +75,7 @@ Relationship of the above models is as follows.
|model name|description|config|model size|download| | --- | --- | --- | --- | --- | -|ch_PP-OCRv3_rec_slim | [New] Slim qunatization with distillation lightweight model, supporting Chinese, English text recognition |[ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml)| 4.9M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/ch/ch_PP-OCRv3_rec_slim_train.tar) / [slim model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.nb) | +|ch_PP-OCRv3_rec_slim | [New] Slim quantization with distillation lightweight model, supporting Chinese, English text recognition |[ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml)| 4.9M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/ch/ch_PP-OCRv3_rec_slim_train.tar) / [nb model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.nb) | |ch_PP-OCRv3_rec| [New] Original lightweight model, supporting Chinese, English, multilingual text recognition |[ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml)| 12.4M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_train.tar) | |ch_PP-OCRv2_rec_slim| Slim qunatization with distillation lightweight model, supporting Chinese, English text recognition|[ch_PP-OCRv2_rec.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec.yml)| 9M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_train.tar) | |ch_PP-OCRv2_rec| Original lightweight model, supporting Chinese, English, multilingual text recognition |[ch_PP-OCRv2_rec_distillation.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec_distillation.yml)|8.5M|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_train.tar) | @@ -91,8 +91,8 @@ Relationship of the above models is as follows.
|model name|description|config|model size|download| | --- | --- | --- | --- | --- | -|en_PP-OCRv3_rec_slim | [New] Slim qunatization with distillation lightweight model, supporting english, English text recognition |[en_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/en_PP-OCRv3_rec_distillation.yml)| 4.9M |[inference model(coming soon)](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_slim_infer.tar) / [trained model (coming soon)](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_slim_train.tar) / [slim model(coming soon)](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_slim_infer.nb) | -|en_PP-OCRv3_rec| [New] Original lightweight model, supporting english, English, multilingual text recognition |[en_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/en_PP-OCRv3_rec_distillation.yml)| 12.4M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_train.tar) | +|en_PP-OCRv3_rec_slim | [New] Slim quantization with distillation lightweight model, supporting English text recognition |[en_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml)| 3.2M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_slim_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_slim_train.tar) / [nb model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_slim_infer.nb) | +|en_PP-OCRv3_rec| [New] Original lightweight model, supporting English, multilingual text recognition |[en_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml)| 9.6M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_train.tar) | |en_number_mobile_slim_v2.0_rec|Slim pruned and quantized lightweight model, supporting English and number recognition|[rec_en_number_lite_train.yml](../../configs/rec/multi_language/rec_en_number_lite_train.yml)| 2.7M | [inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/en_number_mobile_v2.0_rec_slim_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/en_number_mobile_v2.0_rec_slim_train.tar) | |en_number_mobile_v2.0_rec|Original lightweight model, supporting English and number recognition|[rec_en_number_lite_train.yml](../../configs/rec/multi_language/rec_en_number_lite_train.yml)|2.6M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/en_number_mobile_v2.0_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/en_number_mobile_v2.0_rec_train.tar) | @@ -122,11 +122,16 @@ For more supported languages, please refer to : [Multi-language model](./multi_l |model name|description|config|model size|download| | --- | --- | --- | --- | --- | -|ch_ppocr_mobile_slim_v2.0_cls|Slim quantized model for text angle classification|[cls_mv3.yml](../../configs/cls/cls_mv3.yml)| 2.1M | [inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_slim_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_slim_train.tar) | +|ch_ppocr_mobile_slim_v2.0_cls|Slim quantized model for text angle classification|[cls_mv3.yml](../../configs/cls/cls_mv3.yml)| 2.1M | [inference
model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_slim_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_slim_train.tar) / [nb model](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_ppocr_mobile_v2.0_cls_infer_opt.nb) | |ch_ppocr_mobile_v2.0_cls|Original model for text angle classification|[cls_mv3.yml](../../configs/cls/cls_mv3.yml)|1.38M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | ## 4. Paddle-Lite Model + +Paddle Lite is an updated version of Paddle-Mobile, an open source deep learning framework designed to make it easy to perform inference on mobile, embedded, and IoT devices. It can further optimize the inference model and generate an `nb model` for edge devices. It is suggested to start from a quantized model, since the model can then be stored and run in `INT8` format, which further reduces the model size and improves the inference speed. + +This chapter lists the OCR nb models of PP-OCRv2 and earlier versions. The latest nb models can be obtained from the tables above. + |Version|Introduction|Model size|Detection model|Text Direction model|Recognition model|Paddle-Lite branch| |---|---|---|---|---|---|---| |PP-OCRv2|extra-lightweight chinese OCR optimized model|11M|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_PP-OCRv2_det_infer_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_ppocr_mobile_v2.0_cls_infer_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_PP-OCRv2_rec_infer_opt.nb)|v2.10| diff --git a/doc/doc_en/ppocr_introduction_en.md b/doc/doc_en/ppocr_introduction_en.md index d8af8d9ee31dd4ab63b8e22b8f1c59f64ee10f38..b2895cc27b98564a99c73a9abf7ee0d7451176e1 100644 --- a/doc/doc_en/ppocr_introduction_en.md +++ b/doc/doc_en/ppocr_introduction_en.md @@ -17,6 +17,7 @@ English | [简体中文](../doc_ch/ppocr_introduction.md) PP-OCR is a self-developed practical ultra-lightweight OCR system, which is slimed and optimized based on the reimplemented [academic algorithms](algorithm_en.md), considering the balance between **accuracy** and **speed**. +#### PP-OCR PP-OCR is a two-stage OCR system, in which the text detection algorithm is [DB](algorithm_det_db_en.md), and the text recognition algorithm is [CRNN](algorithm_rec_crnn_en.md). Besides, a [text direction classifier](angle_class_en.md) is added between the detection and recognition modules to deal with text in different directions. PP-OCR pipeline is as follows: @@ -28,11 +29,16 @@ PP-OCR pipeline is as follows: PP-OCR system is in continuous optimization. At present, PP-OCR and PP-OCRv2 have been released: -[1] PP-OCR adopts 19 effective strategies from 8 aspects including backbone network selection and adjustment, prediction head design, data augmentation, learning rate transformation strategy, regularization parameter selection, pre-training model use, and automatic model tailoring and quantization to optimize and slim down the models of each module (as shown in the green box above). The final results are an ultra-lightweight Chinese and English OCR model with an overall size of 3.5M and a 2.8M English digital OCR model. For more details, please refer to the PP-OCR technical article (https://arxiv.org/abs/2009.09941).
+PP-OCR adopts 19 effective strategies from 8 aspects including backbone network selection and adjustment, prediction head design, data augmentation, learning rate transformation strategy, regularization parameter selection, pre-training model use, and automatic model tailoring and quantization to optimize and slim down the models of each module (as shown in the green box above). The final results are an ultra-lightweight Chinese and English OCR model with an overall size of 3.5M and a 2.8M English digital OCR model. For more details, please refer to the PP-OCR technical article (https://arxiv.org/abs/2009.09941). -[2] On the basis of PP-OCR, PP-OCRv2 is further optimized in five aspects. The detection model adopts CML(Collaborative Mutual Learning) knowledge distillation strategy and CopyPaste data expansion strategy. The recognition model adopts LCNet lightweight backbone network, U-DML knowledge distillation strategy and enhanced CTC loss function improvement (as shown in the red box above), which further improves the inference speed and prediction effect. For more details, please refer to the technical report of PP-OCRv2 (https://arxiv.org/abs/2109.03144). +#### PP-OCRv2 +On the basis of PP-OCR, PP-OCRv2 is further optimized in five aspects. The detection model adopts the CML (Collaborative Mutual Learning) knowledge distillation strategy and the CopyPaste data expansion strategy. The recognition model adopts the LCNet lightweight backbone network, the U-DML knowledge distillation strategy and an enhanced CTC loss function (as shown in the red box above), which further improves the inference speed and prediction effect. For more details, please refer to the technical report of PP-OCRv2 (https://arxiv.org/abs/2109.03144). -[3] PP-OCRv3 is further upgraded on the basis of PP-OCRv2. The detection model is still based on DB algorithm, and the optimization strategies include a newly proposed FPN structure with residual attention mechanism named with RSEFPN, a PAN structure with enlarged receptive field named with LKPAN, and better teacher model based on DML training; The recognition model replaces the base model from CRNN with IJCAI 2022 paper [SVTR](https://arxiv.org/abs/2205.00159), and adopts lightweight SVTR, guided training of CTC, data augmentation strategy RecConAug, better pre-trained model by self-supervised training, and the use of unlabeled data to accelerate the model and improve the effect. For more details, please refer to PP-OCRv3 [technical report](./PP-OCRv3_introduction_en.md). +#### PP-OCRv3 + +PP-OCRv3 upgrades the detection model and the recognition model in 9 aspects on the basis of PP-OCRv2: +- The PP-OCRv3 detector upgrades the CML (Collaborative Mutual Learning) text detection distillation strategy proposed in PP-OCRv2, and further optimizes the teacher model and the student model respectively. In the optimization of the teacher model, a PAN module with a large receptive field named LK-PAN is proposed and the DML distillation strategy is adopted; in the optimization of the student model, an FPN module with a residual attention mechanism named RSE-FPN is proposed. +- The PP-OCRv3 recognizer is optimized based on the text recognition algorithm [SVTR](https://arxiv.org/abs/2205.00159). SVTR no longer adopts an RNN structure; by introducing a Transformer structure, it can mine the context information of a text line image more effectively, so as to improve text recognition ability.
PP-OCRv3 adopts the lightweight text recognition network SVTR_LCNet, guided training of the CTC loss by an attention loss, the data augmentation strategy TextConAug, a better pre-trained model obtained by self-supervised TextRotNet, UDML (Unified Deep Mutual Learning), and UIM (Unlabeled Images Mining) to accelerate the model and improve the effect. PP-OCRv3 pipeline is as follows: @@ -40,6 +46,8 @@ PP-OCRv3 pipeline is as follows:
+For more details, please refer to [PP-OCRv3 technical report](./PP-OCRv3_introduction_en.md). + ## 2. Features diff --git a/doc/doc_en/quickstart_en.md b/doc/doc_en/quickstart_en.md index 7243e2db927a1cc89f8ac4d63c2a5a722de393d5..d7aeb7773021aa6cf8f4d71298588915e5938fab 100644 --- a/doc/doc_en/quickstart_en.md +++ b/doc/doc_en/quickstart_en.md @@ -1,18 +1,18 @@ -- [PaddleOCR Quick Start](#paddleocr-quick-start) - - [1. Installation](#1-installation) +# PaddleOCR Quick Start + +**Note:** This tutorial mainly introduces the usage of the PP-OCR series models; please refer to [PP-Structure Quick Start](../../ppstructure/docs/quickstart_en.md) for the quick use of document analysis related functions. + +- [1. Installation](#1-installation) - [1.1 Install PaddlePaddle](#11-install-paddlepaddle) - [1.2 Install PaddleOCR Whl Package](#12-install-paddleocr-whl-package) - - [2. Easy-to-Use](#2-easy-to-use) +- [2. Easy-to-Use](#2-easy-to-use) - [2.1 Use by Command Line](#21-use-by-command-line) - [2.1.1 Chinese and English Model](#211-chinese-and-english-model) - [2.1.2 Multi-language Model](#212-multi-language-model) - - [2.1.3 Layout Analysis](#213-layout-analysis) - [2.2 Use by Code](#22-use-by-code) - [2.2.1 Chinese & English Model and Multilingual Model](#221-chinese--english-model-and-multilingual-model) - - [2.2.2 Layout Analysis](#222-layout-analysis) - - [3. Summary](#3-summary) +- [3. Summary](#3-summary) -# PaddleOCR Quick Start @@ -73,8 +73,6 @@ cd /path/to/ppocr_img If you do not use the provided test image, you can replace the following `--image_dir` parameter with the corresponding test image path -**Note**: The whl package uses the `PP-OCRv3` model by default, and the input shape used by the recognition model is `3,48,320`, so if you use the recognition function, you need to add the parameter `--rec_image_shape 3,48,320`, if you do not use the default `PP- OCRv3` model, you do not need to set this parameter. - #### 2.1.1 Chinese and English Model @@ -82,7 +80,7 @@ If you do not use the provided test image, you can replace the following `--imag * Detection, direction classification and recognition: set the parameter`--use_gpu false` to disable the gpu device ```bash - paddleocr --image_dir ./imgs_en/img_12.jpg --use_angle_cls true --lang en --use_gpu false --rec_image_shape 3,48,320 + paddleocr --image_dir ./imgs_en/img_12.jpg --use_angle_cls true --lang en --use_gpu false ``` Output will be a list, each item contains bounding box, text and recognition confidence @@ -112,7 +110,7 @@ If you do not use the provided test image, you can replace the following `--imag * Only recognition: set `--det` to `false` ```bash - paddleocr --image_dir ./imgs_words_en/word_10.png --det false --lang en --rec_image_shape 3,48,320 + paddleocr --image_dir ./imgs_words_en/word_10.png --det false --lang en ``` Output will be a list, each item contains text and recognition confidence @@ -121,15 +119,15 @@ If you do not use the provided test image, you can replace the following `--imag ['PAIN', 0.9934559464454651] ``` -If you need to use the 2.0 model, please specify the parameter `--version PP-OCR`, paddleocr uses the PP-OCRv3 model by default(`--versioin PP-OCRv3`). More whl package usage can be found in [whl package](./whl_en.md) +If you need to use the 2.0 model, please specify the parameter `--ocr_version PP-OCR`; paddleocr uses the PP-OCRv3 model by default (`--ocr_version PP-OCRv3`).
More whl package usage can be found in [whl package](./whl_en.md) #### 2.1.2 Multi-language Model -Paddleocr currently supports 80 languages, which can be switched by modifying the `--lang` parameter. PP-OCRv3 currently only supports Chinese and English models, and other multilingual models will be updated one after another. +PaddleOCR currently supports 80 languages, which can be switched by modifying the `--lang` parameter. ``` bash -paddleocr --image_dir ./doc/imgs_en/254.jpg --lang=en --rec_image_shape 3,48,320 +paddleocr --image_dir ./doc/imgs_en/254.jpg --lang=en ```
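The same `--lang` switch is available from Python. Per the `parse_lang` change later in this patch, Latin-script languages share the English detection model while other scripts fall back to a multilingual detector, so only the language code needs to be set. A minimal sketch under those assumptions — the image path is illustrative, not an asset shipped with the docs:

```python
from paddleocr import PaddleOCR

# lang selects the recognition model and dictionary; the matching
# detection model is chosen automatically from the language code.
ocr = PaddleOCR(lang='korean')  # model files are downloaded on first use

result = ocr.ocr('./korean_sample.jpg')
for box, (text, score) in result:
    print(text, score)
```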
@@ -154,48 +152,7 @@ Commonly used multilingual abbreviations include | Chinese Traditional | chinese_cht | | Italian | it | | Russian | ru | A list of all languages and their corresponding abbreviations can be found in [Multi-Language Model Tutorial](./multi_languages_en.md) - - -#### 2.1.3 Layout Analysis - -Layout analysis refers to the division of 5 types of areas of the document, including text, title, list, picture and table. For the first three types of regions, directly use the OCR model to complete the text detection and recognition of the corresponding regions, and save the results in txt. For the table area, after the table structuring process, the table picture is converted into an Excel file of the same table style. The picture area will be individually cropped into an image. - -To use the layout analysis function of PaddleOCR, you need to specify `--type=structure` - -```bash -paddleocr --image_dir=../doc/table/1.png --type=structure -``` - -- **Results Format** - - The returned results of PP-Structure is a list composed of a dict, an example is as follows - - ```shell - [ - { 'type': 'Text', - 'bbox': [34, 432, 345, 462], - 'res': ([[36.0, 437.0, 341.0, 437.0, 341.0, 446.0, 36.0, 447.0], [41.0, 454.0, 125.0, 453.0, 125.0, 459.0, 41.0, 460.0]], - [('Tigure-6. The performance of CNN and IPT models using difforen', 0.90060663), ('Tent ', 0.465441)]) - } - ] - ``` - - The description of each field in dict is as follows - | Parameter | Description | - | --------- | ------------------------------------------------------------ | - | type | Type of image area | - | bbox | The coordinates of the image area in the original image, respectively [left upper x, left upper y, right bottom x, right bottom y] | - | res | OCR or table recognition result of image area。
Table: HTML string of the table;
OCR: A tuple containing the detection coordinates and recognition results of each single line of text | - -- **Parameter Description:** - - | Parameter | Description | Default value | - | --------------- | ------------------------------------------------------------ | -------------------------------------------- | - | output | The path where excel and recognition results are saved | ./output/table | - | table_max_len | The long side of the image is resized in table structure model | 488 | - | table_model_dir | inference model path of table structure model | None | - | table_char_dict_path | dict path of table structure model | ../ppocr/utils/dict/table_structure_dict.txt | @@ -243,40 +200,12 @@ Visualization of results
- - -#### 2.2.2 Layout Analysis - -```python -import os -import cv2 -from paddleocr import PPStructure,draw_structure_result,save_structure_res - -table_engine = PPStructure(show_log=True) - -save_folder = './output/table' -img_path = './table/1.png' -img = cv2.imread(img_path) -result = table_engine(img) -save_structure_res(result, save_folder,os.path.basename(img_path).split('.')[0]) - -for line in result: - line.pop('img') - print(line) - -from PIL import Image -font_path = './fonts/simfang.ttf' -image = Image.open(img_path).convert('RGB') -im_show = draw_structure_result(image, result,font_path=font_path) -im_show = Image.fromarray(im_show) -im_show.save('result.jpg') -``` ## 3. Summary -In this section, you have mastered the use of PaddleOCR whl packages and obtained results. +In this section, you have mastered the use of PaddleOCR whl package. -PaddleOCR is a rich and practical OCR tool library that opens up the whole process of data, model training, compression and inference deployment, so in the [next section](./paddleOCR_overview_en.md) we will first introduce you to the overview of PaddleOCR, and then clone the PaddleOCR project to start the application journey of PaddleOCR. +PaddleOCR is a rich and practical OCR tool library that get through the whole process of data production, model training, compression, inference and deployment, please refer to the [tutorials](../../README.md#tutorials) to start the journey of PaddleOCR. diff --git a/doc/doc_en/update_en.md b/doc/doc_en/update_en.md index 8ec74fe8b73d89cc97904e2ce156e14bbd596eb4..24e342a636abc3300455c5e1c3d9f5670e0a9be4 100644 --- a/doc/doc_en/update_en.md +++ b/doc/doc_en/update_en.md @@ -1,4 +1,5 @@ # RECENT UPDATES +- 2022.5.7 Add support for metric and model logging during training to [Weights & Biases](https://docs.wandb.ai/). - 2021.12.21 OCR open source online course starts. The lesson starts at 8:30 every night and lasts for ten days. Free registration: https://aistudio.baidu.com/aistudio/course/introduce/25207 - 2021.12.21 release PaddleOCR v2.4, release 1 text detection algorithm (PSENet), 3 text recognition algorithms (NRTR、SEED、SAR), 1 key information extraction algorithm (SDMGR) and 3 DocVQA algorithms (LayoutLM、LayoutLMv2,LayoutXLM). - 2021.9.7 release PaddleOCR v2.3, [PP-OCRv2](#PP-OCRv2) is proposed. The CPU inference speed of PP-OCRv2 is 220% higher than that of PP-OCR server. The F-score of PP-OCRv2 is 7% higher than that of PP-OCR mobile. diff --git a/doc/doc_en/whl_en.md b/doc/doc_en/whl_en.md index 40a2e122d19679a59e7e65df29dd59781b4a2143..d81e5532cf1db0193abf61b972420bdc3bacfd0b 100644 --- a/doc/doc_en/whl_en.md +++ b/doc/doc_en/whl_en.md @@ -172,11 +172,9 @@ show help information paddleocr -h ``` -**Note**: The whl package uses the `PP-OCRv3` model by default, and the input shape used by the recognition model is `3,48,320`, so if you use the recognition function, you need to add the parameter `--rec_image_shape 3,48,320`, if you do not use the default `PP- OCRv3` model, you do not need to set this parameter. 
- * detection classification and recognition ```bash -paddleocr --image_dir PaddleOCR/doc/imgs_en/img_12.jpg --use_angle_cls true --lang en --rec_image_shape 3,48,320 +paddleocr --image_dir PaddleOCR/doc/imgs_en/img_12.jpg --use_angle_cls true --lang en ``` Output will be a list, each item contains bounding box, text and recognition confidence @@ -189,7 +187,7 @@ Output will be a list, each item contains bounding box, text and recognition con * detection and recognition ```bash -paddleocr --image_dir PaddleOCR/doc/imgs_en/img_12.jpg --lang en --rec_image_shape 3,48,320 +paddleocr --image_dir PaddleOCR/doc/imgs_en/img_12.jpg --lang en ``` Output will be a list, each item contains bounding box, text and recognition confidence @@ -202,7 +200,7 @@ Output will be a list, each item contains bounding box, text and recognition con * classification and recognition ```bash -paddleocr --image_dir PaddleOCR/doc/imgs_words_en/word_10.png --use_angle_cls true --det false --lang en --rec_image_shape 3,48,320 +paddleocr --image_dir PaddleOCR/doc/imgs_words_en/word_10.png --use_angle_cls true --det false --lang en ``` Output will be a list, each item contains text and recognition confidence @@ -225,7 +223,7 @@ Output will be a list, each item only contains bounding box * only recognition ```bash -paddleocr --image_dir PaddleOCR/doc/imgs_words_en/word_10.png --det false --lang en --rec_image_shape 3,48,320 +paddleocr --image_dir PaddleOCR/doc/imgs_words_en/word_10.png --det false --lang en ``` Output will be a list, each item contains text and recognition confidence @@ -368,4 +366,4 @@ im_show.save('result.jpg') | cls | Enable classification when `ppocr.ocr` func exec((Use use_angle_cls in command line mode to control whether to start classification in the forward direction) | FALSE | | show_log | Whether to print log| FALSE | | type | Perform ocr or table structuring, the value is selected in ['ocr','structure'] | ocr | -| ocr_version | OCR Model version number, the current model support list is as follows: PP-OCRv3 support Chinese and English detection and recognition model and direction classifier model, PP-OCRv2 support Chinese detection and recognition model, PP-OCR support Chinese detection, recognition and direction classifier, multilingual recognition model | PP-OCRv3 | +| ocr_version | OCR Model version number, the current model support list is as follows: PP-OCRv3 supports Chinese and English detection, recognition, multilingual recognition and direction classifier models; PP-OCRv2 supports Chinese detection and recognition models; PP-OCR supports Chinese detection, recognition and direction classifier, and multilingual recognition models | PP-OCRv3 | diff --git a/doc/imgs_en/wandb_metrics.png b/doc/imgs_en/wandb_metrics.png new file mode 100644 index 0000000000000000000000000000000000000000..45f0041ae4d3819c2bf9c9fababcceb3ff20a115 Binary files /dev/null and b/doc/imgs_en/wandb_metrics.png differ diff --git a/doc/imgs_en/wandb_models.png b/doc/imgs_en/wandb_models.png new file mode 100644 index 0000000000000000000000000000000000000000..f9a7042bd59fa16179bd8a1f1e0eb49031300e4f Binary files /dev/null and b/doc/imgs_en/wandb_models.png differ diff --git a/doc/imgs_words/arabic/ar_1.jpg b/doc/imgs_words/arabic/ar_1.jpg index 33192651f8491be38373fabe2a8aec43fcd22a41..71d7bf252d73a6139a6129ec2bce9dad77920ce9 100644 Binary files a/doc/imgs_words/arabic/ar_1.jpg and b/doc/imgs_words/arabic/ar_1.jpg differ diff --git a/doc/imgs_words/arabic/ar_2.jpg b/doc/imgs_words/arabic/ar_2.jpg index
66c10840a090c674c143abf7296219876dd05817..017d3f6fbc3650d5e6a61ca4a25f6cb81232d8a5 100644 Binary files a/doc/imgs_words/arabic/ar_2.jpg and b/doc/imgs_words/arabic/ar_2.jpg differ diff --git a/doc/ppocr_v3/GTC.png b/doc/ppocr_v3/GTC.png index 2af2261d51d2279f171727a5a0b5a8d974763d80..30a9cdd146283e2e64fc0965cb06309b64707819 100644 Binary files a/doc/ppocr_v3/GTC.png and b/doc/ppocr_v3/GTC.png differ diff --git a/doc/ppocr_v3/LCNet_SVTR.png b/doc/ppocr_v3/LCNet_SVTR.png new file mode 100644 index 0000000000000000000000000000000000000000..7f0d701d27502999fcee6d0872d02b9fe1554e3c Binary files /dev/null and b/doc/ppocr_v3/LCNet_SVTR.png differ diff --git a/doc/ppocr_v3/LKPAN.png b/doc/ppocr_v3/LKPAN.png index ff0578f6901603185809e10c85793c212c40dc48..6b1605362317da48110b64a1a774b6f1e017eaa1 100644 Binary files a/doc/ppocr_v3/LKPAN.png and b/doc/ppocr_v3/LKPAN.png differ diff --git a/doc/ppocr_v3/RSEFPN.png b/doc/ppocr_v3/RSEFPN.png index 87f7f69fb516d496c9357d81b97e5bdb750f808a..ddf7c52fb5b01874bd931d23bd4d41bf979dcf31 100644 Binary files a/doc/ppocr_v3/RSEFPN.png and b/doc/ppocr_v3/RSEFPN.png differ diff --git a/doc/ppocr_v3/UIM.png b/doc/ppocr_v3/UIM.png new file mode 100644 index 0000000000000000000000000000000000000000..7479bdf4a9174be6b431aaee29093df92e008684 Binary files /dev/null and b/doc/ppocr_v3/UIM.png differ diff --git a/doc/ppocr_v3/ppocr_v3.png b/doc/ppocr_v3/ppocr_v3.png deleted file mode 100644 index 123c125acdcbc9e2ef6e4d6a0a1c92d01136ffde..0000000000000000000000000000000000000000 Binary files a/doc/ppocr_v3/ppocr_v3.png and /dev/null differ diff --git a/doc/ppocr_v3/ppocrv3_det_cml.png b/doc/ppocr_v3/ppocrv3_det_cml.png new file mode 100644 index 0000000000000000000000000000000000000000..ccb5c8b21faeab75027690e520b072186972f796 Binary files /dev/null and b/doc/ppocr_v3/ppocrv3_det_cml.png differ diff --git a/doc/ppocr_v3/svtr_g2.png b/doc/ppocr_v3/svtr_g2.png index d589891d5897533243845a993bd56d8f75726cfc..2573afafbbb6f5ad270320e45c7c3bdb47d8adc2 100644 Binary files a/doc/ppocr_v3/svtr_g2.png and b/doc/ppocr_v3/svtr_g2.png differ diff --git a/doc/ppocr_v3/svtr_g4.png b/doc/ppocr_v3/svtr_g4.png index 234a85c44b2cc3d968942480a596b2be5e45f53d..f85d66d97f619d57edb4223a0996901050ea7959 100644 Binary files a/doc/ppocr_v3/svtr_g4.png and b/doc/ppocr_v3/svtr_g4.png differ diff --git a/doc/ppocr_v3/svtr_tiny.png b/doc/ppocr_v3/svtr_tiny.png index 91b3eacb9f1242806ad3520cc36252351fc7baf1..01e22e74b539b12072a677bc5081df92f81ef963 100644 Binary files a/doc/ppocr_v3/svtr_tiny.png and b/doc/ppocr_v3/svtr_tiny.png differ diff --git a/doc/ppocr_v3/teacher_dml.png b/doc/ppocr_v3/teacher_dml.png new file mode 100644 index 0000000000000000000000000000000000000000..ea09cacda87ae4c0d44cb0f1c18ee1f10c50b957 Binary files /dev/null and b/doc/ppocr_v3/teacher_dml.png differ diff --git a/doc/ppocr_v3/v3_rec_pipeline.png b/doc/ppocr_v3/v3_rec_pipeline.png new file mode 100644 index 0000000000000000000000000000000000000000..aa61cc4f1652f958977fdab8d2dca56c57f8f816 Binary files /dev/null and b/doc/ppocr_v3/v3_rec_pipeline.png differ diff --git a/doc/ppocrv3_framework.png b/doc/ppocrv3_framework.png index c05398248fa7273382e9691a26d932bddc3cf84f..e05279f7f57301c480c0cc11d940af0b5bf69668 100644 Binary files a/doc/ppocrv3_framework.png and b/doc/ppocrv3_framework.png differ diff --git a/paddleocr.py b/paddleocr.py index 417350839ac4d1e512c7396831f89ab4b2d6c724..f7871db6470c75db82e8251dff5361c099c4adda 100644 --- a/paddleocr.py +++ b/paddleocr.py @@ -67,6 +67,10 @@ MODEL_URLS = { 'url': 
diff --git a/paddleocr.py b/paddleocr.py
index 417350839ac4d1e512c7396831f89ab4b2d6c724..f7871db6470c75db82e8251dff5361c099c4adda 100644
--- a/paddleocr.py
+++ b/paddleocr.py
@@ -67,6 +67,10 @@ MODEL_URLS = {
             'url':
             'https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar',
         },
+        'ml': {
+            'url':
+            'https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/Multilingual_PP-OCRv3_det_infer.tar'
+        }
     },
     'rec': {
         'ch': {
@@ -79,6 +83,56 @@ MODEL_URLS = {
             'https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar',
             'dict_path': './ppocr/utils/en_dict.txt'
         },
+        'korean': {
+            'url':
+            'https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/korean_PP-OCRv3_rec_infer.tar',
+            'dict_path': './ppocr/utils/dict/korean_dict.txt'
+        },
+        'japan': {
+            'url':
+            'https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/japan_PP-OCRv3_rec_infer.tar',
+            'dict_path': './ppocr/utils/dict/japan_dict.txt'
+        },
+        'chinese_cht': {
+            'url':
+            'https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/chinese_cht_PP-OCRv3_rec_infer.tar',
+            'dict_path': './ppocr/utils/dict/chinese_cht_dict.txt'
+        },
+        'ta': {
+            'url':
+            'https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/ta_PP-OCRv3_rec_infer.tar',
+            'dict_path': './ppocr/utils/dict/ta_dict.txt'
+        },
+        'te': {
+            'url':
+            'https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/te_PP-OCRv3_rec_infer.tar',
+            'dict_path': './ppocr/utils/dict/te_dict.txt'
+        },
+        'ka': {
+            'url':
+            'https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/ka_PP-OCRv3_rec_infer.tar',
+            'dict_path': './ppocr/utils/dict/ka_dict.txt'
+        },
+        'latin': {
+            'url':
+            'https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/latin_PP-OCRv3_rec_infer.tar',
+            'dict_path': './ppocr/utils/dict/latin_dict.txt'
+        },
+        'arabic': {
+            'url':
+            'https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/arabic_PP-OCRv3_rec_infer.tar',
+            'dict_path': './ppocr/utils/dict/arabic_dict.txt'
+        },
+        'cyrillic': {
+            'url':
+            'https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/cyrillic_PP-OCRv3_rec_infer.tar',
+            'dict_path': './ppocr/utils/dict/cyrillic_dict.txt'
+        },
+        'devanagari': {
+            'url':
+            'https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/devanagari_PP-OCRv3_rec_infer.tar',
+            'dict_path': './ppocr/utils/dict/devanagari_dict.txt'
+        },
     },
     'cls': {
         'ch': {
@@ -259,7 +313,7 @@ def parse_lang(lang):
         'af', 'az', 'bs', 'cs', 'cy', 'da', 'de', 'es', 'et', 'fr', 'ga',
         'hr', 'hu', 'id', 'is', 'it', 'ku', 'la', 'lt', 'lv', 'mi', 'ms',
         'mt', 'nl', 'no', 'oc', 'pi', 'pl', 'pt', 'ro', 'rs_latin', 'sk', 'sl', 'sq', 'sv',
-        'sw', 'tl', 'tr', 'uz', 'vi'
+        'sw', 'tl', 'tr', 'uz', 'vi', 'french', 'german'
     ]
     arabic_lang = ['ar', 'fa', 'ug', 'ur']
     cyrillic_lang = [
@@ -285,8 +339,10 @@ def parse_lang(lang):
         det_lang = "ch"
     elif lang == 'structure':
         det_lang = 'structure'
-    else:
+    elif lang in ["en", "latin"]:
         det_lang = "en"
+    else:
+        det_lang = "ml"
     return lang, det_lang
 
@@ -356,6 +412,10 @@ class PaddleOCR(predict_system.TextSystem):
             params.cls_model_dir, cls_url = confirm_model_dir_url(
                 params.cls_model_dir,
                 os.path.join(BASE_DIR, 'whl', 'cls'), cls_model_config['url'])
+        if params.ocr_version == 'PP-OCRv3':
+            params.rec_image_shape = "3, 48, 320"
+        else:
+            params.rec_image_shape = "3, 32, 320"
         # download model
         maybe_download(params.det_model_dir, det_url)
         maybe_download(params.rec_model_dir, rec_url)
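With the new `ml` entry and the `parse_lang` change, any language outside the English/Latin families now falls back to the shared multilingual PP-OCRv3 detector while keeping its own recognition model and dictionary. A minimal sketch of what this means for API users (the sample image path is illustrative, not from this diff):

```python
from paddleocr import PaddleOCR

# 'korean' resolves to korean_PP-OCRv3_rec_infer for recognition and, via
# parse_lang, to Multilingual_PP-OCRv3_det_infer ('ml') for detection;
# 'en' and 'latin' keep the English detector.
ocr = PaddleOCR(lang='korean')
result = ocr.ocr('path/to/korean_text.jpg')
print(result)
```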
diff --git a/ppocr/modeling/heads/rec_sar_head.py b/ppocr/modeling/heads/rec_sar_head.py
index 27693ebc16a2b494d25455892ac4513b4d16803b..0e6b34404b61b44bebcbc7d67ddfd0a95382c39b 100644
--- a/ppocr/modeling/heads/rec_sar_head.py
+++ b/ppocr/modeling/heads/rec_sar_head.py
@@ -99,8 +99,8 @@ class SAREncoder(nn.Layer):
         if valid_ratios is not None:
             valid_hf = []
             T = holistic_feat.shape[1]
-            for i, valid_ratio in enumerate(valid_ratios):
-                valid_step = min(T, math.ceil(T * valid_ratio)) - 1
+            for i in range(len(valid_ratios)):
+                valid_step = min(T, math.ceil(T * valid_ratios[i])) - 1
                 valid_hf.append(holistic_feat[i, valid_step, :])
             valid_hf = paddle.stack(valid_hf, axis=0)
         else:
@@ -252,8 +252,8 @@ class ParallelSARDecoder(BaseDecoder):
 
         if valid_ratios is not None:
             # cal mask of attention weight
-            for i, valid_ratio in enumerate(valid_ratios):
-                valid_width = min(w, math.ceil(w * valid_ratio))
+            for i in range(len(valid_ratios)):
+                valid_width = min(w, math.ceil(w * valid_ratios[i]))
                 if valid_width < w:
                     attn_weight[i, :, :, valid_width:, :] = float('-inf')
 
diff --git a/ppocr/utils/loggers/__init__.py b/ppocr/utils/loggers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1e92f734e84b7e0278f8e7940ef3baf137c159e
--- /dev/null
+++ b/ppocr/utils/loggers/__init__.py
@@ -0,0 +1,3 @@
+from .vdl_logger import VDLLogger
+from .wandb_logger import WandbLogger
+from .loggers import Loggers
diff --git a/ppocr/utils/loggers/base_logger.py b/ppocr/utils/loggers/base_logger.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a7fc3593ba8e69fdd5bed386c7ae4ff0d459988
--- /dev/null
+++ b/ppocr/utils/loggers/base_logger.py
@@ -0,0 +1,15 @@
+import os
+from abc import ABC, abstractmethod
+
+class BaseLogger(ABC):
+    def __init__(self, save_dir):
+        self.save_dir = save_dir
+        os.makedirs(self.save_dir, exist_ok=True)
+
+    @abstractmethod
+    def log_metrics(self, metrics, prefix=None, step=None):
+        pass
+
+    @abstractmethod
+    def close(self):
+        pass
\ No newline at end of file
diff --git a/ppocr/utils/loggers/loggers.py b/ppocr/utils/loggers/loggers.py
new file mode 100644
index 0000000000000000000000000000000000000000..260146620811c8e72da66e9f2c7bbcbaef90b90d
--- /dev/null
+++ b/ppocr/utils/loggers/loggers.py
@@ -0,0 +1,18 @@
+from .wandb_logger import WandbLogger
+
+class Loggers(object):
+    def __init__(self, loggers):
+        super().__init__()
+        self.loggers = loggers
+
+    def log_metrics(self, metrics, prefix=None, step=None):
+        for logger in self.loggers:
+            logger.log_metrics(metrics, prefix=prefix, step=step)
+
+    def log_model(self, is_best, prefix, metadata=None):
+        for logger in self.loggers:
+            logger.log_model(is_best=is_best, prefix=prefix, metadata=metadata)
+
+    def close(self):
+        for logger in self.loggers:
+            logger.close()
\ No newline at end of file
diff --git a/ppocr/utils/loggers/vdl_logger.py b/ppocr/utils/loggers/vdl_logger.py
new file mode 100644
index 0000000000000000000000000000000000000000..c345f93235b239873f0ddcd49c8b1b8966877a03
--- /dev/null
+++ b/ppocr/utils/loggers/vdl_logger.py
@@ -0,0 +1,21 @@
+from .base_logger import BaseLogger
+from visualdl import LogWriter
+
+class VDLLogger(BaseLogger):
+    def __init__(self, save_dir):
+        super().__init__(save_dir)
+        self.vdl_writer = LogWriter(logdir=save_dir)
+
+    def log_metrics(self, metrics, prefix=None, step=None):
+        if not prefix:
+            prefix = ""
+        # VisualDL scalars must be numeric, so skip non-numeric metric values
+        updated_metrics = {prefix + "/" + k: v for k, v in metrics.items() if isinstance(v, (int, float))}
+
+        for k, v in updated_metrics.items():
+            self.vdl_writer.add_scalar(k, v, step)
+
+    def log_model(self, is_best, prefix, metadata=None):
+        pass
+
+    def close(self):
+        self.vdl_writer.close()
\ No newline at end of file
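Taken together: `BaseLogger` fixes the interface, the concrete loggers implement it, and `Loggers` fans each call out to every registered backend (`VDLLogger.log_model` is deliberately a no-op, since VisualDL has no artifact store). A minimal usage sketch; the directory and metric names are illustrative:

```python
from ppocr.utils.loggers import VDLLogger, Loggers

# One composite object; the training loop calls it exactly once per event.
log_writer = Loggers([VDLLogger("./output/vdl")])
log_writer.log_metrics({"loss": 0.42, "acc": 0.91}, prefix="TRAIN", step=100)
log_writer.close()
```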
diff --git a/ppocr/utils/loggers/wandb_logger.py b/ppocr/utils/loggers/wandb_logger.py
new file mode 100644
index 0000000000000000000000000000000000000000..b9c6711696569e825638e0a27394071020b29cb5
--- /dev/null
+++ b/ppocr/utils/loggers/wandb_logger.py
@@ -0,0 +1,81 @@
+import os
+from .base_logger import BaseLogger
+from ppocr.utils.logging import get_logger
+
+logger = get_logger()
+
+class WandbLogger(BaseLogger):
+    def __init__(self,
+                 project=None,
+                 name=None,
+                 id=None,
+                 entity=None,
+                 save_dir=None,
+                 config=None,
+                 **kwargs):
+        try:
+            import wandb
+            self.wandb = wandb
+        except ModuleNotFoundError:
+            raise ModuleNotFoundError(
+                "Please install wandb using `pip install wandb`"
+            )
+
+        self.project = project
+        self.name = name
+        self.id = id
+        self.save_dir = save_dir
+        self.config = config
+        self.kwargs = kwargs
+        self.entity = entity
+        self._run = None
+        self._wandb_init = dict(
+            project=self.project,
+            name=self.name,
+            id=self.id,
+            entity=self.entity,
+            dir=self.save_dir,
+            resume="allow"
+        )
+        self._wandb_init.update(**kwargs)
+
+        _ = self.run
+
+        if self.config:
+            self.run.config.update(self.config)
+
+    @property
+    def run(self):
+        if self._run is None:
+            if self.wandb.run is not None:
+                logger.info(
+                    "There is a wandb run already in progress "
+                    "and newly created instances of `WandbLogger` will reuse"
+                    " this run. If this is not desired, call `wandb.finish()`"
+                    " before instantiating `WandbLogger`."
+                )
+                self._run = self.wandb.run
+            else:
+                self._run = self.wandb.init(**self._wandb_init)
+        return self._run
+
+    def log_metrics(self, metrics, prefix=None, step=None):
+        if not prefix:
+            prefix = ""
+        updated_metrics = {prefix.lower() + "/" + k: v for k, v in metrics.items()}
+
+        self.run.log(updated_metrics, step=step)
+
+    def log_model(self, is_best, prefix, metadata=None):
+        model_path = os.path.join(self.save_dir, prefix + '.pdparams')
+        artifact = self.wandb.Artifact('model-{}'.format(self.run.id), type='model', metadata=metadata)
+        artifact.add_file(model_path, name="model_ckpt.pdparams")
+
+        aliases = [prefix]
+        if is_best:
+            aliases.append("best")
+
+        self.run.log_artifact(artifact, aliases=aliases)
+
+    def close(self):
+        self.run.finish()
\ No newline at end of file
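`WandbLogger` reuses an in-progress wandb run when one exists, otherwise it starts one from the constructor arguments, and `log_model` uploads checkpoints as wandb artifacts. A minimal direct-use sketch (project and run names are illustrative; requires `pip install wandb` and a logged-in account):

```python
from ppocr.utils.loggers import WandbLogger

wandb_logger = WandbLogger(project="PaddleOCR", name="rec-exp-1", save_dir="./output")
wandb_logger.log_metrics({"acc": 0.91}, prefix="EVAL", step=1000)  # logged as "eval/acc"
wandb_logger.close()
```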
diff --git a/tools/program.py b/tools/program.py
index 90fd309ae9e1ae23723d8e67c62a905e79a073d3..7c02dc0149f36085ef05ca378b79d27e92d6dd57 100755
--- a/tools/program.py
+++ b/tools/program.py
@@ -31,6 +31,7 @@ from ppocr.utils.stats import TrainingStats
 from ppocr.utils.save_load import save_model
 from ppocr.utils.utility import print_dict, AverageMeter
 from ppocr.utils.logging import get_logger
+from ppocr.utils.loggers import VDLLogger, WandbLogger, Loggers
 from ppocr.utils import profiler
 from ppocr.data import build_dataloader
 
@@ -161,7 +162,7 @@ def train(config,
           eval_class,
           pre_best_model_dict,
           logger,
-          vdl_writer=None,
+          log_writer=None,
           scaler=None):
     cal_metric_during_train = config['Global'].get('cal_metric_during_train',
                                                    False)
@@ -300,10 +301,8 @@ def train(config,
             stats['lr'] = lr
             train_stats.update(stats)
 
-            if vdl_writer is not None and dist.get_rank() == 0:
-                for k, v in train_stats.get().items():
-                    vdl_writer.add_scalar('TRAIN/{}'.format(k), v, global_step)
-                vdl_writer.add_scalar('TRAIN/lr', lr, global_step)
+            if log_writer is not None and dist.get_rank() == 0:
+                log_writer.log_metrics(metrics=train_stats.get(), prefix="TRAIN", step=global_step)
 
             if dist.get_rank() == 0 and (
                 (global_step > 0 and global_step % print_batch_step == 0) or
@@ -349,11 +348,9 @@ def train(config,
                     logger.info(cur_metric_str)
 
                 # logger metric
-                if vdl_writer is not None:
-                    for k, v in cur_metric.items():
-                        if isinstance(v, (float, int)):
-                            vdl_writer.add_scalar('EVAL/{}'.format(k),
-                                                  cur_metric[k], global_step)
+                if log_writer is not None:
+                    log_writer.log_metrics(metrics=cur_metric, prefix="EVAL", step=global_step)
+
                 if cur_metric[main_indicator] >= best_model_dict[
                         main_indicator]:
                     best_model_dict.update(cur_metric)
@@ -374,10 +371,12 @@ def train(config,
                     ]))
                 logger.info(best_str)
                 # logger best metric
-                if vdl_writer is not None:
-                    vdl_writer.add_scalar('EVAL/best_{}'.format(main_indicator),
-                                          best_model_dict[main_indicator],
-                                          global_step)
+                if log_writer is not None:
+                    log_writer.log_metrics(metrics={
+                        "best_{}".format(main_indicator): best_model_dict[main_indicator]
+                    }, prefix="EVAL", step=global_step)
+
+                    log_writer.log_model(is_best=True, prefix="best_accuracy", metadata=best_model_dict)
 
         reader_start = time.time()
         if dist.get_rank() == 0:
@@ -392,6 +391,10 @@ def train(config,
                 best_model_dict=best_model_dict,
                 epoch=epoch,
                 global_step=global_step)
+
+            if log_writer is not None:
+                log_writer.log_model(is_best=False, prefix="latest")
+
         if dist.get_rank() == 0 and epoch > 0 and epoch % save_epoch_step == 0:
             save_model(
                 model,
@@ -404,11 +407,14 @@ def train(config,
                 best_model_dict=best_model_dict,
                 epoch=epoch,
                 global_step=global_step)
+            if log_writer is not None:
+                log_writer.log_model(is_best=False, prefix='iter_epoch_{}'.format(epoch))
+
     best_str = 'best metric, {}'.format(', '.join(
         ['{}: {}'.format(k, v) for k, v in best_model_dict.items()]))
     logger.info(best_str)
-    if dist.get_rank() == 0 and vdl_writer is not None:
-        vdl_writer.close()
+    if dist.get_rank() == 0 and log_writer is not None:
+        log_writer.close()
     return
 
@@ -565,15 +571,32 @@ def preprocess(is_train=False):
 
     config['Global']['distributed'] = dist.get_world_size() != 1
 
-    if config['Global']['use_visualdl'] and dist.get_rank() == 0:
-        from visualdl import LogWriter
+    loggers = []
+
+    if 'use_visualdl' in config['Global'] and config['Global']['use_visualdl']:
         save_model_dir = config['Global']['save_model_dir']
         vdl_writer_path = '{}/vdl/'.format(save_model_dir)
-        os.makedirs(vdl_writer_path, exist_ok=True)
-        vdl_writer = LogWriter(logdir=vdl_writer_path)
+        log_writer = VDLLogger(vdl_writer_path)
+        loggers.append(log_writer)
+    if ('use_wandb' in config['Global'] and config['Global']['use_wandb']) or 'wandb' in config:
+        save_dir = config['Global']['save_model_dir']
+        wandb_writer_path = "{}/wandb".format(save_dir)
+        if "wandb" in config:
+            wandb_params = config['wandb']
+        else:
+            wandb_params = dict()
+        wandb_params.update({'save_dir': save_dir})
+        log_writer = WandbLogger(**wandb_params, config=config)
+        loggers.append(log_writer)
     else:
-        vdl_writer = None
+        log_writer = None
     print_dict(config, logger)
+
+    if loggers:
+        log_writer = Loggers(loggers)
+    else:
+        log_writer = None
+
     logger.info('train with paddle {} and device {}'.format(paddle.__version__,
                                                             device))
-    return config, device, logger, vdl_writer
+    return config, device, logger, log_writer
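The rewritten `preprocess` builds the logger list purely from config keys (`use_visualdl`, `use_wandb`, and an optional `wandb` section whose entries become `WandbLogger` kwargs). A condensed sketch of the wiring it now performs; the config dict stands in for a parsed training YAML and its values are illustrative:

```python
from ppocr.utils.loggers import VDLLogger, WandbLogger, Loggers

config = {
    "Global": {"use_visualdl": True, "use_wandb": True,
               "save_model_dir": "./output/rec"},
    "wandb": {"project": "PaddleOCR", "entity": "my-team"},  # optional extras
}

loggers = []
if config["Global"].get("use_visualdl"):
    loggers.append(VDLLogger("{}/vdl/".format(config["Global"]["save_model_dir"])))
if config["Global"].get("use_wandb") or "wandb" in config:
    wandb_params = dict(config.get("wandb", {}))
    wandb_params["save_dir"] = config["Global"]["save_model_dir"]
    loggers.append(WandbLogger(**wandb_params, config=config))

# train() receives one composite writer, or None if nothing is enabled.
log_writer = Loggers(loggers) if loggers else None
```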