Commit 0d7ee968 authored by 文幕地方

Merge branch 'dygraph' of https://github.com/PaddlePaddle/PaddleOCR into fix_vqa

......@@ -29,3 +29,5 @@ paddleocr.egg-info/
/deploy/android_demo/app/PaddleLite/
/deploy/android_demo/app/.cxx/
/deploy/android_demo/app/cache/
test_tipc/web/models/
test_tipc/web/node_modules/
......@@ -61,7 +61,7 @@ from combobox import ComboBox
from libs.constants import *
from libs.utils import *
from libs.settings import Settings
from libs.shape import Shape, DEFAULT_LINE_COLOR, DEFAULT_FILL_COLOR
from libs.shape import Shape, DEFAULT_LINE_COLOR, DEFAULT_FILL_COLOR, DEFAULT_LOCK_COLOR
from libs.stringBundle import StringBundle
from libs.canvas import Canvas
from libs.zoomWidget import ZoomWidget
......@@ -126,7 +126,7 @@ class MainWindow(QMainWindow, WindowMixin):
self.labelHist = []
self.lastOpenDir = None
self.result_dic = []
self.result_dic_locked = []
self.changeFileFolder = False
self.haveAutoReced = False
self.labelFile = None
......@@ -395,6 +395,7 @@ class MainWindow(QMainWindow, WindowMixin):
delete = action(getStr('delBox'), self.deleteSelectedShape,
'backspace', 'delete', getStr('delBoxDetail'), enabled=False)
copy = action(getStr('dupBox'), self.copySelectedShape,
'Ctrl+C', 'copy', getStr('dupBoxDetail'),
enabled=False)
......@@ -405,6 +406,7 @@ class MainWindow(QMainWindow, WindowMixin):
showAll = action(getStr('showBox'), partial(self.togglePolygons, True),
'Ctrl+A', 'hide', getStr('showAllBoxDetail'),
enabled=False)
help = action(getStr('tutorial'), self.showTutorialDialog, None, 'help', getStr('tutorialDetail'))
showInfo = action(getStr('info'), self.showInfoDialog, None, 'help', getStr('info'))
......@@ -476,6 +478,10 @@ class MainWindow(QMainWindow, WindowMixin):
undo = action(getStr("undo"), self.undoShapeEdit,
'Ctrl+Z', "undo", getStr("undo"), enabled=False)
lock = action(getStr("lockBox"), self.lockSelectedShape,
None, "lock", getStr("lockBoxDetail"),
enabled=False)
self.editButton.setDefaultAction(edit)
self.newButton.setDefaultAction(create)
......@@ -538,13 +544,13 @@ class MainWindow(QMainWindow, WindowMixin):
fitWindow=fitWindow, fitWidth=fitWidth,
zoomActions=zoomActions, saveLabel=saveLabel,
undo=undo, undoLastPoint=undoLastPoint,open_dataset_dir=open_dataset_dir,
rotateLeft=rotateLeft,rotateRight=rotateRight,
rotateLeft=rotateLeft, rotateRight=rotateRight, lock=lock,
fileMenuActions=(
opendir, open_dataset_dir, saveLabel, resetAll, quit),
beginner=(), advanced=(),
editMenu=(createpoly, edit, copy, delete,singleRere,None, undo, undoLastPoint,
None, rotateLeft, rotateRight, None, color1, self.drawSquaresOption),
beginnerContext=(create, edit, copy, delete, singleRere, rotateLeft, rotateRight,),
None, rotateLeft, rotateRight, None, color1, self.drawSquaresOption, lock),
beginnerContext=(create, edit, copy, delete, singleRere, rotateLeft, rotateRight, lock),
advancedContext=(createMode, editMode, edit, copy,
delete, shapeLineColor, shapeFillColor),
onLoadActive=(
......@@ -998,6 +1004,7 @@ class MainWindow(QMainWindow, WindowMixin):
self.actions.delete.setEnabled(n_selected)
self.actions.copy.setEnabled(n_selected)
self.actions.edit.setEnabled(n_selected == 1)
self.actions.lock.setEnabled(n_selected)
def addLabel(self, shape):
shape.paintLabel = self.displayLabelOption.isChecked()
......@@ -1041,7 +1048,7 @@ class MainWindow(QMainWindow, WindowMixin):
def loadLabels(self, shapes):
s = []
for label, points, line_color, fill_color, difficult in shapes:
shape = Shape(label=label)
shape = Shape(label=label, line_color=line_color)
for x, y in points:
# Ensure the labels are within the bounds of the image. If not, fix them.
......@@ -1051,6 +1058,7 @@ class MainWindow(QMainWindow, WindowMixin):
shape.addPoint(QPointF(x, y))
shape.difficult = difficult
#shape.locked = False
shape.close()
s.append(shape)
......@@ -1063,10 +1071,12 @@ class MainWindow(QMainWindow, WindowMixin):
# shape.fill_color = QColor(*fill_color)
# else:
# shape.fill_color = generateColorByText(label)
self.addLabel(shape)
self.updateComboBox()
self.canvas.loadShapes(s)
def singleLabel(self, shape):
if shape is None:
......@@ -1106,10 +1116,9 @@ class MainWindow(QMainWindow, WindowMixin):
difficult=s.difficult) # bool
shapes = [] if mode == 'Auto' else \
[format_shape(shape) for shape in self.canvas.shapes]
[format_shape(shape) for shape in self.canvas.shapes if shape.line_color != DEFAULT_LOCK_COLOR]
# Can add different annotation formats here
for box in self.result_dic:
for box in self.result_dic :
trans_dic = {"label": box[1][0], "points": box[0], 'difficult': False}
if trans_dic["label"] == "" and mode == 'Auto':
continue
......@@ -1120,7 +1129,6 @@ class MainWindow(QMainWindow, WindowMixin):
for box in shapes:
trans_dic.append({"transcription": box['label'], "points": box['points'], 'difficult': box['difficult']})
self.PPlabel[annotationFilePath] = trans_dic
if mode == 'Auto':
self.Cachelabel[annotationFilePath] = trans_dic
......@@ -1313,6 +1321,7 @@ class MainWindow(QMainWindow, WindowMixin):
# unicodeFilePath = os.path.abspath(unicodeFilePath)
# Tzutalin 20160906 : Add file list and dock to move faster
# Highlight the file item
if unicodeFilePath and self.fileListWidget.count() > 0:
if unicodeFilePath in self.mImgList:
index = self.mImgList.index(unicodeFilePath)
......@@ -1322,6 +1331,7 @@ class MainWindow(QMainWindow, WindowMixin):
###
self.iconlist.clear()
self.additems5(None)
for i in range(5):
item_tooltip = self.iconlist.item(i).toolTip()
# print(i,"---",item_tooltip)
......@@ -1340,7 +1350,6 @@ class MainWindow(QMainWindow, WindowMixin):
if unicodeFilePath and os.path.exists(unicodeFilePath):
self.canvas.verified = False
cvimg = cv2.imdecode(np.fromfile(unicodeFilePath, dtype=np.uint8), 1)
height, width, depth = cvimg.shape
cvimg = cv2.cvtColor(cvimg, cv2.COLOR_BGR2RGB)
......@@ -1361,16 +1370,19 @@ class MainWindow(QMainWindow, WindowMixin):
else:
self.dirty = False
self.actions.save.setEnabled(True)
if len(self.canvas.lockedShapes) != 0:
self.actions.save.setEnabled(True)
self.setDirty()
self.canvas.setEnabled(True)
self.adjustScale(initial=True)
self.paintCanvas()
self.addRecentFile(self.filePath)
self.toggleActions(True)
self.showBoundingBoxFromPPlabel(filePath)
self.setWindowTitle(__appname__ + ' ' + filePath)
# Default : select last item if there is at least one item
if self.labelList.count():
self.labelList.setCurrentItem(self.labelList.item(self.labelList.count() - 1))
......@@ -1380,15 +1392,23 @@ class MainWindow(QMainWindow, WindowMixin):
return True
return False
def showBoundingBoxFromPPlabel(self, filePath):
width, height = self.image.width(), self.image.height()
imgidx = self.getImglabelidx(filePath)
if imgidx not in self.PPlabel.keys():
return
shapes = []
for box in self.PPlabel[imgidx]:
shapes.append((box['transcription'], box['points'], None, None, box['difficult']))
shapes = []
# box['ratio'] of the shapes saved in lockedShapes contains the ratio of the
# four corner coordinates of the shapes to the height and width of the image
for box in self.canvas.lockedShapes:
if self.canvas.isInTheSameImage:
shapes.append((box['transcription'], [[s[0] * width, s[1] * height] for s in box['ratio']],
DEFAULT_LOCK_COLOR, None, box['difficult']))
else:
shapes.append(('锁定框:待检测', [[s[0] * width, s[1] * height] for s in box['ratio']],
DEFAULT_LOCK_COLOR, None, box['difficult']))
if imgidx in self.PPlabel.keys():
for box in self.PPlabel[imgidx]:
shapes.append((box['transcription'], box['points'], None, None, box['difficult']))
self.loadLabels(shapes)
self.canvas.verified = False
......@@ -1646,9 +1666,37 @@ class MainWindow(QMainWindow, WindowMixin):
else:
return fullFilePath
return ''
def saveLockedShapes(self):
self.canvas.lockedShapes = []
self.canvas.selectedShapes = []
for s in self.canvas.shapes:
if s.line_color == DEFAULT_LOCK_COLOR:
self.canvas.selectedShapes.append(s)
self.lockSelectedShape()
for s in self.canvas.shapes[:]:  # iterate over a copy; removing from the list being iterated skips elements
if s.line_color == DEFAULT_LOCK_COLOR:
self.canvas.selectedShapes.remove(s)
self.canvas.shapes.remove(s)
def _saveFile(self, annotationFilePath, mode='Manual'):
if len(self.canvas.lockedShapes) != 0:
self.saveLockedShapes()
if mode == 'Manual':
self.result_dic_locked = []
img = cv2.imread(self.filePath)
width, height = self.image.width(), self.image.height()
for shape in self.canvas.lockedShapes:
box = [[int(p[0]*width), int(p[1]*height)] for p in shape['ratio']]
assert len(box) == 4
result = [(shape['transcription'],1)]
result.insert(0, box)
self.result_dic_locked.append(result)
self.result_dic += self.result_dic_locked
self.result_dic_locked = []
if annotationFilePath and self.saveLabels(annotationFilePath, mode=mode):
self.setClean()
self.statusBar().showMessage('Saved to %s' % annotationFilePath)
......@@ -1663,13 +1711,13 @@ class MainWindow(QMainWindow, WindowMixin):
self.savePPlabel(mode='Auto')
self.fileListWidget.insertItem(int(currIndex), item)
self.openNextImg()
if not self.canvas.isInTheSameImage:
self.openNextImg()
self.actions.saveRec.setEnabled(True)
self.actions.saveLabel.setEnabled(True)
elif mode == 'Auto':
if annotationFilePath and self.saveLabels(annotationFilePath, mode=mode):
self.setClean()
self.statusBar().showMessage('Saved to %s' % annotationFilePath)
self.statusBar().show()
......@@ -1733,7 +1781,9 @@ class MainWindow(QMainWindow, WindowMixin):
if discardChanges == QMessageBox.No:
return True
elif discardChanges == QMessageBox.Yes:
self.canvas.isInTheSameImage = True
self.saveFile()
self.canvas.isInTheSameImage = False
return True
else:
return False
......@@ -1872,6 +1922,7 @@ class MainWindow(QMainWindow, WindowMixin):
# org_box = [dic['points'] for dic in self.PPlabel[self.getImglabelidx(self.filePath)]]
if self.canvas.shapes:
self.result_dic = []
self.result_dic_locked = [] # result_dic_locked stores the ocr result of self.canvas.lockedShapes
rec_flag = 0
for shape in self.canvas.shapes:
box = [[int(p.x()), int(p.y())] for p in shape.points]
......@@ -1883,21 +1934,32 @@ class MainWindow(QMainWindow, WindowMixin):
return
result = self.ocr.ocr(img_crop, cls=True, det=False)
if result[0][0] != '':
result.insert(0, box)
print('result in reRec is ', result)
self.result_dic.append(result)
if shape.line_color == DEFAULT_LOCK_COLOR:
shape.label = result[0][0]
result.insert(0, box)
self.result_dic_locked.append(result)
else:
result.insert(0, box)
self.result_dic.append(result)
else:
print('Can not recognise the box')
self.result_dic.append([box,(self.noLabelText,0)])
if self.noLabelText == shape.label or result[1][0] == shape.label:
print('label no change')
else:
rec_flag += 1
if len(self.result_dic) > 0 and rec_flag > 0:
if shape.line_color == DEFAULT_LOCK_COLOR:
shape.label = result[0][0]
self.result_dic_locked.append([box,(self.noLabelText,0)])
else:
self.result_dic.append([box,(self.noLabelText,0)])
try:
if self.noLabelText == shape.label or result[1][0] == shape.label:
print('label no change')
else:
rec_flag += 1
except IndexError as e:
print('Can not recognise the box')
if (len(self.result_dic) > 0 and rec_flag > 0) or self.canvas.lockedShapes:
self.canvas.isInTheSameImage = True
self.saveFile(mode='Auto')
self.loadFile(self.filePath)
self.canvas.isInTheSameImage = False
self.setDirty()
elif len(self.result_dic) == len(self.canvas.shapes) and rec_flag == 0:
QMessageBox.information(self, "Information", "The recognition result remains unchanged!")
......@@ -2107,6 +2169,44 @@ class MainWindow(QMainWindow, WindowMixin):
self.labelList.clearSelection()
self._noSelectionSlot = False
self.canvas.loadShapes(shapes, replace=replace)
print("loadShapes")#1
def lockSelectedShape(self):
"""lock the selsected shapes.
Add self.selectedShapes to lock self.canvas.lockedShapes,
which holds the ratio of the four coordinates of the locked shapes
to the width and height of the image
"""
width, height = self.image.width(), self.image.height()
def format_shape(s):
return dict(label=s.label, # str
line_color=s.line_color.getRgb(),
fill_color=s.fill_color.getRgb(),
ratio=[[int(p.x()) / width, int(p.y()) / height] for p in s.points],  # QPointF
# add chris
difficult=s.difficult) # bool
#lock
if len(self.canvas.lockedShapes) == 0:
for s in self.canvas.selectedShapes:
s.line_color = DEFAULT_LOCK_COLOR
s.locked = True
shapes = [format_shape(shape) for shape in self.canvas.selectedShapes]
trans_dic = []
for box in shapes:
trans_dic.append({"transcription": box['label'], "ratio": box['ratio'], 'difficult': box['difficult']})
self.canvas.lockedShapes = trans_dic
self.actions.save.setEnabled(True)
#unlock
else:
for s in self.canvas.shapes:
s.line_color = DEFAULT_LINE_COLOR
self.canvas.lockedShapes = []
self.result_dic_locked = []
self.setDirty()
self.actions.save.setEnabled(True)
def inverted(color):
......
......@@ -78,14 +78,14 @@ PPOCRLabel # run
```bash
cd PaddleOCR/PPOCRLabel
python3 setup.py bdist_wheel
pip3 install dist/PPOCRLabel-1.0.0-py2.py3-none-any.whl
pip3 install dist/PPOCRLabel-1.0.2-py2.py3-none-any.whl
```
#### 1.2.3 Run PPOCRLabel by Python Script
```bash
cd ./PPOCRLabel # Switch to the PPOCRLabel directory
python PPOCRLabel.py --lang ch
python PPOCRLabel.py
```
......
......@@ -78,7 +78,7 @@ PPOCRLabel --lang ch # 启动
```bash
cd PaddleOCR/PPOCRLabel
python3 setup.py bdist_wheel
pip3 install dist/PPOCRLabel-1.0.0-py2.py3-none-any.whl -i https://mirror.baidu.com/pypi/simple
pip3 install dist/PPOCRLabel-1.0.2-py2.py3-none-any.whl -i https://mirror.baidu.com/pypi/simple
```
#### 1.2.3 通过Python脚本运行PPOCRLabel
......
......@@ -87,6 +87,10 @@ class Canvas(QWidget):
#initialisation for panning
self.pan_initial_pos = QPoint()
#lockedshapes related
self.lockedShapes = []
self.isInTheSameImage = False
def setDrawingColor(self, qColor):
self.drawingLineColor = qColor
self.drawingRectColor = qColor
......
This diff is collapsed.
......@@ -30,6 +30,7 @@ DEFAULT_SELECT_LINE_COLOR = QColor(255, 255, 255)
DEFAULT_SELECT_FILL_COLOR = QColor(0, 128, 255, 155)
DEFAULT_VERTEX_FILL_COLOR = QColor(0, 255, 0, 255)
DEFAULT_HVERTEX_FILL_COLOR = QColor(255, 0, 0)
DEFAULT_LOCK_COLOR = QColor(255, 0, 255)
MIN_Y_LABEL = 10
......@@ -57,7 +58,7 @@ class Shape(object):
self.selected = False
self.difficult = difficult
self.paintLabel = paintLabel
self.locked = False
self._highlightIndex = None
self._highlightMode = self.NEAR_VERTEX
self._highlightSettings = {
......
......@@ -60,7 +60,7 @@ class StringBundle:
def __createLookupFallbackList(self, localeStr):
resultPaths = []
basePath = "\strings" if os.name == 'nt' else ":/strings"
basePath = "\strings" if os.name == 'nt' else "/strings"
resultPaths.append(basePath)
if localeStr is not None:
# Don't follow standard BCP47. Simple fallback
......
......@@ -104,4 +104,6 @@ singleRe=Re-recognition RectBox
labelDialogOption=Pop-up Label Input Dialog
undo=Undo
undoLastPoint=Undo Last Point
autoSaveMode=Auto Export Label Mode
\ No newline at end of file
autoSaveMode=Auto Export Label Mode
lockBox=Lock selected box/Unlock all box
lockBoxDetail=Lock selected box/Unlock all box
\ No newline at end of file
......@@ -104,4 +104,6 @@ singleRe=重识别此区块
labelDialogOption=弹出标记输入框
undo=撤销
undoLastPoint=撤销上个点
autoSaveMode=自动导出标记结果
\ No newline at end of file
autoSaveMode=自动导出标记结果
lockBox=锁定框/解除锁定框
lockBoxDetail=若当前没有框处于锁定状态则锁定选中的框,若存在锁定框则解除所有锁定框的锁定状态
......@@ -33,7 +33,7 @@ setup(
package_dir={'PPOCRLabel': ''},
include_package_data=True,
entry_points={"console_scripts": ["PPOCRLabel= PPOCRLabel.PPOCRLabel:main"]},
version='1.0.0',
version='1.0.2',
install_requires=requirements,
license='Apache License 2.0',
description='PPOCRLabel is a semi-automatic graphic annotation tool suitable for OCR field, with built-in PPOCR model to automatically detect and re-recognize data. It is written in python3 and pyqt5, supporting rectangular box annotation and four-point annotation modes. Annotations can be directly used for the training of PPOCR detection and recognition models',
......
......@@ -24,7 +24,7 @@ PaddleOCR aims to create multilingual, awesome, leading, and practical OCR tools
**Recent updates**
- 2021.12.21 OCR open source online course starts. The lesson starts at 8:30 every night and lasts for ten days. Free registration: https://aistudio.baidu.com/aistudio/course/introduce/25207
- 2021.12.21 release PaddleOCR v2.4, release 1 text detection algorithm (PSENet), 3 text recognition algorithms (NRTR、SEED、SAR), 1 key information extraction algorithm (SDMGR) and 3 DocVQA algorithms (LayoutLM、LayoutLMv2,LayoutXLM).
- 2021.12.21 release PaddleOCR v2.4, which adds 1 text detection algorithm (PSENet), 3 text recognition algorithms (NRTR, SEED, SAR), 1 key information extraction algorithm (SDMGR, [tutorial](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.4/ppstructure/docs/kie.md)) and 3 DocVQA algorithms (LayoutLM, LayoutLMv2, LayoutXLM, [tutorial](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.4/ppstructure/vqa)).
- PaddleOCR R&D team would like to share the key points of PP-OCRv2, at 20:15 on September 8th, [Course Address](https://aistudio.baidu.com/aistudio/education/group/info/6758).
- 2021.9.7 release PaddleOCR v2.3, [PP-OCRv2](#PP-OCRv2) is proposed. The inference speed of PP-OCRv2 is 220% higher than that of PP-OCR server on CPU devices. The F-score of PP-OCRv2 is 7% higher than that of PP-OCR mobile.
- 2021.8.3 released PaddleOCR v2.2, adding a new structured document analysis toolkit, i.e., [PP-Structure](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.2/ppstructure/README.md), which supports layout analysis and table recognition (one-click export of chart images to Excel files).
......@@ -39,7 +39,7 @@ PaddleOCR aims to create multilingual, awesome, leading, and practical OCR tools
- General PP-OCR server series models: detection (47.1M) + direction classifier (1.4M) + recognition (94.9M) = 143.4M
- Support Chinese, English, and digit recognition, vertical text recognition, and long text recognition
- Support multi-language recognition: about 80 languages, such as Korean, Japanese, German, and French
- document structurize system PP-Structure
- PP-Structure: a document structure analysis system
- support layout analysis and table recognition (support export to Excel)
- support key information extraction
- support DocVQA
......@@ -90,7 +90,7 @@ Mobile DEMO experience (based on EasyEdge and Paddle-Lite, supports iOS and Andr
| Model introduction | Model name | Recommended scene | Detection model | Direction classifier | Recognition model |
| ------------------------------------------------------------ | ---------------------------- | ----------------- | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ |
| Chinese and English ultra-lightweight PP-OCRv2 model(11.6M) | ch_PP-OCRv2_xx |Mobile & Server|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_distill_train.tar)| [inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/ch/ch_PP-OCRv2_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_train.tar)|
| Chinese and English ultra-lightweight PP-OCRv2 model(11.6M) | ch_PP-OCRv2_xx |Mobile & Server|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_distill_train.tar)| [inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_train.tar)|
| Chinese and English ultra-lightweight PP-OCR model (9.4M) | ch_ppocr_mobile_v2.0_xx | Mobile & server |[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_train.tar)|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) |[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_train.tar) |
| Chinese and English general PP-OCR model (143.4M) | ch_ppocr_server_v2.0_xx | Server |[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_train.tar) |[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_traingit.tar) |[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_train.tar) |
......@@ -102,11 +102,11 @@ For a new language request, please refer to [Guideline for new language_requests
## Tutorials
- [Environment Preparation](./doc/doc_en/environment_en.md)
- [Quick Start](./doc/doc_en/quickstart_en.md)
- [PaddleOCR Overview and Installation](./doc/doc_en/paddleOCR_overview_en.md)
- [PaddleOCR Overview and Project Clone](./doc/doc_en/paddleOCR_overview_en.md)
- PP-OCR Industry Landing: from Training to Deployment
- [PP-OCR Model and Configuration](./doc/doc_en/models_and_config_en.md)
- [PP-OCR Model Zoo](./doc/doc_en/models_en.md)
- [PP-OCR Model Download](./doc/doc_en/models_list_en.md)
- [Python Inference for PP-OCR Model Library](./doc/doc_en/inference_ppocr_en.md)
- [Python Inference for PP-OCR Model Zoo](./doc/doc_en/inference_ppocr_en.md)
- [PP-OCR Training](./doc/doc_en/training_en.md)
- [Text Detection](./doc/doc_en/detection_en.md)
- [Text Recognition](./doc/doc_en/recognition_en.md)
......@@ -120,6 +120,8 @@ For a new language request, please refer to [Guideline for new language_requests
- [PP-Structure: Information Extraction](./ppstructure/README.md)
- [Layout Parser](./ppstructure/layout/README.md)
- [Table Recognition](./ppstructure/table/README.md)
- [DocVQA](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.4/ppstructure/vqa)
- [Key Information Extraction](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.4/ppstructure/docs/kie.md)
- Academic Circles
- [Two-stage Algorithm](./doc/doc_en/algorithm_overview_en.md)
- [PGNet Algorithm](./doc/doc_en/pgnet_en.md)
......
......@@ -19,8 +19,8 @@ PaddleOCR旨在打造一套丰富、领先、且实用的OCR工具库,助力
## 近期更新
- 2021.12.21 《OCR十讲》课程开讲,12月21日起每晚八点半线上授课! 【免费】报名地址:https://aistudio.baidu.com/aistudio/course/introduce/25207
- 2021.12.21 发布PaddleOCR v2.4。OCR算法新增1种文本检测算法(PSENet),3种文本识别算法(NRTR、SEED、SAR);文档结构化算法新增1种关键信息提取算法(SDMGR),3种DocVQA算法(LayoutLM、LayoutLMv2,LayoutXLM)。
- 2021.12.21《动手学OCR · 十讲》课程开讲,12月21日起每晚八点半线上授课![免费报名地址](https://aistudio.baidu.com/aistudio/course/introduce/25207)
- 2021.12.21 发布PaddleOCR v2.4。OCR算法新增1种文本检测算法(PSENet),3种文本识别算法(NRTR、SEED、SAR);文档结构化算法新增1种关键信息提取算法(SDMGR[文档](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.4/ppstructure/docs/kie.md)),3种DocVQA算法(LayoutLM、LayoutLMv2,LayoutXLM,[文档](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.4/ppstructure/vqa))。
- PaddleOCR研发团队对最新发版内容技术深入解读,9月8日晚上20:15,[课程回放](https://aistudio.baidu.com/aistudio/education/group/info/6758)
- 2021.9.7 发布PaddleOCR v2.3与[PP-OCRv2](#PP-OCRv2),CPU推理速度相比于PP-OCR server提升220%;效果相比于PP-OCR mobile 提升7%。
- 2021.8.3 发布PaddleOCR v2.2,新增文档结构分析[PP-Structure](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.2/ppstructure/README_ch.md)工具包,支持版面分析与表格识别(含Excel导出)。
......@@ -54,8 +54,7 @@ PaddleOCR旨在打造一套丰富、领先、且实用的OCR工具库,助力
- 加入社区:微信扫描下方二维码加入官方交流群,与各行各业开发者充分交流,期待您的加入。
- 社区贡献:[社区贡献](./doc/doc_ch/thirdparty.md)文档中包含了社区用户**使用PaddleOCR开发的各种工具、应用**以及**为PaddleOCR贡献的功能、优化的文档与代码**等,是官方为社区开发者打造的荣誉墙、也是帮助优质项目宣传的广播站。如果您的OCR项目未被收集在文档中,可根据文档说明与我们联系。最新社区贡献可查看[此处](#社区贡献)
- 社区常规赛:作为社区贡献的具体承载形式,社区常规赛是面向OCR开发者的积分赛事。首届社区常规赛与《动手学OCR · 十讲》课程联合推广,课程详情可参考[链接](https://aistudio.baidu.com/aistudio/course/introduce/25207),课程奖励与作业说明可参考[链接](https://github.com/PaddlePaddle/PaddleOCR/issues/4982)
- 社区常规赛:作为社区贡献的具体承载形式,社区常规赛是面向OCR开发者的积分赛事。首届社区常规赛与[《动手学OCR · 十讲》课程](https://aistudio.baidu.com/aistudio/course/introduce/25207)联合推广。社区常规赛的赛题详情与报名方法可参考[链接](https://github.com/PaddlePaddle/PaddleOCR/issues/4982)
<div align="center">
<img src="https://raw.githubusercontent.com/PaddlePaddle/PaddleOCR/dygraph/doc/joinus.PNG" width = "200" height = "200" />
......@@ -64,22 +63,33 @@ PaddleOCR旨在打造一套丰富、领先、且实用的OCR工具库,助力
## 零代码体验
- 在线网站体验:超轻量PP-OCR mobile模型体验地址:https://www.paddlepaddle.org.cn/hub/scene/ocr
- 移动端:[安装包DEMO下载地址](https://ai.baidu.com/easyedge/app/openSource?from=paddlelite)(基于EasyEdge和Paddle-Lite, 支持iOS和Android系统)
<a name="模型下载"></a>
## PP-OCR系列模型列表(更新中)
| 模型简介 | 模型名称 | 推荐场景 | 检测模型 | 方向分类器 | 识别模型 |
| ------------------------------------- | ----------------------- | --------------- | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ |
| 中英文超轻量PP-OCRv2模型(13.0M) | ch_PP-OCRv2_xx | 移动端&服务器端 | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_distill_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_train.tar) |
| 中英文超轻量PP-OCR mobile模型(9.4M) | ch_ppocr_mobile_v2.0_xx | 移动端&服务器端 | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_pre.tar) |
| 中英文通用PP-OCR server模型(143.4M) | ch_ppocr_server_v2.0_xx | 服务器端 | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_pre.tar) |
更多模型下载(包括多语言),可以参考[PP-OCR 系列模型下载](./doc/doc_ch/models_list.md)
## 文档教程
- [运行环境准备](./doc/doc_ch/environment.md)
- [快速开始(中英文/多语言/文档分析)](./doc/doc_ch/quickstart.md)
- [PaddleOCR全景图与项目克隆](./doc/doc_ch/paddleOCR_overview.md)
- PP-OCR产业落地:从训练到部署
- [PP-OCR模型与配置文件](./doc/doc_ch/models_and_config.md)
- [PP-OCR模型](./doc/doc_ch/models.md)
- [PP-OCR模型下载](./doc/doc_ch/models_list.md)
- [PP-OCR模型库快速推理](./doc/doc_ch/inference_ppocr.md)
- [PP-OCR模型训练](./doc/doc_ch/training.md)
- [文本检测](./doc/doc_ch/detection.md)
- [文本识别](./doc/doc_ch/recognition.md)
- [文本方向分类器](./doc/doc_ch/angle_class.md)
- [知识蒸馏](./doc/doc_ch/knowledge_distillation.md)
- [配置文件内容与生成](./doc/doc_ch/config.md)
- PP-OCR模型推理部署
- [基于C++预测引擎推理](./deploy/cpp_infer/readme.md)
......@@ -89,6 +99,8 @@ PaddleOCR旨在打造一套丰富、领先、且实用的OCR工具库,助力
- [PP-Structure信息提取](./ppstructure/README_ch.md)
- [版面分析](./ppstructure/layout/README_ch.md)
- [表格识别](./ppstructure/table/README_ch.md)
- [DocVQA](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.4/ppstructure/vqa)
- [关键信息提取](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.4/ppstructure/docs/kie.md)
- OCR学术圈
- [两阶段模型介绍与下载](./doc/doc_ch/algorithm_overview.md)
- [端到端PGNet算法](./doc/doc_ch/pgnet.md)
......@@ -119,7 +131,7 @@ PaddleOCR旨在打造一套丰富、领先、且实用的OCR工具库,助力
</div>
[1] PP-OCR是一个实用的超轻量OCR系统。主要由DB文本检测、检测框矫正和CRNN文本识别三部分组成。该系统从骨干网络选择和调整、预测头部的设计、数据增强、学习率变换策略、正则化参数选择、预训练模型使用以及模型自动裁剪量化8个方面,采用19个有效策略,对各个模块的模型进行效果调优和瘦身(如绿框所示),最终得到整体大小为3.5M的超轻量中英文OCR和2.8M的英文数字OCR。更多细节请参考PP-OCR技术方案 https://arxiv.org/abs/2009.09941
[2] PP-OCRv2在PP-OCR的基础上,进一步在5个方面重点优化,检测模型采用CML协同互学习知识蒸馏策略和CopyPaste数据增广策略;识别模型采用LCNet轻量级骨干网络、UDML 改进知识蒸馏策略和Enhanced CTC loss损失函数改进(如上图红框所示),进一步在推理速度和预测效果上取得明显提升。更多细节请参考PP-OCRv2[技术报告](https://arxiv.org/abs/2109.03144)
[2] PP-OCRv2在PP-OCR的基础上,进一步在5个方面重点优化,检测模型采用CML协同互学习知识蒸馏策略和CopyPaste数据增广策略;识别模型采用LCNet轻量级骨干网络、UDML 改进知识蒸馏策略和[Enhanced CTC loss](./doc/doc_ch/enhanced_ctc_loss.md)损失函数改进(如上图红框所示),进一步在推理速度和预测效果上取得明显提升。更多细节请参考PP-OCRv2[技术报告](https://arxiv.org/abs/2109.03144)
<a name="效果展示"></a>
......
......@@ -21,6 +21,7 @@ Architecture:
model_type: det
Models:
Teacher:
pretrained: ./pretrain_models/ch_ppocr_server_v2.0_det_train/best_accuracy
freeze_params: true
return_all_feats: false
model_type: det
......@@ -36,6 +37,7 @@ Architecture:
name: DBHead
k: 50
Student:
pretrained:
freeze_params: false
return_all_feats: false
model_type: det
......@@ -52,6 +54,7 @@ Architecture:
name: DBHead
k: 50
Student2:
pretrained:
freeze_params: false
return_all_feats: false
model_type: det
......
......@@ -18,6 +18,7 @@ Global:
Architecture:
name: DistillationModel
algorithm: Distillation
model_type: det
Models:
Student:
pretrained: ./pretrain_models/MobileNetV3_large_x0_5_pretrained
......
......@@ -18,6 +18,7 @@ Global:
Architecture:
name: DistillationModel
algorithm: Distillation
model_type: det
Models:
Student:
pretrained: ./pretrain_models/MobileNetV3_large_x0_5_pretrained
......
......@@ -18,8 +18,8 @@ python3.7 -m pip install paddle2onnx
- 安装 ONNX
```
# 建议安装 1.4.0 版本,可根据环境更换版本号
python3.7 -m pip install onnxruntime==1.4.0
# 建议安装 1.9.0 版本,可根据环境更换版本号
python3.7 -m pip install onnxruntime==1.9.0
```
## 2. 模型转换
......@@ -47,13 +47,15 @@ paddle2onnx --model_dir=./inference/ch_ppocr_mobile_v2.0_det_infer/ \
--params_filename=inference.pdiparams \
--save_file=./inference/det_mobile_onnx/model.onnx \
--opset_version=10 \
--input_shape_dict="{'x': [-1, 3, -1, -1]}" \
--enable_onnx_checker=True
```
执行完毕后,ONNX 模型会被保存在 `./inference/det_mobile_onnx/` 路径下
* 注意:以下几个模型暂不支持转换为 ONNX 模型:
NRTR、SAR、RARE、SRN
* 注意:对于OCR模型,转化过程中必须采用动态shape的形式,即加入选项--input_shape_dict="{'x': [-1, 3, -1, -1]}",否则预测结果可能与直接使用Paddle预测有细微不同。
另外,以下几个模型暂不支持转换为 ONNX 模型:
NRTR、SAR、RARE、SRN
## 3. onnx 预测
......@@ -72,5 +74,3 @@ root INFO: 1.jpg [[[291, 295], [334, 292], [348, 844], [305, 847]], [[344, 296]
The predict time of ../../doc/imgs/1.jpg: 0.06162881851196289
The visualized image saved in ./inference_results/det_res_1.jpg
```
* 注意:ONNX暂时不支持变长预测,需要将输入resize到固定输入,预测结果可能与直接使用Paddle预测有细微不同。
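Below is a minimal sketch of running the converted detection model directly with ONNX Runtime. This is an illustration rather than the official prediction script: the model path and the input name `x` follow the conversion command above, and the full pre/post-processing of `tools/infer/predict_det.py` (exact normalization constants, box decoding) is simplified here as an assumption.

```python
import cv2
import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession("./inference/det_mobile_onnx/model.onnx")

img = cv2.imread("../../doc/imgs/1.jpg")
h, w = img.shape[:2]
# DB-style detectors downsample by 32, so resize to multiples of 32
nh, nw = (h + 31) // 32 * 32, (w + 31) // 32 * 32
x = cv2.resize(img, (nw, nh)).astype("float32") / 255.0
x = (x - 0.5) / 0.5                      # simplified normalization (assumption)
x = x.transpose(2, 0, 1)[None]           # NCHW; dynamic shape as exported above

outputs = sess.run(None, {"x": x})
print(outputs[0].shape)                  # probability map of text regions
```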
......@@ -57,7 +57,7 @@ PaddleOCR基于动态图开源的文本识别算法列表:
- [x] SAR([paper](https://arxiv.org/abs/1811.00751v2))
- [x] SEED([paper](https://arxiv.org/pdf/2005.10977.pdf))
参考[DTRB][3](https://arxiv.org/abs/1904.01906)文字识别训练和评估流程,使用MJSynth和SynthText两个文字识别数据集训练,在IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE数据集上进行评估,算法效果如下:
参考[DTRB](https://arxiv.org/abs/1904.01906)[3]文字识别训练和评估流程,使用MJSynth和SynthText两个文字识别数据集训练,在IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE数据集上进行评估,算法效果如下:
|模型|骨干网络|Avg Accuracy|模型存储命名|下载链接|
|---|---|---|---|---|
......
......@@ -146,7 +146,7 @@ PaddleOCR欢迎大家向repo中积极贡献代码,下面给出一些贡献代
-`远程仓库` Clone到本地
```
# 拉取develop分支的代码
# 拉取dygraph分支的代码
git clone https://github.com/{your_name}/PaddleOCR.git -b dygraph
cd PaddleOCR
```
......@@ -191,11 +191,11 @@ git checkout -b new_branch
也可以基于远程或者上游的分支创建新的分支,命令如下。
```
# 基于用户远程仓库(origin)的develop创建new_branch分支
git checkout -b new_branch origin/develop
# 基于上游远程仓库(upstream)的develop创建new_branch分支
# 基于用户远程仓库(origin)的dygraph创建new_branch分支
git checkout -b new_branch origin/dygraph
# 基于上游远程仓库(upstream)的dygraph创建new_branch分支
# 如果需要从upstream创建新的分支,需要首先使用git fetch upstream获取上游代码
git checkout -b new_branch upstream/develop
git checkout -b new_branch upstream/dygraph
```
最终会显示切换到新的分支,输出信息如下
......@@ -246,8 +246,8 @@ git commit -m "your commit info"
```
git fetch upstream
# 如果是希望提交到其他分支,则需要从upstream的其他分支pull代码,这里是develop
git pull upstream develop
# 如果是希望提交到其他分支,则需要从upstream的其他分支pull代码,这里是dygraph
git pull upstream dygraph
```
#### 3.2.7 push到远程仓库
......
## 数据合成工具
# 数据合成工具
除了开源数据,用户还可使用合成工具自行合成。这里整理了常用的数据合成工具,持续更新中,欢迎各位小伙伴贡献工具~
- [text_renderer](https://github.com/Sanster/text_renderer)
- [SynthText](https://github.com/ankush-me/SynthText)
......@@ -6,3 +6,4 @@
- [TextRecognitionDataGenerator](https://github.com/Belval/TextRecognitionDataGenerator)
- [SynthText3D](https://github.com/MhLiao/SynthText3D)
- [UnrealText](https://github.com/Jyouhou/UnrealText/)
- [SynthTIGER](https://github.com/clovaai/synthtiger)
\ No newline at end of file
......@@ -78,11 +78,11 @@ json.dumps编码前的图像标注信息是包含多个字典的list,字典中
cd PaddleOCR/
# 根据backbone的不同选择下载对应的预训练模型
# 下载MobileNetV3的预训练模型
wget -P ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV3_large_x0_5_pretrained.pdparams
wget -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/pretrained/MobileNetV3_large_x0_5_pretrained.pdparams
# 或,下载ResNet18_vd的预训练模型
wget -P ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNet18_vd_pretrained.pdparams
wget -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/pretrained/ResNet18_vd_pretrained.pdparams
# 或,下载ResNet50_vd的预训练模型
wget -P ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNet50_vd_ssld_pretrained.pdparams
wget -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/pretrained/ResNet50_vd_ssld_pretrained.pdparams
```
<a name="2-----"></a>
......
......@@ -281,4 +281,274 @@ paddle.save(s_params, "ch_PP-OCRv2_rec_train/student.pdparams")
### 2.2 检测配置文件解析
* coming soon!
检测模型蒸馏的配置文件在PaddleOCR/configs/det/ch_PP-OCRv2/目录下,包含三个蒸馏配置文件:
- ch_PP-OCRv2_det_cml.yml,采用cml蒸馏,采用一个大模型蒸馏两个小模型,且两个小模型互相学习的方法
- ch_PP-OCRv2_det_dml.yml,采用DML的蒸馏,两个Student模型互蒸馏的方法
- ch_PP-OCRv2_det_distill.yml,采用Teacher大模型蒸馏小模型Student的方法
#### 2.2.1 模型结构
知识蒸馏任务中,模型结构配置如下所示:
```
Architecture:
name: DistillationModel # 结构名称,蒸馏任务中,为DistillationModel,用于构建对应的结构
algorithm: Distillation # 算法名称
Models: # 模型,包含子网络的配置信息
Student: # 子网络名称,至少需要包含`pretrained`与`freeze_params`信息,其他的参数为子网络的构造参数
pretrained: ./pretrain_models/MobileNetV3_large_x0_5_pretrained
freeze_params: false # 是否需要固定参数
return_all_feats: false # 子网络的参数,表示是否需要返回所有的features,如果为False,则只返回最后的输出
model_type: det
algorithm: DB
Backbone:
name: MobileNetV3
scale: 0.5
model_name: large
disable_se: True
Neck:
name: DBFPN
out_channels: 96
Head:
name: DBHead
k: 50
Teacher: # 另外一个子网络,这里给的是普通大模型蒸小模型的蒸馏示例,
pretrained: ./pretrain_models/ch_ppocr_server_v2.0_det_train/best_accuracy
freeze_params: true # Teacher模型是训练好的,不需要参与训练,freeze_params设置为True
return_all_feats: false
model_type: det
algorithm: DB
Transform:
Backbone:
name: ResNet
layers: 18
Neck:
name: DBFPN
out_channels: 256
Head:
name: DBHead
k: 50
```
如果是采用DML,即两个小模型互相学习的方法,上述配置文件里的Teacher网络结构需要设置为Student模型一样的配置,具体参考配置文件[ch_PP-OCRv2_det_dml.yml](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.4/configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_dml.yml)
下面介绍[ch_PP-OCRv2_det_cml.yml](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.4/configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml)的配置文件参数:
```
Architecture:
name: DistillationModel
algorithm: Distillation
model_type: det
Models:
Teacher: # CML蒸馏的Teacher模型配置
pretrained: ./pretrain_models/ch_ppocr_server_v2.0_det_train/best_accuracy
freeze_params: true # Teacher 不训练
return_all_feats: false
model_type: det
algorithm: DB
Transform:
Backbone:
name: ResNet
layers: 18
Neck:
name: DBFPN
out_channels: 256
Head:
name: DBHead
k: 50
Student: # CML蒸馏的Student模型配置
pretrained: ./pretrain_models/MobileNetV3_large_x0_5_pretrained
freeze_params: false
return_all_feats: false
model_type: det
algorithm: DB
Backbone:
name: MobileNetV3
scale: 0.5
model_name: large
disable_se: True
Neck:
name: DBFPN
out_channels: 96
Head:
name: DBHead
k: 50
Student2: # CML蒸馏的Student2模型配置
pretrained: ./pretrain_models/MobileNetV3_large_x0_5_pretrained
freeze_params: false
return_all_feats: false
model_type: det
algorithm: DB
Transform:
Backbone:
name: MobileNetV3
scale: 0.5
model_name: large
disable_se: True
Neck:
name: DBFPN
out_channels: 96
Head:
name: DBHead
k: 50
```
蒸馏模型`DistillationModel`类的具体实现代码可以参考[distillation_model.py](../../ppocr/modeling/architectures/distillation_model.py)
最终模型`forward`输出为一个字典,key为所有的子网络名称,例如这里为`Student`与`Teacher`,value为对应子网络的输出,可以为`Tensor`(只返回该网络的最后一层)或`dict`(也返回了中间的特征信息)。
在蒸馏任务中,为了方便添加蒸馏损失函数,每个网络的输出保存为`dict`,其中包含子模块输出。每个子网络的输出结果均为`dict`,key包含`backbone_out`、`neck_out`、`head_out`,`value`为对应模块的tensor。最终对于上述配置文件,`DistillationModel`的输出格式如下。
```json
{
"Teacher": {
"backbone_out": tensor,
"neck_out": tensor,
"head_out": tensor,
},
"Student": {
"backbone_out": tensor,
"neck_out": tensor,
"head_out": tensor,
}
}
```
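To make the format above concrete, the toy snippet below builds a stand-in for this output (random tensors in place of real features, with arbitrary shapes) and shows how downstream components index it by sub-network name and module key:

```python
import paddle

# Dummy stand-in for DistillationModel's forward output described above
preds = {
    name: {
        "backbone_out": paddle.rand([1, 96, 160, 160]),
        "neck_out": paddle.rand([1, 96, 160, 160]),
        "head_out": paddle.rand([1, 1, 640, 640]),
    }
    for name in ("Teacher", "Student")
}

# Distillation losses pick tensors by sub-network name and module key
student_maps = preds["Student"]["head_out"]
teacher_maps = preds["Teacher"]["head_out"]
print(student_maps.shape == teacher_maps.shape)  # True
```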
#### 2.2.2 损失函数
知识蒸馏任务中,检测ch_PP-OCRv2_det_distill.yml蒸馏损失函数配置如下所示。
```yaml
Loss:
name: CombinedLoss # 损失函数名称,基于改名称,构建用于损失函数的类
loss_config_list: # 损失函数配置文件列表,为CombinedLoss的必备函数
- DistillationDilaDBLoss: # 基于蒸馏的DB损失函数,继承自标准的DBloss
weight: 1.0 # 损失函数的权重,loss_config_list中,每个损失函数的配置都必须包含该字段
model_name_pairs: # 对于蒸馏模型的预测结果,提取这两个子网络的输出,计算Teacher模型和Student模型输出的loss
- ["Student", "Teacher"]
key: maps # 取子网络输出dict中,该key对应的tensor
balance_loss: true # 以下几个参数为标准DBloss的配置参数
main_loss_type: DiceLoss
alpha: 5
beta: 10
ohem_ratio: 3
- DistillationDBLoss: # 基于蒸馏的DB损失函数,继承自标准的DBloss,用于计算Student和GT之间的loss
weight: 1.0
model_name_list: ["Student"] # 模型名字只有Student,表示计算Student和GT之间的loss
name: DBLoss
balance_loss: true
main_loss_type: DiceLoss
alpha: 5
beta: 10
ohem_ratio: 3
```
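The `model_name_pairs` plus `key` mechanism can be pictured with the toy function below. It is only a sketch: the real `DistillationDilaDBLoss` applies DB-specific dilation and loss terms (see `ppocr/losses/distillation_loss.py`), while a plain MSE stands in for them here.

```python
import paddle
import paddle.nn.functional as F

def pairwise_distill_loss(preds, model_name_pairs, key="maps"):
    """Toy pair-based distillation loss over the named sub-network outputs."""
    total = paddle.zeros([1])
    for student_name, teacher_name in model_name_pairs:
        s = preds[student_name][key]
        t = preds[teacher_name][key].detach()  # the Teacher branch is frozen
        total += F.mse_loss(s, t)              # stand-in for the DB-style loss
    return total

fake = {n: {"maps": paddle.rand([1, 3, 160, 160])} for n in ("Student", "Teacher")}
print(pairwise_distill_loss(fake, [["Student", "Teacher"]]))
```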
同理,检测ch_PP-OCRv2_det_cml.yml蒸馏损失函数配置如下所示。相比较于ch_PP-OCRv2_det_distill.yml的损失函数配置,cml蒸馏的损失函数配置做了3个改动:
```yaml
Loss:
name: CombinedLoss
loss_config_list:
- DistillationDilaDBLoss:
weight: 1.0
model_name_pairs:
- ["Student", "Teacher"]
- ["Student2", "Teacher"] # 改动1,计算两个Student和Teacher的损失
key: maps
balance_loss: true
main_loss_type: DiceLoss
alpha: 5
beta: 10
ohem_ratio: 3
- DistillationDMLLoss: # 改动2,增加计算两个Student之间的损失
model_name_pairs:
- ["Student", "Student2"]
maps_name: "thrink_maps"
weight: 1.0
# act: None
key: maps
- DistillationDBLoss:
weight: 1.0
model_name_list: ["Student", "Student2"] # 改动3,计算两个Student和GT之间的损失
balance_loss: true
main_loss_type: DiceLoss
alpha: 5
beta: 10
ohem_ratio: 3
```
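Change 2 above (`DistillationDMLLoss`) makes the two Student models learn from each other; conceptually this is a symmetric divergence between their probability maps. The sketch below shows only the idea: the actual implementation over `thrink_maps` differs in detail.

```python
import paddle

def toy_dml_loss(p1, p2, eps=1e-6):
    """Symmetric binary KL between two probability maps (illustration only)."""
    p1 = p1.clip(eps, 1 - eps)
    p2 = p2.clip(eps, 1 - eps)

    def kl(a, b):
        return (a * (a / b).log() + (1 - a) * ((1 - a) / (1 - b)).log()).mean()

    return (kl(p1, p2) + kl(p2, p1)) / 2

s1 = paddle.rand([1, 1, 160, 160])
s2 = paddle.rand([1, 1, 160, 160])
print(toy_dml_loss(s1, s2))
```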
关于`DistillationDilaDBLoss`更加具体的实现可以参考: [distillation_loss.py](https://github.com/PaddlePaddle/PaddleOCR/blob/release%2F2.4/ppocr/losses/distillation_loss.py#L185)。关于`DistillationDBLoss`等蒸馏损失函数更加具体的实现可以参考[distillation_loss.py](https://github.com/PaddlePaddle/PaddleOCR/blob/04c44974b13163450dfb6bd2c327863f8a194b3c/ppocr/losses/distillation_loss.py?_pjax=%23js-repo-pjax-container%2C%20div%5Bitemtype%3D%22http%3A%2F%2Fschema.org%2FSoftwareSourceCode%22%5D%20main%2C%20%5Bdata-pjax-container%5D#L148)
#### 2.2.3 后处理
知识蒸馏任务中,检测蒸馏后处理配置如下所示。
```yaml
PostProcess:
name: DistillationDBPostProcess # DB检测蒸馏任务的CTC解码后处理,继承自标准的DBPostProcess类
model_name: ["Student", "Student2", "Teacher"] # 对于蒸馏模型的预测结果,提取多个子网络的输出,进行解码,不需要后处理的网络可以不在model_name中设置
thresh: 0.3
box_thresh: 0.6
max_candidates: 1000
unclip_ratio: 1.5
```
以上述配置为例,最终会同时计算`Student`、`Student2`和`Teacher` 3个子网络的输出做后处理计算。同时,由于有多个输入,后处理返回的输出也有多个,分别对应各子网络的解码结果。
关于`DistillationDBPostProcess`更加具体的实现可以参考: [db_postprocess.py](../../ppocr/postprocess/db_postprocess.py#L195)
#### 2.2.4 蒸馏指标计算
知识蒸馏任务中,检测蒸馏指标计算配置如下所示。
```yaml
Metric:
name: DistillationMetric
base_metric_name: DetMetric
main_indicator: hmean
key: "Student"
```
由于蒸馏需要包含多个网络,甚至多个Student网络,在计算指标的时候只需要计算一个Student网络的指标即可,`key`字段设置为`Student`则表示只计算`Student`网络的精度。
#### 2.2.5 检测蒸馏模型finetune
检测蒸馏有三种方式:
- 采用ch_PP-OCRv2_det_distill.yml,Teacher模型设置为PaddleOCR提供的模型或者您训练好的大模型
- 采用ch_PP-OCRv2_det_cml.yml,采用cml蒸馏,同样Teacher模型设置为PaddleOCR提供的模型或者您训练好的大模型
- 采用ch_PP-OCRv2_det_dml.yml,采用DML的蒸馏,两个Student模型互蒸馏的方法,在PaddleOCR采用的数据集上大约有1.7%的精度提升。
在具体finetune时,需要在网络结构的`pretrained`参数中设置要加载的预训练模型。
在精度提升方面,cml的精度>dml的精度>distill蒸馏方法的精度。当数据量不足或者Teacher模型精度与Student精度相差不大的时候,这个结论或许会改变。
另外,由于PaddleOCR提供的蒸馏预训练模型包含了多个模型的参数,如果您希望提取Student模型的参数,可以参考如下代码:
```
# 下载蒸馏训练模型的参数
wget https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_distill_train.tar
```
```python
import paddle
# 加载预训练模型
all_params = paddle.load("ch_PP-OCRv2_det_distill_train/best_accuracy.pdparams")
# 查看权重参数的keys
print(all_params.keys())
# 学生模型的权重提取
s_params = {key[len("Student."):]: all_params[key] for key in all_params if "Student." in key}
# 查看学生模型权重参数的keys
print(s_params.keys())
# 保存
paddle.save(s_params, "ch_PP-OCRv2_det_distill_train/student.pdparams")
```
最终`Student`模型的参数将会保存在`ch_PP-OCRv2_det_distill_train/student.pdparams`中,用于模型的fine-tune。
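As a hedged follow-up, loading the extracted weights for fine-tuning amounts to a `set_state_dict` call. The snippet below uses a toy layer as a stand-in for the Student architecture, which in practice is built from the config by PaddleOCR's own training tools:

```python
import paddle
import paddle.nn as nn

# Toy stand-in for the Student architecture (the real model comes from the config)
model = nn.Linear(4, 4)
paddle.save(model.state_dict(), "student.pdparams")    # pretend extraction output
model.set_state_dict(paddle.load("student.pdparams"))  # load weights for fine-tuning
```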
# PP-OCR模型与配置文件
PP-OCR模型与配置文件一章主要补充一些OCR模型的基本概念、配置文件的内容与作用以便对模型后续的参数调整和训练中拥有更好的体验。
本章包含三个部分,首先在[PP-OCR模型下载](./models_list.md)中解释PP-OCR模型的类型概念,并提供所有模型的下载链接。然后在[配置文件内容与生成](./config.md)中详细说明调整PP-OCR模型所需的参数。最后的[模型库快速使用](./inference_ppocr.md)是对第一节PP-OCR模型库使用方法的介绍,可以通过Python推理引擎快速利用丰富的模型库模型获得测试结果。
------
下面我们首先了解一些OCR相关的基本概念:
- [1. OCR 简要介绍](#1-ocr-----)
* [1.1 OCR 检测模型基本概念](#11-ocr---------)
* [1.2 OCR 识别模型基本概念](#12-ocr---------)
* [1.3 PP-OCR模型](#13-pp-ocr--)
<a name="1-ocr-----"></a>
## 1. OCR 简要介绍
本节简要介绍OCR检测模型、识别模型的基本概念,并介绍PaddleOCR的PP-OCR模型。
OCR(Optical Character Recognition,光学字符识别)目前是文字识别的统称,已不限于文档或书本文字识别,更包括识别自然场景下的文字,又可以称为STR(Scene Text Recognition)。
OCR文字识别一般包括两个部分,文本检测和文本识别;文本检测首先利用检测算法检测到图像中的文本行;然后检测到的文本行用识别算法去识别到具体文字。
<a name="11-ocr---------"></a>
### 1.1 OCR 检测模型基本概念
文本检测就是要定位图像中的文字区域,然后通常以边界框的形式将单词或文本行标记出来。传统的文字检测算法多是通过手工提取特征的方式,特点是速度快,简单场景效果好,但是面对自然场景,效果会大打折扣。当前多是采用深度学习方法来做。
基于深度学习的文本检测算法可以大致分为以下几类:
1. 基于目标检测的方法;一般是预测得到文本框后,通过NMS筛选得到最终文本框,多是四点文本框,对弯曲文本场景效果不理想。典型算法为EAST、Text Box等方法。
2. 基于分割的方法;将文本行当成分割目标,然后通过分割结果构建外接文本框,可以处理弯曲文本,对于文本交叉场景问题效果不理想。典型算法为DB、PSENet等方法。
3. 混合目标检测和分割的方法;
<a name="12-ocr---------"></a>
### 1.2 OCR 识别模型基本概念
OCR识别算法的输入数据一般是文本行,背景信息不多,文字占据主要部分,识别算法目前可以分为两类算法:
1. 基于CTC的方法;即识别算法的文字预测模块是基于CTC的,常用的算法组合为CNN+RNN+CTC。目前也有一些算法尝试在网络中加入transformer模块等等。
2. 基于Attention的方法;即识别算法的文字预测模块是基于Attention的,常用算法组合是CNN+RNN+Attention。
<a name="13-pp-ocr--"></a>
### 1.3 PP-OCR模型
PaddleOCR 中集成了很多OCR算法,文本检测算法有DB、EAST、SAST等等,文本识别算法有CRNN、RARE、StarNet、Rosetta、SRN等算法。
其中PaddleOCR针对中英文自然场景通用OCR,推出了PP-OCR系列模型,PP-OCR模型由DB+CRNN算法组成,利用海量中文数据训练加上模型调优方法,在中文场景上具备较高的文本检测识别能力。并且PaddleOCR推出了高精度超轻量PP-OCRv2模型,检测模型仅3M,识别模型仅8.5M,利用[PaddleSlim](https://github.com/PaddlePaddle/PaddleSlim)的模型量化方法,可以在保持精度不降低的情况下,将检测模型压缩到0.8M,识别压缩到3M,更加适用于移动端部署场景。
......@@ -143,8 +143,10 @@ PaddleOCR主要聚焦通用OCR,如果有垂类需求,您可以用PaddleOCR+
具体的训练教程可点击下方链接跳转:
\- [文本检测模型训练](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.3/doc/doc_ch/detection.md)
- [文本检测模型训练](./detection.md)
\- [文本识别模型训练](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.3/doc/doc_ch/recognition.md)
- [文本识别模型训练](./recognition.md)
- [文本方向分类器训练](./angle_class.md)
- [知识蒸馏](./knowledge_distillation.md)
\- [文本方向分类器训练](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.3/doc/doc_ch/angle_class.md)
......@@ -9,3 +9,4 @@ There are the commonly used data synthesis tools, which will be continuously upd
* [TextRecognitionDataGenerator](https://github.com/Belval/TextRecognitionDataGenerator)
* [SynthText3D](https://github.com/MhLiao/SynthText3D)
* [UnrealText](https://github.com/Jyouhou/UnrealText/)
* [SynthTIGER](https://github.com/clovaai/synthtiger)
\ No newline at end of file
......@@ -67,11 +67,11 @@ And the responding download link of backbone pretrain weights can be found in (h
```shell
cd PaddleOCR/
# Download the pre-trained model of MobileNetV3
wget -P ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV3_large_x0_5_pretrained.pdparams
wget -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/pretrained/MobileNetV3_large_x0_5_pretrained.pdparams
# or, download the pre-trained model of ResNet18_vd
wget -P ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNet18_vd_pretrained.pdparams
wget -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/pretrained/ResNet18_vd_pretrained.pdparams
# or, download the pre-trained model of ResNet50_vd
wget -P ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNet50_vd_ssld_pretrained.pdparams
wget -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/pretrained/ResNet50_vd_ssld_pretrained.pdparams
```
......
# PP-OCR Model and Configuration
The chapter on PP-OCR models and configuration files mainly introduces some basic concepts of OCR models and the content and role of configuration files, so that you have a better experience in the subsequent parameter adjustment and training of the models.
This chapter contains three parts. First, [PP-OCR Model Download](./models_list_en.md) explains the concept of PP-OCR model types and provides links to download all models. Then, [Yml Configuration](./config_en.md) details the parameters needed to fine-tune the PP-OCR models. The final [Python Inference for PP-OCR Model Library](./inference_ppocr_en.md) introduces the use of the PP-OCR model library from the first section, so you can quickly obtain test results from the rich model library through the Python inference engine.
------
Let's first understand some basic concepts.
- [INTRODUCTION ABOUT OCR](#introduction-about-ocr)
* [BASIC CONCEPTS OF OCR DETECTION MODEL](#basic-concepts-of-ocr-detection-model)
* [Basic concepts of OCR recognition model](#basic-concepts-of-ocr-recognition-model)
* [PP-OCR model](#pp-ocr-model)
## 1. INTRODUCTION ABOUT OCR
This section briefly introduces the basic concepts of OCR detection model and recognition model, and introduces PaddleOCR's PP-OCR model.
OCR (Optical Character Recognition) is currently the general term for text recognition. It is not limited to document or book text recognition, but also includes recognizing text in natural scenes, which is also called STR (Scene Text Recognition).
OCR text recognition generally includes two parts: text detection and text recognition. The text detection module first uses detection algorithms to detect text lines in the image, and then the recognition algorithm identifies the specific text in each detected text line.
### 1.1 BASIC CONCEPTS OF OCR DETECTION MODEL
Text detection locates the text area in the image and then usually marks the word or text line in the form of a bounding box. Traditional text detection algorithms mostly extract features manually; they are fast and work well in simple scenes, but their performance drops sharply in natural scenes. Currently, deep learning methods are mostly used.
Text detection algorithms based on deep learning can be roughly divided into the following categories:
1. Methods based on object detection. Generally, text boxes are predicted and then filtered through NMS to obtain the final boxes; these are mostly four-point boxes, which are not ideal for curved text. Typical algorithms are EAST and TextBoxes.
2. Methods based on segmentation. The text line is treated as the segmentation target, and bounding text boxes are then constructed from the segmentation result; this can handle curved text, but is not ideal for scenes where text lines cross. Typical algorithms are DB and PSENet; a toy sketch of this idea follows this list.
3. Hybrid methods that combine object detection and segmentation.
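As a toy illustration of the segmentation-based idea (binarize a probability map, then turn each connected text region into a box), consider the sketch below. Real DB additionally shrinks and unclips regions with pyclipper, which is omitted here; OpenCV 4 is assumed.

```python
import cv2
import numpy as np

# Fake probability map with one horizontal "text" stripe
prob_map = np.zeros((160, 160), dtype=np.float32)
prob_map[60:80, 20:140] = 0.9

binary = (prob_map > 0.3).astype(np.uint8)  # threshold, as in DB post-processing
contours, _ = cv2.findContours(binary, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
boxes = [cv2.boxPoints(cv2.minAreaRect(c)) for c in contours]
print(len(boxes), boxes[0].astype(int))     # one rotated four-point box
```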
### 1.2 Basic concepts of OCR recognition model
The input of an OCR recognition algorithm is generally a text-line image with little background information, where text occupies the main part. Current recognition algorithms can be divided into two classes (a minimal sketch follows this list):
1. CTC-based methods: the text prediction module is based on CTC, and the commonly used combination is CNN+RNN+CTC. Some algorithms also try to add Transformer modules to the network.
2. Attention-based methods: the text prediction module is based on Attention, and the commonly used combination is CNN+RNN+Attention.
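To make the CTC-style pipeline concrete, here is a minimal, hypothetical CNN+RNN recognizer skeleton in Paddle. It is not the PP-OCR architecture; it only shows how the feature height is collapsed so that the width becomes the time axis over which per-step logits are produced for CTC decoding.

```python
import paddle
import paddle.nn as nn

class TinyCRNN(nn.Layer):
    """Minimal CNN+RNN sketch; 37 classes = 26 letters + 10 digits + CTC blank."""
    def __init__(self, num_classes=37):
        super().__init__()
        self.cnn = nn.Sequential(
            nn.Conv2D(3, 32, 3, stride=2, padding=1), nn.ReLU(),
            nn.Conv2D(32, 64, 3, stride=(2, 1), padding=1), nn.ReLU(),
            nn.AdaptiveAvgPool2D((1, None)),  # collapse height, keep width as time
        )
        self.rnn = nn.LSTM(64, 48, direction="bidirect")
        self.fc = nn.Linear(96, num_classes)  # per-time-step logits for CTC

    def forward(self, x):                            # x: [N, 3, 32, W]
        feat = self.cnn(x)                           # [N, 64, 1, T]
        feat = feat.squeeze(2).transpose([0, 2, 1])  # [N, T, 64]
        seq, _ = self.rnn(feat)                      # [N, T, 96]
        return self.fc(seq)                          # [N, T, num_classes]

logits = TinyCRNN()(paddle.randn([1, 3, 32, 100]))
print(logits.shape)  # [1, 50, 37]
```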
### 1.3 PP-OCR model
PaddleOCR integrates many OCR algorithms, text detection algorithms include DB, EAST, SAST, etc., text recognition algorithms include CRNN, RARE, StarNet, Rosetta, SRN and other algorithms.
Among them, PaddleOCR has released the PP-OCR series models for general OCR in Chinese and English natural scenes. The PP-OCR model is composed of the DB and CRNN algorithms; it uses massive Chinese training data plus model tuning methods to achieve strong text detection and recognition in Chinese scenes. PaddleOCR has also launched the high-precision, ultra-lightweight PP-OCRv2 model: the detection model is only 3M and the recognition model only 8.5M. Using [PaddleSlim](https://github.com/PaddlePaddle/PaddleSlim)'s model quantization method, the detection model can be compressed to 0.8M and the recognition model to 3M without reducing accuracy, which is more suitable for mobile deployment scenarios.
doc/joinus.PNG: binary image changed (185.9 KB → 199.8 KB).
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"# 文本检测FAQ\n",
"\n",
"本节罗列一些开发者们使用PaddleOCR的文本检测模型常遇到的一些问题,并给出相应的问题解决方法或建议。\n",
"\n",
"FAQ分两个部分来介绍,分别是:\n",
" - 文本检测训练相关\n",
" - 文本检测预测相关"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"## 1. 文本检测训练相关FAQ\n",
"\n",
"**1.1 PaddleOCR提供的文本检测算法包括哪些?**\n",
"\n",
"**A**:PaddleOCR中包含多种文本检测模型,包括基于回归的文本检测方法EAST、SAST,和基于分割的文本检测方法DB,PSENet。\n",
"\n",
"\n",
"**1.2:请问PaddleOCR项目中的中文超轻量和通用模型用了哪些数据集?训练多少样本,gpu什么配置,跑了多少个epoch,大概跑了多久?**\n",
"\n",
"**A**:对于超轻量DB检测模型,训练数据包括开源数据集lsvt,rctw,CASIA,CCPD,MSRA,MLT,BornDigit,iflytek,SROIE和合成的数据集等,总数据量越10W,数据集分为5个部分,训练时采用随机采样策略,在4卡V100GPU上约训练500epoch,耗时3天。\n",
"\n",
"\n",
"**1.3 文本检测训练标签是否需要具体文本标注,标签中的”###”是什么意思?**\n",
"\n",
"**A**:文本检测训练只需要文本区域的坐标即可,标注可以是四点或者十四点,按照左上,右上,右下,左下的顺序排列。PaddleOCR提供的标签文件中包含文本字段,对于文本区域文字不清晰会使用###代替。训练检测模型时,不会用到标签中的文本字段。\n",
" \n",
"**1.4 对于文本行较紧密的情况下训练的文本检测模型效果较差?**\n",
"\n",
"**A**:使用基于分割的方法,如DB,检测密集文本行时,最好收集一批数据进行训练,并且在训练时,并将生成二值图像的[shrink_ratio](https://github.com/PaddlePaddle/PaddleOCR/blob/8b656a3e13631dfb1ac21d2095d4d4a4993ef710/ppocr/data/imaug/make_shrink_map.py?_pjax=%23js-repo-pjax-container%2C%20div%5Bitemtype%3D%22http%3A%2F%2Fschema.org%2FSoftwareSourceCode%22%5D%20main%2C%20%5Bdata-pjax-container%5D#L37)参数调小一些。另外,在预测的时候,可以适当减小[unclip_ratio](https://github.com/PaddlePaddle/PaddleOCR/blob/8b656a3e13631dfb1ac21d2095d4d4a4993ef710/configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml?_pjax=%23js-repo-pjax-container%2C%20div%5Bitemtype%3D%22http%3A%2F%2Fschema.org%2FSoftwareSourceCode%22%5D%20main%2C%20%5Bdata-pjax-container%5D#L59)参数,unclip_ratio参数值越大检测框就越大。\n",
"\n",
"\n",
"**1.5 对于一些尺寸较大的文档类图片, DB在检测时会有较多的漏检,怎么避免这种漏检的问题呢?**\n",
"\n",
"**A**:首先,需要确定是模型没有训练好的问题还是预测时处理的问题。如果是模型没有训练好,建议多加一些数据进行训练,或者在训练的时候多加一些数据增强。\n",
"如果是预测图像过大的问题,可以增大预测时输入的最长边设置参数[det_limit_side_len](https://github.com/PaddlePaddle/PaddleOCR/blob/8b656a3e13631dfb1ac21d2095d4d4a4993ef710/tools/infer/utility.py?_pjax=%23js-repo-pjax-container%2C%20div%5Bitemtype%3D%22http%3A%2F%2Fschema.org%2FSoftwareSourceCode%22%5D%20main%2C%20%5Bdata-pjax-container%5D#L47),默认为960。\n",
"其次,可以通过可视化后处理的分割图观察漏检的文字是否有分割结果,如果没有分割结果,说明是模型没有训练好。如果有完整的分割区域,说明是预测后处理的问题,建议调整[DB后处理参数](https://github.com/PaddlePaddle/PaddleOCR/blob/8b656a3e13631dfb1ac21d2095d4d4a4993ef710/tools/infer/utility.py?_pjax=%23js-repo-pjax-container%2C%20div%5Bitemtype%3D%22http%3A%2F%2Fschema.org%2FSoftwareSourceCode%22%5D%20main%2C%20%5Bdata-pjax-container%5D#L51-L53)。\n",
"\n",
"\n",
"**1.6 DB模型弯曲文本(如略微形变的文档图像)漏检问题?**\n",
"\n",
"**A**: DB后处理中计算文本框平均得分时,是求rectangle区域的平均分数,容易造成弯曲文本漏检,已新增求polygon区域的平均分数,会更准确,但速度有所降低,可按需选择,在相关pr中可查看[可视化对比效果](https://github.com/PaddlePaddle/PaddleOCR/pull/2604)。该功能通过参数 [det_db_score_mode](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/tools/infer/utility.py#L51)进行选择,参数值可选[`fast`(默认)、`slow`],`fast`对应原始的rectangle方式,`slow`对应polygon方式。感谢用户[buptlihang](https://github.com/buptlihang)提[pr](https://github.com/PaddlePaddle/PaddleOCR/pull/2574)帮助解决该问题。\n",
"\n",
"\n",
"**1.7 简单的对于精度要求不高的OCR任务,数据集需要准备多少张呢?**\n",
"\n",
"**A**:(1)训练数据的数量和需要解决问题的复杂度有关系。难度越大,精度要求越高,则数据集需求越大,而且一般情况实际中的训练数据越多效果越好。\n",
"\n",
"(2)对于精度要求不高的场景,检测任务和识别任务需要的数据量是不一样的。对于检测任务,500张图像可以保证基本的检测效果。对于识别任务,需要保证识别字典中每个字符出现在不同场景的行文本图像数目需要大于200张(举例,如果有字典中有5个字,每个字都需要出现在200张图片以上,那么最少要求的图像数量应该在200-1000张之间),这样可以保证基本的识别效果。\n",
"\n",
"\n",
"**1.8 当训练数据量少时,如何获取更多的数据?**\n",
"\n",
"**A**:当训练数据量少时,可以尝试以下三种方式获取更多的数据:(1)人工采集更多的训练数据,最直接也是最有效的方式。(2)基于PIL和opencv基本图像处理或者变换。例如PIL中ImageFont, Image, ImageDraw三个模块将文字写到背景中,opencv的旋转仿射变换,高斯滤波等。(3)利用数据生成算法合成数据,例如pix2pix等算法。\n",
"\n",
"\n",
"**1.9 如何更换文本检测/识别的backbone?**\n",
"\n",
"A:无论是文字检测,还是文字识别,骨干网络的选择是预测效果和预测效率的权衡。一般,选择更大规模的骨干网络,例如ResNet101_vd,则检测或识别更准确,但预测耗时相应也会增加。而选择更小规模的骨干网络,例如MobileNetV3_small_x0_35,则预测更快,但检测或识别的准确率会大打折扣。幸运的是不同骨干网络的检测或识别效果与在ImageNet数据集图像1000分类任务效果正相关。飞桨图像分类套件PaddleClas汇总了ResNet_vd、Res2Net、HRNet、MobileNetV3、GhostNet等23种系列的分类网络结构,在上述图像分类任务的top1识别准确率,GPU(V100和T4)和CPU(骁龙855)的预测耗时以及相应的117个预训练模型下载地址。\n",
"\n",
"(1)文字检测骨干网络的替换,主要是确定类似与ResNet的4个stages,以方便集成后续的类似FPN的检测头。此外,对于文字检测问题,使用ImageNet训练的分类预训练模型,可以加速收敛和效果提升。\n",
"\n",
"(2)文字识别的骨干网络的替换,需要注意网络宽高stride的下降位置。由于文本识别一般宽高比例很大,因此高度下降频率少一些,宽度下降频率多一些。可以参考[PaddleOCR中MobileNetV3骨干网络的改动](https://github.com/PaddlePaddle/PaddleOCR/blob/release%2F2.3/ppocr/modeling/backbones/rec_mobilenet_v3.py)。\n",
"\n",
"\n",
"**1.10 如何对检测模型finetune,比如冻结前面的层或某些层使用小的学习率学习?**\n",
"\n",
"**A**:如果是冻结某些层,可以将变量的stop_gradient属性设置为True,这样计算这个变量之前的所有参数都不会更新了,参考:https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/faq/train_cn.html#id4\n",
"\n",
"如果对某些层使用更小的学习率学习,静态图里还不是很方便,一个方法是在参数初始化的时候,给权重的属性设置固定的学习率,参考:https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api/paddle/fluid/param_attr/ParamAttr_cn.html#paramattr\n",
"\n",
"实际上我们实验发现,直接加载模型去fine-tune,不设置某些层不同学习率,效果也都不错\n",
"\n",
"**1.11 DB的预处理部分,图片的长和宽为什么要处理成32的倍数?**\n",
"\n",
"**A**:和网络下采样的倍数(stride)有关。以检测中的resnet骨干网络为例,图像输入网络之后,需要经过5次2倍降采样,共32倍,因此建议输入的图像尺寸为32的倍数。\n",
"\n",
"\n",
"**1.12 在PP-OCR系列的模型中,文本检测的骨干网络为什么没有使用SEBlock?**\n",
"\n",
"**A**:SE模块是MobileNetV3网络一个重要模块,目的是估计特征图每个特征通道重要性,给特征图每个特征分配权重,提高网络的表达能力。但是,对于文本检测,输入网络的分辨率比较大,一般是640\\*640,利用SE模块估计特征图每个特征通道重要性比较困难,网络提升能力有限,但是该模块又比较耗时,因此在PP-OCR系统中,文本检测的骨干网络没有使用SE模块。实验也表明,当去掉SE模块,超轻量模型大小可以减小40%,文本检测效果基本不受影响。详细可以参考PP-OCR技术文章,https://arxiv.org/abs/2009.09941.\n",
"\n",
"\n",
"**1.13 PP-OCR检测效果不好,该如何优化?**\n",
"\n",
"A: 具体问题具体分析:\n",
"- 如果在你的场景上检测效果不可用,首选是在你的数据上做finetune训练;\n",
"- 如果图像过大,文字过于密集,建议不要过度压缩图像,可以尝试修改检测预处理的resize逻辑,防止图像被过度压缩;\n",
"- 检测框大小过于紧贴文字或检测框过大,可以调整db_unclip_ratio这个参数,加大参数可以扩大检测框,减小参数可以减小检测框大小;\n",
"- 检测框存在很多漏检问题,可以减小DB检测后处理的阈值参数det_db_box_thresh,防止一些检测框被过滤掉,也可以尝试设置det_db_score_mode为'slow';\n",
"- 其他方法可以选择use_dilation为True,对检测输出的feature map做膨胀处理,一般情况下,会有效果改善;\n",
"\n",
"\n",
"## 2. 文本检测预测相关FAQ\n",
"\n",
"**2.1 DB有些框太贴文本了反而去掉了一些文本的边角影响识别,这个问题有什么办法可以缓解吗?**\n",
"\n",
"**A**:可以把后处理的参数[unclip_ratio](https://github.com/PaddlePaddle/PaddleOCR/blob/d80afce9b51f09fd3d90e539c40eba8eb5e50dd6/tools/infer/utility.py?_pjax=%23js-repo-pjax-container%2C%20div%5Bitemtype%3D%22http%3A%2F%2Fschema.org%2FSoftwareSourceCode%22%5D%20main%2C%20%5Bdata-pjax-container%5D#L52)适当调大一点,该参数越大文本框越大。\n",
"\n",
"\n",
"**2.2 为什么PaddleOCR检测预测是只支持一张图片测试?即test_batch_size_per_card=1**\n",
"\n",
"**A**:预测的时候,对图像等比例缩放,最长边960,不同图像等比例缩放后长宽不一致,无法组成batch,所以设置为test_batch_size为1。\n",
"\n",
"\n",
"**2.3 在CPU上加速PaddleOCR的文本检测模型预测?**\n",
"\n",
"**A**:x86 CPU可以使用mkldnn(OneDNN)进行加速;在支持mkldnn加速的CPU上开启[enable_mkldnn](https://github.com/PaddlePaddle/PaddleOCR/blob/8b656a3e13631dfb1ac21d2095d4d4a4993ef710/tools/infer/utility.py#L105)参数。另外,配合增加CPU上预测使用的[线程数num_threads](https://github.com/PaddlePaddle/PaddleOCR/blob/8b656a3e13631dfb1ac21d2095d4d4a4993ef710/tools/infer/utility.py#L106),可以有效加快CPU上的预测速度。\n",
"\n",
"**2.4 在GPU上加速PaddleOCR的文本检测模型预测?**\n",
"\n",
"**A**:GPU加速预测推荐使用TensorRT。\n",
"- 1. 从[链接](https://paddleinference.paddlepaddle.org.cn/master/user_guides/download_lib.html)下载带TensorRT的Paddle安装包或者预测库。\n",
"- 2. 从Nvidia官网下载TensorRT版本,注意下载的TensorRT版本与paddle安装包中编译的TensorRT版本一致。\n",
"- 3. 设置环境变量LD_LIBRARY_PATH,指向TensorRT的lib文件夹\n",
"```\n",
"export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:<TensorRT-${version}/lib>\n",
"```\n",
"- 4. 开启PaddleOCR预测的[tensorrt选项](https://github.com/PaddlePaddle/PaddleOCR/blob/8b656a3e13631dfb1ac21d2095d4d4a4993ef710/tools/infer/utility.py?_pjax=%23js-repo-pjax-container%2C%20div%5Bitemtype%3D%22http%3A%2F%2Fschema.org%2FSoftwareSourceCode%22%5D%20main%2C%20%5Bdata-pjax-container%5D#L38)。\n",
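"\n",
"A minimal sketch with the `paddleocr` pip package, assuming it forwards these flags to tools/infer/utility.py (the precision value is illustrative):\n",
"\n",
"```python\n",
"from paddleocr import PaddleOCR\n",
"\n",
"# TensorRT engines are built on the first run, so the first image is slow.\n",
"ocr = PaddleOCR(use_gpu=True, use_tensorrt=True, precision='fp16')\n",
"result = ocr.ocr('your_image.jpg')  # placeholder image path\n",
"```\n",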
"\n",
"**2.5 如何在移动端部署PaddleOCR模型?**\n",
"\n",
"**A**: 飞桨Paddle有专门针对移动端部署的工具[PaddleLite](https://github.com/PaddlePaddle/Paddle-Lite),并且PaddleOCR提供了DB+CRNN为demo的android arm部署代码,参考[链接](https://github.com/PaddlePaddle/PaddleOCR/blob/release%2F2.3/deploy/lite/readme.md)。\n",
"\n",
"\n",
"**2.6 如何使用PaddleOCR多进程预测?**\n",
"\n",
"**A**: 近期PaddleOCR新增了[多进程预测控制参数](https://github.com/PaddlePaddle/PaddleOCR/blob/8b656a3e13631dfb1ac21d2095d4d4a4993ef710/tools/infer/utility.py?_pjax=%23js-repo-pjax-container%2C%20div%5Bitemtype%3D%22http%3A%2F%2Fschema.org%2FSoftwareSourceCode%22%5D%20main%2C%20%5Bdata-pjax-container%5D#L111),`use_mp`表示是否使用多进程,`total_process_num`表示在使用多进程时的进程数。具体使用方式请参考[文档](https://github.com/PaddlePaddle/PaddleOCR/blob/release%2F2.3/doc/doc_ch/inference.md#1-%E8%B6%85%E8%BD%BB%E9%87%8F%E4%B8%AD%E6%96%87ocr%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86)。\n",
"\n",
"**2.7 预测时显存爆炸、内存泄漏问题?**\n",
"\n",
"**A**: 如果是训练模型的预测,由于模型太大或者输入图像太大导致显存不够用,可以参考代码在主函数运行前加上paddle.no_grad(),即可减小显存占用。如果是inference模型预测时显存占用过高,可以配置Config时,加入[config.enable_memory_optim()](https://github.com/PaddlePaddle/PaddleOCR/blob/8b656a3e13631dfb1ac21d2095d4d4a4993ef710/tools/infer/utility.py?_pjax=%23js-repo-pjax-container%2C%20div%5Bitemtype%3D%22http%3A%2F%2Fschema.org%2FSoftwareSourceCode%22%5D%20main%2C%20%5Bdata-pjax-container%5D#L267)用于减小内存占用。\n",
"\n",
"另外关于使用Paddle预测时出现内存泄漏的问题,建议安装paddle最新版本,内存泄漏已修复。"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "py35-paddle1.2.0"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"\n",
"# 文本识别算法理论\n",
"\n",
"本章主要介绍文本识别算法的理论知识,包括背景介绍、算法分类和部分经典论文思路。\n",
"\n",
"通过本章的学习,你可以掌握:\n",
"\n",
"1. 文本识别的目标\n",
"\n",
"2. 文本识别算法的分类\n",
"\n",
"3. 各类算法的典型思想\n",
"\n",
"\n",
"## 1 背景介绍\n",
"\n",
"文本识别是OCR(Optical Character Recognition)的一个子任务,其任务为识别一个固定区域的的文本内容。在OCR的两阶段方法里,它接在文本检测后面,将图像信息转换为文字信息。\n",
"\n",
"具体地,模型输入一张定位好的文本行,由模型预测出图片中的文字内容和置信度,可视化结果如下图所示:\n",
"\n",
"<center><img src=https://ai-studio-static-online.cdn.bcebos.com/a7c3404f778b489db9c1f686c7d2ff4d63b67c429b454f98b91ade7b89f8e903 width=\"600\"></center>\n",
"\n",
"<center><img src=https://ai-studio-static-online.cdn.bcebos.com/e72b1d6f80c342ac951d092bc8c325149cebb3763ec849ec8a2f54e7c8ad60ca width=\"600\"></center>\n",
"\n",
"\n",
"文本识别的应用场景很多,有文档识别、路标识别、车牌识别、工业编号识别等等,根据实际场景可以把文本识别任务分为两个大类:**规则文本识别**和**不规则文本识别**。\n",
"\n",
"* 规则文本识别:主要指印刷字体、扫描文本等,认为文本大致处在水平线位置\n",
"\n",
"* 不规则文本识别: 往往出现在自然场景中,且由于文本曲率、方向、变形等方面差异巨大,文字往往不在水平位置,存在弯曲、遮挡、模糊等问题。\n",
"\n",
"\n",
"下图展示的是 IC15 和 IC13 的数据样式,它们分别代表了不规则文本和规则文本。可以看出不规则文本往往存在扭曲、模糊、字体差异大等问题,更贴近真实场景,也存在更大的挑战性。\n",
"\n",
"因此目前各大算法都试图在不规则数据集上获得更高的指标。\n",
"\n",
"<center><img src=https://ai-studio-static-online.cdn.bcebos.com/bae4fce1370b4751a3779542323d0765a02a44eace7b44d2a87a241c13c6f8cf width=\"400\">\n",
"<br><center>IC15 图片样例(不规则文本)</center>\n",
"<img src=https://ai-studio-static-online.cdn.bcebos.com/b55800d3276f4f5fad170ea1b567eb770177fce226f945fba5d3247a48c15c34 width=\"400\"></center>\n",
"<br><center>IC13 图片样例(规则文本)</center>\n",
"\n",
"\n",
"不同的识别算法在对比能力时,往往也在这两大类公开数据集上比较。对比多个维度上的效果,目前较为通用的英文评估集合分类如下:\n",
"\n",
"<center><img src=https://ai-studio-static-online.cdn.bcebos.com/4d0aada261064031a16816b39a37f2ff6af70dbb57004cb7a106ae6485f14684 width=\"600\"></center>\n",
"\n",
"## 2 文本识别算法分类\n",
"\n",
"在传统的文本识别方法中,任务分为3个步骤,即图像预处理、字符分割和字符识别。需要对特定场景进行建模,一旦场景变化就会失效。面对复杂的文字背景和场景变动,基于深度学习的方法具有更优的表现。\n",
"\n",
"多数现有的识别算法可用如下统一框架表示,算法流程被划分为4个阶段:\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/a2750f4170864f69a3af36fc13db7b606d851f2f467d43cea6fbf3521e65450f)\n",
"\n",
"\n",
"我们整理了主流的算法类别和主要论文,参考下表:\n",
"\n",
"<center>\n",
" \n",
"| 算法类别 | 主要思路 | 主要论文 |\n",
"| -------- | --------------- | -------- |\n",
"| 传统算法 | 滑动窗口、字符提取、动态规划 | - |\n",
"| ctc | 基于ctc的方法,序列不对齐,更快速识别 | CRNN, Rosetta |\n",
"| Attention | 基于attention的方法,应用于非常规文本 | RARE, DAN, PREN |\n",
"| Transformer | 基于transformer的方法 | SRN, NRTR, Master, ABINet |\n",
"| 校正 | 校正模块学习文本边界并校正成水平方向 | RARE, ASTER, SAR | \n",
"| 分割 | 基于分割的方法,提取字符位置再做分类 | Text Scanner, Mask TextSpotter |\n",
" \n",
"</center>\n",
"\n",
"\n",
"### 2.1 规则文本识别\n",
"\n",
"\n",
"文本识别的主流算法有两种,分别是基于 CTC (Conectionist Temporal Classification) 的算法和 Sequence2Sequence 算法,区别主要在解码阶段。\n",
"\n",
"基于 CTC 的算法是将编码产生的序列接入 CTC 进行解码;基于 Sequence2Sequence 的方法则是把序列接入循环神经网络(Recurrent Neural Network, RNN)模块进行循环解码,两种方式都验证有效也是主流的两大做法。\n",
"\n",
"<center><img src=https://ai-studio-static-online.cdn.bcebos.com/f64eee66e4a6426f934c1befc3b138629324cf7360c74f72bd6cf3c0de9d49bd width=\"600\"></center>\n",
"<br><center>左:基于CTC的方法,右:基于Sequece2Sequence的方法 </center>\n",
"\n",
"\n",
"#### 2.1.1 基于CTC的算法\n",
"\n",
"基于 CTC 最典型的算法是CRNN (Convolutional Recurrent Neural Network)[1],它的特征提取部分使用主流的卷积结构,常用的有ResNet、MobileNet、VGG等。由于文本识别任务的特殊性,输入数据中存在大量的上下文信息,卷积神经网络的卷积核特性使其更关注于局部信息,缺乏长依赖的建模能力,因此仅使用卷积网络很难挖掘到文本之间的上下文联系。为了解决这一问题,CRNN文本识别算法引入了双向 LSTM(Long Short-Term Memory) 用来增强上下文建模,通过实验证明双向LSTM模块可以有效的提取出图片中的上下文信息。最终将输出的特征序列输入到CTC模块,直接解码序列结果。该结构被验证有效,并广泛应用在文本识别任务中。Rosetta[2]是FaceBook提出的识别网络,由全卷积模型和CTC组成。Gao Y[3]等人使用CNN卷积替代LSTM,参数更少,性能提升精度持平。\n",
"\n",
"<center><img src=https://ai-studio-static-online.cdn.bcebos.com/d3c96dd9e9794fddb12fa16f926abdd3485194f0a2b749e792e436037490899b width=\"600\"></center>\n",
"<center> CRNN 结构图 </center>\n",
"\n",
"\n",
"#### 2.1.2 Sequence2Sequence 算法\n",
"\n",
"Sequence2Sequence 算法是由编码器 Encoder 把所有的输入序列都编码成一个统一的语义向量,然后再由解码器Decoder解码。在解码器Decoder解码的过程中,不断地将前一个时刻的输出作为后一个时刻的输入,循环解码,直到输出停止符为止。一般编码器是一个RNN,对于每个输入的词,编码器输出向量和隐藏状态,并将隐藏状态用于下一个输入的单词,循环得到语义向量;解码器是另一个RNN,它接收编码器输出向量并输出一系列字以创建转换。受到 Sequence2Sequence 在翻译领域的启发, Shi[4]提出了一种基于注意的编解码框架来识别文本,通过这种方式,rnn能够从训练数据中学习隐藏在字符串中的字符级语言模型。\n",
"\n",
"<center><img src=https://ai-studio-static-online.cdn.bcebos.com/f575333696b7438d919975dc218e61ccda1305b638c5497f92b46a7ec3b85243 width=\"400\" hight=\"500\"></center>\n",
"<center> Sequence2Sequence 结构图 </center>\n",
"\n",
"以上两个算法在规则文本上都有很不错的效果,但由于网络设计的局限性,这类方法很难解决弯曲和旋转的不规则文本识别任务。为了解决这类问题,部分算法研究人员在以上两类算法的基础上提出了一系列改进算法。\n",
"\n",
"### 2.2 不规则文本识别\n",
"\n",
"* 不规则文本识别算法可以被分为4大类:基于校正的方法;基于 Attention 的方法;基于分割的方法;基于 Transformer 的方法。\n",
"\n",
"#### 2.2.1 基于校正的方法\n",
"\n",
"基于校正的方法利用一些视觉变换模块,将非规则的文本尽量转换为规则文本,然后使用常规方法进行识别。\n",
"\n",
"RARE[4]模型首先提出了对不规则文本的校正方案,整个网络分为两个主要部分:一个空间变换网络STN(Spatial Transformer Network) 和一个基于Sequence2Squence的识别网络。其中STN就是校正模块,不规则文本图像进入STN,通过TPS(Thin-Plate-Spline)变换成一个水平方向的图像,该变换可以一定程度上校正弯曲、透射变换的文本,校正后送入序列识别网络进行解码。\n",
"\n",
"<center><img src=https://ai-studio-static-online.cdn.bcebos.com/66406f89507245e8a57969b9bed26bfe0227a8cf17a84873902dd4a464b97bb5 width=\"600\"></center>\n",
"<center> RARE 结构图 </center>\n",
"\n",
"RARE论文指出,该方法在不规则文本数据集上有较大的优势,特别比较了CUTE80和SVTP这两个数据集,相较CRNN高出5个百分点以上,证明了校正模块的有效性。基于此[6]同样结合了空间变换网络(STN)和基于注意的序列识别网络的文本识别系统。\n",
"\n",
"基于校正的方法有较好的迁移性,除了RARE这类基于Attention的方法外,STAR-Net[5]将校正模块应用到基于CTC的算法上,相比传统CRNN也有很好的提升。\n",
"\n",
"#### 2.2.2 基于Attention的方法\n",
"\n",
"基于 Attention 的方法主要关注的是序列之间各部分的相关性,该方法最早在机器翻译领域提出,认为在文本翻译的过程中当前词的结果主要由某几个单词影响的,因此需要给有决定性的单词更大的权重。在文本识别领域也是如此,将编码后的序列解码时,每一步都选择恰当的context来生成下一个状态,这样有利于得到更准确的结果。\n",
"\n",
"R^2AM [7] 首次将 Attention 引入文本识别领域,该模型首先将输入图像通过递归卷积层提取编码后的图像特征,然后利用隐式学习到的字符级语言统计信息通过递归神经网络解码输出字符。在解码过程中引入了Attention 机制实现了软特征选择,以更好地利用图像特征,这一有选择性的处理方式更符合人类的直觉。\n",
"\n",
"<center><img src=https://ai-studio-static-online.cdn.bcebos.com/a64ef10d4082422c8ac81dcda4ab75bf1db285d6b5fd462a8f309240445654d5 width=\"600\"></center>\n",
"<center> R^2AM 结构图 </center>\n",
"\n",
"后续有大量算法在Attention领域进行探索和更新,例如SAR[8]将1D attention拓展到2D attention上,校正模块提到的RARE也是基于Attention的方法。实验证明基于Attention的方法相比CTC的方法有很好的精度提升。\n",
"\n",
"<center><img src=https://ai-studio-static-online.cdn.bcebos.com/4e2507fb58d94ec7a9b4d17151a986c84c5053114e05440cb1e7df423d32cb02 width=\"600\"></center>\n",
"\n",
"\n",
"#### 2.2.3 基于分割的方法\n",
"\n",
"基于分割的方法是将文本行的各字符作为独立个体,相比与对整个文本行做矫正后识别,识别分割出的单个字符更加容易。它试图从输入的文本图像中定位每个字符的位置,并应用字符分类器来获得这些识别结果,将复杂的全局问题简化成了局部问题解决,在不规则文本场景下有比较不错的效果。然而这种方法需要字符级别的标注,数据获取上存在一定的难度。Lyu[9]等人提出了一种用于单词识别的实例分词模型,该模型在其识别部分使用了基于 FCN(Fully Convolutional Network) 的方法。[10]从二维角度考虑文本识别问题,设计了一个字符注意FCN来解决文本识别问题,当文本弯曲或严重扭曲时,该方法对规则文本和非规则文本都具有较优的定位结果。\n",
"\n",
"<center><img src=https://ai-studio-static-online.cdn.bcebos.com/fd3e8ef0d6ce4249b01c072de31297ca5d02fc84649846388f890163b624ff10 width=\"800\"></center>\n",
"<center> Mask TextSpotter 结构图 </center>\n",
"\n",
"\n",
"\n",
"#### 2.2.4 基于Transformer的方法\n",
"\n",
"随着 Transformer 的快速发展,分类和检测领域都验证了 Transformer 在视觉任务中的有效性。如规则文本识别部分所说,CNN在长依赖建模上存在局限性,Transformer 结构恰好解决了这一问题,它可以在特征提取器中关注全局信息,并且可以替换额外的上下文建模模块(LSTM)。\n",
"\n",
"一部分文本识别算法使用 Transformer 的 Encoder 结构和卷积共同提取序列特征,Encoder 由多个 MultiHeadAttentionLayer 和 Positionwise Feedforward Layer 堆叠而成的block组成。MulitHeadAttention 中的 self-attention 利用矩阵乘法模拟了RNN的时序计算,打破了RNN中时序长时依赖的障碍。也有一部分算法使用 Transformer 的 Decoder 模块解码,相比传统RNN可获得更强的语义信息,同时并行计算具有更高的效率。\n",
"\n",
"SRN[11] 算法将Transformer的Encoder模块接在ResNet50后,增强了2D视觉特征。并提出了一个并行注意力模块,将读取顺序用作查询,使得计算与时间无关,最终并行输出所有时间步长的对齐视觉特征。此外SRN还利用Transformer的Eecoder作为语义模块,将图片的视觉信息和语义信息做融合,在遮挡、模糊等不规则文本上有较大的收益。\n",
"\n",
"NRTR[12] 使用了完整的Transformer结构对输入图片进行编码和解码,只使用了简单的几个卷积层做高层特征提取,在文本识别上验证了Transformer结构的有效性。\n",
"\n",
"<center><img src=https://ai-studio-static-online.cdn.bcebos.com/e7859f4469a842f0bd450e7e793a679d6e828007544241d09785c9b4ea2424a2 width=\"800\"></center>\n",
"<center> NRTR 结构图 </center>\n",
"\n",
"SRACN[13]使用Transformer的解码器替换LSTM,再一次验证了并行训练的高效性和精度优势。\n",
"\n",
"## 3 总结\n",
"\n",
"本节主要介绍了文本识别相关的理论知识和主流算法,包括基于CTC的方法、基于Sequence2Sequence的方法以及基于分割的方法,并分别列举了经典论文的思路和贡献。下一节将基于CRNN算法进行实践课程讲解,从组网到优化完成整个训练过程,\n",
"\n",
"## 4 参考文献\n",
"\n",
"\n",
"[1]Shi, B., Bai, X., & Yao, C. (2016). An end-to-end trainable neural network for image-based sequence recognition and its application to scene text recognition. IEEE transactions on pattern analysis and machine intelligence, 39(11), 2298-2304.\n",
"\n",
"[2]Fedor Borisyuk, Albert Gordo, and Viswanath Sivakumar. Rosetta: Large scale system for text detection and recognition in images. In Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining, pages 71–79. ACM, 2018.\n",
"\n",
"[3]Gao, Y., Chen, Y., Wang, J., & Lu, H. (2017). Reading scene text with attention convolutional sequence modeling. arXiv preprint arXiv:1709.04303.\n",
"\n",
"[4]Shi, B., Wang, X., Lyu, P., Yao, C., & Bai, X. (2016). Robust scene text recognition with automatic rectification. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 4168-4176).\n",
"\n",
"[5] Star-Net Max Jaderberg, Karen Simonyan, Andrew Zisserman, et al. Spa- tial transformer networks. In Advances in neural information processing systems, pages 2017–2025, 2015.\n",
"\n",
"[6]Baoguang Shi, Mingkun Yang, XingGang Wang, Pengyuan Lyu, Xiang Bai, and Cong Yao. Aster: An attentional scene text recognizer with flexible rectification. IEEE transactions on pattern analysis and machine intelligence, 31(11):855–868, 2018.\n",
"\n",
"[7] Lee C Y , Osindero S . Recursive Recurrent Nets with Attention Modeling for OCR in the Wild[C]// IEEE Conference on Computer Vision & Pattern Recognition. IEEE, 2016.\n",
"\n",
"[8]Li, H., Wang, P., Shen, C., & Zhang, G. (2019, July). Show, attend and read: A simple and strong baseline for irregular text recognition. In Proceedings of the AAAI Conference on Artificial Intelligence (Vol. 33, No. 01, pp. 8610-8617).\n",
"\n",
"[9]P. Lyu, C. Yao, W. Wu, S. Yan, and X. Bai. Multi-oriented scene text detection via corner localization and region segmentation. In Proc. CVPR, pages 7553–7563, 2018.\n",
"\n",
"[10] Liao, M., Zhang, J., Wan, Z., Xie, F., Liang, J., Lyu, P., ... & Bai, X. (2019, July). Scene text recognition from two-dimensional perspective. In Proceedings of the AAAI Conference on Artificial Intelligence (Vol. 33, No. 01, pp. 8714-8721).\n",
"\n",
"[11] Yu, D., Li, X., Zhang, C., Liu, T., Han, J., Liu, J., & Ding, E. (2020). Towards accurate scene text recognition with semantic reasoning networks. In Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (pp. 12113-12122).\n",
"\n",
"[12] Sheng, F., Chen, Z., & Xu, B. (2019, September). NRTR: A no-recurrence sequence-to-sequence model for scene text recognition. In 2019 International Conference on Document Analysis and Recognition (ICDAR) (pp. 781-786). IEEE.\n",
"\n",
"[13]Yang, L., Wang, P., Li, H., Li, Z., & Zhang, Y. (2020). A holistic representation guided attention network for scene text recognition. Neurocomputing, 414, 67-75.\n",
"\n",
"[14]Wang, T., Zhu, Y., Jin, L., Luo, C., Chen, X., Wu, Y., ... & Cai, M. (2020, April). Decoupled attention network for text recognition. In Proceedings of the AAAI Conference on Artificial Intelligence (Vol. 34, No. 07, pp. 12216-12224).\n",
"\n",
"[15] Wang, Y., Xie, H., Fang, S., Wang, J., Zhu, S., & Zhang, Y. (2021). From two to one: A new scene text recognizer with visual language modeling network. In Proceedings of the IEEE/CVF International Conference on Computer Vision (pp. 14194-14203).\n",
"\n",
"[16] Fang, S., Xie, H., Wang, Y., Mao, Z., & Zhang, Y. (2021). Read Like Humans: Autonomous, Bidirectional and Iterative Language Modeling for Scene Text Recognition. In Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (pp. 7098-7107).\n",
"\n",
"[17] Yan, R., Peng, L., Xiao, S., & Yao, G. (2021). Primitive Representation Learning for Scene Text Recognition. In Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (pp. 284-293)."
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "py35-paddle1.2.0"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"# 1. 课程预备知识\n",
"\n",
"本课所涉及的OCR模型建立在深度学习的基础之上,因此与其相关的基础知识、环境配置、项目工程与其他资料将在本节介绍,尤其对深度学习不熟悉的读者可以查看和学习相应内容。\n",
"\n",
"### 1.1 预备知识\n",
"\n",
"深度学习的“学习”由机器学习中的神经元、感知机、多层神经网络等内容一路发展而来,因此了解基础的机器学习算法对于深度学习的理解和应用有很大帮助。而深度学习的“深”则体现在对大量信息处理过程中使用的卷积、池化等一系列以向量为基础的数学运算。如果缺乏这两者的理论基础,可以学习李宏毅老师的[线性代数](https://aistudio.baidu.com/aistudio/course/introduce/2063)和[机器学习](https://aistudio.baidu.com/aistudio/course/introduce/1978)课程。\n",
"\n",
"对于深度学习本身的理解,可以参考百度杰出架构师毕然老师的零基础课程:[百度架构师手把手带你零基础实践深度学习](https://aistudio.baidu.com/aistudio/course/introduce/1297),其中覆盖了深度学习的发展历史,通过一个经典案例介绍深度学习的完整组成部分,是一套以实践为导向的深度学习课程。\n",
"\n",
"对于理论知识的实践,[Python基础知识](https://aistudio.baidu.com/aistudio/course/introduce/1224)必不可少,同时为了快速复现深度学习模型,本课程使用的深度学习框架为:飞桨PaddlePaddle。如果你已经使用过其他框架,通过[快速上手文档](https://www.paddlepaddle.org.cn/documentation/docs/zh/practices/quick_start/hello_paddle.html)可以迅速了解飞桨的使用方法。\n",
"\n",
"### 1.2 基础环境准备\n",
"\n",
"如果你想在本地环境运行本课程的代码且之前未搭建过Python环境,可以根据[零基础运行环境准备](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.3/doc/doc_ch/environment.md),根据自己的操作系统安装Anaconda或docker环境。\n",
"\n",
"如果你没有本地资源,可以通过AI Studio实训平台完成代码运行,其中的每个项目都通过Notebook的方式呈现,方便开发者学习。若对Notebook的相关操作不熟悉,可以参考[AI Studio项目说明](https://ai.baidu.com/ai-doc/AISTUDIO/0k3e2tfzm)。\n",
"\n",
"### 1.3 获取和运行代码\n",
"\n",
"本课程依托PaddleOCR的代码库形成,首先,克隆PaddleOCR的完整项目:\n",
"\n",
"```bash\n",
"#【推荐】\n",
"git clone https://github.com/PaddlePaddle/PaddleOCR\n",
"\n",
"# 如果因为网络问题无法pull成功,也可选择使用码云上的托管:\n",
"git clone https://gitee.com/paddlepaddle/PaddleOCR\n",
"```\n",
"\n",
"> Note: the Gitee mirror may not sync with this GitHub project in real time; there can be a 3-5 day delay. Please prefer the recommended source.\n",
">\n",
"> If you are unfamiliar with git, you can simply download the zip archive from `Code` on the PaddleOCR home page.\n",
"\n",
"然后安装第三方库:\n",
"\n",
"```\n",
"cd PaddleOCR\n",
"pip3 install -r requirements.txt\n",
"```\n",
"\n",
"\n",
"\n",
"### 1.4 查阅资料\n",
"\n",
"[PaddleOCR使用文档](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.3/README_ch.md#%E6%96%87%E6%A1%A3%E6%95%99%E7%A8%8B) (中文) 中详细介绍了如何使用PaddleOCR完成模型应用、训练和部署。文档内容丰富,大多数用户的问题都在文档或FAQ中有所描述,尤其在[FAQ(中文)](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.3/doc/doc_ch/FAQ.md)中,按照深度学习的应用过程沉淀了用户的常见问题,建议大家仔细阅读。\n",
"\n",
"### 1.5 寻求帮助\n",
"\n",
"如果你在使用PaddleOCR的过程中遇到BUG、易用性或者文档相关的问题,可通过[Github issue](https://github.com/PaddlePaddle/PaddleOCR/issues)与官方联系,请按照issue模板尽可能多的提供信息,以便官方人员迅速定位问题。同时,微信群是广大PaddleOCR用户的日常交流阵地,更适合提问一些咨询类问题,除了有PaddleOCR团队成员以外,还会有热心开发者回答大家的问题。"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "py35-paddle1.2.0"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
......@@ -345,7 +345,7 @@ class KieLabelEncode(object):
max_num = 300
temp_bboxes = np.zeros([max_num, 4])
h, _ = bboxes.shape
temp_bboxes[:h, :h] = bboxes
temp_bboxes[:h, :] = bboxes
temp_relations = np.zeros([max_num, max_num, 5])
temp_relations[:h, :h, :] = relations
......
......@@ -23,7 +23,6 @@ import sys
import six
import cv2
import numpy as np
import fasttext
class DecodeImage(object):
......@@ -136,6 +135,7 @@ class ToCHWImage(object):
class Fasttext(object):
def __init__(self, path="None", **kwargs):
import fasttext
self.fast_model = fasttext.load_model(path)
def __call__(self, data):
......
......@@ -138,13 +138,16 @@ def load_pretrained_params(model, path):
params = paddle.load(path + '.pdparams')
state_dict = model.state_dict()
new_state_dict = {}
for k1, k2 in zip(state_dict.keys(), params.keys()):
if list(state_dict[k1].shape) == list(params[k2].shape):
new_state_dict[k1] = params[k2]
for k1 in params.keys():
if k1 not in state_dict.keys():
logger.warning("The pretrained params {} not in model".format(k1))
else:
logger.warning(
"The shape of model params {} {} not matched with loaded params {} {} !".
format(k1, state_dict[k1].shape, k2, params[k2].shape))
if list(state_dict[k1].shape) == list(params[k1].shape):
new_state_dict[k1] = params[k1]
else:
logger.warning(
"The shape of model params {} {} not matched with loaded params {} {} !".
format(k1, state_dict[k1].shape, k1, params[k1].shape))
model.set_state_dict(new_state_dict)
logger.info("load pretrain successful from {}".format(path))
return model
......
......@@ -3,7 +3,7 @@ model_name:PPOCRv2_ocr_det_kl
python:python3.7
Global.pretrained_model:null
Global.save_inference_dir:null
infer_model:./inference/ch_PP-OCRv2_det_infer/
infer_model:./inference/ch_PP-OCRv2_det_infer
infer_export:deploy/slim/quantization/quant_kl.py -c configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml -o
infer_quant:True
inference:tools/infer/predict_det.py
......
......@@ -26,7 +26,7 @@ null:null
##
===========================infer_params===========================
Global.save_inference_dir:./output/
Global.pretrained_model:
Global.checkpoints:
norm_export:null
quant_export:null
fpgm_export:deploy/slim/prune/export_prune_model.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml -o
......
......@@ -26,7 +26,7 @@ null:null
##
===========================infer_params===========================
Global.save_inference_dir:./output/
Global.pretrained_model:
Global.checkpoints:
norm_export:tools/export_model.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml -o
quant_export:null
fpgm_export:null
......
......@@ -26,7 +26,7 @@ null:null
##
===========================infer_params===========================
Global.save_inference_dir:./output/
Global.pretrained_model:
Global.checkpoints:
norm_export:null
quant_export:null
fpgm_export:deploy/slim/prune/export_prune_model.py -c test_tipc/configs/ch_ppocr_mobile_v2.0_rec_FPGM/rec_chinese_lite_train_v2.0.yml -o
......
......@@ -13,7 +13,7 @@ inference:tools/infer/predict_rec.py
--rec_batch_num:1
--use_tensorrt:False|True
--precision:int8
--det_model_dir:
--rec_model_dir:
--image_dir:./inference/rec_inference
null:null
--benchmark:True
......
......@@ -26,7 +26,7 @@ null:null
##
===========================infer_params===========================
Global.save_inference_dir:./output/
Global.pretrained_model:
Global.checkpoints:
norm_export:tools/export_model.py -c test_tipc/configs/ch_ppocr_server_v2.0_det/det_r50_vd_db.yml -o
quant_export:null
fpgm_export:null
......
......@@ -26,7 +26,7 @@ null:null
##
===========================infer_params===========================
Global.save_inference_dir:./output/
Global.pretrained_model:
Global.checkpoints:
norm_export:tools/export_model.py -c configs/det/det_mv3_db.yml -o
quant_export:null
fpgm_export:null
......
......@@ -26,7 +26,7 @@ null:null
##
===========================infer_params===========================
Global.save_inference_dir:./output/
Global.pretrained_model:
Global.checkpoints:
norm_export:tools/export_model.py -c test_tipc/configs/det_mv3_east_v2.0/det_mv3_east.yml -o
quant_export:null
fpgm_export:null
......
......@@ -26,7 +26,7 @@ null:null
##
===========================infer_params===========================
Global.save_inference_dir:./output/
Global.pretrained_model:
Global.checkpoints:
norm_export:tools/export_model.py -c test_tipc/configs/det_mv3_pse_v2.0/det_mv3_pse.yml -o
quant_export:null
fpgm_export:null
......
......@@ -26,7 +26,7 @@ null:null
##
===========================infer_params===========================
Global.save_inference_dir:./output/
Global.pretrained_model:
Global.checkpoints:
norm_export:tools/export_model.py -c configs/det/det_r50_vd_db.yml -o
quant_export:null
fpgm_export:null
......@@ -34,7 +34,7 @@ distill_export:null
export1:null
export2:null
##
train_model:./inference/ch_ppocr_server_v2.0_det_train/best_accuracy
train_model:./inference/det_r50_vd_db_v2.0_train/best_accuracy
infer_export:tools/export_model.py -c configs/det/det_r50_vd_db.yml -o
infer_quant:False
inference:tools/infer/predict_det.py
......
......@@ -26,7 +26,7 @@ null:null
##
===========================infer_params===========================
Global.save_inference_dir:./output/
Global.pretrained_model:
Global.checkpoints:
norm_export:tools/export_model.py -c test_tipc/configs/det_r50_vd_east_v2.0/det_r50_vd_east.yml -o
quant_export:null
fpgm_export:null
......
......@@ -26,7 +26,7 @@ null:null
##
===========================infer_params===========================
Global.save_inference_dir:./output/
Global.pretrained_model:
Global.checkpoints:
norm_export:tools/export_model.py -c test_tipc/configs/det_r50_vd_pse_v2.0/det_r50_vd_pse.yml -o
quant_export:null
fpgm_export:null
......
......@@ -26,7 +26,7 @@ null:null
##
===========================infer_params===========================
Global.save_inference_dir:./output/
Global.pretrained_model:
Global.checkpoints:
norm_export:tools/export_model.py -c test_tipc/configs/det_r50_vd_sast_icdar15_v2.0/det_r50_vd_sast_icdar2015.yml -o
quant_export:null
fpgm_export:null
......@@ -43,7 +43,7 @@ inference:tools/infer/predict_det.py
--cpu_threads:1|6
--rec_batch_num:1
--use_tensorrt:False
--precision:fp32|fp16|int8
--precision:fp32|int8
--det_model_dir:
--image_dir:./inference/ch_det_data_50/all-sum-510/
null:null
......
......@@ -26,7 +26,7 @@ null:null
##
===========================infer_params===========================
Global.save_inference_dir:./output/
Global.pretrained_model:
Global.checkpoints:
norm_export:tools/export_model.py -c test_tipc/configs/det_r50_vd_sast_totaltext_v2.0/det_r50_vd_sast_totaltext.yml -o
quant_export:null
fpgm_export:null
......@@ -43,7 +43,7 @@ inference:tools/infer/predict_det.py
--cpu_threads:1|6
--rec_batch_num:1
--use_tensorrt:False
--precision:fp32|fp16|int8
--precision:fp32|int8
--det_model_dir:
--image_dir:./inference/ch_det_data_50/all-sum-510/
null:null
......
......@@ -26,7 +26,7 @@ null:null
##
===========================infer_params===========================
Global.save_inference_dir:./output/
Global.pretrained_model:
Global.checkpoints:
norm_export:tools/export_model.py -c configs/e2e/e2e_r50_vd_pg.yml -o
quant_export:null
fpgm_export:null
......
# Web-side Basic Inference Tests
Web-side e2e tests are built on Jest-Puppeteer: Puppeteer drives Chrome through the inference flow, and Jest drives the test flow.
>Puppeteer is a Node library that provides a high-level API to control Chromium or Chrome over the DevTools protocol.
>Jest is a JavaScript testing framework designed to ensure the correctness of any JavaScript code.
#### Environment Setup
* Install Node (which includes npm) (https://nodejs.org/zh-cn/download/)
* Verify that node was installed successfully by running, on the command line:
```sh
# prints the installed node version if the installation succeeded
node -v
```
* Verify that npm was installed successfully
```sh
# npm ships with node, so no extra installation is usually needed
# prints the installed npm version if the installation succeeded
npm -v
```
#### Usage
```sh
# prepare the web test environment
bash test_tipc/prepare_js.sh 'js_infer'
# run the web inference test
bash test_tipc/test_inference_js.sh
```
#### Workflow
###### paddlejs prepare
1. Check whether node and npm are installed
2. Download the test models. The current detection model is ch_PP-OCRv2_det_infer and the recognition model is ch_PP-OCRv2_rec_infer [1, 3, 32, 320]. To use other models, put the model files under the test_tipc/web/models/ directory.
- text detection model: https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar
- text recognition model: https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer.tar
- text recognition model [1, 3, 32, 320]: https://paddlejs.bj.bcebos.com/models/ch_PP-OCRv2_rec_infer.tar
- to keep recognition reasonably accurate, the recognition model must be exported as a static model with input shape [1, 3, 32, 320]
3. Convert the models: model.pdmodel and model.pdiparams are converted to model.json and chunk.dat (the detection model is saved to test_tipc/web/models/ch_PP-OCRv2/det, the recognition model to test_tipc/web/models/ch_PP-OCRv2/rec)
4. Install the latest ocr sdk: @paddlejs-models/ocr@latest
5. Install the test dependencies puppeteer, jest and jest-puppeteer; if they are already installed, they are not reinstalled
###### paddlejs infer test
1. Jest runs the server command `python3 -m http.server 9811` to start a local server
2. Jest starts the test runner, drives Chrome through the jest-puppeteer plugin, and loads the @paddlejs-models/ocr script to run the inference flow
3. The test compares the text recognized from the source image against the expected text (expect.json). Passing requires both criteria:
* the character-by-character diff between the recognition result and the expectation is at most **10 characters**
* the string similarity between each recognized text box and the expectation is at least 0.9 (fully identical strings have similarity 1).
The test passes only when both criteria are met, shown as follows:
<img width="600" src="https://user-images.githubusercontent.com/43414102/146406599-80b30c66-f2f8-4f57-a68a-007c479ff0f7.png">
......@@ -179,7 +179,7 @@ elif [ ${MODE} = "whole_infer" ];then
cd ./inference/ && tar xf rec_r34_vd_tps_bilstm_ctc_v2.0_train.tar && cd ../
fi
if [ ${model_name} == "ch_ppocr_server_v2.0_rec" ]; then
wget -nc -P ./inference/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/ch_ppocr_server_v2.0_rec_train.tar --no-check-certificate
wget -nc -P ./inference/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_train.tar --no-check-certificate
cd ./inference/ && tar xf ch_ppocr_server_v2.0_rec_train.tar && cd ../
fi
if [ ${model_name} == "ch_ppocr_mobile_v2.0_rec" ]; then
......@@ -239,18 +239,21 @@ fi
if [ ${MODE} = "klquant_whole_infer" ]; then
wget -nc -P ./train_data/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/icdar2015_lite.tar --no-check-certificate
cd ./train_data/ && tar xf icdar2015_lite.tar
ln -s ./icdar2015_lite ./icdar2015 && cd ../
cd ./train_data/ && tar xf icdar2015_lite.tar && rm -rf ./icdar2015 && ln -s ./icdar2015_lite ./icdar2015 && cd ../
if [ ${model_name} = "ch_ppocr_mobile_v2.0_det_KL" ]; then
wget -nc -P ./inference https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar --no-check-certificate
wget -nc -P ./inference https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/ch_det_data_50.tar --no-check-certificate
cd ./inference && tar xf ch_ppocr_mobile_v2.0_det_infer.tar && tar xf ch_det_data_50.tar && cd ../
fi
if [ ${model_name} = "ch_PPOCRv2_det" ]; then
eval_model_name="ch_PP-OCRv2_det_infer"
if [ ${model_name} = "PPOCRv2_ocr_rec_kl" ]; then
wget -nc -P ./inference https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer.tar --no-check-certificate
wget -nc -P ./inference/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/rec_inference.tar --no-check-certificate
cd ./inference && tar xf rec_inference.tar && tar xf ch_PP-OCRv2_rec_infer.tar && cd ../
fi
if [ ${model_name} = "PPOCRv2_ocr_det_kl" ]; then
wget -nc -P ./inference https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/ch_det_data_50.tar --no-check-certificate
wget -nc -P ./inference/ https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar --no-check-certificate
cd ./inference && tar xf ${eval_model_name}.tar && tar xf ch_det_data_50.tar && cd ../
wget -nc -P ./inference https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar --no-check-certificate
cd ./inference && tar xf ch_PP-OCRv2_det_infer.tar && tar xf ch_det_data_50.tar && cd ../
fi
if [ ${model_name} = "ch_ppocr_mobile_v2.0_rec_KL" ]; then
wget -nc -P ./inference/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar --no-check-certificate
......
#!/bin/bash
set -o errexit
set -o nounset
shopt -s extglob
# main steps of paddlejs prepare
# 1. check whether node and npm are installed
# 2. download the test models. The current detection model is ch_PP-OCRv2_det_infer and the recognition model is ch_PP-OCRv2_rec_infer [1, 3, 32, 320]. To use other models, put the model files under the test_tipc/web/models/ directory.
# - text detection model: https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar
# - text recognition model: https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer.tar
# - text recognition model [1, 3, 32, 320]: https://paddlejs.bj.bcebos.com/models/ch_PP-OCRv2_rec_infer.tar
# - to keep recognition reasonably accurate, the recognition model must be exported as a static model with input shape [1, 3, 32, 320]
# 3. convert the models: model.pdmodel and model.pdiparams are converted to model.json and chunk.dat (the detection model is saved to test_tipc/web/models/ch_PP-OCRv2/det, the recognition model to test_tipc/web/models/ch_PP-OCRv2/rec)
# 4. install the latest ocr sdk @paddlejs-models/ocr@latest
# 5. install the test dependencies puppeteer, jest and jest-puppeteer; if they are already installed, they are not reinstalled
# check whether node is installed
if ! type node >/dev/null 2>&1; then
echo -e "\033[31m node is not installed \033[0m"
exit
fi
# check whether npm is installed
if ! type npm >/dev/null 2>&1; then
echo -e "\033[31m npm is not installed \033[0m"
exit
fi
# MODE must be 'js_infer'
MODE=$1
# in js_infer mode, download the model files and convert them for js inference
if [ ${MODE} != "js_infer" ];then
echo "Please change mode to 'js_infer'"
exit
fi
# saved_model_name
det_saved_model_name=ch_PP-OCRv2_det_infer
rec_saved_model_name=ch_PP-OCRv2_rec_infer
# model_path
model_path=test_tipc/web/models/
rm -rf $model_path
echo ${model_path}${det_saved_model_name}
echo ${model_path}${rec_saved_model_name}
# download ocr_det inference model
wget -nc -P $model_path https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar
cd $model_path && tar xf ch_PP-OCRv2_det_infer.tar && cd ../../../
# download ocr_rec inference model
wget -nc -P $model_path https://paddlejs.bj.bcebos.com/models/ch_PP-OCRv2_rec_infer.tar
cd $model_path && tar xf ch_PP-OCRv2_rec_infer.tar && cd ../../../
MYDIR=`pwd`
echo $MYDIR
pip3 install paddlejsconverter
# convert inference model to web model: model.json, chunk.dat
paddlejsconverter \
--modelPath=$model_path$det_saved_model_name/inference.pdmodel \
--paramPath=$model_path$det_saved_model_name/inference.pdiparams \
--outputDir=$model_path$det_saved_model_name/ \
paddlejsconverter \
--modelPath=$model_path$rec_saved_model_name/inference.pdmodel \
--paramPath=$model_path$rec_saved_model_name/inference.pdiparams \
--outputDir=$model_path$rec_saved_model_name/ \
# always install latest ocr sdk
cd test_tipc/web
echo -e "\033[33m Installing the latest ocr sdk... \033[0m"
npm install @paddlejs-models/ocr@latest
npm info @paddlejs-models/ocr
echo -e "\033[32m The latest ocr sdk has been installed. \033[0m"
# install dependencies
if [ `npm list --depth 0 | grep puppeteer | wc -l` -ne 0 ] && [ `npm list --depth 0 | grep jest | wc -l` -ne 0 ];then
echo -e "\033[32m Dependencies are already installed \033[0m"
else
echo -e "\033[33m Installing dependencies ... \033[0m"
npm install jest jest-puppeteer puppeteer
echo -e "\033[32m Dependencies installed. \033[0m"
fi
# del package-lock.json
rm package-lock.json
#!/bin/bash
set -o errexit
set -o nounset
cd test_tipc/web
# run ocr test in chrome
./node_modules/.bin/jest --config ./jest.config.js
......@@ -183,7 +183,7 @@ function func_inference(){
if [[ ${precision} =~ "fp16" || ${precision} =~ "int8" ]] && [ ${use_trt} = "False" ]; then
continue
fi
if [[ ${use_trt} = "False" || ${precision} =~ "int8" ]] && [ ${_flag_quant} = "True" ]; then
if [[ ${use_trt} = "False" && ${precision} =~ "int8" ]] && [ ${_flag_quant} = "True" ]; then
continue
fi
for batch_size in ${batch_size_list[*]}; do
......@@ -227,7 +227,12 @@ if [ ${MODE} = "whole_infer" ] || [ ${MODE} = "klquant_whole_infer" ]; then
for infer_model in ${infer_model_dir_list[*]}; do
# run export
if [ ${infer_run_exports[Count]} != "null" ];then
save_infer_dir=$(dirname $infer_model)
if [ ${MODE} = "klquant_whole_infer" ]; then
save_infer_dir="${infer_model}_klquant"
fi
if [ ${MODE} = "whole_infer" ]; then
save_infer_dir="${infer_model}"
fi
set_export_weight=$(func_set_params "${export_weight}" "${infer_model}")
set_save_infer_key=$(func_set_params "${save_infer_key}" "${save_infer_dir}")
export_cmd="${python} ${infer_run_exports[Count]} ${set_export_weight} ${set_save_infer_key}"
......@@ -259,7 +264,6 @@ else
env=""
elif [ ${#gpu} -le 1 ];then
env="export CUDA_VISIBLE_DEVICES=${gpu}"
eval ${env}
elif [ ${#gpu} -le 15 ];then
IFS=","
array=(${gpu})
......@@ -280,6 +284,7 @@ else
set_amp_config=" "
fi
for trainer in ${trainer_list[*]}; do
eval ${env}
flag_quant=False
if [ ${trainer} = ${pact_key} ]; then
run_train=${pact_trainer}
......@@ -332,7 +337,6 @@ else
cmd="${python} -m paddle.distributed.launch --ips=${ips} --gpus=${gpu} ${run_train} ${set_use_gpu} ${set_save_model} ${set_pretrain} ${set_epoch} ${set_autocast} ${set_batchsize} ${set_train_params1} ${set_amp_config}"
fi
# run train
eval "unset CUDA_VISIBLE_DEVICES"
eval $cmd
status_check $? "${cmd}" "${status_log}"
......
{
"text": [
"纯臻营养护发素",
"产品信息/参数",
"(45元/每公斤,100公斤起订)",
"每瓶22元,1000瓶起订)",
"【品牌】:代加工方式/OEMODM",
"【品名】:纯臻营养护发素",
"【产品编号】:YM-X-3011",
"ODMOEM",
"【净含量】:220ml",
"【适用人群】:适合所有肤质",
"【主要成分】:鲸蜡硬脂醇、燕麦β-葡聚",
"糖、椰油酰胺丙基甜菜碱、泛醌",
"(成品包材)",
"【主要功能】:可紧致头发磷层,从而达到",
"即时持久改善头发光泽的效果,给干燥的头",
"发足够的滋养"
]
}
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<meta http-equiv="X-UA-Compatible" content="ie=edge">
<title>ocr test</title>
</head>
<body>
<img id="ocr" src="./test.jpg" />
</body>
<script src="./node_modules/@paddlejs-models/ocr/lib/index.js"></script>
</html>
\ No newline at end of file
const expectData = require('./expect.json');
describe('e2e test ocr model', () => {
beforeAll(async () => {
await page.goto(PATH);
});
it('ocr infer and diff test', async () => {
page.on('console', msg => console.log('PAGE LOG:', msg.text()));
const text = await page.evaluate(async () => {
const $ocr = document.querySelector('#ocr');
const ocr = paddlejs['ocr'];
await ocr.init('./models/ch_PP-OCRv2_det_infer', './models/ch_PP-OCRv2_rec_infer');
const res = await ocr.recognize($ocr);
return res.text;
});
// number of characters that differ between the recognition result and the expectation
let diffNum = 0;
// per-box string similarity
let similarity = 0;
// maximum allowed character diff
const expectedDiffNum = 10;
// minimum required per-box string similarity
const expectedSimilarity = 0.9;
// expected text content
const expectResult = expectData.text;
expectResult && expectResult.forEach((item, index) => {
const word = text[index];
// character-by-character comparison
for(let i = 0; i < item.length; i++) {
if (item[i] !== word[i]) {
console.log('expect: ', item[i], ' word: ', word[i]);
diffNum++;
}
}
// per-box string similarity comparison
const s = similar(item, word);
similarity += s;
});
similarity = similarity / expectResult.length;
expect(diffNum).toBeLessThanOrEqual(expectedDiffNum);
expect(similarity).toBeGreaterThanOrEqual(expectedSimilarity);
function similar(string, expect) {
if (!string || !expect) {
return 0;
}
const length = string.length > expect.length ? string.length : expect.length;
const n = string.length;
const m = expect.length;
let data = [];
const min = (a, b, c) => {
return a < b ? (a < c ? a : c) : (b < c ? b : c);
};
let i, j, si, ej, cost;
if (n === 0) return m;
if (m === 0) return n;
for (i = 0; i <= n; i++) {
data[i] = [];
data[i][0] = i;
}
for (j = 0; j <= m; j++) {
data[0][j] = j;
}
for (i = 1; i <= n; i++) {
si = string.charAt(i - 1);
for (j = 1; j <= m; j++) {
ej = expect.charAt(j - 1);
cost = si === ej ? 0 : 1;
data[i][j] = min(data[i - 1][j] + 1, data[i][j - 1] + 1, data[i - 1][j - 1] + cost);
}
}
return (1 - data[n][m] / length);
}
});
});
// jest-puppeteer.config.js
module.exports = {
launch: {
headless: false,
product: 'chrome'
},
browserContext: 'default',
server: {
command: 'python3 -m http.server 9811',
port: 9811,
launchTimeout: 10000,
debug: true
}
};
// For a detailed explanation regarding each configuration property and type check, visit:
// https://jestjs.io/docs/en/configuration.html
module.exports = {
preset: 'jest-puppeteer',
// All imported modules in your tests should be mocked automatically
// automock: false,
// Automatically clear mock calls and instances between every test
clearMocks: true,
// An object that configures minimum threshold enforcement for coverage results
// coverageThreshold: undefined,
// A set of global variables that need to be available in all test environments
globals: {
PATH: 'http://localhost:9811'
},
// The maximum amount of workers used to run your tests. Can be specified as % or a number. E.g. maxWorkers: 10% will use 10% of your CPU amount + 1 as the maximum worker number. maxWorkers: 2 will use a maximum of 2 workers.
// maxWorkers: "50%",
// An array of directory names to be searched recursively up from the requiring module's location
// moduleDirectories: [
// "node_modules"
// ],
// An array of file extensions your modules use
moduleFileExtensions: [
'js',
'json',
'jsx',
'ts',
'tsx',
'node'
],
// The root directory that Jest should scan for tests and modules within
// rootDir: undefined,
// A list of paths to directories that Jest should use to search for files in
roots: [
'<rootDir>'
],
// Allows you to use a custom runner instead of Jest's default test runner
// runner: "jest-runner",
// The paths to modules that run some code to configure or set up the testing environment before each test
// setupFiles: [],
// A list of paths to modules that run some code to configure or set up the testing framework before each test
// setupFilesAfterEnv: [],
// The number of seconds after which a test is considered as slow and reported as such in the results.
// slowTestThreshold: 5,
// A list of paths to snapshot serializer modules Jest should use for snapshot testing
// snapshotSerializers: [],
// The test environment that will be used for testing
// testEnvironment: 'jsdom',
// Options that will be passed to the testEnvironment
// testEnvironmentOptions: {},
// An array of regexp pattern strings that are matched against all test paths, matched tests are skipped
testPathIgnorePatterns: [
'/node_modules/'
],
// The regexp pattern or array of patterns that Jest uses to detect test files
testRegex: '.(.+)\\.test\\.(js|ts)$',
// This option allows the use of a custom results processor
// testResultsProcessor: undefined,
// This option allows use of a custom test runner
// testRunner: "jest-circus/runner",
// This option sets the URL for the jsdom environment. It is reflected in properties such as location.href
testURL: 'http://localhost:9898/',
// Setting this value to "fake" allows the use of fake timers for functions such as "setTimeout"
// timers: "real",
// A map from regular expressions to paths to transformers
transform: {
'^.+\\.js$': 'babel-jest'
},
// An array of regexp pattern strings that are matched against all source file paths, matched files will skip transformation
transformIgnorePatterns: [
'/node_modules/',
'\\.pnp\\.[^\\/]+$'
],
// An array of regexp pattern strings that are matched against all modules before the module loader will automatically return a mock for them
// unmockedModulePathPatterns: undefined,
// Indicates whether each individual test should be reported during the run
verbose: true,
// An array of regexp patterns that are matched against all source file paths before re-running tests in watch mode
// watchPathIgnorePatterns: [],
// Whether to use watchman for file crawling
// watchman: true,
testTimeout: 50000
};
......@@ -101,17 +101,22 @@ class TextDetector(object):
else:
logger.info("unknown det_algorithm:{}".format(self.det_algorithm))
sys.exit(0)
if self.use_onnx:
pre_process_list[0] = {
'DetResizeForTest': {
'image_shape': [640, 640]
}
}
self.preprocess_op = create_operators(pre_process_list)
self.postprocess_op = build_post_process(postprocess_params)
self.predictor, self.input_tensor, self.output_tensors, self.config = utility.create_predictor(
args, 'det', logger)
if self.use_onnx:
img_h, img_w = self.input_tensor.shape[2:]
if img_h is not None and img_w is not None and img_h > 0 and img_w > 0:
pre_process_list[0] = {
'DetResizeForTest': {
'image_shape': [img_h, img_w]
}
}
self.preprocess_op = create_operators(pre_process_list)
if args.benchmark:
import auto_log
pid = os.getpid()
......
......@@ -109,7 +109,10 @@ class TextRecognizer(object):
assert imgC == img.shape[2]
imgW = int((32 * max_wh_ratio))
if self.use_onnx:
imgW = 100
w = self.input_tensor.shape[3:][0]
if w is not None and w > 0:
imgW = w
h, w = img.shape[:2]
ratio = w / float(h)
if math.ceil(imgH * ratio) > imgW:
......
......@@ -15,6 +15,7 @@
import argparse
import os
import sys
import platform
import cv2
import numpy as np
import paddle
......@@ -313,6 +314,10 @@ def create_predictor(args, mode, logger):
def get_infer_gpuid():
sysstr = platform.system()
if sysstr == "Windows":
return 0
if not paddle.fluid.core.is_compiled_with_rocm():
cmd = "env | grep CUDA_VISIBLE_DEVICES"
else:
......