未验证 提交 e77b1f0f 编写于 作者: W WuZhe 提交者: GitHub

Merge branch 'dygraph' into dygraph

......@@ -61,7 +61,7 @@ from combobox import ComboBox
from libs.constants import *
from libs.utils import *
from libs.settings import Settings
from libs.shape import Shape, DEFAULT_LINE_COLOR, DEFAULT_FILL_COLOR
from libs.shape import Shape, DEFAULT_LINE_COLOR, DEFAULT_FILL_COLOR,DEFAULT_LOCK_COLOR
from libs.stringBundle import StringBundle
from libs.canvas import Canvas
from libs.zoomWidget import ZoomWidget
......@@ -126,7 +126,7 @@ class MainWindow(QMainWindow, WindowMixin):
self.labelHist = []
self.lastOpenDir = None
self.result_dic = []
self.result_dic_locked = []
self.changeFileFolder = False
self.haveAutoReced = False
self.labelFile = None
......@@ -395,6 +395,7 @@ class MainWindow(QMainWindow, WindowMixin):
delete = action(getStr('delBox'), self.deleteSelectedShape,
'backspace', 'delete', getStr('delBoxDetail'), enabled=False)
copy = action(getStr('dupBox'), self.copySelectedShape,
'Ctrl+C', 'copy', getStr('dupBoxDetail'),
enabled=False)
......@@ -406,6 +407,7 @@ class MainWindow(QMainWindow, WindowMixin):
'Ctrl+A', 'hide', getStr('showAllBoxDetail'),
enabled=False)
help = action(getStr('tutorial'), self.showTutorialDialog, None, 'help', getStr('tutorialDetail'))
showInfo = action(getStr('info'), self.showInfoDialog, None, 'help', getStr('info'))
showSteps = action(getStr('steps'), self.showStepsDialog, None, 'help', getStr('steps'))
......@@ -477,6 +479,10 @@ class MainWindow(QMainWindow, WindowMixin):
undo = action(getStr("undo"), self.undoShapeEdit,
'Ctrl+Z', "undo", getStr("undo"), enabled=False)
lock = action(getStr("lockBox"), self.lockSelectedShape,
None, "lock", getStr("lockBoxDetail"),
enabled=False)
self.editButton.setDefaultAction(edit)
self.newButton.setDefaultAction(create)
self.DelButton.setDefaultAction(deleteImg)
......@@ -538,13 +544,13 @@ class MainWindow(QMainWindow, WindowMixin):
fitWindow=fitWindow, fitWidth=fitWidth,
zoomActions=zoomActions, saveLabel=saveLabel,
undo=undo, undoLastPoint=undoLastPoint,open_dataset_dir=open_dataset_dir,
rotateLeft=rotateLeft,rotateRight=rotateRight,
rotateLeft=rotateLeft,rotateRight=rotateRight,lock=lock,
fileMenuActions=(
opendir, open_dataset_dir, saveLabel, resetAll, quit),
beginner=(), advanced=(),
editMenu=(createpoly, edit, copy, delete,singleRere,None, undo, undoLastPoint,
None, rotateLeft, rotateRight, None, color1, self.drawSquaresOption),
beginnerContext=(create, edit, copy, delete, singleRere, rotateLeft, rotateRight,),
None, rotateLeft, rotateRight, None, color1, self.drawSquaresOption,lock),
beginnerContext=(create, edit, copy, delete, singleRere, rotateLeft, rotateRight,lock),
advancedContext=(createMode, editMode, edit, copy,
delete, shapeLineColor, shapeFillColor),
onLoadActive=(
......@@ -998,6 +1004,7 @@ class MainWindow(QMainWindow, WindowMixin):
self.actions.delete.setEnabled(n_selected)
self.actions.copy.setEnabled(n_selected)
self.actions.edit.setEnabled(n_selected == 1)
self.actions.lock.setEnabled(n_selected)
def addLabel(self, shape):
shape.paintLabel = self.displayLabelOption.isChecked()
......@@ -1041,7 +1048,7 @@ class MainWindow(QMainWindow, WindowMixin):
def loadLabels(self, shapes):
s = []
for label, points, line_color, fill_color, difficult in shapes:
shape = Shape(label=label)
shape = Shape(label=label,line_color=line_color)
for x, y in points:
# Ensure the labels are within the bounds of the image. If not, fix them.
......@@ -1051,6 +1058,7 @@ class MainWindow(QMainWindow, WindowMixin):
shape.addPoint(QPointF(x, y))
shape.difficult = difficult
#shape.locked = False
shape.close()
s.append(shape)
......@@ -1065,9 +1073,11 @@ class MainWindow(QMainWindow, WindowMixin):
# shape.fill_color = generateColorByText(label)
self.addLabel(shape)
self.updateComboBox()
self.canvas.loadShapes(s)
def singleLabel(self, shape):
if shape is None:
# print('rm empty label')
......@@ -1106,10 +1116,9 @@ class MainWindow(QMainWindow, WindowMixin):
difficult=s.difficult) # bool
shapes = [] if mode == 'Auto' else \
[format_shape(shape) for shape in self.canvas.shapes]
[format_shape(shape) for shape in self.canvas.shapes if shape.line_color != DEFAULT_LOCK_COLOR]
# Can add differrent annotation formats here
for box in self.result_dic:
for box in self.result_dic :
trans_dic = {"label": box[1][0], "points": box[0], 'difficult': False}
if trans_dic["label"] == "" and mode == 'Auto':
continue
......@@ -1120,7 +1129,6 @@ class MainWindow(QMainWindow, WindowMixin):
for box in shapes:
trans_dic.append({"transcription": box['label'], "points": box['points'], 'difficult': box['difficult']})
self.PPlabel[annotationFilePath] = trans_dic
if mode == 'Auto':
self.Cachelabel[annotationFilePath] = trans_dic
......@@ -1313,6 +1321,7 @@ class MainWindow(QMainWindow, WindowMixin):
# unicodeFilePath = os.path.abspath(unicodeFilePath)
# Tzutalin 20160906 : Add file list and dock to move faster
# Highlight the file item
if unicodeFilePath and self.fileListWidget.count() > 0:
if unicodeFilePath in self.mImgList:
index = self.mImgList.index(unicodeFilePath)
......@@ -1322,6 +1331,7 @@ class MainWindow(QMainWindow, WindowMixin):
###
self.iconlist.clear()
self.additems5(None)
for i in range(5):
item_tooltip = self.iconlist.item(i).toolTip()
# print(i,"---",item_tooltip)
......@@ -1340,7 +1350,6 @@ class MainWindow(QMainWindow, WindowMixin):
if unicodeFilePath and os.path.exists(unicodeFilePath):
self.canvas.verified = False
cvimg = cv2.imdecode(np.fromfile(unicodeFilePath, dtype=np.uint8), 1)
height, width, depth = cvimg.shape
cvimg = cv2.cvtColor(cvimg, cv2.COLOR_BGR2RGB)
......@@ -1361,12 +1370,15 @@ class MainWindow(QMainWindow, WindowMixin):
else:
self.dirty = False
self.actions.save.setEnabled(True)
if len(self.canvas.lockedShapes) != 0:
self.actions.save.setEnabled(True)
self.setDirty()
self.canvas.setEnabled(True)
self.adjustScale(initial=True)
self.paintCanvas()
self.addRecentFile(self.filePath)
self.toggleActions(True)
self.showBoundingBoxFromPPlabel(filePath)
self.setWindowTitle(__appname__ + ' ' + filePath)
......@@ -1380,12 +1392,20 @@ class MainWindow(QMainWindow, WindowMixin):
return True
return False
def showBoundingBoxFromPPlabel(self, filePath):
width, height = self.image.width(), self.image.height()
imgidx = self.getImglabelidx(filePath)
if imgidx not in self.PPlabel.keys():
return
shapes = []
shapes =[]
#box['ratio'] of the shapes saved in lockedShapes contains the ratio of the
# four corner coordinates of the shapes to the height and width of the image
for box in self.canvas.lockedShapes:
if self.canvas.isInTheSameImage:
shapes.append((box['transcription'], [[s[0]*width,s[1]*height]for s in box['ratio']],
DEFAULT_LOCK_COLOR, None, box['difficult']))
else:
shapes.append(('锁定框:待检测', [[s[0]*width,s[1]*height]for s in box['ratio']],
DEFAULT_LOCK_COLOR, None, box['difficult']))
if imgidx in self.PPlabel.keys():
for box in self.PPlabel[imgidx]:
shapes.append((box['transcription'], box['points'], None, None, box['difficult']))
......@@ -1647,8 +1667,36 @@ class MainWindow(QMainWindow, WindowMixin):
return fullFilePath
return ''
def saveLockedShapes(self):
self.canvas.lockedShapes = []
self.canvas.selectedShapes = []
for s in self.canvas.shapes:
if s.line_color == DEFAULT_LOCK_COLOR:
self.canvas.selectedShapes.append(s)
self.lockSelectedShape()
for s in self.canvas.shapes:
if s.line_color == DEFAULT_LOCK_COLOR:
self.canvas.selectedShapes.remove(s)
self.canvas.shapes.remove(s)
def _saveFile(self, annotationFilePath, mode='Manual'):
if len(self.canvas.lockedShapes) != 0:
self.saveLockedShapes()
if mode == 'Manual':
self.result_dic_locked = []
img = cv2.imread(self.filePath)
width, height = self.image.width(), self.image.height()
for shape in self.canvas.lockedShapes:
box = [[int(p[0]*width), int(p[1]*height)] for p in shape['ratio']]
assert len(box) == 4
result = [(shape['transcription'],1)]
result.insert(0, box)
self.result_dic_locked.append(result)
self.result_dic += self.result_dic_locked
self.result_dic_locked = []
if annotationFilePath and self.saveLabels(annotationFilePath, mode=mode):
self.setClean()
self.statusBar().showMessage('Saved to %s' % annotationFilePath)
......@@ -1663,13 +1711,13 @@ class MainWindow(QMainWindow, WindowMixin):
self.savePPlabel(mode='Auto')
self.fileListWidget.insertItem(int(currIndex), item)
if not self.canvas.isInTheSameImage:
self.openNextImg()
self.actions.saveRec.setEnabled(True)
self.actions.saveLabel.setEnabled(True)
elif mode == 'Auto':
if annotationFilePath and self.saveLabels(annotationFilePath, mode=mode):
self.setClean()
self.statusBar().showMessage('Saved to %s' % annotationFilePath)
self.statusBar().show()
......@@ -1733,7 +1781,9 @@ class MainWindow(QMainWindow, WindowMixin):
if discardChanges == QMessageBox.No:
return True
elif discardChanges == QMessageBox.Yes:
self.canvas.isInTheSameImage = True
self.saveFile()
self.canvas.isInTheSameImage = False
return True
else:
return False
......@@ -1872,6 +1922,7 @@ class MainWindow(QMainWindow, WindowMixin):
# org_box = [dic['points'] for dic in self.PPlabel[self.getImglabelidx(self.filePath)]]
if self.canvas.shapes:
self.result_dic = []
self.result_dic_locked = [] # result_dic_locked stores the ocr result of self.canvas.lockedShapes
rec_flag = 0
for shape in self.canvas.shapes:
box = [[int(p.x()), int(p.y())] for p in shape.points]
......@@ -1883,21 +1934,32 @@ class MainWindow(QMainWindow, WindowMixin):
return
result = self.ocr.ocr(img_crop, cls=True, det=False)
if result[0][0] != '':
if shape.line_color == DEFAULT_LOCK_COLOR:
shape.label = result[0][0]
result.insert(0, box)
self.result_dic_locked.append(result)
else:
result.insert(0, box)
print('result in reRec is ', result)
self.result_dic.append(result)
else:
print('Can not recognise the box')
if shape.line_color == DEFAULT_LOCK_COLOR:
shape.label = result[0][0]
self.result_dic_locked.append([box,(self.noLabelText,0)])
else:
self.result_dic.append([box,(self.noLabelText,0)])
try:
if self.noLabelText == shape.label or result[1][0] == shape.label:
print('label no change')
else:
rec_flag += 1
if len(self.result_dic) > 0 and rec_flag > 0:
except IndexError as e:
print('Can not recognise the box')
if (len(self.result_dic) > 0 and rec_flag > 0)or self.canvas.lockedShapes:
self.canvas.isInTheSameImage = True
self.saveFile(mode='Auto')
self.loadFile(self.filePath)
self.canvas.isInTheSameImage = False
self.setDirty()
elif len(self.result_dic) == len(self.canvas.shapes) and rec_flag == 0:
QMessageBox.information(self, "Information", "The recognition result remains unchanged!")
......@@ -2107,6 +2169,44 @@ class MainWindow(QMainWindow, WindowMixin):
self.labelList.clearSelection()
self._noSelectionSlot = False
self.canvas.loadShapes(shapes, replace=replace)
print("loadShapes")#1
def lockSelectedShape(self):
"""lock the selsected shapes.
Add self.selectedShapes to lock self.canvas.lockedShapes,
which holds the ratio of the four coordinates of the locked shapes
to the width and height of the image
"""
width, height = self.image.width(), self.image.height()
def format_shape(s):
return dict(label=s.label, # str
line_color=s.line_color.getRgb(),
fill_color=s.fill_color.getRgb(),
ratio=[[int(p.x())/width, int(p.y())/height] for p in s.points], # QPonitF
# add chris
difficult=s.difficult) # bool
#lock
if len(self.canvas.lockedShapes) == 0:
for s in self.canvas.selectedShapes:
s.line_color = DEFAULT_LOCK_COLOR
s.locked = True
shapes = [format_shape(shape) for shape in self.canvas.selectedShapes]
trans_dic = []
for box in shapes:
trans_dic.append({"transcription": box['label'], "ratio": box['ratio'], 'difficult': box['difficult']})
self.canvas.lockedShapes = trans_dic
self.actions.save.setEnabled(True)
#unlock
else:
for s in self.canvas.shapes:
s.line_color = DEFAULT_LINE_COLOR
self.canvas.lockedShapes = []
self.result_dic_locked = []
self.setDirty()
self.actions.save.setEnabled(True)
def inverted(color):
......
......@@ -78,14 +78,14 @@ PPOCRLabel # run
```bash
cd PaddleOCR/PPOCRLabel
python3 setup.py bdist_wheel
pip3 install dist/PPOCRLabel-1.0.0-py2.py3-none-any.whl
pip3 install dist/PPOCRLabel-1.0.2-py2.py3-none-any.whl
```
#### 1.2.3 Run PPOCRLabel by Python Script
```bash
cd ./PPOCRLabel # Switch to the PPOCRLabel directory
python PPOCRLabel.py --lang ch
python PPOCRLabel.py
```
......
......@@ -78,7 +78,7 @@ PPOCRLabel --lang ch # 启动
```bash
cd PaddleOCR/PPOCRLabel
python3 setup.py bdist_wheel
pip3 install dist/PPOCRLabel-1.0.0-py2.py3-none-any.whl -i https://mirror.baidu.com/pypi/simple
pip3 install dist/PPOCRLabel-1.0.2-py2.py3-none-any.whl -i https://mirror.baidu.com/pypi/simple
```
#### 1.2.3 通过Python脚本运行PPOCRLabel
......
......@@ -87,6 +87,10 @@ class Canvas(QWidget):
#initialisation for panning
self.pan_initial_pos = QPoint()
#lockedshapes related
self.lockedShapes = []
self.isInTheSameImage = False
def setDrawingColor(self, qColor):
self.drawingLineColor = qColor
self.drawingRectColor = qColor
......
因为 它太大了无法显示 source diff 。你可以改为 查看blob
......@@ -30,6 +30,7 @@ DEFAULT_SELECT_LINE_COLOR = QColor(255, 255, 255)
DEFAULT_SELECT_FILL_COLOR = QColor(0, 128, 255, 155)
DEFAULT_VERTEX_FILL_COLOR = QColor(0, 255, 0, 255)
DEFAULT_HVERTEX_FILL_COLOR = QColor(255, 0, 0)
DEFAULT_LOCK_COLOR = QColor(255, 0, 255)
MIN_Y_LABEL = 10
......@@ -57,7 +58,7 @@ class Shape(object):
self.selected = False
self.difficult = difficult
self.paintLabel = paintLabel
self.locked = False
self._highlightIndex = None
self._highlightMode = self.NEAR_VERTEX
self._highlightSettings = {
......
......@@ -60,7 +60,7 @@ class StringBundle:
def __createLookupFallbackList(self, localeStr):
resultPaths = []
basePath = "\strings" if os.name == 'nt' else ":/strings"
basePath = "\strings" if os.name == 'nt' else "/strings"
resultPaths.append(basePath)
if localeStr is not None:
# Don't follow standard BCP47. Simple fallback
......
......@@ -105,3 +105,5 @@ labelDialogOption=Pop-up Label Input Dialog
undo=Undo
undoLastPoint=Undo Last Point
autoSaveMode=Auto Export Label Mode
lockBox=Lock selected box/Unlock all box
lockBoxDetail=Lock selected box/Unlock all box
\ No newline at end of file
......@@ -105,3 +105,5 @@ labelDialogOption=弹出标记输入框
undo=撤销
undoLastPoint=撤销上个点
autoSaveMode=自动导出标记结果
lockBox=锁定框/解除锁定框
lockBoxDetail=若当前没有框处于锁定状态则锁定选中的框,若存在锁定框则解除所有锁定框的锁定状态
......@@ -33,7 +33,7 @@ setup(
package_dir={'PPOCRLabel': ''},
include_package_data=True,
entry_points={"console_scripts": ["PPOCRLabel= PPOCRLabel.PPOCRLabel:main"]},
version='1.0.0',
version='1.0.2',
install_requires=requirements,
license='Apache License 2.0',
description='PPOCRLabel is a semi-automatic graphic annotation tool suitable for OCR field, with built-in PPOCR model to automatically detect and re-recognize data. It is written in python3 and pyqt5, supporting rectangular box annotation and four-point annotation modes. Annotations can be directly used for the training of PPOCR detection and recognition models',
......
......@@ -39,7 +39,7 @@ PaddleOCR aims to create multilingual, awesome, leading, and practical OCR tools
- General PP-OCR server series models: detection (47.1M) + direction classifier (1.4M) + recognition (94.9M) = 143.4M
- Support Chinese, English, and digit recognition, vertical text recognition, and long text recognition
- Support multi-language recognition: about 80 languages like Korean, Japanese, German, French, etc
- document structurize system PP-Structure
- PP-Structure: a document structurize system
- support layout analysis and table recognition (support export to Excel)
- support key information extraction
- support DocVQA
......@@ -90,7 +90,7 @@ Mobile DEMO experience (based on EasyEdge and Paddle-Lite, supports iOS and Andr
| Model introduction | Model name | Recommended scene | Detection model | Direction classifier | Recognition model |
| ------------------------------------------------------------ | ---------------------------- | ----------------- | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ |
| Chinese and English ultra-lightweight PP-OCRv2 model(11.6M) | ch_PP-OCRv2_xx |Mobile & Server|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_distill_train.tar)| [inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/ch/ch_PP-OCRv2_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_train.tar)|
| Chinese and English ultra-lightweight PP-OCRv2 model(11.6M) | ch_PP-OCRv2_xx |Mobile & Server|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_distill_train.tar)| [inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_train.tar)|
| Chinese and English ultra-lightweight PP-OCR model (9.4M) | ch_ppocr_mobile_v2.0_xx | Mobile & server |[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_train.tar)|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) |[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_train.tar) |
| Chinese and English general PP-OCR model (143.4M) | ch_ppocr_server_v2.0_xx | Server |[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_train.tar) |[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_traingit.tar) |[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_train.tar) |
......@@ -102,11 +102,11 @@ For a new language request, please refer to [Guideline for new language_requests
## Tutorials
- [Environment Preparation](./doc/doc_en/environment_en.md)
- [Quick Start](./doc/doc_en/quickstart_en.md)
- [PaddleOCR Overview and Installation](./doc/doc_en/paddleOCR_overview_en.md)
- [PaddleOCR Overview and Project Clone](./doc/doc_en/paddleOCR_overview_en.md)
- PP-OCR Industry Landing: from Training to Deployment
- [PP-OCR Model and Configuration](./doc/doc_en/models_and_config_en.md)
- [PP-OCR Model Zoo](./doc/doc_en/models_en.md)
- [PP-OCR Model Download](./doc/doc_en/models_list_en.md)
- [Python Inference for PP-OCR Model Library](./doc/doc_en/inference_ppocr_en.md)
- [Python Inference for PP-OCR Model Zoo](./doc/doc_en/inference_ppocr_en.md)
- [PP-OCR Training](./doc/doc_en/training_en.md)
- [Text Detection](./doc/doc_en/detection_en.md)
- [Text Recognition](./doc/doc_en/recognition_en.md)
......
......@@ -54,8 +54,7 @@ PaddleOCR旨在打造一套丰富、领先、且实用的OCR工具库,助力
- 加入社区:微信扫描下方二维码加入官方交流群,与各行各业开发者充分交流,期待您的加入。
- 社区贡献:[社区贡献](./doc/doc_ch/thirdparty.md)文档中包含了社区用户**使用PaddleOCR开发的各种工具、应用**以及**为PaddleOCR贡献的功能、优化的文档与代码**等,是官方为社区开发者打造的荣誉墙、也是帮助优质项目宣传的广播站。如果您的OCR项目未被收集在文档中,可根据文档说明与我们联系。最新社区贡献可查看[此处](#社区贡献)
- 社区常规赛:作为社区贡献的具体承载形式,社区常规赛是面向OCR开发者的积分赛事。首届社区常规赛与《动手学OCR · 十讲》课程联合推广,课程详情可参考[链接](https://aistudio.baidu.com/aistudio/course/introduce/25207),课程奖励与作业说明可参考[链接](https://github.com/PaddlePaddle/PaddleOCR/issues/4982)
- 社区常规赛:作为社区贡献的具体承载形式,社区常规赛是面向OCR开发者的积分赛事。首届社区常规赛与[《动手学OCR · 十讲》课程](https://aistudio.baidu.com/aistudio/course/introduce/25207)联合推广。社区常规赛的赛题详情与报名方法可参考[链接](https://github.com/PaddlePaddle/PaddleOCR/issues/4982)
<div align="center">
<img src="https://raw.githubusercontent.com/PaddlePaddle/PaddleOCR/dygraph/doc/joinus.PNG" width = "200" height = "200" />
......@@ -64,22 +63,33 @@ PaddleOCR旨在打造一套丰富、领先、且实用的OCR工具库,助力
## 零代码体验
- 在线网站体验:超轻量PP-OCR mobile模型体验地址:https://www.paddlepaddle.org.cn/hub/scene/ocr
- 移动端:[安装包DEMO下载地址](https://ai.baidu.com/easyedge/app/openSource?from=paddlelite)(基于EasyEdge和Paddle-Lite, 支持iOS和Android系统)
<a name="模型下载"></a>
## PP-OCR系列模型列表(更新中)
| 模型简介 | 模型名称 | 推荐场景 | 检测模型 | 方向分类器 | 识别模型 |
| ------------------------------------- | ----------------------- | --------------- | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ |
| 中英文超轻量PP-OCRv2模型(13.0M) | ch_PP-OCRv2_xx | 移动端&服务器端 | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_distill_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_train.tar) |
| 中英文超轻量PP-OCR mobile模型(9.4M) | ch_ppocr_mobile_v2.0_xx | 移动端&服务器端 | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_pre.tar) |
| 中英文通用PP-OCR server模型(143.4M) | ch_ppocr_server_v2.0_xx | 服务器端 | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_pre.tar) |
更多模型下载(包括多语言),可以参考[PP-OCR 系列模型下载](./doc/doc_ch/models_list.md)
## 文档教程
- [运行环境准备](./doc/doc_ch/environment.md)
- [快速开始(中英文/多语言/文档分析)](./doc/doc_ch/quickstart.md)
- [PaddleOCR全景图与项目克隆](./doc/doc_ch/paddleOCR_overview.md)
- PP-OCR产业落地:从训练到部署
- [PP-OCR模型与配置文件](./doc/doc_ch/models_and_config.md)
- [PP-OCR模型](./doc/doc_ch/models.md)
- [PP-OCR模型下载](./doc/doc_ch/models_list.md)
- [PP-OCR模型库快速推理](./doc/doc_ch/inference_ppocr.md)
- [PP-OCR模型训练](./doc/doc_ch/training.md)
- [文本检测](./doc/doc_ch/detection.md)
- [文本识别](./doc/doc_ch/recognition.md)
- [文本方向分类器](./doc/doc_ch/angle_class.md)
- [知识蒸馏](./doc/doc_ch/knowledge_distillation.md)
- [配置文件内容与生成](./doc/doc_ch/config.md)
- PP-OCR模型推理部署
- [基于C++预测引擎推理](./deploy/cpp_infer/readme.md)
......@@ -121,7 +131,7 @@ PaddleOCR旨在打造一套丰富、领先、且实用的OCR工具库,助力
</div>
[1] PP-OCR是一个实用的超轻量OCR系统。主要由DB文本检测、检测框矫正和CRNN文本识别三部分组成。该系统从骨干网络选择和调整、预测头部的设计、数据增强、学习率变换策略、正则化参数选择、预训练模型使用以及模型自动裁剪量化8个方面,采用19个有效策略,对各个模块的模型进行效果调优和瘦身(如绿框所示),最终得到整体大小为3.5M的超轻量中英文OCR和2.8M的英文数字OCR。更多细节请参考PP-OCR技术方案 https://arxiv.org/abs/2009.09941
[2] PP-OCRv2在PP-OCR的基础上,进一步在5个方面重点优化,检测模型采用CML协同互学习知识蒸馏策略和CopyPaste数据增广策略;识别模型采用LCNet轻量级骨干网络、UDML 改进知识蒸馏策略和Enhanced CTC loss损失函数改进(如上图红框所示),进一步在推理速度和预测效果上取得明显提升。更多细节请参考PP-OCRv2[技术报告](https://arxiv.org/abs/2109.03144)
[2] PP-OCRv2在PP-OCR的基础上,进一步在5个方面重点优化,检测模型采用CML协同互学习知识蒸馏策略和CopyPaste数据增广策略;识别模型采用LCNet轻量级骨干网络、UDML 改进知识蒸馏策略和[Enhanced CTC loss](./doc/doc_ch/enhanced_ctc_loss.md)损失函数改进(如上图红框所示),进一步在推理速度和预测效果上取得明显提升。更多细节请参考PP-OCRv2[技术报告](https://arxiv.org/abs/2109.03144)
<a name="效果展示"></a>
......
......@@ -21,6 +21,7 @@ Architecture:
model_type: det
Models:
Teacher:
pretrained: ./pretrain_models/ch_ppocr_server_v2.0_det_train/best_accuracy
freeze_params: true
return_all_feats: false
model_type: det
......@@ -36,6 +37,7 @@ Architecture:
name: DBHead
k: 50
Student:
pretrained:
freeze_params: false
return_all_feats: false
model_type: det
......@@ -52,6 +54,7 @@ Architecture:
name: DBHead
k: 50
Student2:
pretrained:
freeze_params: false
return_all_feats: false
model_type: det
......
......@@ -18,6 +18,7 @@ Global:
Architecture:
name: DistillationModel
algorithm: Distillation
model_type: det
Models:
Student:
pretrained: ./pretrain_models/MobileNetV3_large_x0_5_pretrained
......
......@@ -18,6 +18,7 @@ Global:
Architecture:
name: DistillationModel
algorithm: Distillation
model_type: det
Models:
Student:
pretrained: ./pretrain_models/MobileNetV3_large_x0_5_pretrained
......
......@@ -18,8 +18,8 @@ python3.7 -m pip install paddle2onnx
- 安装 ONNX
```
# 建议安装 1.4.0 版本,可根据环境更换版本号
python3.7 -m pip install onnxruntime==1.4.0
# 建议安装 1.9.0 版本,可根据环境更换版本号
python3.7 -m pip install onnxruntime==1.9.0
```
## 2. 模型转换
......@@ -47,13 +47,15 @@ paddle2onnx --model_dir=./inference/ch_ppocr_mobile_v2.0_det_infer/ \
--params_filename=inference.pdiparams \
--save_file=./inference/det_mobile_onnx/model.onnx \
--opset_version=10 \
--input_shape_dict="{'x': [-1, 3, -1, -1]}" \
--enable_onnx_checker=True
```
执行完毕后,ONNX 模型会被保存在 `./inference/det_mobile_onnx/` 路径下
* 注意:以下几个模型暂不支持转换为 ONNX 模型:
NRTR、SAR、RARE、SRN
* 注意:对于OCR模型,转化过程中必须采用动态shape的形式,即加入选项--input_shape_dict="{'x': [-1, 3, -1, -1]}",否则预测结果可能与直接使用Paddle预测有细微不同。
另外,以下几个模型暂不支持转换为 ONNX 模型:
NRTR、SAR、RARE、SRN
## 3. onnx 预测
......@@ -72,5 +74,3 @@ root INFO: 1.jpg [[[291, 295], [334, 292], [348, 844], [305, 847]], [[344, 296]
The predict time of ../../doc/imgs/1.jpg: 0.06162881851196289
The visualized image saved in ./inference_results/det_res_1.jpg
```
* 注意:ONNX暂时不支持变长预测,需要将输入resize到固定输入,预测结果可能与直接使用Paddle预测有细微不同。
......@@ -57,7 +57,7 @@ PaddleOCR基于动态图开源的文本识别算法列表:
- [x] SAR([paper](https://arxiv.org/abs/1811.00751v2))
- [x] SEED([paper](https://arxiv.org/pdf/2005.10977.pdf))
参考[DTRB][3](https://arxiv.org/abs/1904.01906)文字识别训练和评估流程,使用MJSynth和SynthText两个文字识别数据集训练,在IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE数据集上进行评估,算法效果如下:
参考[DTRB](https://arxiv.org/abs/1904.01906)[3]文字识别训练和评估流程,使用MJSynth和SynthText两个文字识别数据集训练,在IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE数据集上进行评估,算法效果如下:
|模型|骨干网络|Avg Accuracy|模型存储命名|下载链接|
|---|---|---|---|---|
......
......@@ -146,14 +146,14 @@ PaddleOCR欢迎大家向repo中积极贡献代码,下面给出一些贡献代
-`远程仓库` Clone到本地
```
# 拉取develop分支的代码
# 拉取dygraph分支的代码
git clone https://github.com/{your_name}/PaddleOCR.git -b dygraph
cd PaddleOCR
```
> 多数情况下clone失败是由于网络原因,请稍后重试或配置代理
#### 3.2.2 和 `远程仓库` 建立连接
#### 3.2.2 通过Token方式登录与建立连接
首先查看当前 `远程仓库` 的信息。
......@@ -163,7 +163,24 @@ git remote -v
# origin https://github.com/{your_name}/PaddleOCR.git (push)
```
只有clone的 `远程仓库` 的信息,也就是自己用户名下的 PaddleOCR,接下来我们创建一个原始 PaddleOCR 仓库的远程主机,命名为 upstream。
只有clone的 `远程仓库` 的信息,也就是自己用户名下的 PaddleOCR。由于Github的登录方式变化,需要通过Token的方式重新配置 `远程仓库` 的地址。生成Token的方式如下:
1. 找到个人访问令牌(token):在Github页面右上角点击自己的头像,然后依次选择 Settings --> Developer settings --> Personal access tokens
2. 点击 Generate new token:在Note中填入token名称,例如’paddle‘。在Select scopes选择repo(必选)、admin:repo_hook、delete_repo等,可根据自身需要勾选。然后点击Generate token生成token。最后复制生成的token。
删除原始的origin配置
```
git remote rm origin
```
将remote分支改成 `https://oauth2:{token}@github.com/{your_name}/PaddleOCR.git`。例如:如果token值为12345,你的用户名为PPOCR,则运行下方命令
```
git remote add origin https://oauth2:12345@github.com/PPOCR/PaddleOCR.git
```
这样我们就与自己的 `远程仓库` 建立了连接。接下来我们创建一个原始 PaddleOCR 仓库的远程主机,命名为 upstream。
```
git remote add upstream https://github.com/PaddlePaddle/PaddleOCR.git
......@@ -172,8 +189,8 @@ git remote add upstream https://github.com/PaddlePaddle/PaddleOCR.git
使用 `git remote -v` 查看当前 `远程仓库` 的信息,输出如下,发现包括了origin和upstream 2个 `远程仓库`
```
origin https://github.com/{your_name}/PaddleOCR.git (fetch)
origin https://github.com/{your_name}/PaddleOCR.git (push)
origin https://oauth2:{token}@github.com/{your_name}/PaddleOCR.git (fetch)
origin https://oauth2:{token}@github.com/{your_name}/PaddleOCR.git (push)
upstream https://github.com/PaddlePaddle/PaddleOCR.git (fetch)
upstream https://github.com/PaddlePaddle/PaddleOCR.git (push)
```
......@@ -182,21 +199,22 @@ upstream https://github.com/PaddlePaddle/PaddleOCR.git (push)
#### 3.2.3 创建本地分支
可以基于当前分支创建新的本地分支,命令如下
首先获取 upstream 的最新代码,然后基于上游仓库 (upstream)的dygraph创建new_branch分支
```
git checkout -b new_branch
git fetch upstream
git checkout -b new_branch upstream/dygraph
```
也可以基于远程或者上游的分支创建新的分支,命令如下。
```
# 基于用户远程仓库(origin)的develop创建new_branch分支
git checkout -b new_branch origin/develop
# 基于上游远程仓库(upstream)的develop创建new_branch分支
# 如果需要从upstream创建新的分支,需要首先使用git fetch upstream获取上游代码
git checkout -b new_branch upstream/develop
```
> 如果对于新Fork的PaddleOCR项目,用户远程仓库(origin)与上游(upstream)仓库的分支更新情况相同,也可以基于origin仓库的默认分支或指定分支创建新的本地分支,命令如下。
>
> ```
> # 基于用户远程仓库(origin)的dygraph创建new_branch分支
> git checkout -b new_branch origin/dygraph
>
> # 基于用户远程仓库(origin)的默认分支创建new_branch分支
> git checkout -b new_branch
> ```
最终会显示切换到新的分支,输出信息如下
......@@ -205,6 +223,8 @@ Branch new_branch set up to track remote branch develop from upstream.
Switched to a new branch 'new_branch'
```
切换分支之后即可在此分支上进行文件改动
#### 3.2.4 使用pre-commit勾子
Paddle 开发人员使用 pre-commit 工具来管理 Git 预提交钩子。 它可以帮助我们格式化源代码(C++,Python),在提交(commit)前自动检查一些基本事宜(如每个文件只有一个 EOL,Git 中不要添加大文件等)。
......@@ -234,23 +254,15 @@ pre-commit
![img](../precommit_pass.png)
使用下面的命令完成提交。
提交修改,并写明修改内容("your commit info")
```
git commit -m "your commit info"
```
#### 3.2.6 保持本地仓库最新
获取 upstream 的最新代码并更新当前分支。这里的upstream来自于2.2节的`和远程仓库建立连接`部分。
#### 3.2.6 Push到远程仓库
```
git fetch upstream
# 如果是希望提交到其他分支,则需要从upstream的其他分支pull代码,这里是develop
git pull upstream develop
```
#### 3.2.7 push到远程仓库
使用push命令将修改的commit提交到 `远程仓库`
```
git push origin new_branch
......@@ -258,7 +270,7 @@ git push origin new_branch
#### 3.2.7 提交Pull Request
点击new pull request,选择本地分支和目标分支,如下图所示。在PR的描述说明中,填写该PR所完成的功能。接下来等待review,如果有需要修改的地方,参照上述步骤更新 origin 中的对应分支即可。
打开自己的远程仓库界面,选择提交的分支。点击new pull request或contribute进入PR界面。选择本地分支和目标分支,如下图所示。在PR的描述说明中,填写该PR所完成的功能。接下来等待review,如果有需要修改的地方,参照上述步骤更新 origin 中的对应分支即可。
![banner](../pr.png)
......@@ -285,8 +297,8 @@ git push origin new_branch
- 删除本地分支
```
# 切换到develop分支,否则无法删除当前分支
git checkout develop
# 切换到dygraph分支,否则无法删除当前分支
git checkout dygraph
# 删除new_branch分支
git branch -D new_branch
......@@ -310,7 +322,6 @@ git push origin new_branch
- 请注意每个commit的名称:应能反映当前commit的内容,不能太随意。
3)如果解决了某个Issue的问题,请在该Pull Request的第一个评论框中加上:fix #issue_number,这样当该Pull Request被合并后,会自动关闭对应的Issue。关键词包括:close, closes, closed, fix, fixes, fixed, resolve, resolves, resolved,请选择合适的词汇。详细可参考[Closing issues via commit messages](https://help.github.com/articles/closing-issues-via-commit-messages)
此外,在回复评审人意见时,请您遵守以下约定:
......
......@@ -78,11 +78,11 @@ json.dumps编码前的图像标注信息是包含多个字典的list,字典中
cd PaddleOCR/
# 根据backbone的不同选择下载对应的预训练模型
# 下载MobileNetV3的预训练模型
wget -P ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV3_large_x0_5_pretrained.pdparams
wget -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/pretrained/MobileNetV3_large_x0_5_pretrained.pdparams
# 或,下载ResNet18_vd的预训练模型
wget -P ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNet18_vd_pretrained.pdparams
wget -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/pretrained/ResNet18_vd_pretrained.pdparams
# 或,下载ResNet50_vd的预训练模型
wget -P ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNet50_vd_ssld_pretrained.pdparams
wget -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/pretrained/ResNet50_vd_ssld_pretrained.pdparams
```
<a name="2-----"></a>
......
......@@ -527,7 +527,6 @@ PostProcess:
关于`DistillationDBPostProcess`更加具体的实现可以参考: [db_postprocess.py](../../ppocr/postprocess/db_postprocess.py#L195)
<a name="224"></a>
#### 2.2.4 蒸馏指标计算
......
# PP-OCR模型与配置文件
PP-OCR模型与配置文件一章主要补充一些OCR模型的基本概念、配置文件的内容与作用以便对模型后续的参数调整和训练中拥有更好的体验。
本章包含三个部分,首先在[PP-OCR模型下载](./models_list.md)中解释PP-OCR模型的类型概念,并提供所有模型的下载链接。然后在[配置文件内容与生成](./config.md)中详细说明调整PP-OCR模型所需的参数。最后的[模型库快速使用](./inference_ppocr.md)是对第一节PP-OCR模型库使用方法的介绍,可以通过Python推理引擎快速利用丰富的模型库模型获得测试结果。
------
下面我们首先了解一些OCR相关的基本概念:
- [1. OCR 简要介绍](#1-ocr-----)
* [1.1 OCR 检测模型基本概念](#11-ocr---------)
* [1.2 OCR 识别模型基本概念](#12-ocr---------)
* [1.3 PP-OCR模型](#13-pp-ocr--)
<a name="1-ocr-----"></a>
## 1. OCR 简要介绍
本节简要介绍OCR检测模型、识别模型的基本概念,并介绍PaddleOCR的PP-OCR模型。
OCR(Optical Character Recognition,光学字符识别)目前是文字识别的统称,已不限于文档或书本文字识别,更包括识别自然场景下的文字,又可以称为STR(Scene Text Recognition)。
OCR文字识别一般包括两个部分,文本检测和文本识别;文本检测首先利用检测算法检测到图像中的文本行;然后检测到的文本行用识别算法去识别到具体文字。
<a name="11-ocr---------"></a>
### 1.1 OCR 检测模型基本概念
文本检测就是要定位图像中的文字区域,然后通常以边界框的形式将单词或文本行标记出来。传统的文字检测算法多是通过手工提取特征的方式,特点是速度快,简单场景效果好,但是面对自然场景,效果会大打折扣。当前多是采用深度学习方法来做。
基于深度学习的文本检测算法可以大致分为以下几类:
1. 基于目标检测的方法;一般是预测得到文本框后,通过NMS筛选得到最终文本框,多是四点文本框,对弯曲文本场景效果不理想。典型算法为EAST、Text Box等方法。
2. 基于分割的方法;将文本行当成分割目标,然后通过分割结果构建外接文本框,可以处理弯曲文本,对于文本交叉场景问题效果不理想。典型算法为DB、PSENet等方法。
3. 混合目标检测和分割的方法;
<a name="12-ocr---------"></a>
### 1.2 OCR 识别模型基本概念
OCR识别算法的输入数据一般是文本行,背景信息不多,文字占据主要部分,识别算法目前可以分为两类算法:
1. 基于CTC的方法;即识别算法的文字预测模块是基于CTC的,常用的算法组合为CNN+RNN+CTC。目前也有一些算法尝试在网络中加入transformer模块等等。
2. 基于Attention的方法;即识别算法的文字预测模块是基于Attention的,常用算法组合是CNN+RNN+Attention。
<a name="13-pp-ocr--"></a>
### 1.3 PP-OCR模型
PaddleOCR 中集成了很多OCR算法,文本检测算法有DB、EAST、SAST等等,文本识别算法有CRNN、RARE、StarNet、Rosetta、SRN等算法。
其中PaddleOCR针对中英文自然场景通用OCR,推出了PP-OCR系列模型,PP-OCR模型由DB+CRNN算法组成,利用海量中文数据训练加上模型调优方法,在中文场景上具备较高的文本检测识别能力。并且PaddleOCR推出了高精度超轻量PP-OCRv2模型,检测模型仅3M,识别模型仅8.5M,利用[PaddleSlim](https://github.com/PaddlePaddle/PaddleSlim)的模型量化方法,可以在保持精度不降低的情况下,将检测模型压缩到0.8M,识别压缩到3M,更加适用于移动端部署场景。
......@@ -143,8 +143,10 @@ PaddleOCR主要聚焦通用OCR,如果有垂类需求,您可以用PaddleOCR+
具体的训练教程可点击下方链接跳转:
\- [文本检测模型训练](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.3/doc/doc_ch/detection.md)
- [文本检测模型训练](./detection.md)
\- [文本识别模型训练](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.3/doc/doc_ch/recognition.md)
- [文本识别模型训练](./recognition.md)
- [文本方向分类器训练](./angle_class.md)
- [知识蒸馏](./knowledge_distillation.md)
\- [文本方向分类器训练](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.3/doc/doc_ch/angle_class.md)
......@@ -67,11 +67,11 @@ And the responding download link of backbone pretrain weights can be found in (h
```shell
cd PaddleOCR/
# Download the pre-trained model of MobileNetV3
wget -P ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV3_large_x0_5_pretrained.pdparams
wget -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/pretrained/MobileNetV3_large_x0_5_pretrained.pdparams
# or, download the pre-trained model of ResNet18_vd
wget -P ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNet18_vd_pretrained.pdparams
wget -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/pretrained/ResNet18_vd_pretrained.pdparams
# or, download the pre-trained model of ResNet50_vd
wget -P ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNet50_vd_ssld_pretrained.pdparams
wget -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/pretrained/ResNet50_vd_ssld_pretrained.pdparams
```
......
# PP-OCR Model and Configuration
The chapter on PP-OCR model and configuration file mainly adds some basic concepts of OCR model and the content and role of configuration file to have a better experience in the subsequent parameter adjustment and training of the model.
This chapter contains three parts. Firstly, [PP-OCR Model Download](. /models_list_en.md) explains the concept of PP-OCR model types and provides links to download all models. Then in [Yml Configuration](. /config_en.md) details the parameters needed to fine-tune the PP-OCR models. The final [Python Inference for PP-OCR Model Library](. /inference_ppocr_en.md) is an introduction to the use of the PP-OCR model library in the first section, which can quickly utilize the rich model library models to obtain test results through the Python inference engine.
------
Let's first understand some basic concepts.
- [INTRODUCTION ABOUT OCR](#introduction-about-ocr)
* [BASIC CONCEPTS OF OCR DETECTION MODEL](#basic-concepts-of-ocr-detection-model)
* [Basic concepts of OCR recognition model](#basic-concepts-of-ocr-recognition-model)
* [PP-OCR model](#pp-ocr-model)
* [And a table of contents](#and-a-table-of-contents)
* [On the right](#on-the-right)
## 1. INTRODUCTION ABOUT OCR
This section briefly introduces the basic concepts of OCR detection model and recognition model, and introduces PaddleOCR's PP-OCR model.
OCR (Optical Character Recognition, Optical Character Recognition) is currently the general term for text recognition. It is not limited to document or book text recognition, but also includes recognizing text in natural scenes. It can also be called STR (Scene Text Recognition).
OCR text recognition generally includes two parts, text detection and text recognition. The text detection module first uses detection algorithms to detect text lines in the image. And then the recognition algorithm to identify the specific text in the text line.
### 1.1 BASIC CONCEPTS OF OCR DETECTION MODEL
Text detection can locate the text area in the image, and then usually mark the word or text line in the form of a bounding box. Traditional text detection algorithms mostly extract features manually, which are characterized by fast speed and good effect in simple scenes, but the effect will be greatly reduced when faced with natural scenes. Currently, deep learning methods are mostly used.
Text detection algorithms based on deep learning can be roughly divided into the following categories:
1. Method based on target detection. Generally, after the text box is predicted, the final text box is filtered through NMS, which is mostly four-point text box, which is not ideal for curved text scenes. Typical algorithms are methods such as EAST and Text Box.
2. Method based on text segmentation. The text line is regarded as the segmentation target, and then the external text box is constructed through the segmentation result, which can handle curved text, and the effect is not ideal for the text cross scene problem. Typical algorithms are DB, PSENet and other methods.
3. Hybrid target detection and segmentation method.
### 1.2 Basic concepts of OCR recognition model
The input of the OCR recognition algorithm is generally text lines images which has less background information, and the text information occupies the main part. The recognition algorithm can be divided into two types of algorithms:
1. CTC-based method. The text prediction module of the recognition algorithm is based on CTC, and the commonly used algorithm combination is CNN+RNN+CTC. There are also some algorithms that try to add transformer modules to the network and so on.
2. Attention-based method. The text prediction module of the recognition algorithm is based on Attention, and the commonly used algorithm combination is CNN+RNN+Attention.
### 1.3 PP-OCR model
PaddleOCR integrates many OCR algorithms, text detection algorithms include DB, EAST, SAST, etc., text recognition algorithms include CRNN, RARE, StarNet, Rosetta, SRN and other algorithms.
Among them, PaddleOCR has released the PP-OCR series model for the general OCR in Chinese and English natural scenes. The PP-OCR model is composed of the DB+CRNN algorithm. It uses massive Chinese data training and model tuning methods to have high text detection and recognition capabilities in Chinese scenes. And PaddleOCR has launched a high-precision and ultra-lightweight PP-OCRv2 model. The detection model is only 3M, and the recognition model is only 8.5M. Using [PaddleSlim](https://github.com/PaddlePaddle/PaddleSlim)'s model quantification method, the detection model can be compressed to 0.8M without reducing the accuracy. The recognition is compressed to 3M, which is more suitable for mobile deployment scenarios.
doc/joinus.PNG

188.5 KB | W: | H:

doc/joinus.PNG

193.0 KB | W: | H:

doc/joinus.PNG
doc/joinus.PNG
doc/joinus.PNG
doc/joinus.PNG
  • 2-up
  • Swipe
  • Onion skin
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"![](https://ai-studio-static-online.cdn.bcebos.com/72b2077605dd49b78f7f647d6821d10231f6bc52d7ed463da451a6a0bd1fc5ff)\n",
"*注:以上图片来自网络*\n",
"\n",
"# 1. OCR技术背景\n",
"## 1.1 OCR技术的应用场景\n",
"\n",
"* **<font color=red>OCR是什么</font>**\n",
"\n",
"OCR(Optical Character Recognition,光学字符识别)是计算机视觉重要方向之一。传统定义的OCR一般面向扫描文档类对象,现在我们常说的OCR一般指场景文字识别(Scene Text Recognition,STR),主要面向自然场景,如下图中所示的牌匾等各种自然场景可见的文字。\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/c87c0e6f6c0a42cdbc552a4f973c1b0217c369194c1243558753896f3e66032c)\n",
"<center>图1 文档场景文字识别 VS. 自然场景文字识别</center>\n",
"\n",
"<br>\n",
"\n",
"* **<font color=red>OCR有哪些应用场景?</font>**\n",
"\n",
"OCR技术有着丰富的应用场景,一类典型的场景是日常生活中广泛应用的面向垂类的结构化文本识别,比如车牌识别、银行卡信息识别、身份证信息识别、火车票信息识别等等。这些小垂类的共同特点是格式固定,因此非常适合使用OCR技术进行自动化,可以极大的减轻人力成本,提升效率。\n",
"\n",
"这种面向垂类的结构化文本识别是目前ocr应用最广泛、并且技术相对较成熟的场景。\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/56e0df91d0d34443aacb17c9a1c5c186608ee675092648a693503df7fe45e535)\n",
"<center>图2 OCR技术的应用场景</center>\n",
"\n",
"除了面向垂类的结构化文本识别,通用OCR技术也有广泛的应用,并且常常和其他技术结合完成多模态任务,例如在视频场景中,经常使用OCR技术进行字幕自动翻译、内容安全监控等等,或者与视觉特征相结合,完成视频理解、视频搜索等任务。\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/ca2341a51eb242ee8e1afe121ce3ebbc87a113cef1b643ed9bba92d0c8ee4f0f)\n",
"<center>图3 多模态场景中的通用OCR</center>\n",
"\n",
"## 1.2 OCR技术挑战\n",
"OCR的技术难点可以分为算法层和应用层两方面。\n",
"\n",
"* **<font color=red>算法层</font>**\n",
"\n",
"OCR丰富的应用场景,决定了它会存在很多技术难点。这里给出了常见的8种问题:\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/a56831fbf0c449fe9156a893002cadfe110ccfea835b4d90854a7ce4b1df2a4f)\n",
"<center>图4 OCR算法层技术难点</center>\n",
"\n",
"这些问题给文本检测和文本识别都带来了巨大的技术挑战,可以看到,这些挑战主要都是面向自然场景,目前学术界的研究也主要聚焦在自然场景,OCR领域在学术上的常用数据集也都是自然场景。针对这些问题的研究很多,相对来说,识别比检测面临更大的挑战。\n",
"\n",
"* **<font color=red>应用层</font>**\n",
"\n",
"在实际应用中,尤其是在广泛的通用场景下,除了上一节总结的仿射变换、尺度问题、光照不足、拍摄模糊等算法层面的技术难点,OCR技术还面临两大落地难点:\n",
"1. **海量数据要求OCR能够实时处理。** OCR应用常对接海量数据,我们要求或希望数据能够得到实时处理,模型的速度做到实时是一个不小的挑战。\n",
"2. **端侧应用要求OCR模型足够轻量,识别速度足够快。** OCR应用常部署在移动端或嵌入式硬件,端侧OCR应用一般有两种模式:上传到服务器 vs. 端侧直接识别,考虑到上传到服务器的方式对网络有要求,实时性较低,并且请求量过大时服务器压力大,以及数据传输的安全性问题,我们希望能够直接在端侧完成OCR识别,而端侧的存储空间和计算能力有限,因此对OCR模型的大小和预测速度有很高的要求。\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/5bafdc3da1614c41a95ae39a2c36632f95e2893031a64929b9f49d4a4985cd2d)\n",
"<center>图5 OCR应用层技术难点</center>\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"# 2. OCR前沿算法\n",
"\n",
"虽然OCR是一个相对具体的任务,但涉及了多方面的技术,包括文本检测、文本识别、端到端文本识别、文档分析等等。学术上关于OCR各项相关技术的研究层出不穷,下文将简要介绍OCR任务中的几种关键技术的相关工作。\n",
"\n",
"## 2.1 文本检测\n",
"\n",
"文本检测的任务是定位出输入图像中的文字区域。近年来学术界关于文本检测的研究非常丰富,一类方法将文本检测视为目标检测中的一个特定场景,基于通用目标检测算法进行改进适配,如TextBoxes[1]基于一阶段目标检测器SSD[2]算法,调整目标框使之适合极端长宽比的文本行,CTPN[3]则是基于Faster RCNN[4]架构改进而来。但是文本检测与目标检测在目标信息以及任务本身上仍存在一些区别,如文本一般长宽比较大,往往呈“条状”,文本行之间可能比较密集,弯曲文本等,因此又衍生了很多专用于文本检测的算法,如EAST[5]、PSENet[6]、DBNet[7]等等。\n",
"\n",
"<center><img src=\"https://ai-studio-static-online.cdn.bcebos.com/548b50212935402abb2e671c158c204737c2c64b9464442a8f65192c8a31b44d\" width=\"500\"></center>\n",
"<center>图6 文本检测任务示例</center>\n",
"\n",
"<br>\n",
"\n",
"目前较为流行的文本检测算法可以大致分为**基于回归**和**基于分割**的两大类文本检测算法,也有一些算法将二者相结合。基于回归的算法借鉴通用物体检测算法,通过设定anchor回归检测框,或者直接做像素回归,这类方法对规则形状文本检测效果较好,但是对不规则形状的文本检测效果会相对差一些,比如CTPN[3]对水平文本的检测效果较好,但对倾斜、弯曲文本的检测效果较差,SegLink[8]对长文本比较好,但对分布稀疏的文本效果较差;基于分割的算法引入了Mask-RCNN[9],这类算法在各种场景、对各种形状文本的检测效果都可以达到一个更高的水平,但缺点就是后处理一般会比较复杂,因此常常存在速度问题,并且无法解决重叠文本的检测问题。\n",
"\n",
"<center><img src=\"https://ai-studio-static-online.cdn.bcebos.com/4f4ea65578384900909efff93d0b7386e86ece144d8c4677b7bc94b4f0337cfb\" width=\"800\"></center>\n",
"<center>图7 文本检测算法概览</center>\n",
"\n",
"<br>\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/473ba28cd0274d568f90eb8ca9e78864d994f3ebffe6419cb638e193c607b7b3)|![](https://ai-studio-static-online.cdn.bcebos.com/e968807b3ed9493cab20f3be0d8dc07b0baf8b8cecb24ee99ccda9d3a241832a)|![](https://ai-studio-static-online.cdn.bcebos.com/53b9e85ce46645c08481d7d7377720f5eea5ac30e37e4e9c9930e1f26b02e278)\n",
"|---|---|---|\n",
"<center>图8 (左)基于回归的CTPN[3]算法优化anchor (中)基于分割的DB[7]算法优化后处理 (右)回归+分割的SAST[10]算法</center>\n",
"\n",
"<br>\n",
"\n",
"文本检测相关技术将在第二章进行详细解读和实战。\n",
"\n",
"## 2.2 文本识别\n",
"\n",
"文本识别的任务是识别出图像中的文字内容,一般输入来自于文本检测得到的文本框截取出的图像文字区域。文本识别一般可以根据待识别文本形状分为**规则文本识别**和**不规则文本识别**两大类。规则文本主要指印刷字体、扫描文本等,文本大致处在水平线位置;不规则文本往往不在水平位置,存在弯曲、遮挡、模糊等问题。不规则文本场景具有很大的挑战性,也是目前文本识别领域的主要研究方向。\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/b292f21e50c94debab7496d4ced96a93774a8525c12346f49cb151bde2a58fe8)\n",
"<center>图9 (左)规则文本 VS. (右)不规则文本</center>\n",
"\n",
"<br>\n",
"\n",
"规则文本识别的算法根据解码方式的不同可以大致分为基于CTC和Sequence2Sequence两种,将网络学习到的序列特征 转化为 最终的识别结果 的处理方式不同。基于CTC的算法以经典的CRNN[11]为代表。\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/403ca85c59d344f88d3b1229ca14b1e90c5c73c9f1d248b7aa94103f9d0af597)\n",
"<center>图10 基于CTC的识别算法 VS. 基于Attention的识别算法</center>\n",
"\n",
"不规则文本的识别算法相比更为丰富,如STAR-Net[12]等方法通过加入TPS等矫正模块,将不规则文本矫正为规则的矩形后再进行识别;RARE[13]等基于Attention的方法增强了对序列之间各部分相关性的关注;基于分割的方法将文本行的各字符作为独立个体,相比与对整个文本行做矫正后识别,识别分割出的单个字符更加容易;此外,随着近年来Transfomer[14]的快速发展和在各类任务中的有效性验证,也出现了一批基于Transformer的文本识别算法,这类方法利用transformer结构解决CNN在长依赖建模上的局限性问题,也取得了不错的效果。\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/0fa30c3789424473ad9be1c87a4f742c1db69e3defb64651906e5334ed9571a8)\n",
"<center>图11 基于字符分割的识别算法[15]</center>\n",
"\n",
"<br>\n",
"\n",
"文本识别相关技术将在第三章进行详细解读和实战。\n",
"\n",
"## 2.3 文档结构化识别\n",
"\n",
"传统意义上的OCR技术可以解决文字的检测和识别需求,但在实际应用场景中,最终需要获取的往往是结构化的信息,如身份证、发票的信息格式化抽取,表格的结构化识别等等,多在快递单据抽取、合同内容比对、金融保理单信息比对、物流业单据识别等场景下应用。OCR结果+后处理是一种常用的结构化方案,但流程往往比较复杂,并且后处理需要精细设计,泛化性也比较差。在OCR技术逐渐成熟、结构化信息抽取需求日益旺盛的背景下,版面分析、表格识别、关键信息提取等关于智能文档分析的各种技术受到了越来越多的关注和研究。\n",
"\n",
"* **版面分析**\n",
"\n",
"版面分析(Layout Analysis)主要是对文档图像进行内容分类,类别一般可分为纯文本、标题、表格、图片等。现有方法一般将文档中不同的板式当做不同的目标进行检测或分割,如Soto Carlos[16]在目标检测算法Faster R-CNN的基础上,结合上下文信息并利用文档内容的固有位置信息来提高区域检测性能;Sarkar Mausoom[17]等人提出了一种基于先验的分割机制,在非常高的分辨率的图像上训练文档分割模型,解决了过度缩小原始图像导致的密集区域不同结构无法区分进而合并的问题。\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/dedb212e8972497998685ff51af7bfe03fdea57f6acd450281ad100807086e1a)\n",
"<center>图12 版面分析任务示意图</center>\n",
"\n",
"<br>\n",
"\n",
"* **表格识别**\n",
"\n",
"表格识别(Table Recognition)的任务就是将文档里的表格信息进行识别和转换到excel文件中。文本图像中表格种类和样式复杂多样,例如不同的行列合并,不同的内容文本类型等,除此之外文档的样式和拍摄时的光照环境等都为表格识别带来了极大的挑战。这些挑战使得表格识别一直是文档理解领域的研究难点。\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/47119a2a2f9a45788390d6506f90d5de7449738008aa4c0ab619b18f37bd8d57)\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/22ca5749441441e69dc0eaeb670832a5d0ae0ce522f34731be7d609a2d36e8c1)\n",
"<center>图13 表格识别任务示意图</center>\n",
"\n",
"<br>\n",
"\n",
"表格识别的方法种类较为丰富,早期的基于启发式规则的传统算法,如Kieninger[18]等人提出的T-Rect等算法,一般通过人工设计规则,连通域检测分析处理;近年来随着深度学习的发展,开始涌现一些基于CNN的表格结构识别算法,如Siddiqui Shoaib Ahmed[19]等人提出的DeepTabStR,Raja Sachin[20]等人提出的TabStruct-Net等;此外,随着图神经网络(Graph Neural Network)的兴起,也有一些研究者尝试将图神经网络应用到表格结构识别问题上,基于图神经网络,将表格识别看作图重建问题,如Xue Wenyuan[21]等人提出的TGRNet;基于端到端的方法直接使用网络完成表格结构的HTML表示输出,端到端的方法大多采用Seq2Seq方法来完成表格结构的预测,如一些基于Attention或Transformer的方法,如TableMaster[22]。\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/a9a3c91898c84f03b382583859526c4b451ace862dbc4a15838f5dde4d0ea657)\n",
"<center>图14 表格识别方法示意图</center>\n",
"\n",
"<br>\n",
"\n",
"* **关键信息提取**\n",
"\n",
"关键信息提取(Key Information Extraction,KIE)是Document VQA中的一个重要任务,主要从图像中提取所需要的关键信息,如从身份证中提取出姓名和公民身份号码信息,这类信息的种类往往在特定任务下是固定的,但是在不同任务间是不同的。\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/8af011647bb4464f80d07f3efeac469baed27c8185ef4c4883a19f40e8ba91f5)\n",
"<center>图15 DocVQA任务示意图</center>\n",
"\n",
"<br>\n",
"\n",
"KIE通常分为两个子任务进行研究:\n",
"\n",
"- SER: 语义实体识别 (Semantic Entity Recognition),对每一个检测到的文本进行分类,如将其分为姓名,身份证。如下图中的黑色框和红色框。\n",
"- RE: 关系抽取 (Relation Extraction),对每一个检测到的文本进行分类,如将其分为问题和的答案。然后对每一个问题找到对应的答案。如下图中的红色框和黑色框分别代表问题和答案,黄色线代表问题和答案之间的对应关系。\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/2f1bc1a3e4a341ab9552bbf5f6c2be71ba78d7d65da64818b776efe0691e310b)\n",
"<center>图16 ser与re任务</center>\n",
"\n",
"<br>\n",
"\n",
"一般的KIE方法基于命名实体识别(Named Entity Recognition,NER)[4]来研究,但是这类方法只利用了图像中的文本信息,缺少对视觉和结构信息的使用,因此精度不高。在此基础上,近几年的方法都开始将视觉和结构信息与文本信息融合到一起,按照对多模态信息进行融合时所采用的的原理可以将这些方法分为下面四种:\n",
"\n",
"- 基于Grid的方法\n",
"- 基于Token的方法\n",
"- 基于GCN的方法\n",
"- 基于End to End 的方法\n",
"\n",
"<br>\n",
"\n",
"文档分析相关技术将在第六章进行详细解读和实战。\n",
"\n",
"## 2.4 其他相关技术\n",
"\n",
"前面主要介绍了OCR领域的三种关键技术:文本检测、文本识别、文档结构化识别,更多其他OCR相关前沿技术介绍,包括端到端文本识别、OCR中的图像预处理技术、OCR数据合成等,可参考教程第七章和第八章。\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"# 3. OCR技术的产业实践\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/3d5f18f7598f405884fa2fab041c95ce415af40712e9489996747f9d122c3d90)\n",
"\n",
"> 你是小王,该怎么办? \n",
"> 1. 我不会,我不行,我不干了😭\n",
"> 2. 建议老板找外包公司或者商业化方案,反正花老板的钱😊\n",
"> 3. 网上找找类似项目,面向Github编程😏\n",
"\n",
"<br>\n",
"\n",
"OCR技术最终还是要落到产业实践当中。虽然学术上关于OCR技术的研究很多,OCR技术的商业化应用相比于其他AI技术也已经相对成熟,但在实际的产业应用中,还是存在一些难点与挑战。下文将从技术和产业实践两个角度进行分析。\n",
"\n",
"\n",
"## 3.1 产业实践难点\n",
"\n",
"在实际的产业实践中,开发者常常需要依托开源社区资源启动或推进项目,而开发者使用开源模型又往往面临三大难题:\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/7e5e79240b9c4f13b675b56bc12edf540f159c922bf24e3cbc4a0635a356c7f9)\n",
"<center>图17 OCR技术产业实践三大难题</center>\n",
"\n",
"**1. 找不到、选不出**\n",
"\n",
"开源社区资源丰富,但是信息不对称导致开发者并不能高效地解决痛点问题。一方面,开源社区资源过于丰富,开发者面对一项需求,无法快速从海量的代码仓库中找到匹配业务需求的项目,即存在“找不到”的问题;另一方面,在算法选型时,英文公开数据集上的指标,无法给开发者常常面对的中文场景提供直接的参考,逐个算法验证需要耗费大量时间和人力,且不能保证选出最合适的算法,即“选不出”。\n",
"\n",
"**2. 不适用产业场景**\n",
"\n",
"开源社区中的工作往往更多地偏向效果优化,如学术论文代码开源或复现,一般更侧重算法效果,平衡考虑模型大小和速度的工作相比就少很多,而模型大小和预测耗时在产业实践中是两项不容忽视的指标,其重要程度不亚于模型效果。无论是移动端和服务器端,待识别的图像数目往往非常多,都希望模型更小,精度更高,预测速度更快。GPU太贵,最好使用CPU跑起来更经济。在满足业务需求的前提下,模型越轻量占用的资源越少。\n",
"\n",
"**3. 优化难、训练部署问题多**\n",
"\n",
"直接使用开源算法或模型一般无法直接满足业务需求,实际业务场景中,OCR面临的问题多种多样,业务场景个性化往往需要自定义数据集重新训练,现有的开源项目上,实验各种优化方法的成本较高。此外,OCR应用场景十分丰富,服务端和各种移动端设备上都有着广泛的应用需求,硬件环境多样化就需要支持丰富的部署方式,而开源社区的项目更侧重算法和模型,在预测部署这部分明显支撑不足。要把OCR技术从论文上的算法做到技术落地应用,对开发者的算法和工程能力都有很高的要求。\n",
"\n",
"## 3.2 产业级OCR开发套件PaddleOCR\n",
"\n",
"OCR产业实践需要一套完整全流程的解决方案,来加快研发进度,节约宝贵的研发时间。也就是说,超轻量模型及其全流程解决方案,尤其对于算力、存储空间有限的移动端、嵌入式设备而言,可以说是刚需。\n",
"\n",
"在此背景下,产业级OCR开发套件[PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR)应运而生。\n",
"\n",
"PaddleOCR的建设思路从用户画像和需求出发,依托飞桨核心框架,精选并复现丰富的前沿算法,基于复现的算法研发更适用于产业落地的PP特色模型,并打通训推一体,提供多种预测部署方式,满足实际应用的不同需求场景。\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/e09929b4a31e44f9b5e3d542d12411332669d2e1a21d45ad88b1dd91142ec86c)\n",
"<center>图18 PaddleOCR开发套件全景图</center>\n",
"\n",
"<br>\n",
"\n",
"从全景图可以看出,PaddleOCR依托于飞桨核心框架,在模型算法、预训练模型库、工业级部署等层面均提供了丰富的解决方案,并且提供了数据合成、半自动数据标注工具,满足开发者的数据生产需求。\n",
"\n",
"**在模型算法层面**,PaddleOCR对**文字检测识别**和**文档结构化分析**两类任务分别提供了解决方案。在文字检测识别方面,PaddleOCR复现或开源了4种文本检测算法、8种文本识别算法、1种端到端文本识别算法,并在此基础上研发了PP-OCR系列的通用文本检测识别解决方案;在文档结构化分析方面,PaddleOCR提供了版面分析、表格识别、关键信息抽取、命名实体识别等算法,并在此基础提出了PP-Structure文档分析解决方案。丰富的精选算法可以满足开发者不同业务场景的需求,代码框架的统一也方便开发者进行不同算法的优化和性能对比。\n",
"\n",
"**在预训练模型库层面**,基于PP-OCR和PP-Structure解决方案,PaddleOCR研发并开源了适用于产业实践的PP系列特色模型,包括通用、超轻量和多语言的文本检测识别模型,和复杂文档分析模型。PP系列特色模型均在原始算法上进行了深度优化,使其在效果和性能上均能达到产业实用级别,开发者既可以直接应用于业务场景,也可以用业务数据进行简单的finetune,便可以轻松研发出适用于自己业务需求的“实用模型”。\n",
"\n",
"**在工业级部署层面**,PaddleOCR提供了基于Paddle Inference的服务器端预测方案,基于Paddle Serving的服务化部署方案,以及基于Paddle-Lite的端侧部署方案,满足不同硬件环境下的部署需求,同时提供了基于PaddleSlim的模型压缩方案,可以进一步压缩模型大小。以上部署方式都完成了训推一体全流程打通,以保障开发者可以高效部署,稳定可靠。\n",
"\n",
"**在数据工具层面**,PaddleOCR提供了半自动数据标注工具PPOCRLabel和数据合成工具Style-Text,助力开发者更方便的生产模型训练所需的数据集和标注信息。PPOCRLabel作为业界首个开源的半自动OCR数据标注工具,针对标注过程枯燥繁琐、机械性高,大量训练数据所需人工标记,时间金钱成本昂贵的问题,内置PP-OCR模型实现预标注+人工校验的标注模式,可以极大提升标注效率,节省人力成本。数据合成工具Style-Text主要解决实际场景真实数据严重不足,传统合成算法无法合成文字风格(字体、颜色、间距、背景)的问题,只需要少许目标场景图像,就可以批量合成大量与目标场景风格相近的文本图像。\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/90a358d6a62c49b7b8db47e18c77878c60f80cf9c81541bfa3befea68d9dbc0f)\n",
"<center>图19 PPOCRLabel使用示意图</center>\n",
"\n",
"<br>\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/b63b10bc530c42bea3d3b923da6000f1cfef006d7eec4ff3bdc0439bd9c333c9)\n",
"<center>图20 Style-Text合成效果示例</center>\n",
"\n",
"<br>\n",
"\n",
"### 3.2.1 PP-OCR与PP-Structrue\n",
"\n",
"PP系列特色模型是飞桨各视觉开发套件针对产业实践需求进行深度优化的模型,力求速度与精度平衡。PaddleOCR中的PP系列特色模型包括针对文字检测识别任务的PP-OCR系列模型和针对文档分析的PP-Structure系列模型。\n",
"\n",
"**(1)PP-OCR中英文模型**\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/3372558042044d43983b815069e1e43cb84432b993ed400f946976e75bd51f38)\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/f0a0b936382c42dd8809e98759b4c84434d79386606b4d5b8a86416db6dbaeee)\n",
"<center>图21 PP-OCR中英文模型识别结果示例</center>\n",
"\n",
"<br>\n",
"\n",
"PP-OCR中英文模型采用的典型的两阶段OCR算法,即检测模型+识别模型的组成方式,具体的算法框架如下:\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/8af1371b5e3c486bb90a041903200c7c666c8bbc98c245dc802ff8c4da98617e)\n",
"<center>图22 PP-OCR系统pipeline示意图</center>\n",
"\n",
"<br>\n",
"\n",
"可以看到,除输入输出外,PP-OCR核心框架包含了3个模块,分别是:文本检测模块、检测框矫正模块、文本识别模块。\n",
"- 文本检测模块:核心是一个基于[DB](https://arxiv.org/abs/1911.08947)检测算法训练的文本检测模型,检测出图像中的文字区域;\n",
"- 检测框矫正模块:将检测到的文本框输入检测框矫正模块,在这一阶段,将四点表示的文本框矫正为矩形框,方便后续进行文本识别,另一方面会进行文本方向判断和校正,例如如果判断文本行是倒立的情况,则会进行转正,该功能通过训练一个文本方向分类器实现;\n",
"- 文本识别模块:最后文本识别模块对矫正后的检测框进行文本识别,得到每个文本框内的文字内容,PP-OCR中使用的经典文本识别算法[CRNN](https://arxiv.org/abs/1507.05717)。\n",
"\n",
"PaddleOCR先后推出了PP-OCR[23]和PP-OCRv2[24]模型。\n",
"\n",
"PP-OCR模型分为mobile版(轻量版)和server版(通用版),其中mobile版模型主要基于轻量级骨干网络MobileNetV3进行优化,优化后模型(检测模型+文本方向分类模型+识别模型)大小仅8.1M,CPU上平均单张图像预测耗时350ms,T4 GPU上约110ms,裁剪量化后,可在精度不变的情况下进一步压缩到3.5M,便于端侧部署,在骁龙855上测试预测耗时仅260ms。更多的PP-OCR评估数据可参考[benchmark](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.2/doc/doc_ch/benchmark.md)。\n",
"\n",
"PP-OCRv2保持了PP-OCR的整体框架,主要做了效果上的进一步策略优化。提升包括3个方面:\n",
"- 在模型效果上,相对于PP-OCR mobile版本提升超7%;\n",
"- 在速度上,相对于PP-OCR server版本提升超过220%;\n",
"- 在模型大小上,11.6M的总大小,服务器端和移动端都可以轻松部署。\n",
"\n",
"PP-OCR和PP-OCRv2的具体优化策略将在第四章中进行详细解读。\n",
"\n",
"除了中英文模型,PaddleOCR也基于不同的数据集训练并开源了英文数字模型、多语言识别模型,以上均为超轻量模型,适用于不同的语言场景。\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/5978652a826647b98344cf61aa1c2027662af989b73e4a0e917d83718422eeb0)\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/1a8a8e24b5a440d388dae767adf0ea9c049335b04e964abbb176f58c5b028d7e)\n",
"<center>图23 PP-OCR的英文数字模型和多语言模型识别效果示意图</center>\n",
"\n",
"<br>\n",
"\n",
"**(2)PP-Structure文档分析模型**\n",
"\n",
"PP-Structure支持版面分析(layout analysis)、表格识别(table recognition)、文档视觉问答(DocVQA)三种子任务。\n",
"\n",
"PP-Structure核心功能点如下:\n",
"- 支持对图片形式的文档进行版面分析,可以划分文字、标题、表格、图片以及列表5类区域(与Layout-Parser联合使用)\n",
"- 支持文字、标题、图片以及列表区域提取为文字字段(与PP-OCR联合使用)\n",
"- 支持表格区域进行结构化分析,最终结果输出Excel文件\n",
"- 支持Python whl包和命令行两种方式,简单易用\n",
"- 支持版面分析和表格结构化两类任务自定义训练\n",
"- 支持VQA任务-SER和RE\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/129708c265644dbc90d6c8f7db224b3a6f11f37bb586463a82e7ccb50bcc2e76)\n",
"<center>图24 PP-Structure系统示意图(本图仅含版面分析+表格识别)</center>\n",
"\n",
"<br>\n",
"\n",
"PP-Structure的具体方案将在第六章中进行详细解读。\n",
"\n",
"### 3.2.2 工业级部署方案\n",
"\n",
"飞桨支持全流程、全场景推理部署,模型来源主要分为三种,第一种使用PaddlePaddle API构建网络结构进行训练所得,第二种是基于飞桨套件系列,飞桨套件提供了丰富的模型库、简洁易用的API,具备开箱即用,包括视觉模型库PaddleCV、智能语音库PaddleSpeech以及自然语言处理库PaddleNLP等,第三种采用X2Paddle工具从第三方框架(PyTorh、ONNX、TensorFlow等)产出的模型。\n",
"\n",
"飞桨模型可以选用PaddleSlim工具进行压缩、量化以及蒸馏,支持五种部署方案,分别为服务化Paddle Serving、服务端/云端Paddle Inference、移动端/边缘端Paddle Lite、网页前端Paddle.js, 对于Paddle不支持的硬件,比如MCU、地平线、鲲云等国产芯片,可以借助Paddle2ONNX转化为支持ONNX的第三方框架。\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/c9ffe78e7db14e4eb103e7f393a16fbf2ab438540250474a8e0e7adc4aeb7ee0)\n",
"<center>图25 飞桨支持部署方式</center>\n",
"\n",
"<br>\n",
"\n",
"Paddle Inference支持服务端和云端部署,具备高性能与通用性,针对不同平台和不同应用场景进行了深度的适配和优化,Paddle Inference是飞桨的原生推理库,保证模型在服务器端即训即用,快速部署,适用于高性能硬件上使用多种应用语言环境部署算法复杂的模型,硬件覆盖x86 CPU、Nvidia GPU、以及百度昆仑XPU、华为昇腾等AI加速器。\n",
"\n",
"Paddle Lite 是端侧推理引擎,具有轻量化和高性能特点,针对端侧设备和各应用场景进行了深度的设配和优化。当前支持Android、IOS、嵌入式Linux设备、macOS 等多个平台,硬件覆盖ARM CPU和GPU、X86 CPU和新硬件如百度昆仑、华为昇腾与麒麟、瑞芯微等。\n",
"\n",
"Paddle Serving是一套高性能服务框架,旨在帮助用户几个步骤快速将模型在云端服务化部署。目前Paddle Serving支持自定义前后处理、模型组合、模型热加载更新、多机多卡多模型、分布式推理、K8S部署、安全网关和模型加密部署、支持多语言多客户端访问等功能,Paddle Serving官方还提供了包括PaddleOCR在内的40多种模型的部署示例,以帮助用户更快上手。\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/4d8063d74194434ea9b7c9f81c7fbdfd2131e13770124d2e99c1b9670f12e019)\n",
"<center>图26 飞桨支持部署方式</center>\n",
"\n",
"<br>\n",
"\n",
"以上部署方案将在第五章中基于PP-OCRv2模型进行详细解读与实战。"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"# 4. 总结\n",
"\n",
"本节首先介绍了OCR技术的应用场景和前沿算法,然后分析了OCR技术在产业实践中的难点与三大挑战。\n",
"\n",
"本教程后续章节内容安排如下:\n",
"\n",
"* 第二、三章分别介绍检测、识别技术并实践;\n",
"* 第四章介绍PP-OCR优化策略; \n",
"* 第五章进行预测部署实战; \n",
"* 第六章介绍文档结构化; \n",
"* 第七章介绍端到端、数据预处理、数据合成等其他OCR相关算法; \n",
"* 第八章介绍OCR相关数据集和数据合成工具。\n",
"\n",
"# 参考文献\n",
"\n",
"[1] Liao, Minghui, et al. \"Textboxes: A fast text detector with a single deep neural network.\" Thirty-first AAAI conference on artificial intelligence. 2017.\n",
"\n",
"[2] Liu W, Anguelov D, Erhan D, et al. Ssd: Single shot multibox detector[C]//European conference on computer vision. Springer, Cham, 2016: 21-37.\n",
"\n",
"[3] Tian, Zhi, et al. \"Detecting text in natural image with connectionist text proposal network.\" European conference on computer vision. Springer, Cham, 2016.\n",
"\n",
"[4] Ren S, He K, Girshick R, et al. Faster r-cnn: Towards real-time object detection with region proposal networks[J]. Advances in neural information processing systems, 2015, 28: 91-99.\n",
"\n",
"[5] Zhou, Xinyu, et al. \"East: an efficient and accurate scene text detector.\" Proceedings of the IEEE conference on Computer Vision and Pattern Recognition. 2017.\n",
"\n",
"[6] Wang, Wenhai, et al. \"Shape robust text detection with progressive scale expansion network.\" Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition. 2019.\n",
"\n",
"[7] Liao, Minghui, et al. \"Real-time scene text detection with differentiable binarization.\" Proceedings of the AAAI Conference on Artificial Intelligence. Vol. 34. No. 07. 2020.\n",
"\n",
"[8] Deng, Dan, et al. \"Pixellink: Detecting scene text via instance segmentation.\" Proceedings of the AAAI Conference on Artificial Intelligence. Vol. 32. No. 1. 2018.\n",
"\n",
"[9] He K, Gkioxari G, Dollár P, et al. Mask r-cnn[C]//Proceedings of the IEEE international conference on computer vision. 2017: 2961-2969.\n",
"\n",
"[10] Wang P, Zhang C, Qi F, et al. A single-shot arbitrarily-shaped text detector based on context attended multi-task \n",
"learning[C]//Proceedings of the 27th ACM international conference on multimedia. 2019: 1277-1285.\n",
"\n",
"[11] Shi, B., Bai, X., & Yao, C. (2016). An end-to-end trainable neural network for image-based sequence recognition and its application to scene text recognition. IEEE transactions on pattern analysis and machine intelligence, 39(11), 2298-2304.\n",
"\n",
"[12] Star-Net Max Jaderberg, Karen Simonyan, Andrew Zisserman, et al. Spa- tial transformer networks. In Advances in neural information processing systems, pages 2017–2025, 2015.\n",
"\n",
"[13] Shi, B., Wang, X., Lyu, P., Yao, C., & Bai, X. (2016). Robust scene text recognition with automatic rectification. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 4168-4176).\n",
"\n",
"[14] Sheng, F., Chen, Z., & Xu, B. (2019, September). NRTR: A no-recurrence sequence-to-sequence model for scene text recognition. In 2019 International Conference on Document Analysis and Recognition (ICDAR) (pp. 781-786). IEEE.\n",
"\n",
"[15] Lyu P, Liao M, Yao C, et al. Mask textspotter: An end-to-end trainable neural network for spotting text with arbitrary shapes[C]//Proceedings of the European Conference on Computer Vision (ECCV). 2018: 67-83.\n",
"\n",
"[16] Soto C, Yoo S. Visual detection with context for document layout analysis[C]//Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP). 2019: 3464-3470.\n",
"\n",
"[17] Sarkar M, Aggarwal M, Jain A, et al. Document Structure Extraction using Prior based High Resolution Hierarchical Semantic Segmentation[C]//European Conference on Computer Vision. Springer, Cham, 2020: 649-666.\n",
"\n",
"[18] Kieninger T, Dengel A. A paper-to-HTML table converting system[C]//Proceedings of document analysis systems (DAS). 1998, 98: 356-365.\n",
"\n",
"[19] Siddiqui S A, Fateh I A, Rizvi S T R, et al. Deeptabstr: Deep learning based table structure recognition[C]//2019 International Conference on Document Analysis and Recognition (ICDAR). IEEE, 2019: 1403-1409.\n",
"\n",
"[20] Raja S, Mondal A, Jawahar C V. Table structure recognition using top-down and bottom-up cues[C]//European Conference on Computer Vision. Springer, Cham, 2020: 70-86.\n",
"\n",
"[21] Xue W, Yu B, Wang W, et al. TGRNet: A Table Graph Reconstruction Network for Table Structure Recognition[J]. arXiv preprint arXiv:2106.10598, 2021.\n",
"\n",
"[22] Ye J, Qi X, He Y, et al. PingAn-VCGroup's Solution for ICDAR 2021 Competition on Scientific Literature Parsing Task B: Table Recognition to HTML[J]. arXiv preprint arXiv:2105.01848, 2021.\n",
"\n",
"[23] Du Y, Li C, Guo R, et al. PP-OCR: A practical ultra lightweight OCR system[J]. arXiv preprint arXiv:2009.09941, 2020.\n",
"\n",
"[24] Du Y, Li C, Guo R, et al. PP-OCRv2: Bag of Tricks for Ultra Lightweight OCR System[J]. arXiv preprint arXiv:2109.03144, 2021.\n",
"\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "py35-paddle1.2.0"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
因为 它太大了无法显示 source diff 。你可以改为 查看blob
因为 它太大了无法显示 source diff 。你可以改为 查看blob
因为 它太大了无法显示 source diff 。你可以改为 查看blob
因为 它太大了无法显示 source diff 。你可以改为 查看blob
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"# 文档分析技术\n",
"\n",
"本章主要介绍文档分析技术的理论知识,包括背景介绍、算法分类和对应思路。\n",
"\n",
"通过本章的学习,你可以掌握:\n",
"\n",
"1. 版面分析的分类和典型思想\n",
"2. 表格识别的分类和典型思想\n",
"3. 信息提取的分类和典型思想"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"\n",
"作为信息承载工具,文档的不同布局代表了各种不同的信息,如清单和身份证。文档分析是一个从文档中阅读、解释和提取信息的自动化过程。文档分析常包含以下几个研究方向:\n",
"\n",
"1. 版面分析模块: 将每个文档页面划分为不同的内容区域。该模块不仅可用于划定相关区域和不相关区域,还可用于对其识别的内容类型进行分类。\n",
"2. 光学字符识别 (OCR) 模块: 定位并识别文档中存在的所有文本。\n",
"3. 表格识别模块: 将文档里的表格信息进行识别和转换到excel文件中。\n",
"4. 信息提取模块: 借助OCR结果和图像信息来理解和识别文档中表达的特定信息或信息之间的关系。\n",
"\n",
"由于OCR模块在前面的章节中进行了详细的介绍,接下来将针对上面版面分析、表格识别和信息提取三个模块做单独的介绍。对于每一个模块,会介绍该模块的经典或常用方法以及数据集。"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"## 1. 版面分析\n",
"\n",
"### 1.1 背景介绍\n",
"\n",
"版面分析主要用于文档检索,关键信息提取,内容分类等,其任务主要是对文档图像进行内容分类,内容的类别一般可分为纯文本、标题、表格、图片和列表等。但是文档布局、格式的多样性和复杂性,文档图像质量差,大规模的带标注的数据集的缺少等问题使得版面分析仍然是一个很有挑战性的任务。\n",
"版面分析任务的可视化如下图所示:\n",
"<center class=\"img\">\n",
"<img src=\"https://ai-studio-static-online.cdn.bcebos.com/2510dc76c66c49b8af079f25d08a9dcba726b2ce53d14c8ba5cd9bd57acecf19\" width=\"1000\"/></center>\n",
"<center>图 1:版面分析效果图</center>\n",
"\n",
"现有的解决办法一般是基于目标检测或语义分割的方法,这类方法基将文档中不同的板式当做不同的目标进行检测或分割。\n",
"\n",
"一些代表性论文被划分为上述两个类别中,具体如下表所示:\n",
"\n",
"| 类别 | 主要论文 |\n",
"| ---------------- | -------- |\n",
"| 基于目标检测的方法 | [Visual Detection with Context](https://aclanthology.org/D19-1348.pdf),[Object Detection](https://arxiv.org/pdf/2003.13197v1.pdf),[VSR](https://arxiv.org/pdf/2105.06220v1.pdf)|\n",
"| 基于语义分割的方法 |[Semantic Segmentation](https://arxiv.org/pdf/1911.12170v2.pdf) |\n",
"\n",
"\n",
"### 1.2 基于目标检测的方法 \n",
"\n",
"Soto Carlos[1]在目标检测算法Faster R-CNN的基础上,结合上下文信息并利用文档内容的固有位置信息来提高区域检测性能。Li Kai [2]等人也提出了一种基于目标检测的文档分析方法,通过引入了特征金字塔对齐模块,区域对齐模块,渲染层对齐模块来解决跨域的问题,这三个模块相互补充,并从一般的图像角度和特定的文档图像角度调整域,从而解决了大型标记训练数据集与目标域不同的问题。下图是一个基于目标检测Faster R-CNN算法进行版面分析的流程图。\n",
"\n",
"<center class=\"img\">\n",
"<img src=\"https://ai-studio-static-online.cdn.bcebos.com/d396e0d6183243898c0961250ee7a49bc536677079fb4ba2ac87c653f5472f01\" width=\"800\"/></center>\n",
"<center>图 2:基于Faster R-CNN的版面分析流程图</center>\n",
"\n",
"### 1.3 基于语义分割的方法 \n",
"\n",
"Sarkar Mausoom[3]等人提出了一种基于先验的分割机制,在非常高的分辨率的图像上训练文档分割模型,解决了过度缩小原始图像导致的密集区域不同结构无法区分进而合并的问题。Zhang Peng[4]等人结合文档中的视觉、语义和关系提出了一个统一的框架VSR(Vision, Semantics and Relations)用于文档布局分析,该框架使用一个双流网络来提取特定模态的视觉和语义特征,并通过自适应聚合模块自适应地融合这些特征,解决了现有基于CV的方法不同模态融合效率低下和布局组件之间缺乏关系建模的局限性。\n",
"\n",
"### 1.4 数据集\n",
"\n",
"虽然现有的方法可以在一定程度上解决版面分析任务,但是该类方法依赖于大量有标记的训练数据。最近也有很多数据集被提出用于文档分析任务。\n",
"\n",
"1. PubLayNet[5]: 该数据集包含50万张文档图像,其中40万用于训练,5万用于验证,5万用于测试,共标记了表格,文本,图像,标题和列表五种形式\n",
"2. HJDataset[6]: 数据集包含2271张文档图像, 除了内容区域的边界框和掩码之外,它还包括布局元素的层次结构和阅读顺序。\n",
"\n",
"PubLayNet数据集样例如下图所示:\n",
"<center class=\"two\">\n",
"<img src=\"https://ai-studio-static-online.cdn.bcebos.com/4b153117c9384f98a0ce5a6c6e7c205a4b1c57e95c894ccb9688cbfc94e68a1c\" width=\"400\"/><img src=\"https://ai-studio-static-online.cdn.bcebos.com/efb9faea39554760b280f9e0e70631d2915399fa97774eecaa44ee84411c4994\" width=\"400\"/>\n",
"</center>\n",
"<center>图 3:PubLayNet样例</center>\n",
"参考文献:\n",
"\n",
"[1]:Soto C, Yoo S. Visual detection with context for document layout analysis[C]//Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP). 2019: 3464-3470.\n",
"\n",
"[2]:Li K, Wigington C, Tensmeyer C, et al. Cross-domain document object detection: Benchmark suite and method[C]//Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition. 2020: 12915-12924.\n",
"\n",
"[3]:Sarkar M, Aggarwal M, Jain A, et al. Document Structure Extraction using Prior based High Resolution Hierarchical Semantic Segmentation[C]//European Conference on Computer Vision. Springer, Cham, 2020: 649-666.\n",
"\n",
"[4]:Zhang P, Li C, Qiao L, et al. VSR: A Unified Framework for Document Layout Analysis combining Vision, Semantics and Relations[J]. arXiv preprint arXiv:2105.06220, 2021.\n",
"\n",
"[5]:Zhong X, Tang J, Yepes A J. Publaynet: largest dataset ever for document layout analysis[C]//2019 International Conference on Document Analysis and Recognition (ICDAR). IEEE, 2019: 1015-1022.\n",
"\n",
"[6]:Li M, Xu Y, Cui L, et al. DocBank: A benchmark dataset for document layout analysis[J]. arXiv preprint arXiv:2006.01038, 2020.\n",
"\n",
"[7]:Shen Z, Zhang K, Dell M. A large dataset of historical japanese documents with complex layouts[C]//Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops. 2020: 548-549."
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"## 2. 表格识别\n",
"\n",
"### 2.1 背景介绍\n",
"\n",
"表格是各类文档中常见的页面元素,随着各类文档的爆炸性增长,如何高效地从文档中找到表格并获取内容与结构信息即表格识别,成为了一个亟需解决的问题。表格识别的难点总结如下:\n",
"\n",
"1. 表格种类和样式复杂多样,例如*不同的行列合并,不同的内容文本类型*等。\n",
"2. 文档的样式本身的样式多样。\n",
"3. 拍摄时的光照环境等\n",
"\n",
"表格识别的任务就是将文档里的表格信息转换到excel文件中,任务可视化如下:\n",
"\n",
"<center class=\"img\">\n",
"<img src=\"https://ai-studio-static-online.cdn.bcebos.com/99faa017e28b4928a408573406870ecaa251b626e0e84ab685e4b6f06f601a5f\" width=\"1600\"/></center>\n",
"\n",
"\n",
"<center>图 4:表格识别示例图,其中左边为原图,右边为表格识别后的结果图,以Excel形式呈现</center>\n",
"\n",
"现有的表格识别算法根据表格结构重建的原理可以分为下面四大类:\n",
"1. 基于启发式规则的方法\n",
"2. 基于CNN的方法\n",
"3. 基于GCN的方法\n",
"4. 基于End to End的方法\n",
"\n",
"一些代表性论文被划分为上述四个类别中,具体如下表所示:\n",
"| 类别 | 思路 | 主要论文 |\n",
"| ---------------- | ---- | -------- |\n",
"|基于启发式规则的方法|人工设计规则,连通域检测分析处理|[T-Rect](https://www.researchgate.net/profile/Andreas-Dengel/publication/249657389_A_Paper-to-HTML_Table_Converting_System/links/0c9605322c9a67274d000000/A-Paper-to-HTML-Table-Converting-System.pdf),[pdf2table](https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.724.7272&rep=rep1&type=pdf)|\n",
"| 基于CNN的方法 | 目标检测,语义分割 | [CascadeTabNet](https://arxiv.org/pdf/2004.12629v2.pdf), [Multi-Type-TD-TSR](https://arxiv.org/pdf/2105.11021v1.pdf), [LGPMA](https://arxiv.org/pdf/2105.06224v2.pdf), [tabstruct-net](https://arxiv.org/pdf/2010.04565v1.pdf), [CDeC-Net](https://arxiv.org/pdf/2008.10831v1.pdf), [TableNet](https://arxiv.org/pdf/2001.01469v1.pdf), [TableSense](https://arxiv.org/pdf/2106.13500v1.pdf), [Deepdesrt](https://www.dfki.de/fileadmin/user_upload/import/9672_PID4966073.pdf), [Deeptabstr](https://www.dfki.de/fileadmin/user_upload/import/10649_DeepTabStR.pdf), [GTE](https://arxiv.org/pdf/2005.00589v2.pdf), [Cycle-CenterNet](https://arxiv.org/pdf/2109.02199v1.pdf), [FCN](https://www.researchgate.net/publication/339027294_Rethinking_Semantic_Segmentation_for_Table_Structure_Recognition_in_Documents)|\n",
"| 基于GCN的方法 | 基于图神经网络,将表格识别看作图重建问题 | [GNN](https://arxiv.org/pdf/1905.13391v2.pdf), [TGRNet](https://arxiv.org/pdf/2106.10598v3.pdf), [GraphTSR](https://arxiv.org/pdf/1908.04729v2.pdf)|\n",
"| 基于End to End的方法 | 利用attention机制 | [Table-Master](https://arxiv.org/pdf/2105.01848v1.pdf)|\n",
"\n",
"### 2.2 基于启发式规则的传统算法\n",
"早期的表格识别研究主要是基于启发式规则的方法。例如由Kieninger[1]等人提出的T-Rect系统使用自底向上的方法对文档图像进行连通域分析,然后按照定义的规则进行合并,得到逻辑文本块。而之后由Yildiz[2]等人提出的pdf2table则是第一个在PDF文档上进行表格识别的方法,它利用了PDF文件的一些特有信息(例如文字、绘制路径等图像文档中难以获取的信息)来协助表格识别。而在最近的工作中,Koci[3]等人将页面中的布局区域表示为图(Graph)的形式,然后使用了Remove and Conquer(RAC)算法从中将表格作为一个子图识别出来。\n",
"\n",
"\n",
"<center class=\"img\">\n",
"<img src=\"https://ai-studio-static-online.cdn.bcebos.com/66aeedb3f0924d80aee15f185e6799cc687b51fc20b74b98b338ca2ea25be3f3\" width=\"1000\"/></center>\n",
"<center>图 5:启发式算法示意图</center>\n",
"\n",
"### 2.3 基于深度学习CNN的方法\n",
"随着深度学习技术在计算机视觉、自然语言处理、语音处理等领域的飞速发展,研究者将深度学习技术应用到表格识别领域并取得了不错的效果。\n",
"\n",
"Siddiqui Shoaib Ahmed[12]等人在DeepTabStR算法中,将表格结构识别问题表述为对象检测问题,并利用可变形卷积来进更好的进行表格单元格的检测。Raja Sachin[6]等人提出TabStruct-Net将单元格检测和结构识别在视觉上结合起来进行表格结构识别,解决了现有方法由于表格布局发生较大变化而识别错误的问题,但是该方法无法处理行列出现较多空单元格的问题。\n",
"\n",
"\n",
"<center class=\"img\">\n",
"<img src=\"https://ai-studio-static-online.cdn.bcebos.com/838be28836444bc1835ac30a25613d8b045a1b5aedd44b258499fe9f93dd298f\" width=\"1600\"/></center>\n",
"<center>图 6:基于深度学习CNN的算法示意图</center>\n",
"\n",
"<center class=\"img\">\n",
"<img src=\"https://ai-studio-static-online.cdn.bcebos.com/4c40dda737bd44b09a533e1b1dd2e4c6a90ceea083bf4238b7f3c7b21087f409\" width=\"1600\"/></center>\n",
"<center>图 7:基于深度学习CNN的算法错误示例</center>\n",
"\n",
"之前的表格结构识别方法一般是从不同粒度(行/列、文本区域)的元素开始处理问题,容易忽略空单元格合并的问题。Qiao Liang[10]等人提出了一个新框架LGPMA,通过掩码重评分策略充分利用来自局部和全局特征的信息,进而可以获得更可靠的对齐单元格区域,最后引入了包括单元格匹配、空单元格搜索和空单元格合并的表格结构复原pipeline来处理表格结构识别问题。\n",
"\n",
"除了以上单独做表格识别的算法外,也有部分方法将表格检测和表格识别在一个模型里完成,Schreiber Sebastian[11]等人提出了DeepDeSRT,通过Faster RCNN进行表格检测,通过FCN语义分割模型用于表格结构行列检测,但是该方法是用两个独立的模型来解决这两个问题。Prasad Devashish[4]等人提出了一种基于端到端深度学习的方法CascadeTabNet,使用Cascade Mask R-CNN HRNet模型同时进行表格检测和结构识别,解决了以往方法使用独立的两个方法处理表格识别问题的不足。Paliwal Shubham[8]等人提出一种新颖的端到端深度多任务架构TableNet,用于表格检测和结构识别,同时在训练期间向TableNet添加额外的空间语义特征,进一步提高了模型性能。Zheng Xinyi[13]等人提出了表格识别的系统框架GTE,利用单元格检测网络来指导表格检测网络的训练,同时提出了一种层次网络和一种新的基于聚类的单元格结构识别算法,该框架可以接入到任何目标检测模型的后面,方便训练不同的表格识别算法。之前的研究主要集中在从扫描的PDF文档中解析具有简单布局的,对齐良好的表格图像,但是现实场景中的表格一般很复杂,可能存在严重变形,弯曲或者遮挡等问题,因此Long Rujiao[14]等人同时构造了一个现实复杂场景下的表格识别数据集WTW,并提出了一种Cycle-CenterNet方法,它利用循环配对模块优化和提出的新配对损失,将离散单元精确地分组到结构化表中,提高了表格识别的性能。\n",
"\n",
"<center class=\"img\">\n",
"<img src=\"https://ai-studio-static-online.cdn.bcebos.com/a01f714cbe1f42fc9c45c6658317d9d7da2cec9726844f6b9fa75e30cadc9f76\" width=\"1600\"/></center>\n",
"<center>图 8:端到端算法示意图</center>\n",
"\n",
"基于CNN的方法对跨行列的表格无法很好的处理,因此在后续的方法中,分为了两个研究方法来解决表格中跨行列的问题。\n",
"\n",
"### 2.4 基于深度学习GCN的方法\n",
"近些年来,随着图卷积神经网络(Graph Convolutional Network)的兴起,也有一些研究者尝试将图神经网络应用到表格结构识别问题上。Qasim Shah Rukh[20]等人将表格结构识别问题转换为与图神经网络兼容的图问题,并设计了一种新颖的可微架构,该架构既可以利用卷积神经网络提取特征的优点,也可以利用图神经网络顶点之间有效交互的优点,但是该方法只使用了单元格的位置特征,没有利用语义特征。Chi Zewen[19]等人提出了一种新颖的图神经网络GraphTSR,用于PDF文件中的表格结构识别,它以表格中的单元格为输入,然后通过利用图的边和节点相连的特性来预测单元格之间的关系来识别表格结构,一定程度上解决了跨行或者跨列的单元格识别问题。Xue Wenyuan[21]等人将表格结构识别的问题重新表述为表图重建,并提出了一种用于表格结构识别的端到端方法TGRNet,该方法包含单元格检测分支和单元格逻辑位置分支,这两个分支共同预测不同单元格的空间位置和逻辑位置,解决了之前方法没有关注单元格逻辑位置的问题。\n",
"\n",
"GraphTSR表格识别算法示意图:\n",
"\n",
"<center class=\"img\">\n",
"<img src=\"https://ai-studio-static-online.cdn.bcebos.com/8ff89661142045a8aef54f8a7a2c69b1d243f8269034406a9e66bee2149f730f\" width=\"1600\"/></center>\n",
"<center>图 9:GraphTSR表格识别算法示意图</center>\n",
"\n",
"### 2.5 基于端到端的方法\n",
"\n",
"和其他使用后处理完成表格结构的重建不同,基于端到端的方法直接使用网络完成表格结构的HTML表示输出\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/7865e58a83824facacfaa91bec12ccf834217cb706454dc5a0c165c203db79fb) | ![](https://ai-studio-static-online.cdn.bcebos.com/77d913b1b92f4a349b8f448e08ba78458d687eef4af142678a073830999f3edc))\n",
"---|---\n",
"图 10:端到端方法的输入输出|图 11:Image Caption示例\n",
"\n",
"端到端的方法大多采用Image Caption(看图说话)的Seq2Seq方法来完成表格结构的预测,如一些基于Attention或Transformer的方法。\n",
"\n",
"<center class=\"img\">\n",
"<img src=\"https://ai-studio-static-online.cdn.bcebos.com/3571280a9c364d3499a062e3edc724294fb5eaef8b38440991941e87f0af0c3b\" width=\"800\"/></center>\n",
"<center>图 12:Seq2Seq示意图</center>\n",
"\n",
"Ye Jiaquan[22]在TableMaster中通过改进基于Transformer的Master文字算法来得到表格结构输出模型。此外,还添加了一个分支进行框的坐标回归,作者并没有在最后一层将模型拆分为两个分支,而是在第一个 Transformer 解码层之后就将序列预测和框回归解耦为两个分支。其网络结构和原始Master网络的对比如下图所示:\n",
"\n",
"\n",
"<center class=\"img\">\n",
"<img src=\"https://ai-studio-static-online.cdn.bcebos.com/f573709447a848b4ba7c73a2e297f0304caaca57c5c94588aada1f4cd893946c\" width=\"800\"/></center>\n",
"<center>图 13:左:master网络图,右:TableMaster网络图</center>\n",
"\n",
"\n",
"### 2.6 数据集\n",
"\n",
"由于深度学习方法是数据驱动的方法,需要大量的标注数据对模型进行训练,而现有的数据集规模偏小也是一个重要的制约因素,因此也有一些数据集被提出。\n",
"\n",
"1. PubTabNet[16]: 包含568k表格图像和相应的结构化HTML表示。\n",
"2. PubMed Tables(PubTables-1M)[17]:表格结构识别数据集,包含高度详细的结构注释,460,589张pdf图像用于表格检测任务, 947,642张表格图像用于表格识别任务。\n",
"3. TableBank[18]: 表格检测和识别数据集,使用互联网上Word和Latex文档构建了包含417K高质量标注的表格数据。\n",
"4. SciTSR[19]: 表格结构识别数据集,图像大部分从论文中转换而来,其中包含来自PDF文件的15,000个表格及其相应的结构标签。\n",
"5. TabStructDB[12]: 包括1081个表格区域,这些区域用行和列信息密集标记。\n",
"6. WTW[14]: 大规模数据集场景表格检测识别数据集,该数据集包含各种变形,弯曲和遮挡等情况下的表格数据,共包含14,581 张图像。\n",
"\n",
"数据集示例\n",
"\n",
"<center class=\"img\">\n",
"<img src=\"https://ai-studio-static-online.cdn.bcebos.com/c9763df56e67434f97cd435100d50ded71ba66d9d4f04d7f8f896d613cdf02b0\" /></center>\n",
"<center>图 14:PubTables-1M数据集样例图</center>\n",
"\n",
"<center class=\"img\">\n",
"<img src=\"https://ai-studio-static-online.cdn.bcebos.com/64de203bbe584642a74f844ac4b61d1ec3c5a38cacb84443ac961fbcc54a66ce\" width=\"600\"/></center>\n",
"<center>图 15:WTW数据集样例图</center>\n",
"\n",
"\n",
"\n",
"参考文献\n",
"\n",
"[1]:Kieninger T, Dengel A. A paper-to-HTML table converting system[C]//Proceedings of document analysis systems (DAS). 1998, 98: 356-365.\n",
"\n",
"[2]:Yildiz B, Kaiser K, Miksch S. pdf2table: A method to extract table information from pdf files[C]//IICAI. 2005: 1773-1785.\n",
"\n",
"[3]:Koci E, Thiele M, Lehner W, et al. Table recognition in spreadsheets via a graph representation[C]//2018 13th IAPR International Workshop on Document Analysis Systems (DAS). IEEE, 2018: 139-144.\n",
"\n",
"[4]:Prasad D, Gadpal A, Kapadni K, et al. CascadeTabNet: An approach for end to end table detection and structure recognition from image-based documents[C]//Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops. 2020: 572-573.\n",
"\n",
"[5]:Fischer P, Smajic A, Abrami G, et al. Multi-Type-TD-TSR–Extracting Tables from Document Images Using a Multi-stage Pipeline for Table Detection and Table Structure Recognition: From OCR to Structured Table Representations[C]//German Conference on Artificial Intelligence (Künstliche Intelligenz). Springer, Cham, 2021: 95-108.\n",
"\n",
"[6]:Raja S, Mondal A, Jawahar C V. Table structure recognition using top-down and bottom-up cues[C]//European Conference on Computer Vision. Springer, Cham, 2020: 70-86.\n",
"\n",
"[7]:Agarwal M, Mondal A, Jawahar C V. Cdec-net: Composite deformable cascade network for table detection in document images[C]//2020 25th International Conference on Pattern Recognition (ICPR). IEEE, 2021: 9491-9498.\n",
"\n",
"[8]:Paliwal S S, Vishwanath D, Rahul R, et al. Tablenet: Deep learning model for end-to-end table detection and tabular data extraction from scanned document images[C]//2019 International Conference on Document Analysis and Recognition (ICDAR). IEEE, 2019: 128-133.\n",
"\n",
"[9]:Dong H, Liu S, Han S, et al. Tablesense: Spreadsheet table detection with convolutional neural networks[C]//Proceedings of the AAAI Conference on Artificial Intelligence. 2019, 33(01): 69-76.\n",
"\n",
"[10]:Qiao L, Li Z, Cheng Z, et al. LGPMA: Complicated Table Structure Recognition with Local and Global Pyramid Mask Alignment[J]. arXiv preprint arXiv:2105.06224, 2021.\n",
"\n",
"[11]:Schreiber S, Agne S, Wolf I, et al. Deepdesrt: Deep learning for detection and structure recognition of tables in document images[C]//2017 14th IAPR international conference on document analysis and recognition (ICDAR). IEEE, 2017, 1: 1162-1167.\n",
"\n",
"[12]:Siddiqui S A, Fateh I A, Rizvi S T R, et al. Deeptabstr: Deep learning based table structure recognition[C]//2019 International Conference on Document Analysis and Recognition (ICDAR). IEEE, 2019: 1403-1409.\n",
"\n",
"[13]:Zheng X, Burdick D, Popa L, et al. Global table extractor (gte): A framework for joint table identification and cell structure recognition using visual context[C]//Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision. 2021: 697-706.\n",
"\n",
"[14]:Long R, Wang W, Xue N, et al. Parsing Table Structures in the Wild[C]//Proceedings of the IEEE/CVF International Conference on Computer Vision. 2021: 944-952.\n",
"\n",
"[15]:Siddiqui S A, Khan P I, Dengel A, et al. Rethinking semantic segmentation for table structure recognition in documents[C]//2019 International Conference on Document Analysis and Recognition (ICDAR). IEEE, 2019: 1397-1402.\n",
"\n",
"[16]:Zhong X, ShafieiBavani E, Jimeno Yepes A. Image-based table recognition: data, model, and evaluation[C]//Computer Vision–ECCV 2020: 16th European Conference, Glasgow, UK, August 23–28, 2020, Proceedings, Part XXI 16. Springer International Publishing, 2020: 564-580.\n",
"\n",
"[17]:Smock B, Pesala R, Abraham R. PubTables-1M: Towards a universal dataset and metrics for training and evaluating table extraction models[J]. arXiv preprint arXiv:2110.00061, 2021.\n",
"\n",
"[18]:Li M, Cui L, Huang S, et al. Tablebank: Table benchmark for image-based table detection and recognition[C]//Proceedings of the 12th Language Resources and Evaluation Conference. 2020: 1918-1925.\n",
"\n",
"[19]:Chi Z, Huang H, Xu H D, et al. Complicated table structure recognition[J]. arXiv preprint arXiv:1908.04729, 2019.\n",
"\n",
"[20]:Qasim S R, Mahmood H, Shafait F. Rethinking table recognition using graph neural networks[C]//2019 International Conference on Document Analysis and Recognition (ICDAR). IEEE, 2019: 142-147.\n",
"\n",
"[21]:Xue W, Yu B, Wang W, et al. TGRNet: A Table Graph Reconstruction Network for Table Structure Recognition[J]. arXiv preprint arXiv:2106.10598, 2021.\n",
"\n",
"[22]:Ye J, Qi X, He Y, et al. PingAn-VCGroup's Solution for ICDAR 2021 Competition on Scientific Literature Parsing Task B: Table Recognition to HTML[J]. arXiv preprint arXiv:2105.01848, 2021.\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"## 3. Document VQA\n",
"\n",
"老板派任务:开发一个身份证识别系统\n",
"\n",
"<center class=\"img\">\n",
"<img src=\"https://ai-studio-static-online.cdn.bcebos.com/63bbe893465e4f98b3aec80a042758b520d43e1a993a47e39bce1123c2d29b3f\" width=\"1600\"/></center>\n",
"\n",
"\n",
"> 如何选择方案 \n",
"> 1. 文字检测之后用规则来进行信息提取\n",
"> 2. 文字检测之后用规模型来进行信息提取\n",
"> 3. 外包出去\n",
"\n",
"\n",
"### 3.1 背景介绍\n",
"在VQA(Visual Question Answering)任务中,主要针对图像内容进行提问和回答,但是对于文本图像来说,关注的内容是图像中的文字信息,因此这类方法可以分为自然场景的Text-VQA和扫描文档场景的DocVQA,三者的关系如下图所示。\n",
"\n",
"<center class=\"img\">\n",
"<img src=\"https://ai-studio-static-online.cdn.bcebos.com/a91cfd5152284152b020ca8a396db7a21fd982e3661540d5998cc19c17d84861\" width=\"600\"/></center>\n",
"<center>图 16: VQA层级</center>\n",
"\n",
"VQA,Text-VQA和DocVQA的示例图如下图所示。\n",
"\n",
"|任务类型|VQA | Text-VQA | DocVQA| \n",
"|---|---|---|---|\n",
"|任务描述|针对**图片内容**提出问题|针对**图片上的文字内容**提出问题|针对**文档图像的文字内容**提出问题|\n",
"|示例图片|![vqa](https://ai-studio-static-online.cdn.bcebos.com/fc21b593276247249591231b3373608151ed8ae7787f4d6ba39e8779fdd12201)|![textvqa](https://ai-studio-static-online.cdn.bcebos.com/cd2404edf3bf430b89eb9b2509714499380cd02e4aa74ec39ca6d7aebcf9a559)|![docvqa](https://ai-studio-static-online.cdn.bcebos.com/0eec30a6f91b4f949c56729b856f7ff600d06abee0774642801c070303edfe83)|\n",
"\n",
"DocVQA由于其更加贴近实际应用场景,涌现出了大批学术界和工业界的工作。在常用的场景中,DocVQA里提问的问题都是固定的,比如身份证场景下的问题一般为\n",
"1. 公民身份号码是什么?\n",
"2. 姓名是什么?\n",
"3. 名族是什么?\n",
"\n",
"<center class=\"img\">\n",
"<img src=\"https://ai-studio-static-online.cdn.bcebos.com/2d2b86468daf47c98be01f44b8d6efa64bc09e43cd764298afb127f19b07aede\" width=\"800\"/></center>\n",
"<center>图 17: 身份证示例</center>\n",
"\n",
"\n",
"基于这样的先验知识,DocVQA的 研究开始偏向Key Information Extraction(KIE)任务,本次我们也主要讨论KIE相关的研究,KIE任务主要从图像中提取所需要的关键信息,如从身份证中提取出姓名和公民身份号码信息。\n",
"\n",
"KIE通常分为两个子任务进行研究\n",
"1. SER: 语义实体识别 (Semantic Entity Recognition),对每一个检测到的文本进行分类,如将其分为姓名,身份证。如下图中的黑色框和红色框。\n",
"2. RE: 关系抽取 (Relation Extraction),对每一个检测到的文本进行分类,如将其分为问题和的答案。然后对每一个问题找到对应的答案。如下图中的红色框和黑色框分别代表问题和答案,黄色线代表问题和答案之间的对应关系。\n",
"\n",
"\n",
"<center class=\"img\">\n",
"<img src=\"https://ai-studio-static-online.cdn.bcebos.com/899470ba601349fbbc402a4c83e6cdaee08aaa10b5004977b1f684f346ebe31f\" width=\"800\"/></center>\n",
"<center>图 18: SER,RE任务示例</center>\n",
"\n",
"一般的KIE方法基于命名实体识别(Named Entity Recognition,NER)[4]来研究,但是这类方法只利用了图像中的文本信息,缺少对视觉和结构信息的使用,因此精度不高。在此基础上,近几年的方法都开始将视觉和结构信息与文本信息融合到一起,按照对多模态信息进行融合时所采用的的原理可以将这些方法分为下面三种:\n",
"\n",
"1. 基于Grid的方法\n",
"1. 基于Token的方法\n",
"2. 基于GCN的方法\n",
"3. 基于End to End 的方法\n",
"\n",
"一些代表性论文被划分为上述三个类别中,具体如下表所示:\n",
"| 类别 | 思路 | 主要论文 |\n",
"| ---------------- | ---- | -------- |\n",
"| 基于Grid的方法 |在图像上多模态信息的融合(文本,布局,图像)| [Chargrid](https://arxiv.org/pdf/1809.08799) |\n",
"| 基于Token的方法 |利用Bert这类方法进行多模态信息的融合|[LayoutLM](https://arxiv.org/pdf/1912.13318), [LayoutLMv2](https://arxiv.org/pdf/2012.14740), [StrucText](https://arxiv.org/pdf/2108.02923), |\n",
"| 基于GCN的方法 |利用图网络结构进行多模态信息的融合 |[GCN](https://arxiv.org/pdf/1903.11279), [PICK](https://arxiv.org/pdf/2004.07464), [SDMG-R](https://arxiv.org/pdf/2103.14470),[SERA](https://arxiv.org/pdf/2110.09915) |\n",
"| 基于End to End的方法 |将OCR和关键信息提取统一到一个网络 |[Trie](https://arxiv.org/pdf/2005.13118) |\n",
"\n",
"### 3.2 基于Grid的方法\n",
"\n",
"基于Grid的方法在图像层面进行多模态信息的融合。Chargrid[5]首先对图像进行字符级的文字检测和识别,然后通过将类别的one-hot编码填充到对应的字符区域(下图中右图的非黑色部分)内来完成对网络输入的构建,输入最后通过encoder-decoder结构的CNN网络来进行关键信息的坐标检测和类别分类。\n",
"\n",
"<center class=\"img\">\n",
"<img src=\"https://ai-studio-static-online.cdn.bcebos.com/f248841769ec4312a9015b4befda37bf29db66226431420ca1faad517783875e\" width=\"800\"/></center>\n",
"<center>图 19: Chargrid数据示例</center>\n",
"\n",
"\n",
"<center class=\"img\">\n",
"<img src=\"https://ai-studio-static-online.cdn.bcebos.com/0682e52e275b4187a0e74f54961a50091fd3a0cdff734e17bedcbc993f6e29f9\" width=\"800\"/></center>\n",
"<center>图 20: Chargrid网络</center>\n",
"\n",
"\n",
"相比于传统的仅基于文本的方法,该方法能够同时利用文本信息和结构信息,因此能够取得一定的精度提升,但是该方法对文本和结构信息的融合只是做了简单的嵌入,并没有很好的将二者进行融合\n",
"\n",
"### 3.3 基于Token的方法\n",
"LayoutLM[6]将2D位置信息和文本信息一起编码到BERT模型中,并且借鉴NLP中Bert的预训练思想,在大规模的数据集上进行预训练,在下游任务中,LayoutLM还加入了图像信息来进一步提升模型性能。LayoutLM虽然将文本,位置和图像信息做了融合,但是图像信息是在下游任务的训练中进行融合,这样对三种信息的多模态融合并不充分。LayoutLMv2[7]在LayoutLM的基础上,通过transformers在预训练阶段将图像信息和文本,layout信息进行融合,还在Transformer中加入空间感知自注意力机制辅助模型更好地融合视觉和文本特征。LayoutLMv2虽然在预训练阶段对文本,位置和图像信息做了融合,但是由于预训练任务的限制,模型学到的视觉特征不够精细。StrucTexT[8]在以往多模态方法的基础上,在预训练任务提出Sentence Length Prediction (SLP) 和Paired Boxes Direction (PBD)两个新任务来帮助网络学习精细的视觉特征,其中SLP任务让模型学习文本段的长度,PDB任务让模型学习Box方向之间的匹配关系。通过这两个新的预训练任务,能够加速文本、视觉和布局信息之间的深度跨模态融合。\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/17a26ade09ee4311b90e49a1c61d88a72a82104478434f9dabd99c27a65d789b) | ![](https://ai-studio-static-online.cdn.bcebos.com/d75addba67ef4b06a02ae40145e609d3692d613ff9b74cec85123335b465b3cc))\n",
"---|---\n",
"图 21:transformer算法流程图|图 22:LayoutLMv2算法流程图\n",
"\n",
"### 3.4 基于GCN的方法\n",
"\n",
"现有的基于GCN的方法[10]虽然利用了文字和结构信息,但是没有对图像信息进行很好的利用。PICK[11]在GCN网络中加入了图像信息并且提出graph learning module来自动学习edge的类型。SDMG-R [12]将图像编码为双模态图,图的节点为文字区域的视觉和文本信息,边表示相邻文本直接的空间关系,通过迭代地沿边传播信息和推理图节点类别,SDMG-R解决了现有的方法对没见过的模板无能为力的问题。\n",
"\n",
"\n",
"PICK流程图如下图所示:\n",
"\n",
"<center class=\"img\">\n",
"<img src=\"https://ai-studio-static-online.cdn.bcebos.com/d3282959e6b2448c89b762b3b9bbf6197a0364b101214a1f83cf01a28623c01c\" width=\"800\"/></center>\n",
"<center>图 23:PICK算法流程图</center>\n",
"\n",
"SERA[10]将依存句法分析里的biaffine parser引入到文档关系抽取中,并且使用GCN来融合文本和视觉信息。\n",
"\n",
"<center class=\"img\">\n",
"<img src=\"https://ai-studio-static-online.cdn.bcebos.com/a97b7647968a4fa59e7b14b384dd7ffe812f158db8f741459b6e6bb0e8b657c7\" width=\"800\"/></center>\n",
"<center>图 24:SERA算法流程图</center>\n",
"\n",
"### 3.5 基于End to End 的方法\n",
"\n",
"现有的方法将KIE分为两个独立的任务:文本读取和信息提取,然而他们主要关注于改进信息提取任务,而忽略了文本读取和信息提取是相互关联的,因此,Trie[9]提出了一个统一的端到端网络,可以同时学习这两个任务,并且在学习过程中相互加强。\n",
"\n",
"<center class=\"img\">\n",
"<img src=\"https://ai-studio-static-online.cdn.bcebos.com/6e4a3b0f65254f6b9d40cea0875854d4f47e1dca6b1e408cad435b3629600608\" width=\"1300\"/></center>\n",
"<center>图 25: Trie算法流程图</center>\n",
"\n",
"\n",
"### 3.6 数据集\n",
"用于KIE的数据集主要有下面两个:\n",
"1. SROIE: SROIE数据集[2]的任务3旨在从扫描收据中提取四个预定义的信息:公司、日期、地址或总数。数据集中有626个样本用于训练,347个样本用于测试。\n",
"2. FUNSD: FUNSD数据集[3]是一个用于从扫描文档中提取表单信息的数据集。它包含199个标注好的真实扫描表单。199个样本中149个用于训练,50个用于测试。FUNSD数据集为每个单词分配一个语义实体标签:问题、答案、标题或其他。\n",
"3. XFUN: XFUN数据集是微软提出的一个多语言数据集,包含7种语言,每种语言包含149张训练集,50张测试集。\n",
"\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/dfdf530d79504761919c1f093f9a86dac21e6db3304c4892998ea1823f3187c6) | ![](https://ai-studio-static-online.cdn.bcebos.com/3b2a9f9476be4e7f892b73bd7096ce8d88fe98a70bae47e6ab4c5fcc87e83861))\n",
"---|---\n",
"图 26: sroie示例图|图 27: xfun示例图\n",
"\n",
"参考文献:\n",
"\n",
"[1]:Mathew M, Karatzas D, Jawahar C V. Docvqa: A dataset for vqa on document images[C]//Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision. 2021: 2200-2209.\n",
"\n",
"[2]:Huang Z, Chen K, He J, et al. Icdar2019 competition on scanned receipt ocr and information extraction[C]//2019 International Conference on Document Analysis and Recognition (ICDAR). IEEE, 2019: 1516-1520.\n",
"\n",
"[3]:Jaume G, Ekenel H K, Thiran J P. Funsd: A dataset for form understanding in noisy scanned documents[C]//2019 International Conference on Document Analysis and Recognition Workshops (ICDARW). IEEE, 2019, 2: 1-6.\n",
"\n",
"[4]:Lample G, Ballesteros M, Subramanian S, et al. Neural architectures for named entity recognition[J]. arXiv preprint arXiv:1603.01360, 2016.\n",
"\n",
"[5]:Katti A R, Reisswig C, Guder C, et al. Chargrid: Towards understanding 2d documents[J]. arXiv preprint arXiv:1809.08799, 2018.\n",
"\n",
"[6]:Xu Y, Li M, Cui L, et al. Layoutlm: Pre-training of text and layout for document image understanding[C]//Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining. 2020: 1192-1200.\n",
"\n",
"[7]:Xu Y, Xu Y, Lv T, et al. LayoutLMv2: Multi-modal pre-training for visually-rich document understanding[J]. arXiv preprint arXiv:2012.14740, 2020.\n",
"\n",
"[8]:Li Y, Qian Y, Yu Y, et al. StrucTexT: Structured Text Understanding with Multi-Modal Transformers[C]//Proceedings of the 29th ACM International Conference on Multimedia. 2021: 1912-1920.\n",
"\n",
"[9]:Zhang P, Xu Y, Cheng Z, et al. Trie: End-to-end text reading and information extraction for document understanding[C]//Proceedings of the 28th ACM International Conference on Multimedia. 2020: 1413-1422.\n",
"\n",
"[10]:Liu X, Gao F, Zhang Q, et al. Graph convolution for multimodal information extraction from visually rich documents[J]. arXiv preprint arXiv:1903.11279, 2019.\n",
"\n",
"[11]:Yu W, Lu N, Qi X, et al. Pick: Processing key information extraction from documents using improved graph learning-convolutional networks[C]//2020 25th International Conference on Pattern Recognition (ICPR). IEEE, 2021: 4363-4370.\n",
"\n",
"[12]:Sun H, Kuang Z, Yue X, et al. Spatial Dual-Modality Graph Reasoning for Key Information Extraction[J]. arXiv preprint arXiv:2103.14470, 2021."
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"## 4. 总结\n",
"本节我们主要介绍了文档分析技术相关的三个子模块的理论知识:版面分析、表格识别和信息提取。下面我们会基于PaddleOCR框架对这表格识别和DOC-VQA进行实战教程的讲解。"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "py35-paddle1.2.0"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
......@@ -344,7 +344,7 @@ class KieLabelEncode(object):
max_num = 300
temp_bboxes = np.zeros([max_num, 4])
h, _ = bboxes.shape
temp_bboxes[:h, :h] = bboxes
temp_bboxes[:h, :] = bboxes
temp_relations = np.zeros([max_num, max_num, 5])
temp_relations[:h, :h, :] = relations
......
......@@ -23,7 +23,6 @@ import sys
import six
import cv2
import numpy as np
import fasttext
class DecodeImage(object):
......@@ -136,6 +135,7 @@ class ToCHWImage(object):
class Fasttext(object):
def __init__(self, path="None", **kwargs):
import fasttext
self.fast_model = fasttext.load_model(path)
def __call__(self, data):
......
......@@ -111,13 +111,16 @@ def load_pretrained_params(model, path):
params = paddle.load(path + '.pdparams')
state_dict = model.state_dict()
new_state_dict = {}
for k1, k2 in zip(state_dict.keys(), params.keys()):
if list(state_dict[k1].shape) == list(params[k2].shape):
new_state_dict[k1] = params[k2]
for k1 in params.keys():
if k1 not in state_dict.keys():
logger.warning("The pretrained params {} not in model".format(k1))
else:
if list(state_dict[k1].shape) == list(params[k1].shape):
new_state_dict[k1] = params[k1]
else:
logger.warning(
"The shape of model params {} {} not matched with loaded params {} {} !".
format(k1, state_dict[k1].shape, k2, params[k2].shape))
format(k1, state_dict[k1].shape, k1, params[k1].shape))
model.set_state_dict(new_state_dict)
logger.info("load pretrain successful from {}".format(path))
return model
......
......@@ -96,10 +96,7 @@ git clone https://gitee.com/paddlepaddle/PaddleOCR
- **(3)安装PaddleNLP**
```bash
# 需要使用PaddleNLP最新的代码版本进行安装
git clone https://github.com/PaddlePaddle/PaddleNLP -b develop
cd PaddleNLP
pip3 install -e .
pip3 install "paddlenlp>=2.2.1"
```
......
......@@ -3,7 +3,7 @@ model_name:PPOCRv2_ocr_det_kl
python:python3.7
Global.pretrained_model:null
Global.save_inference_dir:null
infer_model:./inference/ch_PP-OCRv2_det_infer/
infer_model:./inference/ch_PP-OCRv2_det_infer
infer_export:deploy/slim/quantization/quant_kl.py -c configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml -o
infer_quant:True
inference:tools/infer/predict_det.py
......
......@@ -13,7 +13,7 @@ inference:tools/infer/predict_rec.py
--rec_batch_num:1
--use_tensorrt:False|True
--precision:int8
--det_model_dir:
--rec_model_dir:
--image_dir:./inference/rec_inference
null:null
--benchmark:True
......
......@@ -43,7 +43,7 @@ inference:tools/infer/predict_det.py
--cpu_threads:1|6
--rec_batch_num:1
--use_tensorrt:False
--precision:fp32|fp16|int8
--precision:fp32|int8
--det_model_dir:
--image_dir:./inference/ch_det_data_50/all-sum-510/
null:null
......
......@@ -43,7 +43,7 @@ inference:tools/infer/predict_det.py
--cpu_threads:1|6
--rec_batch_num:1
--use_tensorrt:False
--precision:fp32|fp16|int8
--precision:fp32|int8
--det_model_dir:
--image_dir:./inference/ch_det_data_50/all-sum-510/
null:null
......
......@@ -179,7 +179,7 @@ elif [ ${MODE} = "whole_infer" ];then
cd ./inference/ && tar xf rec_r34_vd_tps_bilstm_ctc_v2.0_train.tar && cd ../
fi
if [ ${model_name} == "ch_ppocr_server_v2.0_rec" ]; then
wget -nc -P ./inference/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/ch_ppocr_server_v2.0_rec_train.tar --no-check-certificate
wget -nc -P ./inference/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_train.tar --no-check-certificate
cd ./inference/ && tar xf ch_ppocr_server_v2.0_rec_train.tar && cd ../
fi
if [ ${model_name} == "ch_ppocr_mobile_v2.0_rec" ]; then
......@@ -239,18 +239,23 @@ fi
if [ ${MODE} = "klquant_whole_infer" ]; then
wget -nc -P ./train_data/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/icdar2015_lite.tar --no-check-certificate
cd ./train_data/ && tar xf icdar2015_lite.tar
ln -s ./icdar2015_lite ./icdar2015 && cd ../
cd ./train_data/ && tar xf icdar2015_lite.tar && rm -rf ./icdar2015 && ln -s ./icdar2015_lite ./icdar2015 && cd ../
if [ ${model_name} = "ch_ppocr_mobile_v2.0_det_KL" ]; then
wget -nc -P ./inference https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar --no-check-certificate
wget -nc -P ./inference https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/ch_det_data_50.tar --no-check-certificate
cd ./inference && tar xf ch_ppocr_mobile_v2.0_det_infer.tar && tar xf ch_det_data_50.tar && cd ../
fi
if [ ${model_name} = "ch_PPOCRv2_det" ]; then
eval_model_name="ch_PP-OCRv2_det_infer"
if [ ${model_name} = "PPOCRv2_ocr_rec_kl" ]; then
wget -nc -P ./inference https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer.tar --no-check-certificate
wget -nc -P ./inference/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/rec_inference.tar --no-check-certificate
wget -nc -P ./train_data/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/ic15_data.tar --no-check-certificate
cd ./train_data/ && tar xf ic15_data.tar && cd ../
cd ./inference && tar xf rec_inference.tar && tar xf ch_PP-OCRv2_rec_infer.tar && cd ../
fi
if [ ${model_name} = "PPOCRv2_ocr_det_kl" ]; then
wget -nc -P ./inference https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/ch_det_data_50.tar --no-check-certificate
wget -nc -P ./inference/ https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar --no-check-certificate
cd ./inference && tar xf ${eval_model_name}.tar && tar xf ch_det_data_50.tar && cd ../
wget -nc -P ./inference https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar --no-check-certificate
cd ./inference && tar xf ch_PP-OCRv2_det_infer.tar && tar xf ch_det_data_50.tar && cd ../
fi
if [ ${model_name} = "ch_ppocr_mobile_v2.0_rec_KL" ]; then
wget -nc -P ./inference/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar --no-check-certificate
......
......@@ -183,7 +183,7 @@ function func_inference(){
if [[ ${precision} =~ "fp16" || ${precision} =~ "int8" ]] && [ ${use_trt} = "False" ]; then
continue
fi
if [[ ${use_trt} = "False" || ${precision} =~ "int8" ]] && [ ${_flag_quant} = "True" ]; then
if [[ ${use_trt} = "False" && ${precision} =~ "int8" ]] && [ ${_flag_quant} = "True" ]; then
continue
fi
for batch_size in ${batch_size_list[*]}; do
......@@ -227,7 +227,12 @@ if [ ${MODE} = "whole_infer" ] || [ ${MODE} = "klquant_whole_infer" ]; then
for infer_model in ${infer_model_dir_list[*]}; do
# run export
if [ ${infer_run_exports[Count]} != "null" ];then
save_infer_dir=$(dirname $infer_model)
if [ ${MODE} = "klquant_whole_infer" ]; then
save_infer_dir="${infer_model}_klquant"
fi
if [ ${MODE} = "whole_infer" ]; then
save_infer_dir="${infer_model}"
fi
set_export_weight=$(func_set_params "${export_weight}" "${infer_model}")
set_save_infer_key=$(func_set_params "${save_infer_key}" "${save_infer_dir}")
export_cmd="${python} ${infer_run_exports[Count]} ${set_export_weight} ${set_save_infer_key}"
......@@ -259,7 +264,6 @@ else
env=""
elif [ ${#gpu} -le 1 ];then
env="export CUDA_VISIBLE_DEVICES=${gpu}"
eval ${env}
elif [ ${#gpu} -le 15 ];then
IFS=","
array=(${gpu})
......@@ -280,6 +284,7 @@ else
set_amp_config=" "
fi
for trainer in ${trainer_list[*]}; do
eval ${env}
flag_quant=False
if [ ${trainer} = ${pact_key} ]; then
run_train=${pact_trainer}
......@@ -332,7 +337,6 @@ else
cmd="${python} -m paddle.distributed.launch --ips=${ips} --gpus=${gpu} ${run_train} ${set_use_gpu} ${set_save_model} ${set_pretrain} ${set_epoch} ${set_autocast} ${set_batchsize} ${set_train_params1} ${set_amp_config}"
fi
# run train
eval "unset CUDA_VISIBLE_DEVICES"
eval $cmd
status_check $? "${cmd}" "${status_log}"
......
......@@ -101,16 +101,21 @@ class TextDetector(object):
else:
logger.info("unknown det_algorithm:{}".format(self.det_algorithm))
sys.exit(0)
self.preprocess_op = create_operators(pre_process_list)
self.postprocess_op = build_post_process(postprocess_params)
self.predictor, self.input_tensor, self.output_tensors, self.config = utility.create_predictor(
args, 'det', logger)
if self.use_onnx:
img_h, img_w = self.input_tensor.shape[2:]
if img_h is not None and img_w is not None and img_h > 0 and img_w > 0:
pre_process_list[0] = {
'DetResizeForTest': {
'image_shape': [640, 640]
'image_shape': [img_h, img_w]
}
}
self.preprocess_op = create_operators(pre_process_list)
self.postprocess_op = build_post_process(postprocess_params)
self.predictor, self.input_tensor, self.output_tensors, self.config = utility.create_predictor(
args, 'det', logger)
if args.benchmark:
import auto_log
......
......@@ -109,7 +109,10 @@ class TextRecognizer(object):
assert imgC == img.shape[2]
imgW = int((32 * max_wh_ratio))
if self.use_onnx:
imgW = 100
w = self.input_tensor.shape[3:][0]
if w is not None and w > 0:
imgW = w
h, w = img.shape[:2]
ratio = w / float(h)
if math.ceil(imgH * ratio) > imgW:
......
......@@ -15,6 +15,7 @@
import argparse
import os
import sys
import platform
import cv2
import numpy as np
import paddle
......@@ -313,6 +314,10 @@ def create_predictor(args, mode, logger):
def get_infer_gpuid():
sysstr = platform.system()
if sysstr == "Windows":
return 0
if not paddle.fluid.core.is_compiled_with_rocm():
cmd = "env | grep CUDA_VISIBLE_DEVICES"
else:
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册