提交 a0252354 编写于 作者: L LDOUBLEV

Merge branch 'dygraph' of https://github.com/PaddlePaddle/PaddleOCR into dygraph

...@@ -1149,6 +1149,9 @@ class MainWindow(QMainWindow): ...@@ -1149,6 +1149,9 @@ class MainWindow(QMainWindow):
for box in self.result_dic: for box in self.result_dic:
trans_dic = {"label": box[1][0], "points": box[0], "difficult": False} trans_dic = {"label": box[1][0], "points": box[0], "difficult": False}
if self.kie_mode: if self.kie_mode:
if len(box) == 3:
trans_dic.update({"key_cls": box[2]})
else:
trans_dic.update({"key_cls": "None"}) trans_dic.update({"key_cls": "None"})
if trans_dic["label"] == "" and mode == 'Auto': if trans_dic["label"] == "" and mode == 'Auto':
continue continue
...@@ -2047,6 +2050,7 @@ class MainWindow(QMainWindow): ...@@ -2047,6 +2050,7 @@ class MainWindow(QMainWindow):
rec_flag = 0 rec_flag = 0
for shape in self.canvas.shapes: for shape in self.canvas.shapes:
box = [[int(p.x()), int(p.y())] for p in shape.points] box = [[int(p.x()), int(p.y())] for p in shape.points]
kie_cls = shape.key_cls
if len(box) > 4: if len(box) > 4:
box = self.gen_quad_from_poly(np.array(box)) box = self.gen_quad_from_poly(np.array(box))
...@@ -2062,15 +2066,25 @@ class MainWindow(QMainWindow): ...@@ -2062,15 +2066,25 @@ class MainWindow(QMainWindow):
if shape.line_color == DEFAULT_LOCK_COLOR: if shape.line_color == DEFAULT_LOCK_COLOR:
shape.label = result[0][0] shape.label = result[0][0]
result.insert(0, box) result.insert(0, box)
if self.kie_mode:
result.append(kie_cls)
self.result_dic_locked.append(result) self.result_dic_locked.append(result)
else: else:
result.insert(0, box) result.insert(0, box)
if self.kie_mode:
result.append(kie_cls)
self.result_dic.append(result) self.result_dic.append(result)
else: else:
print('Can not recognise the box') print('Can not recognise the box')
if shape.line_color == DEFAULT_LOCK_COLOR: if shape.line_color == DEFAULT_LOCK_COLOR:
shape.label = result[0][0] shape.label = result[0][0]
if self.kie_mode:
self.result_dic_locked.append([box, (self.noLabelText, 0), kie_cls])
else:
self.result_dic_locked.append([box, (self.noLabelText, 0)]) self.result_dic_locked.append([box, (self.noLabelText, 0)])
else:
if self.kie_mode:
self.result_dic.append([box, (self.noLabelText, 0), kie_cls])
else: else:
self.result_dic.append([box, (self.noLabelText, 0)]) self.result_dic.append([box, (self.noLabelText, 0)])
try: try:
...@@ -2322,13 +2336,6 @@ class MainWindow(QMainWindow): ...@@ -2322,13 +2336,6 @@ class MainWindow(QMainWindow):
else: else:
labeldict[file] = [] labeldict[file] = []
# if len(labeldict) != len(csv_paths):
# msg = 'ERROR, box label and excel label are not in the same number\n' + \
# 'box label: ' + str(len(labeldict)) + '\n' + \
# 'excel label: ' + str(len(csv_paths)) + '\n' + \
# 'Please check the label.txt and tableRec_excel_output\n'
# QMessageBox.information(self, "Information", msg)
# return
train_split, val_split, test_split = partitionDialog.getDataPartition() train_split, val_split, test_split = partitionDialog.getDataPartition()
# check validate # check validate
if train_split + val_split + test_split > 100: if train_split + val_split + test_split > 100:
...@@ -2351,14 +2358,8 @@ class MainWindow(QMainWindow): ...@@ -2351,14 +2358,8 @@ class MainWindow(QMainWindow):
filename, _ = os.path.splitext(os.path.basename(image_path)) filename, _ = os.path.splitext(os.path.basename(image_path))
csv_path = os.path.join(TableRec_excel_dir, filename + '.xlsx') csv_path = os.path.join(TableRec_excel_dir, filename + '.xlsx')
if not os.path.exists(csv_path): if not os.path.exists(csv_path):
msg = 'ERROR, Can not find ' + csv_path continue
QMessageBox.information(self, "Information", msg)
return
# read xlsx file, convert to HTML
# xd = pd.ExcelFile(csv_path)
# df = xd.parse()
# structure = df.to_html(index = False)
excel = xlrd.open_workbook(csv_path) excel = xlrd.open_workbook(csv_path)
sheet0 = excel.sheet_by_index(0) # only sheet 0 sheet0 = excel.sheet_by_index(0) # only sheet 0
merged_cells = sheet0.merged_cells # (0,1,1,3) start row, end row, start col, end col merged_cells = sheet0.merged_cells # (0,1,1,3) start row, end row, start col, end col
...@@ -2370,7 +2371,6 @@ class MainWindow(QMainWindow): ...@@ -2370,7 +2371,6 @@ class MainWindow(QMainWindow):
token_list = convert_token(html_list) token_list = convert_token(html_list)
# load box annotations # load box annotations
cells = [] cells = []
for anno in labeldict[image_path]: for anno in labeldict[image_path]:
......
此差异已折叠。
...@@ -221,10 +221,10 @@ def convert_token(html_list): ...@@ -221,10 +221,10 @@ def convert_token(html_list):
token_list.append("<td") token_list.append("<td")
if 'colspan' in col: if 'colspan' in col:
_, n = col.split('colspan=') _, n = col.split('colspan=')
token_list.append(" colspan=\"{}\"".format(n)) token_list.append(" colspan=\"{}\"".format(n[0]))
if 'rowspan' in col: if 'rowspan' in col:
_, n = col.split('rowspan=') _, n = col.split('rowspan=')
token_list.append(" rowspan=\"{}\"".format(n)) token_list.append(" rowspan=\"{}\"".format(n[0]))
token_list.extend([">", "</td>"]) token_list.extend([">", "</td>"])
token_list.append("</tr>") token_list.append("</tr>")
token_list.append("</tbody>") token_list.append("</tbody>")
......
...@@ -112,4 +112,4 @@ keyDialogTip=Enter object label ...@@ -112,4 +112,4 @@ keyDialogTip=Enter object label
keyChange=Change Box Key keyChange=Change Box Key
TableRecognition=Table Recognition TableRecognition=Table Recognition
cellreRecognition=Cell Re-Recognition cellreRecognition=Cell Re-Recognition
exportJSON=export JSON(PubTabNet) exportJSON=Export Excel Label(PubTabNet)
...@@ -84,7 +84,7 @@ mhelp=帮助 ...@@ -84,7 +84,7 @@ mhelp=帮助
iconList=缩略图 iconList=缩略图
detectionBoxposition=检测框位置 detectionBoxposition=检测框位置
recognitionResult=识别结果 recognitionResult=识别结果
creatPolygon=边形标注 creatPolygon=标注
drawSquares=正方形标注 drawSquares=正方形标注
rotateLeft=图片左旋转90度 rotateLeft=图片左旋转90度
rotateRight=图片右旋转90度 rotateRight=图片右旋转90度
......
...@@ -106,6 +106,7 @@ PaddleOCR support a variety of cutting-edge algorithms related to OCR, and devel ...@@ -106,6 +106,7 @@ PaddleOCR support a variety of cutting-edge algorithms related to OCR, and devel
- [Serving](./deploy/pdserving/README.md) - [Serving](./deploy/pdserving/README.md)
- [Mobile](./deploy/lite/readme.md) - [Mobile](./deploy/lite/readme.md)
- [Paddle2ONNX](./deploy/paddle2onnx/readme.md) - [Paddle2ONNX](./deploy/paddle2onnx/readme.md)
- [PaddleCloud](./deploy/paddlecloud/README.md)
- [Benchmark](./doc/doc_en/benchmark_en.md) - [Benchmark](./doc/doc_en/benchmark_en.md)
- [PP-Structure 🔥](./ppstructure/README.md) - [PP-Structure 🔥](./ppstructure/README.md)
- [Quick Start](./ppstructure/docs/quickstart_en.md) - [Quick Start](./ppstructure/docs/quickstart_en.md)
......
...@@ -27,14 +27,28 @@ PaddleOCR旨在打造一套丰富、领先、且实用的OCR工具库,助力 ...@@ -27,14 +27,28 @@ PaddleOCR旨在打造一套丰富、领先、且实用的OCR工具库,助力
## 近期更新 ## 近期更新
- 2022.5.9 发布PaddleOCR v2.5。发布内容包括: - **🔥2022.5.11~13 每晚8:30【超强OCR技术详解与产业应用实战】三日直播课**
- [PP-OCRv3](./doc/doc_ch/ppocr_introduction.md#pp-ocrv3),速度可比情况下,中文场景效果相比于PP-OCRv2再提升5%,英文场景提升11%,80语种多语言模型平均识别准确率提升5%以上; - 11日:开源最强OCR系统PP-OCRv3揭秘
- 半自动标注工具[PPOCRLabelv2](./PPOCRLabel):新增表格文字图像、图像关键信息抽取任务和不规则文字图像的标注功能; - 12日:云边端全覆盖的PP-OCRv3训练部署实战
- OCR产业落地工具集:打通22种训练部署软硬件环境与方式,覆盖企业90%的训练部署环境需求 - 13日:OCR产业应用全流程拆解与实战
- 交互式OCR开源电子书[《动手学OCR》](./doc/doc_ch/ocr_book.md),覆盖OCR全栈技术的前沿理论与代码实践,并配套教学视频。
- 2021.12.21 发布PaddleOCR v2.4。OCR算法新增1种文本检测算法(PSENet),3种文本识别算法(NRTR、SEED、SAR);文档结构化算法新增1种关键信息提取算法(SDMGR,[文档](./ppstructure/docs/kie.md)),3种DocVQA算法(LayoutLM、LayoutLMv2,LayoutXLM,[文档](./ppstructure/vqa))。 赶紧扫码报名吧!
- 2021.9.7 发布PaddleOCR v2.3与[PP-OCRv2](./doc/doc_ch/ppocr_introduction.md#pp-ocrv2),CPU推理速度相比于PP-OCR server提升220%;效果相比于PP-OCR mobile 提升7%。 <div align="center">
- 2021.8.3 发布PaddleOCR v2.2,新增文档结构分析[PP-Structure](./ppstructure/README_ch.md)工具包,支持版面分析与表格识别(含Excel导出)。 <img src="https://raw.githubusercontent.com/PaddlePaddle/PaddleOCR/dygraph/doc/joinus.PNG" width = "150" height = "150" />
</div>
- **🔥2022.5.9 发布PaddleOCR [release/2.5](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.5)**
- 发布[PP-OCRv3](./doc/doc_ch/ppocr_introduction.md#pp-ocrv3),速度可比情况下,中文场景效果相比于PP-OCRv2再提升5%,英文场景提升11%,80语种多语言模型平均识别准确率提升5%以上;
- 发布半自动标注工具[PPOCRLabelv2](./PPOCRLabel):新增表格文字图像、图像关键信息抽取任务和不规则文字图像的标注功能;
- 发布OCR产业落地工具集:打通22种训练部署软硬件环境与方式,覆盖企业90%的训练部署环境需求;
- 发布交互式OCR开源电子书[《动手学OCR》](./doc/doc_ch/ocr_book.md),覆盖OCR全栈技术的前沿理论与代码实践,并配套教学视频。
- 2021.12.21 发布PaddleOCR [release/2.4](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.4)
- OCR算法新增1种文本检测算法([PSENet](./doc/doc_ch/algorithm_det_psenet.md)),3种文本识别算法([NRTR](./doc/doc_ch/algorithm_rec_nrtr.md)[SEED](./doc/doc_ch/algorithm_rec_seed.md)[SAR](./doc/doc_ch/algorithm_rec_sar.md));
- 文档结构化算法新增1种关键信息提取算法([SDMGR](./ppstructure/docs/kie.md)),3种[DocVQA](./ppstructure/vqa)算法(LayoutLM、LayoutLMv2,LayoutXLM)。
- 2021.9.7 发布PaddleOCR [release/2.3](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.3)
- 发布[PP-OCRv2](./doc/doc_ch/ppocr_introduction.md#pp-ocrv2),CPU推理速度相比于PP-OCR server提升220%;效果相比于PP-OCR mobile 提升7%。
- 2021.8.3 发布PaddleOCR [release/2.2](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.2)
- 发布文档结构分析[PP-Structure](./ppstructure/README_ch.md)工具包,支持版面分析与表格识别(含Excel导出)。
> [更多](./doc/doc_ch/update.md) > [更多](./doc/doc_ch/update.md)
...@@ -108,6 +122,7 @@ PaddleOCR旨在打造一套丰富、领先、且实用的OCR工具库,助力 ...@@ -108,6 +122,7 @@ PaddleOCR旨在打造一套丰富、领先、且实用的OCR工具库,助力
- [服务化部署](./deploy/pdserving/README_CN.md) - [服务化部署](./deploy/pdserving/README_CN.md)
- [端侧部署](./deploy/lite/readme.md) - [端侧部署](./deploy/lite/readme.md)
- [Paddle2ONNX模型转化与预测](./deploy/paddle2onnx/readme.md) - [Paddle2ONNX模型转化与预测](./deploy/paddle2onnx/readme.md)
- [云上飞桨部署工具](./deploy/paddlecloud/README.md)
- [Benchmark](./doc/doc_ch/benchmark.md) - [Benchmark](./doc/doc_ch/benchmark.md)
- [PP-Structure文档分析🔥](./ppstructure/README_ch.md) - [PP-Structure文档分析🔥](./ppstructure/README_ch.md)
- [快速开始](./ppstructure/docs/quickstart.md) - [快速开始](./ppstructure/docs/quickstart.md)
...@@ -126,11 +141,7 @@ PaddleOCR旨在打造一套丰富、领先、且实用的OCR工具库,助力 ...@@ -126,11 +141,7 @@ PaddleOCR旨在打造一套丰富、领先、且实用的OCR工具库,助力
- [文本识别算法](./doc/doc_ch/algorithm_overview.md#12-%E6%96%87%E6%9C%AC%E8%AF%86%E5%88%AB%E7%AE%97%E6%B3%95) - [文本识别算法](./doc/doc_ch/algorithm_overview.md#12-%E6%96%87%E6%9C%AC%E8%AF%86%E5%88%AB%E7%AE%97%E6%B3%95)
- [端到端算法](./doc/doc_ch/algorithm_overview.md#2-%E6%96%87%E6%9C%AC%E8%AF%86%E5%88%AB%E7%AE%97%E6%B3%95) - [端到端算法](./doc/doc_ch/algorithm_overview.md#2-%E6%96%87%E6%9C%AC%E8%AF%86%E5%88%AB%E7%AE%97%E6%B3%95)
- [使用PaddleOCR架构添加新算法](./doc/doc_ch/add_new_algorithm.md) - [使用PaddleOCR架构添加新算法](./doc/doc_ch/add_new_algorithm.md)
- [场景应用](./doc/doc_ch/application.md) - [场景应用](./applications)
- [金融场景(表单/票据等)]()
- [工业场景(电表度数/车牌等)]()
- [教育场景(手写体/公式等)]()
- [医疗场景(化验单等)]()
- 数据标注与合成 - 数据标注与合成
- [半自动标注工具PPOCRLabel](./PPOCRLabel/README_ch.md) - [半自动标注工具PPOCRLabel](./PPOCRLabel/README_ch.md)
- [数据合成工具Style-Text](./StyleText/README_ch.md) - [数据合成工具Style-Text](./StyleText/README_ch.md)
......
...@@ -16,14 +16,14 @@ ...@@ -16,14 +16,14 @@
<center><img src='https://ai-studio-static-online.cdn.bcebos.com/9bd844b970f94e5ba0bc0c5799bd819ea9b1861bb306471fabc2d628864d418e'></center> <center><img src='https://ai-studio-static-online.cdn.bcebos.com/9bd844b970f94e5ba0bc0c5799bd819ea9b1861bb306471fabc2d628864d418e'></center>
<center>图1 多模态表单识别流程图</center> <center>图1 多模态表单识别流程图</center>
注:欢迎再AIStudio领取免费算力体验线上实训,项目链接: [多模态表单识别](https://aistudio.baidu.com/aistudio/projectdetail/3815918)(配备Tesla V100、A100等高级算力资源) 注:欢迎再AIStudio领取免费算力体验线上实训,项目链接: [多模态表单识别](https://aistudio.baidu.com/aistudio/projectdetail/3884375)(配备Tesla V100、A100等高级算力资源)
# 2 安装说明 # 2 安装说明
下载PaddleOCR源码,项目中已经帮大家打包好的PaddleOCR(已经修改好配置文件),无需下载解压即可,只需安装依赖环境~ 下载PaddleOCR源码,上述AIStudio项目中已经帮大家打包好的PaddleOCR(已经修改好配置文件),无需下载解压即可,只需安装依赖环境~
```python ```python
...@@ -33,7 +33,7 @@ ...@@ -33,7 +33,7 @@
```python ```python
# 如仍需安装or安装更新,可以执行以下步骤 # 如仍需安装or安装更新,可以执行以下步骤
! git clone https://github.com/PaddlePaddle/PaddleOCR.git -b dygraph # ! git clone https://github.com/PaddlePaddle/PaddleOCR.git -b dygraph
# ! git clone https://gitee.com/PaddlePaddle/PaddleOCR # ! git clone https://gitee.com/PaddlePaddle/PaddleOCR
``` ```
...@@ -290,7 +290,7 @@ Eval.dataset.transforms.DetResizeForTest:评估尺寸,添加如下参数 ...@@ -290,7 +290,7 @@ Eval.dataset.transforms.DetResizeForTest:评估尺寸,添加如下参数
<center><img src="https://ai-studio-static-online.cdn.bcebos.com/5a75137c5f924dfeb6956b5818812298cc3dc7992ac84954b4175be9adf83c77"></center> <center><img src="https://ai-studio-static-online.cdn.bcebos.com/5a75137c5f924dfeb6956b5818812298cc3dc7992ac84954b4175be9adf83c77"></center>
<center>图8 文本检测方案2-模型评估</center> <center>图8 文本检测方案2-模型评估</center>
使用训练好的模型进行评估,更新模型路径`Global.checkpoints`,这里为大家提供训练好的模型`./pretrain/ch_db_mv3-student1600-finetune/best_accuracy` 使用训练好的模型进行评估,更新模型路径`Global.checkpoints`,这里为大家提供训练好的模型`./pretrain/ch_db_mv3-student1600-finetune/best_accuracy`[模型下载地址](https://paddleocr.bj.bcebos.com/fanliku/sheet_recognition/ch_db_mv3-student1600-finetune.zip)
```python ```python
...@@ -538,7 +538,7 @@ Train.dataset.ratio_list:动态采样 ...@@ -538,7 +538,7 @@ Train.dataset.ratio_list:动态采样
<center>图16 文本识别方案3-模型评估</center> <center>图16 文本识别方案3-模型评估</center>
使用训练好的模型进行评估,更新模型路径`Global.checkpoints`,这里为大家提供训练好的模型`./pretrain/rec_mobile_pp-OCRv2-student-readldata/best_accuracy` 使用训练好的模型进行评估,更新模型路径`Global.checkpoints`,这里为大家提供训练好的模型`./pretrain/rec_mobile_pp-OCRv2-student-readldata/best_accuracy`[模型下载地址](https://paddleocr.bj.bcebos.com/fanliku/sheet_recognition/rec_mobile_pp-OCRv2-student-realdata.zip)
```python ```python
......
此差异已折叠。
...@@ -77,6 +77,9 @@ int main(int argc, char **argv) { ...@@ -77,6 +77,9 @@ int main(int argc, char **argv) {
for (int i = 0; i < cv_all_img_names.size(); ++i) { for (int i = 0; i < cv_all_img_names.size(); ++i) {
if (FLAGS_benchmark) { if (FLAGS_benchmark) {
cout << cv_all_img_names[i] << '\t'; cout << cv_all_img_names[i] << '\t';
if (FLAGS_rec && FLAGS_det) {
Utility::print_result(ocr_results[i]);
} else if (FLAGS_det) {
for (int n = 0; n < ocr_results[i].size(); n++) { for (int n = 0; n < ocr_results[i].size(); n++) {
for (int m = 0; m < ocr_results[i][n].box.size(); m++) { for (int m = 0; m < ocr_results[i][n].box.size(); m++) {
cout << ocr_results[i][n].box[m][0] << ' ' cout << ocr_results[i][n].box[m][0] << ' '
...@@ -84,6 +87,9 @@ int main(int argc, char **argv) { ...@@ -84,6 +87,9 @@ int main(int argc, char **argv) {
} }
} }
cout << endl; cout << endl;
} else {
Utility::print_result(ocr_results[i]);
}
} else { } else {
cout << cv_all_img_names[i] << "\n"; cout << cv_all_img_names[i] << "\n";
Utility::print_result(ocr_results[i]); Utility::print_result(ocr_results[i]);
......
...@@ -32,40 +32,46 @@ void DBDetector::LoadModel(const std::string &model_dir) { ...@@ -32,40 +32,46 @@ void DBDetector::LoadModel(const std::string &model_dir) {
if (this->precision_ == "int8") { if (this->precision_ == "int8") {
precision = paddle_infer::Config::Precision::kInt8; precision = paddle_infer::Config::Precision::kInt8;
} }
config.EnableTensorRtEngine(1 << 20, 10, 3, precision, false, false); config.EnableTensorRtEngine(1 << 20, 1, 20, precision, false, false);
std::map<std::string, std::vector<int>> min_input_shape = { std::map<std::string, std::vector<int>> min_input_shape = {
{"x", {1, 3, 50, 50}}, {"x", {1, 3, 50, 50}},
{"conv2d_92.tmp_0", {1, 96, 20, 20}}, {"conv2d_92.tmp_0", {1, 120, 20, 20}},
{"conv2d_91.tmp_0", {1, 96, 10, 10}}, {"conv2d_91.tmp_0", {1, 24, 10, 10}},
{"nearest_interp_v2_1.tmp_0", {1, 96, 10, 10}}, {"conv2d_59.tmp_0", {1, 96, 20, 20}},
{"nearest_interp_v2_2.tmp_0", {1, 96, 20, 20}}, {"nearest_interp_v2_1.tmp_0", {1, 256, 10, 10}},
{"nearest_interp_v2_3.tmp_0", {1, 24, 20, 20}}, {"nearest_interp_v2_2.tmp_0", {1, 256, 20, 20}},
{"nearest_interp_v2_4.tmp_0", {1, 24, 20, 20}}, {"conv2d_124.tmp_0", {1, 256, 20, 20}},
{"nearest_interp_v2_5.tmp_0", {1, 24, 20, 20}}, {"nearest_interp_v2_3.tmp_0", {1, 64, 20, 20}},
{"nearest_interp_v2_4.tmp_0", {1, 64, 20, 20}},
{"nearest_interp_v2_5.tmp_0", {1, 64, 20, 20}},
{"elementwise_add_7", {1, 56, 2, 2}}, {"elementwise_add_7", {1, 56, 2, 2}},
{"nearest_interp_v2_0.tmp_0", {1, 96, 2, 2}}}; {"nearest_interp_v2_0.tmp_0", {1, 256, 2, 2}}};
std::map<std::string, std::vector<int>> max_input_shape = { std::map<std::string, std::vector<int>> max_input_shape = {
{"x", {1, 3, this->max_side_len_, this->max_side_len_}}, {"x", {1, 3, this->max_side_len_, this->max_side_len_}},
{"conv2d_92.tmp_0", {1, 96, 400, 400}}, {"conv2d_92.tmp_0", {1, 120, 400, 400}},
{"conv2d_91.tmp_0", {1, 96, 200, 200}}, {"conv2d_91.tmp_0", {1, 24, 200, 200}},
{"nearest_interp_v2_1.tmp_0", {1, 96, 200, 200}}, {"conv2d_59.tmp_0", {1, 96, 400, 400}},
{"nearest_interp_v2_2.tmp_0", {1, 96, 400, 400}}, {"nearest_interp_v2_1.tmp_0", {1, 256, 200, 200}},
{"nearest_interp_v2_3.tmp_0", {1, 24, 400, 400}}, {"nearest_interp_v2_2.tmp_0", {1, 256, 400, 400}},
{"nearest_interp_v2_4.tmp_0", {1, 24, 400, 400}}, {"conv2d_124.tmp_0", {1, 256, 400, 400}},
{"nearest_interp_v2_5.tmp_0", {1, 24, 400, 400}}, {"nearest_interp_v2_3.tmp_0", {1, 64, 400, 400}},
{"nearest_interp_v2_4.tmp_0", {1, 64, 400, 400}},
{"nearest_interp_v2_5.tmp_0", {1, 64, 400, 400}},
{"elementwise_add_7", {1, 56, 400, 400}}, {"elementwise_add_7", {1, 56, 400, 400}},
{"nearest_interp_v2_0.tmp_0", {1, 96, 400, 400}}}; {"nearest_interp_v2_0.tmp_0", {1, 256, 400, 400}}};
std::map<std::string, std::vector<int>> opt_input_shape = { std::map<std::string, std::vector<int>> opt_input_shape = {
{"x", {1, 3, 640, 640}}, {"x", {1, 3, 640, 640}},
{"conv2d_92.tmp_0", {1, 96, 160, 160}}, {"conv2d_92.tmp_0", {1, 120, 160, 160}},
{"conv2d_91.tmp_0", {1, 96, 80, 80}}, {"conv2d_91.tmp_0", {1, 24, 80, 80}},
{"nearest_interp_v2_1.tmp_0", {1, 96, 80, 80}}, {"conv2d_59.tmp_0", {1, 96, 160, 160}},
{"nearest_interp_v2_2.tmp_0", {1, 96, 160, 160}}, {"nearest_interp_v2_1.tmp_0", {1, 256, 80, 80}},
{"nearest_interp_v2_3.tmp_0", {1, 24, 160, 160}}, {"nearest_interp_v2_2.tmp_0", {1, 256, 160, 160}},
{"nearest_interp_v2_4.tmp_0", {1, 24, 160, 160}}, {"conv2d_124.tmp_0", {1, 256, 160, 160}},
{"nearest_interp_v2_5.tmp_0", {1, 24, 160, 160}}, {"nearest_interp_v2_3.tmp_0", {1, 64, 160, 160}},
{"nearest_interp_v2_4.tmp_0", {1, 64, 160, 160}},
{"nearest_interp_v2_5.tmp_0", {1, 64, 160, 160}},
{"elementwise_add_7", {1, 56, 40, 40}}, {"elementwise_add_7", {1, 56, 40, 40}},
{"nearest_interp_v2_0.tmp_0", {1, 96, 40, 40}}}; {"nearest_interp_v2_0.tmp_0", {1, 256, 40, 40}}};
config.SetTRTDynamicShapeInfo(min_input_shape, max_input_shape, config.SetTRTDynamicShapeInfo(min_input_shape, max_input_shape,
opt_input_shape); opt_input_shape);
......
...@@ -143,15 +143,17 @@ void CRNNRecognizer::LoadModel(const std::string &model_dir) { ...@@ -143,15 +143,17 @@ void CRNNRecognizer::LoadModel(const std::string &model_dir) {
if (this->precision_ == "int8") { if (this->precision_ == "int8") {
precision = paddle_infer::Config::Precision::kInt8; precision = paddle_infer::Config::Precision::kInt8;
} }
config.EnableTensorRtEngine(1 << 20, 10, 3, precision, false, false); config.EnableTensorRtEngine(1 << 20, 10, 15, precision, false, false);
int imgH = this->rec_image_shape_[1]; int imgH = this->rec_image_shape_[1];
int imgW = this->rec_image_shape_[2]; int imgW = this->rec_image_shape_[2];
std::map<std::string, std::vector<int>> min_input_shape = { std::map<std::string, std::vector<int>> min_input_shape = {
{"x", {1, 3, imgH, 10}}, {"lstm_0.tmp_0", {10, 1, 96}}}; {"x", {1, 3, imgH, 10}}, {"lstm_0.tmp_0", {10, 1, 96}}};
std::map<std::string, std::vector<int>> max_input_shape = { std::map<std::string, std::vector<int>> max_input_shape = {
{"x", {1, 3, imgH, 2000}}, {"lstm_0.tmp_0", {1000, 1, 96}}}; {"x", {this->rec_batch_num_, 3, imgH, 2500}},
{"lstm_0.tmp_0", {1000, 1, 96}}};
std::map<std::string, std::vector<int>> opt_input_shape = { std::map<std::string, std::vector<int>> opt_input_shape = {
{"x", {1, 3, imgH, imgW}}, {"lstm_0.tmp_0", {25, 1, 96}}}; {"x", {this->rec_batch_num_, 3, imgH, imgW}},
{"lstm_0.tmp_0", {25, 1, 96}}};
config.SetTRTDynamicShapeInfo(min_input_shape, max_input_shape, config.SetTRTDynamicShapeInfo(min_input_shape, max_input_shape,
opt_input_shape); opt_input_shape);
......
# 云上飞桨部署工具
[云上飞桨(PaddleCloud)](https://github.com/PaddlePaddle/PaddleCloud) 是面向飞桨框架及其模型套件的部署工具,
为用户提供了模型套件Docker化部署和Kubernetes集群部署两种方式,可以满足不同场景与环境的部署需求。
本章节我们将使用PaddleCloud提供的OCR标准镜像以及云原生组件来训练和部署PP-OCRv3识别模型。
## 云上飞桨部署工具的优势
<div align="center">
<img src="./images/paddlecloud.png" title="architecture" width="80%" height="80%" alt="">
</div>
- **模型套件Docker镜像大礼包。**
PaddleCloud为用户提供了飞桨模型套件Docker镜像大礼包,这些镜像中包含运行模型套件案例的所有依赖并能持续更新,支持异构硬件环境和常见CUDA版本、开箱即用。
- **具有丰富的云上飞桨组件。**
云上飞桨具有丰富的云原生功能组件,包括样本数据缓存组件、分布式训练组件、推理服务组件等,使用这些组件用户可以快速地在Kubernetes集群上进行训练和部署工作。
- **功能强大的自运维能力。**
云上飞桨组件基于Kubernetes的Operator机制提供了功能强大的自运维能力,如训练组件支持多种架构模式并具有分布式容错与弹性训练的能力,推理服务组件支持自动扩缩容与蓝绿发版等。
- **针对飞桨框架的定制优化。**
除了部署便捷与自运维的优势,PaddleCloud还针对飞桨框架进行了正对性优化,如通过缓存样本数据来加速云上飞桨分布式训练作业、基于飞桨框架和调度器的协同设计来优化集群GPU利用率等。
## 1. PP-OCRv3 Docker化部署
PaddleCloud基于 [Tekton](https://github.com/tektoncd/pipeline) 为OCR模型套件提供了镜像持续构建的能力,并支持CPU、GPU以及常见CUDA版本的镜像。
您可以查看 [PaddleOCR 镜像仓库](https://hub.docker.com/repository/docker/paddlecloud/paddleocr) 来获取所有的镜像列表。
同时我们也将PP-OCRv3识别模型的训练与推理实战案例放置到了AI Studio平台上,您可以点击 [PP-OCRv3识别训推一体项目实战](https://aistudio.baidu.com/aistudio/projectdetail/3916206?channelType=0&channel=0) 在平台上快速体验。
> **适用场景**:本地测试开发环境、单机部署环境。
### 1.1 安装Docker
如果您所使用的机器上还没有安装 Docker,您可以参考 [Docker 官方文档](https://docs.docker.com/get-docker/) 来进行安装。
如果您需要使用支持 GPU 版本的镜像,则还需安装好NVIDIA相关驱动和 [nvidia-docker](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#docker)
**注意**:如果您使用的是Windows系统,需要开启 [WSL2(Linux子系统功能)功能](https://docs.microsoft.com/en-us/windows/wsl/install)
### 1.2 启动容器
**使用CPU版本的Docker镜像**
```bash
# 这是加上参数 --shm-size=32g 是为了防止容器里内存不足
docker run --name ppocr -v $PWD:/mnt -p 8888:8888 -it --shm-size=32g paddlecloud/paddleocr:2.5-cpu-efbb0a /bin/bash
```
**使用GPU版本的Docker镜像**
```bash
docker run --name ppocr --runtime=nvidia -v $PWD:/mnt -p 8888:8888 -it --shm-size=32g paddlecloud/paddleocr:2.5-gpu-cuda10.2-cudnn7-efbb0a /bin/bash
```
进入容器内,则可进行 PP-OCRv3 模型的训练和部署工作。
### 1.3 准备训练数据
本教程以HierText数据集为例,HierText是第一个具有自然场景和文档中文本分层注释的数据集。
该数据集包含从 Open Images 数据集中选择的 11639 张图像,提供高质量的单词 (~1.2M)、行和段落级别的注释。
我们已经将数据集上传到百度云对象存储(BOS),您可以通过运行如下指令,完成数据集的下载和解压操作:
```bash
# 下载数据集
$ wget -P /mnt https://paddleflow-public.hkg.bcebos.com/ppocr/hiertext1.tar
# 解压数据集
$ tar xf /mnt/hiertext1.tar -C /mnt && mv /mnt/hiertext1 /mnt/hiertext
```
运行上述命令后,在 `/mnt` 目录下包含以下文件:
```
/mnt/hiertext
└─ train/ HierText训练集数据
└─ validation/ HierText验证集数据
└─ label_hiertext_train.txt HierText训练集的行标注
└─ label_hiertext_val.txt HierText验证集的行标注
```
### 1.4 修改配置文件
PP-OCRv3模型配置文件位于`/home/PaddleOCR/configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml`,需要修改的配置如下:
- 修改训练数据配置:
```yaml
Train:
dataset:
name: SimpleDataSet
data_dir: ./train_data/icdar2015/text_localization/
label_file_list:
- ./train_data/icdar2015/text_localization/train_icdar2015_label.txt
```
修改为:
```yaml
Train:
dataset:
name: SimpleDataSet
data_dir: /mnt/
label_file_list:
- /mnt/hiertext/label_hiertext_train.txt
```
- 修改验证数据配置:
```yaml
Eval:
dataset:
name: SimpleDataSet
data_dir: ./train_data/icdar2015/text_localization/
label_file_list:
- ./train_data/icdar2015/text_localization/test_icdar2015_label.txt
```
修改为:
```yaml
Eval:
dataset:
name: SimpleDataSet
data_dir: /mnt/
label_file_list:
- /mnt/hiertext/label_hiertext_val.txt
```
### 1.5 启动训练
下载PP-OCRv3的蒸馏预训练模型并进行训练的方式如下
```bash
# 下载预训练模型到/home/PaddleOCR/pre_train文件夹下
$ mkdir /home/PaddleOCR/pre_train
$ wget -P /home/PaddleOCR/pre_train https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar
$ tar xf /home/PaddleOCR/pre_train/ch_PP-OCRv3_det_distill_train.tar -C /home/PaddleOCR/pre_train/
```
启动训练,训练模型默认保存在`output`目录下,加载PP-OCRv3检测预训练模型。
```bash
# 这里以 GPU 训练为例,使用 CPU 进行训练的话,需要指定参数 Global.use_gpu=false
python3 tools/train.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml -o Global.save_model_dir=./output/ Global.pretrained_model=./pre_train/ch_PP-OCRv3_det_distill_train/best_accuracy
```
如果要使用多GPU分布式训练,请使用如下命令:
```bash
# 启动训练,训练模型默认保存在output目录下,--gpus '0,1,2,3'表示使用0,1,2,3号GPU训练
python3 -m paddle.distributed.launch --log_dir=./debug/ --gpus '0,1,2,3' tools/train.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml -o Global.save_model_dir=./output/ Global.pretrained_model=./pre_train/ch_PP-OCRv3_det_distill_train/best_accuracy
```
### 1.6 模型评估
训练过程中保存的模型在output目录下,包含以下文件:
```
best_accuracy.states
best_accuracy.pdparams # 默认保存最优精度的模型参数
best_accuracy.pdopt # 默认保存最优精度的优化器相关参数
latest.states
latest.pdparams # 默认保存的最新模型参数
latest.pdopt # 默认保存的最新模型的优化器相关参数
```
其中,best_accuracy是保存的最优模型,可以直接使用该模型评估
```bash
# 进行模型评估
cd /home/PaddleOCR/
python3 tools/eval.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml -o Global.checkpoints=./output/best_accuracy
```
## 2. PP-OCRv3云端部署
PaddleCloud基于Kubernetes的Operator机制为您提供了多个功能强大的云原生组件,如样本数据缓存组件、分布式训练组件、 以及模型推理服务组件,
使用这些组件您可以快速地在云上进行分布式训练和模型服务化部署。更多关于PaddleCloud云原生组件的内容,请参考文档 [PaddleCloud架构概览](https://github.com/PaddlePaddle/PaddleCloud/blob/main/docs/zh_CN/paddlecloud-overview.md)
> **适用场景**:基于Kubernetes的多机部署环境。
### 2.1 安装云上飞桨组件
**环境要求**
- [Kubernetes v1.16+](https://kubernetes.io/zh/)
- [kubectl](https://kubernetes.io/docs/tasks/tools/)
- [Helm](https://helm.sh/zh/docs/intro/install/)
如果您没有Kubernetes环境,可以使用MicroK8S在本地搭建环境,更多详情请参考 [MicroK8S官方文档](https://microk8s.io/docs/getting-started)
使用Helm一键安装所有组件和所有依赖
```bash
# 添加PaddleCloud Chart仓库
$ helm repo add paddlecloud https://paddleflow-public.hkg.bcebos.com/charts
$ helm repo update
# 安装云上飞桨组件
$ helm install pdc paddlecloud/paddlecloud --set tags.all-dep=true --namespace paddlecloud --create-namespace
# 检查所有云上飞桨组件是否成功启动,命名空间下的所有Pod都为Runing状态则安装成功。
$ kubectl get pods -n paddlecloud
NAME READY STATUS RESTARTS AGE
pdc-hostpath-5b6bd6787d-bxvxg 1/1 Running 0 10h
juicefs-csi-node-pkldt 3/3 Running 0 10h
juicefs-csi-controller-0 3/3 Running 0 10h
pdc-paddlecloud-sampleset-767bdf6947-pb6zm 1/1 Running 0 10h
pdc-paddlecloud-paddlejob-7cc8b7bfc6-7gqnh 1/1 Running 0 10h
pdc-minio-7cc967669d-824q5 1/1 Running 0 10h
pdc-redis-master-0 1/1 Running 0 10h
```
更多安装参数请参考[PaddleCloud安装指南](https://github.com/PaddlePaddle/PaddleCloud/blob/main/docs/zh_CN/installation.md)
### 2.2 云原生组件介绍
<div align="center">
<img src="./images/architecture.jpeg" title="architecture" width="60%" height="60%" alt="">
</div>
- **数据缓存组件。** 数据缓存组件使用JuiceFS作为缓存引擎,能够将远程样本数据缓存到训练集群本地,大幅加速云上飞桨分布式训练作业。
- **分布式训练组件。** 分布式训练组件支持参数服务器(PS)与集合通信(Collective)两种架构模式,方便用户在云上快速运行飞桨分布式训练作业。
以下内容我们将使用这两个云原生组件来在Kubernetes集群中部署PP-OCRv3识别模型的训练作业。
### 2.3 准备hiertext数据集
使用数据缓存组件来准备数据集,编写SampleSet Yaml文件如下:
```yaml
# hiertext.yaml
apiVersion: batch.paddlepaddle.org/v1alpha1
kind: SampleSet
metadata:
name: hiertext
namespace: paddlecloud
spec:
partitions: 1
source:
uri: bos://paddleflow-public.hkg.bcebos.com/ppocr/hiertext
secretRef:
name: none
secretRef:
name: data-center
```
然后在命令行中,使用kubectl执行如下命令。
```bash
# 创建hiertext数据集
$ kubectl apply -f hiertext.yaml
sampleset.batch.paddlepaddle.org/hiertext created
# 查看数据集的状态
$ kubectl get sampleset hiertext -n paddlecloud
NAME TOTAL SIZE CACHED SIZE AVAIL SPACE RUNTIME PHASE AGE
hiertext 3.3 GiB 3.2 GiB 12 GiB 1/1 Ready 11m
```
### 2.4 训练PP-OCRv3模型
使用训练组件在Kubernetes集群上训练PP-OCRv3模型,编写PaddleJob Yaml文件如下:
```yaml
# ppocrv3.yaml
apiVersion: batch.paddlepaddle.org/v1
kind: PaddleJob
metadata:
name: ppocrv3
namespace: paddlecloud
spec:
cleanPodPolicy: OnCompletion
sampleSetRef:
name: hiertext
namespace: paddlecloud
mountPath: /mnt/hiertext
worker:
replicas: 1
template:
spec:
containers:
- name: ppocrv3
image: paddlecloud/paddleocr:2.5-gpu-cuda10.2-cudnn7-efbb0a
command:
- /bin/bash
args:
- "-c"
- >
mkdir /home/PaddleOCR/pre_train &&
wget -P ./pre_train https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar &&
tar xf ./pre_train/ch_PP-OCRv3_det_distill_train.tar -C ./pre_train/ &&
python tools/train.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml -o
Train.dataset.data_dir=/mnt/
Train.dataset.label_file_list=[\"/mnt/hiertext/label_hiertext_train.txt\"]
Eval.dataset.data_dir=/mnt/
Eval.dataset.label_file_list=[\"/mnt/hiertext/label_hiertext_val.txt\"]
Global.save_model_dir=./output/
Global.pretrained_model=./pre_train/ch_PP-OCRv3_det_distill_train/best_accuracy
resources:
limits:
nvidia.com/gpu: 1
volumeMounts: # 添加 shared memory 挂载以防止缓存出错
- mountPath: /dev/shm
name: dshm
volumes:
- name: dshm
emptyDir:
medium: Memory
```
本案例采用GPU进行训练,如果您只有CPU机器,则可以将镜像替换成CPU版本 `paddlecloud/paddleocr:2.5-cpu-efbb0a`,并在args中加上参数`Global.use_gpu=false`
```bash
# 创建PaddleJob训练模型
$ kubectl apply -f ppocrv3.yaml
paddlejob.batch.paddlepaddle.org/ppocrv3 created
# 查看PaddleJob状态
$ kubectl get pods -n paddlecloud -l paddle-res-name=ppocrv3-worker-0
NAME READY STATUS RESTARTS AGE
ppocrv3-worker-0 1/1 Running 0 4s
# 查看训练日志
$ kubectl logs -f ppocrv3-worker-0 -n paddlecloud
```
## 更多资源
欢迎关注[云上飞桨项目PaddleCloud](https://github.com/PaddlePaddle/PaddleCloud),我们为您提供了飞桨模型套件标准镜像以及全栈的云原生模型套件部署组件,如您有任何关于飞桨模型套件的部署问题,请联系我们。
如果你发现任何PaddleCloud存在的问题或者是建议, 欢迎通过[GitHub Issues](https://github.com/PaddlePaddle/PaddleCloud/issues)给我们提issues。
\ No newline at end of file
...@@ -19,8 +19,16 @@ import base64 ...@@ -19,8 +19,16 @@ import base64
import os import os
import argparse import argparse
def str2bool(v):
return v.lower() in ("true", "t", "1")
parser = argparse.ArgumentParser(description="args for paddleserving") parser = argparse.ArgumentParser(description="args for paddleserving")
parser.add_argument("--image_dir", type=str, default="../../doc/imgs/") parser.add_argument("--image_dir", type=str, default="../../doc/imgs/")
parser.add_argument("--det", type=str2bool, default=True)
parser.add_argument("--rec", type=str2bool, default=True)
args = parser.parse_args() args = parser.parse_args()
...@@ -46,11 +54,15 @@ for idx, img_file in enumerate(os.listdir(test_img_dir)): ...@@ -46,11 +54,15 @@ for idx, img_file in enumerate(os.listdir(test_img_dir)):
# check success # check success
if result["err_no"] == 0: if result["err_no"] == 0:
ocr_result = result["value"][0] ocr_result = result["value"][0]
if not args.det:
print(ocr_result)
else:
try: try:
for item in eval(ocr_result): for item in eval(ocr_result):
# return transcription and points # return transcription and points
print("{}, {}".format(item[0], item[1])) print("{}, {}".format(item[0], item[1]))
except Exception as e: except Exception as e:
print(ocr_result)
print("No results") print("No results")
continue continue
......
...@@ -47,9 +47,9 @@ python deploy/slim/quantization/quant.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3 ...@@ -47,9 +47,9 @@ python deploy/slim/quantization/quant.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3
``` ```
# 下载检测预训练模型: # 下载检测预训练模型:
wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar
tar xf https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar tar xf ch_PP-OCRv3_det_distill_train.tar
python deploy/slim/quantization/quant.py -c configs/det/ch_PP-OCRv3_det/ch_PP-OCRv3_det_cml.yml -o Global.pretrained_model='./ch_PP-OCRv3_det_distill_train/best_accuracy' Global.save_model_dir=./output/quant_model_distill/ python deploy/slim/quantization/quant.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml -o Global.pretrained_model='./ch_PP-OCRv3_det_distill_train/best_accuracy' Global.save_model_dir=./output/quant_model_distill/
``` ```
如果要训练识别模型的量化,修改配置文件和加载的模型参数即可。 如果要训练识别模型的量化,修改配置文件和加载的模型参数即可。
......
...@@ -54,9 +54,9 @@ Model distillation and model quantization can be used at the same time, taking t ...@@ -54,9 +54,9 @@ Model distillation and model quantization can be used at the same time, taking t
``` ```
# download provided model # download provided model
wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar
tar xf https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar tar xf ch_PP-OCRv3_det_distill_train.tar
python deploy/slim/quantization/quant.py -c configs/det/ch_PP-OCRv3_det/ch_PP-OCRv3_det_cml.yml -o Global.pretrained_model='./ch_PP-OCRv3_det_distill_train/best_accuracy' Global.save_model_dir=./output/quant_model_distill/ python deploy/slim/quantization/quant.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml -o Global.pretrained_model='./ch_PP-OCRv3_det_distill_train/best_accuracy' Global.save_model_dir=./output/quant_model_distill/
``` ```
If you want to quantify the text recognition model, you can modify the configuration file and loaded model parameters. If you want to quantify the text recognition model, you can modify the configuration file and loaded model parameters.
......
...@@ -65,6 +65,7 @@ ...@@ -65,6 +65,7 @@
- [x] [NRTR](./algorithm_rec_nrtr.md) - [x] [NRTR](./algorithm_rec_nrtr.md)
- [x] [SAR](./algorithm_rec_sar.md) - [x] [SAR](./algorithm_rec_sar.md)
- [x] [SEED](./algorithm_rec_seed.md) - [x] [SEED](./algorithm_rec_seed.md)
- [x] [SVTR](./algorithm_rec_svtr.md)
参考[DTRB](https://arxiv.org/abs/1904.01906)[3]文字识别训练和评估流程,使用MJSynth和SynthText两个文字识别数据集训练,在IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE数据集上进行评估,算法效果如下: 参考[DTRB](https://arxiv.org/abs/1904.01906)[3]文字识别训练和评估流程,使用MJSynth和SynthText两个文字识别数据集训练,在IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE数据集上进行评估,算法效果如下:
...@@ -82,6 +83,7 @@ ...@@ -82,6 +83,7 @@
|NRTR|NRTR_MTB| 84.21% | rec_mtb_nrtr | [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_mtb_nrtr_train.tar) | |NRTR|NRTR_MTB| 84.21% | rec_mtb_nrtr | [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_mtb_nrtr_train.tar) |
|SAR|Resnet31| 87.20% | rec_r31_sar | [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/rec/rec_r31_sar_train.tar) | |SAR|Resnet31| 87.20% | rec_r31_sar | [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/rec/rec_r31_sar_train.tar) |
|SEED|Aster_Resnet| 85.35% | rec_resnet_stn_bilstm_att | [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/rec/rec_resnet_stn_bilstm_att.tar) | |SEED|Aster_Resnet| 85.35% | rec_resnet_stn_bilstm_att | [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/rec/rec_resnet_stn_bilstm_att.tar) |
|SVTR|SVTR-Tiny| 89.25% | rec_svtr_tiny_none_ctc_en | [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/rec_svtr_tiny_none_ctc_en_train.tar) |
<a name="2"></a> <a name="2"></a>
...@@ -90,5 +92,3 @@ ...@@ -90,5 +92,3 @@
已支持的端到端OCR算法列表(戳链接获取使用教程): 已支持的端到端OCR算法列表(戳链接获取使用教程):
- [x] [PGNet](./algorithm_e2e_pgnet.md) - [x] [PGNet](./algorithm_e2e_pgnet.md)
# 场景应用
\ No newline at end of file
...@@ -59,7 +59,7 @@ python3 tools/infer/predict_det.py --image_dir="./doc/imgs/1.jpg" --det_model_di ...@@ -59,7 +59,7 @@ python3 tools/infer/predict_det.py --image_dir="./doc/imgs/1.jpg" --det_model_di
### 2.1 超轻量中文识别模型推理 ### 2.1 超轻量中文识别模型推理
**注意** `PP-OCRv3`的识别模型使用的输入shape为`3,48,320`, 需要添加参数`--rec_image_shape=3,48,320`,如果不使用`PP-OCRv3`的识别模型,则无需设置该参数 **注意** `PP-OCRv3`的识别模型使用的输入shape为`3,48,320`, 如果使用其他识别模型,则需根据模型设置参数`--rec_image_shape`。此外,`PP-OCRv3`的识别模型默认使用的`rec_algorithm``SVTR_LCNet`,注意和原始`SVTR`的区别
超轻量中文识别模型推理,可以执行如下命令: 超轻量中文识别模型推理,可以执行如下命令:
...@@ -67,7 +67,7 @@ python3 tools/infer/predict_det.py --image_dir="./doc/imgs/1.jpg" --det_model_di ...@@ -67,7 +67,7 @@ python3 tools/infer/predict_det.py --image_dir="./doc/imgs/1.jpg" --det_model_di
# 下载超轻量中文识别模型: # 下载超轻量中文识别模型:
wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar
tar xf ch_PP-OCRv3_rec_infer.tar tar xf ch_PP-OCRv3_rec_infer.tar
python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words/ch/word_4.jpg" --rec_model_dir="./ch_PP-OCRv3_rec_infer/" --rec_image_shape=3,48,320 python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words/ch/word_4.jpg" --rec_model_dir="./ch_PP-OCRv3_rec_infer/"
``` ```
![](../imgs_words/ch/word_4.jpg) ![](../imgs_words/ch/word_4.jpg)
...@@ -121,17 +121,17 @@ Predicts of ./doc/imgs_words/ch/word_4.jpg:['0', 0.9999982] ...@@ -121,17 +121,17 @@ Predicts of ./doc/imgs_words/ch/word_4.jpg:['0', 0.9999982]
## 4. 文本检测、方向分类和文字识别串联推理 ## 4. 文本检测、方向分类和文字识别串联推理
**注意** `PP-OCRv3`的识别模型使用的输入shape为`3,48,320`, 需要添加参数`--rec_image_shape=3,48,320`,如果不使用`PP-OCRv3`的识别模型,则无需设置该参数 **注意** `PP-OCRv3`的识别模型使用的输入shape为`3,48,320`, 如果使用其他识别模型,则需根据模型设置参数`--rec_image_shape`。此外,`PP-OCRv3`的识别模型默认使用的`rec_algorithm``SVTR_LCNet`,注意和原始`SVTR`的区别
以超轻量中文OCR模型推理为例,在执行预测时,需要通过参数`image_dir`指定单张图像或者图像集合的路径、参数`det_model_dir`,`cls_model_dir``rec_model_dir`分别指定检测,方向分类和识别的inference模型路径。参数`use_angle_cls`用于控制是否启用方向分类模型。`use_mp`表示是否使用多进程。`total_process_num`表示在使用多进程时的进程数。可视化识别结果默认保存到 ./inference_results 文件夹里面。 以超轻量中文OCR模型推理为例,在执行预测时,需要通过参数`image_dir`指定单张图像或者图像集合的路径、参数`det_model_dir`,`cls_model_dir``rec_model_dir`分别指定检测,方向分类和识别的inference模型路径。参数`use_angle_cls`用于控制是否启用方向分类模型。`use_mp`表示是否使用多进程。`total_process_num`表示在使用多进程时的进程数。可视化识别结果默认保存到 ./inference_results 文件夹里面。
```shell ```shell
# 使用方向分类器 # 使用方向分类器
python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./ch_PP-OCRv3_det_infer/" --cls_model_dir="./cls/" --rec_model_dir="./ch_PP-OCRv3_rec_infer/" --use_angle_cls=true --rec_image_shape=3,48,320 python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./ch_PP-OCRv3_det_infer/" --cls_model_dir="./cls/" --rec_model_dir="./ch_PP-OCRv3_rec_infer/" --use_angle_cls=true
# 不使用方向分类器 # 不使用方向分类器
python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./ch_PP-OCRv3_det_infer/" --rec_model_dir="./ch_PP-OCRv3_rec_infer/" --use_angle_cls=false --rec_image_shape=3,48,320 python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./ch_PP-OCRv3_det_infer/" --rec_model_dir="./ch_PP-OCRv3_rec_infer/" --use_angle_cls=false
# 使用多进程 # 使用多进程
python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./ch_PP-OCRv3_det_infer/" --rec_model_dir="./ch_PP-OCRv3_rec_infer/" --use_angle_cls=false --use_mp=True --total_process_num=6 --rec_image_shape=3,48,320 python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./ch_PP-OCRv3_det_infer/" --rec_model_dir="./ch_PP-OCRv3_rec_infer/" --use_angle_cls=false --use_mp=True --total_process_num=6
``` ```
执行命令后,识别结果图像如下: 执行命令后,识别结果图像如下:
......
...@@ -424,9 +424,9 @@ Architecture: ...@@ -424,9 +424,9 @@ Architecture:
``` ```
如果是采用DML,即两个小模型互相学习的方法,上述配置文件里的Teacher网络结构需要设置为Student模型一样的配置,具体参考配置文件[ch_PP-OCRv3_det_dml.yml](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.4/configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_dml.yml) 如果是采用DML,即两个小模型互相学习的方法,上述配置文件里的Teacher网络结构需要设置为Student模型一样的配置,具体参考配置文件[ch_PP-OCRv3_det_dml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_dml.yml)

下面介绍[ch_PP-OCRv3_det_cml.yml](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.4/configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml)的配置文件参数: 下面介绍[ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml)的配置文件参数:
``` ```
Architecture: Architecture:
......
...@@ -22,7 +22,7 @@ PaddleOCR提供的可下载模型包括`推理模型`、`训练模型`、`预训 ...@@ -22,7 +22,7 @@ PaddleOCR提供的可下载模型包括`推理模型`、`训练模型`、`预训
|模型类型|模型格式|简介| |模型类型|模型格式|简介|
|--- | --- | --- | |--- | --- | --- |
|推理模型|inference.pdmodel、inference.pdiparams|用于预测引擎推理,[详情](./inference.md)| |推理模型|inference.pdmodel、inference.pdiparams|用于预测引擎推理,[详情](./inference_ppocr.md)|
|训练模型、预训练模型|\*.pdparams、\*.pdopt、\*.states |训练过程中保存的模型的参数、优化器状态和训练中间信息,多用于模型指标评估和恢复训练| |训练模型、预训练模型|\*.pdparams、\*.pdopt、\*.states |训练过程中保存的模型的参数、优化器状态和训练中间信息,多用于模型指标评估和恢复训练|
|nb模型|\*.nb|经过飞桨Paddle-Lite工具优化后的模型,适用于移动端/IoT端等端侧部署场景(需使用飞桨Paddle Lite部署)。| |nb模型|\*.nb|经过飞桨Paddle-Lite工具优化后的模型,适用于移动端/IoT端等端侧部署场景(需使用飞桨Paddle Lite部署)。|
...@@ -114,7 +114,7 @@ PaddleOCR提供的可下载模型包括`推理模型`、`训练模型`、`预训 ...@@ -114,7 +114,7 @@ PaddleOCR提供的可下载模型包括`推理模型`、`训练模型`、`预训
| ka_PP-OCRv3_rec | ppocr/utils/dict/ka_dict.txt |卡纳达文识别|[ka_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/ka_PP-OCRv3_rec.yml)|9.9M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/ka_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/ka_PP-OCRv3_rec_train.tar) | | ka_PP-OCRv3_rec | ppocr/utils/dict/ka_dict.txt |卡纳达文识别|[ka_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/ka_PP-OCRv3_rec.yml)|9.9M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/ka_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/ka_PP-OCRv3_rec_train.tar) |
| ta_PP-OCRv3_rec | ppocr/utils/dict/ta_dict.txt |泰米尔文识别|[ta_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/ta_PP-OCRv3_rec.yml)|9.6M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/ta_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/ta_PP-OCRv3_rec_train.tar) | | ta_PP-OCRv3_rec | ppocr/utils/dict/ta_dict.txt |泰米尔文识别|[ta_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/ta_PP-OCRv3_rec.yml)|9.6M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/ta_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/ta_PP-OCRv3_rec_train.tar) |
| latin_PP-OCRv3_rec | ppocr/utils/dict/latin_dict.txt | 拉丁文识别 | [latin_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/latin_PP-OCRv3_rec.yml) |9.7M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/latin_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/latin_PP-OCRv3_rec_train.tar) | | latin_PP-OCRv3_rec | ppocr/utils/dict/latin_dict.txt | 拉丁文识别 | [latin_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/latin_PP-OCRv3_rec.yml) |9.7M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/latin_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/latin_PP-OCRv3_rec_train.tar) |
| arabic_PP-OCRv3_rec | ppocr/utils/dict/arabic_dict.txt | 阿拉伯字母 | [arabic_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/rec_arabic_lite_train.yml) |9.6M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/arabic_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/arabic_PP-OCRv3_rec_train.tar) | | arabic_PP-OCRv3_rec | ppocr/utils/dict/arabic_dict.txt | 阿拉伯字母 | [arabic_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/arabic_PP-OCRv3_rec.yml) |9.6M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/arabic_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/arabic_PP-OCRv3_rec_train.tar) |
| cyrillic_PP-OCRv3_rec | ppocr/utils/dict/cyrillic_dict.txt | 斯拉夫字母 | [cyrillic_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/cyrillic_PP-OCRv3_rec.yml) |9.6M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/cyrillic_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/cyrillic_PP-OCRv3_rec_train.tar) | | cyrillic_PP-OCRv3_rec | ppocr/utils/dict/cyrillic_dict.txt | 斯拉夫字母 | [cyrillic_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/cyrillic_PP-OCRv3_rec.yml) |9.6M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/cyrillic_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/cyrillic_PP-OCRv3_rec_train.tar) |
| devanagari_PP-OCRv3_rec | ppocr/utils/dict/devanagari_dict.txt |梵文字母 | [devanagari_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/devanagari_PP-OCRv3_rec.yml) |9.9M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/devanagari_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/devanagari_PP-OCRv3_rec_train.tar) | | devanagari_PP-OCRv3_rec | ppocr/utils/dict/devanagari_dict.txt |梵文字母 | [devanagari_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/devanagari_PP-OCRv3_rec.yml) |9.9M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/devanagari_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/devanagari_PP-OCRv3_rec_train.tar) |
......
...@@ -238,10 +238,10 @@ python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs ...@@ -238,10 +238,10 @@ python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs
## 4 预测部署 ## 4 预测部署
除了安装whl包进行快速预测,ppocr 也提供了多种预测部署方式,如有需求可阅读相关文档: 除了安装whl包进行快速预测,ppocr 也提供了多种预测部署方式,如有需求可阅读相关文档:
- [基于Python脚本预测引擎推理](./inference.md) - [基于Python脚本预测引擎推理](./inference_ppocr.md)
- [基于C++预测引擎推理](../../deploy/cpp_infer/readme.md) - [基于C++预测引擎推理](../../deploy/cpp_infer/readme_ch.md)
- [服务化部署](../../deploy/hubserving/readme.md) - [服务化部署](../../deploy/hubserving/readme.md)
- [端侧部署](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/deploy/lite/readme.md) - [端侧部署](../../deploy/lite/readme_ch.md)
- [Benchmark](./benchmark.md) - [Benchmark](./benchmark.md)
......
# 使用Paddle Serving预测推理
阅读本文档之前,请先阅读文档 [基于Python预测引擎推理](./inference.md)
同本地执行预测一样,我们需要保存一份可以用于Paddle Serving的模型。
接下来首先介绍如何将训练的模型转换成Paddle Serving模型,然后将依次介绍文本检测、文本识别以及两者串联基于预测引擎推理。
### 一、 准备环境
我们先安装Paddle Serving相关组件
我们推荐用户使用GPU来做Paddle Serving的OCR服务部署
**CUDA版本:9.X/10.X**
**CUDNN版本:7.X**
**操作系统版本:Linux/Windows**
**Python版本: 2.7/3.5/3.6/3.7**
**Python操作指南:**
目前Serving用于OCR的部分功能还在测试当中,因此在这里我们给出[Servnig latest package](https://github.com/PaddlePaddle/Serving/blob/develop/doc/Latest_Packages_CN.md)
大家根据自己的环境选择需要安装的whl包即可,例如以Python 3.5为例,执行下列命令
```
#CPU/GPU版本选择一个
#GPU版本服务端
#CUDA 9
python -m pip install -U https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.post9-py3-none-any.whl
#CUDA 10
python -m pip install -U https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.post10-py3-none-any.whl
#CPU版本服务端
python -m pip install -U https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server-0.0.0-py3-none-any.whl
#客户端和App包使用以下链接(CPU,GPU通用)
python -m pip install -U https://paddle-serving.bj.bcebos.com/whl/paddle_serving_client-0.0.0-cp36-none-any.whl https://paddle-serving.bj.bcebos.com/whl/paddle_serving_app-0.0.0-py3-none-any.whl
```
## 二、训练模型转Serving模型
在前序文档 [基于Python预测引擎推理](./inference.md) 中,我们提供了如何把训练的checkpoint转换成Paddle模型。Paddle模型通常由一个文件夹构成,内含模型结构描述文件`model`和模型参数文件`params`。Serving模型由两个文件夹构成,用于存放客户端和服务端的配置。
我们以`ch_rec_r34_vd_crnn`模型作为例子,下载链接在:
```
wget --no-check-certificate https://paddleocr.bj.bcebos.com/ch_models/ch_rec_r34_vd_crnn_infer.tar
tar xf ch_rec_r34_vd_crnn_infer.tar
```
因此我们按照Serving模型转换教程,运行下列python文件。
```
python tools/inference_to_serving.py --model_dir ch_rec_r34_vd_crnn
```
最终会在`serving_client_dir``serving_server_dir`生成客户端和服务端的模型配置。其中`serving_server_dir``serving_client_dir`的名字可以自定义。最终文件结构如下
```
/ch_rec_r34_vd_crnn/
├── serving_client_dir # 客户端配置文件夹
└── serving_server_dir # 服务端配置文件夹
```
## 三、文本检测模型Serving推理
启动服务可以根据实际需求选择启动`标准版`或者`快速版`,两种方式的对比如下表:
|版本|特点|适用场景|
|-|-|-|
|标准版|稳定性高,分布式部署|适用于吞吐量大,需要跨机房部署的情况|
|快速版|部署方便,预测速度快|适用于对预测速度要求高,迭代速度快的场景,Windows用户只能选择快速版|
接下来的命令中,我们会指定快速版和标准版的命令。需要说明的是,标准版只能用Linux平台,快速版可以支持Linux/Windows。
文本检测模型推理,默认使用DB模型的配置参数,识别默认为CRNN。
配置文件在`params.py`中,我们贴出配置部分,如果需要做改动,也在这个文件内部进行修改。
```
def read_params():
cfg = Config()
#use gpu
cfg.use_gpu = False # 是否使用GPU
cfg.use_pdserving = True # 是否使用paddleserving,必须为True
#params for text detector
cfg.det_algorithm = "DB" # 检测算法, DB/EAST等
cfg.det_model_dir = "./det_mv_server/" # 检测算法模型路径
cfg.det_max_side_len = 960
#DB params
cfg.det_db_thresh =0.3
cfg.det_db_box_thresh =0.5
cfg.det_db_unclip_ratio =2.0
#EAST params
cfg.det_east_score_thresh = 0.8
cfg.det_east_cover_thresh = 0.1
cfg.det_east_nms_thresh = 0.2
#params for text recognizer
cfg.rec_algorithm = "CRNN" # 识别算法, CRNN/RARE等
cfg.rec_model_dir = "./ocr_rec_server/" # 识别算法模型路径
cfg.rec_image_shape = "3, 32, 320"
cfg.rec_batch_num = 30
cfg.max_text_length = 25
cfg.rec_char_dict_path = "./ppocr_keys_v1.txt" # 识别算法字典文件
cfg.use_space_char = True
#params for text classifier
cfg.use_angle_cls = True # 是否启用分类算法
cfg.cls_model_dir = "./ocr_clas_server/" # 分类算法模型路径
cfg.cls_image_shape = "3, 48, 192"
cfg.label_list = ['0', '180']
cfg.cls_batch_num = 30
cfg.cls_thresh = 0.9
return cfg
```
与本地预测不同的是,Serving预测需要一个客户端和一个服务端,因此接下来的教程都是两行代码。
在正式执行服务端启动命令之前,先export PYTHONPATH到工程主目录下。
```
export PYTHONPATH=$PWD:$PYTHONPATH
cd deploy/pdserving
```
为了方便用户复现Demo程序,我们提供了Chinese and English ultra-lightweight OCR model (8.1M)版本的Serving模型
```
wget --no-check-certificate https://paddleocr.bj.bcebos.com/deploy/pdserving/ocr_pdserving_suite.tar.gz
tar xf ocr_pdserving_suite.tar.gz
```
### 1. 超轻量中文检测模型推理
超轻量中文检测模型推理,可以执行如下命令启动服务端:
```
#根据环境只需要启动其中一个就可以
python det_rpc_server.py #标准版,Linux用户
python det_local_server.py #快速版,Windows/Linux用户
```
客户端
```
python det_web_client.py
```
Serving的推测和本地预测不同点在于,客户端发送请求到服务端,服务端需要检测到文字框之后返回框的坐标,此处没有后处理的图片,只能看到坐标值。
## 四、文本识别模型Serving推理
下面将介绍超轻量中文识别模型推理、基于CTC损失的识别模型推理和基于Attention损失的识别模型推理。对于中文文本识别,建议优先选择基于CTC损失的识别模型,实践中也发现基于Attention损失的效果不如基于CTC损失的识别模型。此外,如果训练时修改了文本的字典,请参考下面的自定义文本识别字典的推理。
### 1. 超轻量中文识别模型推理
超轻量中文识别模型推理,可以执行如下命令启动服务端:
需要注意params.py中的`--use_gpu`的值
```
#根据环境只需要启动其中一个就可以
python rec_rpc_server.py #标准版,Linux用户
python rec_local_server.py #快速版,Windows/Linux用户
```
如果需要使用CPU版本,还需增加 `--use_gpu False`
客户端
```
python rec_web_client.py
```
![](../imgs_words/ch/word_4.jpg)
执行命令后,上面图像的预测结果(识别的文本和得分)会打印到屏幕上,示例如下:
```
{u'result': {u'score': [u'0.89547354'], u'pred_text': ['实力活力']}}
```
## 五、方向分类模型推理
下面将介绍方向分类模型推理。
### 1. 方向分类模型推理
方向分类模型推理, 可以执行如下命令启动服务端:
需要注意params.py中的`--use_gpu`的值
```
#根据环境只需要启动其中一个就可以
python clas_rpc_server.py #标准版,Linux用户
python clas_local_server.py #快速版,Windows/Linux用户
```
客户端
```
python rec_web_client.py
```
![](../imgs_words/ch/word_4.jpg)
执行命令后,上面图像的预测结果(分类的方向和得分)会打印到屏幕上,示例如下:
```
{u'result': {u'direction': [u'0'], u'score': [u'0.9999963']}}
```
## 六、文本检测、方向分类和文字识别串联Serving推理
### 1. 超轻量中文OCR模型推理
在执行预测时,需要通过参数`image_dir`指定单张图像或者图像集合的路径、参数`det_model_dir`,`cls_model_dir``rec_model_dir`分别指定检测,方向分类和识别的inference模型路径。参数`use_angle_cls`用于控制是否启用方向分类模型。与本地预测不同的是,为了减少网络传输耗时,可视化识别结果目前不做处理,用户收到的是推理得到的文字字段。
执行如下命令启动服务端:
需要注意params.py中的`--use_gpu`的值
```
#标准版,Linux用户
#GPU用户
python -m paddle_serving_server_gpu.serve --model det_infer_server --port 9293 --gpu_id 0
python -m paddle_serving_server_gpu.serve --model cls_infer_server --port 9294 --gpu_id 0
python ocr_rpc_server.py
#CPU用户
python -m paddle_serving_server.serve --model det_infer_server --port 9293
python -m paddle_serving_server.serve --model cls_infer_server --port 9294
python ocr_rpc_server.py
#快速版,Windows/Linux用户
python ocr_local_server.py
```
客户端
```
python rec_web_client.py
```
# 更新 # 更新
- 2022.5.9 发布PaddleOCR v2.5。发布内容包括: - 2022.5.9 发布PaddleOCR v2.5。发布内容包括:
- [PP-OCRv3](./doc/doc_ch/ppocr_introduction.md#pp-ocrv3),速度可比情况下,中文场景效果相比于PP-OCRv2再提升5%,英文场景提升11%,80语种多语言模型平均识别准确率提升5%以上; - [PP-OCRv3](./ppocr_introduction.md#pp-ocrv3),速度可比情况下,中文场景效果相比于PP-OCRv2再提升5%,英文场景提升11%,80语种多语言模型平均识别准确率提升5%以上;
- 半自动标注工具[PPOCRLabelv2](./PPOCRLabel):新增表格文字图像、图像关键信息抽取任务和不规则文字图像的标注功能; - 半自动标注工具[PPOCRLabelv2](../../PPOCRLabel):新增表格文字图像、图像关键信息抽取任务和不规则文字图像的标注功能;
- OCR产业落地工具集:打通22种训练部署软硬件环境与方式,覆盖企业90%的训练部署环境需求 - OCR产业落地工具集:打通22种训练部署软硬件环境与方式,覆盖企业90%的训练部署环境需求
- 交互式OCR开源电子书[《动手学OCR》](./doc/doc_ch/ocr_book.md),覆盖OCR全栈技术的前沿理论与代码实践,并配套教学视频。 - 交互式OCR开源电子书[《动手学OCR》](./ocr_book.md),覆盖OCR全栈技术的前沿理论与代码实践,并配套教学视频。
- 2022.5.7 添加对[Weights & Biases](https://docs.wandb.ai/)训练日志记录工具的支持。 - 2022.5.7 添加对[Weights & Biases](https://docs.wandb.ai/)训练日志记录工具的支持。
- 2021.12.21 《OCR十讲》课程开讲,12月21日起每晚八点半线上授课! 【免费】报名地址:https://aistudio.baidu.com/aistudio/course/introduce/25207 - 2021.12.21 《OCR十讲》课程开讲,12月21日起每晚八点半线上授课! 【免费】报名地址:https://aistudio.baidu.com/aistudio/course/introduce/25207
- 2021.12.21 发布PaddleOCR v2.4。OCR算法新增1种文本检测算法(PSENet),3种文本识别算法(NRTR、SEED、SAR);文档结构化算法新增1种关键信息提取算法(SDMGR),3种DocVQA算法(LayoutLM、LayoutLMv2,LayoutXLM)。 - 2021.12.21 发布PaddleOCR v2.4。OCR算法新增1种文本检测算法(PSENet),3种文本识别算法(NRTR、SEED、SAR);文档结构化算法新增1种关键信息提取算法(SDMGR),3种DocVQA算法(LayoutLM、LayoutLMv2,LayoutXLM)。
......
...@@ -40,7 +40,7 @@ Please prepare your environment referring to [prepare the environment](./environ ...@@ -40,7 +40,7 @@ Please prepare your environment referring to [prepare the environment](./environ
The above EAST model is trained using the ICDAR2015 text detection public dataset. For the download of the dataset, please refer to [ocr_datasets](./dataset/ocr_datasets_en.md). The above EAST model is trained using the ICDAR2015 text detection public dataset. For the download of the dataset, please refer to [ocr_datasets](./dataset/ocr_datasets_en.md).
After the data download is complete, please refer to [Text Detection Training Tutorial](./detection.md) for training. PaddleOCR has modularized the code structure, so that you only need to **replace the configuration file** to train different detection models. After the data download is complete, please refer to [Text Detection Training Tutorial](./detection_en.md) for training. PaddleOCR has modularized the code structure, so that you only need to **replace the configuration file** to train different detection models.
<a name="4"></a> <a name="4"></a>
......
...@@ -37,7 +37,7 @@ Please prepare your environment referring to [prepare the environment](./environ ...@@ -37,7 +37,7 @@ Please prepare your environment referring to [prepare the environment](./environ
The above FCE model is trained using the CTW1500 text detection public dataset. For the download of the dataset, please refer to [ocr_datasets](./dataset/ocr_datasets_en.md). The above FCE model is trained using the CTW1500 text detection public dataset. For the download of the dataset, please refer to [ocr_datasets](./dataset/ocr_datasets_en.md).
After the data download is complete, please refer to [Text Detection Training Tutorial](./detection.md) for training. PaddleOCR has modularized the code structure, so that you only need to **replace the configuration file** to train different detection models. After the data download is complete, please refer to [Text Detection Training Tutorial](./detection_en.md) for training. PaddleOCR has modularized the code structure, so that you only need to **replace the configuration file** to train different detection models.
<a name="4"></a> <a name="4"></a>
## 4. Inference and Deployment ## 4. Inference and Deployment
......
...@@ -39,7 +39,7 @@ Please prepare your environment referring to [prepare the environment](./environ ...@@ -39,7 +39,7 @@ Please prepare your environment referring to [prepare the environment](./environ
The above PSE model is trained using the ICDAR2015 text detection public dataset. For the download of the dataset, please refer to [ocr_datasets](./dataset/ocr_datasets_en.md). The above PSE model is trained using the ICDAR2015 text detection public dataset. For the download of the dataset, please refer to [ocr_datasets](./dataset/ocr_datasets_en.md).
After the data download is complete, please refer to [Text Detection Training Tutorial](./detection.md) for training. PaddleOCR has modularized the code structure, so that you only need to **replace the configuration file** to train different detection models. After the data download is complete, please refer to [Text Detection Training Tutorial](./detection_en.md) for training. PaddleOCR has modularized the code structure, so that you only need to **replace the configuration file** to train different detection models.
<a name="4"></a> <a name="4"></a>
## 4. Inference and Deployment ## 4. Inference and Deployment
......
...@@ -36,7 +36,7 @@ The results of detection and recognition are as follows: ...@@ -36,7 +36,7 @@ The results of detection and recognition are as follows:
<a name="Environment_Configuration"></a> <a name="Environment_Configuration"></a>
## 2. Environment Configuration ## 2. Environment Configuration
Please refer to [Operation Environment Preparation](./environment_en.md) to configure PaddleOCR operating environment first, refer to [PaddleOCR Overview and Project Clone](./paddleOCR_overview_en.md) to clone the project Please refer to [Operation Environment Preparation](./environment_en.md) to configure PaddleOCR operating environment first, refer to [Project Clone](./clone_en.md) to clone the project
<a name="Quick_Use"></a> <a name="Quick_Use"></a>
## 3. Quick Use ## 3. Quick Use
......
...@@ -41,6 +41,12 @@ On Total-Text dataset, the text detection result is as follows: ...@@ -41,6 +41,12 @@ On Total-Text dataset, the text detection result is as follows:
| --- | --- | --- | --- | --- | --- | | --- | --- | --- | --- | --- | --- |
|SAST|ResNet50_vd|89.63%|78.44%|83.66%|[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_sast_totaltext_v2.0_train.tar)| |SAST|ResNet50_vd|89.63%|78.44%|83.66%|[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_sast_totaltext_v2.0_train.tar)|
On CTW1500 dataset, the text detection result is as follows:
|Model|Backbone|Precision|Recall|Hmean| Download link|
| --- | --- | --- | --- | --- |---|
|FCE|ResNet50_dcn|88.39%|82.18%|85.27%| [trained model](https://paddleocr.bj.bcebos.com/contribution/det_r50_dcn_fce_ctw_v2.0_train.tar) |
**Note:** Additional data, like icdar2013, icdar2017, COCO-Text, ArT, was added to the model training of SAST. Download English public dataset in organized format used by PaddleOCR from: **Note:** Additional data, like icdar2013, icdar2017, COCO-Text, ArT, was added to the model training of SAST. Download English public dataset in organized format used by PaddleOCR from:
* [Baidu Drive](https://pan.baidu.com/s/12cPnZcVuV1zn5DOd4mqjVw) (download code: 2bpi). * [Baidu Drive](https://pan.baidu.com/s/12cPnZcVuV1zn5DOd4mqjVw) (download code: 2bpi).
* [Google Drive](https://drive.google.com/drive/folders/1ll2-XEVyCQLpJjawLDiRlvo_i4BqHCJe?usp=sharing) * [Google Drive](https://drive.google.com/drive/folders/1ll2-XEVyCQLpJjawLDiRlvo_i4BqHCJe?usp=sharing)
...@@ -58,6 +64,7 @@ Supported text recognition algorithms (Click the link to get the tutorial): ...@@ -58,6 +64,7 @@ Supported text recognition algorithms (Click the link to get the tutorial):
- [x] [NRTR](./algorithm_rec_nrtr_en.md) - [x] [NRTR](./algorithm_rec_nrtr_en.md)
- [x] [SAR](./algorithm_rec_sar_en.md) - [x] [SAR](./algorithm_rec_sar_en.md)
- [x] [SEED](./algorithm_rec_seed_en.md) - [x] [SEED](./algorithm_rec_seed_en.md)
- [x] [SVTR](./algorithm_rec_svtr_en.md)
Refer to [DTRB](https://arxiv.org/abs/1904.01906), the training and evaluation result of these above text recognition (using MJSynth and SynthText for training, evaluate on IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE) is as follow: Refer to [DTRB](https://arxiv.org/abs/1904.01906), the training and evaluation result of these above text recognition (using MJSynth and SynthText for training, evaluate on IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE) is as follow:
...@@ -75,6 +82,7 @@ Refer to [DTRB](https://arxiv.org/abs/1904.01906), the training and evaluation r ...@@ -75,6 +82,7 @@ Refer to [DTRB](https://arxiv.org/abs/1904.01906), the training and evaluation r
|NRTR|NRTR_MTB| 84.21% | rec_mtb_nrtr | [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_mtb_nrtr_train.tar) | |NRTR|NRTR_MTB| 84.21% | rec_mtb_nrtr | [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_mtb_nrtr_train.tar) |
|SAR|Resnet31| 87.20% | rec_r31_sar | [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/rec/rec_r31_sar_train.tar) | |SAR|Resnet31| 87.20% | rec_r31_sar | [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/rec/rec_r31_sar_train.tar) |
|SEED|Aster_Resnet| 85.35% | rec_resnet_stn_bilstm_att | [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/rec/rec_resnet_stn_bilstm_att.tar) | |SEED|Aster_Resnet| 85.35% | rec_resnet_stn_bilstm_att | [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/rec/rec_resnet_stn_bilstm_att.tar) |
|SVTR|SVTR-Tiny| 89.25% | rec_svtr_tiny_none_ctc_en | [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/rec_svtr_tiny_none_ctc_en_train.tar) |
<a name="2"></a> <a name="2"></a>
......
...@@ -33,13 +33,13 @@ Using MJSynth and SynthText two text recognition datasets for training, and eval ...@@ -33,13 +33,13 @@ Using MJSynth and SynthText two text recognition datasets for training, and eval
<a name="2"></a> <a name="2"></a>
## 2. Environment ## 2. Environment
Please refer to ["Environment Preparation"](./environment.md) to configure the PaddleOCR environment, and refer to ["Project Clone"](./clone.md) to clone the project code. Please refer to ["Environment Preparation"](./environment_en.md) to configure the PaddleOCR environment, and refer to ["Project Clone"](./clone_en.md) to clone the project code.
<a name="3"></a> <a name="3"></a>
## 3. Model Training / Evaluation / Prediction ## 3. Model Training / Evaluation / Prediction
Please refer to [Text Recognition Tutorial](./recognition.md). PaddleOCR modularizes the code, and training different recognition models only requires **changing the configuration file**. Please refer to [Text Recognition Tutorial](./recognition_en.md). PaddleOCR modularizes the code, and training different recognition models only requires **changing the configuration file**.
Training: Training:
......
...@@ -33,13 +33,13 @@ Using MJSynth and SynthText two text recognition datasets for training, and eval ...@@ -33,13 +33,13 @@ Using MJSynth and SynthText two text recognition datasets for training, and eval
<a name="2"></a> <a name="2"></a>
## 2. Environment ## 2. Environment
Please refer to ["Environment Preparation"](./environment.md) to configure the PaddleOCR environment, and refer to ["Project Clone"](./clone.md) to clone the project code. Please refer to ["Environment Preparation"](./environment_en.md) to configure the PaddleOCR environment, and refer to ["Project Clone"](./clone_en.md) to clone the project code.
<a name="3"></a> <a name="3"></a>
## 3. Model Training / Evaluation / Prediction ## 3. Model Training / Evaluation / Prediction
Please refer to [Text Recognition Tutorial](./recognition.md). PaddleOCR modularizes the code, and training different recognition models only requires **changing the configuration file**. Please refer to [Text Recognition Tutorial](./recognition_en.md). PaddleOCR modularizes the code, and training different recognition models only requires **changing the configuration file**.
Training: Training:
......
...@@ -29,13 +29,13 @@ Using MJSynth and SynthText two text recognition datasets for training, and eval ...@@ -29,13 +29,13 @@ Using MJSynth and SynthText two text recognition datasets for training, and eval
<a name="2"></a> <a name="2"></a>
## 2. Environment ## 2. Environment
Please refer to ["Environment Preparation"](./environment.md) to configure the PaddleOCR environment, and refer to ["Project Clone"](./clone.md) to clone the project code. Please refer to ["Environment Preparation"](./environment_en.md) to configure the PaddleOCR environment, and refer to ["Project Clone"](./clone_en.md) to clone the project code.
<a name="3"></a> <a name="3"></a>
## 3. Model Training / Evaluation / Prediction ## 3. Model Training / Evaluation / Prediction
Please refer to [Text Recognition Tutorial](./recognition.md). PaddleOCR modularizes the code, and training different recognition models only requires **changing the configuration file**. Please refer to [Text Recognition Tutorial](./recognition_en.md). PaddleOCR modularizes the code, and training different recognition models only requires **changing the configuration file**.
Training: Training:
......
...@@ -31,13 +31,13 @@ Note:In addition to using the two text recognition datasets MJSynth and SynthTex ...@@ -31,13 +31,13 @@ Note:In addition to using the two text recognition datasets MJSynth and SynthTex
<a name="2"></a> <a name="2"></a>
## 2. Environment ## 2. Environment
Please refer to ["Environment Preparation"](./environment.md) to configure the PaddleOCR environment, and refer to ["Project Clone"](./clone.md) to clone the project code. Please refer to ["Environment Preparation"](./environment_en.md) to configure the PaddleOCR environment, and refer to ["Project Clone"](./clone_en.md) to clone the project code.
<a name="3"></a> <a name="3"></a>
## 3. Model Training / Evaluation / Prediction ## 3. Model Training / Evaluation / Prediction
Please refer to [Text Recognition Tutorial](./recognition.md). PaddleOCR modularizes the code, and training different recognition models only requires **changing the configuration file**. Please refer to [Text Recognition Tutorial](./recognition_en.md). PaddleOCR modularizes the code, and training different recognition models only requires **changing the configuration file**.
Training: Training:
......
...@@ -31,13 +31,13 @@ Using MJSynth and SynthText two text recognition datasets for training, and eval ...@@ -31,13 +31,13 @@ Using MJSynth and SynthText two text recognition datasets for training, and eval
<a name="2"></a> <a name="2"></a>
## 2. Environment ## 2. Environment
Please refer to ["Environment Preparation"](./environment.md) to configure the PaddleOCR environment, and refer to ["Project Clone"](./clone.md) to clone the project code. Please refer to ["Environment Preparation"](./environment_en.md) to configure the PaddleOCR environment, and refer to ["Project Clone"](./clone_en.md) to clone the project code.
<a name="3"></a> <a name="3"></a>
## 3. Model Training / Evaluation / Prediction ## 3. Model Training / Evaluation / Prediction
Please refer to [Text Recognition Tutorial](./recognition.md). PaddleOCR modularizes the code, and training different recognition models only requires **changing the configuration file**. Please refer to [Text Recognition Tutorial](./recognition_en.md). PaddleOCR modularizes the code, and training different recognition models only requires **changing the configuration file**.
Training: Training:
......
...@@ -30,13 +30,13 @@ Using MJSynth and SynthText two text recognition datasets for training, and eval ...@@ -30,13 +30,13 @@ Using MJSynth and SynthText two text recognition datasets for training, and eval
<a name="2"></a> <a name="2"></a>
## 2. Environment ## 2. Environment
Please refer to ["Environment Preparation"](./environment.md) to configure the PaddleOCR environment, and refer to ["Project Clone"](./clone.md) to clone the project code. Please refer to ["Environment Preparation"](./environment_en.md) to configure the PaddleOCR environment, and refer to ["Project Clone"](./clone_en.md) to clone the project code.
<a name="3"></a> <a name="3"></a>
## 3. Model Training / Evaluation / Prediction ## 3. Model Training / Evaluation / Prediction
Please refer to [Text Recognition Tutorial](./recognition.md). PaddleOCR modularizes the code, and training different recognition models only requires **changing the configuration file**. Please refer to [Text Recognition Tutorial](./recognition_en.md). PaddleOCR modularizes the code, and training different recognition models only requires **changing the configuration file**.
Training: Training:
......
...@@ -34,7 +34,7 @@ The accuracy (%) and model files of SVTR on the public dataset of scene text rec ...@@ -34,7 +34,7 @@ The accuracy (%) and model files of SVTR on the public dataset of scene text rec
<a name="2"></a> <a name="2"></a>
## 2. Environment ## 2. Environment
Please refer to ["Environment Preparation"](./environment.md) to configure the PaddleOCR environment, and refer to ["Project Clone"](./clone.md) to clone the project code. Please refer to ["Environment Preparation"](./environment_en.md) to configure the PaddleOCR environment, and refer to ["Project Clone"](./clone_en.md) to clone the project code.
#### Dataset Preparation #### Dataset Preparation
...@@ -44,7 +44,7 @@ Please refer to ["Environment Preparation"](./environment.md) to configure the P ...@@ -44,7 +44,7 @@ Please refer to ["Environment Preparation"](./environment.md) to configure the P
<a name="3"></a> <a name="3"></a>
## 3. Model Training / Evaluation / Prediction ## 3. Model Training / Evaluation / Prediction
Please refer to [Text Recognition Tutorial](./recognition.md). PaddleOCR modularizes the code, and training different recognition models only requires **changing the configuration file**. Please refer to [Text Recognition Tutorial](./recognition_en.md). PaddleOCR modularizes the code, and training different recognition models only requires **changing the configuration file**.
Training: Training:
......
...@@ -56,7 +56,7 @@ python3 tools/infer/predict_det.py --image_dir="./doc/imgs/1.jpg" --det_model_di ...@@ -56,7 +56,7 @@ python3 tools/infer/predict_det.py --image_dir="./doc/imgs/1.jpg" --det_model_di
<a name="LIGHTWEIGHT_RECOGNITION"></a> <a name="LIGHTWEIGHT_RECOGNITION"></a>
### 1. Lightweight Chinese Recognition Model Inference ### 1. Lightweight Chinese Recognition Model Inference
**Note**: The input shape used by the recognition model of `PP-OCRv3` is `3,48,320`, and the parameter `--rec_image_shape=3,48,320` needs to be added. If the recognition model of `PP-OCRv3` is not used, this parameter does not need to be set. **Note**: The input shape used by the recognition model of `PP-OCRv3` is `3, 48, 320`. If you use other recognition models, you need to set the parameter `--rec_image_shape` according to the model. In addition, the `rec_algorithm` used by the recognition model of `PP-OCRv3` is `SVTR_LCNet` by default. Note the difference from the original `SVTR`.
For lightweight Chinese recognition model inference, you can execute the following commands: For lightweight Chinese recognition model inference, you can execute the following commands:
...@@ -120,19 +120,18 @@ After executing the command, the prediction results (classification angle and sc ...@@ -120,19 +120,18 @@ After executing the command, the prediction results (classification angle and sc
<a name="CONCATENATION"></a> <a name="CONCATENATION"></a>
## Text Detection Angle Classification and Recognition Inference Concatenation ## Text Detection Angle Classification and Recognition Inference Concatenation
**Note**: The input shape used by the recognition model of `PP-OCRv3` is `3,48,320`, and the parameter `--rec_image_shape=3,48,320` needs to be added. If the recognition model of `PP-OCRv3` is not used, this parameter does not need to be set. **Note**: The input shape used by the recognition model of `PP-OCRv3` is `3, 48, 320`. If you use other recognition models, you need to set the parameter `--rec_image_shape` according to the model. In addition, the `rec_algorithm` used by the recognition model of `PP-OCRv3` is `SVTR_LCNet` by default. Note the difference from the original `SVTR`.
When performing prediction, you need to specify the path of a single image or a folder of images through the parameter `image_dir`, the parameter `det_model_dir` specifies the path to detect the inference model, the parameter `cls_model_dir` specifies the path to angle classification inference model and the parameter `rec_model_dir` specifies the path to identify the inference model. The parameter `use_angle_cls` is used to control whether to enable the angle classification model. The parameter `use_mp` specifies whether to use multi-process to infer `total_process_num` specifies process number when using multi-process. The parameter . The visualized recognition results are saved to the `./inference_results` folder by default. When performing prediction, you need to specify the path of a single image or a folder of images through the parameter `image_dir`, the parameter `det_model_dir` specifies the path to detect the inference model, the parameter `cls_model_dir` specifies the path to angle classification inference model and the parameter `rec_model_dir` specifies the path to identify the inference model. The parameter `use_angle_cls` is used to control whether to enable the angle classification model. The parameter `use_mp` specifies whether to use multi-process to infer `total_process_num` specifies process number when using multi-process. The parameter . The visualized recognition results are saved to the `./inference_results` folder by default.
```shell ```shell
# use direction classifier # use direction classifier
python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./ch_PP-OCRv3_det_infer/" --cls_model_dir="./cls/" --rec_model_dir="./ch_PP-OCRv2_rec_infer/" --use_angle_cls=true --rec_image_shape=3,48,320 python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./ch_PP-OCRv3_det_infer/" --cls_model_dir="./cls/" --rec_model_dir="./ch_PP-OCRv3_rec_infer/" --use_angle_cls=true
# not use use direction classifier # not use use direction classifier
python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./ch_PP-OCRv2_det_infer/" --rec_model_dir="./ch_PP-OCRv2_rec_infer/" --use_angle_cls=false --rec_image_shape=3,48,320 python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./ch_PP-OCRv3_det_infer/" --rec_model_dir="./ch_PP-OCRv3_rec_infer/" --use_angle_cls=false
# use multi-process # use multi-process
python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./ch_PP-OCRv2_det_infer/" --rec_model_dir="./ch_PP-OCRv2_rec_infer/" --use_angle_cls=false --use_mp=True --total_process_num=6 --rec_image_shape=3,48,320 python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./ch_PP-OCRv3_det_infer/" --rec_model_dir="./ch_PP-OCRv3_rec_infer/" --use_angle_cls=false --use_mp=True --total_process_num=6
``` ```
......
...@@ -438,10 +438,10 @@ Architecture: ...@@ -438,10 +438,10 @@ Architecture:
``` ```
If DML is used, that is, the method of two small models learning from each other, the Teacher network structure in the above configuration file needs to be set to the same configuration as the Student model. If DML is used, that is, the method of two small models learning from each other, the Teacher network structure in the above configuration file needs to be set to the same configuration as the Student model.
Refer to the configuration file for details. [ch_PP-OCRv3_det_dml.yml](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.4/configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_dml.yml) Refer to the configuration file for details. [ch_PP-OCRv3_det_dml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_dml.yml)
The following describes the configuration file parameters [ch_PP-OCRv3_det_cml.yml](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.4/configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml): The following describes the configuration file parameters [ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml):
``` ```
Architecture: Architecture:
......
...@@ -20,7 +20,7 @@ The downloadable models provided by PaddleOCR include `inference model`, `traine ...@@ -20,7 +20,7 @@ The downloadable models provided by PaddleOCR include `inference model`, `traine
|model type|model format|description| |model type|model format|description|
|--- | --- | --- | |--- | --- | --- |
|inference model|inference.pdmodel、inference.pdiparams|Used for inference based on Paddle inference engine,[detail](./inference_en.md)| |inference model|inference.pdmodel、inference.pdiparams|Used for inference based on Paddle inference engine,[detail](./inference_ppocr_en.md)|
|trained model, pre-trained model|\*.pdparams、\*.pdopt、\*.states |The checkpoints model saved in the training process, which stores the parameters of the model, mostly used for model evaluation and continuous training.| |trained model, pre-trained model|\*.pdparams、\*.pdopt、\*.states |The checkpoints model saved in the training process, which stores the parameters of the model, mostly used for model evaluation and continuous training.|
|nb model|\*.nb| Model optimized by Paddle-Lite, which is suitable for mobile-side deployment scenarios (Paddle-Lite is needed for nb model deployment). | |nb model|\*.nb| Model optimized by Paddle-Lite, which is suitable for mobile-side deployment scenarios (Paddle-Lite is needed for nb model deployment). |
...@@ -37,7 +37,7 @@ Relationship of the above models is as follows. ...@@ -37,7 +37,7 @@ Relationship of the above models is as follows.
|model name|description|config|model size|download| |model name|description|config|model size|download|
| --- | --- | --- | --- | --- | | --- | --- | --- | --- | --- |
|ch_PP-OCRv3_det_slim| [New] slim quantization with distillation lightweight model, supporting Chinese, English, multilingual text detection |[ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml)| 1.1M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/ch/ch_PP-OCRv3_det_slim_distill_train.tar) / [nb model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_infer.nb)| |ch_PP-OCRv3_det_slim| [New] slim quantization with distillation lightweight model, supporting Chinese, English, multilingual text detection |[ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml)| 1.1M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_distill_train.tar) / [nb model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_infer.nb)|
|ch_PP-OCRv3_det| [New] Original lightweight model, supporting Chinese, English, multilingual text detection |[ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml)| 3.8M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar)| |ch_PP-OCRv3_det| [New] Original lightweight model, supporting Chinese, English, multilingual text detection |[ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml)| 3.8M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar)|
|ch_PP-OCRv2_det_slim| [New] slim quantization with distillation lightweight model, supporting Chinese, English, multilingual text detection|[ch_PP-OCRv2_det_cml.yml](../../configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml)| 3M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_slim_quant_infer.tar)| |ch_PP-OCRv2_det_slim| [New] slim quantization with distillation lightweight model, supporting Chinese, English, multilingual text detection|[ch_PP-OCRv2_det_cml.yml](../../configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml)| 3M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_slim_quant_infer.tar)|
|ch_PP-OCRv2_det| [New] Original lightweight model, supporting Chinese, English, multilingual text detection|[ch_PP-OCRv2_det_cml.yml](../../configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml)|3M|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_distill_train.tar)| |ch_PP-OCRv2_det| [New] Original lightweight model, supporting Chinese, English, multilingual text detection|[ch_PP-OCRv2_det_cml.yml](../../configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml)|3M|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_distill_train.tar)|
...@@ -75,7 +75,7 @@ Relationship of the above models is as follows. ...@@ -75,7 +75,7 @@ Relationship of the above models is as follows.
|model name|description|config|model size|download| |model name|description|config|model size|download|
| --- | --- | --- | --- | --- | | --- | --- | --- | --- | --- |
|ch_PP-OCRv3_rec_slim | [New] Slim qunatization with distillation lightweight model, supporting Chinese, English text recognition |[ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml)| 4.9M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/ch/ch_PP-OCRv3_rec_slim_train.tar) / [nb model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.nb) | |ch_PP-OCRv3_rec_slim | [New] Slim qunatization with distillation lightweight model, supporting Chinese, English text recognition |[ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml)| 4.9M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_train.tar) / [nb model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.nb) |
|ch_PP-OCRv3_rec| [New] Original lightweight model, supporting Chinese, English, multilingual text recognition |[ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml)| 12.4M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_train.tar) | |ch_PP-OCRv3_rec| [New] Original lightweight model, supporting Chinese, English, multilingual text recognition |[ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml)| 12.4M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_train.tar) |
|ch_PP-OCRv2_rec_slim| Slim qunatization with distillation lightweight model, supporting Chinese, English text recognition|[ch_PP-OCRv2_rec.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec.yml)| 9M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_train.tar) | |ch_PP-OCRv2_rec_slim| Slim qunatization with distillation lightweight model, supporting Chinese, English text recognition|[ch_PP-OCRv2_rec.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec.yml)| 9M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_train.tar) |
|ch_PP-OCRv2_rec| Original lightweight model, supporting Chinese, English, multilingual text recognition |[ch_PP-OCRv2_rec_distillation.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec_distillation.yml)|8.5M|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_train.tar) | |ch_PP-OCRv2_rec| Original lightweight model, supporting Chinese, English, multilingual text recognition |[ch_PP-OCRv2_rec_distillation.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec_distillation.yml)|8.5M|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_train.tar) |
...@@ -91,7 +91,7 @@ Relationship of the above models is as follows. ...@@ -91,7 +91,7 @@ Relationship of the above models is as follows.
|model name|description|config|model size|download| |model name|description|config|model size|download|
| --- | --- | --- | --- | --- | | --- | --- | --- | --- | --- |
|en_PP-OCRv3_rec_slim | [New] Slim qunatization with distillation lightweight model, supporting english, English text recognition |[en_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml)| 3.2M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/PP-OCRv3_rec_slim_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_slim_train.tar) / [nb model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_slim_infer.nb) | |en_PP-OCRv3_rec_slim | [New] Slim qunatization with distillation lightweight model, supporting english, English text recognition |[en_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml)| 3.2M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_slim_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_slim_train.tar) / [nb model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_slim_infer.nb) |
|en_PP-OCRv3_rec| [New] Original lightweight model, supporting english, English, multilingual text recognition |[en_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml)| 9.6M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_train.tar) | |en_PP-OCRv3_rec| [New] Original lightweight model, supporting english, English, multilingual text recognition |[en_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml)| 9.6M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_train.tar) |
|en_number_mobile_slim_v2.0_rec|Slim pruned and quantized lightweight model, supporting English and number recognition|[rec_en_number_lite_train.yml](../../configs/rec/multi_language/rec_en_number_lite_train.yml)| 2.7M | [inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/en_number_mobile_v2.0_rec_slim_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/en_number_mobile_v2.0_rec_slim_train.tar) | |en_number_mobile_slim_v2.0_rec|Slim pruned and quantized lightweight model, supporting English and number recognition|[rec_en_number_lite_train.yml](../../configs/rec/multi_language/rec_en_number_lite_train.yml)| 2.7M | [inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/en_number_mobile_v2.0_rec_slim_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/en_number_mobile_v2.0_rec_slim_train.tar) |
|en_number_mobile_v2.0_rec|Original lightweight model, supporting English and number recognition|[rec_en_number_lite_train.yml](../../configs/rec/multi_language/rec_en_number_lite_train.yml)|2.6M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/en_number_mobile_v2.0_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/en_number_mobile_v2.0_rec_train.tar) | |en_number_mobile_v2.0_rec|Original lightweight model, supporting English and number recognition|[rec_en_number_lite_train.yml](../../configs/rec/multi_language/rec_en_number_lite_train.yml)|2.6M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/en_number_mobile_v2.0_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/en_number_mobile_v2.0_rec_train.tar) |
...@@ -108,7 +108,7 @@ Relationship of the above models is as follows. ...@@ -108,7 +108,7 @@ Relationship of the above models is as follows.
| ka_PP-OCRv3_rec | ppocr/utils/dict/ka_dict.txt | Lightweight model for Kannada recognition |[ka_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/ka_PP-OCRv3_rec.yml)|9.9M|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/ka_PP-OCRv3_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/ka_PP-OCRv3_rec_train.tar) | | ka_PP-OCRv3_rec | ppocr/utils/dict/ka_dict.txt | Lightweight model for Kannada recognition |[ka_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/ka_PP-OCRv3_rec.yml)|9.9M|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/ka_PP-OCRv3_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/ka_PP-OCRv3_rec_train.tar) |
| ta_PP-OCRv3_rec | ppocr/utils/dict/ta_dict.txt |Lightweight model for Tamil recognition|[ta_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/ta_PP-OCRv3_rec.yml)|9.6M|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/ta_PP-OCRv3_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/ta_PP-OCRv3_rec_train.tar) | | ta_PP-OCRv3_rec | ppocr/utils/dict/ta_dict.txt |Lightweight model for Tamil recognition|[ta_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/ta_PP-OCRv3_rec.yml)|9.6M|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/ta_PP-OCRv3_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/ta_PP-OCRv3_rec_train.tar) |
| latin_PP-OCRv3_rec | ppocr/utils/dict/latin_dict.txt | Lightweight model for latin recognition | [latin_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/latin_PP-OCRv3_rec.yml) |9.7M|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/latin_PP-OCRv3_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/latin_PP-OCRv3_rec_train.tar) | | latin_PP-OCRv3_rec | ppocr/utils/dict/latin_dict.txt | Lightweight model for latin recognition | [latin_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/latin_PP-OCRv3_rec.yml) |9.7M|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/latin_PP-OCRv3_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/latin_PP-OCRv3_rec_train.tar) |
| arabic_PP-OCRv3_rec | ppocr/utils/dict/arabic_dict.txt | Lightweight model for arabic recognition | [arabic_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/rec_arabic_lite_train.yml) |9.6M|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/arabic_PP-OCRv3_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/arabic_PP-OCRv3_rec_train.tar) | | arabic_PP-OCRv3_rec | ppocr/utils/dict/arabic_dict.txt | Lightweight model for arabic recognition | [arabic_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/arabic_PP-OCRv3_rec.yml) |9.6M|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/arabic_PP-OCRv3_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/arabic_PP-OCRv3_rec_train.tar) |
| cyrillic_PP-OCRv3_rec | ppocr/utils/dict/cyrillic_dict.txt | Lightweight model for cyrillic recognition | [cyrillic_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/cyrillic_PP-OCRv3_rec.yml) |9.6M|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/cyrillic_PP-OCRv3_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/cyrillic_PP-OCRv3_rec_train.tar) | | cyrillic_PP-OCRv3_rec | ppocr/utils/dict/cyrillic_dict.txt | Lightweight model for cyrillic recognition | [cyrillic_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/cyrillic_PP-OCRv3_rec.yml) |9.6M|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/cyrillic_PP-OCRv3_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/cyrillic_PP-OCRv3_rec_train.tar) |
| devanagari_PP-OCRv3_rec | ppocr/utils/dict/devanagari_dict.txt | Lightweight model for devanagari recognition | [devanagari_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/devanagari_PP-OCRv3_rec.yml) |9.9M|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/devanagari_PP-OCRv3_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/devanagari_PP-OCRv3_rec_train.tar) | | devanagari_PP-OCRv3_rec | ppocr/utils/dict/devanagari_dict.txt | Lightweight model for devanagari recognition | [devanagari_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/devanagari_PP-OCRv3_rec.yml) |9.9M|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/devanagari_PP-OCRv3_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/devanagari_PP-OCRv3_rec_train.tar) |
......
...@@ -187,10 +187,10 @@ In addition to installing the whl package for quick forecasting, ...@@ -187,10 +187,10 @@ In addition to installing the whl package for quick forecasting,
PPOCR also provides a variety of forecasting deployment methods. PPOCR also provides a variety of forecasting deployment methods.
If necessary, you can read related documents: If necessary, you can read related documents:
- [Python Inference](./inference_en.md) - [Python Inference](./inference_ppocr_en.md)
- [C++ Inference](../../deploy/cpp_infer/readme_en.md) - [C++ Inference](../../deploy/cpp_infer/readme.md)
- [Serving](../../deploy/hubserving/readme_en.md) - [Serving](../../deploy/hubserving/readme_en.md)
- [Mobile](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/deploy/lite/readme_en.md) - [Mobile](../../deploy/lite/readme.md)
- [Benchmark](./benchmark_en.md) - [Benchmark](./benchmark_en.md)
......
# E-book: *Dive Into OCR* # E-book: *Dive Into OCR*
"Dive Into OCR" is a textbook that combines OCR theory and practice, written by the PaddleOCR team, Chen Zhineng, a Junior Research Fellow at Fudan University, Huang Wenhui, a senior expert in the field of vision at China Mobile Research Institute, and other industry-university-research colleagues, as well as OCR developers. The main features are as follows: "Dive Into OCR" is a textbook that combines OCR theory and practice, written by the PaddleOCR team, Chen Zhineng, a Pre-tenure Professor at Fudan University, Huang Wenhui, a senior expert in the field of vision at China Mobile Research Institute, and other industry-university-research colleagues, as well as OCR developers. The main features are as follows:
- OCR full-stack technology covering text detection, recognition and document analysis - OCR full-stack technology covering text detection, recognition and document analysis
- Closely integrate theory and practice, cross the code implementation gap, and supporting instructional videos - Closely integrate theory and practice, cross the code implementation gap, and supporting instructional videos
......
...@@ -38,7 +38,7 @@ On the basis of PP-OCR, PP-OCRv2 is further optimized in five aspects. The detec ...@@ -38,7 +38,7 @@ On the basis of PP-OCR, PP-OCRv2 is further optimized in five aspects. The detec
PP-OCRv3 upgraded the detection model and recognition model in 9 aspects based on PP-OCRv2: PP-OCRv3 upgraded the detection model and recognition model in 9 aspects based on PP-OCRv2:
- PP-OCRv3 detector upgrades the CML(Collaborative Mutual Learning) text detection strategy proposed in PP-OCRv2, and further optimizes the effect of teacher model and student model respectively. In the optimization of teacher model, a pan module with large receptive field named LK-PAN is proposed and the DML distillation strategy is adopted; In the optimization of student model, a FPN module with residual attention mechanism named RSE-FPN is proposed. - PP-OCRv3 detector upgrades the CML(Collaborative Mutual Learning) text detection strategy proposed in PP-OCRv2, and further optimizes the effect of teacher model and student model respectively. In the optimization of teacher model, a pan module with large receptive field named LK-PAN is proposed and the DML distillation strategy is adopted; In the optimization of student model, a FPN module with residual attention mechanism named RSE-FPN is proposed.
- PP-OCRv3 recognizer is optimized based on text recognition algorithm [SVTR](https://arxiv.org/abs/2205.00159). SVTR no longer adopts RNN by introducing transformers structure, which can mine the context information of text line image more effectively, so as to improve the ability of text recognition. PP-OCRv3 adopts lightweight text recognition network SVTR_LCNet, guided training of CTC loss by attention loss, data augmentation strategy TextConAug, better pre-trained model by self-supervised TextRotNet, UDML(Unified Deep Mutual Learning), and UIM (Unlabeled Images Mining) to accelerate the model and improve the effect. - PP-OCRv3 recognizer is optimized based on text recognition algorithm [SVTR](https://arxiv.org/abs/2205.00159). SVTR no longer adopts RNN by introducing transformers structure, which can mine the context information of text line image more effectively, so as to improve the ability of text recognition. PP-OCRv3 adopts lightweight text recognition network SVTR_LCNet, guided training of CTC by attention, data augmentation strategy TextConAug, better pre-trained model by self-supervised TextRotNet, UDML(Unified Deep Mutual Learning), and UIM (Unlabeled Images Mining) to accelerate the model and improve the effect.
PP-OCRv3 pipeline is as follows: PP-OCRv3 pipeline is as follows:
......
# RECENT UPDATES # RECENT UPDATES
- 2022.5.9 release PaddleOCR v2.5, including: - 2022.5.9 release PaddleOCR v2.5, including:
- [PP-OCRv3](./doc/doc_en/ppocr_introduction_en.md#pp-ocrv3): With comparable speed, the effect of Chinese scene is further improved by 5% compared with PP-OCRv2, the effect of English scene is improved by 11%, and the average recognition accuracy of 80 language multilingual models is improved by more than 5%. - [PP-OCRv3](./ppocr_introduction_en.md#pp-ocrv3): With comparable speed, the effect of Chinese scene is further improved by 5% compared with PP-OCRv2, the effect of English scene is improved by 11%, and the average recognition accuracy of 80 language multilingual models is improved by more than 5%.
- [PPOCRLabelv2](./PPOCRLabel): Add the annotation function for table recognition task, key information extraction task and irregular text image. - [PPOCRLabelv2](../../PPOCRLabel): Add the annotation function for table recognition task, key information extraction task and irregular text image.
- Interactive e-book [*"Dive into OCR"*](./doc/doc_en/ocr_book_en.md), covers the cutting-edge theory and code practice of OCR full stack technology. - Interactive e-book [*"Dive into OCR"*](./ocr_book_en.md), covers the cutting-edge theory and code practice of OCR full stack technology.
- 2022.5.7 Add support for metric and model logging during training to [Weights & Biases](https://docs.wandb.ai/). - 2022.5.7 Add support for metric and model logging during training to [Weights & Biases](https://docs.wandb.ai/).
- 2021.12.21 OCR open source online course starts. The lesson starts at 8:30 every night and lasts for ten days. Free registration: https://aistudio.baidu.com/aistudio/course/introduce/25207 - 2021.12.21 OCR open source online course starts. The lesson starts at 8:30 every night and lasts for ten days. Free registration: https://aistudio.baidu.com/aistudio/course/introduce/25207
- 2021.12.21 release PaddleOCR v2.4, release 1 text detection algorithm (PSENet), 3 text recognition algorithms (NRTR、SEED、SAR), 1 key information extraction algorithm (SDMGR) and 3 DocVQA algorithms (LayoutLM、LayoutLMv2,LayoutXLM). - 2021.12.21 release PaddleOCR v2.4, release 1 text detection algorithm (PSENet), 3 text recognition algorithms (NRTR、SEED、SAR), 1 key information extraction algorithm (SDMGR) and 3 DocVQA algorithms (LayoutLM、LayoutLMv2,LayoutXLM).
......
doc/features.png

1.1 MB | W: | H:

doc/features.png

1.1 MB | W: | H:

doc/features.png
doc/features.png
doc/features.png
doc/features.png
  • 2-up
  • Swipe
  • Onion skin
doc/features_en.png

1.2 MB | W: | H:

doc/features_en.png

1.2 MB | W: | H:

doc/features_en.png
doc/features_en.png
doc/features_en.png
doc/features_en.png
  • 2-up
  • Swipe
  • Onion skin
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"# 文本检测FAQ\n",
"\n",
"本节罗列一些开发者们使用PaddleOCR的文本检测模型常遇到的一些问题,并给出相应的问题解决方法或建议。\n",
"\n",
"FAQ分两个部分来介绍,分别是:\n",
" - 文本检测训练相关\n",
" - 文本检测预测相关"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"## 1. 文本检测训练相关FAQ\n",
"\n",
"**1.1 PaddleOCR提供的文本检测算法包括哪些?**\n",
"\n",
"**A**:PaddleOCR中包含多种文本检测模型,包括基于回归的文本检测方法EAST、SAST,和基于分割的文本检测方法DB,PSENet。\n",
"\n",
"\n",
"**1.2:请问PaddleOCR项目中的中文超轻量和通用模型用了哪些数据集?训练多少样本,gpu什么配置,跑了多少个epoch,大概跑了多久?**\n",
"\n",
"**A**:对于超轻量DB检测模型,训练数据包括开源数据集lsvt,rctw,CASIA,CCPD,MSRA,MLT,BornDigit,iflytek,SROIE和合成的数据集等,总数据量越10W,数据集分为5个部分,训练时采用随机采样策略,在4卡V100GPU上约训练500epoch,耗时3天。\n",
"\n",
"\n",
"**1.3 文本检测训练标签是否需要具体文本标注,标签中的”###”是什么意思?**\n",
"\n",
"**A**:文本检测训练只需要文本区域的坐标即可,标注可以是四点或者十四点,按照左上,右上,右下,左下的顺序排列。PaddleOCR提供的标签文件中包含文本字段,对于文本区域文字不清晰会使用###代替。训练检测模型时,不会用到标签中的文本字段。\n",
" \n",
"**1.4 对于文本行较紧密的情况下训练的文本检测模型效果较差?**\n",
"\n",
"**A**:使用基于分割的方法,如DB,检测密集文本行时,最好收集一批数据进行训练,并且在训练时,并将生成二值图像的[shrink_ratio](https://github.com/PaddlePaddle/PaddleOCR/blob/8b656a3e13631dfb1ac21d2095d4d4a4993ef710/ppocr/data/imaug/make_shrink_map.py?_pjax=%23js-repo-pjax-container%2C%20div%5Bitemtype%3D%22http%3A%2F%2Fschema.org%2FSoftwareSourceCode%22%5D%20main%2C%20%5Bdata-pjax-container%5D#L37)参数调小一些。另外,在预测的时候,可以适当减小[unclip_ratio](https://github.com/PaddlePaddle/PaddleOCR/blob/8b656a3e13631dfb1ac21d2095d4d4a4993ef710/configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml?_pjax=%23js-repo-pjax-container%2C%20div%5Bitemtype%3D%22http%3A%2F%2Fschema.org%2FSoftwareSourceCode%22%5D%20main%2C%20%5Bdata-pjax-container%5D#L59)参数,unclip_ratio参数值越大检测框就越大。\n",
"\n",
"\n",
"**1.5 对于一些尺寸较大的文档类图片, DB在检测时会有较多的漏检,怎么避免这种漏检的问题呢?**\n",
"\n",
"**A**:首先,需要确定是模型没有训练好的问题还是预测时处理的问题。如果是模型没有训练好,建议多加一些数据进行训练,或者在训练的时候多加一些数据增强。\n",
"如果是预测图像过大的问题,可以增大预测时输入的最长边设置参数[det_limit_side_len](https://github.com/PaddlePaddle/PaddleOCR/blob/8b656a3e13631dfb1ac21d2095d4d4a4993ef710/tools/infer/utility.py?_pjax=%23js-repo-pjax-container%2C%20div%5Bitemtype%3D%22http%3A%2F%2Fschema.org%2FSoftwareSourceCode%22%5D%20main%2C%20%5Bdata-pjax-container%5D#L47),默认为960。\n",
"其次,可以通过可视化后处理的分割图观察漏检的文字是否有分割结果,如果没有分割结果,说明是模型没有训练好。如果有完整的分割区域,说明是预测后处理的问题,建议调整[DB后处理参数](https://github.com/PaddlePaddle/PaddleOCR/blob/8b656a3e13631dfb1ac21d2095d4d4a4993ef710/tools/infer/utility.py?_pjax=%23js-repo-pjax-container%2C%20div%5Bitemtype%3D%22http%3A%2F%2Fschema.org%2FSoftwareSourceCode%22%5D%20main%2C%20%5Bdata-pjax-container%5D#L51-L53)。\n",
"\n",
"\n",
"**1.6 DB模型弯曲文本(如略微形变的文档图像)漏检问题?**\n",
"\n",
"**A**: DB后处理中计算文本框平均得分时,是求rectangle区域的平均分数,容易造成弯曲文本漏检,已新增求polygon区域的平均分数,会更准确,但速度有所降低,可按需选择,在相关pr中可查看[可视化对比效果](https://github.com/PaddlePaddle/PaddleOCR/pull/2604)。该功能通过参数 [det_db_score_mode](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/tools/infer/utility.py#L51)进行选择,参数值可选[`fast`(默认)、`slow`],`fast`对应原始的rectangle方式,`slow`对应polygon方式。感谢用户[buptlihang](https://github.com/buptlihang)提[pr](https://github.com/PaddlePaddle/PaddleOCR/pull/2574)帮助解决该问题。\n",
"\n",
"\n",
"**1.7 简单的对于精度要求不高的OCR任务,数据集需要准备多少张呢?**\n",
"\n",
"**A**:(1)训练数据的数量和需要解决问题的复杂度有关系。难度越大,精度要求越高,则数据集需求越大,而且一般情况实际中的训练数据越多效果越好。\n",
"\n",
"(2)对于精度要求不高的场景,检测任务和识别任务需要的数据量是不一样的。对于检测任务,500张图像可以保证基本的检测效果。对于识别任务,需要保证识别字典中每个字符出现在不同场景的行文本图像数目需要大于200张(举例,如果有字典中有5个字,每个字都需要出现在200张图片以上,那么最少要求的图像数量应该在200-1000张之间),这样可以保证基本的识别效果。\n",
"\n",
"\n",
"**1.8 当训练数据量少时,如何获取更多的数据?**\n",
"\n",
"**A**:当训练数据量少时,可以尝试以下三种方式获取更多的数据:(1)人工采集更多的训练数据,最直接也是最有效的方式。(2)基于PIL和opencv基本图像处理或者变换。例如PIL中ImageFont, Image, ImageDraw三个模块将文字写到背景中,opencv的旋转仿射变换,高斯滤波等。(3)利用数据生成算法合成数据,例如pix2pix等算法。\n",
"\n",
"\n",
"**1.9 如何更换文本检测/识别的backbone?**\n",
"\n",
"A:无论是文字检测,还是文字识别,骨干网络的选择是预测效果和预测效率的权衡。一般,选择更大规模的骨干网络,例如ResNet101_vd,则检测或识别更准确,但预测耗时相应也会增加。而选择更小规模的骨干网络,例如MobileNetV3_small_x0_35,则预测更快,但检测或识别的准确率会大打折扣。幸运的是不同骨干网络的检测或识别效果与在ImageNet数据集图像1000分类任务效果正相关。飞桨图像分类套件PaddleClas汇总了ResNet_vd、Res2Net、HRNet、MobileNetV3、GhostNet等23种系列的分类网络结构,在上述图像分类任务的top1识别准确率,GPU(V100和T4)和CPU(骁龙855)的预测耗时以及相应的117个预训练模型下载地址。\n",
"\n",
"(1)文字检测骨干网络的替换,主要是确定类似与ResNet的4个stages,以方便集成后续的类似FPN的检测头。此外,对于文字检测问题,使用ImageNet训练的分类预训练模型,可以加速收敛和效果提升。\n",
"\n",
"(2)文字识别的骨干网络的替换,需要注意网络宽高stride的下降位置。由于文本识别一般宽高比例很大,因此高度下降频率少一些,宽度下降频率多一些。可以参考[PaddleOCR中MobileNetV3骨干网络的改动](https://github.com/PaddlePaddle/PaddleOCR/blob/release%2F2.3/ppocr/modeling/backbones/rec_mobilenet_v3.py)。\n",
"\n",
"\n",
"**1.10 如何对检测模型finetune,比如冻结前面的层或某些层使用小的学习率学习?**\n",
"\n",
"**A**:如果是冻结某些层,可以将变量的stop_gradient属性设置为True,这样计算这个变量之前的所有参数都不会更新了,参考:https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/faq/train_cn.html#id4\n",
"\n",
"如果对某些层使用更小的学习率学习,静态图里还不是很方便,一个方法是在参数初始化的时候,给权重的属性设置固定的学习率,参考:https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api/paddle/fluid/param_attr/ParamAttr_cn.html#paramattr\n",
"\n",
"实际上我们实验发现,直接加载模型去fine-tune,不设置某些层不同学习率,效果也都不错。\n",
"\n",
"**1.11 DB的预处理部分,图片的长和宽为什么要处理成32的倍数?**\n",
"\n",
"**A**:和网络下采样的倍数(stride)有关。以检测中的resnet骨干网络为例,图像输入网络之后,需要经过5次2倍降采样,共32倍,因此建议输入的图像尺寸为32的倍数。\n",
"\n",
"\n",
"**1.12 在PP-OCR系列的模型中,文本检测的骨干网络为什么没有使用SEBlock?**\n",
"\n",
"**A**:SE模块是MobileNetV3网络一个重要模块,目的是估计特征图每个特征通道重要性,给特征图每个特征分配权重,提高网络的表达能力。但是,对于文本检测,输入网络的分辨率比较大,一般是640\\*640,利用SE模块估计特征图每个特征通道重要性比较困难,网络提升能力有限,但是该模块又比较耗时,因此在PP-OCR系统中,文本检测的骨干网络没有使用SE模块。实验也表明,当去掉SE模块,超轻量模型大小可以减小40%,文本检测效果基本不受影响。详细可以参考PP-OCR技术文章,https://arxiv.org/abs/2009.09941.\n",
"\n",
"\n",
"**1.13 PP-OCR检测效果不好,该如何优化?**\n",
"\n",
"**A**: 具体问题具体分析:\n",
"- 如果在你的场景上检测效果不可用,首选是在你的数据上做finetune训练;\n",
"- 如果图像过大,文字过于密集,建议不要过度压缩图像,可以尝试修改检测预处理的resize逻辑,防止图像被过度压缩;\n",
"- 检测框大小过于紧贴文字或检测框过大,可以调整db_unclip_ratio这个参数,加大参数可以扩大检测框,减小参数可以减小检测框大小;\n",
"- 检测框存在很多漏检问题,可以减小DB检测后处理的阈值参数det_db_box_thresh,防止一些检测框被过滤掉,也可以尝试设置det_db_score_mode为'slow';\n",
"- 其他方法可以选择use_dilation为True,对检测输出的feature map做膨胀处理,一般情况下,会有效果改善;\n",
"\n",
"\n",
"## 2. 文本检测预测相关FAQ\n",
"\n",
"**2.1 DB有些框太贴文本了反而去掉了一些文本的边角影响识别,这个问题有什么办法可以缓解吗?**\n",
"\n",
"**A**:可以把后处理的参数[unclip_ratio](https://github.com/PaddlePaddle/PaddleOCR/blob/d80afce9b51f09fd3d90e539c40eba8eb5e50dd6/tools/infer/utility.py?_pjax=%23js-repo-pjax-container%2C%20div%5Bitemtype%3D%22http%3A%2F%2Fschema.org%2FSoftwareSourceCode%22%5D%20main%2C%20%5Bdata-pjax-container%5D#L52)适当调大一点,该参数越大文本框越大。\n",
"\n",
"\n",
"**2.2 为什么PaddleOCR检测预测是只支持一张图片测试?即test_batch_size_per_card=1**\n",
"\n",
"**A**:预测的时候,对图像等比例缩放,最长边960,不同图像等比例缩放后长宽不一致,无法组成batch,所以设置为test_batch_size为1。\n",
"\n",
"\n",
"**2.3 在CPU上加速PaddleOCR的文本检测模型预测?**\n",
"\n",
"**A**:x86 CPU可以使用mkldnn(OneDNN)进行加速;在支持mkldnn加速的CPU上开启[enable_mkldnn](https://github.com/PaddlePaddle/PaddleOCR/blob/8b656a3e13631dfb1ac21d2095d4d4a4993ef710/tools/infer/utility.py#L105)参数。另外,配合增加CPU上预测使用的[线程数num_threads](https://github.com/PaddlePaddle/PaddleOCR/blob/8b656a3e13631dfb1ac21d2095d4d4a4993ef710/tools/infer/utility.py#L106),可以有效加快CPU上的预测速度。\n",
"\n",
"**2.4 在GPU上加速PaddleOCR的文本检测模型预测?**\n",
"\n",
"**A**:GPU加速预测推荐使用TensorRT。\n",
"- 1. 从[链接](https://paddleinference.paddlepaddle.org.cn/master/user_guides/download_lib.html)下载带TensorRT的Paddle安装包或者预测库。\n",
"- 2. 从Nvidia官网下载[TensorRT](https://developer.nvidia.com/tensorrt),注意下载的TensorRT版本与paddle安装包中编译的TensorRT版本一致。\n",
"- 3. 设置环境变量`LD_LIBRARY_PATH`,指向TensorRT的lib文件夹\n",
"```\n",
"export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:<TensorRT-${version}/lib>\n",
"```\n",
"- 4. 开启PaddleOCR预测的[tensorrt选项](https://github.com/PaddlePaddle/PaddleOCR/blob/8b656a3e13631dfb1ac21d2095d4d4a4993ef710/tools/infer/utility.py?_pjax=%23js-repo-pjax-container%2C%20div%5Bitemtype%3D%22http%3A%2F%2Fschema.org%2FSoftwareSourceCode%22%5D%20main%2C%20%5Bdata-pjax-container%5D#L38)。\n",
"\n",
"**2.5 如何在移动端部署PaddleOCR模型?**\n",
"\n",
"**A**: 飞桨Paddle有专门针对移动端部署的工具[PaddleLite](https://github.com/PaddlePaddle/Paddle-Lite),并且PaddleOCR提供了DB+CRNN为demo的android arm部署代码,参考[链接](https://github.com/PaddlePaddle/PaddleOCR/blob/release%2F2.3/deploy/lite/readme.md)。\n",
"\n",
"\n",
"**2.6 如何使用PaddleOCR多进程预测?**\n",
"\n",
"**A**: 近期PaddleOCR新增了[多进程预测控制参数](https://github.com/PaddlePaddle/PaddleOCR/blob/8b656a3e13631dfb1ac21d2095d4d4a4993ef710/tools/infer/utility.py?_pjax=%23js-repo-pjax-container%2C%20div%5Bitemtype%3D%22http%3A%2F%2Fschema.org%2FSoftwareSourceCode%22%5D%20main%2C%20%5Bdata-pjax-container%5D#L111),`use_mp`表示是否使用多进程,`total_process_num`表示在使用多进程时的进程数。具体使用方式请参考[文档](https://github.com/PaddlePaddle/PaddleOCR/blob/release%2F2.3/doc/doc_ch/inference.md#1-%E8%B6%85%E8%BD%BB%E9%87%8F%E4%B8%AD%E6%96%87ocr%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86)。\n",
"\n",
"**2.7 预测时显存爆炸、内存泄漏问题?**\n",
"\n",
"**A**: 如果是训练模型的预测,由于模型太大或者输入图像太大导致显存不够用,可以参考代码在主函数运行前加上paddle.no_grad(),即可减小显存占用。如果是inference模型预测时显存占用过高,可以配置Config时,加入[config.enable_memory_optim()](https://github.com/PaddlePaddle/PaddleOCR/blob/8b656a3e13631dfb1ac21d2095d4d4a4993ef710/tools/infer/utility.py?_pjax=%23js-repo-pjax-container%2C%20div%5Bitemtype%3D%22http%3A%2F%2Fschema.org%2FSoftwareSourceCode%22%5D%20main%2C%20%5Bdata-pjax-container%5D#L267)用于减小内存占用。\n",
"\n",
"另外关于使用Paddle预测时出现内存泄漏的问题,建议安装paddle最新版本,内存泄漏已修复。"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "py35-paddle1.2.0"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"\n",
"# 文本识别算法理论\n",
"\n",
"本章主要介绍文本识别算法的理论知识,包括背景介绍、算法分类和部分经典论文思路。\n",
"\n",
"通过本章的学习,你可以掌握:\n",
"\n",
"1. 文本识别的目标\n",
"\n",
"2. 文本识别算法的分类\n",
"\n",
"3. 各类算法的典型思想\n",
"\n",
"\n",
"## 1 背景介绍\n",
"\n",
"文本识别是OCR(Optical Character Recognition)的一个子任务,其任务为识别一个固定区域的文本内容。在OCR的两阶段方法里,它接在文本检测后面,将图像信息转换为文字信息。\n",
"\n",
"具体地,模型输入一张定位好的文本行,由模型预测出图片中的文字内容和置信度,可视化结果如下图所示:\n",
"\n",
"<center><img src=https://ai-studio-static-online.cdn.bcebos.com/a7c3404f778b489db9c1f686c7d2ff4d63b67c429b454f98b91ade7b89f8e903 width=\"600\"></center>\n",
"\n",
"<center><img src=https://ai-studio-static-online.cdn.bcebos.com/e72b1d6f80c342ac951d092bc8c325149cebb3763ec849ec8a2f54e7c8ad60ca width=\"600\"></center>\n",
"\n",
"\n",
"文本识别的应用场景很多,有文档识别、路标识别、车牌识别、工业编号识别等等,根据实际场景可以把文本识别任务分为两个大类:**规则文本识别**和**不规则文本识别**。\n",
"\n",
"* 规则文本识别:主要指印刷字体、扫描文本等,认为文本大致处在水平线位置\n",
"\n",
"* 不规则文本识别: 往往出现在自然场景中,且由于文本曲率、方向、变形等方面差异巨大,文字往往不在水平位置,存在弯曲、遮挡、模糊等问题。\n",
"\n",
"\n",
"下图展示的是 IC15 和 IC13 的数据样式,它们分别代表了不规则文本和规则文本。可以看出不规则文本往往存在扭曲、模糊、字体差异大等问题,更贴近真实场景,也存在更大的挑战性。\n",
"\n",
"因此目前各大算法都试图在不规则数据集上获得更高的指标。\n",
"\n",
"<center><img src=https://ai-studio-static-online.cdn.bcebos.com/bae4fce1370b4751a3779542323d0765a02a44eace7b44d2a87a241c13c6f8cf width=\"400\">\n",
"<br><center>IC15 图片样例(不规则文本)</center>\n",
"<img src=https://ai-studio-static-online.cdn.bcebos.com/b55800d3276f4f5fad170ea1b567eb770177fce226f945fba5d3247a48c15c34 width=\"400\"></center>\n",
"<br><center>IC13 图片样例(规则文本)</center>\n",
"\n",
"\n",
"不同的识别算法在对比能力时,往往也在这两大类公开数据集上比较。对比多个维度上的效果,目前较为通用的英文评估集合分类如下:\n",
"\n",
"<center><img src=https://ai-studio-static-online.cdn.bcebos.com/4d0aada261064031a16816b39a37f2ff6af70dbb57004cb7a106ae6485f14684 width=\"600\"></center>\n",
"\n",
"## 2 文本识别算法分类\n",
"\n",
"在传统的文本识别方法中,任务分为3个步骤,即图像预处理、字符分割和字符识别。需要对特定场景进行建模,一旦场景变化就会失效。面对复杂的文字背景和场景变动,基于深度学习的方法具有更优的表现。\n",
"\n",
"多数现有的识别算法可用如下统一框架表示,算法流程被划分为4个阶段:\n",
"\n",
"![](https://ai-studio-static-online.cdn.bcebos.com/a2750f4170864f69a3af36fc13db7b606d851f2f467d43cea6fbf3521e65450f)\n",
"\n",
"\n",
"我们整理了主流的算法类别和主要论文,参考下表:\n",
"\n",
"<center>\n",
" \n",
"| 算法类别 | 主要思路 | 主要论文 |\n",
"| -------- | --------------- | -------- |\n",
"| 传统算法 | 滑动窗口、字符提取、动态规划 | - |\n",
"| ctc | 基于ctc的方法,序列不对齐,更快速识别 | CRNN, Rosetta |\n",
"| Attention | 基于attention的方法,应用于非常规文本 | RARE, DAN, PREN |\n",
"| Transformer | 基于transformer的方法 | SRN, NRTR, Master, ABINet |\n",
"| 校正 | 校正模块学习文本边界并校正成水平方向 | RARE, ASTER, SAR | \n",
"| 分割 | 基于分割的方法,提取字符位置再做分类 | Text Scanner, Mask TextSpotter |\n",
" \n",
"</center>\n",
"\n",
"\n",
"### 2.1 规则文本识别\n",
"\n",
"\n",
"文本识别的主流算法有两种,分别是基于 CTC (Conectionist Temporal Classification) 的算法和 Sequence2Sequence 算法,区别主要在解码阶段。\n",
"\n",
"基于 CTC 的算法是将编码产生的序列接入 CTC 进行解码;基于 Sequence2Sequence 的方法则是把序列接入循环神经网络(Recurrent Neural Network, RNN)模块进行循环解码,两种方式都验证有效也是主流的两大做法。\n",
"\n",
"<center><img src=https://ai-studio-static-online.cdn.bcebos.com/f64eee66e4a6426f934c1befc3b138629324cf7360c74f72bd6cf3c0de9d49bd width=\"600\"></center>\n",
"<br><center>左:基于CTC的方法,右:基于Sequece2Sequence的方法 </center>\n",
"\n",
"\n",
"#### 2.1.1 基于CTC的算法\n",
"\n",
"基于 CTC 最典型的算法是CRNN (Convolutional Recurrent Neural Network)[1],它的特征提取部分使用主流的卷积结构,常用的有ResNet、MobileNet、VGG等。由于文本识别任务的特殊性,输入数据中存在大量的上下文信息,卷积神经网络的卷积核特性使其更关注于局部信息,缺乏长依赖的建模能力,因此仅使用卷积网络很难挖掘到文本之间的上下文联系。为了解决这一问题,CRNN文本识别算法引入了双向 LSTM(Long Short-Term Memory) 用来增强上下文建模,通过实验证明双向LSTM模块可以有效的提取出图片中的上下文信息。最终将输出的特征序列输入到CTC模块,直接解码序列结果。该结构被验证有效,并广泛应用在文本识别任务中。Rosetta[2]是FaceBook提出的识别网络,由全卷积模型和CTC组成。Gao Y[3]等人使用CNN卷积替代LSTM,参数更少,性能提升精度持平。\n",
"\n",
"<center><img src=https://ai-studio-static-online.cdn.bcebos.com/d3c96dd9e9794fddb12fa16f926abdd3485194f0a2b749e792e436037490899b width=\"600\"></center>\n",
"<center> CRNN 结构图 </center>\n",
"\n",
"\n",
"#### 2.1.2 Sequence2Sequence 算法\n",
"\n",
"Sequence2Sequence 算法是由编码器 Encoder 把所有的输入序列都编码成一个统一的语义向量,然后再由解码器Decoder解码。在解码器Decoder解码的过程中,不断地将前一个时刻的输出作为后一个时刻的输入,循环解码,直到输出停止符为止。一般编码器是一个RNN,对于每个输入的词,编码器输出向量和隐藏状态,并将隐藏状态用于下一个输入的单词,循环得到语义向量;解码器是另一个RNN,它接收编码器输出向量并输出一系列字以创建转换。受到 Sequence2Sequence 在翻译领域的启发, Shi[4]提出了一种基于注意的编解码框架来识别文本,通过这种方式,rnn能够从训练数据中学习隐藏在字符串中的字符级语言模型。\n",
"\n",
"<center><img src=https://ai-studio-static-online.cdn.bcebos.com/f575333696b7438d919975dc218e61ccda1305b638c5497f92b46a7ec3b85243 width=\"400\" hight=\"500\"></center>\n",
"<center> Sequence2Sequence 结构图 </center>\n",
"\n",
"以上两个算法在规则文本上都有很不错的效果,但由于网络设计的局限性,这类方法很难解决弯曲和旋转的不规则文本识别任务。为了解决这类问题,部分算法研究人员在以上两类算法的基础上提出了一系列改进算法。\n",
"\n",
"### 2.2 不规则文本识别\n",
"\n",
"* 不规则文本识别算法可以被分为4大类:基于校正的方法;基于 Attention 的方法;基于分割的方法;基于 Transformer 的方法。\n",
"\n",
"#### 2.2.1 基于校正的方法\n",
"\n",
"基于校正的方法利用一些视觉变换模块,将非规则的文本尽量转换为规则文本,然后使用常规方法进行识别。\n",
"\n",
"RARE[4]模型首先提出了对不规则文本的校正方案,整个网络分为两个主要部分:一个空间变换网络STN(Spatial Transformer Network) 和一个基于Sequence2Squence的识别网络。其中STN就是校正模块,不规则文本图像进入STN,通过TPS(Thin-Plate-Spline)变换成一个水平方向的图像,该变换可以一定程度上校正弯曲、透射变换的文本,校正后送入序列识别网络进行解码。\n",
"\n",
"<center><img src=https://ai-studio-static-online.cdn.bcebos.com/66406f89507245e8a57969b9bed26bfe0227a8cf17a84873902dd4a464b97bb5 width=\"600\"></center>\n",
"<center> RARE 结构图 </center>\n",
"\n",
"RARE论文指出,该方法在不规则文本数据集上有较大的优势,特别比较了CUTE80和SVTP这两个数据集,相较CRNN高出5个百分点以上,证明了校正模块的有效性。基于此[6]同样结合了空间变换网络(STN)和基于注意的序列识别网络的文本识别系统。\n",
"\n",
"基于校正的方法有较好的迁移性,除了RARE这类基于Attention的方法外,STAR-Net[5]将校正模块应用到基于CTC的算法上,相比传统CRNN也有很好的提升。\n",
"\n",
"#### 2.2.2 基于Attention的方法\n",
"\n",
"基于 Attention 的方法主要关注的是序列之间各部分的相关性,该方法最早在机器翻译领域提出,认为在文本翻译的过程中当前词的结果主要由某几个单词影响的,因此需要给有决定性的单词更大的权重。在文本识别领域也是如此,将编码后的序列解码时,每一步都选择恰当的context来生成下一个状态,这样有利于得到更准确的结果。\n",
"\n",
"R^2AM [7] 首次将 Attention 引入文本识别领域,该模型首先将输入图像通过递归卷积层提取编码后的图像特征,然后利用隐式学习到的字符级语言统计信息通过递归神经网络解码输出字符。在解码过程中引入了Attention 机制实现了软特征选择,以更好地利用图像特征,这一有选择性的处理方式更符合人类的直觉。\n",
"\n",
"<center><img src=https://ai-studio-static-online.cdn.bcebos.com/a64ef10d4082422c8ac81dcda4ab75bf1db285d6b5fd462a8f309240445654d5 width=\"600\"></center>\n",
"<center> R^2AM 结构图 </center>\n",
"\n",
"后续有大量算法在Attention领域进行探索和更新,例如SAR[8]将1D attention拓展到2D attention上,校正模块提到的RARE也是基于Attention的方法。实验证明基于Attention的方法相比CTC的方法有很好的精度提升。\n",
"\n",
"<center><img src=https://ai-studio-static-online.cdn.bcebos.com/4e2507fb58d94ec7a9b4d17151a986c84c5053114e05440cb1e7df423d32cb02 width=\"600\"></center>\n",
"\n",
"\n",
"#### 2.2.3 基于分割的方法\n",
"\n",
"基于分割的方法是将文本行的各字符作为独立个体,相比与对整个文本行做矫正后识别,识别分割出的单个字符更加容易。它试图从输入的文本图像中定位每个字符的位置,并应用字符分类器来获得这些识别结果,将复杂的全局问题简化成了局部问题解决,在不规则文本场景下有比较不错的效果。然而这种方法需要字符级别的标注,数据获取上存在一定的难度。Lyu[9]等人提出了一种用于单词识别的实例分词模型,该模型在其识别部分使用了基于 FCN(Fully Convolutional Network) 的方法。[10]从二维角度考虑文本识别问题,设计了一个字符注意FCN来解决文本识别问题,当文本弯曲或严重扭曲时,该方法对规则文本和非规则文本都具有较优的定位结果。\n",
"\n",
"<center><img src=https://ai-studio-static-online.cdn.bcebos.com/fd3e8ef0d6ce4249b01c072de31297ca5d02fc84649846388f890163b624ff10 width=\"800\"></center>\n",
"<center> Mask TextSpotter 结构图 </center>\n",
"\n",
"\n",
"\n",
"#### 2.2.4 基于Transformer的方法\n",
"\n",
"随着 Transformer 的快速发展,分类和检测领域都验证了 Transformer 在视觉任务中的有效性。如规则文本识别部分所说,CNN在长依赖建模上存在局限性,Transformer 结构恰好解决了这一问题,它可以在特征提取器中关注全局信息,并且可以替换额外的上下文建模模块(LSTM)。\n",
"\n",
"一部分文本识别算法使用 Transformer 的 Encoder 结构和卷积共同提取序列特征,Encoder 由多个 MultiHeadAttentionLayer 和 Positionwise Feedforward Layer 堆叠而成的block组成。MulitHeadAttention 中的 self-attention 利用矩阵乘法模拟了RNN的时序计算,打破了RNN中时序长时依赖的障碍。也有一部分算法使用 Transformer 的 Decoder 模块解码,相比传统RNN可获得更强的语义信息,同时并行计算具有更高的效率。\n",
"\n",
"SRN[11] 算法将Transformer的Encoder模块接在ResNet50后,增强了2D视觉特征。并提出了一个并行注意力模块,将读取顺序用作查询,使得计算与时间无关,最终并行输出所有时间步长的对齐视觉特征。此外SRN还利用Transformer的Eecoder作为语义模块,将图片的视觉信息和语义信息做融合,在遮挡、模糊等不规则文本上有较大的收益。\n",
"\n",
"NRTR[12] 使用了完整的Transformer结构对输入图片进行编码和解码,只使用了简单的几个卷积层做高层特征提取,在文本识别上验证了Transformer结构的有效性。\n",
"\n",
"<center><img src=https://ai-studio-static-online.cdn.bcebos.com/e7859f4469a842f0bd450e7e793a679d6e828007544241d09785c9b4ea2424a2 width=\"800\"></center>\n",
"<center> NRTR 结构图 </center>\n",
"\n",
"SRACN[13]使用Transformer的解码器替换LSTM,再一次验证了并行训练的高效性和精度优势。\n",
"\n",
"## 3 总结\n",
"\n",
"本节主要介绍了文本识别相关的理论知识和主流算法,包括基于CTC的方法、基于Sequence2Sequence的方法以及基于分割的方法,并分别列举了经典论文的思路和贡献。下一节将基于CRNN算法进行实践课程讲解,从组网到优化完成整个训练过程,\n",
"\n",
"## 4 参考文献\n",
"\n",
"\n",
"[1]Shi, B., Bai, X., & Yao, C. (2016). An end-to-end trainable neural network for image-based sequence recognition and its application to scene text recognition. IEEE transactions on pattern analysis and machine intelligence, 39(11), 2298-2304.\n",
"\n",
"[2]Fedor Borisyuk, Albert Gordo, and Viswanath Sivakumar. Rosetta: Large scale system for text detection and recognition in images. In Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining, pages 71–79. ACM, 2018.\n",
"\n",
"[3]Gao, Y., Chen, Y., Wang, J., & Lu, H. (2017). Reading scene text with attention convolutional sequence modeling. arXiv preprint arXiv:1709.04303.\n",
"\n",
"[4]Shi, B., Wang, X., Lyu, P., Yao, C., & Bai, X. (2016). Robust scene text recognition with automatic rectification. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 4168-4176).\n",
"\n",
"[5] Star-Net Max Jaderberg, Karen Simonyan, Andrew Zisserman, et al. Spa- tial transformer networks. In Advances in neural information processing systems, pages 2017–2025, 2015.\n",
"\n",
"[6]Baoguang Shi, Mingkun Yang, XingGang Wang, Pengyuan Lyu, Xiang Bai, and Cong Yao. Aster: An attentional scene text recognizer with flexible rectification. IEEE transactions on pattern analysis and machine intelligence, 31(11):855–868, 2018.\n",
"\n",
"[7] Lee C Y , Osindero S . Recursive Recurrent Nets with Attention Modeling for OCR in the Wild[C]// IEEE Conference on Computer Vision & Pattern Recognition. IEEE, 2016.\n",
"\n",
"[8]Li, H., Wang, P., Shen, C., & Zhang, G. (2019, July). Show, attend and read: A simple and strong baseline for irregular text recognition. In Proceedings of the AAAI Conference on Artificial Intelligence (Vol. 33, No. 01, pp. 8610-8617).\n",
"\n",
"[9]P. Lyu, C. Yao, W. Wu, S. Yan, and X. Bai. Multi-oriented scene text detection via corner localization and region segmentation. In Proc. CVPR, pages 7553–7563, 2018.\n",
"\n",
"[10] Liao, M., Zhang, J., Wan, Z., Xie, F., Liang, J., Lyu, P., ... & Bai, X. (2019, July). Scene text recognition from two-dimensional perspective. In Proceedings of the AAAI Conference on Artificial Intelligence (Vol. 33, No. 01, pp. 8714-8721).\n",
"\n",
"[11] Yu, D., Li, X., Zhang, C., Liu, T., Han, J., Liu, J., & Ding, E. (2020). Towards accurate scene text recognition with semantic reasoning networks. In Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (pp. 12113-12122).\n",
"\n",
"[12] Sheng, F., Chen, Z., & Xu, B. (2019, September). NRTR: A no-recurrence sequence-to-sequence model for scene text recognition. In 2019 International Conference on Document Analysis and Recognition (ICDAR) (pp. 781-786). IEEE.\n",
"\n",
"[13]Yang, L., Wang, P., Li, H., Li, Z., & Zhang, Y. (2020). A holistic representation guided attention network for scene text recognition. Neurocomputing, 414, 67-75.\n",
"\n",
"[14]Wang, T., Zhu, Y., Jin, L., Luo, C., Chen, X., Wu, Y., ... & Cai, M. (2020, April). Decoupled attention network for text recognition. In Proceedings of the AAAI Conference on Artificial Intelligence (Vol. 34, No. 07, pp. 12216-12224).\n",
"\n",
"[15] Wang, Y., Xie, H., Fang, S., Wang, J., Zhu, S., & Zhang, Y. (2021). From two to one: A new scene text recognizer with visual language modeling network. In Proceedings of the IEEE/CVF International Conference on Computer Vision (pp. 14194-14203).\n",
"\n",
"[16] Fang, S., Xie, H., Wang, Y., Mao, Z., & Zhang, Y. (2021). Read Like Humans: Autonomous, Bidirectional and Iterative Language Modeling for Scene Text Recognition. In Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (pp. 7098-7107).\n",
"\n",
"[17] Yan, R., Peng, L., Xiao, S., & Yao, G. (2021). Primitive Representation Learning for Scene Text Recognition. In Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (pp. 284-293)."
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "py35-paddle1.2.0"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"# 1. 课程预备知识\n",
"\n",
"本课所涉及的OCR模型建立在深度学习的基础之上,因此与其相关的基础知识、环境配置、项目工程与其他资料将在本节介绍,尤其对深度学习不熟悉的读者可以查看和学习相应内容。\n",
"\n",
"### 1.1 预备知识\n",
"\n",
"深度学习的“学习”由机器学习中的神经元、感知机、多层神经网络等内容一路发展而来,因此了解基础的机器学习算法对于深度学习的理解和应用有很大帮助。而深度学习的“深”则体现在对大量信息处理过程中使用的卷积、池化等一系列以向量为基础的数学运算。如果缺乏这两者的理论基础,可以学习李宏毅老师的[线性代数](https://aistudio.baidu.com/aistudio/course/introduce/2063)和[机器学习](https://aistudio.baidu.com/aistudio/course/introduce/1978)课程。\n",
"\n",
"对于深度学习本身的理解,可以参考百度杰出架构师毕然老师的零基础课程:[百度架构师手把手带你零基础实践深度学习](https://aistudio.baidu.com/aistudio/course/introduce/1297),其中覆盖了深度学习的发展历史,通过一个经典案例介绍深度学习的完整组成部分,是一套以实践为导向的深度学习课程。\n",
"\n",
"对于理论知识的实践,[Python基础知识](https://aistudio.baidu.com/aistudio/course/introduce/1224)必不可少,同时为了快速复现深度学习模型,本课程使用的深度学习框架为:飞桨PaddlePaddle。如果你已经使用过其他框架,通过[快速上手文档](https://www.paddlepaddle.org.cn/documentation/docs/zh/practices/quick_start/hello_paddle.html)可以迅速了解飞桨的使用方法。\n",
"\n",
"### 1.2 基础环境准备\n",
"\n",
"如果你想在本地环境运行本课程的代码且之前未搭建过Python环境,可以根据[零基础运行环境准备](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.3/doc/doc_ch/environment.md),根据自己的操作系统安装Anaconda或docker环境。\n",
"\n",
"如果你没有本地资源,可以通过AI Studio实训平台完成代码运行,其中的每个项目都通过Notebook的方式呈现,方便开发者学习。若对Notebook的相关操作不熟悉,可以参考[AI Studio项目说明](https://ai.baidu.com/ai-doc/AISTUDIO/0k3e2tfzm)。\n",
"\n",
"### 1.3 获取和运行代码\n",
"\n",
"本课程依托PaddleOCR的代码库形成,首先,克隆PaddleOCR的完整项目:\n",
"\n",
"```bash\n",
"#【推荐】\n",
"git clone https://github.com/PaddlePaddle/PaddleOCR\n",
"\n",
"# 如果因为网络问题无法pull成功,也可选择使用码云上的托管:\n",
"git clone https://gitee.com/paddlepaddle/PaddleOCR\n",
"```\n",
"\n",
"> 注:码云托管代码可能无法实时同步本github项目更新,存在3~5天延时,请优先使用推荐方式。\n",
">\n",
"> ​\t\t如果你不熟悉git操作,可以直接在PaddleOCR的首页的 `Code` 中下载压缩包\n",
"\n",
"然后安装第三方库:\n",
"\n",
"```bash\n",
"cd PaddleOCR\n",
"pip3 install -r requirements.txt\n",
"```\n",
"\n",
"\n",
"\n",
"### 1.4 查阅资料\n",
"\n",
"[PaddleOCR使用文档](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.3/README_ch.md#%E6%96%87%E6%A1%A3%E6%95%99%E7%A8%8B) (中文) 中详细介绍了如何使用PaddleOCR完成模型应用、训练和部署。文档内容丰富,大多数用户的问题都在文档或FAQ中有所描述,尤其在[FAQ(中文)](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.3/doc/doc_ch/FAQ.md)中,按照深度学习的应用过程沉淀了用户的常见问题,建议大家仔细阅读。\n",
"\n",
"### 1.5 寻求帮助\n",
"\n",
"如果你在使用PaddleOCR的过程中遇到BUG、易用性或者文档相关的问题,可通过[Github issue](https://github.com/PaddlePaddle/PaddleOCR/issues)与官方联系,请按照issue模板尽可能多的提供信息,以便官方人员迅速定位问题。同时,微信群是广大PaddleOCR用户的日常交流阵地,更适合提问一些咨询类问题,除了有PaddleOCR团队成员以外,还会有热心开发者回答大家的问题。"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "py35-paddle1.2.0"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Text detection FAQ\n",
"\n",
"This section lists some of the problems that developers often encounter when using PaddleOCR's text detection model, and gives corresponding solutions or suggestions.\n",
"\n",
"The FAQ is introduced in two parts, namely:\n",
" -Text detection training related\n",
" -Text detection and prediction related"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. FAQ about Text Detection Training\n",
"\n",
"**1.1 What are the text detection algorithms provided by PaddleOCR?**\n",
"\n",
"**A**: PaddleOCR contains a variety of text detection models, including regression-based text detection methods EAST and SAST, and segmentation-based text detection methods DB, PSENet.\n",
"\n",
"\n",
"**1.2: What data sets are used in the Chinese ultra-lightweight and general models in the PaddleOCR project? How many samples were trained, what configuration of GPUs, how many epochs were run, and how long did they run?**\n",
"\n",
"**A**: For the ultra-lightweight DB detection model, the training data includes open source data sets lsvt, rctw, CASIA, CCPD, MSRA, MLT, BornDigit, iflytek, SROIE and synthetic data sets, etc. The total data volume is 10W, The data set is divided into 5 parts. A random sampling strategy is used during training. The training takes about 500 epochs on a 4-card V100GPU, which takes 3 days.\n",
"\n",
"\n",
"**1.3 Does the text detection training label require specific text labeling? What does the \"###\" in the label mean?**\n",
"\n",
"**A**: Text detection training only needs the coordinates of the text area. The label can be four or fourteen points, arranged in the order of upper left, upper right, lower right, and lower left. The label file provided by PaddleOCR contains text fields. For unclear text in the text area, ### will be used instead. When training the detection model, the text field in the label will not be used.\n",
" \n",
"**1.4 Is the effect of the text detection model trained when the text lines are tight?**\n",
"\n",
"**A**: When using segmentation-based methods, such as DB, to detect dense text lines, it is best to collect a batch of data for training, and during training, a binary image will be generated [shrink_ratio](https://github.com/PaddlePaddle/PaddleOCR/blob/8b656a3e13631dfb1ac21d2095d4d4a4993ef710/ppocr/data/imaug/make_shrink_map.py?_pjax=%23js-repo-pjax-container%2C%20div%5Bitemtype%3D%22http%3A%2F%2Fschema.org%2FSoftwareSourceCode%22%5D%20main%2C%20%5Bdata-pjax-container%5D#L37)Turn down the parameter. In addition, when forecasting, you can appropriately reduce [unclip_ratio](https://github.com/PaddlePaddle/PaddleOCR/blob/8b656a3e13631dfb1ac21d2095d4d4a4993ef710/configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml?_pjax=%23js-repo-pjax-container%2C%20div%5Bitemtype%3D%22http%3A%2F%2Fschema.org%2FSoftwareSourceCode%22%5D%20main%2C%20%5Bdata-pjax-container%5D#L59) parameter, the larger the unclip_ratio parameter value, the larger the detection frame.\n",
"\n",
"\n",
"**1.5 For some large-sized document images, DB will have more missed inspections during inspection. How to avoid this kind of missed inspections?**\n",
"\n",
"**A**: First of all, you need to determine whether the model is not well-trained or is the problem handled during prediction. If the model is not well trained, it is recommended to add more data for training, or add more data to enhance it during training.\n",
"If the problem is that the predicted image is too large, you can increase the longest side setting parameter [det_limit_side_len] entered during prediction [det_limit_side_len](https://github.com/PaddlePaddle/PaddleOCR/blob/8b656a3e13631dfb1ac21d2095d4d4a4993ef710/tools/infer/utility.py?_pjax=%23js-repo-pjax-container%2C%20div%5Bitemtype%3D%22http%3A%2F%2Fschema.org%2FSoftwareSourceCode%22%5D%20main%2C%20%5Bdata-pjax-container%5D#L47), which is 960 by default.\n",
"Secondly, you can observe whether the missed text has segmentation results by visualizing the post-processed segmentation map. If there is no segmentation result, the model is not well trained. If there is a complete segmentation area, it means that it is a problem of post-prediction processing. In this case, it is recommended to adjust [DB post-processing parameters](https://github.com/PaddlePaddle/PaddleOCR/blob/8b656a3e13631dfb1ac21d2095d4d4a4993ef710/tools/infer/utility.py?_pjax=%23js-repo-pjax-container%2C%20div%5Bitemtype%3D%22http%3A%2F%2Fschema.org%2FSoftwareSourceCode%22%5D%20main%2C%20%5Bdata-pjax-container%5D#L51-L53)。\n",
"\n",
"\n",
"**1.6 The problem of missed detection of DB model bending text (such as a slightly deformed document image)?**\n",
"\n",
"**A**: When calculating the average score of the text box in the DB post-processing, it is the average score of the rectangle area, which is easy to cause the missed detection of the curved text. The average score of the polygon area has been added, which will be more accurate, but the speed is somewhat different. Decrease, can be selected as needed, and you can view the [Visual Contrast Effect] (https://github.com/PaddlePaddle/PaddleOCR/pull/2604) in the relevant pr. This function is selected by the parameter [det_db_score_mode](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/tools/infer/utility.py#L51), the parameter value is optional [`fast` (default) , `slow`], `fast` corresponds to the original rectangle mode, and `slow` corresponds to the polygon mode. Thanks to the user [buptlihang](https://github.com/buptlihang) for mentioning [pr](https://github.com/PaddlePaddle/PaddleOCR/pull/2574) to help solve this problem.\n",
"\n",
"\n",
"**1.7 For simple OCR tasks with low accuracy requirements, how many data sets do I need to prepare?**\n",
"\n",
"**A**: (1) The amount of training data is related to the complexity of the problem to be solved. The greater the difficulty and the higher the accuracy requirements, the greater the data set requirements, and in general, the more training data in practice, the better the effect.\n",
"\n",
"(2) For scenes with low accuracy requirements, the amount of data required for detection tasks and recognition tasks is different. For inspection tasks, 500 images can guarantee the basic inspection results. For recognition tasks, it is necessary to ensure that the number of line text images in which each character in the recognition dictionary appears in different scenes needs to be greater than 200 (for example, if there are 5 words in the dictionary, each word needs to appear in more than 200 pictures, then The minimum required number of images should be between 200-1000), so that the basic recognition effect can be guaranteed.\n",
"\n",
"\n",
"**1.8 How to get more data when the amount of training data is small?**\n",
"\n",
"**A**: When the amount of training data is small, you can try the following three ways to get more data: (1) Collect more training data manually, the most direct and effective way. (2) Basic image processing or transformation based on PIL and opencv. For example, the three modules of ImageFont, Image, ImageDraw in PIL write text into the background, opencv's rotating affine transformation, Gaussian filtering and so on. (3) Synthesize data using data generation algorithms, such as algorithms such as pix2pix.\n",
"\n",
"\n",
"**1.9 How to replace the backbone of text detection/recognition?**\n",
"\n",
"A: Whether it is text detection or text recognition, the choice of backbone network is a trade-off between prediction effect and prediction efficiency. Generally, if you choose a larger-scale backbone network, such as ResNet101_vd, the detection or recognition will be more accurate, but the prediction time will increase accordingly. However, choosing a smaller-scale backbone network, such as MobileNetV3_small_x0_35, will predict faster, but the accuracy of detection or recognition will be greatly reduced. Fortunately, the detection or recognition effects of different backbone networks are positively correlated with the image 1000 classification task in the ImageNet dataset. PaddleClas, a flying paddle image classification suite, summarizes 23 series of classification network structures such as ResNet_vd, Res2Net, HRNet, MobileNetV3, GhostNet, etc. The top1 recognition accuracy rate of the above image classification task, GPU (V100 and T4) and CPU (Snapdragon 855) The prediction time-consuming and the corresponding 117 pre-training model download addresses.\n",
"\n",
"(1) The replacement of the text detection backbone network is mainly to determine 4 stages similar to ResNet to facilitate the integration of subsequent detection heads similar to FPN. In addition, for the text detection problem, the classification pre-training model trained by ImageNet can accelerate the convergence and improve the effect.\n",
"\n",
"(2) The replacement of the backbone network for text recognition requires attention to the drop position of the network width and height stride. Since text recognition generally has a large ratio of width to height, the frequency of height reduction is less, and the frequency of width reduction is more. You can refer to [Changes to the MobileNetV3 backbone network in PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR/blob/release%2F2.3/ppocr/modeling/backbones/rec_mobilenet_v3.py)。\n",
"\n",
"\n",
"**1.10 How to finetune the detection model, such as freezing the previous layer or learning with a small learning rate for some layers?**\n",
"\n",
"**A**: If you freeze certain layers, you can set the stop_gradient property of the variable to True, so that all the parameters before calculating this variable will not be updated, refer to: https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/faq/train_cn.html#id4\n",
"\n",
"If learning with a smaller learning rate for some layers is not very convenient in the static graph, one method is to set a fixed learning rate for the weight attribute when the parameters are initialized, refer to: https://www.paddlepaddle.org.cn/documentation/docs/en/develop/api/paddle/fluid/param_attr/ParamAttr_cn.html#paramattr\n",
"\n",
"In fact, our experiment found that directly loading the model to fine-tune without setting different learning rates of certain layers, the effect is also good.\n",
"\n",
"**1.11 In the preprocessing part of DB, why should the length and width of the picture be processed into multiples of 32?**\n",
"\n",
"**A**: It is related to the stride of the network downsampling. Take the resnet backbone network under inspection as an example. After the image is input to the network, it needs to be downsampled by 2 times for 5 times, a total of 32 times. Therefore, it is recommended that the input image size be a multiple of 32.\n",
"\n",
"\n",
"**1.12 In the PP-OCR series models, why does the backbone network for text detection not use SEBlock?**\n",
"\n",
"**A**: The SE module is an important module of the MobileNetV3 network. Its purpose is to estimate the importance of each feature channel of the feature map, assign weights to each feature of the feature map, and improve the expressive ability of the network. However, for text detection, the resolution of the input network is relatively large, generally 640\\*640. It is difficult to use the SE module to estimate the importance of each feature channel of the feature map. The network improvement ability is limited, but the module is relatively time-consuming. In the PP-OCR system, the backbone network for text detection does not use the SE module. Experiments also show that when the SE module is removed, the size of the ultra-lightweight model can be reduced by 40%, and the text detection effect is basically not affected. For details, please refer to the PP-OCR technical article, https://arxiv.org/abs/2009.09941.\n",
"\n",
"\n",
"**1.13 The PP-OCR detection effect is not good, how to optimize it?**\n",
"\n",
"**A**: Specific analysis of specific issues:\n",
"- If the detection effect is not available on your scene, the first choice is to do finetune training on your data;\n",
"- If the image is too large and the text is too dense, it is recommended not to over-compress the image. You can try to modify the resize logic of the detection preprocessing to prevent the image from being over-compressed;\n",
"- The size of the detection frame is too close to the text or the detection frame is too large, you can adjust the db_unclip_ratio parameter, increasing the parameter can enlarge the detection frame, and reducing the parameter can reduce the size of the detection frame;\n",
"- There are many missed detection problems in the detection frame, which can reduce the threshold parameter det_db_box_thresh for DB detection to prevent some detection frames from being filtered out. You can also try to set det_db_score_mode to'slow';\n",
"- Other methods can choose use_dilation as True to expand the feature map of the detection output. In general, the effect will be improved.\n",
"\n",
"\n",
"## 2. FAQ about Text Detection and Prediction\n",
"\n",
"**2.1 In DB, some boxes are too pasted with text, but some corners of the text are removed to affect the recognition. Is there any way to alleviate this problem?**\n",
"\n",
"**A**: The post-processing parameter [unclip_ratio](https://github.com/PaddlePaddle/PaddleOCR/blob/d80afce9b51f09fd3d90e539c40eba8eb5e50dd6/tools/infer/utility.py?_pjax=%23js-repo-pjax-container%2C%20div%5Bitemtype%3D%22http%3A%2F%2Fschema.org%2FSoftwareSourceCode%22%5D%20main%2C%20%5Bdata-pjax-container%5D#L52) can be appropriately increased. the larger the parameter, the larger the text box.\n",
"\n",
"\n",
"**2.2 Why does the PaddleOCR detection prediction only support one image test? That is, test_batch_size_per_card=1**\n",
"\n",
"**A**: When predicting, the image is scaled in equal proportions, the longest side is 960, and the length and width of different images after scaling in equal proportions are inconsistent, and they cannot form a batch, so set test_batch_size to 1.\n",
"\n",
"\n",
"**2.3 Accelerate PaddleOCR's text detection model prediction on the CPU?**\n",
"\n",
"**A**: x86 CPU can use mkldnn (OneDNN) for acceleration; enable [enable_mkldnn](https://github.com/PaddlePaddle/PaddleOCR/blob/8b656a3e13631dfb1ac21d2095d4d4a4993ef710/tools/infer/utility.py#L105) Parameters. In addition, in conjunction with increasing the number of threads used for prediction on the CPU, [num_threads](https://github.com/PaddlePaddle/PaddleOCR/blob/8b656a3e13631dfb1ac21d2095d4d4a4993ef710/tools/infer/utility.py#L106) can effectively speed up the prediction speed on the CPU.\n",
"\n",
"**2.4 Accelerate PaddleOCR's text detection model prediction on GPU?**\n",
"\n",
"**A**: TensorRT is recommended for GPU accelerated prediction.\n",
"- 1. Download the Paddle installation package or prediction library with TensorRT from [link](https://paddleinference.paddlepaddle.org.cn/master/user_guides/download_lib.html).\n",
"- 2. Download the [TensorRT](https://developer.nvidia.com/tensorrt) from the Nvidia official website. Note that the downloaded TensorRT version is consistent with the TensorRT version compiled in the paddle installation package.\n",
"- 3. Set the environment variable `LD_LIBRARY_PATH` to point to the lib folder of TensorRT\n",
"```\n",
"export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:<TensorRT-${version}/lib>\n",
"```\n",
"- 4. Enable [tensorrt option](https://github.com/PaddlePaddle/PaddleOCR/blob/8b656a3e13631dfb1ac21d2095d4d4a4993ef710/tools/infer/utility.py?_pjax=%23js-repo-pjax-container%2%5Bitemtype%3D%22http%3A%2F%2Fschema.org%2FSoftwareSourceCode%22%5D%20main%2C%20%5Bdata-pjax-container%5D#L38).\n",
"\n",
"**2.5 How to deploy PaddleOCR model on the mobile terminal?**\n",
"\n",
"**A**: Flying Oar Paddle has a special tool for mobile deployment [PaddleLite](https://github.com/PaddlePaddle/Paddle-Lite), and PaddleOCR provides DB+CRNN as the demo android arm deployment code , Refer to [link](https://github.com/PaddlePaddle/PaddleOCR/blob/release%2F2.3/deploy/lite/readme.md).\n",
"\n",
"\n",
"**2.6 How to use PaddleOCR multi-process prediction?**\n",
"\n",
"**A**: PaddleOCR recently added [Multi-Process Predictive Control Parameters](https://github.com/PaddlePaddle/PaddleOCR/blob/8b656a3e13631dfb1ac21d2095d4d4a4993ef710/tools/infer/utility.py?_pjax=%23js-repo-pjax-container%2C%20div%5Bitemtype%3D%22http%3A%2F%2Fschema.org%2FSoftwareSourceCode%22%5D%20main%2C%20%5Bdata-pjax-container%5D#L111), `use_mp` indicates whether When using multiple processes, `total_process_num` indicates the number of processes when using multiple processes. For specific usage, please refer to [document](https://github.com/PaddlePaddle/PaddleOCR/blob/release%2F2.3/doc/doc_ch/inference.md#1-%E8%B6%85%E8%BD%BB%E9%87%8F%E4%B8%AD%E6%96%87ocr%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86).\n",
"\n",
"**2.7 Video memory explosion and memory leak during prediction?**\n",
"\n",
"**A**: If it is the prediction of the training model, the video memory is not enough because the model is too large or the input image is too large, you can refer to the code and add paddle.no_grad() before the main function runs to reduce the video memory usage. If the memory usage of the inference model is too high, you can add [config.enable_memory_optim()](https://github.com/PaddlePaddle/PaddleOCR/blob/8b656a3e13631dfb1ac21d2095d4d4a4993ef710/tools/infer/utility.py?_pjax=%23js-repo-pjax-container%2C%20div%5Bitemtype%3D%22http%3A%2F%2Fschema.org%2FSoftwareSourceCode%22%5D%20main%2C%20%5Bdata-pjax-container%5D#L267) to reduce the memory usage when configuring Config.\n",
"\n",
"In addition, regarding the memory leak when using Paddle to predict, it is recommended to install the latest version of paddle. The memory leak has been fixed."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"tags": []
},
"source": [
"# 1. Course Prerequisites\n",
"\n",
"The OCR model involved in this course is based on deep learning, so its related basic knowledge, environment configuration, project engineering and other materials will be introduced in this section, especially for readers who are not familiar with deep learning. content.\n",
"\n",
"### 1.1 Preliminary Knowledge\n",
"\n",
"The \"learning\" of deep learning has been developed from the content of neurons, perceptrons, and multilayer neural networks in machine learning. Therefore, understanding the basic machine learning algorithms is of great help to the understanding and application of deep learning. The \"deepness\" of deep learning is embodied in a series of vector-based mathematical operations such as convolution and pooling used in the process of processing a large amount of information. If you lack the theoretical foundation of the two, you can learn from teacher Li Hongyi's [Linear Algebra](https://aistudio.baidu.com/aistudio/course/introduce/2063) and [Machine Learning](https://aistudio.baidu.com/aistudio/course/introduce/1978) courses.\n",
"\n",
"For the understanding of deep learning itself, you can refer to the zero-based course of Bai Ran, an outstanding architect of Baidu: [Baidu architects take you hands-on with zero-based practice deep learning](https://aistudio.baidu.com/aistudio/course/introduce/1297), which covers the development history of deep learning and introduces the complete components of deep learning through a classic case. It is a set of practice-oriented deep learning courses.\n",
"\n",
"For the practice of theoretical knowledge, [Python basic knowledge](https://aistudio.baidu.com/aistudio/course/introduce/1224) is essential. At the same time, in order to quickly reproduce the deep learning model, the deep learning framework used in this course For: Flying PaddlePaddle. If you have used other frameworks, you can quickly learn how to use flying paddles through [Quick Start Document](https://www.paddlepaddle.org.cn/documentation/docs/zh/practices/quick_start/hello_paddle.html).\n",
"\n",
"### 1.2 Basic Environment Preparation\n",
"\n",
"If you want to run the code of this course in a local environment and have not built a Python environment before, you can follow the [zero-base operating environment preparation](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.3/doc/doc_ch/environment.md), install Anaconda or docker environment according to your operating system.\n",
"\n",
"If you don't have local resources, you can run the code through the AI Studio training platform. Each item in it is presented in a notebook, which is convenient for developers to learn. If you are not familiar with the related operations of Notebook, you can refer to [AI Studio Project Description](https://ai.baidu.com/ai-doc/AISTUDIO/0k3e2tfzm).\n",
"\n",
"### 1.3 Get and Run the Code\n",
"\n",
"This course relies on the formation of PaddleOCR's code repository. First, clone the complete project of PaddleOCR:\n",
"\n",
"```bash\n",
"# [recommend]\n",
"git clone https://github.com/PaddlePaddle/PaddleOCR\n",
"\n",
"# If you cannot pull successfully due to network problems, you can also choose to use the hosting on Code Cloud:\n",
"git clone https://gitee.com/paddlepaddle/PaddleOCR\n",
"```\n",
"\n",
"> Note: The code cloud hosted code may not be able to synchronize the update of this github project in real time, there is a delay of 3~5 days, please use the recommended method first.\n",
">\n",
"> If you are not familiar with git operations, you can download the compressed package directly from the `Code` on the homepage of PaddleOCR\n",
"\n",
"Then install third-party libraries:\n",
"\n",
"```bash\n",
"cd PaddleOCR\n",
"pip3 install -r requirements.txt\n",
"```\n",
"\n",
"\n",
"\n",
"### 1.4 Access to Information\n",
"\n",
"[PaddleOCR Usage Document](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.3/README.md) describes in detail how to use PaddleOCR to complete model application, training and deployment. The document is rich in content, most of the user’s questions are described in the document or FAQ, especially in [FAQ](https://github.com/PaddlePaddle/PaddleOCR/blob/release%2F2.3/doc/doc_en/FAQ_en.md), in accordance with the application process of deep learning, has precipitated the user's common questions, it is recommended that you read it carefully.\n",
"\n",
"### 1.5 Ask for Help\n",
"\n",
"If you encounter BUG, ease of use or documentation related issues while using PaddleOCR, you can contact the official via [Github issue](https://github.com/PaddlePaddle/PaddleOCR/issues), please follow the issue template Provide as much information as possible so that official personnel can quickly locate the problem. At the same time, the WeChat group is the daily communication position for the majority of PaddleOCR users, and it is more suitable for asking some consulting questions. In addition to the PaddleOCR team members, there will also be enthusiastic developers answering your questions."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "py35-paddle1.2.0"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
...@@ -47,8 +47,8 @@ __all__ = [ ...@@ -47,8 +47,8 @@ __all__ = [
] ]
SUPPORT_DET_MODEL = ['DB'] SUPPORT_DET_MODEL = ['DB']
VERSION = '2.5.0.1' VERSION = '2.5.0.3'
SUPPORT_REC_MODEL = ['CRNN'] SUPPORT_REC_MODEL = ['CRNN', 'SVTR_LCNet']
BASE_DIR = os.path.expanduser("~/.paddleocr/") BASE_DIR = os.path.expanduser("~/.paddleocr/")
DEFAULT_OCR_MODEL_VERSION = 'PP-OCRv3' DEFAULT_OCR_MODEL_VERSION = 'PP-OCRv3'
......
...@@ -22,6 +22,9 @@ from numpy.fft import fft ...@@ -22,6 +22,9 @@ from numpy.fft import fft
from numpy.linalg import norm from numpy.linalg import norm
import sys import sys
def vector_slope(vec):
assert len(vec) == 2
return abs(vec[1] / (vec[0] + 1e-8))
class FCENetTargets: class FCENetTargets:
"""Generate the ground truth targets of FCENet: Fourier Contour Embedding """Generate the ground truth targets of FCENet: Fourier Contour Embedding
...@@ -233,9 +236,9 @@ class FCENetTargets: ...@@ -233,9 +236,9 @@ class FCENetTargets:
head_inds = [head_start, head_end] head_inds = [head_start, head_end]
tail_inds = [tail_start, tail_end] tail_inds = [tail_start, tail_end]
else: else:
if self.vector_slope(points[1] - points[0]) + self.vector_slope( if vector_slope(points[1] - points[0]) + vector_slope(
points[3] - points[2]) < self.vector_slope(points[ points[3] - points[2]) < vector_slope(points[
2] - points[1]) + self.vector_slope(points[0] - points[ 2] - points[1]) + vector_slope(points[0] - points[
3]): 3]):
horizontal_edge_inds = [[0, 1], [2, 3]] horizontal_edge_inds = [[0, 1], [2, 3]]
vertical_edge_inds = [[3, 0], [1, 2]] vertical_edge_inds = [[3, 0], [1, 2]]
......
...@@ -15,10 +15,13 @@ ...@@ -15,10 +15,13 @@
import copy import copy
import importlib import importlib
from paddle.jit import to_static
from paddle.static import InputSpec
from .base_model import BaseModel from .base_model import BaseModel
from .distillation_model import DistillationModel from .distillation_model import DistillationModel
__all__ = ['build_model'] __all__ = ["build_model", "apply_to_static"]
def build_model(config): def build_model(config):
...@@ -30,3 +33,36 @@ def build_model(config): ...@@ -30,3 +33,36 @@ def build_model(config):
mod = importlib.import_module(__name__) mod = importlib.import_module(__name__)
arch = getattr(mod, name)(config) arch = getattr(mod, name)(config)
return arch return arch
def apply_to_static(model, config, logger):
if config["Global"].get("to_static", False) is not True:
return model
assert "image_shape" in config[
"Global"], "image_shape must be assigned for static training mode..."
supported_list = ["DB", "SVTR"]
if config["Architecture"]["algorithm"] in ["Distillation"]:
algo = list(config["Architecture"]["Models"].values())[0]["algorithm"]
else:
algo = config["Architecture"]["algorithm"]
assert algo in supported_list, f"algorithms that supports static training must in in {supported_list} but got {algo}"
specs = [
InputSpec(
[None] + config["Global"]["image_shape"], dtype='float32')
]
if algo == "SVTR":
specs.append([
InputSpec(
[None, config["Global"]["max_text_length"]],
dtype='int64'), InputSpec(
[None, config["Global"]["max_text_length"]], dtype='int64'),
InputSpec(
[None], dtype='int64'), InputSpec(
[None], dtype='float64')
])
model = to_static(model, input_spec=specs)
logger.info("Successfully to apply @to_static with specs: {}".format(specs))
return model
...@@ -83,7 +83,7 @@ class SAREncoder(nn.Layer): ...@@ -83,7 +83,7 @@ class SAREncoder(nn.Layer):
def forward(self, feat, img_metas=None): def forward(self, feat, img_metas=None):
if img_metas is not None: if img_metas is not None:
assert len(img_metas[0]) == feat.shape[0] assert len(img_metas[0]) == paddle.shape(feat)[0]
valid_ratios = None valid_ratios = None
if img_metas is not None and self.mask: if img_metas is not None and self.mask:
...@@ -98,9 +98,10 @@ class SAREncoder(nn.Layer): ...@@ -98,9 +98,10 @@ class SAREncoder(nn.Layer):
if valid_ratios is not None: if valid_ratios is not None:
valid_hf = [] valid_hf = []
T = holistic_feat.shape[1] T = paddle.shape(holistic_feat)[1]
for i in range(len(valid_ratios)): for i in range(paddle.shape(valid_ratios)[0]):
valid_step = min(T, math.ceil(T * valid_ratios[i])) - 1 valid_step = paddle.minimum(
T, paddle.ceil(valid_ratios[i] * T).astype('int32')) - 1
valid_hf.append(holistic_feat[i, valid_step, :]) valid_hf.append(holistic_feat[i, valid_step, :])
valid_hf = paddle.stack(valid_hf, axis=0) valid_hf = paddle.stack(valid_hf, axis=0)
else: else:
...@@ -247,13 +248,14 @@ class ParallelSARDecoder(BaseDecoder): ...@@ -247,13 +248,14 @@ class ParallelSARDecoder(BaseDecoder):
# bsz * (seq_len + 1) * h * w * attn_size # bsz * (seq_len + 1) * h * w * attn_size
attn_weight = self.conv1x1_2(attn_weight) attn_weight = self.conv1x1_2(attn_weight)
# bsz * (seq_len + 1) * h * w * 1 # bsz * (seq_len + 1) * h * w * 1
bsz, T, h, w, c = attn_weight.shape bsz, T, h, w, c = paddle.shape(attn_weight)
assert c == 1 assert c == 1
if valid_ratios is not None: if valid_ratios is not None:
# cal mask of attention weight # cal mask of attention weight
for i in range(len(valid_ratios)): for i in range(paddle.shape(valid_ratios)[0]):
valid_width = min(w, math.ceil(w * valid_ratios[i])) valid_width = paddle.minimum(
w, paddle.ceil(valid_ratios[i] * w).astype("int32"))
if valid_width < w: if valid_width < w:
attn_weight[i, :, :, valid_width:, :] = float('-inf') attn_weight[i, :, :, valid_width:, :] = float('-inf')
...@@ -288,7 +290,7 @@ class ParallelSARDecoder(BaseDecoder): ...@@ -288,7 +290,7 @@ class ParallelSARDecoder(BaseDecoder):
img_metas: [label, valid_ratio] img_metas: [label, valid_ratio]
''' '''
if img_metas is not None: if img_metas is not None:
assert len(img_metas[0]) == feat.shape[0] assert paddle.shape(img_metas[0])[0] == paddle.shape(feat)[0]
valid_ratios = None valid_ratios = None
if img_metas is not None and self.mask: if img_metas is not None and self.mask:
...@@ -302,7 +304,6 @@ class ParallelSARDecoder(BaseDecoder): ...@@ -302,7 +304,6 @@ class ParallelSARDecoder(BaseDecoder):
# bsz * (seq_len + 1) * C # bsz * (seq_len + 1) * C
out_dec = self._2d_attention( out_dec = self._2d_attention(
in_dec, feat, out_enc, valid_ratios=valid_ratios) in_dec, feat, out_enc, valid_ratios=valid_ratios)
# bsz * (seq_len + 1) * num_classes
return out_dec[:, 1:, :] # bsz * seq_len * num_classes return out_dec[:, 1:, :] # bsz * seq_len * num_classes
...@@ -395,7 +396,6 @@ class SARHead(nn.Layer): ...@@ -395,7 +396,6 @@ class SARHead(nn.Layer):
if self.training: if self.training:
label = targets[0] # label label = targets[0] # label
label = paddle.to_tensor(label, dtype='int64')
final_out = self.decoder( final_out = self.decoder(
feat, holistic_feat, label, img_metas=targets) feat, holistic_feat, label, img_metas=targets)
else: else:
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
opencv-contrib-python==4.4.0.46
pypandoc
python-docx
\ No newline at end of file
...@@ -61,6 +61,11 @@ def init_args(): ...@@ -61,6 +61,11 @@ def init_args():
type=str2bool, type=str2bool,
default=True, default=True,
help='In the forward, whether the non-table area is recognition by ocr') help='In the forward, whether the non-table area is recognition by ocr')
parser.add_argument(
"--recovery",
type=bool,
default=False,
help='Whether to enable layout of recovery')
return parser return parser
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册