diff --git a/PPOCRLabel/PPOCRLabel.py b/PPOCRLabel/PPOCRLabel.py
index f09a998a9dcbddb3d1239ec61c7a194751362618..7c7802a73ed32680142f8119b10a0393d1fab9cc 100644
--- a/PPOCRLabel/PPOCRLabel.py
+++ b/PPOCRLabel/PPOCRLabel.py
@@ -1439,8 +1439,8 @@ class MainWindow(QMainWindow):
DEFAULT_LOCK_COLOR, key_cls, box['difficult']))
if imgidx in self.PPlabel.keys():
for box in self.PPlabel[imgidx]:
- key_cls = None if not self.kie_mode else box['key_cls']
- shapes.append((box['transcription'], box['points'], None, key_cls, box['difficult']))
+ key_cls = None if not self.kie_mode else box.get('key_cls', 'None')
+ shapes.append((box['transcription'], box['points'], None, key_cls, box.get('difficult', False)))
self.loadLabels(shapes)
self.canvas.verified = False
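The hunk above replaces direct key lookups with `dict.get()` defaults, so label files written before the KIE fields were introduced still load instead of raising `KeyError`. A minimal sketch of the behaviour (editorial note, not part of the patch; the sample entry below is hypothetical):

```python
# Hypothetical PPlabel entry from an older label file that has no KIE fields.
old_style_box = {
    "transcription": "PaddleOCR",
    "points": [[10, 10], [120, 10], [120, 40], [10, 40]],
    # note: no "key_cls" and no "difficult" key
}

# Before the change, box['key_cls'] / box['difficult'] would raise KeyError here.
# After the change, missing keys fall back to safe defaults:
key_cls = old_style_box.get("key_cls", "None")      # -> "None"
difficult = old_style_box.get("difficult", False)   # -> False
print(key_cls, difficult)
```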
@@ -1492,7 +1492,7 @@ class MainWindow(QMainWindow):
event.ignore()
else:
settings = self.settings
- # If it loads images from dir, don't load it at the begining
+ # If it loads images from dir, don't load it at the beginning
if self.dirname is None:
settings[SETTING_FILENAME] = self.filePath if self.filePath else ''
else:
@@ -1584,7 +1584,7 @@ class MainWindow(QMainWindow):
for image, info in label_dict.items():
for box in info:
if "key_cls" not in box:
- continue
+ box.update({"key_cls": "None"})
self.existed_key_cls_set.add(box["key_cls"])
if len(self.existed_key_cls_set) > 0:
for key_text in self.existed_key_cls_set:
@@ -1606,8 +1606,6 @@ class MainWindow(QMainWindow):
fit_to_content={'column': True, 'row': False},
flags=None
)
- else:
- self.keyDialog.labelList.addItems(self.existed_key_cls_set)
def importDirImages(self, dirpath, isDelete=False):
if not self.mayContinue() or not dirpath:
diff --git a/PPOCRLabel/README.md b/PPOCRLabel/README.md
index 0a1a7536912b6591517050bfa62260aaef6077cd..95baf66b297bb17b9396a20b8aa156fbd8273e39 100644
--- a/PPOCRLabel/README.md
+++ b/PPOCRLabel/README.md
@@ -12,6 +12,7 @@ PPOCRLabel is a semi-automatic graphic annotation tool suitable for OCR field, w
- Add KIE Mode by using `--kie`, for [detection + identification + keyword extraction] labeling.
- 2022.01:(by [PeterH0323](https://github.com/peterh0323) )
- Improve user experience: prompt for the number of files and labels, optimize interaction, and fix bugs such as only use CPU when inference
+  - New feature: Support using `C` or `X` to rotate the annotation box.
- 2021.11.17:
- Support install and start PPOCRLabel through the whl package (by [d2623587501](https://github.com/d2623587501))
- Dataset segmentation: Divide the annotation file into training, verification and testing parts (refer to section 3.5 below, by [MrCuiHao](https://github.com/MrCuiHao))
diff --git a/PPOCRLabel/README_ch.md b/PPOCRLabel/README_ch.md
index 99c088de83f9cba775733a0473b50596683c47ab..5534b308f3b49905bfaa33b24e128e2bbcc1ef5b 100644
--- a/PPOCRLabel/README_ch.md
+++ b/PPOCRLabel/README_ch.md
@@ -11,7 +11,8 @@ PPOCRLabel是一款适用于OCR领域的半自动化图形标注工具,内置P
- 2022.02:(by [PeterH0323](https://github.com/peterh0323) )
- 新增:使用 `--kie` 进入 KIE 功能,用于打【检测+识别+关键字提取】的标签
- 2022.01:(by [PeterH0323](https://github.com/peterh0323) )
- - 提升用户体验:新增文件与标记数目提示、优化交互、修复gpu使用等问题
+ - 提升用户体验:新增文件与标记数目提示、优化交互、修复gpu使用等问题。
+ - 新增功能:使用 `C` 和 `X` 对标记框进行旋转。
- 2021.11.17:
- 新增支持通过whl包安装和启动PPOCRLabel(by [d2623587501](https://github.com/d2623587501))
- 标注数据集切分:对标注数据进行训练、验证与测试集划分(参考下方3.5节,by [MrCuiHao](https://github.com/MrCuiHao))
diff --git a/README.md b/README.md
index df53191a12b2a64290db25ba4c6c600f2e489628..259ccb5aa02352ca2a2b81bf81d858cec2b47081 100644
--- a/README.md
+++ b/README.md
@@ -68,6 +68,8 @@ PaddleOCR support a variety of cutting-edge algorithms related to OCR, and devel
| Model introduction | Model name | Recommended scene | Detection model | Direction classifier | Recognition model |
| ------------------------------------------------------------ | ---------------------------- | ----------------- | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ |
+| Chinese and English ultra-lightweight PP-OCRv3 model(16.2M) | ch_PP-OCRv3_xx | Mobile & Server | [inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar) | [inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | [inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_train.tar) |
+| English ultra-lightweight PP-OCRv3 model(13.4M) | en_PP-OCRv3_xx | Mobile & Server | [inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_distill_train.tar) | [inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | [inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_train.tar) |
| Chinese and English ultra-lightweight PP-OCRv2 model(11.6M) | ch_PP-OCRv2_xx |Mobile & Server|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_distill_train.tar)| [inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_train.tar)|
| Chinese and English ultra-lightweight PP-OCR model (9.4M) | ch_ppocr_mobile_v2.0_xx | Mobile & server |[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_train.tar)|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) |[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_train.tar) |
| Chinese and English general PP-OCR model (143.4M) | ch_ppocr_server_v2.0_xx | Server |[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_train.tar) |[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) |[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_train.tar) |
diff --git a/README_ch.md b/README_ch.md
index e2e6835c884da16928e7d36419c6daa6895dd9e1..c040853074b3b6f99895ee984ed9828140fa5713 100755
--- a/README_ch.md
+++ b/README_ch.md
@@ -71,6 +71,8 @@ PaddleOCR旨在打造一套丰富、领先、且实用的OCR工具库,助力
| 模型简介 | 模型名称 | 推荐场景 | 检测模型 | 方向分类器 | 识别模型 |
| ------------------------------------- | ----------------------- | --------------- | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ |
+| 中英文超轻量PP-OCRv3模型(16.2M) | ch_PP-OCRv3_xx | 移动端&服务器端 | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_train.tar) |
+| 英文超轻量PP-OCRv3模型(13.4M) | en_PP-OCRv3_xx | 移动端&服务器端 | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_distill_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_train.tar) |
| 中英文超轻量PP-OCRv2模型(13.0M) | ch_PP-OCRv2_xx | 移动端&服务器端 | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_distill_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_train.tar) |
| 中英文超轻量PP-OCR mobile模型(9.4M) | ch_ppocr_mobile_v2.0_xx | 移动端&服务器端 | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_pre.tar) |
| 中英文通用PP-OCR server模型(143.4M) | ch_ppocr_server_v2.0_xx | 服务器端 | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_pre.tar) |
diff --git "a/applications/\345\244\232\346\250\241\346\200\201\350\241\250\345\215\225\350\257\206\345\210\253.md" "b/applications/\345\244\232\346\250\241\346\200\201\350\241\250\345\215\225\350\257\206\345\210\253.md"
index 2143a6da86abb5eb83788944cd0381b91dad86c1..e64a22e169482ae51cadf8b25d75c5d98651e80b 100644
--- "a/applications/\345\244\232\346\250\241\346\200\201\350\241\250\345\215\225\350\257\206\345\210\253.md"
+++ "b/applications/\345\244\232\346\250\241\346\200\201\350\241\250\345\215\225\350\257\206\345\210\253.md"
@@ -809,7 +809,7 @@ plt.imshow(img)
```
fout.write(img_path + "\t" + json.dumps(
{
- "ser_resule": result,
+ "ser_result": result,
}, ensure_ascii=False) + "\n")
```
diff --git a/configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_dml.yml b/configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_dml.yml
index 0553a65e448db21f196389f6abba7bf8808f0494..f3ad966488dbc2f6f7ca12033bc4a3d35e1b3bd7 100644
--- a/configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_dml.yml
+++ b/configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_dml.yml
@@ -37,7 +37,7 @@ Architecture:
Head:
name: DBHead
k: 50
- Student2:
+ Teacher:
pretrained: ./pretrain_models/MobileNetV3_large_x0_5_pretrained
freeze_params: false
return_all_feats: false
@@ -62,15 +62,15 @@ Loss:
loss_config_list:
- DistillationDMLLoss:
model_name_pairs:
- - ["Student", "Student2"]
+ - ["Student", "Teacher"]
maps_name: "thrink_maps"
weight: 1.0
act: "softmax"
- model_name_pairs: ["Student", "Student2"]
+ model_name_pairs: ["Student", "Teacher"]
key: maps
- DistillationDBLoss:
weight: 1.0
- model_name_list: ["Student", "Student2"]
+ model_name_list: ["Student", "Teacher"]
# key: maps
name: DBLoss
balance_loss: true
@@ -94,7 +94,7 @@ Optimizer:
PostProcess:
name: DistillationDBPostProcess
- model_name: ["Student", "Student2"]
+ model_name: ["Student", "Teacher"]
key: head_out
thresh: 0.3
box_thresh: 0.6
diff --git a/configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_dml.yml b/configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_dml.yml
new file mode 100644
index 0000000000000000000000000000000000000000..3f30ada13f2f1c3c01ba8886bbfba006da516f17
--- /dev/null
+++ b/configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_dml.yml
@@ -0,0 +1,173 @@
+Global:
+ use_gpu: true
+ epoch_num: 1200
+ log_smooth_window: 20
+ print_batch_step: 2
+ save_model_dir: ./output/ch_db_mv3/
+ save_epoch_step: 1200
+  # evaluation is run every 2000 iterations after the 3000th iteration
+ eval_batch_step: [3000, 2000]
+ cal_metric_during_train: False
+ pretrained_model: ./pretrain_models/MobileNetV3_large_x0_5_pretrained
+ checkpoints:
+ save_inference_dir:
+ use_visualdl: False
+ infer_img: doc/imgs_en/img_10.jpg
+ save_res_path: ./output/det_db/predicts_db.txt
+
+Architecture:
+ name: DistillationModel
+ algorithm: Distillation
+ model_type: det
+ Models:
+ Student:
+ return_all_feats: false
+ model_type: det
+ algorithm: DB
+ Backbone:
+ name: ResNet
+ in_channels: 3
+ layers: 50
+ Neck:
+ name: LKPAN
+ out_channels: 256
+ Head:
+ name: DBHead
+ kernel_list: [7,2,2]
+ k: 50
+ Student2:
+ return_all_feats: false
+ model_type: det
+ algorithm: DB
+ Backbone:
+ name: ResNet
+ in_channels: 3
+ layers: 50
+ Neck:
+ name: LKPAN
+ out_channels: 256
+ Head:
+ name: DBHead
+ kernel_list: [7,2,2]
+ k: 50
+
+
+Loss:
+ name: CombinedLoss
+ loss_config_list:
+ - DistillationDMLLoss:
+ model_name_pairs:
+ - ["Student", "Student2"]
+ maps_name: "thrink_maps"
+ weight: 1.0
+ act: "softmax"
+ model_name_pairs: ["Student", "Student2"]
+ key: maps
+ - DistillationDBLoss:
+ weight: 1.0
+ model_name_list: ["Student", "Student2"]
+ # key: maps
+ name: DBLoss
+ balance_loss: true
+ main_loss_type: DiceLoss
+ alpha: 5
+ beta: 10
+ ohem_ratio: 3
+
+
+Optimizer:
+ name: Adam
+ beta1: 0.9
+ beta2: 0.999
+ lr:
+ name: Cosine
+ learning_rate: 0.001
+ warmup_epoch: 2
+ regularizer:
+ name: 'L2'
+ factor: 0
+
+PostProcess:
+ name: DistillationDBPostProcess
+ model_name: ["Student", "Student2"]
+ key: head_out
+ thresh: 0.3
+ box_thresh: 0.6
+ max_candidates: 1000
+ unclip_ratio: 1.5
+
+Metric:
+ name: DistillationMetric
+ base_metric_name: DetMetric
+ main_indicator: hmean
+ key: "Student"
+
+Train:
+ dataset:
+ name: SimpleDataSet
+ data_dir: ./train_data/icdar2015/text_localization/
+ label_file_list:
+ - ./train_data/icdar2015/text_localization/train_icdar2015_label.txt
+ ratio_list: [1.0]
+ transforms:
+ - DecodeImage: # load image
+ img_mode: BGR
+ channel_first: False
+ - DetLabelEncode: # Class handling label
+ - CopyPaste:
+ - IaaAugment:
+ augmenter_args:
+ - { 'type': Fliplr, 'args': { 'p': 0.5 } }
+ - { 'type': Affine, 'args': { 'rotate': [-10, 10] } }
+ - { 'type': Resize, 'args': { 'size': [0.5, 3] } }
+ - EastRandomCropData:
+ size: [960, 960]
+ max_tries: 50
+ keep_ratio: true
+ - MakeBorderMap:
+ shrink_ratio: 0.4
+ thresh_min: 0.3
+ thresh_max: 0.7
+ - MakeShrinkMap:
+ shrink_ratio: 0.4
+ min_text_size: 8
+ - NormalizeImage:
+ scale: 1./255.
+ mean: [0.485, 0.456, 0.406]
+ std: [0.229, 0.224, 0.225]
+ order: 'hwc'
+ - ToCHWImage:
+ - KeepKeys:
+ keep_keys: ['image', 'threshold_map', 'threshold_mask', 'shrink_map', 'shrink_mask'] # the order of the dataloader list
+ loader:
+ shuffle: True
+ drop_last: False
+ batch_size_per_card: 8
+ num_workers: 4
+
+Eval:
+ dataset:
+ name: SimpleDataSet
+ data_dir: ./train_data/icdar2015/text_localization/
+ label_file_list:
+ - ./train_data/icdar2015/text_localization/test_icdar2015_label.txt
+ transforms:
+ - DecodeImage: # load image
+ img_mode: BGR
+ channel_first: False
+ - DetLabelEncode: # Class handling label
+ - DetResizeForTest:
+# image_shape: [736, 1280]
+ - NormalizeImage:
+ scale: 1./255.
+ mean: [0.485, 0.456, 0.406]
+ std: [0.229, 0.224, 0.225]
+ order: 'hwc'
+ - ToCHWImage:
+ - KeepKeys:
+ keep_keys: ['image', 'shape', 'polys', 'ignore_tags']
+ loader:
+ shuffle: False
+ drop_last: False
+ batch_size_per_card: 1 # must be 1
+ num_workers: 2
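This DML config distills two identically configured sub-models, `Student` and `Student2`; the `Loss`, `PostProcess`, and `Metric` sections must reference exactly the names defined under `Architecture.Models` (as the coordinated `Student2` → `Teacher` rename in the PP-OCRv2 config earlier in this diff illustrates, every reference has to be updated together). Below is a minimal editorial sketch of a sanity check, assuming PyYAML is installed and using the config path added by this PR; note that PyYAML keeps the last occurrence when a key such as `model_name_pairs` is repeated.

```python
import yaml

cfg_path = "configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_dml.yml"  # path added by this PR
with open(cfg_path) as f:
    cfg = yaml.safe_load(f)

defined = set(cfg["Architecture"]["Models"])  # e.g. {"Student", "Student2"}

def names_from(value):
    # model_name_pairs may be a flat list of names or a list of [a, b] pairs
    out = set()
    for item in value or []:
        out.update([item] if isinstance(item, str) else item)
    return out

referenced = set()
for loss_item in cfg["Loss"]["loss_config_list"]:
    for params in loss_item.values():
        referenced |= names_from(params.get("model_name_pairs"))
        referenced |= names_from(params.get("model_name_list"))
referenced |= names_from(cfg["PostProcess"].get("model_name"))
referenced.add(cfg["Metric"]["key"])

print("undefined sub-model names:", referenced - defined or "none")
```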
diff --git a/configs/rec/ch_PP-OCRv3/ch_PP-OCRv3_rec.yml b/configs/rec/PP-OCRv3/ch_PP-OCRv3_rec.yml
similarity index 99%
rename from configs/rec/ch_PP-OCRv3/ch_PP-OCRv3_rec.yml
rename to configs/rec/PP-OCRv3/ch_PP-OCRv3_rec.yml
index c45a1a3c8b8edc8a542bc740f6abd958b9a1e701..6453934b7324b2b351aeb6fdf8e4e4de24b022bf 100644
--- a/configs/rec/ch_PP-OCRv3/ch_PP-OCRv3_rec.yml
+++ b/configs/rec/PP-OCRv3/ch_PP-OCRv3_rec.yml
@@ -71,7 +71,7 @@ PostProcess:
Metric:
name: RecMetric
main_indicator: acc
- ignore_space: True
+ ignore_space: False
Train:
dataset:
diff --git a/configs/rec/ch_PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml b/configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml
similarity index 99%
rename from configs/rec/ch_PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml
rename to configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml
index 80ec7c6308aa006f46331503e00368444c425559..773a3649d8378cb39373b5b90837f17f9ecba335 100644
--- a/configs/rec/ch_PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml
+++ b/configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml
@@ -129,7 +129,7 @@ Loss:
key: head_out
multi_head: True
- DistillationSARLoss:
- weight: 1.0
+ weight: 0.5
model_name_list: ["Student", "Teacher"]
key: head_out
multi_head: True
@@ -145,7 +145,7 @@ Metric:
base_metric_name: RecMetric
main_indicator: acc
key: "Student"
- ignore_space: True
+ ignore_space: False
Train:
dataset:
diff --git a/configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml b/configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml
new file mode 100644
index 0000000000000000000000000000000000000000..ff536edec4d6e7a85a6e6c189d56a23ffabc5583
--- /dev/null
+++ b/configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml
@@ -0,0 +1,131 @@
+Global:
+ debug: false
+ use_gpu: true
+ epoch_num: 500
+ log_smooth_window: 20
+ print_batch_step: 10
+ save_model_dir: ./output/v3_en_mobile
+ save_epoch_step: 3
+ eval_batch_step: [0, 2000]
+ cal_metric_during_train: true
+ pretrained_model:
+ checkpoints:
+ save_inference_dir:
+ use_visualdl: false
+ infer_img: doc/imgs_words/ch/word_1.jpg
+ character_dict_path: ppocr/utils/en_dict.txt
+ max_text_length: &max_text_length 25
+ infer_mode: false
+ use_space_char: true
+ distributed: true
+ save_res_path: ./output/rec/predicts_ppocrv3_en.txt
+
+
+Optimizer:
+ name: Adam
+ beta1: 0.9
+ beta2: 0.999
+ lr:
+ name: Cosine
+ learning_rate: 0.001
+ warmup_epoch: 5
+ regularizer:
+ name: L2
+ factor: 3.0e-05
+
+
+Architecture:
+ model_type: rec
+ algorithm: SVTR
+ Transform:
+ Backbone:
+ name: MobileNetV1Enhance
+ scale: 0.5
+ last_conv_stride: [1, 2]
+ last_pool_type: avg
+ Head:
+ name: MultiHead
+ head_list:
+ - CTCHead:
+ Neck:
+ name: svtr
+ dims: 64
+ depth: 2
+ hidden_dims: 120
+ use_guide: True
+ Head:
+ fc_decay: 0.00001
+ - SARHead:
+ enc_dim: 512
+ max_text_length: *max_text_length
+
+Loss:
+ name: MultiLoss
+ loss_config_list:
+ - CTCLoss:
+ - SARLoss:
+
+PostProcess:
+ name: CTCLabelDecode
+
+Metric:
+ name: RecMetric
+ main_indicator: acc
+ ignore_space: False
+
+Train:
+ dataset:
+ name: SimpleDataSet
+ data_dir: ./train_data/
+ ext_op_transform_idx: 1
+ label_file_list:
+ - ./train_data/train_list.txt
+ transforms:
+ - DecodeImage:
+ img_mode: BGR
+ channel_first: false
+ - RecConAug:
+ prob: 0.5
+ ext_data_num: 2
+ image_shape: [48, 320, 3]
+ - RecAug:
+ - MultiLabelEncode:
+ - RecResizeImg:
+ image_shape: [3, 48, 320]
+ - KeepKeys:
+ keep_keys:
+ - image
+ - label_ctc
+ - label_sar
+ - length
+ - valid_ratio
+ loader:
+ shuffle: true
+ batch_size_per_card: 128
+ drop_last: true
+ num_workers: 4
+Eval:
+ dataset:
+ name: SimpleDataSet
+ data_dir: ./train_data
+ label_file_list:
+ - ./train_data/val_list.txt
+ transforms:
+ - DecodeImage:
+ img_mode: BGR
+ channel_first: false
+ - MultiLabelEncode:
+ - RecResizeImg:
+ image_shape: [3, 48, 320]
+ - KeepKeys:
+ keep_keys:
+ - image
+ - label_ctc
+ - label_sar
+ - length
+ - valid_ratio
+ loader:
+ shuffle: false
+ drop_last: false
+ batch_size_per_card: 128
+ num_workers: 4
diff --git a/configs/rec/PP-OCRv3/multi_language/.gitkeep b/configs/rec/PP-OCRv3/multi_language/.gitkeep
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/configs/rec/PP-OCRv3/multi_language/arabic_PP-OCRv3_rec.yml b/configs/rec/PP-OCRv3/multi_language/arabic_PP-OCRv3_rec.yml
new file mode 100644
index 0000000000000000000000000000000000000000..0ad1ab0adc189102ff07094fcda92d4f9ea9c662
--- /dev/null
+++ b/configs/rec/PP-OCRv3/multi_language/arabic_PP-OCRv3_rec.yml
@@ -0,0 +1,131 @@
+Global:
+ debug: false
+ use_gpu: true
+ epoch_num: 500
+ log_smooth_window: 20
+ print_batch_step: 10
+ save_model_dir: ./output/v3_arabic_mobile
+ save_epoch_step: 3
+ eval_batch_step: [0, 2000]
+ cal_metric_during_train: true
+ pretrained_model:
+ checkpoints:
+ save_inference_dir:
+ use_visualdl: false
+ infer_img: doc/imgs_words/ch/word_1.jpg
+ character_dict_path: ppocr/utils/dict/arabic_dict.txt
+ max_text_length: &max_text_length 25
+ infer_mode: false
+ use_space_char: true
+ distributed: true
+ save_res_path: ./output/rec/predicts_ppocrv3_arabic.txt
+
+
+Optimizer:
+ name: Adam
+ beta1: 0.9
+ beta2: 0.999
+ lr:
+ name: Cosine
+ learning_rate: 0.001
+ warmup_epoch: 5
+ regularizer:
+ name: L2
+ factor: 3.0e-05
+
+
+Architecture:
+ model_type: rec
+ algorithm: SVTR
+ Transform:
+ Backbone:
+ name: MobileNetV1Enhance
+ scale: 0.5
+ last_conv_stride: [1, 2]
+ last_pool_type: avg
+ Head:
+ name: MultiHead
+ head_list:
+ - CTCHead:
+ Neck:
+ name: svtr
+ dims: 64
+ depth: 2
+ hidden_dims: 120
+ use_guide: True
+ Head:
+ fc_decay: 0.00001
+ - SARHead:
+ enc_dim: 512
+ max_text_length: *max_text_length
+
+Loss:
+ name: MultiLoss
+ loss_config_list:
+ - CTCLoss:
+ - SARLoss:
+
+PostProcess:
+ name: CTCLabelDecode
+
+Metric:
+ name: RecMetric
+ main_indicator: acc
+ ignore_space: False
+
+Train:
+ dataset:
+ name: SimpleDataSet
+ data_dir: ./train_data/
+ ext_op_transform_idx: 1
+ label_file_list:
+ - ./train_data/train_list.txt
+ transforms:
+ - DecodeImage:
+ img_mode: BGR
+ channel_first: false
+ - RecConAug:
+ prob: 0.5
+ ext_data_num: 2
+ image_shape: [48, 320, 3]
+ - RecAug:
+ - MultiLabelEncode:
+ - RecResizeImg:
+ image_shape: [3, 48, 320]
+ - KeepKeys:
+ keep_keys:
+ - image
+ - label_ctc
+ - label_sar
+ - length
+ - valid_ratio
+ loader:
+ shuffle: true
+ batch_size_per_card: 128
+ drop_last: true
+ num_workers: 4
+Eval:
+ dataset:
+ name: SimpleDataSet
+ data_dir: ./train_data
+ label_file_list:
+ - ./train_data/val_list.txt
+ transforms:
+ - DecodeImage:
+ img_mode: BGR
+ channel_first: false
+ - MultiLabelEncode:
+ - RecResizeImg:
+ image_shape: [3, 48, 320]
+ - KeepKeys:
+ keep_keys:
+ - image
+ - label_ctc
+ - label_sar
+ - length
+ - valid_ratio
+ loader:
+ shuffle: false
+ drop_last: false
+ batch_size_per_card: 128
+ num_workers: 4
diff --git a/configs/rec/PP-OCRv3/multi_language/chinese_cht_PP-OCRv3_rec.yml b/configs/rec/PP-OCRv3/multi_language/chinese_cht_PP-OCRv3_rec.yml
new file mode 100644
index 0000000000000000000000000000000000000000..28e0c10aa0f83fdf8e621aae04bf2b7374255adc
--- /dev/null
+++ b/configs/rec/PP-OCRv3/multi_language/chinese_cht_PP-OCRv3_rec.yml
@@ -0,0 +1,131 @@
+Global:
+ debug: false
+ use_gpu: true
+ epoch_num: 500
+ log_smooth_window: 20
+ print_batch_step: 10
+ save_model_dir: ./output/v3_chinese_cht_mobile
+ save_epoch_step: 3
+ eval_batch_step: [0, 2000]
+ cal_metric_during_train: true
+ pretrained_model:
+ checkpoints:
+ save_inference_dir:
+ use_visualdl: false
+ infer_img: doc/imgs_words/ch/word_1.jpg
+ character_dict_path: ppocr/utils/dict/chinese_cht_dict.txt
+ max_text_length: &max_text_length 25
+ infer_mode: false
+ use_space_char: true
+ distributed: true
+ save_res_path: ./output/rec/predicts_ppocrv3_chinese_cht.txt
+
+
+Optimizer:
+ name: Adam
+ beta1: 0.9
+ beta2: 0.999
+ lr:
+ name: Cosine
+ learning_rate: 0.001
+ warmup_epoch: 5
+ regularizer:
+ name: L2
+ factor: 3.0e-05
+
+
+Architecture:
+ model_type: rec
+ algorithm: SVTR
+ Transform:
+ Backbone:
+ name: MobileNetV1Enhance
+ scale: 0.5
+ last_conv_stride: [1, 2]
+ last_pool_type: avg
+ Head:
+ name: MultiHead
+ head_list:
+ - CTCHead:
+ Neck:
+ name: svtr
+ dims: 64
+ depth: 2
+ hidden_dims: 120
+ use_guide: True
+ Head:
+ fc_decay: 0.00001
+ - SARHead:
+ enc_dim: 512
+ max_text_length: *max_text_length
+
+Loss:
+ name: MultiLoss
+ loss_config_list:
+ - CTCLoss:
+ - SARLoss:
+
+PostProcess:
+ name: CTCLabelDecode
+
+Metric:
+ name: RecMetric
+ main_indicator: acc
+ ignore_space: False
+
+Train:
+ dataset:
+ name: SimpleDataSet
+ data_dir: ./train_data/
+ ext_op_transform_idx: 1
+ label_file_list:
+ - ./train_data/train_list.txt
+ transforms:
+ - DecodeImage:
+ img_mode: BGR
+ channel_first: false
+ - RecConAug:
+ prob: 0.5
+ ext_data_num: 2
+ image_shape: [48, 320, 3]
+ - RecAug:
+ - MultiLabelEncode:
+ - RecResizeImg:
+ image_shape: [3, 48, 320]
+ - KeepKeys:
+ keep_keys:
+ - image
+ - label_ctc
+ - label_sar
+ - length
+ - valid_ratio
+ loader:
+ shuffle: true
+ batch_size_per_card: 128
+ drop_last: true
+ num_workers: 4
+Eval:
+ dataset:
+ name: SimpleDataSet
+ data_dir: ./train_data
+ label_file_list:
+ - ./train_data/val_list.txt
+ transforms:
+ - DecodeImage:
+ img_mode: BGR
+ channel_first: false
+ - MultiLabelEncode:
+ - RecResizeImg:
+ image_shape: [3, 48, 320]
+ - KeepKeys:
+ keep_keys:
+ - image
+ - label_ctc
+ - label_sar
+ - length
+ - valid_ratio
+ loader:
+ shuffle: false
+ drop_last: false
+ batch_size_per_card: 128
+ num_workers: 4
diff --git a/configs/rec/PP-OCRv3/multi_language/cyrillic_PP-OCRv3_rec.yml b/configs/rec/PP-OCRv3/multi_language/cyrillic_PP-OCRv3_rec.yml
new file mode 100644
index 0000000000000000000000000000000000000000..fbdbe6c44c689ea267c9995f832305d800046edb
--- /dev/null
+++ b/configs/rec/PP-OCRv3/multi_language/cyrillic_PP-OCRv3_rec.yml
@@ -0,0 +1,131 @@
+Global:
+ debug: false
+ use_gpu: true
+ epoch_num: 500
+ log_smooth_window: 20
+ print_batch_step: 10
+ save_model_dir: ./output/v3_cyrillic_mobile
+ save_epoch_step: 3
+ eval_batch_step: [0, 2000]
+ cal_metric_during_train: true
+ pretrained_model:
+ checkpoints:
+ save_inference_dir:
+ use_visualdl: false
+ infer_img: doc/imgs_words/ch/word_1.jpg
+ character_dict_path: ppocr/utils/dict/cyrillic_dict.txt
+ max_text_length: &max_text_length 25
+ infer_mode: false
+ use_space_char: true
+ distributed: true
+ save_res_path: ./output/rec/predicts_ppocrv3_cyrillic.txt
+
+
+Optimizer:
+ name: Adam
+ beta1: 0.9
+ beta2: 0.999
+ lr:
+ name: Cosine
+ learning_rate: 0.001
+ warmup_epoch: 5
+ regularizer:
+ name: L2
+ factor: 3.0e-05
+
+
+Architecture:
+ model_type: rec
+ algorithm: SVTR
+ Transform:
+ Backbone:
+ name: MobileNetV1Enhance
+ scale: 0.5
+ last_conv_stride: [1, 2]
+ last_pool_type: avg
+ Head:
+ name: MultiHead
+ head_list:
+ - CTCHead:
+ Neck:
+ name: svtr
+ dims: 64
+ depth: 2
+ hidden_dims: 120
+ use_guide: True
+ Head:
+ fc_decay: 0.00001
+ - SARHead:
+ enc_dim: 512
+ max_text_length: *max_text_length
+
+Loss:
+ name: MultiLoss
+ loss_config_list:
+ - CTCLoss:
+ - SARLoss:
+
+PostProcess:
+ name: CTCLabelDecode
+
+Metric:
+ name: RecMetric
+ main_indicator: acc
+ ignore_space: False
+
+Train:
+ dataset:
+ name: SimpleDataSet
+ data_dir: ./train_data/
+ ext_op_transform_idx: 1
+ label_file_list:
+ - ./train_data/train_list.txt
+ transforms:
+ - DecodeImage:
+ img_mode: BGR
+ channel_first: false
+ - RecConAug:
+ prob: 0.5
+ ext_data_num: 2
+ image_shape: [48, 320, 3]
+ - RecAug:
+ - MultiLabelEncode:
+ - RecResizeImg:
+ image_shape: [3, 48, 320]
+ - KeepKeys:
+ keep_keys:
+ - image
+ - label_ctc
+ - label_sar
+ - length
+ - valid_ratio
+ loader:
+ shuffle: true
+ batch_size_per_card: 128
+ drop_last: true
+ num_workers: 4
+Eval:
+ dataset:
+ name: SimpleDataSet
+ data_dir: ./train_data
+ label_file_list:
+ - ./train_data/val_list.txt
+ transforms:
+ - DecodeImage:
+ img_mode: BGR
+ channel_first: false
+ - MultiLabelEncode:
+ - RecResizeImg:
+ image_shape: [3, 48, 320]
+ - KeepKeys:
+ keep_keys:
+ - image
+ - label_ctc
+ - label_sar
+ - length
+ - valid_ratio
+ loader:
+ shuffle: false
+ drop_last: false
+ batch_size_per_card: 128
+ num_workers: 4
diff --git a/configs/rec/PP-OCRv3/multi_language/devanagari_PP-OCRv3_rec.yml b/configs/rec/PP-OCRv3/multi_language/devanagari_PP-OCRv3_rec.yml
new file mode 100644
index 0000000000000000000000000000000000000000..48eb38df36f931b76b8e9fb8369daf06ad037d25
--- /dev/null
+++ b/configs/rec/PP-OCRv3/multi_language/devanagari_PP-OCRv3_rec.yml
@@ -0,0 +1,131 @@
+Global:
+ debug: false
+ use_gpu: true
+ epoch_num: 500
+ log_smooth_window: 20
+ print_batch_step: 10
+ save_model_dir: ./output/v3_devanagari_mobile
+ save_epoch_step: 3
+ eval_batch_step: [0, 2000]
+ cal_metric_during_train: true
+ pretrained_model:
+ checkpoints:
+ save_inference_dir:
+ use_visualdl: false
+ infer_img: doc/imgs_words/ch/word_1.jpg
+ character_dict_path: ppocr/utils/dict/devanagari_dict.txt
+ max_text_length: &max_text_length 25
+ infer_mode: false
+ use_space_char: true
+ distributed: true
+ save_res_path: ./output/rec/predicts_ppocrv3_devanagari.txt
+
+
+Optimizer:
+ name: Adam
+ beta1: 0.9
+ beta2: 0.999
+ lr:
+ name: Cosine
+ learning_rate: 0.001
+ warmup_epoch: 5
+ regularizer:
+ name: L2
+ factor: 3.0e-05
+
+
+Architecture:
+ model_type: rec
+ algorithm: SVTR
+ Transform:
+ Backbone:
+ name: MobileNetV1Enhance
+ scale: 0.5
+ last_conv_stride: [1, 2]
+ last_pool_type: avg
+ Head:
+ name: MultiHead
+ head_list:
+ - CTCHead:
+ Neck:
+ name: svtr
+ dims: 64
+ depth: 2
+ hidden_dims: 120
+ use_guide: True
+ Head:
+ fc_decay: 0.00001
+ - SARHead:
+ enc_dim: 512
+ max_text_length: *max_text_length
+
+Loss:
+ name: MultiLoss
+ loss_config_list:
+ - CTCLoss:
+ - SARLoss:
+
+PostProcess:
+ name: CTCLabelDecode
+
+Metric:
+ name: RecMetric
+ main_indicator: acc
+ ignore_space: False
+
+Train:
+ dataset:
+ name: SimpleDataSet
+ data_dir: ./train_data/
+ ext_op_transform_idx: 1
+ label_file_list:
+ - ./train_data/train_list.txt
+ transforms:
+ - DecodeImage:
+ img_mode: BGR
+ channel_first: false
+ - RecConAug:
+ prob: 0.5
+ ext_data_num: 2
+ image_shape: [48, 320, 3]
+ - RecAug:
+ - MultiLabelEncode:
+ - RecResizeImg:
+ image_shape: [3, 48, 320]
+ - KeepKeys:
+ keep_keys:
+ - image
+ - label_ctc
+ - label_sar
+ - length
+ - valid_ratio
+ loader:
+ shuffle: true
+ batch_size_per_card: 128
+ drop_last: true
+ num_workers: 4
+Eval:
+ dataset:
+ name: SimpleDataSet
+ data_dir: ./train_data
+ label_file_list:
+ - ./train_data/val_list.txt
+ transforms:
+ - DecodeImage:
+ img_mode: BGR
+ channel_first: false
+ - MultiLabelEncode:
+ - RecResizeImg:
+ image_shape: [3, 48, 320]
+ - KeepKeys:
+ keep_keys:
+ - image
+ - label_ctc
+ - label_sar
+ - length
+ - valid_ratio
+ loader:
+ shuffle: false
+ drop_last: false
+ batch_size_per_card: 128
+ num_workers: 4
diff --git a/configs/rec/PP-OCRv3/multi_language/japan_PP-OCRv3_rec.yml b/configs/rec/PP-OCRv3/multi_language/japan_PP-OCRv3_rec.yml
new file mode 100644
index 0000000000000000000000000000000000000000..6cab0d447247e28bb58b30384d4f9d032d6ce9d0
--- /dev/null
+++ b/configs/rec/PP-OCRv3/multi_language/japan_PP-OCRv3_rec.yml
@@ -0,0 +1,131 @@
+Global:
+ debug: false
+ use_gpu: true
+ epoch_num: 500
+ log_smooth_window: 20
+ print_batch_step: 10
+ save_model_dir: ./output/v3_japan_mobile
+ save_epoch_step: 3
+ eval_batch_step: [0, 2000]
+ cal_metric_during_train: true
+ pretrained_model:
+ checkpoints:
+ save_inference_dir:
+ use_visualdl: false
+ infer_img: doc/imgs_words/ch/word_1.jpg
+ character_dict_path: ppocr/utils/dict/japan_dict.txt
+ max_text_length: &max_text_length 25
+ infer_mode: false
+ use_space_char: true
+ distributed: true
+ save_res_path: ./output/rec/predicts_ppocrv3_japan.txt
+
+
+Optimizer:
+ name: Adam
+ beta1: 0.9
+ beta2: 0.999
+ lr:
+ name: Cosine
+ learning_rate: 0.001
+ warmup_epoch: 5
+ regularizer:
+ name: L2
+ factor: 3.0e-05
+
+
+Architecture:
+ model_type: rec
+ algorithm: SVTR
+ Transform:
+ Backbone:
+ name: MobileNetV1Enhance
+ scale: 0.5
+ last_conv_stride: [1, 2]
+ last_pool_type: avg
+ Head:
+ name: MultiHead
+ head_list:
+ - CTCHead:
+ Neck:
+ name: svtr
+ dims: 64
+ depth: 2
+ hidden_dims: 120
+ use_guide: True
+ Head:
+ fc_decay: 0.00001
+ - SARHead:
+ enc_dim: 512
+ max_text_length: *max_text_length
+
+Loss:
+ name: MultiLoss
+ loss_config_list:
+ - CTCLoss:
+ - SARLoss:
+
+PostProcess:
+ name: CTCLabelDecode
+
+Metric:
+ name: RecMetric
+ main_indicator: acc
+ ignore_space: False
+
+Train:
+ dataset:
+ name: SimpleDataSet
+ data_dir: ./train_data/
+ ext_op_transform_idx: 1
+ label_file_list:
+ - ./train_data/train_list.txt
+ transforms:
+ - DecodeImage:
+ img_mode: BGR
+ channel_first: false
+ - RecConAug:
+ prob: 0.5
+ ext_data_num: 2
+ image_shape: [48, 320, 3]
+ - RecAug:
+ - MultiLabelEncode:
+ - RecResizeImg:
+ image_shape: [3, 48, 320]
+ - KeepKeys:
+ keep_keys:
+ - image
+ - label_ctc
+ - label_sar
+ - length
+ - valid_ratio
+ loader:
+ shuffle: true
+ batch_size_per_card: 128
+ drop_last: true
+ num_workers: 4
+Eval:
+ dataset:
+ name: SimpleDataSet
+ data_dir: ./train_data
+ label_file_list:
+ - ./train_data/val_list.txt
+ transforms:
+ - DecodeImage:
+ img_mode: BGR
+ channel_first: false
+ - MultiLabelEncode:
+ - RecResizeImg:
+ image_shape: [3, 48, 320]
+ - KeepKeys:
+ keep_keys:
+ - image
+ - label_ctc
+ - label_sar
+ - length
+ - valid_ratio
+ loader:
+ shuffle: false
+ drop_last: false
+ batch_size_per_card: 128
+ num_workers: 4
diff --git a/configs/rec/PP-OCRv3/multi_language/ka_PP-OCRv3_rec.yml b/configs/rec/PP-OCRv3/multi_language/ka_PP-OCRv3_rec.yml
new file mode 100644
index 0000000000000000000000000000000000000000..7a9c8241d1564e5f1295655ba64694a117064bd8
--- /dev/null
+++ b/configs/rec/PP-OCRv3/multi_language/ka_PP-OCRv3_rec.yml
@@ -0,0 +1,131 @@
+Global:
+ debug: false
+ use_gpu: true
+ epoch_num: 500
+ log_smooth_window: 20
+ print_batch_step: 10
+ save_model_dir: ./output/v3_ka_mobile
+ save_epoch_step: 3
+ eval_batch_step: [0, 2000]
+ cal_metric_during_train: true
+ pretrained_model:
+ checkpoints:
+ save_inference_dir:
+ use_visualdl: false
+ infer_img: doc/imgs_words/ch/word_1.jpg
+ character_dict_path: ppocr/utils/dict/ka_dict.txt
+ max_text_length: &max_text_length 25
+ infer_mode: false
+ use_space_char: true
+ distributed: true
+ save_res_path: ./output/rec/predicts_ppocrv3_ka.txt
+
+
+Optimizer:
+ name: Adam
+ beta1: 0.9
+ beta2: 0.999
+ lr:
+ name: Cosine
+ learning_rate: 0.001
+ warmup_epoch: 5
+ regularizer:
+ name: L2
+ factor: 3.0e-05
+
+
+Architecture:
+ model_type: rec
+ algorithm: SVTR
+ Transform:
+ Backbone:
+ name: MobileNetV1Enhance
+ scale: 0.5
+ last_conv_stride: [1, 2]
+ last_pool_type: avg
+ Head:
+ name: MultiHead
+ head_list:
+ - CTCHead:
+ Neck:
+ name: svtr
+ dims: 64
+ depth: 2
+ hidden_dims: 120
+ use_guide: True
+ Head:
+ fc_decay: 0.00001
+ - SARHead:
+ enc_dim: 512
+ max_text_length: *max_text_length
+
+Loss:
+ name: MultiLoss
+ loss_config_list:
+ - CTCLoss:
+ - SARLoss:
+
+PostProcess:
+ name: CTCLabelDecode
+
+Metric:
+ name: RecMetric
+ main_indicator: acc
+ ignore_space: False
+
+Train:
+ dataset:
+ name: SimpleDataSet
+ data_dir: ./train_data/
+ ext_op_transform_idx: 1
+ label_file_list:
+ - ./train_data/train_list.txt
+ transforms:
+ - DecodeImage:
+ img_mode: BGR
+ channel_first: false
+ - RecConAug:
+ prob: 0.5
+ ext_data_num: 2
+ image_shape: [48, 320, 3]
+ - RecAug:
+ - MultiLabelEncode:
+ - RecResizeImg:
+ image_shape: [3, 48, 320]
+ - KeepKeys:
+ keep_keys:
+ - image
+ - label_ctc
+ - label_sar
+ - length
+ - valid_ratio
+ loader:
+ shuffle: true
+ batch_size_per_card: 128
+ drop_last: true
+ num_workers: 4
+Eval:
+ dataset:
+ name: SimpleDataSet
+ data_dir: ./train_data
+ label_file_list:
+ - ./train_data/val_list.txt
+ transforms:
+ - DecodeImage:
+ img_mode: BGR
+ channel_first: false
+ - MultiLabelEncode:
+ - RecResizeImg:
+ image_shape: [3, 48, 320]
+ - KeepKeys:
+ keep_keys:
+ - image
+ - label_ctc
+ - label_sar
+ - length
+ - valid_ratio
+ loader:
+ shuffle: false
+ drop_last: false
+ batch_size_per_card: 128
+ num_workers: 4
diff --git a/configs/rec/PP-OCRv3/multi_language/korean_PP-OCRv3_rec.yml b/configs/rec/PP-OCRv3/multi_language/korean_PP-OCRv3_rec.yml
new file mode 100644
index 0000000000000000000000000000000000000000..29ff570772a621ba747e0388bcc0c042db0dba43
--- /dev/null
+++ b/configs/rec/PP-OCRv3/multi_language/korean_PP-OCRv3_rec.yml
@@ -0,0 +1,131 @@
+Global:
+ debug: false
+ use_gpu: true
+ epoch_num: 500
+ log_smooth_window: 20
+ print_batch_step: 10
+ save_model_dir: ./output/v3_korean_mobile
+ save_epoch_step: 3
+ eval_batch_step: [0, 2000]
+ cal_metric_during_train: true
+ pretrained_model:
+ checkpoints:
+ save_inference_dir:
+ use_visualdl: false
+ infer_img: doc/imgs_words/ch/word_1.jpg
+ character_dict_path: ppocr/utils/dict/korean_dict.txt
+ max_text_length: &max_text_length 25
+ infer_mode: false
+ use_space_char: true
+ distributed: true
+ save_res_path: ./output/rec/predicts_ppocrv3_korean.txt
+
+
+Optimizer:
+ name: Adam
+ beta1: 0.9
+ beta2: 0.999
+ lr:
+ name: Cosine
+ learning_rate: 0.001
+ warmup_epoch: 5
+ regularizer:
+ name: L2
+ factor: 3.0e-05
+
+
+Architecture:
+ model_type: rec
+ algorithm: SVTR
+ Transform:
+ Backbone:
+ name: MobileNetV1Enhance
+ scale: 0.5
+ last_conv_stride: [1, 2]
+ last_pool_type: avg
+ Head:
+ name: MultiHead
+ head_list:
+ - CTCHead:
+ Neck:
+ name: svtr
+ dims: 64
+ depth: 2
+ hidden_dims: 120
+ use_guide: True
+ Head:
+ fc_decay: 0.00001
+ - SARHead:
+ enc_dim: 512
+ max_text_length: *max_text_length
+
+Loss:
+ name: MultiLoss
+ loss_config_list:
+ - CTCLoss:
+ - SARLoss:
+
+PostProcess:
+ name: CTCLabelDecode
+
+Metric:
+ name: RecMetric
+ main_indicator: acc
+ ignore_space: False
+
+Train:
+ dataset:
+ name: SimpleDataSet
+ data_dir: ./train_data/
+ ext_op_transform_idx: 1
+ label_file_list:
+ - ./train_data/train_list.txt
+ transforms:
+ - DecodeImage:
+ img_mode: BGR
+ channel_first: false
+ - RecConAug:
+ prob: 0.5
+ ext_data_num: 2
+ image_shape: [48, 320, 3]
+ - RecAug:
+ - MultiLabelEncode:
+ - RecResizeImg:
+ image_shape: [3, 48, 320]
+ - KeepKeys:
+ keep_keys:
+ - image
+ - label_ctc
+ - label_sar
+ - length
+ - valid_ratio
+ loader:
+ shuffle: true
+ batch_size_per_card: 128
+ drop_last: true
+ num_workers: 4
+Eval:
+ dataset:
+ name: SimpleDataSet
+ data_dir: ./train_data
+ label_file_list:
+ - ./train_data/val_list.txt
+ transforms:
+ - DecodeImage:
+ img_mode: BGR
+ channel_first: false
+ - MultiLabelEncode:
+ - RecResizeImg:
+ image_shape: [3, 48, 320]
+ - KeepKeys:
+ keep_keys:
+ - image
+ - label_ctc
+ - label_sar
+ - length
+ - valid_ratio
+ loader:
+ shuffle: false
+ drop_last: false
+ batch_size_per_card: 128
+ num_workers: 4
diff --git a/configs/rec/PP-OCRv3/multi_language/latin_PP-OCRv3_rec.yml b/configs/rec/PP-OCRv3/multi_language/latin_PP-OCRv3_rec.yml
new file mode 100644
index 0000000000000000000000000000000000000000..1784bfe611366c45230fd2abf69ab16e3a1c3ae9
--- /dev/null
+++ b/configs/rec/PP-OCRv3/multi_language/latin_PP-OCRv3_rec.yml
@@ -0,0 +1,131 @@
+Global:
+ debug: false
+ use_gpu: true
+ epoch_num: 500
+ log_smooth_window: 20
+ print_batch_step: 10
+ save_model_dir: ./output/v3_latin_mobile
+ save_epoch_step: 3
+ eval_batch_step: [0, 2000]
+ cal_metric_during_train: true
+ pretrained_model:
+ checkpoints:
+ save_inference_dir:
+ use_visualdl: false
+ infer_img: doc/imgs_words/ch/word_1.jpg
+ character_dict_path: ppocr/utils/dict/latin_dict.txt
+ max_text_length: &max_text_length 25
+ infer_mode: false
+ use_space_char: true
+ distributed: true
+ save_res_path: ./output/rec/predicts_ppocrv3_latin.txt
+
+
+Optimizer:
+ name: Adam
+ beta1: 0.9
+ beta2: 0.999
+ lr:
+ name: Cosine
+ learning_rate: 0.001
+ warmup_epoch: 5
+ regularizer:
+ name: L2
+ factor: 3.0e-05
+
+
+Architecture:
+ model_type: rec
+ algorithm: SVTR
+ Transform:
+ Backbone:
+ name: MobileNetV1Enhance
+ scale: 0.5
+ last_conv_stride: [1, 2]
+ last_pool_type: avg
+ Head:
+ name: MultiHead
+ head_list:
+ - CTCHead:
+ Neck:
+ name: svtr
+ dims: 64
+ depth: 2
+ hidden_dims: 120
+ use_guide: True
+ Head:
+ fc_decay: 0.00001
+ - SARHead:
+ enc_dim: 512
+ max_text_length: *max_text_length
+
+Loss:
+ name: MultiLoss
+ loss_config_list:
+ - CTCLoss:
+ - SARLoss:
+
+PostProcess:
+ name: CTCLabelDecode
+
+Metric:
+ name: RecMetric
+ main_indicator: acc
+ ignore_space: False
+
+Train:
+ dataset:
+ name: SimpleDataSet
+ data_dir: ./train_data/
+ ext_op_transform_idx: 1
+ label_file_list:
+ - ./train_data/train_list.txt
+ transforms:
+ - DecodeImage:
+ img_mode: BGR
+ channel_first: false
+ - RecConAug:
+ prob: 0.5
+ ext_data_num: 2
+ image_shape: [48, 320, 3]
+ - RecAug:
+ - MultiLabelEncode:
+ - RecResizeImg:
+ image_shape: [3, 48, 320]
+ - KeepKeys:
+ keep_keys:
+ - image
+ - label_ctc
+ - label_sar
+ - length
+ - valid_ratio
+ loader:
+ shuffle: true
+ batch_size_per_card: 128
+ drop_last: true
+ num_workers: 4
+Eval:
+ dataset:
+ name: SimpleDataSet
+ data_dir: ./train_data
+ label_file_list:
+ - ./train_data/val_list.txt
+ transforms:
+ - DecodeImage:
+ img_mode: BGR
+ channel_first: false
+ - MultiLabelEncode:
+ - RecResizeImg:
+ image_shape: [3, 48, 320]
+ - KeepKeys:
+ keep_keys:
+ - image
+ - label_ctc
+ - label_sar
+ - length
+ - valid_ratio
+ loader:
+ shuffle: false
+ drop_last: false
+ batch_size_per_card: 128
+ num_workers: 4
diff --git a/configs/rec/PP-OCRv3/multi_language/ta_PP-OCRv3_rec.yml b/configs/rec/PP-OCRv3/multi_language/ta_PP-OCRv3_rec.yml
new file mode 100644
index 0000000000000000000000000000000000000000..70b26aa84a2178111edab9f094c369c5d22e31a9
--- /dev/null
+++ b/configs/rec/PP-OCRv3/multi_language/ta_PP-OCRv3_rec.yml
@@ -0,0 +1,131 @@
+Global:
+ debug: false
+ use_gpu: true
+ epoch_num: 500
+ log_smooth_window: 20
+ print_batch_step: 10
+ save_model_dir: ./output/v3_ta_mobile
+ save_epoch_step: 3
+ eval_batch_step: [0, 2000]
+ cal_metric_during_train: true
+ pretrained_model:
+ checkpoints:
+ save_inference_dir:
+ use_visualdl: false
+ infer_img: doc/imgs_words/ch/word_1.jpg
+ character_dict_path: ppocr/utils/dict/ta_dict.txt
+ max_text_length: &max_text_length 25
+ infer_mode: false
+ use_space_char: true
+ distributed: true
+ save_res_path: ./output/rec/predicts_ppocrv3_ta.txt
+
+
+Optimizer:
+ name: Adam
+ beta1: 0.9
+ beta2: 0.999
+ lr:
+ name: Cosine
+ learning_rate: 0.001
+ warmup_epoch: 5
+ regularizer:
+ name: L2
+ factor: 3.0e-05
+
+
+Architecture:
+ model_type: rec
+ algorithm: SVTR
+ Transform:
+ Backbone:
+ name: MobileNetV1Enhance
+ scale: 0.5
+ last_conv_stride: [1, 2]
+ last_pool_type: avg
+ Head:
+ name: MultiHead
+ head_list:
+ - CTCHead:
+ Neck:
+ name: svtr
+ dims: 64
+ depth: 2
+ hidden_dims: 120
+ use_guide: True
+ Head:
+ fc_decay: 0.00001
+ - SARHead:
+ enc_dim: 512
+ max_text_length: *max_text_length
+
+Loss:
+ name: MultiLoss
+ loss_config_list:
+ - CTCLoss:
+ - SARLoss:
+
+PostProcess:
+ name: CTCLabelDecode
+
+Metric:
+ name: RecMetric
+ main_indicator: acc
+ ignore_space: False
+
+Train:
+ dataset:
+ name: SimpleDataSet
+ data_dir: ./train_data/
+ ext_op_transform_idx: 1
+ label_file_list:
+ - ./train_data/train_list.txt
+ transforms:
+ - DecodeImage:
+ img_mode: BGR
+ channel_first: false
+ - RecConAug:
+ prob: 0.5
+ ext_data_num: 2
+ image_shape: [48, 320, 3]
+ - RecAug:
+ - MultiLabelEncode:
+ - RecResizeImg:
+ image_shape: [3, 48, 320]
+ - KeepKeys:
+ keep_keys:
+ - image
+ - label_ctc
+ - label_sar
+ - length
+ - valid_ratio
+ loader:
+ shuffle: true
+ batch_size_per_card: 128
+ drop_last: true
+ num_workers: 4
+Eval:
+ dataset:
+ name: SimpleDataSet
+ data_dir: ./train_data
+ label_file_list:
+ - ./train_data/val_list.txt
+ transforms:
+ - DecodeImage:
+ img_mode: BGR
+ channel_first: false
+ - MultiLabelEncode:
+ - RecResizeImg:
+ image_shape: [3, 48, 320]
+ - KeepKeys:
+ keep_keys:
+ - image
+ - label_ctc
+ - label_sar
+ - length
+ - valid_ratio
+ loader:
+ shuffle: false
+ drop_last: false
+ batch_size_per_card: 128
+ num_workers: 4
diff --git a/configs/rec/PP-OCRv3/multi_language/te_PP-OCRv3_rec.yml b/configs/rec/PP-OCRv3/multi_language/te_PP-OCRv3_rec.yml
new file mode 100644
index 0000000000000000000000000000000000000000..3617af79e3b9c5a55ef22d549465ba2109618e32
--- /dev/null
+++ b/configs/rec/PP-OCRv3/multi_language/te_PP-OCRv3_rec.yml
@@ -0,0 +1,131 @@
+Global:
+ debug: false
+ use_gpu: true
+ epoch_num: 500
+ log_smooth_window: 20
+ print_batch_step: 10
+ save_model_dir: ./output/v3_te_mobile
+ save_epoch_step: 3
+ eval_batch_step: [0, 2000]
+ cal_metric_during_train: true
+ pretrained_model:
+ checkpoints:
+ save_inference_dir:
+ use_visualdl: false
+ infer_img: doc/imgs_words/ch/word_1.jpg
+ character_dict_path: ppocr/utils/dict/te_dict.txt
+ max_text_length: &max_text_length 25
+ infer_mode: false
+ use_space_char: true
+ distributed: true
+ save_res_path: ./output/rec/predicts_ppocrv3_te.txt
+
+
+Optimizer:
+ name: Adam
+ beta1: 0.9
+ beta2: 0.999
+ lr:
+ name: Cosine
+ learning_rate: 0.001
+ warmup_epoch: 5
+ regularizer:
+ name: L2
+ factor: 3.0e-05
+
+
+Architecture:
+ model_type: rec
+ algorithm: SVTR
+ Transform:
+ Backbone:
+ name: MobileNetV1Enhance
+ scale: 0.5
+ last_conv_stride: [1, 2]
+ last_pool_type: avg
+ Head:
+ name: MultiHead
+ head_list:
+ - CTCHead:
+ Neck:
+ name: svtr
+ dims: 64
+ depth: 2
+ hidden_dims: 120
+ use_guide: True
+ Head:
+ fc_decay: 0.00001
+ - SARHead:
+ enc_dim: 512
+ max_text_length: *max_text_length
+
+Loss:
+ name: MultiLoss
+ loss_config_list:
+ - CTCLoss:
+ - SARLoss:
+
+PostProcess:
+ name: CTCLabelDecode
+
+Metric:
+ name: RecMetric
+ main_indicator: acc
+ ignore_space: False
+
+Train:
+ dataset:
+ name: SimpleDataSet
+ data_dir: ./train_data/
+ ext_op_transform_idx: 1
+ label_file_list:
+ - ./train_data/train_list.txt
+ transforms:
+ - DecodeImage:
+ img_mode: BGR
+ channel_first: false
+ - RecConAug:
+ prob: 0.5
+ ext_data_num: 2
+ image_shape: [48, 320, 3]
+ - RecAug:
+ - MultiLabelEncode:
+ - RecResizeImg:
+ image_shape: [3, 48, 320]
+ - KeepKeys:
+ keep_keys:
+ - image
+ - label_ctc
+ - label_sar
+ - length
+ - valid_ratio
+ loader:
+ shuffle: true
+ batch_size_per_card: 128
+ drop_last: true
+ num_workers: 4
+Eval:
+ dataset:
+ name: SimpleDataSet
+ data_dir: ./train_data
+ label_file_list:
+ - ./train_data/val_list.txt
+ transforms:
+ - DecodeImage:
+ img_mode: BGR
+ channel_first: false
+ - MultiLabelEncode:
+ - RecResizeImg:
+ image_shape: [3, 48, 320]
+ - KeepKeys:
+ keep_keys:
+ - image
+ - label_ctc
+ - label_sar
+ - length
+ - valid_ratio
+ loader:
+ shuffle: false
+ drop_last: false
+ batch_size_per_card: 128
+ num_workers: 4
diff --git a/deploy/Jetson/images/00057937.jpg b/deploy/Jetson/images/00057937.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..a35896e78d122c1916db8ff4135068e8d3a0d419
Binary files /dev/null and b/deploy/Jetson/images/00057937.jpg differ
diff --git a/deploy/Jetson/images/det_res_french_0.jpg b/deploy/Jetson/images/det_res_french_0.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..5f0e4886dfa542319034f529c4f08ece7ffd65cf
Binary files /dev/null and b/deploy/Jetson/images/det_res_french_0.jpg differ
diff --git a/deploy/Jetson/readme.md b/deploy/Jetson/readme.md
new file mode 100644
index 0000000000000000000000000000000000000000..14c88c3c3c8aa8e4bbfe6ce04cb51c58af762351
--- /dev/null
+++ b/deploy/Jetson/readme.md
@@ -0,0 +1,84 @@
+English | [简体中文](readme_ch.md)
+
+# Jetson Deployment for PaddleOCR
+
+This section introduces the deployment of PaddleOCR on Jetson NX, TX2, Nano, AGX, and other Jetson-series hardware.
+
+
+## 1. Prepare Environment
+
+You need to prepare a Jetson development board. If you need TensorRT inference, also prepare the TensorRT environment; TensorRT 7.1.3 is recommended.
+
+1. Install PaddlePaddle in Jetson
+
+Download PaddlePaddle from this [link](https://www.paddlepaddle.org.cn/inference/user_guides/download_lib.html#python).
+Please select the installation package that matches your JetPack version, CUDA version, and TensorRT version. Here, we download paddlepaddle_gpu-2.3.0rc0-cp36-cp36m-linux_aarch64.whl.
+
+Install PaddlePaddle:
+```shell
+pip3 install -U paddlepaddle_gpu-2.3.0rc0-cp36-cp36m-linux_aarch64.whl
+```
+
+
+2. Download PaddleOCR code and install dependencies
+
+Clone the PaddleOCR code:
+```
+git clone https://github.com/PaddlePaddle/PaddleOCR
+```
+
+and install dependencies:
+```
+cd PaddleOCR
+pip3 install -r requirements.txt
+```
+
+*Note: The Jetson CPU is relatively weak, so dependency installation can be slow; please be patient.*
+
+## 2. Perform prediction
+
+Obtain the PP-OCR models from the model list in the [document](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/doc/doc_en/ppocr_introduction_en.md#6-model-zoo). The following takes the PP-OCRv3 model as an example to show how to use PP-OCR models on Jetson:
+
+Download the PP-OCRv3 models and unzip them into the `./inference/` directory of PaddleOCR.
+```
+wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar
+wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar
+tar xf ch_PP-OCRv3_det_infer.tar
+tar xf ch_PP-OCRv3_rec_infer.tar
+```
+
+The text detection inference:
+```
+cd PaddleOCR
+python3 tools/infer/predict_det.py --det_model_dir=./inference/ch_PP-OCRv3_det_infer/ --image_dir=./doc/imgs/french_0.jpg --use_gpu=True
+```
+
+After executing the command, the predicted information will be printed out in the terminal, and the visualization results will be saved in the `./inference_results/` directory.
+
+
+
+The text recognition inference:
+```
+python3 tools/infer/predict_rec.py --rec_model_dir=./inference/ch_PP-OCRv3_rec_infer/ --image_dir=./doc/imgs_words/en/word_2.png --use_gpu=True --rec_image_shape="3,48,320"
+```
+
+After executing the command, the predicted information will be printed on the terminal, and the output is as follows:
+```
+[2022/04/28 15:41:45] root INFO: Predicts of ./doc/imgs_words/en/word_2.png:('yourself', 0.98084533)
+```
+
+The text detection and text recognition inference:
+
+```
+python3 tools/infer/predict_system.py --det_model_dir=./inference/ch_PP-OCRv3_det_infer/ --rec_model_dir=./inference/ch_PP-OCRv3_rec_infer/ --image_dir=./doc/imgs/00057937.jpg --use_gpu=True --rec_image_shape="3,48,320"
+```
+
+After executing the command, the predicted information will be printed out in the terminal, and the visualization results will be saved in the `./inference_results/` directory.
+
+
+To enable TensorRT prediction, simply add `--use_tensorrt=True` to the above command:
+```
+python3 tools/infer/predict_system.py --det_model_dir=./inference/ch_PP-OCRv3_det_infer/ --rec_model_dir=./inference/ch_PP-OCRv3_rec_infer/ --image_dir=./doc/imgs/ --rec_image_shape="3,48,320" --use_gpu=True --use_tensorrt=True
+```
+
+For inference with more PP-OCR models, please refer to the [document](../../doc/doc_en/models_list_en.md).
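The commands above drive inference through the `tools/infer/predict_*.py` scripts. As an editorial aside, roughly the same det+rec pipeline can also be invoked from Python through the `paddleocr` package once the wheel is installed on the device; the sketch below is an assumption to verify rather than part of this guide, since constructor arguments and the result layout vary between paddleocr versions.

```python
# Editorial sketch: det+rec via the paddleocr Python API instead of predict_system.py.
# Assumes `pip3 install paddleocr` succeeded on the Jetson and that these argument
# names match the installed paddleocr version.
from paddleocr import PaddleOCR

ocr = PaddleOCR(
    det_model_dir="./inference/ch_PP-OCRv3_det_infer/",
    rec_model_dir="./inference/ch_PP-OCRv3_rec_infer/",
    use_angle_cls=False,
    use_gpu=True,
)

result = ocr.ocr("./doc/imgs/00057937.jpg", cls=False)
for line in result[0]:            # one entry per detected text box (recent versions)
    box, (text, score) = line
    print(text, round(score, 4))
```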
diff --git a/deploy/Jetson/readme_ch.md b/deploy/Jetson/readme_ch.md
new file mode 100644
index 0000000000000000000000000000000000000000..7b0a344ffe3b4c0abefc78e50371b5c92aa25001
--- /dev/null
+++ b/deploy/Jetson/readme_ch.md
@@ -0,0 +1,86 @@
+[English](readme.md) | 简体中文
+
+# Jetson部署PaddleOCR模型
+
+本节介绍PaddleOCR在Jetson NX、TX2、nano、AGX等系列硬件的部署。
+
+
+## 1. 环境准备
+
+需要准备一台Jetson开发板,如果需要TensorRT预测,需准备好TensorRT环境,建议使用7.1.3版本的TensorRT;
+
+1. Jetson安装PaddlePaddle
+
+PaddlePaddle下载[链接](https://www.paddlepaddle.org.cn/inference/user_guides/download_lib.html#python)
+请选择适合您的Jetpack版本、CUDA版本、TensorRT版本的安装包。
+
+安装命令:
+```shell
+# 安装paddle,以paddlepaddle_gpu-2.3.0rc0-cp36-cp36m-linux_aarch64.whl 为例
+pip3 install -U paddlepaddle_gpu-2.3.0rc0-cp36-cp36m-linux_aarch64.whl
+```
+
+
+2. 下载PaddleOCR代码并安装依赖
+
+首先 clone PaddleOCR 代码:
+```
+git clone https://github.com/PaddlePaddle/PaddleOCR
+```
+
+然后,安装依赖:
+```
+cd PaddleOCR
+pip3 install -r requirements.txt
+```
+
+*注:jetson硬件CPU较差,依赖安装较慢,请耐心等待*
+
+
+## 2. 执行预测
+
+从[文档](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/doc/doc_ch/ppocr_introduction.md#6-%E6%A8%A1%E5%9E%8B%E5%BA%93) 模型库中获取PPOCR模型,下面以PP-OCRv3模型为例,介绍PPOCR模型在Jetson上的使用方式:
+
+下载PP-OCRv3模型,并解压到PaddleOCR的`./inference/`目录下
+```
+wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar
+wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar
+tar xf ch_PP-OCRv3_det_infer.tar
+tar xf ch_PP-OCRv3_rec_infer.tar
+```
+
+执行文本检测预测:
+```
+cd PaddleOCR
+python3 tools/infer/predict_det.py --det_model_dir=./inference/ch_PP-OCRv3_det_infer/ --image_dir=./doc/imgs/french_0.jpg --use_gpu=True
+```
+
+执行命令后在终端会打印出预测的信息,并在 `./inference_results/` 下保存可视化结果。
+
+
+
+执行文本识别预测:
+```
+python3 tools/infer/predict_rec.py --rec_model_dir=./inference/ch_PP-OCRv3_rec_infer/ --image_dir=./doc/imgs_words/en/word_2.png --use_gpu=True --rec_image_shape="3,48,320"
+```
+
+执行命令后在终端会打印出预测的信息,输出如下:
+```
+[2022/04/28 15:41:45] root INFO: Predicts of ./doc/imgs_words/en/word_2.png:('yourself', 0.98084533)
+```
+
+执行文本检测+文本识别串联预测:
+
+```
+python3 tools/infer/predict_system.py --det_model_dir=./inference/ch_PP-OCRv3_det_infer/ --rec_model_dir=./inference/ch_PP-OCRv3_rec_infer/ --image_dir=./doc/imgs/ --use_gpu=True --rec_image_shape="3,48,320"
+```
+
+执行命令后在终端会打印出预测的信息,并在 `./inference_results/` 下保存可视化结果。
+
+
+开启TRT预测只需要在以上命令基础上设置`--use_tensorrt=True`即可:
+```
+python3 tools/infer/predict_system.py --det_model_dir=./inference/ch_PP-OCRv3_det_infer/ --rec_model_dir=./inference/ch_PP-OCRv3_rec_infer/ --image_dir=./doc/imgs/00057937.jpg --use_gpu=True --use_tensorrt=True --rec_image_shape="3,48,320"
+```
+
+更多ppocr模型预测请参考[文档](../../doc/doc_ch/models_list.md)
diff --git a/deploy/README.md b/deploy/README.md
index 69e2438996a1329e801f842ef78d2d6e115c5831..0cfb793f92547de1215277f5e2348bc157ec8503 100644
--- a/deploy/README.md
+++ b/deploy/README.md
@@ -23,10 +23,9 @@ PP-OCR has supported muti deployment schemes. Click the link to get the specific
- [Python Inference](../doc/doc_en/inference_ppocr_en.md)
- [C++ Inference](./cpp_infer/readme.md)
- [Serving (Python/C++)](./pdserving/README.md)
-- [Paddle-Lite (ARM CPU/OpenCL ARM GPU/Metal ARM GPU)](./lite/readme.md)
+- [Paddle-Lite (ARM CPU/OpenCL ARM GPU)](./lite/readme.md)
- [Paddle.js](./paddlejs/README.md)
-- [Jetson Inference]()
-- [XPU Inference]()
+- [Jetson Inference](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/deploy/Jetson/readme.md)
- [Paddle2ONNX](./paddle2onnx/readme.md)
-If you need the deployment tutorial of academic algorithm models other than PP-OCR, please directly enter the main page of corresponding algorithms, [entrance](../doc/doc_en/algorithm_overview_en.md)。
\ No newline at end of file
+If you need the deployment tutorial for academic algorithm models other than PP-OCR, please go directly to the main page of the corresponding algorithm: [entrance](../doc/doc_en/algorithm_overview_en.md).
diff --git a/deploy/README_ch.md b/deploy/README_ch.md
index 63ae59537316480a302dca7c3714db3c1003553e..1773aedc2c757499bfc1f950e468a98c477636ac 100644
--- a/deploy/README_ch.md
+++ b/deploy/README_ch.md
@@ -23,10 +23,9 @@ PP-OCR模型已打通多种场景部署方案,点击链接获取具体的使
- [Python 推理](../doc/doc_ch/inference_ppocr.md)
- [C++ 推理](./cpp_infer/readme_ch.md)
- [Serving 服务化部署(Python/C++)](./pdserving/README_CN.md)
-- [Paddle-Lite 端侧部署(ARM CPU/OpenCL ARM GPU/Metal ARM GPU)](./lite/readme_ch.md)
+- [Paddle-Lite 端侧部署(ARM CPU/OpenCL ARM GPU)](./lite/readme_ch.md)
- [Paddle.js 部署](./paddlejs/README_ch.md)
-- [Jetson 推理]()
-- [XPU 推理]()
+- [Jetson 推理](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/deploy/Jetson/readme_ch.md)
- [Paddle2ONNX 推理](./paddle2onnx/readme_ch.md)
需要PP-OCR以外的学术算法模型的推理部署,请直接进入相应算法主页面,[入口](../doc/doc_ch/algorithm_overview.md)。
\ No newline at end of file
diff --git a/deploy/cpp_infer/readme.md b/deploy/cpp_infer/readme.md
index c62fe32bc310eb3f91a6c55c3ecf25cfa53c0c61..ddd15d49558454a5ffb0731665b118c929e607f0 100644
--- a/deploy/cpp_infer/readme.md
+++ b/deploy/cpp_infer/readme.md
@@ -104,7 +104,7 @@ opencv3/
tar -xf paddle_inference.tgz
```
-Finally you will see the the folder of `paddle_inference/` in the current path.
+Finally you will see the folder of `paddle_inference/` in the current path.
#### 1.3.2 Compile the inference source code
* If you want to get the latest Paddle inference library features, you can download the latest code from Paddle GitHub repository and compile the inference library from the source code. It is recommended to download the inference library with paddle version greater than or equal to 2.0.1.
@@ -208,6 +208,8 @@ Execute the built executable file:
./build/ppocr [--param1] [--param2] [...]
```
+**Note**: ppocr uses the `PP-OCRv3` model by default, and the input shape used by its recognition model is `3, 48, 320`. Therefore, if you use the recognition function, you need to add the parameter `--rec_img_h=48`; if you do not use the default `PP-OCRv3` model, you do not need to set this parameter.
+
Specifically,
##### 1. det+cls+rec:
@@ -220,6 +222,7 @@ Specifically,
--det=true \
--rec=true \
--cls=true \
+    --rec_img_h=48 \
```
##### 2. det+rec:
@@ -231,6 +234,7 @@ Specifically,
--det=true \
--rec=true \
--cls=false \
+    --rec_img_h=48 \
```
##### 3. det
@@ -250,6 +254,7 @@ Specifically,
--det=false \
--rec=true \
--cls=true \
+    --rec_img_h=48 \
```
##### 5. rec
@@ -260,6 +265,7 @@ Specifically,
--det=false \
--rec=true \
--cls=false \
+    --rec_img_h=48 \
```
##### 6. cls
@@ -335,10 +341,10 @@ The detection results will be shown on the screen, which is as follows.
```bash
predict img: ../../doc/imgs/12.jpg
../../doc/imgs/12.jpg
-0 det boxes: [[79,553],[399,541],[400,573],[80,585]] rec text: 打浦路252935号 rec score: 0.933757
-1 det boxes: [[31,509],[510,488],[511,529],[33,549]] rec text: 绿洲仕格维花园公寓 rec score: 0.951745
-2 det boxes: [[181,456],[395,448],[396,480],[182,488]] rec text: 打浦路15号 rec score: 0.91956
-3 det boxes: [[43,413],[480,391],[481,428],[45,450]] rec text: 上海斯格威铂尔多大酒店 rec score: 0.915914
+0 det boxes: [[74,553],[427,542],[428,571],[75,582]] rec text: 打浦路252935号 rec score: 0.947724
+1 det boxes: [[23,507],[513,488],[515,529],[24,548]] rec text: 绿洲仕格维花园公寓 rec score: 0.993728
+2 det boxes: [[187,456],[399,448],[400,480],[188,488]] rec text: 打浦路15号 rec score: 0.964994
+3 det boxes: [[42,413],[483,391],[484,428],[43,450]] rec text: 上海斯格威铂尔大酒店 rec score: 0.980086
The detection visualized image saved in ./output//12.jpg
```
diff --git a/deploy/cpp_infer/readme_ch.md b/deploy/cpp_infer/readme_ch.md
index 2a81e15a97cca45d525efe8739255acd12f8117f..e5a4869eca1d35765013e63011c680e59b33ac00 100644
--- a/deploy/cpp_infer/readme_ch.md
+++ b/deploy/cpp_infer/readme_ch.md
@@ -12,7 +12,7 @@
- [2.3 运行demo](#23)
- [3. FAQ](#3)
-本章节介绍PaddleOCR 模型的的C++部署方法。C++在性能计算上优于Python,因此,在大多数CPU、GPU部署场景,多采用C++的部署方式,本节将介绍如何在Linux\Windows (CPU\GPU)环境下配置C++环境并完成PaddleOCR模型部署。
+本章节介绍PaddleOCR 模型的C++部署方法。C++在性能计算上优于Python,因此,在大多数CPU、GPU部署场景,多采用C++的部署方式,本节将介绍如何在Linux\Windows (CPU\GPU)环境下配置C++环境并完成PaddleOCR模型部署。
@@ -213,6 +213,9 @@ CUDNN_LIB_DIR=/your_cudnn_lib_dir
本demo支持系统串联调用,也支持单个功能的调用,如,只使用检测或识别功能。
+**注意** ppocr默认使用`PP-OCRv3`模型,识别模型使用的输入shape为`3,48,320`, 因此如果使用识别功能,需要添加参数`--rec_img_h=48`,如果不使用默认的`PP-OCRv3`模型,则无需设置该参数。
+
+
运行方式:
```shell
./build/ppocr [--param1] [--param2] [...]
@@ -229,6 +232,7 @@ CUDNN_LIB_DIR=/your_cudnn_lib_dir
--det=true \
--rec=true \
--cls=true \
+    --rec_img_h=48 \
```
##### 2. 检测+识别:
@@ -240,6 +244,7 @@ CUDNN_LIB_DIR=/your_cudnn_lib_dir
--det=true \
--rec=true \
--cls=false \
+    --rec_img_h=48 \
```
##### 3. 检测:
@@ -259,6 +264,7 @@ CUDNN_LIB_DIR=/your_cudnn_lib_dir
--det=false \
--rec=true \
--cls=true \
+    --rec_img_h=48 \
```
##### 5. 识别:
@@ -269,6 +275,7 @@ CUDNN_LIB_DIR=/your_cudnn_lib_dir
--det=false \
--rec=true \
--cls=false \
+    --rec_img_h=48 \
```
##### 6. 分类:
@@ -343,10 +350,10 @@ CUDNN_LIB_DIR=/your_cudnn_lib_dir
```bash
predict img: ../../doc/imgs/12.jpg
../../doc/imgs/12.jpg
-0 det boxes: [[79,553],[399,541],[400,573],[80,585]] rec text: 打浦路252935号 rec score: 0.933757
-1 det boxes: [[31,509],[510,488],[511,529],[33,549]] rec text: 绿洲仕格维花园公寓 rec score: 0.951745
-2 det boxes: [[181,456],[395,448],[396,480],[182,488]] rec text: 打浦路15号 rec score: 0.91956
-3 det boxes: [[43,413],[480,391],[481,428],[45,450]] rec text: 上海斯格威铂尔多大酒店 rec score: 0.915914
+0 det boxes: [[74,553],[427,542],[428,571],[75,582]] rec text: 打浦路252935号 rec score: 0.947724
+1 det boxes: [[23,507],[513,488],[515,529],[24,548]] rec text: 绿洲仕格维花园公寓 rec score: 0.993728
+2 det boxes: [[187,456],[399,448],[400,480],[188,488]] rec text: 打浦路15号 rec score: 0.964994
+3 det boxes: [[42,413],[483,391],[484,428],[43,450]] rec text: 上海斯格威铂尔大酒店 rec score: 0.980086
The detection visualized image saved in ./output//12.jpg
```
diff --git a/deploy/hubserving/ocr_det/params.py b/deploy/hubserving/ocr_det/params.py
index ba41dd07f135402b5878add415c482edf12e2695..3a310e0b523af7ebc124382761c443ef6c4cbf0c 100755
--- a/deploy/hubserving/ocr_det/params.py
+++ b/deploy/hubserving/ocr_det/params.py
@@ -26,7 +26,7 @@ def read_params():
#params for text detector
cfg.det_algorithm = "DB"
- cfg.det_model_dir = "./inference/ch_PP-OCRv2_det_infer/"
+ cfg.det_model_dir = "./inference/ch_PP-OCRv3_det_infer/"
cfg.det_limit_side_len = 960
cfg.det_limit_type = 'max'
diff --git a/deploy/hubserving/ocr_rec/params.py b/deploy/hubserving/ocr_rec/params.py
index 09bdeeb3c62abe3a1d197719b79d4f523ff5e5e1..b8854c8c0718ddade7415e1d52b3deb0b2659868 100644
--- a/deploy/hubserving/ocr_rec/params.py
+++ b/deploy/hubserving/ocr_rec/params.py
@@ -26,9 +26,9 @@ def read_params():
#params for text recognizer
cfg.rec_algorithm = "CRNN"
- cfg.rec_model_dir = "./inference/ch_PP-OCRv2_rec_infer/"
+ cfg.rec_model_dir = "./inference/ch_PP-OCRv3_rec_infer/"
- cfg.rec_image_shape = "3, 32, 320"
+ cfg.rec_image_shape = "3, 48, 320"
cfg.rec_batch_num = 6
cfg.max_text_length = 25
diff --git a/deploy/hubserving/ocr_system/params.py b/deploy/hubserving/ocr_system/params.py
index 9972a3ded83589e7552b308c59b9dc09a9a4399b..4df1979a103345c3d2089cd0b1872bad49dcb944 100755
--- a/deploy/hubserving/ocr_system/params.py
+++ b/deploy/hubserving/ocr_system/params.py
@@ -26,7 +26,7 @@ def read_params():
#params for text detector
cfg.det_algorithm = "DB"
- cfg.det_model_dir = "./inference/ch_PP-OCRv2_det_infer/"
+ cfg.det_model_dir = "./inference/ch_PP-OCRv3_det_infer/"
cfg.det_limit_side_len = 960
cfg.det_limit_type = 'max'
@@ -44,9 +44,9 @@ def read_params():
#params for text recognizer
cfg.rec_algorithm = "CRNN"
- cfg.rec_model_dir = "./inference/ch_PP-OCRv2_rec_infer/"
+ cfg.rec_model_dir = "./inference/ch_PP-OCRv3_rec_infer/"
- cfg.rec_image_shape = "3, 32, 320"
+ cfg.rec_image_shape = "3, 48, 320"
cfg.rec_batch_num = 6
cfg.max_text_length = 25
diff --git a/deploy/hubserving/readme.md b/deploy/hubserving/readme.md
index ab6dbeff749beb5ddb14d116f2d3580ad074d337..183a25912c2c62371e6db6af2fde5c792fbcbecb 100755
--- a/deploy/hubserving/readme.md
+++ b/deploy/hubserving/readme.md
@@ -41,6 +41,7 @@ deploy/hubserving/ocr_system/
```
## 1. 近期更新
+* 2022.05.05 新增PP-OCRv3检测和识别模型。
* 2022.03.30 新增PP-Structure和表格识别两种服务。
## 2. 快速启动服务
@@ -53,10 +54,10 @@ pip3 install paddlehub==2.1.0 --upgrade -i https://mirror.baidu.com/pypi/simple
```
### 2.2 下载推理模型
-安装服务模块前,需要准备推理模型并放到正确路径。默认使用的是PP-OCRv2模型,默认模型路径为:
+安装服务模块前,需要准备推理模型并放到正确路径。默认使用的是PP-OCRv3模型,默认模型路径为:
```
-检测模型:./inference/ch_PP-OCRv2_det_infer/
-识别模型:./inference/ch_PP-OCRv2_rec_infer/
+检测模型:./inference/ch_PP-OCRv3_det_infer/
+识别模型:./inference/ch_PP-OCRv3_rec_infer/
方向分类器:./inference/ch_ppocr_mobile_v2.0_cls_infer/
表格结构识别模型:./inference/en_ppocr_mobile_v2.0_table_structure_infer/
```
@@ -223,6 +224,7 @@ hub serving start -c deploy/hubserving/ocr_system/config.json
- 2、 到相应的`module.py`和`params.py`等文件中根据实际需求修改代码。
例如,如果需要替换部署服务所用模型,则需要到`params.py`中修改模型路径参数`det_model_dir`和`rec_model_dir`,如果需要关闭文本方向分类器,则将参数`use_angle_cls`置为`False`,当然,同时可能还需要修改其他相关参数,请根据实际情况修改调试。 **强烈建议修改后先直接运行`module.py`调试,能正确运行预测后再启动服务测试。**
+**注意** PPOCR-v3识别模型使用的图片输入shape为`3,48,320`,因此需要修改`params.py`中的`cfg.rec_image_shape = "3, 48, 320"`,如果不使用PPOCR-v3识别模型,则无需修改该参数。
- 3、 卸载旧服务包
```hub uninstall ocr_system```
diff --git a/deploy/hubserving/readme_en.md b/deploy/hubserving/readme_en.md
index 8b99796a257f45d48cf3e0386c741ec798ee23e0..27eccbb5e9c465f20b3725f04aa1652e6829fa3c 100755
--- a/deploy/hubserving/readme_en.md
+++ b/deploy/hubserving/readme_en.md
@@ -41,6 +41,7 @@ deploy/hubserving/ocr_system/
```
## 1. Update
+* 2022.05.05 add PP-OCRv3 text detection and recognition models.
* 2022.03.30 add PP-Structure and table recognition services。
@@ -55,10 +56,10 @@ pip3 install paddlehub==2.1.0 --upgrade -i https://pypi.tuna.tsinghua.edu.cn/sim
```
### 2.2 Download inference model
-Before installing the service module, you need to prepare the inference model and put it in the correct path. By default, the PP-OCRv2 models are used, and the default model path is:
+Before installing the service module, you need to prepare the inference model and put it in the correct path. By default, the PP-OCRv3 models are used, and the default model path is:
```
-text detection model: ./inference/ch_PP-OCRv2_det_infer/
-text recognition model: ./inference/ch_PP-OCRv2_rec_infer/
+text detection model: ./inference/ch_PP-OCRv3_det_infer/
+text recognition model: ./inference/ch_PP-OCRv3_rec_infer/
text angle classifier: ./inference/ch_ppocr_mobile_v2.0_cls_infer/
table recognition: ./inference/en_ppocr_mobile_v2.0_table_structure_infer/
```
@@ -233,6 +234,7 @@ hub serving stop --port/-p XXXX
```
- 2. Modify the code in the corresponding files, like `module.py` and `params.py`, according to the actual needs.
For example, if you need to replace the model used by the deployed service, you need to modify model path parameters `det_model_dir` and `rec_model_dir` in `params.py`. If you want to turn off the text direction classifier, set the parameter `use_angle_cls` to `False`. Of course, other related parameters may need to be modified at the same time. Please modify and debug according to the actual situation. It is suggested to run `module.py` directly for debugging after modification before starting the service test.
+**Note**: The image input shape used by the PP-OCRv3 recognition model is `3, 48, 320`, so you need to set `cfg.rec_image_shape = "3, 48, 320"` in `params.py`. If you do not use the PP-OCRv3 recognition model, there is no need to modify this parameter.
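+
+For reference, the corresponding lines in `params.py` are plain attribute assignments on the config object returned by `read_params()`. A minimal sketch of the PP-OCRv3-related fields (other parameters keep their defaults; the `Config` container mirrors the one defined in that file):
+
+```python
+# Excerpt-style sketch of deploy/hubserving/ocr_system/params.py (PP-OCRv3 settings only)
+class Config(object):
+    pass
+
+
+def read_params():
+    cfg = Config()
+    # recognition model directory and the PP-OCRv3 input shape "C, H, W"
+    cfg.rec_model_dir = "./inference/ch_PP-OCRv3_rec_infer/"
+    cfg.rec_image_shape = "3, 48, 320"
+    return cfg
+```
+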
- 3. Uninstall old service module
```shell
hub uninstall ocr_system
diff --git a/deploy/lite/readme.md b/deploy/lite/readme.md
index 4225f8256af85373f5ed8d0213aa902531139c5a..9926e2dd8c973b25b5397fd5825f790528ede279 100644
--- a/deploy/lite/readme.md
+++ b/deploy/lite/readme.md
@@ -1,19 +1,18 @@
-- [Tutorial of PaddleOCR Mobile deployment](#tutorial-of-paddleocr-mobile-deployment)
- - [1. Preparation](#1-preparation)
+# Mobile deployment based on Paddle-Lite
+
+- [1. Preparation](#1-preparation)
- [Preparation environment](#preparation-environment)
- [1.1 Prepare the cross-compilation environment](#11-prepare-the-cross-compilation-environment)
- [1.2 Prepare Paddle-Lite library](#12-prepare-paddle-lite-library)
- - [2 Run](#2-run)
+- [2. Run](#2-run)
- [2.1 Inference Model Optimization](#21-inference-model-optimization)
- [2.2 Run optimized model on Phone](#22-run-optimized-model-on-phone)
- - [注意:](#注意)
- - [FAQ](#faq)
+- [FAQ](#faq)
-# Tutorial of PaddleOCR Mobile deployment
-This tutorial will introduce how to use [Paddle Lite](https://github.com/PaddlePaddle/Paddle-Lite) to deploy PaddleOCR ultra-lightweight Chinese and English detection models on mobile phones.
+This tutorial will introduce how to use [Paddle-Lite](https://github.com/PaddlePaddle/Paddle-Lite) to deploy PaddleOCR ultra-lightweight Chinese and English detection models on mobile phones.
-paddle-lite is a lightweight inference engine for PaddlePaddle. It provides efficient inference capabilities for mobile phones and IoT, and extensively integrates cross-platform hardware to provide lightweight deployment solutions for end-side deployment issues.
+Paddle-Lite is a lightweight inference engine for PaddlePaddle. It provides efficient inference capabilities for mobile phones and IoT, and extensively integrates cross-platform hardware to provide lightweight deployment solutions for end-side deployment issues.
## 1. Preparation
@@ -223,7 +222,7 @@ demo/cxx/ocr/
|-- ocr_db_crnn.cc C++ main code
```
-#### 注意:
+**Note**:
1. `ppocr_keys_v1.txt` is a Chinese dictionary file. If the nb model is used for English recognition or other language recognition, dictionary file should be replaced with a dictionary of the corresponding language. PaddleOCR provides a variety of dictionaries under ppocr/utils/, including:
```
dict/french_dict.txt # french
diff --git a/deploy/lite/readme_ch.md b/deploy/lite/readme_ch.md
index 0acc66f9513e54394cae6f4dc7c6a21b194d79e0..99a543d0d60455443dd872c56a5832c8ca0ff4e9 100644
--- a/deploy/lite/readme_ch.md
+++ b/deploy/lite/readme_ch.md
@@ -1,15 +1,14 @@
-- [端侧部署](#端侧部署)
- - [1. 准备环境](#1-准备环境)
+# 端侧部署
+
+- [1. 准备环境](#1-准备环境)
- [运行准备](#运行准备)
- [1.1 准备交叉编译环境](#11-准备交叉编译环境)
- [1.2 准备预测库](#12-准备预测库)
- - [2 开始运行](#2-开始运行)
+- [2 开始运行](#2-开始运行)
- [2.1 模型优化](#21-模型优化)
- [2.2 与手机联调](#22-与手机联调)
- - [注意:](#注意)
- - [FAQ](#faq)
+- [FAQ](#faq)
-# 端侧部署
本教程将介绍基于[Paddle Lite](https://github.com/PaddlePaddle/Paddle-Lite) 在移动端部署PaddleOCR超轻量中文检测、识别模型的详细步骤。
diff --git a/deploy/paddle2onnx/readme_ch.md b/deploy/paddle2onnx/readme_ch.md
index 8e821892142d65caddd6fa3bd8ff24a372fe9a5d..5004cab8338ee7e809033dcf5b8f2184b0da065a 100644
--- a/deploy/paddle2onnx/readme_ch.md
+++ b/deploy/paddle2onnx/readme_ch.md
@@ -39,14 +39,14 @@ python3.7 -m pip install onnxruntime==1.9.0
有两种方式获取Paddle静态图模型:在 [model_list](../../doc/doc_ch/models_list.md) 中下载PaddleOCR提供的预测模型;
参考[模型导出说明](../../doc/doc_ch/inference.md#训练模型转inference模型)把训练好的权重转为 inference_model。
-以 ppocr 中文检测、识别、分类模型为例:
+以 PP-OCRv3 中文检测、识别、分类模型为例:
```
-wget -nc -P ./inference https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar
-cd ./inference && tar xf ch_PP-OCRv2_det_infer.tar && cd ..
+wget -nc -P ./inference https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar
+cd ./inference && tar xf ch_PP-OCRv3_det_infer.tar && cd ..
-wget -nc -P ./inference https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer.tar
-cd ./inference && tar xf ch_PP-OCRv2_rec_infer.tar && cd ..
+wget -nc -P ./inference https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar
+cd ./inference && tar xf ch_PP-OCRv3_rec_infer.tar && cd ..
wget -nc -P ./inference https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar
cd ./inference && tar xf ch_ppocr_mobile_v2.0_cls_infer.tar && cd ..
@@ -57,7 +57,7 @@ cd ./inference && tar xf ch_ppocr_mobile_v2.0_cls_infer.tar && cd ..
使用 Paddle2ONNX 将Paddle静态图模型转换为ONNX模型格式:
```
-paddle2onnx --model_dir ./inference/ch_PP-OCRv2_det_infer \
+paddle2onnx --model_dir ./inference/ch_PP-OCRv3_det_infer \
--model_filename inference.pdmodel \
--params_filename inference.pdiparams \
--save_file ./inference/det_onnx/model.onnx \
@@ -65,7 +65,7 @@ paddle2onnx --model_dir ./inference/ch_PP-OCRv2_det_infer \
--input_shape_dict="{'x':[-1,3,-1,-1]}" \
--enable_onnx_checker True
-paddle2onnx --model_dir ./inference/ch_PP-OCRv2_rec_infer \
+paddle2onnx --model_dir ./inference/ch_PP-OCRv3_rec_infer \
--model_filename inference.pdmodel \
--params_filename inference.pdiparams \
--save_file ./inference/rec_onnx/model.onnx \
@@ -105,8 +105,8 @@ python3.7 tools/infer/predict_system.py --use_gpu=False --use_onnx=True \
```
python3.7 tools/infer/predict_system.py --use_gpu=False \
--cls_model_dir=./inference/ch_ppocr_mobile_v2.0_cls_infer \
---rec_model_dir=./inference/ch_PP-OCRv2_rec_infer \
---det_model_dir=./inference/ch_PP-OCRv2_det_infer \
+--rec_model_dir=./inference/ch_PP-OCRv3_rec_infer \
+--det_model_dir=./inference/ch_PP-OCRv3_det_infer \
--image_dir=./deploy/lite/imgs/lite_demo.png
```
diff --git a/deploy/pdserving/README.md b/deploy/pdserving/README.md
index d3ba7d4cfbabb111831a6ecbce28c1ac352066fe..55e03c4c2654f336ed942ae03e61e88b61940006 100644
--- a/deploy/pdserving/README.md
+++ b/deploy/pdserving/README.md
@@ -15,6 +15,14 @@ Some Key Features of Paddle Serving:
- Industrial serving features supported, such as models management, online loading, online A/B testing etc.
- Highly concurrent and efficient communication between clients and servers supported.
+PaddleServing supports deployment in multiple languages. In this example, two deployment methods, python pipeline and C++, are provided. The comparison between the two is as follows:
+
+| Language | Speed | Secondary development | Compilation required |
+|-----|-----|---------|------------|
+| C++ | Fast | Slightly difficult | Not needed for single-model prediction; needed for multi-model concatenation |
+| Python | Moderate | Easy | Not needed for either single-model or multi-model |
+
+
The introduction and tutorial of Paddle Serving service deployment framework reference [document](https://github.com/PaddlePaddle/Serving/blob/develop/README.md).
@@ -25,6 +33,7 @@ The introduction and tutorial of Paddle Serving service deployment framework ref
- [Environmental preparation](#environmental-preparation)
- [Model conversion](#model-conversion)
- [Paddle Serving pipeline deployment](#paddle-serving-pipeline-deployment)
+ - [Paddle Serving C++ deployment](#C++)
- [WINDOWS Users](#windows-users)
- [FAQ](#faq)
@@ -41,23 +50,23 @@ PaddleOCR operating environment and Paddle Serving operating environment are nee
```bash
# Install serving which used to start the service
-wget https://paddle-serving.bj.bcebos.com/test-dev/whl/paddle_serving_server_gpu-0.7.0.post102-py3-none-any.whl
-pip3 install paddle_serving_server_gpu-0.7.0.post102-py3-none-any.whl
+wget https://paddle-serving.bj.bcebos.com/test-dev/whl/paddle_serving_server_gpu-0.8.3.post102-py3-none-any.whl
+pip3 install paddle_serving_server_gpu-0.8.3.post102-py3-none-any.whl
# Install paddle-serving-server for cuda10.1
-# wget https://paddle-serving.bj.bcebos.com/test-dev/whl/paddle_serving_server_gpu-0.7.0.post101-py3-none-any.whl
-# pip3 install paddle_serving_server_gpu-0.7.0.post101-py3-none-any.whl
+# wget https://paddle-serving.bj.bcebos.com/test-dev/whl/paddle_serving_server_gpu-0.8.3.post101-py3-none-any.whl
+# pip3 install paddle_serving_server_gpu-0.8.3.post101-py3-none-any.whl
# Install serving which used to start the service
-wget https://paddle-serving.bj.bcebos.com/test-dev/whl/paddle_serving_client-0.7.0-cp37-none-any.whl
-pip3 install paddle_serving_client-0.7.0-cp37-none-any.whl
+wget https://paddle-serving.bj.bcebos.com/test-dev/whl/paddle_serving_client-0.8.3-cp37-none-any.whl
+pip3 install paddle_serving_client-0.8.3-cp37-none-any.whl
# Install serving-app
-wget https://paddle-serving.bj.bcebos.com/test-dev/whl/paddle_serving_app-0.7.0-py3-none-any.whl
-pip3 install paddle_serving_app-0.7.0-py3-none-any.whl
+wget https://paddle-serving.bj.bcebos.com/test-dev/whl/paddle_serving_app-0.8.3-py3-none-any.whl
+pip3 install paddle_serving_app-0.8.3-py3-none-any.whl
```
- **note:** If you want to install the latest version of PaddleServing, refer to [link](https://github.com/PaddlePaddle/Serving/blob/v0.7.0/doc/Latest_Packages_CN.md).
+ **note:** If you want to install the latest version of PaddleServing, refer to [link](https://github.com/PaddlePaddle/Serving/blob/v0.8.3/doc/Latest_Packages_CN.md).
@@ -67,37 +76,37 @@ When using PaddleServing for service deployment, you need to convert the saved i
Firstly, download the [inference model](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.3/README_ch.md#pp-ocr%E7%B3%BB%E5%88%97%E6%A8%A1%E5%9E%8B%E5%88%97%E8%A1%A8%E6%9B%B4%E6%96%B0%E4%B8%AD) of PPOCR
```
# Download and unzip the OCR text detection model
-wget https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar -O ch_PP-OCRv2_det_infer.tar && tar -xf ch_PP-OCRv2_det_infer.tar
+wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar -O ch_PP-OCRv3_det_infer.tar && tar -xf ch_PP-OCRv3_det_infer.tar
# Download and unzip the OCR text recognition model
-wget https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer.tar -O ch_PP-OCRv2_rec_infer.tar && tar -xf ch_PP-OCRv2_rec_infer.tar
+wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar -O ch_PP-OCRv3_rec_infer.tar && tar -xf ch_PP-OCRv3_rec_infer.tar
```
Then, you can use installed paddle_serving_client tool to convert inference model to mobile model.
```
# Detection model conversion
-python3 -m paddle_serving_client.convert --dirname ./ch_PP-OCRv2_det_infer/ \
+python3 -m paddle_serving_client.convert --dirname ./ch_PP-OCRv3_det_infer/ \
--model_filename inference.pdmodel \
--params_filename inference.pdiparams \
- --serving_server ./ppocr_det_mobile_2.0_serving/ \
- --serving_client ./ppocr_det_mobile_2.0_client/
+ --serving_server ./ppocr_det_v3_serving/ \
+ --serving_client ./ppocr_det_v3_client/
# Recognition model conversion
-python3 -m paddle_serving_client.convert --dirname ./ch_PP-OCRv2_rec_infer/ \
+python3 -m paddle_serving_client.convert --dirname ./ch_PP-OCRv3_rec_infer/ \
--model_filename inference.pdmodel \
--params_filename inference.pdiparams \
- --serving_server ./ppocr_rec_mobile_2.0_serving/ \
- --serving_client ./ppocr_rec_mobile_2.0_client/
+ --serving_server ./ppocr_rec_v3_serving/ \
+ --serving_client ./ppocr_rec_v3_client/
```
-After the detection model is converted, there will be additional folders of `ppocr_det_mobile_2.0_serving` and `ppocr_det_mobile_2.0_client` in the current folder, with the following format:
+After the detection model is converted, there will be additional folders of `ppocr_det_v3_serving` and `ppocr_det_v3_client` in the current folder, with the following format:
```
-|- ppocr_det_mobile_2.0_serving/
+|- ppocr_det_v3_serving/
|- __model__
|- __params__
|- serving_server_conf.prototxt
|- serving_server_conf.stream.prototxt
-|- ppocr_det_mobile_2.0_client
+|- ppocr_det_v3_client
|- serving_client_conf.prototxt
|- serving_client_conf.stream.prototxt
@@ -193,16 +202,13 @@ The recognition model is the same.
2021-05-13 03:42:36,979 chl2(In: ['rec'], Out: ['@DAGExecutor']) size[0/0]
```
+
## C++ Serving
Service deployment based on python obviously has the advantage of convenient secondary development. However, the real application often needs to pursue better performance. PaddleServing also provides a more performant C++ deployment version.
The C++ service deployment is the same as python in the environment setup and data preparation stages, the difference is when the service is started and the client sends requests.
-| Language | Speed | Secondary development | Do you need to compile |
-|-----|-----|---------|------------|
-| C++ | fast | Slightly difficult | Single model prediction does not need to be compiled, multi-model concatenation needs to be compiled |
-| python | general | easy | single-model/multi-model no compilation required |
1. Compile Serving
@@ -211,7 +217,7 @@ The C++ service deployment is the same as python in the environment setup and da
2. Run the following command to start the service.
```
# Start the service and save the running log in log.txt
- python3 -m paddle_serving_server.serve --model ppocrv2_det_serving ppocrv2_rec_serving --op GeneralDetectionOp GeneralInferOp --port 9293 &>log.txt &
+ python3 -m paddle_serving_server.serve --model ppocr_det_v3_serving ppocr_rec_v3_serving --op GeneralDetectionOp GeneralInferOp --port 9293 &>log.txt &
```
After the service is successfully started, a log similar to the following will be printed in log.txt

@@ -219,7 +225,7 @@ The C++ service deployment is the same as python in the environment setup and da
3. Send service request
Due to the need for pre and post-processing in the C++Server part, in order to speed up the input to the C++Server is only the base64 encoded string of the picture, it needs to be manually modified
- Change the feed_type field and shape field in ppocrv2_det_client/serving_client_conf.prototxt to the following:
+ Change the feed_type field and shape field in ppocr_det_v3_client/serving_client_conf.prototxt to the following:
```
feed_var {
@@ -234,7 +240,7 @@ The C++ service deployment is the same as python in the environment setup and da
start the client:
```
- python3 ocr_cpp_client.py ppocrv2_det_client ppocrv2_rec_client
+ python3 ocr_cpp_client.py ppocr_det_v3_client ppocr_rec_v3_client
```
After successfully running, the predicted result of the model will be printed in the cmd window. An example of the result is:

diff --git a/deploy/pdserving/README_CN.md b/deploy/pdserving/README_CN.md
index 7d6169569f92d927312ec6ba8ff667d613c4bfa7..0891611db5f39d322473354f7d988b10afa78cbd 100644
--- a/deploy/pdserving/README_CN.md
+++ b/deploy/pdserving/README_CN.md
@@ -9,13 +9,21 @@ PaddleOCR提供2种服务部署方式:
# 基于PaddleServing的服务部署
-本文档将介绍如何使用[PaddleServing](https://github.com/PaddlePaddle/Serving/blob/develop/README_CN.md)工具部署PP-OCR动态图模型的pipeline在线服务。
+本文档将介绍如何使用[PaddleServing](https://github.com/PaddlePaddle/Serving/blob/develop/README_CN.md) 工具部署PP-OCR动态图模型的pipeline在线服务。
相比较于hubserving部署,PaddleServing具备以下优点:
- 支持客户端和服务端之间高并发和高效通信
- 支持 工业级的服务能力 例如模型管理,在线加载,在线A/B测试等
- 支持 多种编程语言 开发客户端,例如C++, Python和Java
+PaddleServing 支持多种语言部署,本例中提供了python pipeline 和 C++ 两种部署方式,两者的对比如下:
+
+| 语言 | 速度 | 二次开发 | 是否需要编译 |
+|-----|-----|---------|------------|
+| C++ | 很快 | 略有难度 | 单模型预测无需编译,多模型串联需要编译 |
+| python | 一般 | 容易 | 单模型/多模型 均无需编译|
+
+
更多有关PaddleServing服务化部署框架介绍和使用教程参考[文档](https://github.com/PaddlePaddle/Serving/blob/develop/README_CN.md)。
AIStudio演示案例可参考 [基于PaddleServing的OCR服务化部署实战](https://aistudio.baidu.com/aistudio/projectdetail/3630726)。
@@ -24,6 +32,7 @@ AIStudio演示案例可参考 [基于PaddleServing的OCR服务化部署实战](h
- [环境准备](#环境准备)
- [模型转换](#模型转换)
- [Paddle Serving pipeline部署](#部署)
+- [Paddle Serving C++部署](#C++)
- [Windows用户](#Windows用户)
- [FAQ](#FAQ)
@@ -34,26 +43,33 @@ AIStudio演示案例可参考 [基于PaddleServing的OCR服务化部署实战](h
- 准备PaddleOCR的运行环境[链接](../../doc/doc_ch/installation.md)
+ ```
+ git clone https://github.com/PaddlePaddle/PaddleOCR
+
+ # 进入到工作目录
+ cd PaddleOCR/deploy/pdserving/
+ ```
+
- 准备PaddleServing的运行环境,步骤如下
```bash
# 安装serving,用于启动服务
-wget https://paddle-serving.bj.bcebos.com/test-dev/whl/paddle_serving_server_gpu-0.7.0.post102-py3-none-any.whl
-pip3 install paddle_serving_server_gpu-0.7.0.post102-py3-none-any.whl
+wget https://paddle-serving.bj.bcebos.com/test-dev/whl/paddle_serving_server_gpu-0.8.3.post102-py3-none-any.whl
+pip3 install paddle_serving_server_gpu-0.8.3.post102-py3-none-any.whl
# 如果是cuda10.1环境,可以使用下面的命令安装paddle-serving-server
-# wget https://paddle-serving.bj.bcebos.com/test-dev/whl/paddle_serving_server_gpu-0.7.0.post101-py3-none-any.whl
-# pip3 install paddle_serving_server_gpu-0.7.0.post101-py3-none-any.whl
+# wget https://paddle-serving.bj.bcebos.com/test-dev/whl/paddle_serving_server_gpu-0.8.3.post101-py3-none-any.whl
+# pip3 install paddle_serving_server_gpu-0.8.3.post101-py3-none-any.whl
# 安装client,用于向服务发送请求
-wget https://paddle-serving.bj.bcebos.com/test-dev/whl/paddle_serving_client-0.7.0-cp37-none-any.whl
-pip3 install paddle_serving_client-0.7.0-cp37-none-any.whl
+wget https://paddle-serving.bj.bcebos.com/test-dev/whl/paddle_serving_client-0.8.3-cp37-none-any.whl
+pip3 install paddle_serving_client-0.8.3-cp37-none-any.whl
# 安装serving-app
-wget https://paddle-serving.bj.bcebos.com/test-dev/whl/paddle_serving_app-0.7.0-py3-none-any.whl
-pip3 install paddle_serving_app-0.7.0-py3-none-any.whl
+wget https://paddle-serving.bj.bcebos.com/test-dev/whl/paddle_serving_app-0.8.3-py3-none-any.whl
+pip3 install paddle_serving_app-0.8.3-py3-none-any.whl
```
-**Note:** 如果要安装最新版本的PaddleServing参考[链接](https://github.com/PaddlePaddle/Serving/blob/v0.7.0/doc/Latest_Packages_CN.md)。
+**Note:** 如果要安装最新版本的PaddleServing参考[链接](https://github.com/PaddlePaddle/Serving/blob/v0.8.3/doc/Latest_Packages_CN.md)。
## 模型转换
@@ -64,38 +80,38 @@ pip3 install paddle_serving_app-0.7.0-py3-none-any.whl
```bash
# 下载并解压 OCR 文本检测模型
-wget https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar -O ch_PP-OCRv2_det_infer.tar && tar -xf ch_PP-OCRv2_det_infer.tar
+wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar -O ch_PP-OCRv3_det_infer.tar && tar -xf ch_PP-OCRv3_det_infer.tar
# 下载并解压 OCR 文本识别模型
-wget https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer.tar -O ch_PP-OCRv2_rec_infer.tar && tar -xf ch_PP-OCRv2_rec_infer.tar
+wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar -O ch_PP-OCRv3_rec_infer.tar && tar -xf ch_PP-OCRv3_rec_infer.tar
```
接下来,用安装的paddle_serving_client把下载的inference模型转换成易于server部署的模型格式。
```bash
# 转换检测模型
-python3 -m paddle_serving_client.convert --dirname ./ch_PP-OCRv2_det_infer/ \
+python3 -m paddle_serving_client.convert --dirname ./ch_PP-OCRv3_det_infer/ \
--model_filename inference.pdmodel \
--params_filename inference.pdiparams \
- --serving_server ./ppocr_det_mobile_2.0_serving/ \
- --serving_client ./ppocr_det_mobile_2.0_client/
+ --serving_server ./ppocr_det_v3_serving/ \
+ --serving_client ./ppocr_det_v3_client/
# 转换识别模型
-python3 -m paddle_serving_client.convert --dirname ./ch_PP-OCRv2_rec_infer/ \
+python3 -m paddle_serving_client.convert --dirname ./ch_PP-OCRv3_rec_infer/ \
--model_filename inference.pdmodel \
--params_filename inference.pdiparams \
- --serving_server ./ppocr_rec_mobile_2.0_serving/ \
- --serving_client ./ppocr_rec_mobile_2.0_client/
+ --serving_server ./ppocr_rec_v3_serving/ \
+ --serving_client ./ppocr_rec_v3_client/
```
-检测模型转换完成后,会在当前文件夹多出`ppocr_det_mobile_2.0_serving` 和`ppocr_det_mobile_2.0_client`的文件夹,具备如下格式:
+检测模型转换完成后,会在当前文件夹多出`ppocr_det_v3_serving` 和`ppocr_det_v3_client`的文件夹,具备如下格式:
```
-|- ppocr_det_mobile_2.0_serving/
+|- ppocr_det_v3_serving/
|- __model__
|- __params__
|- serving_server_conf.prototxt
|- serving_server_conf.stream.prototxt
-|- ppocr_det_mobile_2.0_client
+|- ppocr_det_v3_client
|- serving_client_conf.prototxt
|- serving_client_conf.stream.prototxt
@@ -105,13 +121,8 @@ python3 -m paddle_serving_client.convert --dirname ./ch_PP-OCRv2_rec_infer/ \
## Paddle Serving pipeline部署
-1. 下载PaddleOCR代码,若已下载可跳过此步骤
- ```
- git clone https://github.com/PaddlePaddle/PaddleOCR
+1. 确认工作目录下文件结构:
- # 进入到工作目录
- cd PaddleOCR/deploy/pdserving/
- ```
pdserver目录包含启动pipeline服务和发送预测请求的代码,包括:
```
__init__.py
@@ -196,16 +207,12 @@ python3 -m paddle_serving_client.convert --dirname ./ch_PP-OCRv2_rec_infer/ \
C++ 服务部署在环境搭建和数据准备阶段与 python 相同,区别在于启动服务和客户端发送请求时不同。
-| 语言 | 速度 | 二次开发 | 是否需要编译 |
-|-----|-----|---------|------------|
-| C++ | 很快 | 略有难度 | 单模型预测无需编译,多模型串联需要编译 |
-| python | 一般 | 容易 | 单模型/多模型 均无需编译|
-
1. 准备 Serving 环境
为了提高预测性能,C++ 服务同样提供了多模型串联服务。与python pipeline服务不同,多模型串联的过程中需要将模型前后处理代码写在服务端,因此需要在本地重新编译生成serving。
首先需要下载Serving代码库, 把OCR文本检测预处理相关代码替换到Serving库中
+
```
git clone https://github.com/PaddlePaddle/Serving
@@ -223,7 +230,7 @@ cp -rf general_detection_op.cpp Serving/core/general-server/op
```
# 启动服务,运行日志保存在log.txt
- python3 -m paddle_serving_server.serve --model ppocrv2_det_serving ppocrv2_rec_serving --op GeneralDetectionOp GeneralInferOp --port 9293 &>log.txt &
+ python3 -m paddle_serving_server.serve --model ppocr_det_v3_serving ppocr_rec_v3_serving --op GeneralDetectionOp GeneralInferOp --port 9293 &>log.txt &
```
成功启动服务后,log.txt中会打印类似如下日志

@@ -231,7 +238,7 @@ cp -rf general_detection_op.cpp Serving/core/general-server/op
3. 发送服务请求:
由于需要在C++Server部分进行前后处理,为了加速传入C++Server的仅仅是图片的base64编码的字符串,故需要手动修改
- ppocrv2_det_client/serving_client_conf.prototxt 中 feed_type 字段 和 shape 字段,修改成如下内容:
+ ppocr_det_v3_client/serving_client_conf.prototxt 中 feed_type 字段 和 shape 字段,修改成如下内容:
```
feed_var {
name: "x"
@@ -243,7 +250,7 @@ cp -rf general_detection_op.cpp Serving/core/general-server/op
```
启动客户端
```
- python3 ocr_cpp_client.py ppocrv2_det_client ppocrv2_rec_client
+ python3 ocr_cpp_client.py ppocr_det_v3_client ppocr_rec_v3_client
```
成功运行后,模型预测的结果会打印在cmd窗口中,结果示例为:
diff --git a/deploy/pdserving/config.yml b/deploy/pdserving/config.yml
index 2aae922dfa12f46d1c0ebd352e8d3a7077065cf8..19cd9325ee8b241fd591678b9ba6452de9bec025 100644
--- a/deploy/pdserving/config.yml
+++ b/deploy/pdserving/config.yml
@@ -34,10 +34,10 @@ op:
client_type: local_predictor
#det模型路径
- model_config: ./ppocr_det_mobile_2.0_serving
+ model_config: ./ppocr_det_v3_serving
#Fetch结果列表,以client_config中fetch_var的alias_name为准
- fetch_list: ["save_infer_model/scale_0.tmp_1"]
+ fetch_list: ["sigmoid_0.tmp_0"]
#计算硬件ID,当devices为""或不写时为CPU预测;当devices为"0", "0,1,2"时为GPU预测,表示使用的GPU卡
devices: "0"
@@ -60,10 +60,10 @@ op:
client_type: local_predictor
#rec模型路径
- model_config: ./ppocr_rec_mobile_2.0_serving
+ model_config: ./ppocr_rec_v3_serving
#Fetch结果列表,以client_config中fetch_var的alias_name为准
- fetch_list: ["save_infer_model/scale_0.tmp_1"]
+ fetch_list: ["softmax_5.tmp_0"]
#计算硬件ID,当devices为""或不写时为CPU预测;当devices为"0", "0,1,2"时为GPU预测,表示使用的GPU卡
devices: "0"
diff --git a/deploy/pdserving/ocr_reader.py b/deploy/pdserving/ocr_reader.py
index 67099786ea73b66412dac8f965e20201f0ac1fdc..6a2d57b679d69ab11ac6f0fd74c47a342b391545 100644
--- a/deploy/pdserving/ocr_reader.py
+++ b/deploy/pdserving/ocr_reader.py
@@ -392,38 +392,8 @@ class OCRReader(object):
return norm_img_batch[0]
- def postprocess_old(self, outputs, with_score=False):
- rec_res = []
- rec_idx_lod = outputs["ctc_greedy_decoder_0.tmp_0.lod"]
- rec_idx_batch = outputs["ctc_greedy_decoder_0.tmp_0"]
- if with_score:
- predict_lod = outputs["softmax_0.tmp_0.lod"]
- for rno in range(len(rec_idx_lod) - 1):
- beg = rec_idx_lod[rno]
- end = rec_idx_lod[rno + 1]
- if isinstance(rec_idx_batch, list):
- rec_idx_tmp = [x[0] for x in rec_idx_batch[beg:end]]
- else: #nd array
- rec_idx_tmp = rec_idx_batch[beg:end, 0]
- preds_text = self.char_ops.decode(rec_idx_tmp)
- if with_score:
- beg = predict_lod[rno]
- end = predict_lod[rno + 1]
- if isinstance(outputs["softmax_0.tmp_0"], list):
- outputs["softmax_0.tmp_0"] = np.array(outputs[
- "softmax_0.tmp_0"]).astype(np.float32)
- probs = outputs["softmax_0.tmp_0"][beg:end, :]
- ind = np.argmax(probs, axis=1)
- blank = probs.shape[1]
- valid_ind = np.where(ind != (blank - 1))[0]
- score = np.mean(probs[valid_ind, ind[valid_ind]])
- rec_res.append([preds_text, score])
- else:
- rec_res.append([preds_text])
- return rec_res
-
def postprocess(self, outputs, with_score=False):
- preds = outputs["save_infer_model/scale_0.tmp_1"]
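+    # "softmax_5.tmp_0" is the output variable of the PP-OCRv3 recognition model (per-step character probabilities), matching the rec fetch_list in config.yml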
+ preds = outputs["softmax_5.tmp_0"]
try:
preds = preds.numpy()
except:
diff --git a/deploy/pdserving/web_service.py b/deploy/pdserving/web_service.py
index 07fd6102beaef4001f87574a2f0631e2b1012613..98e2dfba2f5abd3fc36bf3743b23f7eb7be3b9c4 100644
--- a/deploy/pdserving/web_service.py
+++ b/deploy/pdserving/web_service.py
@@ -56,7 +56,7 @@ class DetOp(Op):
return {"x": det_img[np.newaxis, :].copy()}, False, None, ""
def postprocess(self, input_dicts, fetch_dict, data_id, log_id):
- det_out = fetch_dict["save_infer_model/scale_0.tmp_1"]
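+        # "sigmoid_0.tmp_0" is the output variable of the PP-OCRv3 detection model (DB probability map), matching the det fetch_list in config.yml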
+ det_out = fetch_dict["sigmoid_0.tmp_0"]
ratio_list = [
float(self.new_h) / self.ori_h, float(self.new_w) / self.ori_w
]
diff --git a/deploy/pdserving/web_service_det.py b/deploy/pdserving/web_service_det.py
index 0ca8dbc41bbdde4caf76bcfddabe4b9c2e94cb4b..7584608a9fed4bea93caa5c814c0450566696d56 100644
--- a/deploy/pdserving/web_service_det.py
+++ b/deploy/pdserving/web_service_det.py
@@ -55,7 +55,7 @@ class DetOp(Op):
return {"x": det_img[np.newaxis, :].copy()}, False, None, ""
def postprocess(self, input_dicts, fetch_dict, data_id, log_id):
- det_out = fetch_dict["save_infer_model/scale_0.tmp_1"]
+ det_out = fetch_dict["sigmoid_0.tmp_0"]
ratio_list = [
float(self.new_h) / self.ori_h, float(self.new_w) / self.ori_w
]
diff --git a/deploy/pdserving/win/ocr_reader.py b/deploy/pdserving/win/ocr_reader.py
index 3f219784fca79715d09ae9353a32d95e2e427cb6..18b9385aa0c7adf7c3e0cd38efd1160655881f0e 100644
--- a/deploy/pdserving/win/ocr_reader.py
+++ b/deploy/pdserving/win/ocr_reader.py
@@ -392,38 +392,8 @@ class OCRReader(object):
return norm_img_batch[0]
- def postprocess_old(self, outputs, with_score=False):
- rec_res = []
- rec_idx_lod = outputs["ctc_greedy_decoder_0.tmp_0.lod"]
- rec_idx_batch = outputs["ctc_greedy_decoder_0.tmp_0"]
- if with_score:
- predict_lod = outputs["softmax_0.tmp_0.lod"]
- for rno in range(len(rec_idx_lod) - 1):
- beg = rec_idx_lod[rno]
- end = rec_idx_lod[rno + 1]
- if isinstance(rec_idx_batch, list):
- rec_idx_tmp = [x[0] for x in rec_idx_batch[beg:end]]
- else: #nd array
- rec_idx_tmp = rec_idx_batch[beg:end, 0]
- preds_text = self.char_ops.decode(rec_idx_tmp)
- if with_score:
- beg = predict_lod[rno]
- end = predict_lod[rno + 1]
- if isinstance(outputs["softmax_0.tmp_0"], list):
- outputs["softmax_0.tmp_0"] = np.array(outputs[
- "softmax_0.tmp_0"]).astype(np.float32)
- probs = outputs["softmax_0.tmp_0"][beg:end, :]
- ind = np.argmax(probs, axis=1)
- blank = probs.shape[1]
- valid_ind = np.where(ind != (blank - 1))[0]
- score = np.mean(probs[valid_ind, ind[valid_ind]])
- rec_res.append([preds_text, score])
- else:
- rec_res.append([preds_text])
- return rec_res
-
def postprocess(self, outputs, with_score=False):
- preds = outputs["save_infer_model/scale_0.tmp_1"]
+ preds = outputs["softmax_5.tmp_0"]
try:
preds = preds.numpy()
except:
diff --git a/deploy/slim/prune/README_en.md b/deploy/slim/prune/README_en.md
index ceb9e204a74b5460d74766e8df154d22f39bd843..a2ee2cba48b6c4cf9c00630504bbb30bc932499a 100644
--- a/deploy/slim/prune/README_en.md
+++ b/deploy/slim/prune/README_en.md
@@ -1,7 +1,7 @@
# PP-OCR Models Pruning
-Generally, a more complex model would achive better performance in the task, but it also leads to some redundancy in the model. Model Pruning is a technique that reduces this redundancy by removing the sub-models in the neural network model, so as to reduce model calculation complexity and improve model inference performance.
+Generally, a more complex model would achieve better performance in the task, but it also leads to some redundancy in the model. Model Pruning is a technique that reduces this redundancy by removing the sub-models in the neural network model, so as to reduce model calculation complexity and improve model inference performance.
This example uses PaddleSlim provided [APIs of Pruning](https://github.com/PaddlePaddle/PaddleSlim/tree/develop/docs/zh_cn/api_cn/dygraph/pruners) to compress the OCR model.
[PaddleSlim](https://github.com/PaddlePaddle/PaddleSlim), an open source library which integrates model pruning, quantization (including quantization training and offline quantization), distillation, neural network architecture search, and many other commonly used and leading model compression technique in the industry.
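+
+As a rough illustration (not the exact script shipped with this example), pruning a dygraph model with PaddleSlim generally follows the pattern below. The pruner class and method names come from the PaddleSlim 2.x dygraph pruning tutorial; the stand-in model, input shape, and parameter name are assumptions for demonstration only:
+
+```python
+import paddle
+from paddleslim.dygraph import FPGMFilterPruner
+
+# Any paddle.nn.Layer can be pruned; the built-in MobileNet from paddle.vision stands in for the OCR detector here.
+model = paddle.vision.models.mobilenet_v1(num_classes=1000)
+
+# Build a filter pruner on a sample input shape (N, C, H, W).
+pruner = FPGMFilterPruner(model, [1, 3, 640, 640])
+
+# Prune 30% of the output channels of one convolution weight (the parameter name is illustrative).
+plan = pruner.prune_vars({"conv2d_0.w_0": 0.3}, axis=0)  # returns the pruning plan that was applied
+```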
diff --git a/deploy/slim/prune/sensitivity_anal.py b/deploy/slim/prune/sensitivity_anal.py
index 306f1a83ae0945614518514dcd00ca869254d5f8..be64a6bcf860c3e2e7a8a6fa20c4c241149a147b 100644
--- a/deploy/slim/prune/sensitivity_anal.py
+++ b/deploy/slim/prune/sensitivity_anal.py
@@ -94,7 +94,7 @@ def main(config, device, logger, vdl_writer):
config['Optimizer'],
epochs=config['Global']['epoch_num'],
step_each_epoch=len(train_dataloader),
- parameters=model.parameters())
+ model=model)
# build metric
eval_class = build_metric(config['Metric'])
diff --git a/deploy/slim/quantization/README.md b/deploy/slim/quantization/README.md
index d7c67a3bad4851aab5a27abb695da14314a7282e..894a29be067fc53d7553e8b02f7ad6a240c9bcbd 100644
--- a/deploy/slim/quantization/README.md
+++ b/deploy/slim/quantization/README.md
@@ -22,9 +22,7 @@
### 1. 安装PaddleSlim
```bash
-git clone https://github.com/PaddlePaddle/PaddleSlim.git
-cd PaddleSlim
-python setup.py install
+pip3 install paddleslim==2.2.2
```
### 2. 准备训练好的模型
@@ -43,7 +41,15 @@ python deploy/slim/quantization/quant.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3
wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_train.tar
tar -xf ch_ppocr_mobile_v2.0_det_train.tar
python deploy/slim/quantization/quant.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml -o Global.pretrained_model=./ch_ppocr_mobile_v2.0_det_train/best_accuracy Global.save_model_dir=./output/quant_model
+```
+
+模型蒸馏和模型量化可以同时使用,以PPOCRv3检测模型为例:
+```
+# 下载检测预训练模型:
+wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar
+tar xf ch_PP-OCRv3_det_distill_train.tar
+python deploy/slim/quantization/quant.py -c configs/det/ch_PP-OCRv3_det/ch_PP-OCRv3_det_cml.yml -o Global.pretrained_model='./ch_PP-OCRv3_det_distill_train/best_accuracy' Global.save_model_dir=./output/quant_model_distill/
```
如果要训练识别模型的量化,修改配置文件和加载的模型参数即可。
diff --git a/deploy/slim/quantization/README_en.md b/deploy/slim/quantization/README_en.md
index 3f1fe67c9aa3b0b95ee006d97e39e3ce6a19ca22..ea77cb38117867224d87317884efda25bd777d92 100644
--- a/deploy/slim/quantization/README_en.md
+++ b/deploy/slim/quantization/README_en.md
@@ -25,9 +25,7 @@ After training, if you want to further compress the model size and accelerate th
### 1. Install PaddleSlim
```bash
-git clone https://github.com/PaddlePaddle/PaddleSlim.git
-cd PaddlSlim
-python setup.py install
+pip3 install paddleslim==2.2.2
```
@@ -52,6 +50,17 @@ python deploy/slim/quantization/quant.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3
```
+Model distillation and model quantization can be used at the same time, taking the PP-OCRv3 detection model as an example:
+```
+# download the provided model
+wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar
+tar xf ch_PP-OCRv3_det_distill_train.tar
+
+python deploy/slim/quantization/quant.py -c configs/det/ch_PP-OCRv3_det/ch_PP-OCRv3_det_cml.yml -o Global.pretrained_model='./ch_PP-OCRv3_det_distill_train/best_accuracy' Global.save_model_dir=./output/quant_model_distill/
+```
+
+If you want to quantize the text recognition model, modify the configuration file and the loaded model parameters accordingly.
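+
+For orientation, quantization-aware training with PaddleSlim boils down to wrapping the model with `QAT` before fine-tuning. A minimal sketch follows; the config mirrors the defaults used by `deploy/slim/quantization/quant.py`, and the MobileNet model is only a stand-in for a PP-OCR model:
+
+```python
+import paddle
+from paddleslim.dygraph.quant import QAT
+
+quant_config = {
+    'weight_preprocess_type': None,
+    'activation_preprocess_type': None,
+    'weight_quantize_type': 'channel_wise_abs_max',
+    'activation_quantize_type': 'moving_average_abs_max',
+    'weight_bits': 8,
+    'activation_bits': 8,
+    'dtype': 'int8',
+    'window_size': 10000,
+    'moving_rate': 0.9,
+    'quantizable_layer_type': ['Conv2D', 'Linear'],
+}
+
+model = paddle.vision.models.mobilenet_v1(num_classes=10)  # stand-in for a PP-OCR model
+quanter = QAT(config=quant_config)
+quanter.quantize(model)  # insert fake-quant ops in place, then fine-tune as usual
+
+# After fine-tuning, export the quantized inference model, e.g.:
+# quanter.save_quantized_model(model, './output/quant_inference',
+#                              input_spec=[paddle.static.InputSpec([None, 3, 48, 320], 'float32')])
+```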
+
### 4. Export inference model
Once we got the model after pruning and fine-tuning, we can export it as an inference model for the deployment of predictive tasks:
diff --git a/deploy/slim/quantization/export_model.py b/deploy/slim/quantization/export_model.py
index 90f79dab34a5f20d4556ae4b10ad1d4e1f8b7f0d..fd1c3e5e109667fa74f5ade18b78f634e4d325db 100755
--- a/deploy/slim/quantization/export_model.py
+++ b/deploy/slim/quantization/export_model.py
@@ -17,9 +17,9 @@ import sys
__dir__ = os.path.dirname(os.path.abspath(__file__))
sys.path.append(__dir__)
-sys.path.append(os.path.abspath(os.path.join(__dir__, '..', '..', '..')))
-sys.path.append(
- os.path.abspath(os.path.join(__dir__, '..', '..', '..', 'tools')))
+sys.path.insert(0, os.path.abspath(os.path.join(__dir__, '..', '..', '..')))
+sys.path.insert(
+ 0, os.path.abspath(os.path.join(__dir__, '..', '..', '..', 'tools')))
import argparse
@@ -129,7 +129,6 @@ def main():
quanter.quantize(model)
load_model(config, model)
- model.eval()
# build metric
eval_class = build_metric(config['Metric'])
@@ -142,6 +141,7 @@ def main():
# start eval
metric = program.eval(model, valid_dataloader, post_process_class,
eval_class, model_type, use_srn)
+ model.eval()
logger.info('metric eval ***************')
for k, v in metric.items():
@@ -156,7 +156,6 @@ def main():
if arch_config["algorithm"] in ["Distillation", ]: # distillation model
archs = list(arch_config["Models"].values())
for idx, name in enumerate(model.model_name_list):
- model.model_list[idx].eval()
sub_model_save_path = os.path.join(save_path, name, "inference")
export_single_model(model.model_list[idx], archs[idx],
sub_model_save_path, logger, quanter)
diff --git a/deploy/slim/quantization/quant.py b/deploy/slim/quantization/quant.py
index f7acb185add5d40b749e7442111891869dfaeb22..64521b5e06df61cf656da4087e6cd49f82adfadd 100755
--- a/deploy/slim/quantization/quant.py
+++ b/deploy/slim/quantization/quant.py
@@ -161,7 +161,13 @@ def main(config, device, logger, vdl_writer):
if config["Global"]["pretrained_model"] is not None:
pre_best_model_dict = load_model(config, model)
- quanter = QAT(config=quant_config, act_preprocess=PACT)
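+    # Disable PACT activation preprocessing when any distillation sub-model freezes its parameters.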
+ freeze_params = False
+ if config['Architecture']["algorithm"] in ["Distillation"]:
+ for key in config['Architecture']["Models"]:
+ freeze_params = freeze_params or config['Architecture']['Models'][
+ key].get('freeze_params', False)
+ act = None if freeze_params else PACT
+ quanter = QAT(config=quant_config, act_preprocess=act)
quanter.quantize(model)
if config['Global']['distributed']:
diff --git a/doc/datasets/funsd_demo/gt_train_00040534.jpg b/doc/datasets/funsd_demo/gt_train_00040534.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..9f7cf4d4977689b73e2ca91cbe9c877bb8f0c7ff
Binary files /dev/null and b/doc/datasets/funsd_demo/gt_train_00040534.jpg differ
diff --git a/doc/datasets/funsd_demo/gt_train_00070353.jpg b/doc/datasets/funsd_demo/gt_train_00070353.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..36d3345e5ec4c262764e63a972aaa82e98877681
Binary files /dev/null and b/doc/datasets/funsd_demo/gt_train_00070353.jpg differ
diff --git a/doc/datasets/xfund_demo/gt_zh_train_0.jpg b/doc/datasets/xfund_demo/gt_zh_train_0.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..6fdaf12fa1d79e6ea9029d665ab7488223459436
Binary files /dev/null and b/doc/datasets/xfund_demo/gt_zh_train_0.jpg differ
diff --git a/doc/datasets/xfund_demo/gt_zh_train_1.jpg b/doc/datasets/xfund_demo/gt_zh_train_1.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..6a1e53a3ba09b6f84809cfd10a15c42f42b9a163
Binary files /dev/null and b/doc/datasets/xfund_demo/gt_zh_train_1.jpg differ
diff --git a/doc/doc_ch/FAQ.md b/doc/doc_ch/FAQ.md
index ef7394ee182aec7168a66511e376243dc5f0a8aa..24f8a3e92be1c003ec7c37b74d14f4ae4117086a 100644
--- a/doc/doc_ch/FAQ.md
+++ b/doc/doc_ch/FAQ.md
@@ -114,7 +114,7 @@ A: PGNet不需要字符级别的标注,NMS操作以及ROI操作。同时提出
(3)端到端统计:
端对端召回率:准确检测并正确识别文本行在全部标注文本行的占比;
端到端准确率:准确检测并正确识别文本行在 检测到的文本行数量 的占比;
-准确检测的标准是检测框与标注框的IOU大于某个阈值,正确识别的的检测框中的文本与标注的文本相同。
+准确检测的标准是检测框与标注框的IOU大于某个阈值,正确识别的检测框中的文本与标注的文本相同。
diff --git a/doc/doc_ch/PP-OCRv3_introduction.md b/doc/doc_ch/PP-OCRv3_introduction.md
new file mode 100644
index 0000000000000000000000000000000000000000..dc0271f294cf43a26477dbc974b77297e04122ac
--- /dev/null
+++ b/doc/doc_ch/PP-OCRv3_introduction.md
@@ -0,0 +1,130 @@
+[English](../doc_en/PP-OCRv3_introduction_en.md) | 简体中文
+
+# PP-OCRv3
+
+- [1. 简介](#1)
+- [2. 检测优化](#2)
+- [3. 识别优化](#3)
+- [4. 端到端评估](#4)
+
+
+
+
+## 1. 简介
+
+PP-OCRv3在PP-OCRv2的基础上进一步升级。检测模型仍然基于DB算法,优化策略采用了带残差注意力机制的FPN结构RSEFPN、增大感受野的PAN结构LKPAN、基于DML训练的更优的教师模型;识别模型将base模型从CRNN替换成了IJCAI 2022论文[SVTR](https://arxiv.org/abs/2205.00159),并采用SVTR轻量化、带指导训练CTC、数据增广策略RecConAug、自监督训练的更好的预训练模型、无标签数据的使用进行模型加速和效果提升。更多细节请参考PP-OCRv3[技术报告](./PP-OCRv3_introduction.md)。
+
+PP-OCRv3系统pipeline如下:
+
+
+

+
+
+
+## 2. 检测优化
+
+PP-OCRv3采用PP-OCRv2的[CML](https://arxiv.org/pdf/2109.03144.pdf)蒸馏策略,在蒸馏的student模型、teacher模型精度提升,CML蒸馏策略上分别做了优化。
+
+- 在蒸馏student模型精度提升方面,提出了基于残差结构的通道注意力模块RSEFPN(Residual Squeeze-and-Excitation FPN),用于提升student模型精度和召回。
+
+RSEFPN的网络结构如下图所示,RSEFPN在PP-OCRv2的FPN基础上,将FPN中的卷积层更换为了通道注意力结构的RSEConv层。
+
+
+

+
+
+RSEFPN将PP-OCR检测模型的精度hmean从81.3%提升到84.5%。模型大小从3M变为3.6M。
+
+*注:PP-OCRv2的FPN通道数仅为96和24,如果直接用SE模块代替FPN的卷积会导致精度下降,RSEConv引入残差结构可以防止训练中包含重要特征的通道被抑制。*
+
+- 在蒸馏的teacher模型精度提升方面,提出了LKPAN结构替换PP-OCRv2的FPN结构,并且使用ResNet50作为Backbone,更大的模型带来更多的精度提升。另外,对teacher模型使用[DML](https://arxiv.org/abs/1706.00384)蒸馏策略进一步提升teacher模型的精度。最终teacher的模型指标相比ppocr_server_v2.0从83.2%提升到了86.0%。
+
+*注:[PP-OCRv2的FPN结构](https://github.com/PaddlePaddle/PaddleOCR/blob/77acb3bfe51c8a46c684527f73cd218cefedb4a3/ppocr/modeling/necks/db_fpn.py#L107)对DB算法FPN结构做了轻量级设计*
+
+LKPAN的网络结构如下图所示:
+
+
+

+
+
+LKPAN(Large Kernel PAN)是一个具有更大感受野的轻量级[PAN](https://arxiv.org/pdf/1803.01534.pdf)结构。在LKPAN的path augmentation中,使用kernel size为`9*9`的卷积;更大的kernel size意味着更大的感受野,更容易检测大字体的文字以及极端长宽比的文字。LKPAN将PP-OCR检测模型的精度hmean从81.3%提升到84.9%。
+
+*注:LKPAN相比RSEFPN有更多的精度提升,但是考虑到模型大小和预测速度等因素,在student模型中使用RSEFPN。*
+
+采用上述策略,PP-OCRv3相比PP-OCRv2,hmean指标从83.3%提升到85.4%;预测速度从平均117ms/image变为124ms/image。
+
+PP-OCRv3检测模型消融实验如下:
+
+|序号|策略|模型大小|hmean|Intel Gold 6148CPU+mkldnn预测耗时|
+|-|-|-|-|-|
+|0|PP-OCR|3M|81.3%|117ms|
+|1|PP-OCRv2|3M|83.3%|117ms|
+|2|0 + RSEFPN|3.6M|84.5%|124ms|
+|3|0 + LKPAN|4.6M|84.9%|156ms|
+|4|ppocr_server_v2.0|124M|83.2%|171ms|
+|5|teacher + DML + LKPAN|124M|86.0%|396ms|
+|6|0 + 2 + 5 + CML|3.6M|85.4%|124ms|
+
+
+
+
+## 3. 识别优化
+
+[SVTR](https://arxiv.org/abs/2205.00159) 证明了强大的单视觉模型(无需序列模型)即可高效准确完成文本识别任务,在中英文数据上均有优秀的表现。经过实验验证,SVTR_Tiny在自建的 [中文数据集上](https://arxiv.org/abs/2109.03144) ,识别精度可以提升10.7%,网络结构如下所示:
+
+
+
+由于 MKLDNN 加速库支持的模型结构有限,SVTR 在CPU+MKLDNN上相比PP-OCRv2慢了10倍。
+
+PP-OCRv3 期望在提升模型精度的同时,不带来额外的推理耗时。通过分析发现,SVTR_Tiny结构的主要耗时模块为Mixing Block,因此我们对 SVTR_Tiny 的结构进行了一系列优化(详细速度数据请参考下方消融实验表格):
+
+1. 将SVTR网络前半部分替换为PP-LCNet的前三个stage,保留4个 Global Mixing Block ,精度为76%,加速69%,网络结构如下所示:
+
+2. 将4个 Global Attention Block 减小到2个,精度为72.9%,加速69%,网络结构如下所示:
+
+3. 实验发现 Global Attention 的预测速度与输入其特征的shape有关,因此后移Global Mixing Block的位置到池化层之后,精度下降为71.9%,速度超越 CNN-based 的PP-OCRv2 22%,网络结构如下所示:
+
+
+为了提升模型精度同时不引入额外推理成本,PP-OCRv3参考GTC策略,使用Attention监督CTC训练,预测时完全去除Attention模块,在推理阶段不增加任何耗时, 精度提升3.8%,训练流程如下所示:
+
+
+在训练策略方面,PP-OCRv3参考 [SSL](https://github.com/ku21fan/STR-Fewer-Labels) 设计了文本方向任务,训练了适用于文本识别的预训练模型,加速模型收敛过程,精度提升了0.6%; 使用UDML蒸馏策略,进一步提升精度1.5%,训练流程所示:
+
+
+
+
+数据增强方面:
+
+1. 基于 [ConCLR](https://www.cse.cuhk.edu.hk/~byu/papers/C139-AAAI2022-ConCLR.pdf) 中的ConAug方法,设计了 RecConAug 数据增强方法,增强数据多样性,精度提升0.5%,增强可视化效果如下所示:
+
+
+2. 使用训练好的 SVTR_large 预测 120W 的 lsvt 无标注数据,取出其中得分大于0.95的数据,共得到81W识别数据加入到PP-OCRv3的训练数据中,精度提升1%。
+
+总体来讲PP-OCRv3识别从网络结构、训练策略、数据增强三个方向做了进一步优化:
+
+- 网络结构上:考虑[SVTR](https://arxiv.org/abs/2205.00159) 在中英文效果上的优越性,采用SVTR_Tiny作为base,选取Global Mixing Block和卷积组合提取特征,并将Global Mixing Block位置后移进行加速; 参考 [GTC](https://arxiv.org/pdf/2002.01276.pdf) 策略,使用注意力机制模块指导CTC训练,定位和识别字符,提升不规则文本的识别精度。
+- 训练策略上:参考 [SSL](https://github.com/ku21fan/STR-Fewer-Labels) 设计了方向分类前序任务,获取更优预训练模型,加速模型收敛过程,提升精度; 使用UDML蒸馏策略、监督attention、ctc两个分支得到更优模型。
+- 数据增强上:基于 [ConCLR](https://www.cse.cuhk.edu.hk/~byu/papers/C139-AAAI2022-ConCLR.pdf) 中的ConAug方法,改进得到 RecConAug 数据增广方法,支持随机结合任意多张图片,提升训练数据的上下文信息丰富度,增强模型鲁棒性;使用 SVTR_large 预测无标签数据,向训练集中补充81w高质量真实数据。
+
+基于上述策略,PP-OCRv3识别模型相比PP-OCRv2,在速度可比的情况下,精度进一步提升4.5%。 具体消融实验如下所示:
+
+实验细节:
+
+| id | 策略 | 模型大小 | 精度 | 速度(cpu + mkldnn)|
+|-----|-----|--------|----| --- |
+| 01 | PP-OCRv2 | 8M | 69.3% | 8.54ms |
+| 02 | SVTR_Tiny | 21M | 80.1% | 97ms |
+| 03 | LCNet_SVTR_G4 | 9.2M | 76% | 30ms |
+| 04 | LCNet_SVTR_G2 | 13M | 72.98% | 9.37ms |
+| 05 | PP-OCRv3 | 12M | 71.9% | 6.6ms |
+| 06 | + large input_shape | 12M | 73.98% | 7.6ms |
+| 07 | + GTC | 12M | 75.8% | 7.6ms |
+| 08 | + RecConAug | 12M | 76.3% | 7.6ms |
+| 09 | + SSL pretrain | 12M | 76.9% | 7.6ms |
+| 10 | + UDML | 12M | 78.4% | 7.6ms |
+| 11 | + unlabeled data | 12M | 79.4% | 7.6ms |
+
+注: 测试速度时,实验01-05输入图片尺寸均为(3,32,320),06-11输入图片尺寸均为(3,48,320)
+
+
+## 4. 端到端评估
diff --git a/doc/doc_ch/add_new_algorithm.md b/doc/doc_ch/add_new_algorithm.md
index 79c29249dd7dd0b25ffa7625d11ed2378bfafec4..bb97e00aa66771e99feb799dc4945f3d13b5b247 100644
--- a/doc/doc_ch/add_new_algorithm.md
+++ b/doc/doc_ch/add_new_algorithm.md
@@ -246,7 +246,7 @@ class MyMetric(object):
def get_metric(self):
"""
- return metircs {
+ return metrics {
'acc': 0,
'norm_edit_dis': 0,
}
diff --git a/doc/doc_ch/algorithm_det_east.md b/doc/doc_ch/algorithm_det_east.md
new file mode 100644
index 0000000000000000000000000000000000000000..b89018e3468aa7772af69da469e81c16e9d43dc9
--- /dev/null
+++ b/doc/doc_ch/algorithm_det_east.md
@@ -0,0 +1,96 @@
+# EAST
+
+- [1. 算法简介](#1)
+- [2. 环境配置](#2)
+- [3. 模型训练、评估、预测](#3)
+ - [3.1 训练](#3-1)
+ - [3.2 评估](#3-2)
+ - [3.3 预测](#3-3)
+- [4. 推理部署](#4)
+ - [4.1 Python推理](#4-1)
+ - [4.2 C++推理](#4-2)
+ - [4.3 Serving服务化部署](#4-3)
+ - [4.4 更多推理部署](#4-4)
+- [5. FAQ](#5)
+
+
+## 1. 算法简介
+
+论文信息:
+> [EAST: An Efficient and Accurate Scene Text Detector](https://arxiv.org/abs/1704.03155)
+> Xinyu Zhou, Cong Yao, He Wen, Yuzhi Wang, Shuchang Zhou, Weiran He, Jiajun Liang
+> CVPR, 2017
+
+
+在ICDAR2015文本检测公开数据集上,算法复现效果如下:
+
+|模型|骨干网络|配置文件|precision|recall|Hmean|下载链接|
+| --- | --- | --- | --- | --- | --- | --- |
+|EAST|ResNet50_vd|[configs/det/det_r50_vd_east.yml](../../configs/det/det_r50_vd_east.yml)|88.71%|81.36%|84.88%|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_east_v2.0_train.tar)|
+|EAST|MobileNetV3|[configs/det/det_mv3_east.yml](../../configs/det/det_mv3_east.yml)|78.2%|79.1%|78.65%|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_east_v2.0_train.tar)|
+
+
+
+## 2. 环境配置
+请先参考[《运行环境准备》](./environment.md)配置PaddleOCR运行环境,参考[《项目克隆》](./clone.md)克隆项目代码。
+
+
+
+## 3. 模型训练、评估、预测
+
+上表中的EAST训练模型使用ICDAR2015文本检测公开数据集训练得到,数据集下载可参考 [ocr_datasets](./dataset/ocr_datasets.md)。
+
+数据下载完成后,请参考[文本检测训练教程](./detection.md)进行训练。PaddleOCR对代码进行了模块化,训练不同的检测模型只需要**更换配置文件**即可。
+
+
+
+## 4. 推理部署
+
+
+### 4.1 Python推理
+
+首先将EAST文本检测训练过程中保存的模型,转换成inference model。以基于Resnet50_vd骨干网络,在ICDAR2015英文数据集训练的模型为例([训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_east_v2.0_train.tar)),可以使用如下命令进行转换:
+
+```shell
+python3 tools/export_model.py -c configs/det/det_r50_vd_east.yml -o Global.pretrained_model=./det_r50_vd_east_v2.0_train/best_accuracy Global.save_inference_dir=./inference/det_r50_east/
+```
+
+EAST文本检测模型推理,需要设置参数`--det_algorithm="EAST"`,执行预测:
+```shell
+python3 tools/infer/predict_det.py --image_dir="./doc/imgs_en/img_10.jpg" --det_model_dir="./inference/det_r50_east/" --det_algorithm="EAST"
+```
+
+可视化文本检测结果默认保存到`./inference_results`文件夹里面,结果文件的名称前缀为'det_res'。
+
+
+
+
+### 4.2 C++推理
+
+由于后处理暂未使用CPP编写,EAST文本检测模型暂不支持CPP推理。
+
+
+### 4.3 Serving服务化部署
+
+暂未支持
+
+
+### 4.4 更多推理部署
+
+暂未支持
+
+
+## 5. FAQ
+
+
+## 引用
+
+```bibtex
+@inproceedings{zhou2017east,
+ title={East: an efficient and accurate scene text detector},
+ author={Zhou, Xinyu and Yao, Cong and Wen, He and Wang, Yuzhi and Zhou, Shuchang and He, Weiran and Liang, Jiajun},
+ booktitle={Proceedings of the IEEE conference on Computer Vision and Pattern Recognition},
+ pages={5551--5560},
+ year={2017}
+}
+```
diff --git a/doc/doc_ch/algorithm_det_sast.md b/doc/doc_ch/algorithm_det_sast.md
new file mode 100644
index 0000000000000000000000000000000000000000..038d73fc15f3203bbcc17997c1a8e1c208f80ba8
--- /dev/null
+++ b/doc/doc_ch/algorithm_det_sast.md
@@ -0,0 +1,115 @@
+# SAST
+
+- [1. 算法简介](#1)
+- [2. 环境配置](#2)
+- [3. 模型训练、评估、预测](#3)
+ - [3.1 训练](#3-1)
+ - [3.2 评估](#3-2)
+ - [3.3 预测](#3-3)
+- [4. 推理部署](#4)
+ - [4.1 Python推理](#4-1)
+ - [4.2 C++推理](#4-2)
+ - [4.3 Serving服务化部署](#4-3)
+ - [4.4 更多推理部署](#4-4)
+- [5. FAQ](#5)
+
+
+## 1. 算法简介
+
+论文信息:
+> [A Single-Shot Arbitrarily-Shaped Text Detector based on Context Attended Multi-Task Learning](https://arxiv.org/abs/1908.05498)
+> Wang, Pengfei and Zhang, Chengquan and Qi, Fei and Huang, Zuming and En, Mengyi and Han, Junyu and Liu, Jingtuo and Ding, Errui and Shi, Guangming
+> ACM MM, 2019
+
+在ICDAR2015文本检测公开数据集上,算法复现效果如下:
+
+|模型|骨干网络|配置文件|precision|recall|Hmean|下载链接|
+| --- | --- | --- | --- | --- | --- | --- |
+|SAST|ResNet50_vd|[configs/det/det_r50_vd_sast_icdar15.yml](../../configs/det/det_r50_vd_sast_icdar15.yml)|91.39%|83.77%|87.42%|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_sast_icdar15_v2.0_train.tar)|
+
+
+在Total-text文本检测公开数据集上,算法复现效果如下:
+
+|模型|骨干网络|配置文件|precision|recall|Hmean|下载链接|
+| --- | --- | --- | --- | --- | --- | --- |
+|SAST|ResNet50_vd|[configs/det/det_r50_vd_sast_totaltext.yml](../../configs/det/det_r50_vd_sast_totaltext.yml)|89.63%|78.44%|83.66%|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_sast_totaltext_v2.0_train.tar)|
+
+
+
+## 2. 环境配置
+请先参考[《运行环境准备》](./environment.md)配置PaddleOCR运行环境,参考[《项目克隆》](./clone.md)克隆项目代码。
+
+
+
+## 3. 模型训练、评估、预测
+
+请参考[文本检测训练教程](./detection.md)。PaddleOCR对代码进行了模块化,训练不同的检测模型只需要**更换配置文件**即可。
+
+
+
+## 4. 推理部署
+
+
+### 4.1 Python推理
+#### (1). 四边形文本检测模型(ICDAR2015)
+首先将SAST文本检测训练过程中保存的模型,转换成inference model。以基于Resnet50_vd骨干网络,在ICDAR2015英文数据集训练的模型为例([模型下载地址](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_sast_icdar15_v2.0_train.tar)),可以使用如下命令进行转换:
+```
+python3 tools/export_model.py -c configs/det/det_r50_vd_sast_icdar15.yml -o Global.pretrained_model=./det_r50_vd_sast_icdar15_v2.0_train/best_accuracy Global.save_inference_dir=./inference/det_sast_ic15
+
+```
+**SAST文本检测模型推理,需要设置参数`--det_algorithm="SAST"`**,可以执行如下命令:
+```
+python3 tools/infer/predict_det.py --det_algorithm="SAST" --image_dir="./doc/imgs_en/img_10.jpg" --det_model_dir="./inference/det_sast_ic15/"
+```
+可视化文本检测结果默认保存到`./inference_results`文件夹里面,结果文件的名称前缀为'det_res'。结果示例如下:
+
+
+
+#### (2). 弯曲文本检测模型(Total-Text)
+首先将SAST文本检测训练过程中保存的模型,转换成inference model。以基于Resnet50_vd骨干网络,在Total-Text英文数据集训练的模型为例([模型下载地址](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_sast_totaltext_v2.0_train.tar)),可以使用如下命令进行转换:
+
+```
+python3 tools/export_model.py -c configs/det/det_r50_vd_sast_totaltext.yml -o Global.pretrained_model=./det_r50_vd_sast_totaltext_v2.0_train/best_accuracy Global.save_inference_dir=./inference/det_sast_tt
+
+```
+
+SAST文本检测模型推理,需要设置参数`--det_algorithm="SAST"`,同时,还需要增加参数`--det_sast_polygon=True`,可以执行如下命令:
+```
+python3 tools/infer/predict_det.py --det_algorithm="SAST" --image_dir="./doc/imgs_en/img623.jpg" --det_model_dir="./inference/det_sast_tt/" --det_sast_polygon=True
+```
+可视化文本检测结果默认保存到`./inference_results`文件夹里面,结果文件的名称前缀为'det_res'。结果示例如下:
+
+
+
+**注意**:本代码库中,SAST后处理Locality-Aware NMS有python和c++两种版本,c++版速度明显快于python版。由于c++版本nms编译版本问题,只有python3.5环境下会调用c++版nms,其他情况将调用python版nms。
+
+
+### 4.2 C++推理
+
+暂未支持
+
+
+### 4.3 Serving服务化部署
+
+暂未支持
+
+
+### 4.4 更多推理部署
+
+暂未支持
+
+
+## 5. FAQ
+
+
+## 引用
+
+```bibtex
+@inproceedings{wang2019single,
+ title={A Single-Shot Arbitrarily-Shaped Text Detector based on Context Attended Multi-Task Learning},
+ author={Wang, Pengfei and Zhang, Chengquan and Qi, Fei and Huang, Zuming and En, Mengyi and Han, Junyu and Liu, Jingtuo and Ding, Errui and Shi, Guangming},
+ booktitle={Proceedings of the 27th ACM International Conference on Multimedia},
+ pages={1277--1285},
+ year={2019}
+}
+```
diff --git a/doc/doc_ch/algorithm_inference.md b/doc/doc_ch/algorithm_inference.md
index 0222dec85a0be0973b2546cbb6e5347852242093..d8457ea364d9875d9f0796283d42c0b4b10aaa53 100755
--- a/doc/doc_ch/algorithm_inference.md
+++ b/doc/doc_ch/algorithm_inference.md
@@ -296,7 +296,7 @@ Predicts of ./doc/imgs_words_en/word_336.png:('super', 0.9999073)
**注意**:由于上述模型是参考[DTRB](https://arxiv.org/abs/1904.01906)文本识别训练和评估流程,与超轻量级中文识别模型训练有两方面不同:
-- 训练时采用的图像分辨率不同,训练上述模型采用的图像分辨率是[3,32,100],而中文模型训练时,为了保证长文本的识别效果,训练时采用的图像分辨率是[3, 32, 320]。预测推理程序默认的的形状参数是训练中文采用的图像分辨率,即[3, 32, 320]。因此,这里推理上述英文模型时,需要通过参数rec_image_shape设置识别图像的形状。
+- 训练时采用的图像分辨率不同,训练上述模型采用的图像分辨率是[3,32,100],而中文模型训练时,为了保证长文本的识别效果,训练时采用的图像分辨率是[3, 32, 320]。预测推理程序默认的形状参数是训练中文采用的图像分辨率,即[3, 32, 320]。因此,这里推理上述英文模型时,需要通过参数rec_image_shape设置识别图像的形状。
- 字符列表,DTRB论文中实验只是针对26个小写英文本母和10个数字进行实验,总共36个字符。所有大小字符都转成了小写字符,不在上面列表的字符都忽略,认为是空格。因此这里没有输入字符字典,而是通过如下命令生成字典.因此在推理时需要设置参数rec_char_dict_path,指定为英文字典"./ppocr/utils/ic15_dict.txt"。
diff --git a/doc/doc_ch/algorithm_rec_crnn.md b/doc/doc_ch/algorithm_rec_crnn.md
new file mode 100644
index 0000000000000000000000000000000000000000..490485ec10311b63032fadad599d764ea67e68f9
--- /dev/null
+++ b/doc/doc_ch/algorithm_rec_crnn.md
@@ -0,0 +1,140 @@
+# CRNN
+
+- [1. 算法简介](#1)
+- [2. 环境配置](#2)
+- [3. 模型训练、评估、预测](#3)
+ - [3.1 训练](#3-1)
+ - [3.2 评估](#3-2)
+ - [3.3 预测](#3-3)
+- [4. 推理部署](#4)
+ - [4.1 Python推理](#4-1)
+ - [4.2 C++推理](#4-2)
+ - [4.3 Serving服务化部署](#4-3)
+ - [4.4 更多推理部署](#4-4)
+- [5. FAQ](#5)
+
+
+## 1. 算法简介
+
+论文信息:
+> [An End-to-End Trainable Neural Network for Image-based Sequence Recognition and Its Application to Scene Text Recognition](https://arxiv.org/abs/1507.05717)
+
+> Baoguang Shi, Xiang Bai, Cong Yao
+
+> IEEE, 2015
+
+参考[DTRB](https://arxiv.org/abs/1904.01906) 文字识别训练和评估流程,使用MJSynth和SynthText两个文字识别数据集训练,在IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE数据集上进行评估,算法效果如下:
+
+|模型|骨干网络|Avg Accuracy|配置文件|下载链接|
+|---|---|---|---|---|
+|CRNN|Resnet34_vd|81.04%|[configs/rec/rec_r34_vd_none_bilstm_ctc.yml](../../configs/rec/rec_r34_vd_none_bilstm_ctc.yml)|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r34_vd_none_bilstm_ctc_v2.0_train.tar)|
+|CRNN|MobileNetV3|77.95%|[configs/rec/rec_mv3_none_bilstm_ctc.yml](../../configs/rec/rec_mv3_none_bilstm_ctc.yml)|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_mv3_none_bilstm_ctc_v2.0_train.tar)|
+
+
+
+## 2. 环境配置
+请先参考[《运行环境准备》](./environment.md)配置PaddleOCR运行环境,参考[《项目克隆》](./clone.md)克隆项目代码。
+
+
+
+## 3. 模型训练、评估、预测
+
+请参考[文本识别训练教程](./recognition.md)。PaddleOCR对代码进行了模块化,训练不同的识别模型只需要**更换配置文件**即可。
+
+- 训练
+
+在完成数据准备后,便可以启动训练,训练命令如下:
+
+```
+#单卡训练(训练周期长,不建议)
+python3 tools/train.py -c configs/rec/rec_r34_vd_none_bilstm_ctc.yml
+
+#多卡训练,通过--gpus参数指定卡号
+python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_r34_vd_none_bilstm_ctc.yml
+
+```
+
+- 评估
+
+```
+# GPU 评估, Global.pretrained_model 为待测权重
+python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_r34_vd_none_bilstm_ctc.yml -o Global.pretrained_model={path/to/weights}/best_accuracy
+```
+
+- 预测:
+
+```
+# 预测使用的配置文件必须与训练一致
+python3 tools/infer_rec.py -c configs/rec/rec_r34_vd_none_bilstm_ctc.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.infer_img=doc/imgs_words/en/word_1.png
+```
+
+
+## 4. 推理部署
+
+
+### 4.1 Python推理
+
+首先将 CRNN 文本识别训练过程中保存的模型,转换成inference model。以基于Resnet34_vd骨干网络,使用MJSynth和SynthText两个英文文本识别合成数据集训练的[模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r34_vd_none_bilstm_ctc_v2.0_train.tar) 为例,可以使用如下命令进行转换:
+```shell
+python3 tools/export_model.py -c configs/rec/rec_r34_vd_none_bilstm_ctc.yml -o Global.pretrained_model=./rec_r34_vd_none_bilstm_ctc_v2.0_train/best_accuracy Global.save_inference_dir=./inference/rec_crnn
+```
+CRNN 文本识别模型推理,可以执行如下命令:
+
+```shell
+python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words_en/word_336.png" --rec_model_dir="./inference/rec_crnn/" --rec_image_shape="3, 32, 100" --rec_char_dict_path="./ppocr/utils/ic15_dict.txt"
+```
+
+
+
+执行命令后,上面图像的识别结果如下:
+
+```bash
+Predicts of ./doc/imgs_words_en/word_336.png:('super', 0.9999073)
+```
+
+**注意**:由于上述模型是参考[DTRB](https://arxiv.org/abs/1904.01906)文本识别训练和评估流程,与超轻量级中文识别模型训练有两方面不同:
+
+- 训练时采用的图像分辨率不同,训练上述模型采用的图像分辨率是[3,32,100],而中文模型训练时,为了保证长文本的识别效果,训练时采用的图像分辨率是[3, 32, 320]。预测推理程序默认的形状参数是训练中文采用的图像分辨率,即[3, 32, 320]。因此,这里推理上述英文模型时,需要通过参数rec_image_shape设置识别图像的形状。
+
+- 字符列表,DTRB论文中实验只针对26个小写英文字母和10个数字,总共36个字符。所有大小写字符都转成了小写字符,不在上述列表中的字符都被忽略,认为是空格。因此这里没有输入字符字典,而是通过如下代码生成字典,在推理时需要设置参数rec_char_dict_path,指定为英文字典"./ppocr/utils/ic15_dict.txt"。
+
+```
+self.character_str = "0123456789abcdefghijklmnopqrstuvwxyz"
+dict_character = list(self.character_str)
+```
+
+
+
+### 4.2 C++推理
+
+准备好推理模型后,参考[cpp infer](../../deploy/cpp_infer/)教程进行操作即可。
+
+
+### 4.3 Serving服务化部署
+
+准备好推理模型后,参考[pdserving](../../deploy/pdserving/)教程进行Serving服务化部署,包括Python Serving和C++ Serving两种模式。
+
+
+### 4.4 更多推理部署
+
+CRNN模型还支持以下推理部署方式:
+
+- Paddle2ONNX推理:准备好推理模型后,参考[paddle2onnx](../../deploy/paddle2onnx/)教程操作。
+
+
+## 5. FAQ
+
+
+## 引用
+
+```bibtex
+@ARTICLE{7801919,
+ author={Shi, Baoguang and Bai, Xiang and Yao, Cong},
+ journal={IEEE Transactions on Pattern Analysis and Machine Intelligence},
+ title={An End-to-End Trainable Neural Network for Image-Based Sequence Recognition and Its Application to Scene Text Recognition},
+ year={2017},
+ volume={39},
+ number={11},
+ pages={2298-2304},
+ doi={10.1109/TPAMI.2016.2646371}}
+```
diff --git a/doc/doc_ch/algorithm_rec_nrtr.md b/doc/doc_ch/algorithm_rec_nrtr.md
new file mode 100644
index 0000000000000000000000000000000000000000..d3b626d0241a6c36797cba20a58f007e383c7aee
--- /dev/null
+++ b/doc/doc_ch/algorithm_rec_nrtr.md
@@ -0,0 +1,154 @@
+# 场景文本识别算法-NRTR
+
+- [1. 算法简介](#1)
+- [2. 环境配置](#2)
+- [3. 模型训练、评估、预测](#3)
+ - [3.1 训练](#3-1)
+ - [3.2 评估](#3-2)
+ - [3.3 预测](#3-3)
+- [4. 推理部署](#4)
+ - [4.1 Python推理](#4-1)
+ - [4.2 C++推理](#4-2)
+ - [4.3 Serving服务化部署](#4-3)
+ - [4.4 更多推理部署](#4-4)
+- [5. FAQ](#5)
+
+
+## 1. 算法简介
+
+论文信息:
+> [NRTR: A No-Recurrence Sequence-to-Sequence Model For Scene Text Recognition](https://arxiv.org/abs/1806.00926)
+> Fenfen Sheng and Zhineng Chen and Bo Xu
+> ICDAR, 2019
+
+
+
+`NRTR`使用MJSynth和SynthText两个文字识别数据集训练,在IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE数据集上进行评估,算法复现效果如下:
+
+|模型|骨干网络|配置文件|Acc|下载链接|
+| --- | --- | --- | --- | --- |
+|NRTR|MTB|[rec_mtb_nrtr.yml](../../configs/rec/rec_mtb_nrtr.yml)|84.21%|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_mtb_nrtr_train.tar)|
+
+
+## 2. 环境配置
+请先参考[《运行环境准备》](./environment.md)配置PaddleOCR运行环境,参考[《项目克隆》](./clone.md)克隆项目代码。
+
+
+
+## 3. 模型训练、评估、预测
+
+
+### 3.1 模型训练
+
+请参考[文本识别训练教程](./recognition.md)。PaddleOCR对代码进行了模块化,训练`NRTR`识别模型时需要**更换配置文件**为`NRTR`的[配置文件](../../configs/rec/rec_mtb_nrtr.yml)。
+
+#### 启动训练
+
+
+具体地,在完成数据准备后,便可以启动训练,训练命令如下:
+```shell
+#单卡训练(训练周期长,不建议)
+python3 tools/train.py -c configs/rec/rec_mtb_nrtr.yml
+
+#多卡训练,通过--gpus参数指定卡号
+python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_mtb_nrtr.yml
+```
+
+
+### 3.2 评估
+
+可下载已训练完成的[模型文件](#model),使用如下命令进行评估:
+
+```shell
+# 注意将pretrained_model的路径设置为本地路径。
+python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_mtb_nrtr.yml -o Global.pretrained_model=./rec_mtb_nrtr_train/best_accuracy
+```
+
+
+### 3.3 预测
+
+使用如下命令进行单张图片预测:
+```shell
+# 注意将pretrained_model的路径设置为本地路径。
+python3 tools/infer_rec.py -c configs/rec/rec_mtb_nrtr.yml -o Global.infer_img='./doc/imgs_words_en/word_10.png' Global.pretrained_model=./rec_mtb_nrtr_train/best_accuracy
+# 预测文件夹下所有图像时,可修改infer_img为文件夹,如 Global.infer_img='./doc/imgs_words_en/'。
+```
+
+
+
+## 4. 推理部署
+
+
+### 4.1 Python推理
+首先将训练得到best模型,转换成inference model。这里以训练完成的模型为例([模型下载地址](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_mtb_nrtr_train.tar) ),可以使用如下命令进行转换:
+
+```shell
+# 注意将pretrained_model的路径设置为本地路径。
+python3 tools/export_model.py -c configs/rec/rec_mtb_nrtr.yml -o Global.pretrained_model=./rec_mtb_nrtr_train/best_accuracy Global.save_inference_dir=./inference/rec_mtb_nrtr/
+```
+**注意:**
+- 如果您是在自己的数据集上训练的模型,并且调整了字典文件,请注意修改配置文件中的`character_dict_path`是否是所需要的字典文件。
+- 如果您修改了训练时的输入大小,请修改`tools/export_model.py`文件中的对应NRTR的`infer_shape`。
+
+转换成功后,在目录下有三个文件:
+```
+/inference/rec_mtb_nrtr/
+ ├── inference.pdiparams # 识别inference模型的参数文件
+ ├── inference.pdiparams.info # 识别inference模型的参数信息,可忽略
+ └── inference.pdmodel # 识别inference模型的program文件
+```
+
+执行如下命令进行模型推理:
+
+```shell
+python3 tools/infer/predict_rec.py --image_dir='./doc/imgs_words_en/word_10.png' --rec_model_dir='./inference/rec_mtb_nrtr/' --rec_algorithm='NRTR' --rec_image_shape='1,32,100' --rec_char_dict_path='./ppocr/utils/EN_symbol_dict.txt'
+# 预测文件夹下所有图像时,可修改image_dir为文件夹,如 --image_dir='./doc/imgs_words_en/'。
+```
+
+
+
+执行命令后,上面图像的预测结果(识别的文本和得分)会打印到屏幕上,示例如下:
+```shell
+Predicts of ./doc/imgs_words_en/word_10.png:('pain', 0.9265879392623901)
+```
+
+**注意**:
+
+- 训练上述模型采用的图像分辨率是[1,32,100],需要通过参数`rec_image_shape`设置为您训练时的识别图像形状。
+- 在推理时需要设置参数`rec_char_dict_path`指定字典,如果您修改了字典,请修改该参数为您的字典文件。
+- 如果您修改了预处理方法,需修改`tools/infer/predict_rec.py`中NRTR的预处理为您的预处理方法。
+
+
+
+### 4.2 C++推理部署
+
+由于NRTR的C++预处理与后处理尚未实现,暂不支持C++推理。
+
+
+### 4.3 Serving服务化部署
+
+暂不支持
+
+
+### 4.4 更多推理部署
+
+暂不支持
+
+
+## 5. FAQ
+
+1. `NRTR`论文中使用Beam搜索进行解码字符,但是速度较慢,这里默认未使用Beam搜索,以贪婪搜索进行解码字符。
+
+## 引用
+
+```bibtex
+@article{Sheng2019NRTR,
+ title = {NRTR: A No-Recurrence Sequence-to-Sequence Model For Scene Text Recognition},
+ author = {Fenfen Sheng and Zhineng Chen and Bo Xu},
+ booktitle = {ICDAR},
+ year = {2019},
+ url = {http://arxiv.org/abs/1806.00926},
+ pages = {781-786}
+}
+```
diff --git a/doc/doc_ch/algorithm_rec_rare.md b/doc/doc_ch/algorithm_rec_rare.md
new file mode 100644
index 0000000000000000000000000000000000000000..dddd27ef9880fc7f8735f32fa85a406a57a77c8e
--- /dev/null
+++ b/doc/doc_ch/algorithm_rec_rare.md
@@ -0,0 +1,121 @@
+# RARE
+
+- [1. 算法简介](#1)
+- [2. 环境配置](#2)
+- [3. 模型训练、评估、预测](#3)
+ - [3.1 训练](#3-1)
+ - [3.2 评估](#3-2)
+ - [3.3 预测](#3-3)
+- [4. 推理部署](#4)
+ - [4.1 Python推理](#4-1)
+ - [4.2 C++推理](#4-2)
+ - [4.3 Serving服务化部署](#4-3)
+ - [4.4 更多推理部署](#4-4)
+- [5. FAQ](#5)
+
+
+## 1. 算法简介
+
+论文信息:
+> [Robust Scene Text Recognition with Automatic Rectification](https://arxiv.org/abs/1603.03915v2)
+> Baoguang Shi, Xinggang Wang, Pengyuan Lyu, Cong Yao, Xiang Bai
+> CVPR, 2016
+
+使用MJSynth和SynthText两个文字识别数据集训练,在IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE数据集上进行评估,算法复现效果如下:
+
+|模型|骨干网络|配置文件|Avg Accuracy|下载链接|
+| --- | --- | --- | --- | --- |
+|RARE|Resnet34_vd|[configs/rec/rec_r34_vd_tps_bilstm_att.yml](../../configs/rec/rec_r34_vd_tps_bilstm_att.yml)|83.6%|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r34_vd_tps_bilstm_att_v2.0_train.tar)|
+|RARE|MobileNetV3|[configs/rec/rec_mv3_tps_bilstm_att.yml](../../configs/rec/rec_mv3_tps_bilstm_att.yml)|82.5%|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_mv3_tps_bilstm_att_v2.0_train.tar)|
+
+
+
+## 2. 环境配置
+请先参考[《运行环境准备》](./environment.md)配置PaddleOCR运行环境,参考[《项目克隆》](./clone.md)克隆项目代码。
+
+
+## 3. 模型训练、评估、预测
+
+请参考[文本识别训练教程](./recognition.md)。PaddleOCR对代码进行了模块化,训练不同的识别模型只需要**更换配置文件**即可。以基于Resnet34_vd骨干网络为例:
+
+
+### 3.1 训练
+
+```
+#单卡训练(训练周期长,不建议)
+python3 tools/train.py -c configs/rec/rec_r34_vd_tps_bilstm_att.yml
+#多卡训练,通过--gpus参数指定卡号
+python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_r34_vd_tps_bilstm_att.yml
+```
+
+
+### 3.2 评估
+
+```
+# GPU评估, Global.pretrained_model为待评估模型
+python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_r34_vd_tps_bilstm_att.yml -o Global.pretrained_model={path/to/weights}/best_accuracy
+```
+
+
+### 3.3 预测
+
+```
+python3 tools/infer_rec.py -c configs/rec/rec_r34_vd_tps_bilstm_att.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.infer_img=doc/imgs_words/en/word_1.png
+```
+
+
+## 4. 推理部署
+
+
+### 4.1 Python推理
+首先将RARE文本识别训练过程中保存的模型,转换成inference model。以基于Resnet34_vd骨干网络,在MJSynth和SynthText两个文字识别数据集训练得到的模型为例( [模型下载地址](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r34_vd_tps_bilstm_att_v2.0_train.tar) ),可以使用如下命令进行转换:
+
+```shell
+python3 tools/export_model.py -c configs/rec/rec_r34_vd_tps_bilstm_att.yml -o Global.pretrained_model=./rec_r34_vd_tps_bilstm_att_v2.0_train/best_accuracy Global.save_inference_dir=./inference/rec_rare
+```
+
+RARE文本识别模型推理,可以执行如下命令:
+
+```shell
+python3 tools/infer/predict_rec.py --image_dir="doc/imgs_words/en/word_1.png" --rec_model_dir="./inference/rec_rare/" --rec_image_shape="3, 32, 100" --rec_char_dict_path="./ppocr/utils/ic15_dict.txt"
+```
+推理结果如下所示:
+
+
+
+```
+Predicts of doc/imgs_words/en/word_1.png:('joint ', 0.9999969601631165)
+```
+
+
+
+### 4.2 C++推理
+
+暂不支持
+
+
+### 4.3 Serving服务化部署
+
+暂不支持
+
+
+### 4.4 更多推理部署
+
+RARE模型还支持以下推理部署方式:
+
+- Paddle2ONNX推理:准备好推理模型后,参考[paddle2onnx](../../deploy/paddle2onnx/)教程操作。
+
+
+## 5. FAQ
+
+
+## 引用
+
+```bibtex
+@inproceedings{2016Robust,
+ title={Robust Scene Text Recognition with Automatic Rectification},
+  author={Shi, Baoguang and Wang, Xinggang and Lyu, Pengyuan and Yao, Cong and Bai, Xiang},
+ booktitle={2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
+ year={2016},
+}
+```
diff --git a/doc/doc_ch/algorithm_rec_rosetta.md b/doc/doc_ch/algorithm_rec_rosetta.md
new file mode 100644
index 0000000000000000000000000000000000000000..c1784cac8c073eaaffc0258df2c419239332bb65
--- /dev/null
+++ b/doc/doc_ch/algorithm_rec_rosetta.md
@@ -0,0 +1,123 @@
+# Rosetta
+
+- [1. 算法简介](#1)
+- [2. 环境配置](#2)
+- [3. 模型训练、评估、预测](#3)
+ - [3.1 训练](#3-1)
+ - [3.2 评估](#3-2)
+ - [3.3 预测](#3-3)
+- [4. 推理部署](#4)
+ - [4.1 Python推理](#4-1)
+ - [4.2 C++推理](#4-2)
+ - [4.3 Serving服务化部署](#4-3)
+ - [4.4 更多推理部署](#4-4)
+- [5. FAQ](#5)
+
+
+## 1. 算法简介
+
+论文信息:
+> [Rosetta: Large Scale System for Text Detection and Recognition in Images](https://arxiv.org/abs/1910.05085)
+> Borisyuk F , Gordo A , V Sivakumar
+> KDD, 2018
+
+使用MJSynth和SynthText两个文字识别数据集训练,在IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE数据集上进行评估, 算法复现效果如下:
+
+|模型|骨干网络|配置文件|Avg Accuracy|下载链接|
+| --- | --- | --- | --- | --- |
+|Rosetta|Resnet34_vd|[configs/rec/rec_r34_vd_none_none_ctc.yml](../../configs/rec/rec_r34_vd_none_none_ctc.yml)|79.11%|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r34_vd_none_none_ctc_v2.0_train.tar)|
+|Rosetta|MobileNetV3|[configs/rec/rec_mv3_none_none_ctc.yml](../../configs/rec/rec_mv3_none_none_ctc.yml)|75.80%|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_mv3_none_none_ctc_v2.0_train.tar)|
+
+
+
+## 2. 环境配置
+请先参考[《运行环境准备》](./environment.md)配置PaddleOCR运行环境,参考[《项目克隆》](./clone.md)克隆项目代码。
+
+
+
+## 3. 模型训练、评估、预测
+
+请参考[文本识别训练教程](./recognition.md)。PaddleOCR对代码进行了模块化,训练不同的识别模型只需要**更换配置文件**即可。 以基于Resnet34_vd骨干网络为例:
+
+
+### 3.1 训练
+
+```
+#单卡训练(训练周期长,不建议)
+python3 tools/train.py -c configs/rec/rec_r34_vd_none_none_ctc.yml
+#多卡训练,通过--gpus参数指定卡号
+python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_r34_vd_none_none_ctc.yml
+```
+
+
+### 3.2 评估
+
+```
+# GPU评估, Global.pretrained_model为待评估模型
+python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_r34_vd_none_none_ctc.yml -o Global.pretrained_model={path/to/weights}/best_accuracy
+```
+
+
+### 3.3 预测
+
+```
+python3 tools/infer_rec.py -c configs/rec/rec_r34_vd_none_none_ctc.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.infer_img=doc/imgs_words/en/word_1.png
+```
+
+
+
+## 4. 推理部署
+
+
+### 4.1 Python推理
+首先将Rosetta文本识别训练过程中保存的模型,转换成inference model。以基于Resnet34_vd骨干网络,在MJSynth和SynthText两个文字识别数据集训练得到的模型为例( [模型下载地址](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r34_vd_none_none_ctc_v2.0_train.tar) ),可以使用如下命令进行转换:
+
+```shell
+python3 tools/export_model.py -c configs/rec/rec_r34_vd_none_none_ctc.yml -o Global.pretrained_model=./rec_r34_vd_none_none_ctc_v2.0_train/best_accuracy Global.save_inference_dir=./inference/rec_rosetta
+```
+
+Rosetta文本识别模型推理,可以执行如下命令:
+
+```shell
+python3 tools/infer/predict_rec.py --image_dir="doc/imgs_words/en/word_1.png" --rec_model_dir="./inference/rec_rosetta/" --rec_image_shape="3, 32, 100" --rec_char_dict_path="./ppocr/utils/ic15_dict.txt"
+```
+
+推理结果如下所示:
+
+
+
+```
+Predicts of doc/imgs_words/en/word_1.png:('joint', 0.9999982714653015)
+```
+
+
+### 4.2 C++推理
+
+暂不支持
+
+
+### 4.3 Serving服务化部署
+
+暂不支持
+
+
+### 4.4 更多推理部署
+
+Rosetta模型还支持以下推理部署方式:
+
+- Paddle2ONNX推理:准备好推理模型后,参考[paddle2onnx](../../deploy/paddle2onnx/)教程操作。
+
+
+## 5. FAQ
+
+
+## 引用
+
+```bibtex
+@inproceedings{2018Rosetta,
+ title={Rosetta: Large Scale System for Text Detection and Recognition in Images},
+ author={ Borisyuk, Fedor and Gordo, Albert and Sivakumar, Viswanath },
+ booktitle={the 24th ACM SIGKDD International Conference},
+ year={2018},
+}
+```
diff --git a/doc/doc_ch/algorithm_rec_sar.md b/doc/doc_ch/algorithm_rec_sar.md
index aedc16714518b2de220118f755e00c3ba6bc7a5e..b8304313994754480a89d708e39149d67f828c0d 100644
--- a/doc/doc_ch/algorithm_rec_sar.md
+++ b/doc/doc_ch/algorithm_rec_sar.md
@@ -24,7 +24,7 @@
使用MJSynth和SynthText两个文字识别数据集训练,在IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE数据集上进行评估,算法复现效果如下:
|模型|骨干网络|配置文件|Acc|下载链接|
-| --- | --- | --- | --- | --- | --- | --- |
+| --- | --- | --- | --- | --- |
|SAR|ResNet31|[rec_r31_sar.yml](../../configs/rec/rec_r31_sar.yml)|87.20%|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/rec/rec_r31_sar_train.tar)|
注:除了使用MJSynth和SynthText两个文字识别数据集外,还加入了[SynthAdd](https://pan.baidu.com/share/init?surl=uV0LtoNmcxbO-0YA7Ch4dg)数据(提取码:627x),和部分真实数据,具体数据细节可以参考论文。
diff --git a/doc/doc_ch/algorithm_rec_seed.md b/doc/doc_ch/algorithm_rec_seed.md
new file mode 100644
index 0000000000000000000000000000000000000000..710e92272dc3169bf373d273534441a15c6be01c
--- /dev/null
+++ b/doc/doc_ch/algorithm_rec_seed.md
@@ -0,0 +1,113 @@
+# SEED
+
+- [1. 算法简介](#1)
+- [2. 环境配置](#2)
+- [3. 模型训练、评估、预测](#3)
+ - [3.1 训练](#3-1)
+ - [3.2 评估](#3-2)
+ - [3.3 预测](#3-3)
+- [4. 推理部署](#4)
+ - [4.1 Python推理](#4-1)
+ - [4.2 C++推理](#4-2)
+ - [4.3 Serving服务化部署](#4-3)
+ - [4.4 更多推理部署](#4-4)
+- [5. FAQ](#5)
+
+
+## 1. 算法简介
+
+论文信息:
+> [SEED: Semantics Enhanced Encoder-Decoder Framework for Scene Text Recognition](https://arxiv.org/pdf/2005.10977.pdf)
+
+> Qiao, Zhi and Zhou, Yu and Yang, Dongbao and Zhou, Yucan and Wang, Weiping
+
+> CVPR, 2020
+
+参考[DTRB](https://arxiv.org/abs/1904.01906) 文字识别训练和评估流程,使用MJSynth和SynthText两个文字识别数据集训练,在IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE数据集上进行评估,算法效果如下:
+
+|模型|骨干网络|Avg Accuracy|配置文件|下载链接|
+|---|---|---|---|---|
+|SEED|Aster_Resnet| 85.2% | [configs/rec/rec_resnet_stn_bilstm_att.yml](../../configs/rec/rec_resnet_stn_bilstm_att.yml) | [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/rec/rec_resnet_stn_bilstm_att.tar) |
+
+
+## 2. 环境配置
+请先参考[《运行环境准备》](./environment.md)配置PaddleOCR运行环境,参考[《项目克隆》](./clone.md)克隆项目代码。
+
+
+
+## 3. 模型训练、评估、预测
+
+请参考[文本识别训练教程](./recognition.md)。PaddleOCR对代码进行了模块化,训练不同的识别模型只需要**更换配置文件**即可。
+
+- 训练
+
+SEED模型需要额外加载FastText训练好的[语言模型](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz) ,并且安装 fasttext 依赖:
+
+```
+python3 -m pip install fasttext==0.9.1
+```
+
+然后,在完成数据准备后,便可以启动训练,训练命令如下:
+
+```
+#单卡训练(训练周期长,不建议)
+python3 tools/train.py -c configs/rec/rec_resnet_stn_bilstm_att.yml
+
+#多卡训练,通过--gpus参数指定卡号
+python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_resnet_stn_bilstm_att.yml
+
+```
+
+- 评估
+
+```
+# GPU 评估, Global.pretrained_model 为待测权重
+python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_resnet_stn_bilstm_att.yml -o Global.pretrained_model={path/to/weights}/best_accuracy
+```
+
+- 预测:
+
+```
+# 预测使用的配置文件必须与训练一致
+python3 tools/infer_rec.py -c configs/rec/rec_resnet_stn_bilstm_att.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.infer_img=doc/imgs_words/en/word_1.png
+```
+
+
+## 4. 推理部署
+
+
+### 4.1 Python推理
+
+coming soon
+
+
+
+### 4.2 C++推理
+
+coming soon
+
+
+### 4.3 Serving服务化部署
+
+coming soon
+
+
+### 4.4 更多推理部署
+
+coming soon
+
+
+## 5. FAQ
+
+
+## 引用
+
+```bibtex
+@inproceedings{qiao2020seed,
+ title={Seed: Semantics enhanced encoder-decoder framework for scene text recognition},
+ author={Qiao, Zhi and Zhou, Yu and Yang, Dongbao and Zhou, Yucan and Wang, Weiping},
+ booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
+ pages={13528--13537},
+ year={2020}
+}
+```
diff --git a/doc/doc_ch/algorithm_rec_srn.md b/doc/doc_ch/algorithm_rec_srn.md
index b124790761eed875434ecf509f7647fe23d1bc90..ca7961359eb902fafee959b26d02f324aece233a 100644
--- a/doc/doc_ch/algorithm_rec_srn.md
+++ b/doc/doc_ch/algorithm_rec_srn.md
@@ -24,7 +24,7 @@
使用MJSynth和SynthText两个文字识别数据集训练,在IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE数据集上进行评估,算法复现效果如下:
|模型|骨干网络|配置文件|Acc|下载链接|
-| --- | --- | --- | --- | --- | --- | --- |
+| --- | --- | --- | --- | --- |
|SRN|Resnet50_vd_fpn|[rec_r50_fpn_srn.yml](../../configs/rec/rec_r50_fpn_srn.yml)|86.31%|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r50_vd_srn_train.tar)|
diff --git a/doc/doc_ch/algorithm_rec_starnet.md b/doc/doc_ch/algorithm_rec_starnet.md
new file mode 100644
index 0000000000000000000000000000000000000000..c1ca761a0780b6bb8d15c12c820902c27d11d029
--- /dev/null
+++ b/doc/doc_ch/algorithm_rec_starnet.md
@@ -0,0 +1,139 @@
+# STAR-Net
+
+- [1. 算法简介](#1)
+- [2. 环境配置](#2)
+- [3. 模型训练、评估、预测](#3)
+ - [3.1 训练](#3-1)
+ - [3.2 评估](#3-2)
+ - [3.3 预测](#3-3)
+- [4. 推理部署](#4)
+ - [4.1 Python推理](#4-1)
+ - [4.2 C++推理](#4-2)
+ - [4.3 Serving服务化部署](#4-3)
+ - [4.4 更多推理部署](#4-4)
+- [5. FAQ](#5)
+
+
+## 1. 算法简介
+
+论文信息:
+> [STAR-Net: a spatial attention residue network for scene text recognition.](http://www.bmva.org/bmvc/2016/papers/paper043/paper043.pdf)
+
+> Wei Liu, Chaofeng Chen, Kwan-Yee K. Wong, Zhizhong Su and Junyu Han.
+
+> BMVC, pages 43.1-43.13, 2016
+
+参考[DTRB](https://arxiv.org/abs/1904.01906) 文字识别训练和评估流程,使用MJSynth和SynthText两个文字识别数据集训练,在IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE数据集上进行评估,算法效果如下:
+
+|模型|骨干网络|Avg Accuracy|配置文件|下载链接|
+|---|---|---|---|---|
+|StarNet|Resnet34_vd|84.44%|[configs/rec/rec_r34_vd_tps_bilstm_ctc.yml](../../configs/rec/rec_r34_vd_tps_bilstm_ctc.yml)|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r34_vd_tps_bilstm_ctc_v2.0_train.tar)|
+|StarNet|MobileNetV3|81.42%|[configs/rec/rec_mv3_tps_bilstm_ctc.yml](../../configs/rec/rec_mv3_tps_bilstm_ctc.yml)|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_mv3_tps_bilstm_ctc_v2.0_train.tar)|
+
+
+
+## 2. 环境配置
+请先参考[《运行环境准备》](./environment.md)配置PaddleOCR运行环境,参考[《项目克隆》](./clone.md)克隆项目代码。
+
+
+
+## 3. 模型训练、评估、预测
+
+请参考[文本识别训练教程](./recognition.md)。PaddleOCR对代码进行了模块化,训练不同的识别模型只需要**更换配置文件**即可。
+
+- 训练
+
+在完成数据准备后,便可以启动训练,训练命令如下:
+
+```
+#单卡训练(训练周期长,不建议)
+python3 tools/train.py -c configs/rec/rec_r34_vd_tps_bilstm_ctc.yml
+
+#多卡训练,通过--gpus参数指定卡号
+python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_r34_vd_tps_bilstm_ctc.yml
+
+```
+
+- 评估
+
+```
+# GPU 评估, Global.pretrained_model 为待测权重
+python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_r34_vd_tps_bilstm_ctc.yml -o Global.pretrained_model={path/to/weights}/best_accuracy
+```
+
+- 预测:
+
+```
+# 预测使用的配置文件必须与训练一致
+python3 tools/infer_rec.py -c configs/rec/rec_r34_vd_tps_bilstm_ctc.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.infer_img=doc/imgs_words/en/word_1.png
+```
+
+
+## 4. 推理部署
+
+
+### 4.1 Python推理
+
+首先将 STAR-Net 文本识别训练过程中保存的模型,转换成inference model。以基于Resnet34_vd骨干网络,使用MJSynth和SynthText两个英文文本识别合成数据集训练的[模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r34_vd_tps_bilstm_ctc_v2.0_train.tar) 为例,可以使用如下命令进行转换:
+```shell
+python3 tools/export_model.py -c configs/rec/rec_r34_vd_tps_bilstm_ctc.yml -o Global.pretrained_model=./rec_r34_vd_tps_bilstm_ctc_v2.0_train/best_accuracy Global.save_inference_dir=./inference/rec_starnet
+```
+STAR-Net 文本识别模型推理,可以执行如下命令:
+
+```shell
+python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words_en/word_336.png" --rec_model_dir="./inference/rec_starnet/" --rec_image_shape="3, 32, 100" --rec_char_dict_path="./ppocr/utils/ic15_dict.txt"
+```
+
+
+
+执行命令后,上面图像的识别结果如下:
+
+```bash
+Predicts of ./doc/imgs_words_en/word_336.png:('super', 0.9999073)
+```
+
+**注意**:由于上述模型是参考[DTRB](https://arxiv.org/abs/1904.01906)文本识别训练和评估流程,与超轻量级中文识别模型训练有两方面不同:
+
+- 训练时采用的图像分辨率不同,训练上述模型采用的图像分辨率是[3,32,100],而中文模型训练时,为了保证长文本的识别效果,训练时采用的图像分辨率是[3, 32, 320]。预测推理程序默认的形状参数是训练中文采用的图像分辨率,即[3, 32, 320]。因此,这里推理上述英文模型时,需要通过参数rec_image_shape设置识别图像的形状。
+
+- 字符列表,DTRB论文中实验只针对26个小写英文字母和10个数字,总共36个字符。所有大小写字符都转成了小写字符,不在上述列表中的字符都被忽略,认为是空格。因此这里没有输入字符字典,而是通过如下代码生成字典,在推理时需要设置参数rec_char_dict_path,指定为英文字典"./ppocr/utils/ic15_dict.txt"。
+
+```
+self.character_str = "0123456789abcdefghijklmnopqrstuvwxyz"
+dict_character = list(self.character_str)
+```
+
+
+
+### 4.2 C++推理
+
+准备好推理模型后,参考[cpp infer](../../deploy/cpp_infer/)教程进行操作即可。
+
+
+### 4.3 Serving服务化部署
+
+准备好推理模型后,参考[pdserving](../../deploy/pdserving/)教程进行Serving服务化部署,包括Python Serving和C++ Serving两种模式。
+
+
+### 4.4 更多推理部署
+
+STAR-Net模型还支持以下推理部署方式:
+
+- Paddle2ONNX推理:准备好推理模型后,参考[paddle2onnx](../../deploy/paddle2onnx/)教程操作。
+
+
+## 5. FAQ
+
+
+## 引用
+
+```bibtex
+@inproceedings{liu2016star,
+ title={STAR-Net: a spatial attention residue network for scene text recognition.},
+ author={Liu, Wei and Chen, Chaofeng and Wong, Kwan-Yee K and Su, Zhizhong and Han, Junyu},
+ booktitle={BMVC},
+ volume={2},
+ pages={7},
+ year={2016}
+}
+```
diff --git a/doc/doc_ch/algorithm_rec_svtr.md b/doc/doc_ch/algorithm_rec_svtr.md
new file mode 100644
index 0000000000000000000000000000000000000000..41a22ca65c00d0668298cdcf1486c1c3afdef919
--- /dev/null
+++ b/doc/doc_ch/algorithm_rec_svtr.md
@@ -0,0 +1,176 @@
+# 场景文本识别算法-SVTR
+
+- [1. 算法简介](#1)
+- [2. 环境配置](#2)
+- [3. 模型训练、评估、预测](#3)
+ - [3.1 训练](#3-1)
+ - [3.2 评估](#3-2)
+ - [3.3 预测](#3-3)
+- [4. 推理部署](#4)
+ - [4.1 Python推理](#4-1)
+ - [4.2 C++推理](#4-2)
+ - [4.3 Serving服务化部署](#4-3)
+ - [4.4 更多推理部署](#4-4)
+- [5. FAQ](#5)
+
+
+## 1. 算法简介
+
+论文信息:
+> [SVTR: Scene Text Recognition with a Single Visual Model](https://arxiv.org/abs/2205.00159)
+> Yongkun Du and Zhineng Chen and Caiyan Jia Xiaoting Yin and Tianlun Zheng and Chenxia Li and Yuning Du and Yu-Gang Jiang
+> IJCAI, 2022
+
+场景文本识别旨在将自然图像中的文本转录为数字字符序列,从而传达对场景理解至关重要的高级语义。这项任务由于文本变形、字体、遮挡、杂乱背景等方面的变化具有一定的挑战性。先前的方法为提高识别精度做出了许多工作。然而文本识别器除了准确度外,还因为实际需求需要考虑推理速度等因素。
+
+### SVTR算法简介
+
+主流的场景文本识别模型通常包含两个模块:用于特征提取的视觉模型和用于文本转录的序列模型。这种架构虽然准确,但复杂且效率较低,限制了在实际场景中的应用。SVTR提出了一种用于场景文本识别的单视觉模型,该模型在patch-wise image tokenization框架内,完全摒弃了序列建模,在精度具有竞争力的前提下,模型参数量更少,速度更快,主要有以下几点贡献:
+1. 首次发现单视觉模型可以达到与视觉语言模型相媲美甚至更高的准确率,并且其具有效率高和适应多语言的优点,在实际应用中很有前景。
+2. SVTR从字符组件的角度出发,逐渐的合并字符组件,自下而上地完成字符的识别。
+3. SVTR引入了局部和全局Mixing,分别用于提取字符组件特征和字符间依赖关系,与多尺度的特征一起,形成多粒度特征描述。
+
+
+
+SVTR在场景文本识别公开数据集上的精度(%)和模型文件如下:
+
+* 中文数据集来自于[Chinese Benckmark](https://arxiv.org/abs/2112.15093) ,SVTR的中文训练评估策略遵循该论文。
+
+| 模型 |IC13<br/>857 | SVT |IIIT5k<br/>3000 |IC15<br/>1811| SVTP |CUTE80 | Avg_6 |IC15<br/>2077 |IC13<br/>1015 |IC03<br/>867|IC03<br/>860|Avg_10 | Chinese<br/>scene_test| 下载链接 |
+|:----------:|:------:|:-----:|:---------:|:------:|:-----:|:-----:|:-----:|:-------:|:-------:|:-----:|:-----:|:---------------------------------------------:|:-----:|:-----:|
+| SVTR Tiny | 96.85 | 91.34 | 94.53 | 83.99 | 85.43 | 89.24 | 90.87 | 80.55 | 95.37 | 95.27 | 95.70 | 90.13 | 67.90 | [英文](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/rec_svtr_tiny_none_ctc_en_train.tar) / [中文](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/rec_svtr_tiny_none_ctc_ch_train.tar) |
+| SVTR Small | 95.92 | 93.04 | 95.03 | 84.70 | 87.91 | 92.01 | 91.63 | 82.72 | 94.88 | 96.08 | 96.28 | 91.02 | 69.00 | [英文](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/rec_svtr_small_none_ctc_en_train.tar) / [中文](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/rec_svtr_small_none_ctc_ch_train.tar) |
+| SVTR Base | 97.08 | 91.50 | 96.03 | 85.20 | 89.92 | 91.67 | 92.33 | 83.73 | 95.66 | 95.62 | 95.81 | 91.61 | 71.40 | [英文](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/rec_svtr_base_none_ctc_en_train.tar) / - |
+| SVTR Large | 97.20 | 91.65 | 96.30 | 86.58 | 88.37 | 95.14 | 92.82 | 84.54 | 96.35 | 96.54 | 96.74 | 92.24 | 72.10 | [英文](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/rec_svtr_large_none_ctc_en_train.tar) / [中文](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/rec_svtr_large_none_ctc_ch_train.tar) |
+
+
+
+
+## 2. 环境配置
+请先参考[《运行环境准备》](./environment.md)配置PaddleOCR运行环境,参考[《项目克隆》](./clone.md)克隆项目代码。
+
+
+
+## 3. 模型训练、评估、预测
+
+
+### 3.1 模型训练
+
+#### 数据集准备
+
+[英文数据集下载](https://github.com/clovaai/deep-text-recognition-benchmark#download-lmdb-dataset-for-traininig-and-evaluation-from-here)
+[中文数据集下载](https://github.com/fudanvi/benchmarking-chinese-text-recognition#download)
+
+#### 启动训练
+
+请参考[文本识别训练教程](./recognition.md)。PaddleOCR对代码进行了模块化,训练`SVTR`识别模型时需要**更换配置文件**为`SVTR`的[配置文件](../../configs/rec/rec_svtrnet.yml)。
+
+具体地,在完成数据准备后,便可以启动训练,训练命令如下:
+```shell
+#单卡训练(训练周期长,不建议)
+python3 tools/train.py -c configs/rec/rec_svtrnet.yml
+
+#多卡训练,通过--gpus参数指定卡号
+python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_svtrnet.yml
+```
+
+
+### 3.2 评估
+
+可下载`SVTR`提供的模型文件和配置文件:[下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/rec_svtr_tiny_none_ctc_en_train.tar) ,以`SVTR-T`为例,使用如下命令进行评估:
+
+```shell
+# 下载包含SVTR-T的模型文件和配置文件的tar压缩包并解压
+wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/rec_svtr_tiny_none_ctc_en_train.tar && tar xf rec_svtr_tiny_none_ctc_en_train.tar
+# 注意将pretrained_model的路径设置为本地路径。
+python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c ./rec_svtr_tiny_none_ctc_en_train/rec_svtr_tiny_6local_6global_stn_en.yml -o Global.pretrained_model=./rec_svtr_tiny_none_ctc_en_train/best_accuracy
+```
+
+
+### 3.3 预测
+
+使用如下命令进行单张图片预测:
+```shell
+# 注意将pretrained_model的路径设置为本地路径。
+python3 tools/infer_rec.py -c ./rec_svtr_tiny_none_ctc_en_train/rec_svtr_tiny_6local_6global_stn_en.yml -o Global.infer_img='./doc/imgs_words_en/word_10.png' Global.pretrained_model=./rec_svtr_tiny_none_ctc_en_train/best_accuracy
+# 预测文件夹下所有图像时,可修改infer_img为文件夹,如 Global.infer_img='./doc/imgs_words_en/'。
+```
+
+
+
+## 4. 推理部署
+
+
+### 4.1 Python推理
+首先将训练得到best模型,转换成inference model。下面以基于`SVTR-T`,在英文数据集训练的模型为例([模型和配置文件下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/rec_svtr_tiny_none_ctc_en_train.tar) ),可以使用如下命令进行转换:
+
+```shell
+# 注意将pretrained_model的路径设置为本地路径。
+python3 tools/export_model.py -c ./rec_svtr_tiny_none_ctc_en_train/rec_svtr_tiny_6local_6global_stn_en.yml -o Global.pretrained_model=./rec_svtr_tiny_none_ctc_en_train/best_accuracy Global.save_inference_dir=./inference/rec_svtr_tiny_stn_en
+```
+
+**注意:**
+- 如果您是在自己的数据集上训练的模型,并且调整了字典文件,请注意修改配置文件中的`character_dict_path`是否为正确的字典文件。
+- 如果您修改了训练时的输入大小,请修改`tools/export_model.py`文件中的对应SVTR的`infer_shape`。
+
+转换成功后,在目录下有三个文件:
+```
+/inference/rec_svtr_tiny_stn_en/
+ ├── inference.pdiparams # 识别inference模型的参数文件
+ ├── inference.pdiparams.info # 识别inference模型的参数信息,可忽略
+ └── inference.pdmodel # 识别inference模型的program文件
+```
+
+
+执行如下命令进行模型推理:
+
+```shell
+python3 tools/infer/predict_rec.py --image_dir='./doc/imgs_words_en/word_10.png' --rec_model_dir='./inference/rec_svtr_tiny_stn_en/' --rec_algorithm='SVTR' --rec_image_shape='3,64,256' --rec_char_dict_path='./ppocr/utils/ic15_dict.txt'
+# 预测文件夹下所有图像时,可修改image_dir为文件夹,如 --image_dir='./doc/imgs_words_en/'。
+```
+
+
+执行命令后,上面图像的预测结果(识别的文本和得分)会打印到屏幕上,示例如下:
+```shell
+Predicts of ./doc/imgs_words_en/word_10.png:('pain', 0.9999998807907104)
+```
+
+**注意**:
+
+- 如果您调整了训练时的输入分辨率,需要通过参数`rec_image_shape`设置为您需要的识别图像形状。
+- 在推理时需要设置参数`rec_char_dict_path`指定字典,如果您修改了字典,请修改该参数为您的字典文件。
+- 如果您修改了预处理方法,需修改`tools/infer/predict_rec.py`中SVTR的预处理为您的预处理方法。
+
+
+### 4.2 C++推理部署
+
+由于SVTR的C++预处理与后处理尚未实现,暂不支持C++推理。
+
+
+### 4.3 Serving服务化部署
+
+暂不支持
+
+
+### 4.4 更多推理部署
+
+暂不支持
+
+
+## 5. FAQ
+
+1. 由于`SVTR`使用的算子大多为矩阵相乘,在GPU环境下,速度具有优势,但在CPU开启mkldnn加速环境下,`SVTR`相比于被优化的卷积网络没有优势。
+
+
+## 引用
+
+```bibtex
+@article{Du2022SVTR,
+ title = {SVTR: Scene Text Recognition with a Single Visual Model},
+ author = {Du, Yongkun and Chen, Zhineng and Jia, Caiyan and Yin, Xiaoting and Zheng, Tianlun and Li, Chenxia and Du, Yuning and Jiang, Yu-Gang},
+ booktitle = {IJCAI},
+ year = {2022},
+ url = {https://arxiv.org/abs/2205.00159}
+}
+```
diff --git a/doc/doc_ch/benchmark.md b/doc/doc_ch/benchmark.md
index 39b9724abe04494c3e61f54597c64400a132e30b..bd11838a5ac2e09c1102f50ef0b1f47b9ddef6ef 100644
--- a/doc/doc_ch/benchmark.md
+++ b/doc/doc_ch/benchmark.md
@@ -13,7 +13,7 @@
说明:
-- 检测输入图像的的长边尺寸是960。
+- 检测输入图像的长边尺寸是960。
- 评估耗时阶段为图像预测耗时,不包括图像的预处理和后处理。
- `Intel至强6148`为服务器端CPU型号,测试中使用Intel MKL-DNN 加速。
- `骁龙855`为移动端处理平台型号。
diff --git a/doc/doc_ch/config.md b/doc/doc_ch/config.md
index 3c62acea7734ab629c71d515127aff4fcfb62d34..41ba8c1f7bd7b9032c531720d2db3817d9a47233 100644
--- a/doc/doc_ch/config.md
+++ b/doc/doc_ch/config.md
@@ -94,7 +94,7 @@
| name | 网络loss类名 | CTCLoss | 目前支持`CTCLoss`,`DBLoss`,`ClsLoss` |
| balance_loss | DBLossloss中是否对正负样本数量进行均衡(使用OHEM) | True | \ |
| ohem_ratio | DBLossloss中的OHEM的负正样本比例 | 3 | \ |
-| main_loss_type | DBLossloss中shrink_map所采用的的loss | DiceLoss | 支持`DiceLoss`,`BCELoss` |
+| main_loss_type | DBLossloss中shrink_map所采用的loss | DiceLoss | 支持`DiceLoss`,`BCELoss` |
| alpha | DBLossloss中shrink_map_loss的系数 | 5 | \ |
| beta | DBLossloss中threshold_map_loss的系数 | 10 | \ |
diff --git a/doc/doc_ch/dataset/docvqa_datasets.md b/doc/doc_ch/dataset/docvqa_datasets.md
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..3ec1865ee42be99ec19343428cd9ad6439686f15 100644
--- a/doc/doc_ch/dataset/docvqa_datasets.md
+++ b/doc/doc_ch/dataset/docvqa_datasets.md
@@ -0,0 +1,27 @@
+## DocVQA数据集
+这里整理了常见的DocVQA数据集,持续更新中,欢迎各位小伙伴贡献数据集~
+- [FUNSD数据集](#funsd)
+- [XFUND数据集](#xfund)
+
+
+#### 1、FUNSD数据集
+- **数据来源**:https://guillaumejaume.github.io/FUNSD/
+- **数据简介**:FUNSD数据集是一个用于表单理解的数据集,它包含199张真实的、完全标注的扫描版图片,类型包括市场报告、广告以及学术报告等,并分为149张训练集以及50张测试集。FUNSD数据集适用于多种类型的DocVQA任务,如字段级实体分类、字段级实体连接等。部分图像以及标注框可视化如下所示:
+
+
+
+
+
+
+ 图中,橙色区域代表`header`,淡蓝色区域代表`question`,绿色区域代表`answer`,粉红色区域代表`other`。
+
+- **下载地址**:https://guillaumejaume.github.io/FUNSD/download/
+
+
+#### 2、XFUND数据集
+- **数据来源**:https://github.com/doc-analysis/XFUND
+- **数据简介**:XFUND是一个多语种表单理解数据集,它包含7种不同语种的表单数据,并且全部用人工进行了键-值对形式的标注。其中每个语种的数据都包含了199张表单数据,并分为149张训练集以及50张测试集。部分图像以及标注框可视化如下所示:
+
+
+
+
+
+
+
+- **下载地址**:https://github.com/doc-analysis/XFUND/releases/tag/v1.0
diff --git a/doc/doc_ch/inference_ppocr.md b/doc/doc_ch/inference_ppocr.md
index 5fb3811eb40addd506dfa37d257c00a0c2a44258..946307556ba987fd7a78b3b21df8a54e22f0abb2 100644
--- a/doc/doc_ch/inference_ppocr.md
+++ b/doc/doc_ch/inference_ppocr.md
@@ -19,9 +19,9 @@
```
# 下载超轻量中文检测模型:
-wget https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar
-tar xf ch_PP-OCRv2_det_infer.tar
-python3 tools/infer/predict_det.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./ch_PP-OCRv2_det_infer/"
+wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar
+tar xf ch_PP-OCRv3_det_infer.tar
+python3 tools/infer/predict_det.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./ch_PP-OCRv3_det_infer/"
```
@@ -40,13 +40,13 @@ python3 tools/infer/predict_det.py --image_dir="./doc/imgs/00018069.jpg" --det_m
如果输入图片的分辨率比较大,而且想使用更大的分辨率预测,可以设置det_limit_side_len 为想要的值,比如1216:
```
-python3 tools/infer/predict_det.py --image_dir="./doc/imgs/1.jpg" --det_model_dir="./inference/ch_PP-OCRv2_det_infer/" --det_limit_type=max --det_limit_side_len=1216
+python3 tools/infer/predict_det.py --image_dir="./doc/imgs/1.jpg" --det_model_dir="./ch_PP-OCRv3_det_infer/" --det_limit_type=max --det_limit_side_len=1216
```
如果想使用CPU进行预测,执行命令如下
```
-python3 tools/infer/predict_det.py --image_dir="./doc/imgs/1.jpg" --det_model_dir="./inference/ch_PP-OCRv2_det_infer/" --use_gpu=False
+python3 tools/infer/predict_det.py --image_dir="./doc/imgs/1.jpg" --det_model_dir="./ch_PP-OCRv3_det_infer/" --use_gpu=False
```
@@ -59,13 +59,15 @@ python3 tools/infer/predict_det.py --image_dir="./doc/imgs/1.jpg" --det_model_di
### 2.1 超轻量中文识别模型推理
+**注意** `PP-OCRv3`的识别模型使用的输入shape为`3,48,320`, 需要添加参数`--rec_image_shape=3,48,320`,如果不使用`PP-OCRv3`的识别模型,则无需设置该参数。
+
超轻量中文识别模型推理,可以执行如下命令:
```
# 下载超轻量中文识别模型:
-wget https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer.tar
-tar xf ch_PP-OCRv2_rec_infer.tar
-python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words/ch/word_4.jpg" --rec_model_dir="./ch_PP-OCRv2_rec_infer/"
+wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar
+tar xf ch_PP-OCRv3_rec_infer.tar
+python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words/ch/word_4.jpg" --rec_model_dir="./ch_PP-OCRv3_rec_infer/" --rec_image_shape=3,48,320
```

@@ -73,7 +75,7 @@ python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words/ch/word_4.jpg"
执行命令后,上面图像的预测结果(识别的文本和得分)会打印到屏幕上,示例如下:
```bash
-Predicts of ./doc/imgs_words/ch/word_4.jpg:('实力活力', 0.98458153)
+Predicts of ./doc/imgs_words/ch/word_4.jpg:('实力活力', 0.9956803321838379)
```
@@ -119,17 +121,19 @@ Predicts of ./doc/imgs_words/ch/word_4.jpg:['0', 0.9999982]
## 4. 文本检测、方向分类和文字识别串联推理
+**注意** `PP-OCRv3`的识别模型使用的输入shape为`3,48,320`, 需要添加参数`--rec_image_shape=3,48,320`,如果不使用`PP-OCRv3`的识别模型,则无需设置该参数。
+
以超轻量中文OCR模型推理为例,在执行预测时,需要通过参数`image_dir`指定单张图像或者图像集合的路径、参数`det_model_dir`,`cls_model_dir`和`rec_model_dir`分别指定检测,方向分类和识别的inference模型路径。参数`use_angle_cls`用于控制是否启用方向分类模型。`use_mp`表示是否使用多进程。`total_process_num`表示在使用多进程时的进程数。可视化识别结果默认保存到 ./inference_results 文件夹里面。
```shell
# 使用方向分类器
-python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./inference/ch_PP-OCRv2_det_infer/" --cls_model_dir="./inference/cls/" --rec_model_dir="./inference/ch_PP-OCRv2_rec_infer/" --use_angle_cls=true
+python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./ch_PP-OCRv3_det_infer/" --cls_model_dir="./cls/" --rec_model_dir="./ch_PP-OCRv3_rec_infer/" --use_angle_cls=true --rec_image_shape=3,48,320
# 不使用方向分类器
-python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./inference/ch_PP-OCRv2_det_infer/" --rec_model_dir="./inference/ch_PP-OCRv2_rec_infer/" --use_angle_cls=false
+python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./ch_PP-OCRv3_det_infer/" --rec_model_dir="./ch_PP-OCRv3_rec_infer/" --use_angle_cls=false --rec_image_shape=3,48,320
# 使用多进程
-python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./inference/ch_PP-OCRv2_det_infer/" --rec_model_dir="./inference/ch_PP-OCRv2_rec_infer/" --use_angle_cls=false --use_mp=True --total_process_num=6
+python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./ch_PP-OCRv3_det_infer/" --rec_model_dir="./ch_PP-OCRv3_rec_infer/" --use_angle_cls=false --use_mp=True --total_process_num=6 --rec_image_shape=3,48,320
```
执行命令后,识别结果图像如下:
-
+
diff --git a/doc/doc_ch/knowledge_distillation.md b/doc/doc_ch/knowledge_distillation.md
index c8ac40486871d4b63c799c22704725781889bce4..76ab5e62e0136bb8707a51537bf6693fc5687047 100644
--- a/doc/doc_ch/knowledge_distillation.md
+++ b/doc/doc_ch/knowledge_distillation.md
@@ -1,23 +1,23 @@
# 知识蒸馏
-+ [知识蒸馏](#0)
- + [1. 简介](#1)
- - [1.1 知识蒸馏介绍](#11)
- - [1.2 PaddleOCR知识蒸馏简介](#12)
- + [2. 配置文件解析](#2)
- + [2.1 识别配置文件解析](#21)
- - [2.1.1 模型结构](#211)
- - [2.1.2 损失函数](#212)
- - [2.1.3 后处理](#213)
- - [2.1.4 指标计算](#214)
- - [2.1.5 蒸馏模型微调](#215)
- + [2.2 检测配置文件解析](#22)
- - [2.2.1 模型结构](#221)
- - [2.2.2 损失函数](#222)
- - [2.2.3 后处理](#223)
- - [2.2.4 蒸馏指标计算](#224)
- - [2.2.5 检测蒸馏模型Fine-tune](#225)
+- [知识蒸馏](#知识蒸馏)
+ - [1. 简介](#1-简介)
+ - [1.1 知识蒸馏介绍](#11-知识蒸馏介绍)
+ - [1.2 PaddleOCR知识蒸馏简介](#12-paddleocr知识蒸馏简介)
+ - [2. 配置文件解析](#2-配置文件解析)
+ - [2.1 识别配置文件解析](#21-识别配置文件解析)
+ - [2.1.1 模型结构](#211-模型结构)
+ - [2.1.2 损失函数](#212-损失函数)
+ - [2.1.3 后处理](#213-后处理)
+ - [2.1.4 指标计算](#214-指标计算)
+ - [2.1.5 蒸馏模型微调](#215-蒸馏模型微调)
+ - [2.2 检测配置文件解析](#22-检测配置文件解析)
+ - [2.2.1 模型结构](#221-模型结构)
+ - [2.2.2 损失函数](#222-损失函数)
+ - [2.2.3 后处理](#223-后处理)
+ - [2.2.4 蒸馏指标计算](#224-蒸馏指标计算)
+ - [2.2.5 检测蒸馏模型finetune](#225-检测蒸馏模型finetune)
## 1. 简介
@@ -60,7 +60,7 @@ PaddleOCR中集成了知识蒸馏的算法,具体地,有以下几个主要
### 2.1 识别配置文件解析
-配置文件在[ch_PP-OCRv2_rec_distillation.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec_distillation.yml)。
+配置文件在[ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml)。
#### 2.1.1 模型结构
@@ -69,7 +69,7 @@ PaddleOCR中集成了知识蒸馏的算法,具体地,有以下几个主要
```yaml
Architecture:
- model_type: &model_type "rec" # 模型类别,rec、det等,每个子网络的的模型类别都与
+ model_type: &model_type "rec" # 模型类别,rec、det等,每个子网络的模型类别都与
name: DistillationModel # 结构名称,蒸馏任务中,为DistillationModel,用于构建对应的结构
algorithm: Distillation # 算法名称
Models: # 模型,包含子网络的配置信息
@@ -78,37 +78,55 @@ Architecture:
freeze_params: false # 是否需要固定参数
return_all_feats: true # 子网络的参数,表示是否需要返回所有的features,如果为False,则只返回最后的输出
model_type: *model_type # 模型类别
- algorithm: CRNN # 子网络的算法名称,该子网络剩余参与均为构造参数,与普通的模型训练配置一致
+ algorithm: SVTR # 子网络的算法名称,该子网络其余参数均为构造参数,与普通的模型训练配置一致
Transform:
Backbone:
name: MobileNetV1Enhance
scale: 0.5
- Neck:
- name: SequenceEncoder
- encoder_type: rnn
- hidden_size: 64
+ last_conv_stride: [1, 2]
+ last_pool_type: avg
Head:
- name: CTCHead
- mid_channels: 96
- fc_decay: 0.00002
- Student: # 另外一个子网络,这里给的是DML的蒸馏示例,两个子网络结构相同,均需要学习参数
- pretrained: # 下面的组网参数同上
+ name: MultiHead
+ head_list:
+ - CTCHead:
+ Neck:
+ name: svtr
+ dims: 64
+ depth: 2
+ hidden_dims: 120
+ use_guide: True
+ Head:
+ fc_decay: 0.00001
+ - SARHead:
+ enc_dim: 512
+ max_text_length: *max_text_length
+ Student:
+ pretrained:
freeze_params: false
return_all_feats: true
model_type: *model_type
- algorithm: CRNN
+ algorithm: SVTR
Transform:
Backbone:
name: MobileNetV1Enhance
scale: 0.5
- Neck:
- name: SequenceEncoder
- encoder_type: rnn
- hidden_size: 64
+ last_conv_stride: [1, 2]
+ last_pool_type: avg
Head:
- name: CTCHead
- mid_channels: 96
- fc_decay: 0.00002
+ name: MultiHead
+ head_list:
+ - CTCHead:
+ Neck:
+ name: svtr
+ dims: 64
+ depth: 2
+ hidden_dims: 120
+ use_guide: True
+ Head:
+ fc_decay: 0.00001
+ - SARHead:
+ enc_dim: 512
+ max_text_length: *max_text_length
```
当然,这里如果希望添加更多的子网络进行训练,也可以按照`Student`与`Teacher`的添加方式,在配置文件中添加相应的字段。比如说如果希望有3个模型互相监督,共同训练,那么`Architecture`可以写为如下格式。
@@ -124,55 +142,82 @@ Architecture:
freeze_params: false
return_all_feats: true
model_type: *model_type
- algorithm: CRNN
+ algorithm: SVTR
Transform:
Backbone:
name: MobileNetV1Enhance
scale: 0.5
- Neck:
- name: SequenceEncoder
- encoder_type: rnn
- hidden_size: 64
+ last_conv_stride: [1, 2]
+ last_pool_type: avg
Head:
- name: CTCHead
- mid_channels: 96
- fc_decay: 0.00002
+ name: MultiHead
+ head_list:
+ - CTCHead:
+ Neck:
+ name: svtr
+ dims: 64
+ depth: 2
+ hidden_dims: 120
+ use_guide: True
+ Head:
+ fc_decay: 0.00001
+ - SARHead:
+ enc_dim: 512
+ max_text_length: *max_text_length
Student:
pretrained:
freeze_params: false
return_all_feats: true
model_type: *model_type
- algorithm: CRNN
+ algorithm: SVTR
Transform:
Backbone:
name: MobileNetV1Enhance
scale: 0.5
- Neck:
- name: SequenceEncoder
- encoder_type: rnn
- hidden_size: 64
+ last_conv_stride: [1, 2]
+ last_pool_type: avg
Head:
- name: CTCHead
- mid_channels: 96
- fc_decay: 0.00002
- Student2: # 知识蒸馏任务中引入的新的子网络,其他部分与上述配置相同
+ name: MultiHead
+ head_list:
+ - CTCHead:
+ Neck:
+ name: svtr
+ dims: 64
+ depth: 2
+ hidden_dims: 120
+ use_guide: True
+ Head:
+ fc_decay: 0.00001
+ - SARHead:
+ enc_dim: 512
+ max_text_length: *max_text_length
+ Student2:
pretrained:
freeze_params: false
return_all_feats: true
model_type: *model_type
- algorithm: CRNN
+ algorithm: SVTR
Transform:
Backbone:
name: MobileNetV1Enhance
scale: 0.5
- Neck:
- name: SequenceEncoder
- encoder_type: rnn
- hidden_size: 64
+ last_conv_stride: [1, 2]
+ last_pool_type: avg
Head:
- name: CTCHead
- mid_channels: 96
- fc_decay: 0.00002
+ name: MultiHead
+ head_list:
+ - CTCHead:
+ Neck:
+ name: svtr
+ dims: 64
+ depth: 2
+ hidden_dims: 120
+ use_guide: True
+ Head:
+ fc_decay: 0.00001
+ - SARHead:
+ enc_dim: 512
+ max_text_length: *max_text_length
```
最终该模型训练时,包含3个子网络:`Teacher`, `Student`, `Student2`。
@@ -205,34 +250,56 @@ Architecture:
```yaml
Loss:
- name: CombinedLoss # 损失函数名称,基于改名称,构建用于损失函数的类
- loss_config_list: # 损失函数配置文件列表,为CombinedLoss的必备函数
- - DistillationCTCLoss: # 基于蒸馏的CTC损失函数,继承自标准的CTC loss
- weight: 1.0 # 损失函数的权重,loss_config_list中,每个损失函数的配置都必须包含该字段
- model_name_list: ["Student", "Teacher"] # 对于蒸馏模型的预测结果,提取这两个子网络的输出,与gt计算CTC loss
- key: head_out # 取子网络输出dict中,该key对应的tensor
+ name: CombinedLoss
+ loss_config_list:
- DistillationDMLLoss: # 蒸馏的DML损失函数,继承自标准的DMLLoss
weight: 1.0 # 权重
act: "softmax" # 激活函数,对输入使用激活函数处理,可以为softmax, sigmoid或者为None,默认为None
+        use_log: true # 对输入先取log再计算loss
model_name_pairs: # 用于计算DML loss的子网络名称对,如果希望计算其他子网络的DML loss,可以在列表下面继续填充
- ["Student", "Teacher"]
key: head_out # 取子网络输出dict中,该key对应的tensor
+ multi_head: True # 是否为多头结构
+ dis_head: ctc # 指定用于计算损失函数的head
+ name: dml_ctc # 蒸馏loss的前缀名称,避免不同loss之间的命名冲突
+ - DistillationDMLLoss: # 蒸馏的DML损失函数,继承自标准的DMLLoss
+ weight: 0.5 # 权重
+ act: "softmax" # 激活函数,对输入使用激活函数处理,可以为softmax, sigmoid或者为None,默认为None
+        use_log: true # 对输入先取log再计算loss
+ model_name_pairs: # 用于计算DML loss的子网络名称对,如果希望计算其他子网络的DML loss,可以在列表下面继续填充
+ - ["Student", "Teacher"]
+ key: head_out # 取子网络输出dict中,该key对应的tensor
+ multi_head: True # 是否为多头结构
+ dis_head: sar # 指定用于计算损失函数的head
+ name: dml_sar # 蒸馏loss的前缀名称,避免不同loss之间的命名冲突
- DistillationDistanceLoss: # 蒸馏的距离损失函数
weight: 1.0 # 权重
mode: "l2" # 距离计算方法,目前支持l1, l2, smooth_l1
model_name_pairs: # 用于计算distance loss的子网络名称对
- ["Student", "Teacher"]
key: backbone_out # 取子网络输出dict中,该key对应的tensor
+ - DistillationCTCLoss: # 基于蒸馏的CTC损失函数,继承自标准的CTC loss
+ weight: 1.0 # 损失函数的权重,loss_config_list中,每个损失函数的配置都必须包含该字段
+ model_name_list: ["Student", "Teacher"] # 对于蒸馏模型的预测结果,提取这两个子网络的输出,与gt计算CTC loss
+ key: head_out # 取子网络输出dict中,该key对应的tensor
+ - DistillationSARLoss: # 基于蒸馏的SAR损失函数,继承自标准的SARLoss
+ weight: 1.0 # 损失函数的权重,loss_config_list中,每个损失函数的配置都必须包含该字段
+ model_name_list: ["Student", "Teacher"] # 对于蒸馏模型的预测结果,提取这两个子网络的输出,与gt计算CTC loss
+ key: head_out # 取子网络输出dict中,该key对应的tensor
+ multi_head: True # 是否为多头结构,为true时,取出其中的SAR分支计算损失函数
```
上述损失函数中,所有的蒸馏损失函数均继承自标准的损失函数类,主要功能为: 对蒸馏模型的输出进行解析,找到用于计算损失的中间节点(tensor),再使用标准的损失函数类去计算。
-以上述配置为例,最终蒸馏训练的损失函数包含下面3个部分。
+以上述配置为例,最终蒸馏训练的损失函数包含下面5个部分。
-- `Student`和`Teacher`的最终输出(`head_out`)与gt的CTC loss,权重为1。在这里因为2个子网络都需要更新参数,因此2者都需要计算与g的loss。
-- `Student`和`Teacher`的最终输出(`head_out`)之间的DML loss,权重为1。
+- `Student`和`Teacher`最终输出(`head_out`)的CTC分支与gt的CTC loss,权重为1。在这里因为2个子网络都需要更新参数,因此两者都需要计算与gt的loss。
+- `Student`和`Teacher`最终输出(`head_out`)的SAR分支与gt的SAR loss,权重为1.0。在这里因为2个子网络都需要更新参数,因此两者都需要计算与gt的loss。
+- `Student`和`Teacher`最终输出(`head_out`)的CTC分支之间的DML loss,权重为1。
+- `Student`和`Teacher`最终输出(`head_out`)的SAR分支之间的DML loss,权重为0.5。
- `Student`和`Teacher`的骨干网络输出(`backbone_out`)之间的l2 loss,权重为1。
+
关于`CombinedLoss`更加具体的实现可以参考: [combined_loss.py](../../ppocr/losses/combined_loss.py#L23)。关于`DistillationCTCLoss`等蒸馏损失函数更加具体的实现可以参考[distillation_loss.py](../../ppocr/losses/distillation_loss.py)。
@@ -245,6 +312,7 @@ PostProcess:
name: DistillationCTCLabelDecode # 蒸馏任务的CTC解码后处理,继承自标准的CTCLabelDecode类
model_name: ["Student", "Teacher"] # 对于蒸馏模型的预测结果,提取这两个子网络的输出,进行解码
key: head_out # 取子网络输出dict中,该key对应的tensor
+ multi_head: True # 多头结构时,会取出其中的CTC分支进行计算
```
以上述配置为例,最终会同时计算`Student`和`Teahcer` 2个子网络的CTC解码输出,返回一个`dict`,`key`为用于处理的子网络名称,`value`为用于处理的子网络列表。
@@ -262,6 +330,7 @@ Metric:
base_metric_name: RecMetric # 指标计算的基类,对于模型的输出,会基于该类,计算指标
main_indicator: acc # 指标的名称
key: "Student" # 选取该子网络的 main_indicator 作为作为保存保存best model的判断标准
+ ignore_space: False # 评估时是否忽略空格的影响
```
以上述配置为例,最终会使用`Student`子网络的acc指标作为保存best model的判断指标,同时,日志中也会打印出所有子网络的acc指标。
@@ -273,15 +342,15 @@ Metric:
对蒸馏得到的识别蒸馏进行微调有2种方式。
-(1)基于知识蒸馏的微调:这种情况比较简单,下载预训练模型,在[ch_PP-OCRv2_rec_distillation.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec_distillation.yml)中配置好预训练模型路径以及自己的数据路径,即可进行模型微调训练。
+(1)基于知识蒸馏的微调:这种情况比较简单,下载预训练模型,在[ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml)中配置好预训练模型路径以及自己的数据路径,即可进行模型微调训练。
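+
+例如,可以参考如下命令(仅为示例,其中预训练模型即下文用到的`ch_PP-OCRv3_rec_train`,数据路径请在配置文件中改为自己的数据):
+
+```shell
+# 下载蒸馏预训练模型并解压
+wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_train.tar && tar -xf ch_PP-OCRv3_rec_train.tar
+# 加载预训练模型,继续进行蒸馏训练
+python3 tools/train.py -c configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml -o Global.pretrained_model=./ch_PP-OCRv3_rec_train/best_accuracy
+```
+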
(2)微调时不使用知识蒸馏:这种情况,需要首先将预训练模型中的学生模型参数提取出来,具体步骤如下。
* 首先下载预训练模型并解压。
```shell
# 下面预训练模型并解压
-wget https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_train.tar
-tar -xf ch_PP-OCRv2_rec_train.tar
+wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_train.tar
+tar -xf ch_PP-OCRv3_rec_train.tar
```
* 然后使用python,对其中的学生模型参数进行提取
@@ -289,7 +358,7 @@ tar -xf ch_PP-OCRv2_rec_train.tar
```python
import paddle
# 加载预训练模型
-all_params = paddle.load("ch_PP-OCRv2_rec_train/best_accuracy.pdparams")
+all_params = paddle.load("ch_PP-OCRv3_rec_train/best_accuracy.pdparams")
# 查看权重参数的keys
print(all_params.keys())
# 学生模型的权重提取
@@ -297,18 +366,17 @@ s_params = {key[len("Student."):]: all_params[key] for key in all_params if "Stu
# 查看学生模型权重参数的keys
print(s_params.keys())
# 保存
-paddle.save(s_params, "ch_PP-OCRv2_rec_train/student.pdparams")
+paddle.save(s_params, "ch_PP-OCRv3_rec_train/student.pdparams")
```
-转化完成之后,使用[ch_PP-OCRv2_rec.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec.yml),修改预训练模型的路径(为导出的`student.pdparams`模型路径)以及自己的数据路径,即可进行模型微调。
+转化完成之后,使用[ch_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec.yml),修改预训练模型的路径(为导出的`student.pdparams`模型路径)以及自己的数据路径,即可进行模型微调。
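+
+微调命令形式如下(仅为示例,按本文档惯例,`Global.pretrained_model`路径不带`.pdparams`后缀):
+
+```shell
+python3 tools/train.py -c configs/rec/PP-OCRv3/ch_PP-OCRv3_rec.yml -o Global.pretrained_model=./ch_PP-OCRv3_rec_train/student
+```
+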
### 2.2 检测配置文件解析
-检测模型蒸馏的配置文件在PaddleOCR/configs/det/ch_PP-OCRv2/目录下,包含三个蒸馏配置文件:
-- ch_PP-OCRv2_det_cml.yml,采用cml蒸馏,采用一个大模型蒸馏两个小模型,且两个小模型互相学习的方法
-- ch_PP-OCRv2_det_dml.yml,采用DML的蒸馏,两个Student模型互蒸馏的方法
-- ch_PP-OCRv2_det_distill.yml,采用Teacher大模型蒸馏小模型Student的方法
+检测模型蒸馏的配置文件在PaddleOCR/configs/det/ch_PP-OCRv3/目录下,包含两个蒸馏配置文件:
+- ch_PP-OCRv3_det_cml.yml,采用cml蒸馏,采用一个大模型蒸馏两个小模型,且两个小模型互相学习的方法
+- ch_PP-OCRv3_det_dml.yml,采用DML的蒸馏,两个Student模型互蒸馏的方法
#### 2.2.1 模型结构
@@ -321,44 +389,44 @@ Architecture:
algorithm: Distillation # 算法名称
Models: # 模型,包含子网络的配置信息
Student: # 子网络名称,至少需要包含`pretrained`与`freeze_params`信息,其他的参数为子网络的构造参数
- pretrained: ./pretrain_models/MobileNetV3_large_x0_5_pretrained
freeze_params: false # 是否需要固定参数
return_all_feats: false # 子网络的参数,表示是否需要返回所有的features,如果为False,则只返回最后的输出
model_type: det
algorithm: DB
Backbone:
- name: MobileNetV3
- scale: 0.5
- model_name: large
- disable_se: True
+ name: ResNet
+ in_channels: 3
+ layers: 50
Neck:
- name: DBFPN
- out_channels: 96
+ name: LKPAN
+ out_channels: 256
Head:
name: DBHead
+ kernel_list: [7,2,2]
k: 50
- Teacher: # 另外一个子网络,这里给的是普通大模型蒸小模型的蒸馏示例,
- pretrained: ./pretrain_models/ch_ppocr_server_v2.0_det_train/best_accuracy
- freeze_params: true # Teacher模型是训练好的,不需要参与训练,freeze_params设置为True
+ Teacher: # 另外一个子网络,这里给的是DML蒸馏示例,
+ freeze_params: true
return_all_feats: false
model_type: det
algorithm: DB
Transform:
Backbone:
name: ResNet
- layers: 18
+ in_channels: 3
+ layers: 50
Neck:
- name: DBFPN
+ name: LKPAN
out_channels: 256
Head:
name: DBHead
+ kernel_list: [7,2,2]
k: 50
```
-如果是采用DML,即两个小模型互相学习的方法,上述配置文件里的Teacher网络结构需要设置为Student模型一样的配置,具体参考配置文件[ch_PP-OCRv2_det_dml.yml](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.4/configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_dml.yml)。
+如果是采用DML,即两个小模型互相学习的方法,上述配置文件里的Teacher网络结构需要设置为Student模型一样的配置,具体参考配置文件[ch_PP-OCRv3_det_dml.yml](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.4/configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_dml.yml)。
-下面介绍[ch_PP-OCRv2_det_cml.yml](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.4/configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml)的配置文件参数:
+下面介绍[ch_PP-OCRv3_det_cml.yml](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.4/configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml)的配置文件参数:
```
Architecture:
@@ -375,12 +443,14 @@ Architecture:
Transform:
Backbone:
name: ResNet
- layers: 18
+ in_channels: 3
+ layers: 50
Neck:
- name: DBFPN
+ name: LKPAN
out_channels: 256
Head:
name: DBHead
+ kernel_list: [7,2,2]
k: 50
Student: # CML蒸馏的Student模型配置
pretrained: ./pretrain_models/MobileNetV3_large_x0_5_pretrained
@@ -392,10 +462,11 @@ Architecture:
name: MobileNetV3
scale: 0.5
model_name: large
- disable_se: True
+ disable_se: true
Neck:
- name: DBFPN
+ name: RSEFPN
out_channels: 96
+ shortcut: True
Head:
name: DBHead
k: 50
@@ -410,10 +481,11 @@ Architecture:
name: MobileNetV3
scale: 0.5
model_name: large
- disable_se: True
+ disable_se: true
Neck:
- name: DBFPN
+ name: RSEFPN
out_channels: 96
+ shortcut: True
Head:
name: DBHead
k: 50
@@ -445,34 +517,7 @@ Architecture:
#### 2.2.2 损失函数
-知识蒸馏任务中,检测ch_PP-OCRv2_det_distill.yml蒸馏损失函数配置如下所示。
-
-```yaml
-Loss:
- name: CombinedLoss # 损失函数名称,基于改名称,构建用于损失函数的类
- loss_config_list: # 损失函数配置文件列表,为CombinedLoss的必备函数
- - DistillationDilaDBLoss: # 基于蒸馏的DB损失函数,继承自标准的DBloss
- weight: 1.0 # 损失函数的权重,loss_config_list中,每个损失函数的配置都必须包含该字段
- model_name_pairs: # 对于蒸馏模型的预测结果,提取这两个子网络的输出,计算Teacher模型和Student模型输出的loss
- - ["Student", "Teacher"]
- key: maps # 取子网络输出dict中,该key对应的tensor
- balance_loss: true # 以下几个参数为标准DBloss的配置参数
- main_loss_type: DiceLoss
- alpha: 5
- beta: 10
- ohem_ratio: 3
- - DistillationDBLoss: # 基于蒸馏的DB损失函数,继承自标准的DBloss,用于计算Student和GT之间的loss
- weight: 1.0
- model_name_list: ["Student"] # 模型名字只有Student,表示计算Student和GT之间的loss
- name: DBLoss
- balance_loss: true
- main_loss_type: DiceLoss
- alpha: 5
- beta: 10
- ohem_ratio: 3
-```
-
-同理,检测ch_PP-OCRv2_det_cml.yml蒸馏损失函数配置如下所示。相比较于ch_PP-OCRv2_det_distill.yml的损失函数配置,cml蒸馏的损失函数配置做了3个改动:
+检测ch_PP-OCRv3_det_cml.yml蒸馏损失函数配置如下所示。
```yaml
Loss:
name: CombinedLoss
@@ -545,26 +590,25 @@ Metric:
#### 2.2.5 检测蒸馏模型finetune
-检测蒸馏有三种方式:
-- 采用ch_PP-OCRv2_det_distill.yml,Teacher模型设置为PaddleOCR提供的模型或者您训练好的大模型
-- 采用ch_PP-OCRv2_det_cml.yml,采用cml蒸馏,同样Teacher模型设置为PaddleOCR提供的模型或者您训练好的大模型
-- 采用ch_PP-OCRv2_det_dml.yml,采用DML的蒸馏,两个Student模型互蒸馏的方法,在PaddleOCR采用的数据集上大约有1.7%的精度提升。
+PP-OCRv3检测蒸馏有两种方式:
+- 采用ch_PP-OCRv3_det_cml.yml,采用cml蒸馏,同样Teacher模型设置为PaddleOCR提供的模型或者您训练好的大模型
+- 采用ch_PP-OCRv3_det_dml.yml,采用DML的蒸馏,两个Student模型互蒸馏的方法,在PaddleOCR采用的数据集上相比单独训练Student模型有1%-2%的提升。
在具体fine-tune时,需要在网络结构的`pretrained`参数中设置要加载的预训练模型。
-在精度提升方面,cml的精度>dml的精度>distill蒸馏方法的精度。当数据量不足或者Teacher模型精度与Student精度相差不大的时候,这个结论或许会改变。
+在精度提升方面,cml蒸馏方法的精度优于dml蒸馏方法。当数据量不足或者Teacher模型精度与Student精度相差不大的时候,这个结论或许会改变。
另外,由于PaddleOCR提供的蒸馏预训练模型包含了多个模型的参数,如果您希望提取Student模型的参数,可以参考如下代码:
```
# 下载蒸馏训练模型的参数
-wget https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_distill_train.tar
+wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar
```
```python
import paddle
# 加载预训练模型
-all_params = paddle.load("ch_PP-OCRv2_det_distill_train/best_accuracy.pdparams")
+all_params = paddle.load("ch_PP-OCRv3_det_distill_train/best_accuracy.pdparams")
# 查看权重参数的keys
print(all_params.keys())
# 学生模型的权重提取
@@ -572,7 +616,7 @@ s_params = {key[len("Student."):]: all_params[key] for key in all_params if "Stu
# 查看学生模型权重参数的keys
print(s_params.keys())
# 保存
-paddle.save(s_params, "ch_PP-OCRv2_det_distill_train/student.pdparams")
+paddle.save(s_params, "ch_PP-OCRv3_det_distill_train/student.pdparams")
```
-最终`Student`模型的参数将会保存在`ch_PP-OCRv2_det_distill_train/student.pdparams`中,用于模型的fine-tune。
+最终`Student`模型的参数将会保存在`ch_PP-OCRv3_det_distill_train/student.pdparams`中,用于模型的fine-tune。
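+
+fine-tune命令形式如下(仅为示意,其中`{path/to/det_config}.yml`为占位符,请替换为您实际用于单模型训练的检测配置文件):
+
+```shell
+python3 tools/train.py -c {path/to/det_config}.yml -o Global.pretrained_model=./ch_PP-OCRv3_det_distill_train/student
+```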
diff --git a/doc/doc_ch/models_list.md b/doc/doc_ch/models_list.md
index dbe244fe305af6cf5fd8ecdfb3263be7c3f7a02d..a8d99b51f17fa0912de77418e24d08ab10774c2c 100644
--- a/doc/doc_ch/models_list.md
+++ b/doc/doc_ch/models_list.md
@@ -1,13 +1,16 @@
-# PP-OCR系列模型列表(V2.1,2021年9月6日更新)
+# PP-OCR系列模型列表(V3,2022年4月28日更新)
> **说明**
-> 1. 2.1版模型相比2.0版模型,2.1的模型在模型精度上做了提升
-> 2. 2.0版模型和[1.1版模型](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/doc/doc_ch/models_list.md) 的主要区别在于动态图训练vs.静态图训练,模型性能上无明显差距。
+> 1. V3版模型相比V2版模型,在模型精度上有进一步提升
+> 2. 2.0+版模型和[1.1版模型](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/doc/doc_ch/models_list.md) 的主要区别在于动态图训练vs.静态图训练,模型性能上无明显差距。
> 3. 本文档提供的是PPOCR自研模型列表,更多基于公开数据集的算法介绍与预训练模型可以参考:[算法概览文档](./algorithm_overview.md)。
-- [PP-OCR系列模型列表(V2.1,2021年9月6日更新)](#pp-ocr系列模型列表v212021年9月6日更新)
+- PP-OCR系列模型列表(V3,2022年4月28日更新)
- [1. 文本检测模型](#1-文本检测模型)
+ - [1.1 中文检测模型](#1.1)
+  - [1.2 英文检测模型](#1.2)
+ - [1.3 多语言检测模型](#1.3)
- [2. 文本识别模型](#2-文本识别模型)
- [2.1 中文识别模型](#21-中文识别模型)
- [2.2 英文识别模型](#22-英文识别模型)
@@ -32,14 +35,42 @@ PaddleOCR提供的可下载模型包括`推理模型`、`训练模型`、`预训
## 1. 文本检测模型
+
+
+### 1.1 中文检测模型
+
|模型名称|模型简介|配置文件|推理模型大小|下载地址|
| --- | --- | --- | --- | --- |
-|ch_PP-OCRv2_det_slim|【最新】slim量化+蒸馏版超轻量模型,支持中英文、多语种文本检测|[ch_PP-OCRv2_det_cml.yml](../../configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml)| 3M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_slim_quant_infer.tar)|
-|ch_PP-OCRv2_det|【最新】原始超轻量模型,支持中英文、多语种文本检测|[ch_PP-OCRv2_det_cml.yml](../../configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml)|3M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_distill_train.tar)|
+|ch_PP-OCRv3_det_slim|【最新】slim量化+蒸馏版超轻量模型,支持中英文、多语种文本检测|[ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml)| 1.1M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_infer.tar) / [训练模型(coming soon)](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_distill_train.tar) / [slim模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_infer.nb)|
+|ch_PP-OCRv3_det| 【最新】原始超轻量模型,支持中英文、多语种文本检测 |[ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml)| 3.8M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar)|
+|ch_PP-OCRv2_det_slim| slim量化+蒸馏版超轻量模型,支持中英文、多语种文本检测|[ch_PP-OCRv2_det_cml.yml](../../configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml)| 3M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_slim_quant_infer.tar)|
+|ch_PP-OCRv2_det| 原始超轻量模型,支持中英文、多语种文本检测|[ch_PP-OCRv2_det_cml.yml](../../configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml)|3M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_distill_train.tar)|
|ch_ppocr_mobile_slim_v2.0_det|slim裁剪版超轻量模型,支持中英文、多语种文本检测|[ch_det_mv3_db_v2.0.yml](../../configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml)| 2.6M |[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/slim/ch_ppocr_mobile_v2.0_det_prune_infer.tar)|
|ch_ppocr_mobile_v2.0_det|原始超轻量模型,支持中英文、多语种文本检测|[ch_det_mv3_db_v2.0.yml](../../configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml)|3M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_train.tar)|
|ch_ppocr_server_v2.0_det|通用模型,支持中英文、多语种文本检测,比超轻量模型更大,但效果更好|[ch_det_res18_db_v2.0.yml](../../configs/det/ch_ppocr_v2.0/ch_det_res18_db_v2.0.yml)|47M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_train.tar)|
+
+
+### 1.2 英文检测模型
+
+|模型名称|模型简介|配置文件|推理模型大小|下载地址|
+| --- | --- | --- | --- | --- |
+|en_PP-OCRv3_det_slim |【最新】slim量化版超轻量模型,支持英文、数字检测 | [ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml) | 1.1M |[推理模型(coming soon)](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_slim_infer.tar) / [训练模型(coming soon)](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_slim_distill_train.tar) / [slim模型(coming soon)](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_slim_infer.nb) |
+|en_PP-OCRv3_det |【最新】原始超轻量模型,支持英文、数字检测|[ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml)| 3.8M | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_distill_train.tar) |
+
+* 注:英文检测模型与中文检测模型结构完全相同,只有训练数据不同,在此仅提供相同的配置文件。
+
+
+
+### 1.3 多语言检测模型
+
+|模型名称|模型简介|配置文件|推理模型大小|下载地址|
+| --- | --- | --- | --- | --- |
+| ml_PP-OCRv3_det_slim |【最新】slim量化版超轻量模型,支持多语言检测 | [ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml) | 1.1M |[推理模型(coming soon)](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/Multilingual_PP-OCRv3_det_slim_infer.tar) / [训练模型(coming soon)](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/Multilingual_PP-OCRv3_det_slim_distill_train.tar) / [slim模型(coming soon)](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/Multilingual_PP-OCRv3_det_slim_infer.nb) |
+| ml_PP-OCRv3_det |【最新】原始超轻量模型,支持多语言检测 | [ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml)| 3.8M | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/Multilingual_PP-OCRv3_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/Multilingual_PP-OCRv3_det_distill_train.tar) |
+
+* 注:多语言检测模型与中文检测模型结构完全相同,只有训练数据不同,在此仅提供相同的配置文件。
+
## 2. 文本识别模型
@@ -50,8 +81,10 @@ PaddleOCR提供的可下载模型包括`推理模型`、`训练模型`、`预训
|模型名称|模型简介|配置文件|推理模型大小|下载地址|
| --- | --- | --- | --- | --- |
-|ch_PP-OCRv2_rec_slim|【最新】slim量化版超轻量模型,支持中英文、数字识别|[ch_PP-OCRv2_rec.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec.yml)| 9M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_train.tar) |
-|ch_PP-OCRv2_rec|【最新】原始超轻量模型,支持中英文、数字识别|[ch_PP-OCRv2_rec_distillation.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec_distillation.yml)|8.5M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_train.tar) |
+|ch_PP-OCRv3_rec_slim |【最新】slim量化版超轻量模型,支持中英文、数字识别|[ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml)| 4.9M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_train.tar) / [slim模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.nb) |
+|ch_PP-OCRv3_rec|【最新】原始超轻量模型,支持中英文、数字识别|[ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml)| 12.4M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_train.tar) |
+|ch_PP-OCRv2_rec_slim| slim量化版超轻量模型,支持中英文、数字识别|[ch_PP-OCRv2_rec.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec.yml)| 9M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_train.tar) |
+|ch_PP-OCRv2_rec| 原始超轻量模型,支持中英文、数字识别|[ch_PP-OCRv2_rec_distillation.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec_distillation.yml)|8.5M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_train.tar) |
|ch_ppocr_mobile_slim_v2.0_rec|slim裁剪量化版超轻量模型,支持中英文、数字识别|[rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml)| 6M |[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_slim_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_slim_train.tar) |
|ch_ppocr_mobile_v2.0_rec|原始超轻量模型,支持中英文、数字识别|[rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml)|5.2M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_train.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_pre.tar) |
|ch_ppocr_server_v2.0_rec|通用模型,支持中英文、数字识别|[rec_chinese_common_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_common_train_v2.0.yml)|94.8M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_train.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_pre.tar) |
@@ -63,26 +96,28 @@ PaddleOCR提供的可下载模型包括`推理模型`、`训练模型`、`预训
|模型名称|模型简介|配置文件|推理模型大小|下载地址|
| --- | --- | --- | --- | --- |
+|en_PP-OCRv3_rec_slim |【最新】slim量化版超轻量模型,支持英文、数字识别 | [en_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml)| - |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_slim_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_slim_train.tar) / [slim模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_slim_infer.nb) |
+|en_PP-OCRv3_rec |【最新】原始超轻量模型,支持英文、数字识别|[en_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml)| 9.6M | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_train.tar) |
|en_number_mobile_slim_v2.0_rec|slim裁剪量化版超轻量模型,支持英文、数字识别|[rec_en_number_lite_train.yml](../../configs/rec/multi_language/rec_en_number_lite_train.yml)| 2.7M | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/en_number_mobile_v2.0_rec_slim_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/en_number_mobile_v2.0_rec_slim_train.tar) |
|en_number_mobile_v2.0_rec|原始超轻量模型,支持英文、数字识别|[rec_en_number_lite_train.yml](../../configs/rec/multi_language/rec_en_number_lite_train.yml)|2.6M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/en_number_mobile_v2.0_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/en_number_mobile_v2.0_rec_train.tar) |
+
### 2.3 多语言识别模型(更多语言持续更新中...)
|模型名称|字典文件|模型简介|配置文件|推理模型大小|下载地址|
| --- | --- | --- | --- |--- | --- |
-| french_mobile_v2.0_rec | ppocr/utils/dict/french_dict.txt |法文识别|[rec_french_lite_train.yml](../../configs/rec/multi_language/rec_french_lite_train.yml)|2.65M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/french_mobile_v2.0_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/french_mobile_v2.0_rec_train.tar) |
-| german_mobile_v2.0_rec | ppocr/utils/dict/german_dict.txt |德文识别|[rec_german_lite_train.yml](../../configs/rec/multi_language/rec_german_lite_train.yml)|2.65M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/german_mobile_v2.0_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/german_mobile_v2.0_rec_train.tar) |
-| korean_mobile_v2.0_rec | ppocr/utils/dict/korean_dict.txt |韩文识别|[rec_korean_lite_train.yml](../../configs/rec/multi_language/rec_korean_lite_train.yml)|3.9M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/korean_mobile_v2.0_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/korean_mobile_v2.0_rec_train.tar) |
-| japan_mobile_v2.0_rec | ppocr/utils/dict/japan_dict.txt |日文识别|[rec_japan_lite_train.yml](../../configs/rec/multi_language/rec_japan_lite_train.yml)|4.23M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/japan_mobile_v2.0_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/japan_mobile_v2.0_rec_train.tar) |
-| chinese_cht_mobile_v2.0_rec | ppocr/utils/dict/chinese_cht_dict.txt | 中文繁体识别|rec_chinese_cht_lite_train.yml|5.63M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/chinese_cht_mobile_v2.0_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/chinese_cht_mobile_v2.0_rec_train.tar) |
-| te_mobile_v2.0_rec | ppocr/utils/dict/te_dict.txt | 泰卢固文识别|rec_te_lite_train.yml|2.63M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/te_mobile_v2.0_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/te_mobile_v2.0_rec_train.tar) |
-| ka_mobile_v2.0_rec | ppocr/utils/dict/ka_dict.txt |卡纳达文识别|rec_ka_lite_train.yml|2.63M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/ka_mobile_v2.0_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/ka_mobile_v2.0_rec_train.tar) |
-| ta_mobile_v2.0_rec | ppocr/utils/dict/ta_dict.txt |泰米尔文识别|rec_ta_lite_train.yml|2.63M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/ta_mobile_v2.0_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/ta_mobile_v2.0_rec_train.tar) |
-| latin_mobile_v2.0_rec | ppocr/utils/dict/latin_dict.txt | 拉丁文识别 | [rec_latin_lite_train.yml](../../configs/rec/multi_language/rec_latin_lite_train.yml) |2.6M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/latin_ppocr_mobile_v2.0_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/latin_ppocr_mobile_v2.0_rec_train.tar) |
-| arabic_mobile_v2.0_rec | ppocr/utils/dict/arabic_dict.txt | 阿拉伯字母 | [rec_arabic_lite_train.yml](../../configs/rec/multi_language/rec_arabic_lite_train.yml) |2.6M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/arabic_ppocr_mobile_v2.0_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/arabic_ppocr_mobile_v2.0_rec_train.tar) |
-| cyrillic_mobile_v2.0_rec | ppocr/utils/dict/cyrillic_dict.txt | 斯拉夫字母 | [rec_cyrillic_lite_train.yml](../../configs/rec/multi_language/rec_cyrillic_lite_train.yml) |2.6M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/cyrillic_ppocr_mobile_v2.0_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/cyrillic_ppocr_mobile_v2.0_rec_train.tar) |
-| devanagari_mobile_v2.0_rec | ppocr/utils/dict/devanagari_dict.txt |梵文字母 | [rec_devanagari_lite_train.yml](../../configs/rec/multi_language/rec_devanagari_lite_train.yml) |2.6M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/devanagari_ppocr_mobile_v2.0_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/devanagari_ppocr_mobile_v2.0_rec_train.tar) |
+| korean_PP-OCRv3_rec | ppocr/utils/dict/korean_dict.txt |韩文识别|[korean_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/korean_PP-OCRv3_rec.yml)|11M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/korean_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/korean_PP-OCRv3_rec_train.tar) |
+| japan_PP-OCRv3_rec | ppocr/utils/dict/japan_dict.txt |日文识别|[japan_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/japan_PP-OCRv3_rec.yml)|11M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/japan_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/japan_PP-OCRv3_rec_train.tar) |
+| chinese_cht_PP-OCRv3_rec | ppocr/utils/dict/chinese_cht_dict.txt | 中文繁体识别|[chinese_cht_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/chinese_cht_PP-OCRv3_rec.yml)|12M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/chinese_cht_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/chinese_cht_PP-OCRv3_rec_train.tar) |
+| te_PP-OCRv3_rec | ppocr/utils/dict/te_dict.txt | 泰卢固文识别|[te_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/te_PP-OCRv3_rec.yml)|9.6M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/te_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/te_PP-OCRv3_rec_train.tar) |
+| ka_PP-OCRv3_rec | ppocr/utils/dict/ka_dict.txt |卡纳达文识别|[ka_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/ka_PP-OCRv3_rec.yml)|9.9M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/ka_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/ka_PP-OCRv3_rec_train.tar) |
+| ta_PP-OCRv3_rec | ppocr/utils/dict/ta_dict.txt |泰米尔文识别|[ta_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/ta_PP-OCRv3_rec.yml)|9.6M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/ta_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/ta_PP-OCRv3_rec_train.tar) |
+| latin_PP-OCRv3_rec | ppocr/utils/dict/latin_dict.txt | 拉丁文识别 | [latin_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/latin_PP-OCRv3_rec.yml) |9.6M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/latin_ppocr_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/latin_ppocr_PP-OCRv3_rec_train.tar) |
+| arabic_PP-OCRv3_rec | ppocr/utils/dict/arabic_dict.txt | 阿拉伯字母 | [arabic_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/arabic_PP-OCRv3_rec.yml) |9.6M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/arabic_ppocr_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/arabic_ppocr_PP-OCRv3_rec_train.tar) |
+| cyrillic_PP-OCRv3_rec | ppocr/utils/dict/cyrillic_dict.txt | 斯拉夫字母 | [cyrillic_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/cyrillic_PP-OCRv3_rec.yml) |9.6M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/cyrillic_ppocr_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/cyrillic_ppocr_PP-OCRv3_rec_train.tar) |
+| devanagari_PP-OCRv3_rec | ppocr/utils/dict/devanagari_dict.txt |梵文字母 | [devanagari_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/devanagari_PP-OCRv3_rec.yml) |9.6M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/devanagari_ppocr_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/devanagari_ppocr_PP-OCRv3_rec_train.tar) |
+
更多支持语种请参考: [多语言模型](./multi_languages.md)
diff --git a/doc/doc_ch/multi_languages.md b/doc/doc_ch/multi_languages.md
index af9ff82e357e5945bfddf10337d0af3cd04390a0..6838b350403a7044e629e5fcc5893bced98af9d3 100644
--- a/doc/doc_ch/multi_languages.md
+++ b/doc/doc_ch/multi_languages.md
@@ -176,8 +176,62 @@ ppocr 还支持方向分类, 更多使用方式请参考:[whl包使用说明
ppocr 支持使用自己的数据进行自定义训练或finetune, 其中识别模型可以参考 [法语配置文件](../../configs/rec/multi_language/rec_french_lite_train.yml)
修改训练数据路径、字典等参数。
-具体数据准备、训练过程可参考:[文本检测](../doc_ch/detection.md)、[文本识别](../doc_ch/recognition.md),更多功能如预测部署、
-数据标注等功能可以阅读完整的[文档教程](../../README_ch.md)。
+详细数据准备、训练过程可参考:[文本识别](../doc_ch/recognition.md)、[文本检测](../doc_ch/detection.md)。
+
+假设已经准备好了训练数据,可根据以下步骤快速启动训练:
+
+- 修改配置文件
+
+以 `rec_french_lite_train.yml` 为例:
+
+```
+Global:
+ ...
+ # 添加自定义字典,如修改字典请将路径指向新字典
+ character_dict_path: ./ppocr/utils/dict/french_dict.txt
+ ...
+ # 识别空格
+ use_space_char: True
+
+...
+
+Train:
+ dataset:
+ # 数据集格式,支持LMDBDataSet以及SimpleDataSet
+ name: SimpleDataSet
+ # 数据集路径
+ data_dir: ./train_data/
+ # 训练集标签文件
+ label_file_list: ["./train_data/french_train.txt"]
+ ...
+
+Eval:
+ dataset:
+ # 数据集格式,支持LMDBDataSet以及SimpleDataSet
+ name: SimpleDataSet
+ # 数据集路径
+ data_dir: ./train_data
+ # 验证集标签文件
+ label_file_list: ["./train_data/french_val.txt"]
+ ...
+```
+
+- 启动训练:
+
+```
+# 下载预训练模型
+wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/french_mobile_v2.0_rec_train.tar
+tar -xf french_mobile_v2.0_rec_train.tar
+
+#加载预训练模型 单卡训练
+python3 tools/train.py -c configs/rec/rec_french_lite_train.yml -o Global.pretrained_model=french_mobile_v2.0_rec_train/best_accuracy
+
+#加载预训练模型 多卡训练,通过--gpus参数指定卡号
+python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_french_lite_train.yml -o Global.pretrained_model=french_mobile_v2.0_rec_train/best_accuracy
+```
+
+
+更多功能如预测部署、数据标注等功能可以阅读完整的[文档教程](../../README_ch.md)。
## 4 预测部署
diff --git a/doc/doc_ch/ppocr_introduction.md b/doc/doc_ch/ppocr_introduction.md
index 2e25ebc9501d2e916b86867bf265490aa0971be0..6527c5803b3135bda922b5478ebe9ddbbb9ae0d9 100644
--- a/doc/doc_ch/ppocr_introduction.md
+++ b/doc/doc_ch/ppocr_introduction.md
@@ -34,14 +34,23 @@ PP-OCR从骨干网络选择和调整、预测头部的设计、数据增强、
#### PP-OCRv2
-PP-OCRv2在PP-OCR的基础上,进一步在5个方面重点优化,检测模型采用CML协同互学习知识蒸馏策略和CopyPaste数据增广策略;识别模型采用LCNet轻量级骨干网络、UDML 改进知识蒸馏策略和[Enhanced CTC loss](./doc/doc_ch/enhanced_ctc_loss.md)损失函数改进(如上图红框所示),进一步在推理速度和预测效果上取得明显提升。更多细节请参考PP-OCRv2[技术报告](https://arxiv.org/abs/2109.03144)。
+PP-OCRv2在PP-OCR的基础上,进一步在5个方面重点优化,检测模型采用CML协同互学习知识蒸馏策略和CopyPaste数据增广策略;识别模型采用LCNet轻量级骨干网络、UDML 改进知识蒸馏策略和[Enhanced CTC loss](./enhanced_ctc_loss.md)损失函数改进(如上图红框所示),进一步在推理速度和预测效果上取得明显提升。更多细节请参考PP-OCRv2[技术报告](https://arxiv.org/abs/2109.03144)。
#### PP-OCRv3
+PP-OCRv3在PP-OCRv2的基础上进一步升级。检测模型仍然基于DB算法,优化策略采用了带残差注意力机制的FPN结构RSEFPN、增大感受野的PAN结构LKPAN、基于DML训练的更优的教师模型;识别模型将base模型从CRNN替换成了IJCAI 2022论文[SVTR](https://arxiv.org/abs/2205.00159),并采用SVTR轻量化、带指导训练CTC、数据增广策略RecConAug、自监督训练的更好的预训练模型、无标签数据的使用进行模型加速和效果提升。更多细节请参考PP-OCRv3[技术报告](./PP-OCRv3_introduction.md)。
+
+
+PP-OCRv3系统pipeline如下:
+
+
+

+
## 2. 特性
+- 超轻量PP-OCRv3系列:检测(3.6M)+ 方向分类器(1.4M)+ 识别(12M)= 17.0M
- 超轻量PP-OCRv2系列:检测(3.1M)+ 方向分类器(1.4M)+ 识别(8.5M)= 13.0M
- 超轻量PP-OCR mobile移动端系列:检测(3.0M)+方向分类器(1.4M)+ 识别(5.0M)= 9.4M
- 通用PP-OCR server系列:检测(47.1M)+方向分类器(1.4M)+ 识别(94.9M)= 143.4M
@@ -68,13 +77,13 @@ PP-OCRv2在PP-OCR的基础上,进一步在5个方面重点优化,检测模
-
+
PP-OCRv2 英文模型
-
+
@@ -84,15 +93,15 @@ PP-OCRv2在PP-OCR的基础上,进一步在5个方面重点优化,检测模
PP-OCRv2 其他语言模型
-
+
-
+
-
+
## 5. 使用教程
@@ -115,8 +124,10 @@ PP-OCR中英文模型列表如下:
| 模型简介 | 模型名称 | 推荐场景 | 检测模型 | 方向分类器 | 识别模型 |
| ------------------------------------- | ----------------------- | --------------- | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ |
-| 中英文超轻量PP-OCRv2模型(13.0M) | ch_PP-OCRv2_xx | 移动端&服务器端 | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_distill_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_train.tar) |
+| 中英文超轻量PP-OCRv3模型(16.2M) | ch_PP-OCRv3_xx | 移动端&服务器端 | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_train.tar) |
+| 英文超轻量PP-OCRv3模型(13.4M) | en_PP-OCRv3_xx | 移动端&服务器端 | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_distill_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_train.tar) |
+| 中英文超轻量PP-OCRv2模型(13.0M) | ch_PP-OCRv2_xx | 移动端&服务器端 | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_distill_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_train.tar) |
| 中英文超轻量PP-OCR mobile模型(9.4M) | ch_ppocr_mobile_v2.0_xx | 移动端&服务器端 | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_pre.tar) |
| 中英文通用PP-OCR server模型(143.4M) | ch_ppocr_server_v2.0_xx | 服务器端 | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_pre.tar) |
-更多模型下载(包括英文数字模型、多语言模型、Paddle-Lite模型等),可以参考[PP-OCR 系列模型下载](./models_list.md)。
\ No newline at end of file
+更多模型下载(包括英文数字模型、多语言模型、Paddle-Lite模型等),可以参考[PP-OCR 系列模型下载](./models_list.md)。
diff --git a/doc/doc_ch/quickstart.md b/doc/doc_ch/quickstart.md
index ce0f6b1570f1570f7d12bf1ad24d7d9f9914f5f0..6301755de8e41e497b83d54c897b2b939d758cdc 100644
--- a/doc/doc_ch/quickstart.md
+++ b/doc/doc_ch/quickstart.md
@@ -59,21 +59,21 @@ cd /path/to/ppocr_img
如果不使用提供的测试图片,可以将下方`--image_dir`参数替换为相应的测试图片路径。
+**注意** whl包默认使用`PP-OCRv3`模型,识别模型使用的输入shape为`3,48,320`, 因此如果使用识别功能,需要添加参数`--rec_image_shape 3,48,320`,如果不使用默认的`PP-OCRv3`模型,则无需设置该参数。
+
#### 2.1.1 中英文模型
* 检测+方向分类器+识别全流程:`--use_angle_cls true`设置使用方向分类器识别180度旋转文字,`--use_gpu false`设置不使用GPU
```bash
- paddleocr --image_dir ./imgs/11.jpg --use_angle_cls true --use_gpu false
+ paddleocr --image_dir ./imgs/11.jpg --use_angle_cls true --use_gpu false --rec_image_shape 3,48,320
```
结果是一个list,每个item包含了文本框,文字和识别置信度
```bash
- [[[24.0, 36.0], [304.0, 34.0], [304.0, 72.0], [24.0, 74.0]], ['纯臻营养护发素', 0.964739]]
- [[[24.0, 80.0], [172.0, 80.0], [172.0, 104.0], [24.0, 104.0]], ['产品信息/参数', 0.98069626]]
- [[[24.0, 109.0], [333.0, 109.0], [333.0, 136.0], [24.0, 136.0]], ['(45元/每公斤,100公斤起订)', 0.9676722]]
+ [[[28.0, 37.0], [302.0, 39.0], [302.0, 72.0], [27.0, 70.0]], ('纯臻营养护发素', 0.9658738374710083)]
......
```
@@ -86,35 +86,34 @@ cd /path/to/ppocr_img
结果是一个list,每个item只包含文本框
```bash
- [[26.0, 457.0], [137.0, 457.0], [137.0, 477.0], [26.0, 477.0]]
- [[25.0, 425.0], [372.0, 425.0], [372.0, 448.0], [25.0, 448.0]]
- [[128.0, 397.0], [273.0, 397.0], [273.0, 414.0], [128.0, 414.0]]
+ [[27.0, 459.0], [136.0, 459.0], [136.0, 479.0], [27.0, 479.0]]
+ [[28.0, 429.0], [372.0, 429.0], [372.0, 445.0], [28.0, 445.0]]
......
```
- 单独使用识别:设置`--det`为`false`
```bash
- paddleocr --image_dir ./imgs_words/ch/word_1.jpg --det false
+ paddleocr --image_dir ./imgs_words/ch/word_1.jpg --det false --rec_image_shape 3,48,320
```
结果是一个list,每个item只包含识别结果和识别置信度
```bash
- ['韩国小馆', 0.9907421]
+ ['韩国小馆', 0.994467]
```
-如需使用2.0模型,请指定参数`--version PP-OCR`,paddleocr默认使用2.1模型(`--versioin PP-OCRv2`)。更多whl包使用可参考[whl包文档](./whl.md)
+如需使用2.0模型,请指定参数`--version PP-OCR`,paddleocr默认使用PP-OCRv3模型(`--version PP-OCRv3`)。更多whl包使用可参考[whl包文档](./whl.md)
#### 2.1.2 多语言模型
-Paddleocr目前支持80个语种,可以通过修改`--lang`参数进行切换,对于英文模型,指定`--lang=en`。
+Paddleocr目前支持80个语种,可以通过修改`--lang`参数进行切换,对于英文模型,指定`--lang=en`。PP-OCRv3目前只支持中文和英文模型,其他多语言模型会陆续更新。
``` bash
-paddleocr --image_dir ./imgs_en/254.jpg --lang=en
+paddleocr --image_dir ./imgs_en/254.jpg --lang=en --rec_image_shape 3,48,320
```
@@ -125,13 +124,9 @@ paddleocr --image_dir ./imgs_en/254.jpg --lang=en
结果是一个list,每个item包含了文本框,文字和识别置信度
```text
-[('PHO CAPITAL', 0.95723116), [[66.0, 50.0], [327.0, 44.0], [327.0, 76.0], [67.0, 82.0]]]
-[('107 State Street', 0.96311164), [[72.0, 90.0], [451.0, 84.0], [452.0, 116.0], [73.0, 121.0]]]
-[('Montpelier Vermont', 0.97389287), [[69.0, 132.0], [501.0, 126.0], [501.0, 158.0], [70.0, 164.0]]]
-[('8022256183', 0.99810505), [[71.0, 175.0], [363.0, 170.0], [364.0, 202.0], [72.0, 207.0]]]
-[('REG 07-24-201706:59 PM', 0.93537045), [[73.0, 299.0], [653.0, 281.0], [654.0, 318.0], [74.0, 336.0]]]
-[('045555', 0.99346405), [[509.0, 331.0], [651.0, 325.0], [652.0, 356.0], [511.0, 362.0]]]
-[('CT1', 0.9988654), [[535.0, 367.0], [654.0, 367.0], [654.0, 406.0], [535.0, 406.0]]]
+[[[67.0, 51.0], [327.0, 46.0], [327.0, 74.0], [68.0, 80.0]], ('PHOCAPITAL', 0.9944712519645691)]
+[[[72.0, 92.0], [453.0, 84.0], [454.0, 114.0], [73.0, 122.0]], ('107 State Street', 0.9744491577148438)]
+[[[69.0, 135.0], [501.0, 125.0], [501.0, 156.0], [70.0, 165.0]], ('Montpelier Vermont', 0.9357033967971802)]
......
```
@@ -181,9 +176,7 @@ im_show.save('result.jpg')
结果是一个list,每个item包含了文本框,文字和识别置信度
```bash
-[[[24.0, 36.0], [304.0, 34.0], [304.0, 72.0], [24.0, 74.0]], ['纯臻营养护发素', 0.964739]]
-[[[24.0, 80.0], [172.0, 80.0], [172.0, 104.0], [24.0, 104.0]], ['产品信息/参数', 0.98069626]]
-[[[24.0, 109.0], [333.0, 109.0], [333.0, 136.0], [24.0, 136.0]], ['(45元/每公斤,100公斤起订)', 0.9676722]]
+[[[28.0, 37.0], [302.0, 39.0], [302.0, 72.0], [27.0, 70.0]], ('纯臻营养护发素', 0.9658738374710083)]
......
```
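+
+如果需要在代码中进一步处理识别结果,可以参考如下示意代码(仅为演示新版返回格式的最小示例,`result` 直接取自上文打印的内容):
+
+```python
+# 每个 item 的格式为 [文本框坐标, (识别文字, 置信度)]
+result = [
+    [[[28.0, 37.0], [302.0, 39.0], [302.0, 72.0], [27.0, 70.0]], ('纯臻营养护发素', 0.9658738374710083)],
+]
+
+for box, (text, score) in result:
+    print(f"文字: {text}, 置信度: {score:.4f}, 文本框: {box}")
+```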
diff --git a/doc/doc_ch/recognition.md b/doc/doc_ch/recognition.md
index 34a462f7ab704ce7c57fc7b8ef7f0fb3f1fb8931..8457df69ff4c09b196b0f0f91271a92344217d75 100644
--- a/doc/doc_ch/recognition.md
+++ b/doc/doc_ch/recognition.md
@@ -3,31 +3,30 @@
本文提供了PaddleOCR文本识别任务的全流程指南,包括数据准备、模型训练、调优、评估、预测,各个阶段的详细说明:
- [1. 数据准备](#1-数据准备)
- * [1.1 自定义数据集](#11-自定义数据集)
- * [1.2 数据下载](#12-数据下载)
- * [1.3 字典](#13-字典)
- * [1.4 添加空格类别](#14-添加空格类别)
- * [1.5 数据增强](#15-数据增强)
+ - [1.1. 准备数据集](#11-准备数据集)
+ - [1.2. 自定义数据集](#12-自定义数据集)
+ - [1.3. 数据下载](#13-数据下载)
+ - [1.4. 字典](#14-字典)
+ - [1.5. 添加空格类别](#15-添加空格类别)
+ - [1.6. 数据增强](#16-数据增强)
- [2. 开始训练](#2-开始训练)
- * [2.1 启动训练](#21-----)
- * [2.2 断点训练](#22-----)
- * [2.3 更换Backbone 训练](#23---backbone---)
- * [2.4 混合精度训练](#24---amp---)
- * [2.5 分布式训练](#25---fleet---)
- * [2.6 知识蒸馏训练](#26---distill---)
- * [2.7 多语言模型训练](#27-多语言模型训练)
- * [2.8 其他训练环境(Windows/macOS/Linux DCU)](#28---other---)
-- [3. 模型评估与预测](#3--------)
- * [3.1 指标评估](#31-----)
- * [3.2 测试识别效果](#32-------)
-- [4. 模型导出与预测](#4--------)
+ - [2.1. 启动训练](#21-启动训练)
+ - [2.2. 断点训练](#22-断点训练)
+ - [2.3. 更换Backbone 训练](#23-更换backbone-训练)
+ - [2.4. 混合精度训练](#24-混合精度训练)
+ - [2.5. 分布式训练](#25-分布式训练)
+ - [2.6. 知识蒸馏训练](#26-知识蒸馏训练)
+ - [2.7. 多语言模型训练](#27-多语言模型训练)
+ - [2.8. 其他训练环境](#28-其他训练环境)
+- [3. 模型评估与预测](#3-模型评估与预测)
+ - [3.1. 指标评估](#31-指标评估)
+ - [3.2. 测试识别效果](#32-测试识别效果)
+- [4. 模型导出与预测](#4-模型导出与预测)
- [5. FAQ](#5-faq)
-
-
# 1. 数据准备
-### 1.1 准备数据集
+## 1.1. 准备数据集
PaddleOCR 支持两种数据格式:
- `lmdb` 用于训练以lmdb格式存储的数据集(LMDBDataSet);
@@ -42,8 +41,8 @@ ln -sf
/train_data/dataset
mklink /d /train_data/dataset
```
-
-## 1.1 自定义数据集
+## 1.2. 自定义数据集
+
下面以通用数据集为例, 介绍如何准备数据集:
* 训练集
@@ -98,15 +97,12 @@ train_data/rec/train/word_002.jpg 用科技让复杂的世界更简单
| ...
```
-
-## 1.2 数据下载
+## 1.3. 数据下载
- ICDAR2015
若您本地没有数据集,可以在官网下载 [ICDAR2015](http://rrc.cvc.uab.es/?ch=4&com=downloads) 数据,用于快速验证。也可以参考[DTRB](https://github.com/clovaai/deep-text-recognition-benchmark#download-lmdb-dataset-for-traininig-and-evaluation-from-here) ,下载 benchmark 所需的lmdb格式数据集。
-如果希望复现SAR的论文指标,需要下载[SynthAdd](https://pan.baidu.com/share/init?surl=uV0LtoNmcxbO-0YA7Ch4dg), 提取码:627x。此外,真实数据集icdar2013, icdar2015, cocotext, IIIT5也作为训练数据的一部分。具体数据细节可以参考论文SAR。
-
如果你使用的是icdar2015的公开数据集,PaddleOCR 提供了一份用于训练 ICDAR2015 数据集的标签文件,通过以下方式下载:
```
@@ -132,9 +128,7 @@ python gen_label.py --mode="rec" --input_path="{path/of/origin/label}" --output_
* [百度网盘](https://pan.baidu.com/s/1bS_u207Rm7YbY33wOECKDA) 提取码:frgi
* [google drive](https://drive.google.com/file/d/18cSWX7wXSy4G0tbKJ0d9PuIaiwRLHpjA/view)
-
-
-## 1.3 字典
+## 1.4. 字典
最后需要提供一个字典({word_dict_name}.txt),使模型在训练时,可以将所有出现的字符映射为字典的索引。
@@ -169,20 +163,19 @@ PaddleOCR内置了一部分字典,可以按需使用。
`ppocr/utils/en_dict.txt` 是一个包含96个字符的英文字典
+
目前的多语言模型仍处在demo阶段,会持续优化模型并补充语种,**非常欢迎您为我们提供其他语言的字典和字体**,
如您愿意可将字典文件提交至 [dict](../../ppocr/utils/dict),我们会在Repo中感谢您。
- 自定义字典
-如需自定义dic文件,请在 `configs/rec/rec_icdar15_train.yml` 中添加 `character_dict_path` 字段, 指向您的字典路径。
+如需自定义dic文件,请在 `configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml` 中添加 `character_dict_path` 字段, 指向您的字典路径。
-
-## 1.4 添加空格类别
+## 1.5. 添加空格类别
如果希望支持识别"空格"类别, 请将yml文件中的 `use_space_char` 字段设置为 `True`。
-
-## 1.5 数据增强
+## 1.6. 数据增强
PaddleOCR提供了多种数据增强方式,默认配置文件中已经添加了数据增广。
@@ -192,23 +185,21 @@ PaddleOCR提供了多种数据增强方式,默认配置文件中已经添加
*由于OpenCV的兼容性问题,扰动操作暂时只支持Linux*
-
# 2. 开始训练
-PaddleOCR提供了训练脚本、评估脚本和预测脚本,本节将以 CRNN 识别模型为例:
+PaddleOCR提供了训练脚本、评估脚本和预测脚本,本节将以 PP-OCRv3 英文识别模型为例:
-
-## 2.1 启动训练
+## 2.1. 启动训练
首先下载pretrain model,您可以下载训练好的模型在 icdar2015 数据上进行finetune
```
cd PaddleOCR/
-# 下载MobileNetV3的预训练模型
-wget -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_mv3_none_bilstm_ctc_v2.0_train.tar
+# 下载英文PP-OCRv3的预训练模型
+wget -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_train.tar
# 解压模型参数
cd pretrain_models
-tar -xf rec_mv3_none_bilstm_ctc_v2.0_train.tar && rm -rf rec_mv3_none_bilstm_ctc_v2.0_train.tar
+tar -xf en_PP-OCRv3_rec_train.tar && rm -rf en_PP-OCRv3_rec_train.tar
```
开始训练:
@@ -220,44 +211,23 @@ tar -xf rec_mv3_none_bilstm_ctc_v2.0_train.tar && rm -rf rec_mv3_none_bilstm_ctc
# 训练icdar15英文数据 训练日志会自动保存为 "{save_model_dir}" 下的train.log
#单卡训练(训练周期长,不建议)
-python3 tools/train.py -c configs/rec/rec_icdar15_train.yml
+python3 tools/train.py -c configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml -o Global.pretrained_model=./pretrain_models/en_PP-OCRv3_rec_train/best_accuracy
#多卡训练,通过--gpus参数指定卡号
-python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_icdar15_train.yml
+python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml -o Global.pretrained_model=./pretrain_models/en_PP-OCRv3_rec_train/best_accuracy
```
-PaddleOCR支持训练和评估交替进行, 可以在 `configs/rec/rec_icdar15_train.yml` 中修改 `eval_batch_step` 设置评估频率,默认每500个iter评估一次。评估过程中默认将最佳acc模型,保存为 `output/rec_CRNN/best_accuracy` 。
+PaddleOCR支持训练和评估交替进行, 可以在 `configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml` 中修改 `eval_batch_step` 设置评估频率,默认每500个iter评估一次。评估过程中默认将最佳acc模型,保存为 `output/en_PP-OCRv3_rec/best_accuracy` 。
如果验证集很大,测试将会比较耗时,建议减少评估次数,或训练完再进行评估。
-**提示:** 可通过 -c 参数选择 `configs/rec/` 路径下的多种模型配置进行训练,PaddleOCR支持的识别算法有:
-
+**提示:** 可通过 -c 参数选择 `configs/rec/` 路径下的多种模型配置进行训练,PaddleOCR支持的识别算法可以参考[前沿算法列表](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/doc/doc_ch/algorithm_overview.md#12-%E6%96%87%E6%9C%AC%E8%AF%86%E5%88%AB%E7%AE%97%E6%B3%95):
-| 配置文件 | 算法名称 | backbone | trans | seq | pred |
-| :--------: | :-------: | :-------: | :-------: | :-----: | :-----: |
-| [rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml) | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc |
-| [rec_chinese_common_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_common_train_v2.0.yml) | CRNN | ResNet34_vd | None | BiLSTM | ctc |
-| rec_icdar15_train.yml | CRNN | Mobilenet_v3 large 0.5 | None | BiLSTM | ctc |
-| rec_mv3_none_bilstm_ctc.yml | CRNN | Mobilenet_v3 large 0.5 | None | BiLSTM | ctc |
-| rec_mv3_none_none_ctc.yml | Rosetta | Mobilenet_v3 large 0.5 | None | None | ctc |
-| rec_r34_vd_none_bilstm_ctc.yml | CRNN | Resnet34_vd | None | BiLSTM | ctc |
-| rec_r34_vd_none_none_ctc.yml | Rosetta | Resnet34_vd | None | None | ctc |
-| rec_mv3_tps_bilstm_att.yml | CRNN | Mobilenet_v3 | TPS | BiLSTM | att |
-| rec_r34_vd_tps_bilstm_att.yml | CRNN | Resnet34_vd | TPS | BiLSTM | att |
-| rec_r50fpn_vd_none_srn.yml | SRN | Resnet50_fpn_vd | None | rnn | srn |
-| rec_mtb_nrtr.yml | NRTR | nrtr_mtb | None | transformer encoder | transformer decoder |
-| rec_r31_sar.yml | SAR | ResNet31 | None | LSTM encoder | LSTM decoder |
-| rec_resnet_stn_bilstm_att.yml | SEED | Aster_Resnet | STN | BiLSTM | att |
-
-*其中SEED模型需要额外加载FastText训练好的[语言模型](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz) ,并且安装 fasttext 依赖:
-```
-python3.7 -m pip install fasttext==0.9.1
-```
-训练中文数据,推荐使用[rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml),如您希望尝试其他算法在中文数据集上的效果,请参考下列说明修改配置文件:
+训练中文数据,推荐使用[ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml),如您希望尝试其他算法在中文数据集上的效果,请参考下列说明修改配置文件:
-以 `rec_chinese_lite_train_v2.0.yml` 为例:
+以 `ch_PP-OCRv3_rec_distillation.yml` 为例:
```
Global:
...
@@ -290,7 +260,7 @@ Train:
...
- RecResizeImg:
# 修改 image_shape 以适应长文本
- image_shape: [3, 32, 320]
+ image_shape: [3, 48, 320]
...
loader:
...
@@ -310,7 +280,7 @@ Eval:
...
- RecResizeImg:
# 修改 image_shape 以适应长文本
- image_shape: [3, 32, 320]
+ image_shape: [3, 48, 320]
...
loader:
# 单卡验证的batch_size
@@ -319,19 +289,16 @@ Eval:
```
**注意,预测/评估时的配置文件请务必与训练一致。**
-
-
-## 2.2 断点训练
+## 2.2. 断点训练
如果训练程序中断,如果希望加载训练中断的模型从而恢复训练,可以通过指定Global.checkpoints指定要加载的模型路径:
```shell
-python3 tools/train.py -c configs/rec/rec_icdar15_train.yml -o Global.checkpoints=./your/trained/model
+python3 tools/train.py -c configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml -o Global.checkpoints=./your/trained/model
```
**注意**:`Global.checkpoints`的优先级高于`Global.pretrained_model`的优先级,即同时指定两个参数时,优先加载`Global.checkpoints`指定的模型,如果`Global.checkpoints`指定的模型路径有误,会加载`Global.pretrained_model`指定的模型。
-
-## 2.3 更换Backbone 训练
+## 2.3. 更换Backbone 训练
PaddleOCR将网络划分为四部分,分别在[ppocr/modeling](../../ppocr/modeling)下。 进入网络的数据将按照顺序(transforms->backbones->necks->heads)依次通过这四个部分。
@@ -377,38 +344,32 @@ args1: args1
**注意**:如果要更换网络的其他模块,可以参考[文档](./add_new_algorithm.md)。
-
-## 2.4 混合精度训练
+## 2.4. 混合精度训练
如果您想进一步加快训练速度,可以使用[自动混合精度训练](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/01_paddle2.0_introduction/basic_concept/amp_cn.html), 以单机单卡为例,命令如下:
```shell
-python3 tools/train.py -c configs/rec/rec_icdar15_train.yml \
- -o Global.pretrained_model=./pretrain_models/rec_mv3_none_bilstm_ctc_v2.0_train \
+python3 tools/train.py -c configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml \
+ -o Global.pretrained_model=./pretrain_models/en_PP-OCRv3_rec_train/best_accuracy \
Global.use_amp=True Global.scale_loss=1024.0 Global.use_dynamic_loss_scaling=True
```
-
-## 2.5 分布式训练
+## 2.5. 分布式训练
多机多卡训练时,通过 `--ips` 参数设置使用的机器IP地址,通过 `--gpus` 参数设置使用的GPU ID:
```bash
-python3 -m paddle.distributed.launch --ips="xx.xx.xx.xx,xx.xx.xx.xx" --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_icdar15_train.yml \
- -o Global.pretrained_model=./pretrain_models/rec_mv3_none_bilstm_ctc_v2.0_train
+python3 -m paddle.distributed.launch --ips="xx.xx.xx.xx,xx.xx.xx.xx" --gpus '0,1,2,3' tools/train.py -c configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml \
+ -o Global.pretrained_model=./pretrain_models/en_PP-OCRv3_rec_train/best_accuracy
```
**注意:** 采用多机多卡训练时,需要替换上面命令中的ips值为您机器的地址,机器之间需要能够相互ping通。另外,训练时需要在多个机器上分别启动命令。查看机器ip地址的命令为`ifconfig`。
-
-
-## 2.6 知识蒸馏训练
+## 2.6. 知识蒸馏训练
PaddleOCR支持了基于知识蒸馏的文本识别模型训练过程,更多内容可以参考[知识蒸馏说明文档](./knowledge_distillation.md)。
-
-
-## 2.7 多语言模型训练
+## 2.7. 多语言模型训练
PaddleOCR目前已支持80种(除中文外)语种识别,`configs/rec/multi_languages` 路径下提供了一个多语言的配置文件模版: [rec_multi_language_lite_train.yml](../../configs/rec/multi_language/rec_multi_language_lite_train.yml)。
@@ -464,8 +425,7 @@ Eval:
...
```
-
-## 2.8 其他训练环境
+## 2.8. 其他训练环境
- Windows GPU/CPU
在Windows平台上与Linux平台略有不同:
@@ -478,28 +438,25 @@ Windows平台只支持`单卡`的训练与预测,指定GPU进行训练`set CUD
- Linux DCU
DCU设备上运行需要设置环境变量 `export HIP_VISIBLE_DEVICES=0,1,2,3`,其余训练评估预测命令与Linux GPU完全相同。
-
-
# 3. 模型评估与预测
-
-## 3.1 指标评估
+## 3.1. 指标评估
+
+训练中模型参数默认保存在`Global.save_model_dir`目录下。在评估指标时,需要设置`Global.checkpoints`指向保存的参数文件。评估数据集可以通过 `configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml` 修改Eval中的 `label_file_path` 设置。
-训练中模型参数默认保存在`Global.save_model_dir`目录下。在评估指标时,需要设置`Global.checkpoints`指向保存的参数文件。评估数据集可以通过 `configs/rec/rec_icdar15_train.yml` 修改Eval中的 `label_file_path` 设置。
```
# GPU 评估, Global.checkpoints 为待测权重
-python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_icdar15_train.yml -o Global.checkpoints={path/to/weights}/best_accuracy
+python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml -o Global.checkpoints={path/to/weights}/best_accuracy
```
-
-## 3.2 测试识别效果
+## 3.2. 测试识别效果
使用 PaddleOCR 训练好的模型,可以通过以下脚本进行快速预测。
默认预测图片存储在 `infer_img` 里,通过 `-o Global.checkpoints` 加载训练好的参数文件:
-根据配置文件中设置的的 `save_model_dir` 和 `save_epoch_step` 字段,会有以下几种参数被保存下来:
+根据配置文件中设置的 `save_model_dir` 和 `save_epoch_step` 字段,会有以下几种参数被保存下来:
```
output/rec/
@@ -519,7 +476,7 @@ output/rec/
```
# 预测英文结果
-python3 tools/infer_rec.py -c configs/rec/rec_icdar15_train.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.load_static_weights=false Global.infer_img=doc/imgs_words/en/word_1.png
+python3 tools/infer_rec.py -c configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.infer_img=doc/imgs_words/en/word_1.png
```
预测图片:
@@ -538,7 +495,7 @@ infer_img: doc/imgs_words/en/word_1.png
```
# 预测中文结果
-python3 tools/infer_rec.py -c configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.load_static_weights=false Global.infer_img=doc/imgs_words/ch/word_1.jpg
+python3 tools/infer_rec.py -c configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.infer_img=doc/imgs_words/ch/word_1.jpg
```
预测图片:
@@ -552,8 +509,6 @@ infer_img: doc/imgs_words/ch/word_1.jpg
result: ('韩国小馆', 0.997218)
```
-
-
# 4. 模型导出与预测
inference 模型(`paddle.jit.save`保存的模型)
@@ -569,15 +524,15 @@ inference 模型(`paddle.jit.save`保存的模型)
# Global.pretrained_model 参数设置待转换的训练模型地址,不用添加文件后缀 .pdmodel,.pdopt或.pdparams。
# Global.save_inference_dir参数设置转换的模型将保存的地址。
-python3 tools/export_model.py -c configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml -o Global.pretrained_model=./ch_lite/ch_ppocr_mobile_v2.0_rec_train/best_accuracy Global.save_inference_dir=./inference/rec_crnn/
+python3 tools/export_model.py -c configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml -o Global.pretrained_model=./pretrain_models/en_PP-OCRv3_rec_train/best_accuracy Global.save_inference_dir=./inference/en_PP-OCRv3_rec/
```
-**注意:**如果您是在自己的数据集上训练的模型,并且调整了中文字符的字典文件,请注意修改配置文件中的`character_dict_path`是否是所需要的字典文件。
+**注意:**如果您是在自己的数据集上训练的模型,并且调整了中文字符的字典文件,请注意修改配置文件中的`character_dict_path`为自定义字典文件。
转换成功后,在目录下有三个文件:
```
-/inference/rec_crnn/
+inference/en_PP-OCRv3_rec/
├── inference.pdiparams # 识别inference模型的参数文件
├── inference.pdiparams.info # 识别inference模型的参数信息,可忽略
└── inference.pdmodel # 识别inference模型的program文件
@@ -588,11 +543,9 @@ python3 tools/export_model.py -c configs/rec/ch_ppocr_v2.0/rec_chinese_lite_trai
如果训练时修改了文本的字典,在使用inference模型预测时,需要通过`--rec_char_dict_path`指定使用的字典路径
```
- python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words_en/word_336.png" --rec_model_dir="./your inference model" --rec_image_shape="3, 32, 100" --rec_char_dict_path="your text dict path"
+ python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words_en/word_336.png" --rec_model_dir="./your inference model" --rec_image_shape="3, 48, 320" --rec_char_dict_path="your text dict path"
```
-
-
# 5. FAQ
Q1: 训练模型转inference 模型之后预测效果不一致?
diff --git a/doc/doc_ch/training.md b/doc/doc_ch/training.md
index d46b9af701ee1c605526c18d6994842b3faf8e14..f5bfcf0cfde95e5c3704ccf72a94948f3d093761 100644
--- a/doc/doc_ch/training.md
+++ b/doc/doc_ch/training.md
@@ -4,16 +4,16 @@
同时会简单介绍PaddleOCR模型训练数据的组成部分,以及如何在垂类场景中准备数据finetune模型。
-- [1.配置文件说明](#配置文件)
-- [2. 基本概念](#基本概念)
- * [2.1 学习率](#学习率)
- * [2.2 正则化](#正则化)
- * [2.3 评估指标](#评估指标)
-- [3. 数据与垂类场景](#数据与垂类场景)
- * [3.1 训练数据](#训练数据)
- * [3.2 垂类场景](#垂类场景)
- * [3.3 自己构建数据集](#自己构建数据集)
-* [4. 常见问题](#常见问题)
+- [1. 配置文件说明](#1-配置文件说明)
+- [2. 基本概念](#2-基本概念)
+ - [2.1 学习率](#21-学习率)
+ - [2.2 正则化](#22-正则化)
+ - [2.3 评估指标](#23-评估指标)
+- [3. 数据与垂类场景](#3-数据与垂类场景)
+ - [3.1 训练数据](#31-训练数据)
+ - [3.2 垂类场景](#32-垂类场景)
+ - [3.3 自己构建数据集](#33-自己构建数据集)
+- [4. 常见问题](#4-常见问题)
@@ -68,7 +68,7 @@ Optimizer:
(2)识别阶段: 字符识别准确率,即正确识别的文本行占标注的文本行数量的比例,只有整行文本识别对才算正确识别。
-(3)端到端统计: 端对端召回率:准确检测并正确识别文本行在全部标注文本行的占比; 端到端准确率:准确检测并正确识别文本行在 检测到的文本行数量 的占比; 准确检测的标准是检测框与标注框的IOU大于某个阈值,正确识别的的检测框中的文本与标注的文本相同。
+(3)端到端统计: 端对端召回率:准确检测并正确识别文本行在全部标注文本行的占比; 端到端准确率:准确检测并正确识别文本行在 检测到的文本行数量 的占比; 准确检测的标准是检测框与标注框的IOU大于某个阈值,正确识别的检测框中的文本与标注的文本相同。
diff --git a/doc/doc_ch/whl.md b/doc/doc_ch/whl.md
index b2eb4ba17cf70edeaea36b5e54fe976605de850f..d57f2ac3255a78b630e2ea4189ab182d5c7f71ba 100644
--- a/doc/doc_ch/whl.md
+++ b/doc/doc_ch/whl.md
@@ -199,46 +199,44 @@ for line in result:
paddleocr -h
```
+**注意** whl包默认使用`PP-OCRv3`模型,识别模型使用的输入shape为`3,48,320`, 因此如果使用识别功能,需要添加参数`--rec_image_shape 3,48,320`,如果不使用默认的`PP-OCRv3`模型,则无需设置该参数。
+
* 检测+方向分类器+识别全流程
```bash
-paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --use_angle_cls true
+paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --use_angle_cls true --rec_image_shape 3,48,320
```
结果是一个list,每个item包含了文本框,文字和识别置信度
```bash
-[[[24.0, 36.0], [304.0, 34.0], [304.0, 72.0], [24.0, 74.0]], ['纯臻营养护发素', 0.964739]]
-[[[24.0, 80.0], [172.0, 80.0], [172.0, 104.0], [24.0, 104.0]], ['产品信息/参数', 0.98069626]]
-[[[24.0, 109.0], [333.0, 109.0], [333.0, 136.0], [24.0, 136.0]], ['(45元/每公斤,100公斤起订)', 0.9676722]]µ
+[[[28.0, 37.0], [302.0, 39.0], [302.0, 72.0], [27.0, 70.0]], ('纯臻营养护发素', 0.9658738374710083)]
......
```
* 检测+识别
```bash
-paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg
+paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --rec_image_shape 3,48,320
```
结果是一个list,每个item包含了文本框,文字和识别置信度
```bash
-[[[24.0, 36.0], [304.0, 34.0], [304.0, 72.0], [24.0, 74.0]], ['纯臻营养护发素', 0.964739]]
-[[[24.0, 80.0], [172.0, 80.0], [172.0, 104.0], [24.0, 104.0]], ['产品信息/参数', 0.98069626]]
-[[[24.0, 109.0], [333.0, 109.0], [333.0, 136.0], [24.0, 136.0]], ['(45元/每公斤,100公斤起订)', 0.9676722]]
+[[[28.0, 37.0], [302.0, 39.0], [302.0, 72.0], [27.0, 70.0]], ('纯臻营养护发素', 0.9658738374710083)]
......
```
* 方向分类器+识别
```bash
-paddleocr --image_dir PaddleOCR/doc/imgs_words/ch/word_1.jpg --use_angle_cls true --det false
+paddleocr --image_dir PaddleOCR/doc/imgs_words/ch/word_1.jpg --use_angle_cls true --det false --rec_image_shape 3,48,320
```
结果是一个list,每个item只包含识别结果和识别置信度
```bash
-['韩国小馆', 0.9907421]
+['韩国小馆', 0.994467]
```
* 单独执行检测
@@ -250,22 +248,21 @@ paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --rec false
结果是一个list,每个item只包含文本框
```bash
-[[26.0, 457.0], [137.0, 457.0], [137.0, 477.0], [26.0, 477.0]]
-[[25.0, 425.0], [372.0, 425.0], [372.0, 448.0], [25.0, 448.0]]
-[[128.0, 397.0], [273.0, 397.0], [273.0, 414.0], [128.0, 414.0]]
+[[27.0, 459.0], [136.0, 459.0], [136.0, 479.0], [27.0, 479.0]]
+[[28.0, 429.0], [372.0, 429.0], [372.0, 445.0], [28.0, 445.0]]
......
```
* 单独执行识别
```bash
-paddleocr --image_dir PaddleOCR/doc/imgs_words/ch/word_1.jpg --det false
+paddleocr --image_dir PaddleOCR/doc/imgs_words/ch/word_1.jpg --det false --rec_image_shape 3,48,320
```
结果是一个list,每个item只包含识别结果和识别置信度
```bash
-['韩国小馆', 0.9907421]
+['韩国小馆', 0.994467]
```
* 单独执行方向分类器
@@ -419,5 +416,4 @@ im_show.save('result.jpg')
| cls | 前向时是否启动分类 (命令行模式下使用use_angle_cls控制前向是否启动分类) | FALSE |
| show_log | 是否打印logger信息 | FALSE |
| type | 执行ocr或者表格结构化, 值可选['ocr','structure'] | ocr |
-| ocr_version | OCR模型版本,可选PP-OCRv2, PP-OCR。PP-OCRv2 目前仅支持中文的检测和识别模型,PP-OCR支持中文的检测,识别,多语种识别,方向分类器等模型 | PP-OCRv2 |
-| structure_version | 表格结构化模型版本,可选 STRUCTURE。STRUCTURE支持表格结构化模型 | STRUCTURE |
+| ocr_version | OCR模型版本,可选PP-OCRv3, PP-OCRv2, PP-OCR。PP-OCRv3 目前仅支持中、英文的检测和识别模型,方向分类器模型;PP-OCRv2 目前仅支持中文的检测和识别模型;PP-OCR支持中文的检测,识别,多语种识别,方向分类器等模型 | PP-OCRv3 |
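+
+以上参数通常也可以在 Python API 中使用。下面给出一个最小示意(假设 `ocr_version`、`use_angle_cls`、`show_log` 均可作为 `PaddleOCR` 的初始化参数传入,图片路径请替换为本地图片):
+
+```python
+from paddleocr import PaddleOCR
+
+# 示意:通过 ocr_version 指定模型版本,默认即为 PP-OCRv3
+ocr = PaddleOCR(ocr_version='PP-OCRv3', use_angle_cls=True, show_log=False)
+result = ocr.ocr('PaddleOCR/doc/imgs/11.jpg', cls=True)
+for line in result:
+    print(line)
+```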
diff --git a/doc/doc_en/PP-OCRv3_introduction_en.md b/doc/doc_en/PP-OCRv3_introduction_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..791c95a6b560aef54ce1c70a6ced7ee1cd0f0368
--- /dev/null
+++ b/doc/doc_en/PP-OCRv3_introduction_en.md
@@ -0,0 +1 @@
+English | [简体中文](../doc_ch/PP-OCRv3_introduction.md)
diff --git a/doc/doc_en/add_new_algorithm_en.md b/doc/doc_en/add_new_algorithm_en.md
index db72fe7d4b4b5b5d12e06ff34140c9da1186319b..a8903b0a20ca38f29ccaf4fc7f368fa2dae317a0 100644
--- a/doc/doc_en/add_new_algorithm_en.md
+++ b/doc/doc_en/add_new_algorithm_en.md
@@ -237,7 +237,7 @@ class MyMetric(object):
def get_metric(self):
"""
- return metircs {
+ return metrics {
'acc': 0,
'norm_edit_dis': 0,
}
diff --git a/doc/doc_en/algorithm_det_east_en.md b/doc/doc_en/algorithm_det_east_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..3955809a49a595aa59717bafcfbb23146ae96bd2
--- /dev/null
+++ b/doc/doc_en/algorithm_det_east_en.md
@@ -0,0 +1,99 @@
+# EAST
+
+- [1. Introduction](#1)
+- [2. Environment](#2)
+- [3. Model Training / Evaluation / Prediction](#3)
+ - [3.1 Training](#3-1)
+ - [3.2 Evaluation](#3-2)
+ - [3.3 Prediction](#3-3)
+- [4. Inference and Deployment](#4)
+ - [4.1 Python Inference](#4-1)
+ - [4.2 C++ Inference](#4-2)
+ - [4.3 Serving](#4-3)
+ - [4.4 More](#4-4)
+- [5. FAQ](#5)
+
+
+## 1. Introduction
+
+Paper:
+> [EAST: An Efficient and Accurate Scene Text Detector](https://arxiv.org/abs/1704.03155)
+> Xinyu Zhou, Cong Yao, He Wen, Yuzhi Wang, Shuchang Zhou, Weiran He, Jiajun Liang
+> CVPR, 2017
+
+
+On the ICDAR2015 dataset, the text detection result is as follows:
+
+|Model|Backbone|Configuration|Precision|Recall|Hmean|Download|
+| --- | --- | --- | --- | --- | --- | --- |
+|EAST|ResNet50_vd|[det_r50_vd_east.yml](../../configs/det/det_r50_vd_east.yml)|88.71%| 81.36%| 84.88%| [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_east_v2.0_train.tar)|
+|EAST|MobileNetV3|[det_mv3_east.yml](../../configs/det/det_mv3_east.yml)| 78.2%| 79.1%| 78.65%| [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_east_v2.0_train.tar)|
+
+
+
+## 2. Environment
+Please prepare your environment referring to [prepare the environment](./environment_en.md) and [clone the repo](./clone_en.md).
+
+
+
+## 3. Model Training / Evaluation / Prediction
+
+The above EAST model is trained using the ICDAR2015 text detection public dataset. For the download of the dataset, please refer to [ocr_datasets](./dataset/ocr_datasets_en.md).
+
+After the data download is complete, please refer to the [Text Detection Training Tutorial](./detection_en.md) for training. PaddleOCR has modularized the code structure, so that you only need to **replace the configuration file** to train different detection models.
+
+
+
+## 4. Inference and Deployment
+
+
+
+### 4.1 Python Inference
+
+First, convert the model saved in the EAST text detection training process into an inference model. Taking the model based on the Resnet50_vd backbone network and trained on the ICDAR2015 English dataset as an example ([model download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_east_v2.0_train.tar)), you can use the following command to convert:
+
+```shell
+python3 tools/export_model.py -c configs/det/det_r50_vd_east.yml -o Global.pretrained_model=./det_r50_vd_east_v2.0_train/best_accuracy Global.save_inference_dir=./inference/det_r50_east/
+```
+
+For EAST text detection model inference, you need to set the parameter `--det_algorithm="EAST"` and run the following command:
+```shell
+python3 tools/infer/predict_det.py --image_dir="./doc/imgs_en/img_10.jpg" --det_model_dir="./inference/det_r50_east/" --det_algorithm="EAST"
+```
+
+
+The visualized text detection results are saved to the `./inference_results` folder by default, and the name of the result file is prefixed with 'det_res'.
+
+
+
+
+
+### 4.2 C++ Inference
+
+Since the post-processing is not implemented in C++, the EAST text detection model does not support C++ inference for now.
+
+
+### 4.3 Serving
+
+Not supported
+
+
+### 4.4 More
+
+Not supported
+
+
+## 5. FAQ
+
+
+## Citation
+
+```bibtex
+@inproceedings{zhou2017east,
+ title={East: an efficient and accurate scene text detector},
+ author={Zhou, Xinyu and Yao, Cong and Wen, He and Wang, Yuzhi and Zhou, Shuchang and He, Weiran and Liang, Jiajun},
+ booktitle={Proceedings of the IEEE conference on Computer Vision and Pattern Recognition},
+ pages={5551--5560},
+ year={2017}
+}
+```
diff --git a/doc/doc_en/algorithm_det_sast_en.md b/doc/doc_en/algorithm_det_sast_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..e3437d22be9d75835aaa43e72363b498225db9e1
--- /dev/null
+++ b/doc/doc_en/algorithm_det_sast_en.md
@@ -0,0 +1,118 @@
+# SAST
+
+- [1. Introduction](#1)
+- [2. Environment](#2)
+- [3. Model Training / Evaluation / Prediction](#3)
+ - [3.1 Training](#3-1)
+ - [3.2 Evaluation](#3-2)
+ - [3.3 Prediction](#3-3)
+- [4. Inference and Deployment](#4)
+ - [4.1 Python Inference](#4-1)
+ - [4.2 C++ Inference](#4-2)
+ - [4.3 Serving](#4-3)
+ - [4.4 More](#4-4)
+- [5. FAQ](#5)
+
+
+## 1. Introduction
+
+Paper:
+> [A Single-Shot Arbitrarily-Shaped Text Detector based on Context Attended Multi-Task Learning](https://arxiv.org/abs/1908.05498)
+> Wang, Pengfei and Zhang, Chengquan and Qi, Fei and Huang, Zuming and En, Mengyi and Han, Junyu and Liu, Jingtuo and Ding, Errui and Shi, Guangming
+> ACM MM, 2019
+
+On the ICDAR2015 dataset, the text detection result is as follows:
+
+|Model|Backbone|Configuration|Precision|Recall|Hmean|Download|
+| --- | --- | --- | --- | --- | --- | --- |
+|SAST|ResNet50_vd|[configs/det/det_r50_vd_sast_icdar15.yml](../../configs/det/det_r50_vd_sast_icdar15.yml)|91.39%|83.77%|87.42%|[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_sast_icdar15_v2.0_train.tar)|
+
+
+On the Total-text dataset, the text detection result is as follows:
+
+|Model|Backbone|Configuration|Precision|Recall|Hmean|Download|
+| --- | --- | --- | --- | --- | --- | --- |
+|SAST|ResNet50_vd|[configs/det/det_r50_vd_sast_totaltext.yml](../../configs/det/det_r50_vd_sast_totaltext.yml)|89.63%|78.44%|83.66%|[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_sast_totaltext_v2.0_train.tar)|
+
+
+
+## 2. Environment
+Please prepare your environment referring to [prepare the environment](./environment_en.md) and [clone the repo](./clone_en.md).
+
+
+
+## 3. Model Training / Evaluation / Prediction
+
+Please refer to [text detection training tutorial](./detection_en.md). PaddleOCR has modularized the code structure, so that you only need to **replace the configuration file** to train different detection models.
+
+
+## 4. Inference and Deployment
+
+
+### 4.1 Python Inference
+#### (1). Quadrangle text detection model (ICDAR2015)
+First, convert the model saved in the SAST text detection training process into an inference model. Taking the model based on the Resnet50_vd backbone network and trained on the ICDAR2015 English dataset as an example ([model download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_sast_icdar15_v2.0_train.tar)), you can use the following command to convert:
+
+```
+python3 tools/export_model.py -c configs/det/det_r50_vd_sast_icdar15.yml -o Global.pretrained_model=./det_r50_vd_sast_icdar15_v2.0_train/best_accuracy Global.save_inference_dir=./inference/det_sast_ic15
+```
+
+**For SAST quadrangle text detection model inference, you need to set the parameter `--det_algorithm="SAST"`** and run the following command:
+
+```
+python3 tools/infer/predict_det.py --det_algorithm="SAST" --image_dir="./doc/imgs_en/img_10.jpg" --det_model_dir="./inference/det_sast_ic15/"
+```
+
+The visualized text detection results are saved to the `./inference_results` folder by default, and the name of the result file is prefixed with 'det_res'. Examples of results are as follows:
+
+
+
+#### (2). Curved text detection model (Total-Text)
+First, convert the model saved in the SAST text detection training process into an inference model. Taking the model based on the Resnet50_vd backbone network and trained on the Total-Text English dataset as an example ([model download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_sast_totaltext_v2.0_train.tar)), you can use the following command to convert:
+
+```
+python3 tools/export_model.py -c configs/det/det_r50_vd_sast_totaltext.yml -o Global.pretrained_model=./det_r50_vd_sast_totaltext_v2.0_train/best_accuracy Global.save_inference_dir=./inference/det_sast_tt
+```
+
+For SAST curved text detection model inference, you need to set the parameters `--det_algorithm="SAST"` and `--det_sast_polygon=True`, and run the following command:
+
+```
+python3 tools/infer/predict_det.py --det_algorithm="SAST" --image_dir="./doc/imgs_en/img623.jpg" --det_model_dir="./inference/det_sast_tt/" --det_sast_polygon=True
+```
+
+The visualized text detection results are saved to the `./inference_results` folder by default, and the name of the result file is prefixed with 'det_res'. Examples of results are as follows:
+
+
+
+**Note**: SAST post-processing locality aware NMS has two versions: Python and C++. The speed of C++ version is obviously faster than that of Python version. Due to the compilation version problem of NMS of C++ version, C++ version NMS will be called only in Python 3.5 environment, and python version NMS will be called in other cases.
+
+
+### 4.2 C++ Inference
+
+Not supported
+
+
+### 4.3 Serving
+
+Not supported
+
+
+### 4.4 More
+
+Not supported
+
+
+## 5. FAQ
+
+
+## Citation
+
+```bibtex
+@inproceedings{wang2019single,
+ title={A Single-Shot Arbitrarily-Shaped Text Detector based on Context Attended Multi-Task Learning},
+ author={Wang, Pengfei and Zhang, Chengquan and Qi, Fei and Huang, Zuming and En, Mengyi and Han, Junyu and Liu, Jingtuo and Ding, Errui and Shi, Guangming},
+ booktitle={Proceedings of the 27th ACM International Conference on Multimedia},
+ pages={1277--1285},
+ year={2019}
+}
+```
diff --git a/doc/doc_en/algorithm_rec_aster_en.md b/doc/doc_en/algorithm_rec_aster_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..1540681a19f94160e221c37173510395d0fd407f
--- /dev/null
+++ b/doc/doc_en/algorithm_rec_aster_en.md
@@ -0,0 +1,122 @@
+# STAR-Net
+
+- [1. Introduction](#1)
+- [2. Environment](#2)
+- [3. Model Training / Evaluation / Prediction](#3)
+ - [3.1 Training](#3-1)
+ - [3.2 Evaluation](#3-2)
+ - [3.3 Prediction](#3-3)
+- [4. Inference and Deployment](#4)
+ - [4.1 Python Inference](#4-1)
+ - [4.2 C++ Inference](#4-2)
+ - [4.3 Serving](#4-3)
+ - [4.4 More](#4-4)
+- [5. FAQ](#5)
+
+
+## 1. Introduction
+
+Paper:
+> [STAR-Net: a spatial attention residue network for scene text recognition.](http://www.bmva.org/bmvc/2016/papers/paper043/paper043.pdf)
+
+> Wei Liu, Chaofeng Chen, Kwan-Yee K. Wong, Zhizhong Su and Junyu Han.
+
+> BMVC, pages 43.1-43.13, 2016
+
+Using the MJSynth and SynthText text recognition datasets for training, and evaluating on the IIIT, SVT, IC03, IC13, IC15, SVTP and CUTE datasets, the algorithm reproduction effect is as follows:
+
+|Model|Backbone|ACC|config|Download link|
+| --- | --- | --- | --- | --- |
+|StarNet|Resnet34_vd|84.44%|[configs/rec/rec_r34_vd_tps_bilstm_ctc.yml](../../configs/rec/rec_r34_vd_tps_bilstm_ctc.yml)|[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r34_vd_tps_bilstm_ctc_v2.0_train.tar)|
+|StarNet|MobileNetV3|81.42%|[configs/rec/rec_mv3_tps_bilstm_ctc.yml](../../configs/rec/rec_mv3_tps_bilstm_ctc.yml)|[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_mv3_tps_bilstm_ctc_v2.0_train.tar)|
+
+
+## 2. Environment
+Please refer to ["Environment Preparation"](./environment.md) to configure the PaddleOCR environment, and refer to ["Project Clone"](./clone.md) to clone the project code.
+
+
+
+## 3. Model Training / Evaluation / Prediction
+
+Please refer to [Text Recognition Tutorial](./recognition_en.md). PaddleOCR modularizes the code, and training different recognition models only requires **changing the configuration file**.
+
+Training:
+
+Specifically, after the data preparation is completed, the training can be started. The training command is as follows:
+
+```
+#Single GPU training (long training period, not recommended)
+python3 tools/train.py -c configs/rec/rec_r34_vd_tps_bilstm_ctc.yml
+
+#Multi GPU training, specify the gpu number through the --gpus parameter
+python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_r34_vd_tps_bilstm_ctc.yml
+```
+
+Evaluation:
+
+```
+# GPU evaluation
+python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_r34_vd_tps_bilstm_ctc.yml -o Global.pretrained_model={path/to/weights}/best_accuracy
+```
+
+Prediction:
+
+```
+# The configuration file used for prediction must match the training
+python3 tools/infer_rec.py -c configs/rec/rec_r34_vd_tps_bilstm_ctc.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.infer_img=doc/imgs_words/en/word_1.png
+```
+
+
+## 4. Inference and Deployment
+
+
+### 4.1 Python Inference
+First, convert the model saved during STAR-Net text recognition training into an inference model. Taking the model based on the Resnet34_vd backbone network and trained on the MJSynth and SynthText datasets as an example ([model download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r34_vd_tps_bilstm_ctc_v2.0_train.tar)), you can use the following command to convert:
+
+```
+python3 tools/export_model.py -c configs/rec/rec_r34_vd_tps_bilstm_ctc.yml -o Global.pretrained_model=./rec_r34_vd_tps_bilstm_ctc_v2.0_train/best_accuracy Global.save_inference_dir=./inference/rec_starnet
+```
+
+For STAR-Net text recognition model inference, the following commands can be executed:
+
+```
+python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words_en/word_336.png" --rec_model_dir="./inference/rec_starnet/" --rec_image_shape="3, 32, 100" --rec_char_dict_path="./ppocr/utils/ic15_dict.txt"
+```
+
+
+### 4.2 C++ Inference
+
+With the inference model prepared, refer to the [cpp infer](../../deploy/cpp_infer/) tutorial for C++ inference.
+
+
+
+### 4.3 Serving
+
+With the inference model prepared, refer to the [pdserving](../../deploy/pdserving/) tutorial for service deployment by Paddle Serving.
+
+
+
+### 4.4 More
+
+More deployment schemes supported for STAR-Net:
+
+- Paddle2ONNX: with the inference model prepared, please refer to the [paddle2onnx](../../deploy/paddle2onnx/) tutorial.
+
+
+
+## 5. FAQ
+
+
+## Citation
+
+```bibtex
+@inproceedings{liu2016star,
+ title={STAR-Net: a spatial attention residue network for scene text recognition.},
+ author={Liu, Wei and Chen, Chaofeng and Wong, Kwan-Yee K and Su, Zhizhong and Han, Junyu},
+ booktitle={BMVC},
+ volume={2},
+ pages={7},
+ year={2016}
+}
+```
diff --git a/doc/doc_en/algorithm_rec_crnn_en.md b/doc/doc_en/algorithm_rec_crnn_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..571569ee445d756ca7bdfeea6d5f960187a5a666
--- /dev/null
+++ b/doc/doc_en/algorithm_rec_crnn_en.md
@@ -0,0 +1,123 @@
+# CRNN
+
+- [1. Introduction](#1)
+- [2. Environment](#2)
+- [3. Model Training / Evaluation / Prediction](#3)
+ - [3.1 Training](#3-1)
+ - [3.2 Evaluation](#3-2)
+ - [3.3 Prediction](#3-3)
+- [4. Inference and Deployment](#4)
+ - [4.1 Python Inference](#4-1)
+ - [4.2 C++ Inference](#4-2)
+ - [4.3 Serving](#4-3)
+ - [4.4 More](#4-4)
+- [5. FAQ](#5)
+
+
+## 1. Introduction
+
+Paper:
+> [An End-to-End Trainable Neural Network for Image-based Sequence Recognition and Its Application to Scene Text Recognition](https://arxiv.org/abs/1507.05717)
+
+> Baoguang Shi, Xiang Bai, Cong Yao
+
+> IEEE, 2015
+
+Using the MJSynth and SynthText text recognition datasets for training, and evaluating on the IIIT, SVT, IC03, IC13, IC15, SVTP and CUTE datasets, the algorithm reproduction effect is as follows:
+
+|Model|Backbone|ACC|config|Download link|
+| --- | --- | --- | --- | --- |
+|CRNN|Resnet34_vd|81.04%|[configs/rec/rec_r34_vd_none_bilstm_ctc.yml](../../configs/rec/rec_r34_vd_none_bilstm_ctc.yml)|[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r34_vd_none_bilstm_ctc_v2.0_train.tar)|
+|CRNN|MobileNetV3|77.95%|[configs/rec/rec_mv3_none_bilstm_ctc.yml](../../configs/rec/rec_mv3_none_bilstm_ctc.yml)|[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_mv3_none_bilstm_ctc_v2.0_train.tar)|
+
+
+## 2. Environment
+Please refer to ["Environment Preparation"](./environment.md) to configure the PaddleOCR environment, and refer to ["Project Clone"](./clone.md) to clone the project code.
+
+
+
+## 3. Model Training / Evaluation / Prediction
+
+Please refer to [Text Recognition Tutorial](./recognition_en.md). PaddleOCR modularizes the code, and training different recognition models only requires **changing the configuration file**.
+
+Training:
+
+Specifically, after the data preparation is completed, the training can be started. The training command is as follows:
+
+```
+#Single GPU training (long training period, not recommended)
+python3 tools/train.py -c configs/rec/rec_r34_vd_none_bilstm_ctc.yml
+
+#Multi GPU training, specify the gpu number through the --gpus parameter
+python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_r34_vd_none_bilstm_ctc.yml
+```
+
+Evaluation:
+
+```
+# GPU evaluation
+python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_r34_vd_none_bilstm_ctc.yml -o Global.pretrained_model={path/to/weights}/best_accuracy
+```
+
+Prediction:
+
+```
+# The configuration file used for prediction must match the training
+python3 tools/infer_rec.py -c configs/rec/rec_r34_vd_none_bilstm_ctc.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.infer_img=doc/imgs_words/en/word_1.png
+```
+
+
+## 4. Inference and Deployment
+
+
+### 4.1 Python Inference
+First, convert the model saved during CRNN text recognition training into an inference model. Taking the model based on the Resnet34_vd backbone network and trained on the MJSynth and SynthText datasets as an example ([model download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r34_vd_none_bilstm_ctc_v2.0_train.tar)), you can use the following command to convert:
+
+```
+python3 tools/export_model.py -c configs/rec/rec_r34_vd_none_bilstm_ctc.yml -o Global.pretrained_model=./rec_r34_vd_none_bilstm_ctc_v2.0_train/best_accuracy Global.save_inference_dir=./inference/rec_crnn
+```
+
+For CRNN text recognition model inference, the following commands can be executed:
+
+```
+python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words_en/word_336.png" --rec_model_dir="./inference/rec_crnn/" --rec_image_shape="3, 32, 100" --rec_char_dict_path="./ppocr/utils/ic15_dict.txt"
+```
+
+
+### 4.2 C++ Inference
+
+With the inference model prepared, refer to the [cpp infer](../../deploy/cpp_infer/) tutorial for C++ inference.
+
+
+
+### 4.3 Serving
+
+With the inference model prepared, refer to the [pdserving](../../deploy/pdserving/) tutorial for service deployment by Paddle Serving.
+
+
+
+### 4.4 More
+
+More deployment schemes supported for CRNN:
+
+- Paddle2ONNX: with the inference model prepared, please refer to the [paddle2onnx](../../deploy/paddle2onnx/) tutorial.
+
+
+
+## 5. FAQ
+
+
+## Citation
+
+```bibtex
+@ARTICLE{7801919,
+ author={Shi, Baoguang and Bai, Xiang and Yao, Cong},
+ journal={IEEE Transactions on Pattern Analysis and Machine Intelligence},
+ title={An End-to-End Trainable Neural Network for Image-Based Sequence Recognition and Its Application to Scene Text Recognition},
+ year={2017},
+ volume={39},
+ number={11},
+ pages={2298-2304},
+ doi={10.1109/TPAMI.2016.2646371}}
+```
diff --git a/doc/doc_en/algorithm_rec_nrtr_en.md b/doc/doc_en/algorithm_rec_nrtr_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..3f8fd0adee900cf889d70e8b78fb1122d54c7d08
--- /dev/null
+++ b/doc/doc_en/algorithm_rec_nrtr_en.md
@@ -0,0 +1,135 @@
+# NRTR
+
+- [1. Introduction](#1)
+- [2. Environment](#2)
+- [3. Model Training / Evaluation / Prediction](#3)
+ - [3.1 Training](#3-1)
+ - [3.2 Evaluation](#3-2)
+ - [3.3 Prediction](#3-3)
+- [4. Inference and Deployment](#4)
+ - [4.1 Python Inference](#4-1)
+ - [4.2 C++ Inference](#4-2)
+ - [4.3 Serving](#4-3)
+ - [4.4 More](#4-4)
+- [5. FAQ](#5)
+
+
+## 1. Introduction
+
+Paper:
+> [NRTR: A No-Recurrence Sequence-to-Sequence Model For Scene Text Recognition](https://arxiv.org/abs/1806.00926)
+> Fenfen Sheng and Zhineng Chen and Bo Xu
+> ICDAR, 2019
+
+Using the MJSynth and SynthText text recognition datasets for training, and evaluating on the IIIT, SVT, IC03, IC13, IC15, SVTP and CUTE datasets, the algorithm reproduction effect is as follows:
+
+|Model|Backbone|config|Acc|Download link|
+| --- | --- | --- | --- | --- |
+|NRTR|MTB|[rec_mtb_nrtr.yml](../../configs/rec/rec_mtb_nrtr.yml)|84.21%|[train model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_mtb_nrtr_train.tar)|
+
+
+## 2. Environment
+Please refer to ["Environment Preparation"](./environment.md) to configure the PaddleOCR environment, and refer to ["Project Clone"](./clone.md) to clone the project code.
+
+
+
+## 3. Model Training / Evaluation / Prediction
+
+Please refer to [Text Recognition Tutorial](./recognition_en.md). PaddleOCR modularizes the code, and training different recognition models only requires **changing the configuration file**.
+
+Training:
+
+Specifically, after the data preparation is completed, the training can be started. The training command is as follows:
+
+```
+#Single GPU training (long training period, not recommended)
+python3 tools/train.py -c configs/rec/rec_mtb_nrtr.yml
+
+#Multi GPU training, specify the gpu number through the --gpus parameter
+python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_mtb_nrtr.yml
+```
+
+Evaluation:
+
+```
+# GPU evaluation
+python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_mtb_nrtr.yml -o Global.pretrained_model={path/to/weights}/best_accuracy
+```
+
+Prediction:
+
+```
+# The configuration file used for prediction must match the training
+python3 tools/infer_rec.py -c configs/rec/rec_mtb_nrtr.yml -o Global.infer_img='./doc/imgs_words_en/word_10.png' Global.pretrained_model=./rec_mtb_nrtr_train/best_accuracy
+```
+
+
+## 4. Inference and Deployment
+
+
+### 4.1 Python Inference
+First, convert the model saved during NRTR text recognition training into an inference model ([model download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_mtb_nrtr_train.tar)). You can use the following command to convert:
+
+```
+python3 tools/export_model.py -c configs/rec/rec_mtb_nrtr.yml -o Global.pretrained_model=./rec_mtb_nrtr_train/best_accuracy Global.save_inference_dir=./inference/rec_mtb_nrtr
+```
+
+**Note:**
+- If you are training the model on your own dataset and have modified the dictionary file, please change `character_dict_path` in the configuration file to your modified dictionary file (see the sketch below).
+- If you modified the input size during training, please modify the `infer_shape` corresponding to NRTR in the `tools/export_model.py` file.
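+
+As a minimal sketch, assuming a hypothetical dictionary file `./your_dict.txt`, the dictionary path can simply be overridden on the command line together with the other `-o` options when exporting:
+
+```
+# Illustrative only: point character_dict_path at your own dictionary during export
+python3 tools/export_model.py -c configs/rec/rec_mtb_nrtr.yml -o Global.pretrained_model=./rec_mtb_nrtr_train/best_accuracy Global.character_dict_path=./your_dict.txt Global.save_inference_dir=./inference/rec_mtb_nrtr
+```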
+
+After the conversion is successful, there are three files in the directory:
+```
+/inference/rec_mtb_nrtr/
+ ├── inference.pdiparams
+ ├── inference.pdiparams.info
+ └── inference.pdmodel
+```
+
+
+For NRTR text recognition model inference, the following commands can be executed:
+
+```
+python3 tools/infer/predict_rec.py --image_dir='./doc/imgs_words_en/word_10.png' --rec_model_dir='./inference/rec_mtb_nrtr/' --rec_algorithm='NRTR' --rec_image_shape='1,32,100' --rec_char_dict_path='./ppocr/utils/EN_symbol_dict.txt'
+```
+
+
+
+After executing the command, the prediction result (recognized text and score) of the image above is printed to the screen. An example is as follows:
+```shell
+Predicts of ./doc/imgs_words_en/word_10.png:('pain', 0.9265879392623901)
+```
+
+
+### 4.2 C++ Inference
+
+Not supported
+
+
+### 4.3 Serving
+
+Not supported
+
+
+### 4.4 More
+
+Not supported
+
+
+## 5. FAQ
+
+1. The `NRTR` paper uses beam search to decode characters, but it is slow. Beam search is therefore not used by default here; greedy search is used to decode characters instead.
+
+## Citation
+
+```bibtex
+@article{Sheng2019NRTR,
+ title = {NRTR: A No-Recurrence Sequence-to-Sequence Model For Scene Text Recognition},
+    author = {Fenfen Sheng and Zhineng Chen and Bo Xu},
+ booktitle = {ICDAR},
+ year = {2019},
+ url = {http://arxiv.org/abs/1806.00926},
+ pages = {781-786}
+}
+```
diff --git a/doc/doc_en/algorithm_rec_rare_en.md b/doc/doc_en/algorithm_rec_rare_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..3aeb1e3adfdfaaeaf493d8d40c967973f0805cd1
--- /dev/null
+++ b/doc/doc_en/algorithm_rec_rare_en.md
@@ -0,0 +1,119 @@
+# RARE
+
+- [1. Introduction](#1)
+- [2. Environment](#2)
+- [3. Model Training / Evaluation / Prediction](#3)
+ - [3.1 Training](#3-1)
+ - [3.2 Evaluation](#3-2)
+ - [3.3 Prediction](#3-3)
+- [4. Inference and Deployment](#4)
+ - [4.1 Python Inference](#4-1)
+ - [4.2 C++ Inference](#4-2)
+ - [4.3 Serving](#4-3)
+ - [4.4 More](#4-4)
+- [5. FAQ](#5)
+
+
+## 1. Introduction
+
+Paper information:
+> [Robust Scene Text Recognition with Automatic Rectification](https://arxiv.org/abs/1603.03915v2)
+> Baoguang Shi, Xinggang Wang, Pengyuan Lyu, Cong Yao, Xiang Bai
+> CVPR, 2016
+
+Using the MJSynth and SynthText text recognition datasets for training, and evaluating on the IIIT, SVT, IC03, IC13, IC15, SVTP and CUTE datasets, the algorithm reproduction effect is as follows:
+
+|Models|Backbone Networks|Configuration Files|Avg Accuracy|Download Links|
+| --- | --- | --- | --- | --- |
+|RARE|Resnet34_vd|[configs/rec/rec_r34_vd_tps_bilstm_att.yml](../../configs/rec/rec_r34_vd_tps_bilstm_att.yml)|83.6%|[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r34_vd_tps_bilstm_att_v2.0_train.tar)|
+|RARE|MobileNetV3|[configs/rec/rec_mv3_tps_bilstm_att.yml](../../configs/rec/rec_mv3_tps_bilstm_att.yml)|82.5%|[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_mv3_tps_bilstm_att_v2.0_train.tar)|
+
+
+
+## 2. Environment
+Please refer to [Operating Environment Preparation](./environment_en.md) to configure the PaddleOCR operating environment, and refer to [Project Clone](./clone_en.md) to clone the project code.
+
+
+## 3. Model Training / Evaluation / Prediction
+
+Please refer to [Text Recognition Training Tutorial](./recognition_en.md). PaddleOCR modularizes the code, and training different recognition models only requires **changing the configuration file**. Take the backbone network based on Resnet34_vd as an example:
+
+
+### 3.1 Training
+
+```
+#Single card training (long training period, not recommended)
+python3 tools/train.py -c configs/rec/rec_r34_vd_tps_bilstm_att.yml
+#Multi-card training, specify the card number through the --gpus parameter
+python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_r34_vd_tps_bilstm_att.yml
+```
+
+
+### 3.2 Evaluation
+
+```
+# GPU evaluation, Global.pretrained_model is the model to be evaluated
+python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_r34_vd_tps_bilstm_att.yml -o Global.pretrained_model={path/to/weights}/best_accuracy
+```
+
+
+### 3.3 Prediction
+
+```
+python3 tools/infer_rec.py -c configs/rec/rec_r34_vd_tps_bilstm_att.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.infer_img=doc/imgs_words/en/word_1.png
+```
+
+
+## 4. Inference and Deployment
+
+
+### 4.1 Python Inference
+First, convert the model saved during the RARE text recognition training process into an inference model. Take the model trained on the MJSynth and SynthText text recognition datasets based on the Resnet34_vd backbone network as an example ([model download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r34_vd_tps_bilstm_att_v2.0_train.tar)); it can be converted using the following command:
+
+```shell
+python3 tools/export_model.py -c configs/rec/rec_r34_vd_tps_bilstm_att.yml -o Global.pretrained_model=./rec_r34_vd_tps_bilstm_att_v2.0_train/best_accuracy Global.save_inference_dir=./inference/rec_rare
+```
+
+For RARE text recognition model inference, you can execute the following commands:
+
+```shell
+python3 tools/infer/predict_rec.py --image_dir="doc/imgs_words/en/word_1.png" --rec_model_dir="./inference/rec_rare/" --rec_image_shape="3, 32, 100" --rec_char_dict_path="./ppocr/utils/ic15_dict.txt"
+```
+The inference results are as follows:
+
+
+
+```
+Predicts of doc/imgs_words/en/word_1.png:('joint ', 0.9999969601631165)
+```
+
+
+### 4.2 C++ Inference
+
+Not currently supported
+
+
+### 4.3 Serving
+
+Not currently supported
+
+
+### 4.4 More
+
+The RARE model also supports the following inference deployment methods:
+
+- Paddle2ONNX Inference: After preparing the inference model, refer to the [paddle2onnx](../../deploy/paddle2onnx/) tutorial.
+
+
+## 5. FAQ
+
+## Citation
+
+```bibtex
+@inproceedings{2016Robust,
+  title={Robust Scene Text Recognition with Automatic Rectification},
+  author={Shi, Baoguang and Wang, Xinggang and Lyu, Pengyuan and Yao, Cong and Bai, Xiang},
+  booktitle={2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
+  year={2016},
+}
+```
diff --git a/doc/doc_en/algorithm_rec_rosetta_en.md b/doc/doc_en/algorithm_rec_rosetta_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..2a1d7b312799d1d2b4d64f959e0f580ab4eac6f9
--- /dev/null
+++ b/doc/doc_en/algorithm_rec_rosetta_en.md
@@ -0,0 +1,121 @@
+# Rosetta
+
+- [1. Introduction](#1)
+- [2. Environment](#2)
+- [3. Model Training / Evaluation / Prediction](#3)
+ - [3.1 Training](#3-1)
+ - [3.2 Evaluation](#3-2)
+ - [3.3 Prediction](#3-3)
+- [4. Inference and Deployment](#4)
+ - [4.1 Python Inference](#4-1)
+ - [4.2 C++ Inference](#4-2)
+ - [4.3 Serving](#4-3)
+ - [4.4 More](#4-4)
+- [5. FAQ](#5)
+
+
+## 1. Introduction
+
+Paper information:
+> [Rosetta: Large Scale System for Text Detection and Recognition in Images](https://arxiv.org/abs/1910.05085)
+> Fedor Borisyuk, Albert Gordo, Viswanath Sivakumar
+> KDD, 2018
+
+Using the MJSynth and SynthText text recognition datasets for training, and evaluating on the IIIT, SVT, IC03, IC13, IC15, SVTP and CUTE datasets, the algorithm reproduction effect is as follows:
+
+|Models|Backbone Networks|Configuration Files|Avg Accuracy|Download Links|
+| --- | --- | --- | --- | --- |
+|Rosetta|Resnet34_vd|[configs/rec/rec_r34_vd_none_none_ctc.yml](../../configs/rec/rec_r34_vd_none_none_ctc.yml)|79.11%|[training model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r34_vd_none_none_ctc_v2.0_train.tar)|
+|Rosetta|MobileNetV3|[configs/rec/rec_mv3_none_none_ctc.yml](../../configs/rec/rec_mv3_none_none_ctc.yml)|75.80%|[training model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_mv3_none_none_ctc_v2.0_train.tar)|
+
+
+
+## 2. Environment
+Please refer to [Operating Environment Preparation](./environment_en.md) to configure the PaddleOCR operating environment, and refer to [Project Clone](./clone_en.md) to clone the project code.
+
+
+
+## 3. Model Training / Evaluation / Prediction
+
+Please refer to [Text Recognition Training Tutorial](./recognition_en.md). PaddleOCR modularizes the code, and training different recognition models only requires **changing the configuration file**. Take the backbone network based on Resnet34_vd as an example:
+
+
+### 3.1 Training
+
+```
+#Single card training (long training period, not recommended)
+python3 tools/train.py -c configs/rec/rec_r34_vd_none_none_ctc.yml
+#Multi-card training, specify the card number through the --gpus parameter
+python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_r34_vd_none_none_ctc.yml
+```
+
+
+### 3.2 Evaluation
+
+```
+# GPU evaluation, Global.pretrained_model is the model to be evaluated
+python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_r34_vd_none_none_ctc.yml -o Global.pretrained_model={path/to/weights}/best_accuracy
+```
+
+
+### 3.3 Prediction
+
+```
+python3 tools/infer_rec.py -c configs/rec/rec_r34_vd_none_none_ctc.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.infer_img=doc/imgs_words/en/word_1.png
+```
+
+
+## 4. Inference and Deployment
+
+
+### 4.1 Python Inference
+First, convert the model saved during the Rosetta text recognition training process into an inference model. Take the model trained on the MJSynth and SynthText text recognition datasets based on the Resnet34_vd backbone network as an example ([model download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r34_vd_none_none_ctc_v2.0_train.tar)); it can be converted using the following command:
+
+```shell
+python3 tools/export_model.py -c configs/rec/rec_r34_vd_none_none_ctc.yml -o Global.pretrained_model=./rec_r34_vd_none_none_ctc_v2.0_train/best_accuracy Global.save_inference_dir=./inference/rec_rosetta
+```
+
+For Rosetta text recognition model inference, you can execute the following commands:
+
+```shell
+python3 tools/infer/predict_rec.py --image_dir="doc/imgs_words/en/word_1.png" --rec_model_dir="./inference/rec_rosetta/" --rec_image_shape="3, 32, 100" --rec_char_dict_path="./ppocr/utils/ic15_dict.txt"
+```
+
+The inference results are as follows:
+
+
+
+```
+Predicts of doc/imgs_words/en/word_1.png:('joint', 0.9999982714653015)
+```
+
+
+### 4.2 C++ Inference
+
+Not currently supported
+
+
+### 4.3 Serving
+
+Not currently supported
+
+
+### 4.4 More
+
+The Rosetta model also supports the following inference deployment methods:
+
+- Paddle2ONNX Inference: After preparing the inference model, refer to the [paddle2onnx](../../deploy/paddle2onnx/) tutorial.
+
+
+## 5. FAQ
+
+## Citation
+
+```bibtex
+@inproceedings{2018Rosetta,
+  title={Rosetta: Large Scale System for Text Detection and Recognition in Images},
+  author={Borisyuk, Fedor and Gordo, Albert and Sivakumar, Viswanath},
+  booktitle={Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery \& Data Mining},
+  year={2018},
+}
+```
diff --git a/doc/doc_en/algorithm_rec_sar_en.md b/doc/doc_en/algorithm_rec_sar_en.md
index c8656f5071358951d3f408f525b4b48c2e89817e..8c1e6dbbfa5cc05da4d7423a535c6db74cf8f4c3 100644
--- a/doc/doc_en/algorithm_rec_sar_en.md
+++ b/doc/doc_en/algorithm_rec_sar_en.md
@@ -24,7 +24,7 @@ Paper:
Using MJSynth and SynthText two text recognition datasets for training, and evaluating on IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE datasets, the algorithm reproduction effect is as follows:
|Model|Backbone|config|Acc|Download link|
-| --- | --- | --- | --- | --- | --- | --- |
+| --- | --- | --- | --- | --- |
|SAR|ResNet31|[rec_r31_sar.yml](../../configs/rec/rec_r31_sar.yml)|87.20%|[train model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/rec/rec_r31_sar_train.tar)|
Note:In addition to using the two text recognition datasets MJSynth and SynthText, [SynthAdd](https://pan.baidu.com/share/init?surl=uV0LtoNmcxbO-0YA7Ch4dg) data (extraction code: 627x), and some real data are used in training, the specific data details can refer to the paper.
diff --git a/doc/doc_en/algorithm_rec_seed_en.md b/doc/doc_en/algorithm_rec_seed_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..21679f42fd6302228804db49d731f9b69ec692b2
--- /dev/null
+++ b/doc/doc_en/algorithm_rec_seed_en.md
@@ -0,0 +1,111 @@
+# SEED
+
+- [1. Introduction](#1)
+- [2. Environment](#2)
+- [3. Model Training / Evaluation / Prediction](#3)
+ - [3.1 Training](#3-1)
+ - [3.2 Evaluation](#3-2)
+ - [3.3 Prediction](#3-3)
+- [4. Inference and Deployment](#4)
+ - [4.1 Python Inference](#4-1)
+ - [4.2 C++ Inference](#4-2)
+ - [4.3 Serving](#4-3)
+ - [4.4 More](#4-4)
+- [5. FAQ](#5)
+
+
+## 1. Introduction
+
+Paper:
+> [SEED: Semantics Enhanced Encoder-Decoder Framework for Scene Text Recognition](https://arxiv.org/pdf/2005.10977.pdf)
+
+> Qiao, Zhi and Zhou, Yu and Yang, Dongbao and Zhou, Yucan and Wang, Weiping
+
+> CVPR, 2020
+
+Using the MJSynth and SynthText text recognition datasets for training, and evaluating on the IIIT, SVT, IC03, IC13, IC15, SVTP and CUTE datasets, the algorithm reproduction effect is as follows:
+
+|Model|Backbone|ACC|config|Download link|
+| --- | --- | --- | --- | --- |
+|SEED|Aster_Resnet| 85.2% | [configs/rec/rec_resnet_stn_bilstm_att.yml](../../configs/rec/rec_resnet_stn_bilstm_att.yml) | [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/rec/rec_resnet_stn_bilstm_att.tar) |
+
+
+## 2. Environment
+Please refer to ["Environment Preparation"](./environment.md) to configure the PaddleOCR environment, and refer to ["Project Clone"](./clone.md) to clone the project code.
+
+
+
+## 3. Model Training / Evaluation / Prediction
+
+Please refer to [Text Recognition Tutorial](./recognition_en.md). PaddleOCR modularizes the code, and training different recognition models only requires **changing the configuration file**.
+
+Training:
+
+The SEED model additionally needs to load a [language model](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz) trained with FastText, so the fasttext dependency must be installed first:
+
+```
+python3 -m pip install fasttext==0.9.1
+```
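+
+For reference, the language model linked above can be fetched and unpacked as follows (where to place `cc.en.300.bin` depends on the path expected by `rec_resnet_stn_bilstm_att.yml`, so adjust accordingly):
+
+```
+# Download and decompress the FastText English language model referenced above
+wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
+gunzip cc.en.300.bin.gz
+```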
+
+Specifically, after the data preparation is completed, the training can be started. The training command is as follows:
+
+```
+#Single GPU training (long training period, not recommended)
+python3 tools/train.py -c configs/rec/rec_resnet_stn_bilstm_att.yml
+
+#Multi GPU training, specify the gpu number through the --gpus parameter
+python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_resnet_stn_bilstm_att.yml
+```
+
+Evaluation:
+
+```
+# GPU evaluation
+python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_resnet_stn_bilstm_att.yml -o Global.pretrained_model={path/to/weights}/best_accuracy
+```
+
+Prediction:
+
+```
+# The configuration file used for prediction must match the training
+python3 tools/infer_rec.py -c configs/rec/rec_resnet_stn_bilstm_att.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.infer_img=doc/imgs_words/en/word_1.png
+```
+
+
+## 4. Inference and Deployment
+
+
+### 4.1 Python Inference
+
+Not supported
+
+
+### 4.2 C++ Inference
+
+Not supported
+
+
+### 4.3 Serving
+
+Not supported
+
+
+### 4.4 More
+
+Not supported
+
+
+## 5. FAQ
+
+
+## Citation
+
+```bibtex
+@inproceedings{qiao2020seed,
+ title={Seed: Semantics enhanced encoder-decoder framework for scene text recognition},
+ author={Qiao, Zhi and Zhou, Yu and Yang, Dongbao and Zhou, Yucan and Wang, Weiping},
+ booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
+ pages={13528--13537},
+ year={2020}
+}
+```
diff --git a/doc/doc_en/algorithm_rec_srn_en.md b/doc/doc_en/algorithm_rec_srn_en.md
index ebc4a74ffd0215bd46467b38ac48db160c8ada74..c022a81f9e5797c531c79de7e793d44d9a22552c 100644
--- a/doc/doc_en/algorithm_rec_srn_en.md
+++ b/doc/doc_en/algorithm_rec_srn_en.md
@@ -24,7 +24,7 @@ Paper:
Using MJSynth and SynthText two text recognition datasets for training, and evaluating on IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE datasets, the algorithm reproduction effect is as follows:
|Model|Backbone|config|Acc|Download link|
-| --- | --- | --- | --- | --- | --- | --- |
+| --- | --- | --- | --- | --- |
|SRN|Resnet50_vd_fpn|[rec_r50_fpn_srn.yml](../../configs/rec/rec_r50_fpn_srn.yml)|86.31%|[train model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r50_vd_srn_train.tar)|
diff --git a/doc/doc_en/algorithm_rec_svtr_en.md b/doc/doc_en/algorithm_rec_svtr_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..2e7deb4c077ce508773c4789e2e76bdda7dfe8c8
--- /dev/null
+++ b/doc/doc_en/algorithm_rec_svtr_en.md
@@ -0,0 +1,146 @@
+# SVTR
+
+- [1. Introduction](#1)
+- [2. Environment](#2)
+- [3. Model Training / Evaluation / Prediction](#3)
+ - [3.1 Training](#3-1)
+ - [3.2 Evaluation](#3-2)
+ - [3.3 Prediction](#3-3)
+- [4. Inference and Deployment](#4)
+ - [4.1 Python Inference](#4-1)
+ - [4.2 C++ Inference](#4-2)
+ - [4.3 Serving](#4-3)
+ - [4.4 More](#4-4)
+- [5. FAQ](#5)
+
+
+## 1. Introduction
+
+Paper:
+> [SVTR: Scene Text Recognition with a Single Visual Model](https://arxiv.org/abs/2205.00159)
+> Yongkun Du and Zhineng Chen and Caiyan Jia and Xiaoting Yin and Tianlun Zheng and Chenxia Li and Yuning Du and Yu-Gang Jiang
+> IJCAI, 2022
+
+
+The accuracy (%) and model files of SVTR on the public scene text recognition datasets are as follows:
+* The Chinese dataset comes from the [Chinese Benchmark](https://arxiv.org/abs/2112.15093), and the Chinese training and evaluation strategy of SVTR follows the paper.
+
+| Model |IC13<br/>857 | SVT |IIIT5k<br/>3000 |IC15<br/>1811| SVTP |CUTE80 | Avg_6 |IC15<br/>2077 |IC13<br/>1015 |IC03<br/>867|IC03<br/>860|Avg_10 | Chinese<br/>scene_test| Download link |
+|:----------:|:------:|:-----:|:---------:|:------:|:-----:|:-----:|:-----:|:-------:|:-------:|:-----:|:-----:|:---------------------------------------------:|:-----:|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|
+| SVTR Tiny | 96.85 | 91.34 | 94.53 | 83.99 | 85.43 | 89.24 | 90.87 | 80.55 | 95.37 | 95.27 | 95.70 | 90.13 | 67.90 | [English](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/rec_svtr_tiny_none_ctc_en_train.tar) / [Chinese](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/rec_svtr_tiny_none_ctc_ch_train.tar) |
+| SVTR Small | 95.92 | 93.04 | 95.03 | 84.70 | 87.91 | 92.01 | 91.63 | 82.72 | 94.88 | 96.08 | 96.28 | 91.02 | 69.00 | [English](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/rec_svtr_small_none_ctc_en_train.tar) / [Chinese](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/rec_svtr_small_none_ctc_ch_train.tar) |
+| SVTR Base | 97.08 | 91.50 | 96.03 | 85.20 | 89.92 | 91.67 | 92.33 | 83.73 | 95.66 | 95.62 | 95.81 | 91.61 | 71.40 | [English](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/rec_svtr_base_none_ctc_en_train.tar) / - |
+| SVTR Large | 97.20 | 91.65 | 96.30 | 86.58 | 88.37 | 95.14 | 92.82 | 84.54 | 96.35 | 96.54 | 96.74 | 92.24 | 72.10 | [English](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/rec_svtr_large_none_ctc_en_train.tar) / [Chinese](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/rec_svtr_large_none_ctc_ch_train.tar) |
+
+
+## 2. Environment
+Please refer to ["Environment Preparation"](./environment.md) to configure the PaddleOCR environment, and refer to ["Project Clone"](./clone.md) to clone the project code.
+
+#### Dataset Preparation
+
+* [English dataset download](https://github.com/clovaai/deep-text-recognition-benchmark#download-lmdb-dataset-for-traininig-and-evaluation-from-here)
+* [Chinese dataset download](https://github.com/fudanvi/benchmarking-chinese-text-recognition#download)
+
+
+## 3. Model Training / Evaluation / Prediction
+
+Please refer to [Text Recognition Tutorial](./recognition_en.md). PaddleOCR modularizes the code, and training different recognition models only requires **changing the configuration file**.
+
+Training:
+
+Specifically, after the data preparation is completed, the training can be started. The training command is as follows:
+
+```
+#Single GPU training (long training period, not recommended)
+python3 tools/train.py -c configs/rec/rec_svtrnet.yml
+
+#Multi GPU training, specify the gpu number through the --gpus parameter
+python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_svtrnet.yml
+```
+
+Evaluation:
+
+You can download the model and configuration files provided by `SVTR` from the [download link](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/rec_svtr_tiny_none_ctc_en_train.tar). Taking `SVTR-T` as an example, use the following commands to evaluate:
+
+```
+# Download the tar archive containing the model files and configuration files of SVTR-T and extract it
+wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/rec_svtr_tiny_none_ctc_en_train.tar && tar xf rec_svtr_tiny_none_ctc_en_train.tar
+# GPU evaluation
+python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c ./rec_svtr_tiny_none_ctc_en_train/rec_svtr_tiny_6local_6global_stn_en.yml -o Global.pretrained_model=./rec_svtr_tiny_none_ctc_en_train/best_accuracy
+```
+
+Prediction:
+
+```
+python3 tools/infer_rec.py -c ./rec_svtr_tiny_none_ctc_en_train/rec_svtr_tiny_6local_6global_stn_en.yml -o Global.infer_img='./doc/imgs_words_en/word_10.png' Global.pretrained_model=./rec_svtr_tiny_none_ctc_en_train/best_accuracy
+```
+
+
+## 4. Inference and Deployment
+
+
+### 4.1 Python Inference
+First, convert the model saved during SVTR text recognition training into an inference model ([model download link](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/rec_svtr_tiny_none_ctc_en_train.tar)). You can use the following command to convert:
+
+```
+python3 tools/export_model.py -c configs/rec/rec_svtrnet.yml -o Global.pretrained_model=./rec_svtr_tiny_none_ctc_en_train/best_accuracy Global.save_inference_dir=./inference/rec_svtr_tiny_stn_en
+```
+
+**Note:**
+- If you are training the model on your own dataset and have modified the dictionary file, please change `character_dict_path` in the configuration file to your modified dictionary file.
+- If you modified the input size during training, please modify the `infer_shape` corresponding to SVTR in the `tools/export_model.py` file.
+
+After the conversion is successful, there are three files in the directory:
+```
+/inference/rec_svtr_tiny_stn_en/
+ ├── inference.pdiparams
+ ├── inference.pdiparams.info
+ └── inference.pdmodel
+```
+
+
+For SVTR text recognition model inference, the following commands can be executed:
+
+```
+python3 tools/infer/predict_rec.py --image_dir='./doc/imgs_words_en/word_10.png' --rec_model_dir='./inference/rec_svtr_tiny_stn_en/' --rec_algorithm='SVTR' --rec_image_shape='3,64,256' --rec_char_dict_path='./ppocr/utils/ic15_dict.txt'
+```
+
+
+
+After executing the command, the prediction result (recognized text and score) of the image above is printed to the screen. An example is as follows:
+```shell
+Predicts of ./doc/imgs_words_en/word_10.png:('pain', 0.9999998807907104)
+```
+
+
+### 4.2 C++ Inference
+
+Not supported
+
+
+### 4.3 Serving
+
+Not supported
+
+
+### 4.4 More
+
+Not supported
+
+
+## 5. FAQ
+
+1. Since most of the operators used by `SVTR` are matrix multiplications, the model has a speed advantage in a GPU environment; however, on a CPU with mkldnn enabled, `SVTR` has no advantage over an optimized convolutional network. A sketch for comparing the two settings is given below.
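+
+As a sketch, the same predictor command used above can be run on GPU and on CPU with mkldnn, assuming the standard `--use_gpu` and `--enable_mkldnn` inference flags (timings depend on your hardware):
+
+```
+# GPU inference
+python3 tools/infer/predict_rec.py --image_dir='./doc/imgs_words_en/word_10.png' --rec_model_dir='./inference/rec_svtr_tiny_stn_en/' --rec_algorithm='SVTR' --rec_image_shape='3,64,256' --rec_char_dict_path='./ppocr/utils/ic15_dict.txt' --use_gpu=True
+
+# CPU inference with mkldnn enabled
+python3 tools/infer/predict_rec.py --image_dir='./doc/imgs_words_en/word_10.png' --rec_model_dir='./inference/rec_svtr_tiny_stn_en/' --rec_algorithm='SVTR' --rec_image_shape='3,64,256' --rec_char_dict_path='./ppocr/utils/ic15_dict.txt' --use_gpu=False --enable_mkldnn=True
+```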
+
+## Citation
+
+```bibtex
+@article{Du2022SVTR,
+ title = {SVTR: Scene Text Recognition with a Single Visual Model},
+ author = {Du, Yongkun and Chen, Zhineng and Jia, Caiyan and Yin, Xiaoting and Zheng, Tianlun and Li, Chenxia and Du, Yuning and Jiang, Yu-Gang},
+ booktitle = {IJCAI},
+ year = {2022},
+ url = {https://arxiv.org/abs/2205.00159}
+}
+```
diff --git a/doc/doc_en/dataset/docvqa_datasets_en.md b/doc/doc_en/dataset/docvqa_datasets_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..820462c324318a391abe409412e8996f11b36279
--- /dev/null
+++ b/doc/doc_en/dataset/docvqa_datasets_en.md
@@ -0,0 +1,27 @@
+## DocVQA dataset
+Here are the common DocVQA datasets, which are being updated continuously. Contributions of new datasets are welcome.
+- [FUNSD dataset](#funsd)
+- [XFUND dataset](#xfund)
+
+
+#### 1. FUNSD dataset
+- **Data source**: https://guillaumejaume.github.io/FUNSD/
+- **Data Introduction**: The FUNSD dataset is a dataset for form comprehension. It contains 199 real, fully annotated scanned images, including marketing reports, advertisements, and academic reports, and is divided into 149 training samples and 50 test samples. The FUNSD dataset is suitable for many types of DocVQA tasks, such as field-level entity classification and field-level entity linking. Part of the images and the annotation box visualization are shown below:
+
+

+

+
+ In the figure, the orange area represents `header`, the light blue area represents `question`, the green area represents `answer`, and the pink area represents `other`.
+
+- **Download address**: https://guillaumejaume.github.io/FUNSD/download/
+
+
+#### 2. XFUND dataset
+- **Data source**: https://github.com/doc-analysis/XFUND
+- **Data introduction**: XFUND is a multilingual form comprehension dataset containing form data in 7 different languages, all manually annotated with key-value pairs. The data for each language contains 199 forms, divided into 149 training samples and 50 test samples. Part of the images and the annotation box visualization are shown below:
+
+

+

+
+
+- **Download address**: https://github.com/doc-analysis/XFUND/releases/tag/v1.0
diff --git a/doc/doc_en/inference_ppocr_en.md b/doc/doc_en/inference_ppocr_en.md
index 8dc30d3106048575a9ad722386daf9cb658dd455..dcfb738e7699736ad871cd6fd28ce2a8300a277a 100755
--- a/doc/doc_en/inference_ppocr_en.md
+++ b/doc/doc_en/inference_ppocr_en.md
@@ -20,10 +20,10 @@ The default configuration is based on the inference setting of the DB text detec
```
# download DB text detection inference model
-wget https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar
-tar xf ch_PP-OCRv2_det_infer.tar
+wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar
+tar xf ch_PP-OCRv3_det_infer.tar
# run inference
-python3 tools/infer/predict_det.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./ch_PP-OCRv2_det_infer.tar/"
+python3 tools/infer/predict_det.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./ch_PP-OCRv3_det_infer/"
```
The visual text detection results are saved to the ./inference_results folder by default, and the name of the result file is prefixed with 'det_res'. Examples of results are as follows:
@@ -40,12 +40,12 @@ Set as `limit_type='min', det_limit_side_len=960`, it means that the shortest si
If the resolution of the input picture is relatively large and you want to use a larger resolution prediction, you can set det_limit_side_len to the desired value, such as 1216:
```
-python3 tools/infer/predict_det.py --image_dir="./doc/imgs/1.jpg" --det_model_dir="./inference/ch_PP-OCRv2_det_infer/" --det_limit_type=max --det_limit_side_len=1216
+python3 tools/infer/predict_det.py --image_dir="./doc/imgs/1.jpg" --det_model_dir="./ch_PP-OCRv3_det_infer/" --det_limit_type=max --det_limit_side_len=1216
```
If you want to use the CPU for prediction, execute the command as follows
```
-python3 tools/infer/predict_det.py --image_dir="./doc/imgs/1.jpg" --det_model_dir="./inference/ch_PP-OCRv2_det_infer/" --use_gpu=False
+python3 tools/infer/predict_det.py --image_dir="./doc/imgs/1.jpg" --det_model_dir="./ch_PP-OCRv3_det_infer/" --use_gpu=False
```
@@ -56,14 +56,17 @@ python3 tools/infer/predict_det.py --image_dir="./doc/imgs/1.jpg" --det_model_di
### 1. Lightweight Chinese Recognition Model Inference
+**Note**: The input shape used by the recognition model of `PP-OCRv3` is `3,48,320`, and the parameter `--rec_image_shape=3,48,320` needs to be added. If the recognition model of `PP-OCRv3` is not used, this parameter does not need to be set.
+
+
For lightweight Chinese recognition model inference, you can execute the following commands:
```
# download CRNN text recognition inference model
-wget https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer.tar
-tar xf ch_PP-OCRv2_rec_infer.tar
+wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar
+tar xf ch_PP-OCRv3_rec_infer.tar
# run inference
-python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words/ch/word_4.jpg" --rec_model_dir="./ch_PP-OCRv2_rec_infer/"
+python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words_en/word_10.png" --rec_model_dir="./ch_PP-OCRv3_rec_infer/" --rec_image_shape=3,48,320
```

@@ -71,7 +74,7 @@ python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words/ch/word_4.jpg"
After executing the command, the prediction results (recognized text and score) of the above image will be printed on the screen.
```bash
-Predicts of ./doc/imgs_words_en/word_10.png:('PAIN', 0.9897658)
+Predicts of ./doc/imgs_words_en/word_10.png:('PAIN', 0.988671)
```
@@ -117,20 +120,22 @@ After executing the command, the prediction results (classification angle and sc
## Text Detection Angle Classification and Recognition Inference Concatenation
+**Note**: The input shape used by the recognition model of `PP-OCRv3` is `3,48,320`, and the parameter `--rec_image_shape=3,48,320` needs to be added. If the recognition model of `PP-OCRv3` is not used, this parameter does not need to be set.
+
When performing prediction, you need to specify the path of a single image or a folder of images through the parameter `image_dir`, the parameter `det_model_dir` specifies the path to detect the inference model, the parameter `cls_model_dir` specifies the path to angle classification inference model and the parameter `rec_model_dir` specifies the path to identify the inference model. The parameter `use_angle_cls` is used to control whether to enable the angle classification model. The parameter `use_mp` specifies whether to use multi-process to infer `total_process_num` specifies process number when using multi-process. The parameter . The visualized recognition results are saved to the `./inference_results` folder by default.
```shell
# use direction classifier
-python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./inference/ch_PP-OCRv2_det_infer/" --cls_model_dir="./inference/cls/" --rec_model_dir="./inference/ch_PP-OCRv2_rec_infer/" --use_angle_cls=true
+python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./ch_PP-OCRv3_det_infer/" --cls_model_dir="./cls/" --rec_model_dir="./ch_PP-OCRv3_rec_infer/" --use_angle_cls=true --rec_image_shape=3,48,320
# not use use direction classifier
-python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./inference/ch_PP-OCRv2_det_infer/" --rec_model_dir="./inference/ch_PP-OCRv2_rec_infer/" --use_angle_cls=false
+python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./ch_PP-OCRv3_det_infer/" --rec_model_dir="./ch_PP-OCRv3_rec_infer/" --use_angle_cls=false --rec_image_shape=3,48,320
# use multi-process
-python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./inference/ch_PP-OCRv2_det_infer/" --rec_model_dir="./inference/ch_PP-OCRv2_rec_infer/" --use_angle_cls=false --use_mp=True --total_process_num=6
+python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./ch_PP-OCRv3_det_infer/" --rec_model_dir="./ch_PP-OCRv3_rec_infer/" --use_angle_cls=false --use_mp=True --total_process_num=6 --rec_image_shape=3,48,320
```
After executing the command, the recognition result image is as follows:
-
+
diff --git a/doc/doc_en/knowledge_distillation_en.md b/doc/doc_en/knowledge_distillation_en.md
index 1db9faef5c97cacbba36fdb42807924e9c7a53cf..bd36907c98c6d556fe1dea85712ece0e717fe426 100755
--- a/doc/doc_en/knowledge_distillation_en.md
+++ b/doc/doc_en/knowledge_distillation_en.md
@@ -74,6 +74,7 @@ The configuration file is in [ch_PP-OCRv2_rec_distillation.yml](../../configs/re
#### 2.1.1 Model Structure
In the knowledge distillation task, the model structure configuration is as follows.
+
```yaml
Architecture:
model_type: &model_type "rec" # Model category, recognition, detection, etc.
@@ -85,37 +86,55 @@ Architecture:
freeze_params: false # Do you need fixed parameters
return_all_feats: true # Do you need to return all features, if it is False, only the final output is returned
model_type: *model_type # Model category
- algorithm: CRNN # The algorithm name of the sub-network. The remaining parameters of the sub-network are consistent with the general model training configuration
+ algorithm: SVTR # The algorithm name of the sub-network. The remaining parameters of the sub-network are consistent with the general model training configuration
Transform:
Backbone:
name: MobileNetV1Enhance
scale: 0.5
- Neck:
- name: SequenceEncoder
- encoder_type: rnn
- hidden_size: 64
+ last_conv_stride: [1, 2]
+ last_pool_type: avg
Head:
- name: CTCHead
- mid_channels: 96
- fc_decay: 0.00002
+ name: MultiHead
+ head_list:
+ - CTCHead:
+ Neck:
+ name: svtr
+ dims: 64
+ depth: 2
+ hidden_dims: 120
+ use_guide: True
+ Head:
+ fc_decay: 0.00001
+ - SARHead:
+ enc_dim: 512
+ max_text_length: *max_text_length
Student: # Another sub-network, here is a distillation example of DML, the two sub-networks have the same structure, and both need to learn parameters
pretrained: # The following parameters are the same as above
freeze_params: false
return_all_feats: true
model_type: *model_type
- algorithm: CRNN
+ algorithm: SVTR
Transform:
Backbone:
name: MobileNetV1Enhance
scale: 0.5
- Neck:
- name: SequenceEncoder
- encoder_type: rnn
- hidden_size: 64
+ last_conv_stride: [1, 2]
+ last_pool_type: avg
Head:
- name: CTCHead
- mid_channels: 96
- fc_decay: 0.00002
+ name: MultiHead
+ head_list:
+ - CTCHead:
+ Neck:
+ name: svtr
+ dims: 64
+ depth: 2
+ hidden_dims: 120
+ use_guide: True
+ Head:
+ fc_decay: 0.00001
+ - SARHead:
+ enc_dim: 512
+ max_text_length: *max_text_length
```
If you want to add more sub-networks for training, you can also add the corresponding fields in the configuration file according to the way of adding `Student` and `Teacher`.
@@ -132,55 +151,83 @@ Architecture:
freeze_params: false
return_all_feats: true
model_type: *model_type
- algorithm: CRNN
+ algorithm: SVTR
Transform:
Backbone:
name: MobileNetV1Enhance
scale: 0.5
- Neck:
- name: SequenceEncoder
- encoder_type: rnn
- hidden_size: 64
+ last_conv_stride: [1, 2]
+ last_pool_type: avg
Head:
- name: CTCHead
- mid_channels: 96
- fc_decay: 0.00002
+ name: MultiHead
+ head_list:
+ - CTCHead:
+ Neck:
+ name: svtr
+ dims: 64
+ depth: 2
+ hidden_dims: 120
+ use_guide: True
+ Head:
+ fc_decay: 0.00001
+ - SARHead:
+ enc_dim: 512
+ max_text_length: *max_text_length
Student:
pretrained:
freeze_params: false
return_all_feats: true
model_type: *model_type
- algorithm: CRNN
+ algorithm: SVTR
Transform:
Backbone:
name: MobileNetV1Enhance
scale: 0.5
- Neck:
- name: SequenceEncoder
- encoder_type: rnn
- hidden_size: 64
+ last_conv_stride: [1, 2]
+ last_pool_type: avg
Head:
- name: CTCHead
- mid_channels: 96
- fc_decay: 0.00002
- Student2: # The new sub-network introduced in the knowledge distillation task, the configuration is the same as above
+ name: MultiHead
+ head_list:
+ - CTCHead:
+ Neck:
+ name: svtr
+ dims: 64
+ depth: 2
+ hidden_dims: 120
+ use_guide: True
+ Head:
+ fc_decay: 0.00001
+ - SARHead:
+ enc_dim: 512
+ max_text_length: *max_text_length
+ Student2:
pretrained:
freeze_params: false
return_all_feats: true
model_type: *model_type
- algorithm: CRNN
+ algorithm: SVTR
Transform:
Backbone:
name: MobileNetV1Enhance
scale: 0.5
- Neck:
- name: SequenceEncoder
- encoder_type: rnn
- hidden_size: 64
+ last_conv_stride: [1, 2]
+ last_pool_type: avg
Head:
- name: CTCHead
- mid_channels: 96
- fc_decay: 0.00002
+ name: MultiHead
+ head_list:
+ - CTCHead:
+ Neck:
+ name: svtr
+ dims: 64
+ depth: 2
+ hidden_dims: 120
+ use_guide: True
+ Head:
+ fc_decay: 0.00001
+ - SARHead:
+ enc_dim: 512
+ max_text_length: *max_text_length
```
When the model is finally trained, it contains 3 sub-networks: `Teacher`, `Student`, `Student2`.
@@ -224,23 +271,42 @@ Loss:
act: "softmax" # Activation function, use it to process the input, can be softmax, sigmoid or None, the default is None
model_name_pairs: # The subnet name pair used to calculate DML loss. If you want to calculate the DML loss of other subnets, you can continue to add it below the list
- ["Student", "Teacher"]
- key: head_out
+ key: head_out
+      multi_head: True # whether to use multi_head
+ dis_head: ctc # assign the head name to calculate loss
+ name: dml_ctc # prefix name of the loss
+ - DistillationDMLLoss: # DML loss function, inherited from the standard DMLLoss
+ weight: 0.5
+ act: "softmax" # Activation function, use it to process the input, can be softmax, sigmoid or None, the default is None
+ model_name_pairs: # The subnet name pair used to calculate DML loss. If you want to calculate the DML loss of other subnets, you can continue to add it below the list
+ - ["Student", "Teacher"]
+ key: head_out
+ multi_head: True # whether to use mult_head
+ dis_head: sar # assign the head name to calculate loss
+ name: dml_sar # prefix name of the loss
- DistillationDistanceLoss: # Distilled distance loss function
weight: 1.0
mode: "l2" # Support l1, l2 or smooth_l1
model_name_pairs: # Calculate the distance loss of the subnet name pair
- ["Student", "Teacher"]
key: backbone_out
+ - DistillationSARLoss: # SAR loss function based on distillation, inherited from standard SAR loss
+ weight: 1.0 # The weight of the loss function. In loss_config_list, each loss function must include this field
+ model_name_list: ["Student", "Teacher"] # For the prediction results of the distillation model, extract the output of these two sub-networks and calculate the SAR loss with gt
+ key: head_out # In the sub-network output dict, take the corresponding tensor
+ multi_head: True # whether it is multi-head or not, if true, SAR branch is used to calculate the loss
```
Among the above loss functions, all distillation loss functions are inherited from the standard loss function class.
The main functions are: Analyze the output of the distillation model, find the intermediate node (tensor) used to calculate the loss,
and then use the standard loss function class to calculate.
-Taking the above configuration as an example, the final distillation training loss function contains the following three parts.
+Taking the above configuration as an example, the final distillation training loss function contains the following five parts.
-- The final output `head_out` of `Student` and `Teacher` calculates the CTC loss with gt (loss weight equals 1.0). Here, because both sub-networks need to update the parameters, both of them need to calculate the loss with gt.
-- DML loss between `Student` and `Teacher`'s final output `head_out` (loss weight equals 1.0).
+- CTC branch of the final output `head_out` for `Student` and `Teacher` calculates the CTC loss with gt (loss weight equals 1.0). Here, because both sub-networks need to update the parameters, both of them need to calculate the loss with gt.
+- SAR branch of the final output `head_out` for `Student` and `Teacher` calculates the SAR loss with gt (loss weight equals 1.0). Here, because both sub-networks need to update the parameters, both of them need to calculate the loss with gt.
+- DML loss between CTC branch of `Student` and `Teacher`'s final output `head_out` (loss weight equals 1.0).
+- DML loss between SAR branch of `Student` and `Teacher`'s final output `head_out` (loss weight equals 0.5).
- L2 loss between `Student` and `Teacher`'s backbone network output `backbone_out` (loss weight equals 1.0).
For more specific implementation of `CombinedLoss`, please refer to: [combined_loss.py](../../ppocr/losses/combined_loss.py#L23).
@@ -257,6 +323,7 @@ PostProcess:
name: DistillationCTCLabelDecode # CTC decoding post-processing of distillation tasks, inherited from the standard CTCLabelDecode class
model_name: ["Student", "Teacher"] # For the prediction results of the distillation model, extract the outputs of these two sub-networks and decode them
key: head_out # Take the corresponding tensor in the subnet output dict
+  multi_head: True # whether it is multi-head or not; if true, the CTC branch output is used for decoding
```
Taking the above configuration as an example, the CTC decoding output of the two sub-networks `Student` and `Teahcer` will be calculated at the same time.
@@ -276,6 +343,7 @@ Metric:
base_metric_name: RecMetric # The base class of indicator calculation. For the output of the model, the indicator will be calculated based on this class
main_indicator: acc # The name of the indicator
key: "Student" # Select the main_indicator of this subnet as the criterion for saving the best model
+  ignore_space: False # whether to ignore spaces during evaluation
```
Taking the above configuration as an example, the accuracy metric of the `Student` subnet will be used as the judgment metric for saving the best model.
@@ -289,13 +357,13 @@ For more specific implementation of `DistillationMetric`, please refer to: [dist
There are two ways to fine-tune the recognition distillation task.
-1. Fine-tuning based on knowledge distillation: this situation is relatively simple, download the pre-trained model. Then configure the pre-training model path and your own data path in [ch_PP-OCRv2_rec_distillation.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec_distillation.yml) to perform fine-tuning training of the model.
+1. Fine-tuning based on knowledge distillation: this situation is relatively simple, download the pre-trained model. Then configure the pre-training model path and your own data path in [ch_PP-OCRv2_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml) to perform fine-tuning training of the model.
2. Do not use knowledge distillation in fine-tuning: In this case, you need to first extract the student model parameters from the pre-training model. The specific steps are as follows.
- First download the pre-trained model and unzip it.
```shell
-wget https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_train.tar
-tar -xf ch_PP-OCRv2_rec_train.tar
+wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_train.tar
+tar -xf ch_PP-OCRv3_rec_train.tar
```
- Then use python to extract the student model parameters
@@ -303,7 +371,7 @@ tar -xf ch_PP-OCRv2_rec_train.tar
```python
import paddle
# Load the pre-trained model
-all_params = paddle.load("ch_PP-OCRv2_rec_train/best_accuracy.pdparams")
+all_params = paddle.load("ch_PP-OCRv3_rec_train/best_accuracy.pdparams")
# View the keys of the weight parameter
print(all_params.keys())
# Weight extraction of student model
@@ -311,19 +379,18 @@ s_params = {key[len("Student."):]: all_params[key] for key in all_params if "Stu
# View the keys of the weight parameters of the student model
print(s_params.keys())
# Save weight parameters
-paddle.save(s_params, "ch_PP-OCRv2_rec_train/student.pdparams")
+paddle.save(s_params, "ch_PP-OCRv3_rec_train/student.pdparams")
```
-After the extraction is complete, use [ch_PP-OCRv2_rec.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec.yml) to modify the path of the pre-trained model (the path of the exported `student.pdparams` model) and your own data path to fine-tune the model.
+After the extraction is complete, use [ch_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec.yml) to modify the path of the pre-trained model (the path of the exported `student.pdparams` model) and your own data path to fine-tune the model.
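+
+As a sketch, the fine-tuning can then be launched from the extracted student weights (the data paths still need to be set in the yml, or passed as additional `-o` overrides):
+
+```shell
+python3 tools/train.py -c configs/rec/PP-OCRv3/ch_PP-OCRv3_rec.yml -o Global.pretrained_model=./ch_PP-OCRv3_rec_train/student
+```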
### 2.2 Detection Model Configuration File Analysis
+The configuration files for detection model distillation are in the ```PaddleOCR/configs/det/ch_PP-OCRv3/``` directory, which contains the following two distillation configuration files:
+The configuration files for detection model distillation are in the ```PaddleOCR/configs/det/ch_PP-OCRv3/``` directory, which contains two distillation configuration files:
-- ```ch_PP-OCRv2_det_cml.yml```, Use one large model to distill two small models, and the two small models learn from each other
-- ```ch_PP-OCRv2_det_dml.yml```, Method of mutual distillation of two student models
-- ```ch_PP-OCRv2_det_distill.yml```, The method of using large teacher model to distill small student model
+- ```ch_PP-OCRv3_det_cml.yml```: a large teacher model distills two small student models, and the two student models also learn from each other
+- ```ch_PP-OCRv3_det_dml.yml```: two student models distill each other (mutual learning)
#### 2.2.1 Model Structure
@@ -341,39 +408,40 @@ Architecture:
model_type: det
algorithm: DB
Backbone:
- name: MobileNetV3
- scale: 0.5
- model_name: large
- disable_se: True
+ name: ResNet
+ in_channels: 3
+ layers: 50
Neck:
- name: DBFPN
- out_channels: 96
+ name: LKPAN
+ out_channels: 256
Head:
name: DBHead
+ kernel_list: [7,2,2]
k: 50
Teacher: # Another sub-network, here is a distillation example of a large model distill a small model
pretrained: ./pretrain_models/ch_ppocr_server_v2.0_det_train/best_accuracy
- freeze_params: true # The Teacher model is well-trained and does not need to participate in training
return_all_feats: false
model_type: det
algorithm: DB
Transform:
Backbone:
name: ResNet
- layers: 18
+ in_channels: 3
+ layers: 50
Neck:
- name: DBFPN
+ name: LKPAN
out_channels: 256
Head:
name: DBHead
+ kernel_list: [7,2,2]
k: 50
```
If DML is used, that is, the method of two small models learning from each other, the Teacher network structure in the above configuration file needs to be set to the same configuration as the Student model.
-Refer to the configuration file for details. [ch_PP-OCRv2_det_dml.yml](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.4/configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_dml.yml)
+Refer to the configuration file for details. [ch_PP-OCRv3_det_dml.yml](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.4/configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_dml.yml)
-The following describes the configuration file parameters [ch_PP-OCRv2_det_cml.yml](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.4/configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml):
+The following describes the configuration file parameters [ch_PP-OCRv3_det_cml.yml](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.4/configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml):
```
Architecture:
@@ -390,12 +458,14 @@ Architecture:
Transform:
Backbone:
name: ResNet
- layers: 18
+ in_channels: 3
+ layers: 50
Neck:
- name: DBFPN
+ name: LKPAN
out_channels: 256
Head:
name: DBHead
+ kernel_list: [7,2,2]
k: 50
Student: # Student model configuration for CML distillation
pretrained: ./pretrain_models/MobileNetV3_large_x0_5_pretrained
@@ -407,10 +477,11 @@ Architecture:
name: MobileNetV3
scale: 0.5
model_name: large
- disable_se: True
+ disable_se: true
Neck:
- name: DBFPN
+ name: RSEFPN
out_channels: 96
+ shortcut: True
Head:
name: DBHead
k: 50
@@ -425,10 +496,11 @@ Architecture:
name: MobileNetV3
scale: 0.5
model_name: large
- disable_se: True
+ disable_se: true
Neck:
- name: DBFPN
+ name: RSEFPN
out_channels: 96
+ shortcut: True
Head:
name: DBHead
k: 50
@@ -460,34 +532,7 @@ The key contains `backbone_out`, `neck_out`, `head_out`, and `value` is the tens
#### 2.2.2 Loss Function
-
-In the task of detection knowledge distillation ```ch_PP-OCRv2_det_distill.yml````, the distillation loss function configuration is as follows.
-```yaml
-Loss:
- name: CombinedLoss # Loss function name
- loss_config_list: # List of loss function configuration files, mandatory functions for CombinedLoss
- - DistillationDilaDBLoss: # DB loss function based on distillation, inherited from standard DBloss
- weight: 1.0 # The weight of the loss function. In loss_config_list, each loss function must include this field
- model_name_pairs: # Extract the output of these two sub-networks and calculate the loss between them
- - ["Student", "Teacher"]
- key: maps # In the sub-network output dict, take the corresponding tensor
- balance_loss: true # The following parameters are the configuration parameters of standard DBloss
- main_loss_type: DiceLoss
- alpha: 5
- beta: 10
- ohem_ratio: 3
- - DistillationDBLoss: # Used to calculate the loss between Student and GT
- weight: 1.0
- model_name_list: ["Student"] # The model name only has Student, which means that the loss between Student and GT is calculated
- name: DBLoss
- balance_loss: true
- main_loss_type: DiceLoss
- alpha: 5
- beta: 10
- ohem_ratio: 3
-```
-
-Similarly, distillation loss function configuration(`ch_PP-OCRv2_det_cml.yml`) is shown below. Compared with the loss function configuration of ch_PP-OCRv2_det_distill.yml, there are three changes:
+The distillation loss function configuration(`ch_PP-OCRv3_det_cml.yml`) is shown below.
```yaml
Loss:
name: CombinedLoss
@@ -530,7 +575,7 @@ In the task of detecting knowledge distillation, the post-processing configurati
```yaml
PostProcess:
- name: DistillationDBPostProcess # The CTC decoding post-processing of the DB detection distillation task, inherited from the standard DBPostProcess class
+ name: DistillationDBPostProcess # The post-processing of the DB detection distillation task, inherited from the standard DBPostProcess class
model_name: ["Student", "Student2", "Teacher"] # Extract the output of multiple sub-networks and decode them. The network that does not require post-processing is not set in model_name
thresh: 0.3
box_thresh: 0.6
@@ -561,9 +606,9 @@ Model Structure
#### 2.2.5 Fine-tuning Distillation Model
There are three ways to fine-tune the detection distillation task:
-- `ch_PP-OCRv2_det_distill.yml`, The teacher model is set to the model provided by PaddleOCR or the large model you have trained.
-- `ch_PP-OCRv2_det_cml.yml`, Use cml distillation. Similarly, the Teacher model is set to the model provided by PaddleOCR or the large model you have trained.
-- `ch_PP-OCRv2_det_dml.yml`, Distillation using DML. The method of mutual distillation of the two Student models has an accuracy improvement of about 1.7% on the data set used by PaddleOCR.
+- `ch_PP-OCRv3_det_distill.yml`: the teacher model is set to the model provided by PaddleOCR or a large model you have trained yourself.
+- `ch_PP-OCRv3_det_cml.yml`: use CML distillation. Likewise, the teacher model is set to the model provided by PaddleOCR or a large model you have trained yourself.
+- `ch_PP-OCRv3_det_dml.yml`: use DML distillation. Mutual distillation of the two student models brings an accuracy improvement of about 1.7% on the dataset used by PaddleOCR.
In fine-tune, you need to set the pre-trained model to be loaded in the `pretrained` parameter of the network structure.
@@ -572,13 +617,13 @@ In terms of accuracy improvement, `cml` > `dml` > `distill`. When the amount of
In addition, since the distillation pre-training model provided by PaddleOCR contains multiple model parameters, if you want to extract the parameters of the student model, you can refer to the following code:
```sh
# Download the parameters of the distillation training model
-wget https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_distill_train.tar
+wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar
```
```python
import paddle
# Load the pre-trained model
-all_params = paddle.load("ch_PP-OCRv2_det_distill_train/best_accuracy.pdparams")
+all_params = paddle.load("ch_PP-OCRv3_det_distill_train/best_accuracy.pdparams")
# View the keys of the weight parameter
print(all_params.keys())
# Extract the weights of the student model
@@ -586,7 +631,7 @@ s_params = {key[len("Student."):]: all_params[key] for key in all_params if "Stu
# View the keys of the weight parameters of the student model
print(s_params.keys())
# Save
-paddle.save(s_params, "ch_PP-OCRv2_det_distill_train/student.pdparams")
+paddle.save(s_params, "ch_PP-OCRv3_det_distill_train/student.pdparams")
```
-Finally, the parameters of the student model will be saved in `ch_PP-OCRv2_det_distill_train/student.pdparams` for the fine-tune of the model.
+Finally, the parameters of the student model will be saved in `ch_PP-OCRv3_det_distill_train/student.pdparams` for model fine-tuning.
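+
+As a quick sanity check (an illustrative sketch, not part of the official workflow), the extracted weights can be reloaded to confirm that the `Student.` prefix has been stripped:
+
+```python
+import paddle
+
+# Reload the extracted student weights and verify the key names.
+s_params = paddle.load("ch_PP-OCRv3_det_distill_train/student.pdparams")
+assert all(not k.startswith("Student.") for k in s_params)
+print(len(s_params), "parameter tensors ready for fine-tuning")
+```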
diff --git a/doc/doc_en/models_list_en.md b/doc/doc_en/models_list_en.md
index e77a5ea96336e91cdd0490b687ca3652ba374df5..a61667b8d66a72d265c5ea9d3dbb9a2bff51de61 100644
--- a/doc/doc_en/models_list_en.md
+++ b/doc/doc_en/models_list_en.md
@@ -1,11 +1,14 @@
-# OCR Model List(V2.1, updated on 2021.9.6)
+# OCR Model List(V3, updated on 2022.4.28)
> **Note**
-> 1. Compared with the model v2.0, the 2.1 version of the detection model has a improvement in accuracy, and the 2.1 version of the recognition model has optimizations in accuracy and speed with CPU.
-> 2. Compared with [models 1.1](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/doc/doc_en/models_list_en.md), which are trained with static graph programming paradigm, models 2.0 are the dynamic graph trained version and achieve close performance.
+> 1. Compared with the model v2, the 3rd version of the detection model has an improvement in accuracy, and the 2.1 version of the recognition model has optimizations in accuracy and speed on CPU.
+> 2. Compared with [models 1.1](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/doc/doc_en/models_list_en.md), which are trained with static graph programming paradigm, models 2.0 or higher are the dynamic graph trained version and achieve close performance.
> 3. All models in this tutorial are all ppocr-series models, for more introduction of algorithms and models based on public dataset, you can refer to [algorithm overview tutorial](./algorithm_overview_en.md).
-- [OCR Model List(V2.1, updated on 2021.9.6)](#ocr-model-listv21-updated-on-202196)
+- [OCR Model List(V3, updated on 2022.4.28)]()
- [1. Text Detection Model](#1-text-detection-model)
+ - [1.1 Chinese Detection Model](#1.1)
+ - [1.2 English Detection Model](#1.2)
+ - [1.3 Multilingual Detection Model](#1.3)
- [2. Text Recognition Model](#2-text-recognition-model)
- [2.1 Chinese Recognition Model](#21-chinese-recognition-model)
- [2.2 English Recognition Model](#22-english-recognition-model)
@@ -28,14 +31,42 @@ Relationship of the above models is as follows.
## 1. Text Detection Model
+
+
+### 1.1 Chinese Detection Model
+
|model name|description|config|model size|download|
| --- | --- | --- | --- | --- |
-|ch_PP-OCRv2_det_slim|[New] slim quantization with distillation lightweight model, supporting Chinese, English, multilingual text detection|[ch_PP-OCRv2_det_cml.yml](../../configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml)| 3M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_slim_quant_infer.tar)|
-|ch_PP-OCRv2_det|[New] Original lightweight model, supporting Chinese, English, multilingual text detection|[ch_PP-OCRv2_det_cml.yml](../../configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml)|3M|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_distill_train.tar)|
+|ch_PP-OCRv3_det_slim| [New] slim quantization with distillation lightweight model, supporting Chinese, English, multilingual text detection |[ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml)| 1.1M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_infer.tar) / [trained model (coming soon)](https://paddleocr.bj.bcebos.com/PP-OCRv3/ch/ch_PP-OCRv3_det_slim_distill_train.tar) / [slim model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_infer.nb)|
+|ch_PP-OCRv3_det| [New] Original lightweight model, supporting Chinese, English, multilingual text detection |[ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml)| 3.8M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar)|
+|ch_PP-OCRv2_det_slim| slim quantization with distillation lightweight model, supporting Chinese, English, multilingual text detection|[ch_PP-OCRv2_det_cml.yml](../../configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml)| 3M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_slim_quant_infer.tar)|
+|ch_PP-OCRv2_det| Original lightweight model, supporting Chinese, English, multilingual text detection|[ch_PP-OCRv2_det_cml.yml](../../configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml)|3M|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_distill_train.tar)|
|ch_ppocr_mobile_slim_v2.0_det|Slim pruned lightweight model, supporting Chinese, English, multilingual text detection|[ch_det_mv3_db_v2.0.yml](../../configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml)|2.6M |[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/slim/ch_ppocr_mobile_v2.0_det_prune_infer.tar)|
|ch_ppocr_mobile_v2.0_det|Original lightweight model, supporting Chinese, English, multilingual text detection|[ch_det_mv3_db_v2.0.yml](../../configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml)|3M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_train.tar)|
|ch_ppocr_server_v2.0_det|General model, which is larger than the lightweight model, but achieved better performance|[ch_det_res18_db_v2.0.yml](../../configs/det/ch_ppocr_v2.0/ch_det_res18_db_v2.0.yml)|47M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_train.tar)|
+
+
+### 1.2 English Detection Model
+
+|model name|description|config|model size|download|
+| --- | --- | --- | --- | --- |
+|en_PP-OCRv3_det_slim | [New] Slim quantization with distillation lightweight detection model, supporting English | [ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml) | 1.1M |[inference model(coming soon)](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_slim_infer.tar) / [trained model (coming soon)](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_slim_distill_train.tar) / [slim model(coming soon)](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_slim_infer.nb) |
+|en_PP-OCRv3_det | [New] Original lightweight detection model, supporting English |[ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml)| 3.8M | [inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_distill_train.tar) |
+
+* Note: The English configuration file is the same as the Chinese one except for the training data, so only one configuration file is provided here.
+
+
+
+### 1.3 Multilingual Detection Model
+
+|model name|description|config|model size|download|
+| --- | --- | --- | --- | --- |
+| ml_PP-OCRv3_det_slim | [New] Slim quantization with distillation lightweight detection model, supporting multilingual text detection | [ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml) | 1.1M | [inference model(coming soon)](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/Multilingual_PP-OCRv3_det_slim_infer.tar) / [trained model (coming soon)](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/Multilingual_PP-OCRv3_det_slim_distill_train.tar) / [slim model(coming soon)](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/Multilingual_PP-OCRv3_det_slim_infer.nb) |
+| ml_PP-OCRv3_det |[New] Original lightweight detection model, supporting multilingual text detection | [ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml)| 3.8M | [inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/Multilingual_PP-OCRv3_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/Multilingual_PP-OCRv3_det_distill_train.tar) |
+
+* Note: The multilingual configuration file is the same as the Chinese one except for the training data, so only one configuration file is provided here.
+
## 2. Text Recognition Model
@@ -44,8 +75,10 @@ Relationship of the above models is as follows.
|model name|description|config|model size|download|
| --- | --- | --- | --- | --- |
-|ch_PP-OCRv2_rec_slim|[New] Slim qunatization with distillation lightweight model, supporting Chinese, English, multilingual text recognition|[ch_PP-OCRv2_rec.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec.yml)| 9M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_train.tar) |
-|ch_PP-OCRv2_rec|[New] Original lightweight model, supporting Chinese, English, multilingual text recognition|[ch_PP-OCRv2_rec_distillation.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec_distillation.yml)|8.5M|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_train.tar) |
+|ch_PP-OCRv3_rec_slim | [New] Slim quantization with distillation lightweight model, supporting Chinese, English text recognition |[ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml)| 4.9M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/ch/ch_PP-OCRv3_rec_slim_train.tar) / [slim model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.nb) |
+|ch_PP-OCRv3_rec| [New] Original lightweight model, supporting Chinese, English, multilingual text recognition |[ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml)| 12.4M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_train.tar) |
+|ch_PP-OCRv2_rec_slim| Slim quantization with distillation lightweight model, supporting Chinese, English text recognition|[ch_PP-OCRv2_rec.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec.yml)| 9M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_train.tar) |
+|ch_PP-OCRv2_rec| Original lightweight model, supporting Chinese, English, multilingual text recognition |[ch_PP-OCRv2_rec_distillation.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec_distillation.yml)|8.5M|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_train.tar) |
|ch_ppocr_mobile_slim_v2.0_rec|Slim pruned and quantized lightweight model, supporting Chinese, English and number recognition|[rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml)| 6M | [inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_slim_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_slim_train.tar) |
|ch_ppocr_mobile_v2.0_rec|Original lightweight model, supporting Chinese, English and number recognition|[rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml)|5.2M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_train.tar) / [pre-trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_pre.tar) |
|ch_ppocr_server_v2.0_rec|General model, supporting Chinese, English and number recognition|[rec_chinese_common_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_common_train_v2.0.yml)|94.8M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_train.tar) / [pre-trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_pre.tar) |
@@ -58,6 +91,8 @@ Relationship of the above models is as follows.
|model name|description|config|model size|download|
| --- | --- | --- | --- | --- |
+|en_PP-OCRv3_rec_slim | [New] Slim quantization with distillation lightweight model, supporting English text recognition |[en_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/en_PP-OCRv3_rec_distillation.yml)| 4.9M |[inference model(coming soon)](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_slim_infer.tar) / [trained model (coming soon)](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_slim_train.tar) / [slim model(coming soon)](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_slim_infer.nb) |
+|en_PP-OCRv3_rec| [New] Original lightweight model, supporting English text recognition |[en_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/en_PP-OCRv3_rec_distillation.yml)| 12.4M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_train.tar) |
|en_number_mobile_slim_v2.0_rec|Slim pruned and quantized lightweight model, supporting English and number recognition|[rec_en_number_lite_train.yml](../../configs/rec/multi_language/rec_en_number_lite_train.yml)| 2.7M | [inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/en_number_mobile_v2.0_rec_slim_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/en_number_mobile_v2.0_rec_slim_train.tar) |
|en_number_mobile_v2.0_rec|Original lightweight model, supporting English and number recognition|[rec_en_number_lite_train.yml](../../configs/rec/multi_language/rec_en_number_lite_train.yml)|2.6M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/en_number_mobile_v2.0_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/en_number_mobile_v2.0_rec_train.tar) |
diff --git a/doc/doc_en/ppocr_introduction_en.md b/doc/doc_en/ppocr_introduction_en.md
index 1a8a77006a753f52cc9b36cea5c85bd5ae7f1606..d8af8d9ee31dd4ab63b8e22b8f1c59f64ee10f38 100644
--- a/doc/doc_en/ppocr_introduction_en.md
+++ b/doc/doc_en/ppocr_introduction_en.md
@@ -32,9 +32,18 @@ PP-OCR system is in continuous optimization. At present, PP-OCR and PP-OCRv2 hav
[2] On the basis of PP-OCR, PP-OCRv2 is further optimized in five aspects. The detection model adopts CML(Collaborative Mutual Learning) knowledge distillation strategy and CopyPaste data expansion strategy. The recognition model adopts LCNet lightweight backbone network, U-DML knowledge distillation strategy and enhanced CTC loss function improvement (as shown in the red box above), which further improves the inference speed and prediction effect. For more details, please refer to the technical report of PP-OCRv2 (https://arxiv.org/abs/2109.03144).
+[3] PP-OCRv3 is a further upgrade on the basis of PP-OCRv2. The detection model is still based on the DB algorithm, and the optimization strategies include a newly proposed FPN structure with a residual attention mechanism named RSEFPN, a PAN structure with an enlarged receptive field named LKPAN, and a better teacher model trained with DML. The recognition model replaces the CRNN base model with [SVTR](https://arxiv.org/abs/2205.00159) (IJCAI 2022), and adopts a lightweight SVTR network, guided training of CTC, the RecConAug data augmentation strategy, a better pre-trained model obtained by self-supervised training, and the use of unlabeled data to further improve the model. For more details, please refer to the PP-OCRv3 [technical report](./PP-OCRv3_introduction_en.md).
+
+PP-OCRv3 pipeline is as follows:
+
+
+

+
+
## 2. Features
+- Ultra lightweight PP-OCRv3 series models: detection (3.6M) + direction classifier (1.4M) + recognition (12M) = 17.0M
- Ultra lightweight PP-OCRv2 series models: detection (3.1M) + direction classifier (1.4M) + recognition (8.5M) = 13.0M
- Ultra lightweight PP-OCR mobile series models: detection (3.0M) + direction classifier (1.4M) + recognition (5.0M) = 9.4M
- General PP-OCR server series models: detection (47.1M) + direction classifier (1.4M) + recognition (94.9M) = 143.4M
@@ -51,7 +60,7 @@ For the performance comparison between PP-OCR series models, please check the [b
PP-OCRv2 English model
-
+
@@ -69,20 +78,20 @@ For the performance comparison between PP-OCR series models, please check the [b
-
+
PP-OCRv2 Multilingual model
-
+
-
+
-
+
## 5. Tutorial
@@ -101,10 +110,12 @@ For more tutorials, including model training, model compression, deployment, etc
## 6. Model zoo
-## PP-OCR Series Model List(Update on September 8th)
+## PP-OCR Series Model List(Update on 2022.04.28)
| Model introduction | Model name | Recommended scene | Detection model | Direction classifier | Recognition model |
| ------------------------------------------------------------ | ---------------------------- | ----------------- | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ |
+| Chinese and English ultra-lightweight PP-OCRv3 model(16.2M) | ch_PP-OCRv3_xx | Mobile & Server | [inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar) | [inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | [inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_train.tar) |
+| English ultra-lightweight PP-OCRv3 model(13.4M) | en_PP-OCRv3_xx | Mobile & Server | [inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_distill_train.tar) | [inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | [inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_train.tar) |
| Chinese and English ultra-lightweight PP-OCRv2 model(11.6M) | ch_PP-OCRv2_xx |Mobile & Server|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_distill_train.tar)| [inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_train.tar)|
| Chinese and English ultra-lightweight PP-OCR model (9.4M) | ch_ppocr_mobile_v2.0_xx | Mobile & server |[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_train.tar)|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) |[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_train.tar) |
| Chinese and English general PP-OCR model (143.4M) | ch_ppocr_server_v2.0_xx | Server |[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_train.tar) |[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) |[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_train.tar) |
diff --git a/doc/doc_en/quickstart_en.md b/doc/doc_en/quickstart_en.md
index 8a9c38069f384dcef06db60f6b1266e6eb116d84..7243e2db927a1cc89f8ac4d63c2a5a722de393d5 100644
--- a/doc/doc_en/quickstart_en.md
+++ b/doc/doc_en/quickstart_en.md
@@ -73,6 +73,8 @@ cd /path/to/ppocr_img
If you do not use the provided test image, you can replace the following `--image_dir` parameter with the corresponding test image path
+**Note**: The whl package uses the `PP-OCRv3` model by default, and the input shape used by its recognition model is `3,48,320`, so if you use the recognition function you need to add the parameter `--rec_image_shape 3,48,320`. If you do not use the default `PP-OCRv3` model, you do not need to set this parameter.
+
#### 2.1.1 Chinese and English Model
@@ -80,15 +82,15 @@ If you do not use the provided test image, you can replace the following `--imag
* Detection, direction classification and recognition: set the parameter`--use_gpu false` to disable the gpu device
```bash
- paddleocr --image_dir ./imgs_en/img_12.jpg --use_angle_cls true --lang en --use_gpu false
+ paddleocr --image_dir ./imgs_en/img_12.jpg --use_angle_cls true --lang en --use_gpu false --rec_image_shape 3,48,320
```
Output will be a list, each item contains bounding box, text and recognition confidence
```bash
- [[[442.0, 173.0], [1169.0, 173.0], [1169.0, 225.0], [442.0, 225.0]], ['ACKNOWLEDGEMENTS', 0.99283075]]
- [[[393.0, 340.0], [1207.0, 342.0], [1207.0, 389.0], [393.0, 387.0]], ['We would like to thank all the designers and', 0.9357758]]
- [[[399.0, 398.0], [1204.0, 398.0], [1204.0, 433.0], [399.0, 433.0]], ['contributors whohave been involved in the', 0.9592447]]
+ [[[441.0, 174.0], [1166.0, 176.0], [1165.0, 222.0], [441.0, 221.0]], ('ACKNOWLEDGEMENTS', 0.9971134662628174)]
+ [[[403.0, 346.0], [1204.0, 348.0], [1204.0, 384.0], [402.0, 383.0]], ('We would like to thank all the designers and', 0.9761400818824768)]
+ [[[403.0, 396.0], [1204.0, 398.0], [1204.0, 434.0], [402.0, 433.0]], ('contributors who have been involved in the', 0.9791957139968872)]
......
```
@@ -101,33 +103,33 @@ If you do not use the provided test image, you can replace the following `--imag
Output will be a list, each item only contains bounding box
```bash
- [[756.0, 812.0], [805.0, 812.0], [805.0, 830.0], [756.0, 830.0]]
- [[820.0, 803.0], [1085.0, 801.0], [1085.0, 836.0], [820.0, 838.0]]
- [[393.0, 801.0], [715.0, 805.0], [715.0, 839.0], [393.0, 836.0]]
+ [[397.0, 802.0], [1092.0, 802.0], [1092.0, 841.0], [397.0, 841.0]]
+ [[397.0, 750.0], [1211.0, 750.0], [1211.0, 789.0], [397.0, 789.0]]
+ [[397.0, 702.0], [1209.0, 698.0], [1209.0, 734.0], [397.0, 738.0]]
......
```
* Only recognition: set `--det` to `false`
```bash
- paddleocr --image_dir ./imgs_words_en/word_10.png --det false --lang en
+ paddleocr --image_dir ./imgs_words_en/word_10.png --det false --lang en --rec_image_shape 3,48,320
```
Output will be a list, each item contains text and recognition confidence
```bash
- ['PAIN', 0.990372]
+ ['PAIN', 0.9934559464454651]
```
-If you need to use the 2.0 model, please specify the parameter `--version PP-OCR`, paddleocr uses the 2.1 model by default(`--versioin PP-OCRv2`). More whl package usage can be found in [whl package](./whl_en.md)
+If you need to use the 2.0 model, please specify the parameter `--version PP-OCR`; paddleocr uses the PP-OCRv3 model by default (`--version PP-OCRv3`). More whl package usage can be found in [whl package](./whl_en.md)
#### 2.1.2 Multi-language Model
-Paddleocr currently supports 80 languages, which can be switched by modifying the `--lang` parameter.
+PaddleOCR currently supports 80 languages, which can be switched by modifying the `--lang` parameter. PP-OCRv3 currently provides Chinese and English models only; other multilingual models will be added successively.
``` bash
-paddleocr --image_dir ./doc/imgs_en/254.jpg --lang=en
+paddleocr --image_dir ./doc/imgs_en/254.jpg --lang=en --rec_image_shape 3,48,320
```
@@ -137,13 +139,9 @@ paddleocr --image_dir ./doc/imgs_en/254.jpg --lang=en
The result is a list, each item contains a text box, text and recognition confidence
```text
-[('PHO CAPITAL', 0.95723116), [[66.0, 50.0], [327.0, 44.0], [327.0, 76.0], [67.0, 82.0]]]
-[('107 State Street', 0.96311164), [[72.0, 90.0], [451.0, 84.0], [452.0, 116.0], [73.0, 121.0]]]
-[('Montpelier Vermont', 0.97389287), [[69.0, 132.0], [501.0, 126.0], [501.0, 158.0], [70.0, 164.0]]]
-[('8022256183', 0.99810505), [[71.0, 175.0], [363.0, 170.0], [364.0, 202.0], [72.0, 207.0]]]
-[('REG 07-24-201706:59 PM', 0.93537045), [[73.0, 299.0], [653.0, 281.0], [654.0, 318.0], [74.0, 336.0]]]
-[('045555', 0.99346405), [[509.0, 331.0], [651.0, 325.0], [652.0, 356.0], [511.0, 362.0]]]
-[('CT1', 0.9988654), [[535.0, 367.0], [654.0, 367.0], [654.0, 406.0], [535.0, 406.0]]]
+[[[67.0, 51.0], [327.0, 46.0], [327.0, 74.0], [68.0, 80.0]], ('PHOCAPITAL', 0.9944712519645691)]
+[[[72.0, 92.0], [453.0, 84.0], [454.0, 114.0], [73.0, 122.0]], ('107 State Street', 0.9744491577148438)]
+[[[69.0, 135.0], [501.0, 125.0], [501.0, 156.0], [70.0, 165.0]], ('Montpelier Vermont', 0.9357033967971802)]
......
```
@@ -234,10 +232,10 @@ im_show.save('result.jpg')
Output will be a list, each item contains bounding box, text and recognition confidence
```bash
-[[[442.0, 173.0], [1169.0, 173.0], [1169.0, 225.0], [442.0, 225.0]], ['ACKNOWLEDGEMENTS', 0.99283075]]
-[[[393.0, 340.0], [1207.0, 342.0], [1207.0, 389.0], [393.0, 387.0]], ['We would like to thank all the designers and', 0.9357758]]
-[[[399.0, 398.0], [1204.0, 398.0], [1204.0, 433.0], [399.0, 433.0]], ['contributors whohave been involved in the', 0.9592447]]
-......
+[[[441.0, 174.0], [1166.0, 176.0], [1165.0, 222.0], [441.0, 221.0]], ('ACKNOWLEDGEMENTS', 0.9971134662628174)]
+[[[403.0, 346.0], [1204.0, 348.0], [1204.0, 384.0], [402.0, 383.0]], ('We would like to thank all the designers and', 0.9761400818824768)]
+[[[403.0, 396.0], [1204.0, 398.0], [1204.0, 434.0], [402.0, 433.0]], ('contributors who have been involved in the', 0.9791957139968872)]
+......
```
Visualization of results
diff --git a/doc/doc_en/recognition_en.md b/doc/doc_en/recognition_en.md
index a6b255f9501a0c2d34c31162385bcf03ff578aa3..60b4a1b26b373adc562ab9624e55ffe59a775a35 100644
--- a/doc/doc_en/recognition_en.md
+++ b/doc/doc_en/recognition_en.md
@@ -1,7 +1,7 @@
# Text Recognition
- [1. Data Preparation](#DATA_PREPARATION)
- * [1.1 Costom Dataset](#Costom_Dataset)
+ * [1.1 Custom Dataset](#Custom_Dataset)
* [1.2 Dataset Download](#Dataset_download)
* [1.3 Dictionary](#Dictionary)
* [1.4 Add Space Category](#Add_space_category)
@@ -28,7 +28,34 @@
To prepare datasets, refer to [ocr_datasets](./dataset/ocr_datasets.md) .
-If you want to reproduce the paper SAR, you need to download extra dataset [SynthAdd](https://pan.baidu.com/share/init?surl=uV0LtoNmcxbO-0YA7Ch4dg), extraction code: 627x. Besides, icdar2013, icdar2015, cocotext, IIIT5k datasets are also used to train. For specific details, please refer to the paper SAR.
+PaddleOCR provides label files for training the icdar2015 dataset, which can be downloaded in the following ways:
+
+```
+# Training set label
+wget -P ./train_data/ic15_data https://paddleocr.bj.bcebos.com/dataset/rec_gt_train.txt
+# Test Set Label
+wget -P ./train_data/ic15_data https://paddleocr.bj.bcebos.com/dataset/rec_gt_test.txt
+```
+
+PaddleOCR also provides a data format conversion script, which can convert the labels from the ICDAR official website into the data format
+supported by PaddleOCR. The conversion tool is in `ppocr/utils/gen_label.py`; here the training set is taken as an example:
+
+```
+# convert the official gt to rec_gt_label.txt
+python gen_label.py --mode="rec" --input_path="{path/of/origin/label}" --output_label="rec_gt_label.txt"
+```
+
+The data format is as follows: (a) is the original picture, and (b) is the Ground Truth text file corresponding to each picture:
+
+
+
+
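+For reference, an illustrative sketch of what such an annotation file looks like (the entries below are examples, not taken from the actual label file): each line contains the image path and its transcription, separated by a tab character.
+
+```
+train/word_1.png	Genaxis Theatre
+train/word_2.png	[06]
+train/word_3.png	62-03
+```
+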
+- Multilingual dataset
+
+The multi-language model training method is the same as for the Chinese model. The training set consists of 1 million synthetic images. A small number of fonts and test images can be downloaded via the following two methods.
+* [Baidu Netdisk](https://pan.baidu.com/s/1bS_u207Rm7YbY33wOECKDA) ,Extraction code:frgi.
+* [Google drive](https://drive.google.com/file/d/18cSWX7wXSy4G0tbKJ0d9PuIaiwRLHpjA/view)
+
### 1.2 Dictionary
@@ -101,11 +128,11 @@ First download the pretrain model, you can download the trained model to finetun
```
cd PaddleOCR/
-# Download the pre-trained model of MobileNetV3
-wget -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_mv3_none_bilstm_ctc_v2.0_train.tar
+# Download the pre-trained model of en_PP-OCRv3
+wget -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_train.tar
# Decompress model parameters
cd pretrain_models
-tar -xf rec_mv3_none_bilstm_ctc_v2.0_train.tar && rm -rf rec_mv3_none_bilstm_ctc_v2.0_train.tar
+tar -xf en_PP-OCRv3_rec_train.tar && rm -rf en_PP-OCRv3_rec_train.tar
```
Start training:
@@ -115,9 +142,10 @@ Start training:
# Training icdar15 English data and The training log will be automatically saved as train.log under "{save_model_dir}"
#specify the single card training(Long training time, not recommended)
-python3 tools/train.py -c configs/rec/rec_icdar15_train.yml
+python3 tools/train.py -c configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml -o Global.pretrained_model=en_PP-OCRv3_rec_train/best_accuracy
+
#specify the card number through --gpus
-python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_icdar15_train.yml
+python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml -o Global.pretrained_model=en_PP-OCRv3_rec_train/best_accuracy
```
@@ -125,31 +153,13 @@ PaddleOCR supports alternating training and evaluation. You can modify `eval_bat
If the evaluation set is large, the test will be time-consuming. It is recommended to reduce the number of evaluations, or evaluate after training.
-* Tip: You can use the `-c` parameter to select multiple model configurations under the `configs/rec/` path for training. The recognition algorithms supported by PaddleOCR are:
-
-
-| Configuration file | Algorithm | backbone | trans | seq | pred |
-| :--------: | :-------: | :-------: | :-------: | :-----: | :-----: |
-| [rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml) | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc |
-| [rec_chinese_common_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_common_train_v2.0.yml) | CRNN | ResNet34_vd | None | BiLSTM | ctc |
-| rec_chinese_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc |
-| rec_chinese_common_train.yml | CRNN | ResNet34_vd | None | BiLSTM | ctc |
-| rec_icdar15_train.yml | CRNN | Mobilenet_v3 large 0.5 | None | BiLSTM | ctc |
-| rec_mv3_none_bilstm_ctc.yml | CRNN | Mobilenet_v3 large 0.5 | None | BiLSTM | ctc |
-| rec_mv3_none_none_ctc.yml | Rosetta | Mobilenet_v3 large 0.5 | None | None | ctc |
-| rec_r34_vd_none_bilstm_ctc.yml | CRNN | Resnet34_vd | None | BiLSTM | ctc |
-| rec_r34_vd_none_none_ctc.yml | Rosetta | Resnet34_vd | None | None | ctc |
-| rec_mv3_tps_bilstm_att.yml | CRNN | Mobilenet_v3 | TPS | BiLSTM | att |
-| rec_r34_vd_tps_bilstm_att.yml | CRNN | Resnet34_vd | TPS | BiLSTM | att |
-| rec_r50fpn_vd_none_srn.yml | SRN | Resnet50_fpn_vd | None | rnn | srn |
-| rec_mtb_nrtr.yml | NRTR | nrtr_mtb | None | transformer encoder | transformer decoder |
-| rec_r31_sar.yml | SAR | ResNet31 | None | LSTM encoder | LSTM decoder |
+* Tip: You can use the `-c` parameter to select from the model configurations under the `configs/rec/` path for training. The supported recognition algorithms are listed in [rec_algorithm](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/doc/doc_en/algorithm_overview.md).
For training Chinese data, it is recommended to use
-[rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml). If you want to try the result of other algorithms on the Chinese data set, please refer to the following instructions to modify the configuration file:
-co
-Take `rec_chinese_lite_train_v2.0.yml` as an example:
+[ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml). If you want to try the result of other algorithms on the Chinese data set, please refer to the following instructions to modify the configuration file:
+
+Take `ch_PP-OCRv3_rec_distillation.yml` as an example:
```
Global:
...
@@ -183,7 +193,7 @@ Train:
...
- RecResizeImg:
# Modify image_shape to fit long text
- image_shape: [3, 32, 320]
+ image_shape: [3, 48, 320]
...
loader:
...
@@ -203,7 +213,7 @@ Eval:
...
- RecResizeImg:
# Modify image_shape to fit long text
- image_shape: [3, 32, 320]
+ image_shape: [3, 48, 320]
...
loader:
# Eval batch_size for Single card
@@ -380,11 +390,12 @@ Running on a DCU device requires setting the environment variable `export HIP_VI
### 3.1 Evaluation
-The model parameters during training are saved in the `Global.save_model_dir` directory by default. When evaluating indicators, you need to set `Global.checkpoints` to point to the saved parameter file. The evaluation dataset can be set by modifying the `Eval.dataset.label_file_list` field in the `configs/rec/rec_icdar15_train.yml` file.
+The model parameters during training are saved in the `Global.save_model_dir` directory by default. When evaluating indicators, you need to set `Global.checkpoints` to point to the saved parameter file. The evaluation dataset can be set by modifying the `Eval.dataset.label_file_list` field in the `configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml` file.
+
```
# GPU evaluation, Global.checkpoints is the weight to be tested
-python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_icdar15_train.yml -o Global.checkpoints={path/to/weights}/best_accuracy
+python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml -o Global.checkpoints={path/to/weights}/best_accuracy
```
@@ -417,7 +428,7 @@ Among them, best_accuracy.* is the best model on the evaluation set; iter_epoch_
```
# Predict English results
-python3 tools/infer_rec.py -c configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.load_static_weights=false Global.infer_img=doc/imgs_words/en/word_1.jpg
+python3 tools/infer_rec.py -c configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.infer_img=doc/imgs_words/en/word_1.png
```
@@ -436,7 +447,7 @@ The configuration file used for prediction must be consistent with the training.
```
# Predict Chinese results
-python3 tools/infer_rec.py -c configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.load_static_weights=false Global.infer_img=doc/imgs_words/ch/word_1.jpg
+python3 tools/infer_rec.py -c configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.infer_img=doc/imgs_words/ch/word_1.jpg
```
Input image:
@@ -467,7 +478,7 @@ The recognition model is converted to the inference model in the same way as the
# Global.pretrained_model parameter Set the training model address to be converted without adding the file suffix .pdmodel, .pdopt or .pdparams.
# Global.save_inference_dir Set the address where the converted model will be saved.
-python3 tools/export_model.py -c configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml -o Global.pretrained_model=./ch_lite/ch_ppocr_mobile_v2.0_rec_train/best_accuracy Global.save_inference_dir=./inference/rec_crnn/
+python3 tools/export_model.py -c configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml -o Global.pretrained_model=en_PP-OCRv3_rec_train/best_accuracy Global.save_inference_dir=./inference/en_PP-OCRv3_rec/
```
If you have a model trained on your own dataset with a different dictionary file, please make sure that you modify the `character_dict_path` in the configuration file to your dictionary file path.
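+
+If needed, the exported model can be given a quick sanity check with the Paddle Inference API (an illustrative sketch; the paths assume the export command above):
+
+```python
+from paddle import inference
+
+# Build a predictor from the exported files to confirm the export is loadable.
+config = inference.Config("./inference/en_PP-OCRv3_rec/inference.pdmodel",
+                          "./inference/en_PP-OCRv3_rec/inference.pdiparams")
+config.disable_gpu()  # run the check on CPU
+predictor = inference.create_predictor(config)
+print(predictor.get_input_names())
+```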
@@ -475,7 +486,8 @@ If you have a model trained on your own dataset with a different dictionary file
After the conversion is successful, there are three files in the model save directory:
```
-inference/rec_crnn/
+inference/en_PP-OCRv3_rec/
├── inference.pdiparams # The parameter file of recognition inference model
├── inference.pdiparams.info # The parameter information of recognition inference model, which can be ignored
└── inference.pdmodel # The program file of recognition model
diff --git a/doc/doc_en/whl_en.md b/doc/doc_en/whl_en.md
index 35b2b1798ad8b566ee87e921e23be84a5ecccf24..40a2e122d19679a59e7e65df29dd59781b4a2143 100644
--- a/doc/doc_en/whl_en.md
+++ b/doc/doc_en/whl_en.md
@@ -172,40 +172,42 @@ show help information
paddleocr -h
```
+**Note**: The whl package uses the `PP-OCRv3` model by default, and the input shape used by its recognition model is `3,48,320`, so if you use the recognition function you need to add the parameter `--rec_image_shape 3,48,320`. If you do not use the default `PP-OCRv3` model, you do not need to set this parameter.
+
* detection classification and recognition
```bash
-paddleocr --image_dir PaddleOCR/doc/imgs_en/img_12.jpg --use_angle_cls true --lang en
+paddleocr --image_dir PaddleOCR/doc/imgs_en/img_12.jpg --use_angle_cls true --lang en --rec_image_shape 3,48,320
```
Output will be a list, each item contains bounding box, text and recognition confidence
```bash
-[[[442.0, 173.0], [1169.0, 173.0], [1169.0, 225.0], [442.0, 225.0]], ['ACKNOWLEDGEMENTS', 0.99283075]]
-[[[393.0, 340.0], [1207.0, 342.0], [1207.0, 389.0], [393.0, 387.0]], ['We would like to thank all the designers and', 0.9357758]]
-[[[399.0, 398.0], [1204.0, 398.0], [1204.0, 433.0], [399.0, 433.0]], ['contributors whohave been involved in the', 0.9592447]]
+[[[441.0, 174.0], [1166.0, 176.0], [1165.0, 222.0], [441.0, 221.0]], ('ACKNOWLEDGEMENTS', 0.9971134662628174)]
+[[[403.0, 346.0], [1204.0, 348.0], [1204.0, 384.0], [402.0, 383.0]], ('We would like to thank all the designers and', 0.9761400818824768)]
+[[[403.0, 396.0], [1204.0, 398.0], [1204.0, 434.0], [402.0, 433.0]], ('contributors who have been involved in the', 0.9791957139968872)]
......
```
* detection and recognition
```bash
-paddleocr --image_dir PaddleOCR/doc/imgs_en/img_12.jpg --lang en
+paddleocr --image_dir PaddleOCR/doc/imgs_en/img_12.jpg --lang en --rec_image_shape 3,48,320
```
Output will be a list, each item contains bounding box, text and recognition confidence
```bash
-[[[442.0, 173.0], [1169.0, 173.0], [1169.0, 225.0], [442.0, 225.0]], ['ACKNOWLEDGEMENTS', 0.99283075]]
-[[[393.0, 340.0], [1207.0, 342.0], [1207.0, 389.0], [393.0, 387.0]], ['We would like to thank all the designers and', 0.9357758]]
-[[[399.0, 398.0], [1204.0, 398.0], [1204.0, 433.0], [399.0, 433.0]], ['contributors whohave been involved in the', 0.9592447]]
+[[[441.0, 174.0], [1166.0, 176.0], [1165.0, 222.0], [441.0, 221.0]], ('ACKNOWLEDGEMENTS', 0.9971134662628174)]
+[[[403.0, 346.0], [1204.0, 348.0], [1204.0, 384.0], [402.0, 383.0]], ('We would like to thank all the designers and', 0.9761400818824768)]
+[[[403.0, 396.0], [1204.0, 398.0], [1204.0, 434.0], [402.0, 433.0]], ('contributors who have been involved in the', 0.9791957139968872)]
......
```
* classification and recognition
```bash
-paddleocr --image_dir PaddleOCR/doc/imgs_words_en/word_10.png --use_angle_cls true --det false --lang en
+paddleocr --image_dir PaddleOCR/doc/imgs_words_en/word_10.png --use_angle_cls true --det false --lang en --rec_image_shape 3,48,320
```
Output will be a list, each item contains text and recognition confidence
```bash
-['PAIN', 0.990372]
+['PAIN', 0.9934559464454651]
```
* only detection
@@ -215,20 +217,20 @@ paddleocr --image_dir PaddleOCR/doc/imgs_en/img_12.jpg --rec false
Output will be a list, each item only contains bounding box
```bash
-[[756.0, 812.0], [805.0, 812.0], [805.0, 830.0], [756.0, 830.0]]
-[[820.0, 803.0], [1085.0, 801.0], [1085.0, 836.0], [820.0, 838.0]]
-[[393.0, 801.0], [715.0, 805.0], [715.0, 839.0], [393.0, 836.0]]
+[[397.0, 802.0], [1092.0, 802.0], [1092.0, 841.0], [397.0, 841.0]]
+[[397.0, 750.0], [1211.0, 750.0], [1211.0, 789.0], [397.0, 789.0]]
+[[397.0, 702.0], [1209.0, 698.0], [1209.0, 734.0], [397.0, 738.0]]
......
```
* only recognition
```bash
-paddleocr --image_dir PaddleOCR/doc/imgs_words_en/word_10.png --det false --lang en
+paddleocr --image_dir PaddleOCR/doc/imgs_words_en/word_10.png --det false --lang en --rec_image_shape 3,48,320
```
Output will be a list, each item contains text and recognition confidence
```bash
-['PAIN', 0.990372]
+['PAIN', 0.9934559464454651]
```
* only classification
@@ -366,5 +368,4 @@ im_show.save('result.jpg')
| cls | Enable classification when the `ppocr.ocr` func is executed (use `use_angle_cls` in command line mode to control whether to enable classification in the forward direction) | FALSE |
| show_log | Whether to print log| FALSE |
| type | Perform ocr or table structuring, the value is selected in ['ocr','structure'] | ocr |
-| ocr_version | OCR Model version number, the current model support list is as follows: PP-OCRv2 support Chinese detection and recognition model, PP-OCR support Chinese detection, recognition and direction classifier, multilingual recognition model | PP-OCRv2 |
-| structure_version | table structure Model version number, the current model support list is as follows: STRUCTURE support english table structure model | STRUCTURE |
+| ocr_version | OCR model version number. The currently supported versions are: PP-OCRv3 (Chinese and English detection and recognition models, and direction classifier), PP-OCRv2 (Chinese detection and recognition models), PP-OCR (Chinese detection, recognition and direction classifier, and multilingual recognition models) | PP-OCRv3 |
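+
+The same parameters can also be passed as keyword arguments to the Python API. A minimal sketch (illustrative, mirroring the CLI flags described above; it assumes the whl package accepts these parameters as keyword arguments):
+
+```python
+from paddleocr import PaddleOCR
+
+# Pin the model version and recognition input shape explicitly.
+ocr = PaddleOCR(lang='en', use_angle_cls=True, show_log=False,
+                ocr_version='PP-OCRv3', rec_image_shape='3,48,320')
+result = ocr.ocr('PaddleOCR/doc/imgs_en/img_12.jpg', cls=True)
+for line in result:
+    print(line)  # [box, (text, confidence)]
+```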
diff --git a/doc/features.png b/doc/features.png
index 7d6342c1eb83e0544df0045a0cfa71bc083022fe..6295c902adc007446f72a2ff08793198f9ac2c4a 100644
Binary files a/doc/features.png and b/doc/features.png differ
diff --git a/doc/features_en.png b/doc/features_en.png
index 9f0a66299bb5e922257e3327b0c6cf2d3ebfe05b..6ec1ed7bdb50cd05d1cd280c635b0aea19725cb2 100644
Binary files a/doc/features_en.png and b/doc/features_en.png differ
diff --git a/doc/imgs_results/system_res_00018069_v3.jpg b/doc/imgs_results/system_res_00018069_v3.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..51808ca556b52239cad3602cd5602f4d5d0ab7ce
Binary files /dev/null and b/doc/imgs_results/system_res_00018069_v3.jpg differ
diff --git a/doc/ppocr_v3/GTC.png b/doc/ppocr_v3/GTC.png
new file mode 100644
index 0000000000000000000000000000000000000000..2af2261d51d2279f171727a5a0b5a8d974763d80
Binary files /dev/null and b/doc/ppocr_v3/GTC.png differ
diff --git a/doc/ppocr_v3/LKPAN.png b/doc/ppocr_v3/LKPAN.png
new file mode 100644
index 0000000000000000000000000000000000000000..ff0578f6901603185809e10c85793c212c40dc48
Binary files /dev/null and b/doc/ppocr_v3/LKPAN.png differ
diff --git a/doc/ppocr_v3/RSEFPN.png b/doc/ppocr_v3/RSEFPN.png
new file mode 100644
index 0000000000000000000000000000000000000000..87f7f69fb516d496c9357d81b97e5bdb750f808a
Binary files /dev/null and b/doc/ppocr_v3/RSEFPN.png differ
diff --git a/doc/ppocr_v3/SSL.png b/doc/ppocr_v3/SSL.png
new file mode 100644
index 0000000000000000000000000000000000000000..1344a2a77cf50c4ffe044e9ebc565215a2d7efbc
Binary files /dev/null and b/doc/ppocr_v3/SSL.png differ
diff --git a/doc/ppocr_v3/UDML.png b/doc/ppocr_v3/UDML.png
new file mode 100644
index 0000000000000000000000000000000000000000..3b59bc58bc1e7cb0ae0ffab30b41bc519410c794
Binary files /dev/null and b/doc/ppocr_v3/UDML.png differ
diff --git a/doc/ppocr_v3/ppocr_v3.png b/doc/ppocr_v3/ppocr_v3.png
new file mode 100644
index 0000000000000000000000000000000000000000..123c125acdcbc9e2ef6e4d6a0a1c92d01136ffde
Binary files /dev/null and b/doc/ppocr_v3/ppocr_v3.png differ
diff --git a/doc/ppocr_v3/recconaug.png b/doc/ppocr_v3/recconaug.png
new file mode 100644
index 0000000000000000000000000000000000000000..899bc430de897cbd8741e07bfa796a58bf2c7715
Binary files /dev/null and b/doc/ppocr_v3/recconaug.png differ
diff --git a/doc/ppocr_v3/svtr_g2.png b/doc/ppocr_v3/svtr_g2.png
new file mode 100644
index 0000000000000000000000000000000000000000..d589891d5897533243845a993bd56d8f75726cfc
Binary files /dev/null and b/doc/ppocr_v3/svtr_g2.png differ
diff --git a/doc/ppocr_v3/svtr_g4.png b/doc/ppocr_v3/svtr_g4.png
new file mode 100644
index 0000000000000000000000000000000000000000..234a85c44b2cc3d968942480a596b2be5e45f53d
Binary files /dev/null and b/doc/ppocr_v3/svtr_g4.png differ
diff --git a/doc/ppocr_v3/svtr_tiny.jpg b/doc/ppocr_v3/svtr_tiny.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..26261047ef253e9802956f4c64449870d10de850
Binary files /dev/null and b/doc/ppocr_v3/svtr_tiny.jpg differ
diff --git a/doc/ppocr_v3/svtr_tiny.png b/doc/ppocr_v3/svtr_tiny.png
new file mode 100644
index 0000000000000000000000000000000000000000..91b3eacb9f1242806ad3520cc36252351fc7baf1
Binary files /dev/null and b/doc/ppocr_v3/svtr_tiny.png differ
diff --git a/doc/ppocrv3_framework.png b/doc/ppocrv3_framework.png
new file mode 100644
index 0000000000000000000000000000000000000000..c05398248fa7273382e9691a26d932bddc3cf84f
Binary files /dev/null and b/doc/ppocrv3_framework.png differ
diff --git "a/notebook/notebook_ch/1.introduction/OCR\346\212\200\346\234\257\345\257\274\350\256\272.ipynb" "b/notebook/notebook_ch/1.introduction/OCR\346\212\200\346\234\257\345\257\274\350\256\272.ipynb"
index 37098ab63ea0f5ca2d033b4dc6c46b14ef2eadd9..96c91b11b7bfe04e754b58b1204a339e983de9c1 100644
--- "a/notebook/notebook_ch/1.introduction/OCR\346\212\200\346\234\257\345\257\274\350\256\272.ipynb"
+++ "b/notebook/notebook_ch/1.introduction/OCR\346\212\200\346\234\257\345\257\274\350\256\272.ipynb"
@@ -164,7 +164,7 @@
"\n",
"
\n",
"\n",
- "一般的KIE方法基于命名实体识别(Named Entity Recognition,NER)[4]来研究,但是这类方法只利用了图像中的文本信息,缺少对视觉和结构信息的使用,因此精度不高。在此基础上,近几年的方法都开始将视觉和结构信息与文本信息融合到一起,按照对多模态信息进行融合时所采用的的原理可以将这些方法分为下面四种:\n",
+ "一般的KIE方法基于命名实体识别(Named Entity Recognition,NER)[4]来研究,但是这类方法只利用了图像中的文本信息,缺少对视觉和结构信息的使用,因此精度不高。在此基础上,近几年的方法都开始将视觉和结构信息与文本信息融合到一起,按照对多模态信息进行融合时所采用的原理可以将这些方法分为下面四种:\n",
"\n",
"- 基于Grid的方法\n",
"- 基于Token的方法\n",
diff --git "a/notebook/notebook_ch/3.text_recognition/\346\226\207\346\234\254\350\257\206\345\210\253\345\256\236\350\267\265\351\203\250\345\210\206.ipynb" "b/notebook/notebook_ch/3.text_recognition/\346\226\207\346\234\254\350\257\206\345\210\253\345\256\236\350\267\265\351\203\250\345\210\206.ipynb"
index 8a6cb2d788719c238fb14703a4cfb2616da1a95b..b48f90bf54f02a0f74e10505f56919c202fab859 100644
--- "a/notebook/notebook_ch/3.text_recognition/\346\226\207\346\234\254\350\257\206\345\210\253\345\256\236\350\267\265\351\203\250\345\210\206.ipynb"
+++ "b/notebook/notebook_ch/3.text_recognition/\346\226\207\346\234\254\350\257\206\345\210\253\345\256\236\350\267\265\351\203\250\345\210\206.ipynb"
@@ -2133,7 +2133,7 @@
"collapsed": false
},
"source": [
- "根据配置文件中设置的的 `save_model_dir` 字段,会有以下几种参数被保存下来:\n",
+ "根据配置文件中设置的 `save_model_dir` 字段,会有以下几种参数被保存下来:\n",
"\n",
"```\n",
"output/rec/ic15\n",
diff --git "a/notebook/notebook_ch/3.text_recognition/\346\226\207\346\234\254\350\257\206\345\210\253\347\220\206\350\256\272\351\203\250\345\210\206.ipynb" "b/notebook/notebook_ch/3.text_recognition/\346\226\207\346\234\254\350\257\206\345\210\253\347\220\206\350\256\272\351\203\250\345\210\206.ipynb"
index dd6af90a35945a7b2ffd79be6fa8d2fe6665d2c5..6913a65533a9e12b338b92b324d6ca4225b477ef 100644
--- "a/notebook/notebook_ch/3.text_recognition/\346\226\207\346\234\254\350\257\206\345\210\253\347\220\206\350\256\272\351\203\250\345\210\206.ipynb"
+++ "b/notebook/notebook_ch/3.text_recognition/\346\226\207\346\234\254\350\257\206\345\210\253\347\220\206\350\256\272\351\203\250\345\210\206.ipynb"
@@ -22,7 +22,7 @@
"\n",
"## 1 背景介绍\n",
"\n",
- "文本识别是OCR(Optical Character Recognition)的一个子任务,其任务为识别一个固定区域的的文本内容。在OCR的两阶段方法里,它接在文本检测后面,将图像信息转换为文字信息。\n",
+ "文本识别是OCR(Optical Character Recognition)的一个子任务,其任务为识别一个固定区域的文本内容。在OCR的两阶段方法里,它接在文本检测后面,将图像信息转换为文字信息。\n",
"\n",
"具体地,模型输入一张定位好的文本行,由模型预测出图片中的文字内容和置信度,可视化结果如下图所示:\n",
"\n",
diff --git "a/notebook/notebook_ch/4.ppcor_system_strategy/PP-OCR\347\263\273\347\273\237\345\217\212\344\274\230\345\214\226\347\255\226\347\225\245.ipynb" "b/notebook/notebook_ch/4.ppcor_system_strategy/PP-OCR\347\263\273\347\273\237\345\217\212\344\274\230\345\214\226\347\255\226\347\225\245.ipynb"
index 4160b077b3ef2939ffd2ac88867bb12c93438c61..17b7d9f5b845dd7ea07050bffd9ddc57c8ade721 100644
--- "a/notebook/notebook_ch/4.ppcor_system_strategy/PP-OCR\347\263\273\347\273\237\345\217\212\344\274\230\345\214\226\347\255\226\347\225\245.ipynb"
+++ "b/notebook/notebook_ch/4.ppcor_system_strategy/PP-OCR\347\263\273\347\273\237\345\217\212\344\274\230\345\214\226\347\255\226\347\225\245.ipynb"
@@ -2915,7 +2915,7 @@
"\n",
"```yaml\n",
"Architecture:\n",
- " model_type: &model_type \"rec\" # 模型类别,rec、det等,每个子网络的的模型类别都与\n",
+ " model_type: &model_type \"rec\" # 模型类别,rec、det等,每个子网络的模型类别都与\n",
" name: DistillationModel # 结构名称,蒸馏任务中,为DistillationModel,用于构建对应的结构\n",
" algorithm: Distillation # 算法名称\n",
" Models: # 模型,包含子网络的配置信息\n",
diff --git "a/notebook/notebook_ch/4.ppocr_system_strategy/PP-OCR\347\263\273\347\273\237\345\217\212\344\274\230\345\214\226\347\255\226\347\225\245.ipynb" "b/notebook/notebook_ch/4.ppocr_system_strategy/PP-OCR\347\263\273\347\273\237\345\217\212\344\274\230\345\214\226\347\255\226\347\225\245.ipynb"
index 4160b077b3ef2939ffd2ac88867bb12c93438c61..17b7d9f5b845dd7ea07050bffd9ddc57c8ade721 100644
--- "a/notebook/notebook_ch/4.ppocr_system_strategy/PP-OCR\347\263\273\347\273\237\345\217\212\344\274\230\345\214\226\347\255\226\347\225\245.ipynb"
+++ "b/notebook/notebook_ch/4.ppocr_system_strategy/PP-OCR\347\263\273\347\273\237\345\217\212\344\274\230\345\214\226\347\255\226\347\225\245.ipynb"
@@ -2915,7 +2915,7 @@
"\n",
"```yaml\n",
"Architecture:\n",
- " model_type: &model_type \"rec\" # 模型类别,rec、det等,每个子网络的的模型类别都与\n",
+ " model_type: &model_type \"rec\" # 模型类别,rec、det等,每个子网络的模型类别都与\n",
" name: DistillationModel # 结构名称,蒸馏任务中,为DistillationModel,用于构建对应的结构\n",
" algorithm: Distillation # 算法名称\n",
" Models: # 模型,包含子网络的配置信息\n",
diff --git "a/notebook/notebook_ch/5.ppocrv2_inference_deployment/PP-OCRv2\351\242\204\346\265\213\351\203\250\347\275\262\345\256\236\346\210\230.ipynb" "b/notebook/notebook_ch/5.ppocrv2_inference_deployment/PP-OCRv2\351\242\204\346\265\213\351\203\250\347\275\262\345\256\236\346\210\230.ipynb"
index c65627acc8d179a5459e3ceb1afa83325383c998..3b8550d339b571f9bc2a4fdabb57b7c29830b9ba 100644
--- "a/notebook/notebook_ch/5.ppocrv2_inference_deployment/PP-OCRv2\351\242\204\346\265\213\351\203\250\347\275\262\345\256\236\346\210\230.ipynb"
+++ "b/notebook/notebook_ch/5.ppocrv2_inference_deployment/PP-OCRv2\351\242\204\346\265\213\351\203\250\347\275\262\345\256\236\346\210\230.ipynb"
@@ -1876,11 +1876,11 @@
" rec_res)\n",
" filter_boxes, filter_rec_res = [], []\n",
" # 根据识别得分的阈值对结果进行过滤,如果得分小于阈值,就过滤掉\n",
- " for box, rec_reuslt in zip(dt_boxes, rec_res):\n",
- " text, score = rec_reuslt\n",
+ " for box, rec_result in zip(dt_boxes, rec_res):\n",
+ " text, score = rec_result\n",
" if score >= self.drop_score:\n",
" filter_boxes.append(box)\n",
- " filter_rec_res.append(rec_reuslt)\n",
+ " filter_rec_res.append(rec_result)\n",
" return filter_boxes, filter_rec_res\n",
"\n",
"def sorted_boxes(dt_boxes):\n",
diff --git "a/notebook/notebook_ch/6.document_analysis/\346\226\207\346\241\243\345\210\206\346\236\220\347\220\206\350\256\272.ipynb" "b/notebook/notebook_ch/6.document_analysis/\346\226\207\346\241\243\345\210\206\346\236\220\347\220\206\350\256\272.ipynb"
index ae49cbc9052b15b30b80aecd493dadfcc7ceac77..2ddd4776c1f7526aca1240b170be60a7d376453f 100644
--- "a/notebook/notebook_ch/6.document_analysis/\346\226\207\346\241\243\345\210\206\346\236\220\347\220\206\350\256\272.ipynb"
+++ "b/notebook/notebook_ch/6.document_analysis/\346\226\207\346\241\243\345\210\206\346\236\220\347\220\206\350\256\272.ipynb"
@@ -327,7 +327,7 @@
"

\n",
"
图 18: SER,RE任务示例\n",
"\n",
- "一般的KIE方法基于命名实体识别(Named Entity Recognition,NER)[4]来研究,但是这类方法只利用了图像中的文本信息,缺少对视觉和结构信息的使用,因此精度不高。在此基础上,近几年的方法都开始将视觉和结构信息与文本信息融合到一起,按照对多模态信息进行融合时所采用的的原理可以将这些方法分为下面三种:\n",
+ "一般的KIE方法基于命名实体识别(Named Entity Recognition,NER)[4]来研究,但是这类方法只利用了图像中的文本信息,缺少对视觉和结构信息的使用,因此精度不高。在此基础上,近几年的方法都开始将视觉和结构信息与文本信息融合到一起,按照对多模态信息进行融合时所采用的原理可以将这些方法分为下面三种:\n",
"\n",
"1. 基于Grid的方法\n",
"1. 基于Token的方法\n",
diff --git a/notebook/notebook_en/2.text_detection/text_detection_theory.ipynb b/notebook/notebook_en/2.text_detection/text_detection_theory.ipynb
index 64a4c669f0598887b6557f66ea72cd1cb9a0773b..5916ebed9eeafcfc7ce146bf802df84ad32bc26e 100644
--- a/notebook/notebook_en/2.text_detection/text_detection_theory.ipynb
+++ b/notebook/notebook_en/2.text_detection/text_detection_theory.ipynb
@@ -136,7 +136,7 @@
"
Figure 11: LOMO frame diagram\n",
"\n",
"\n",
- "Contournet [18] is based on the proposed modeling of text contour points to obtain a curved text detection frame. This method first uses Adaptive-RPN to obtain the proposal features of the text area, and then designs a local orthogonal texture perception LOTM module to learn horizontal and vertical textures. The feature is represented by contour points. Finally, by considering the feature responses in two orthogonal directions at the same time, the Point Re-Scoring algorithm can effectively filter out the prediction of strong unidirectional or weak orthogonal activation, and the final text contour can be used as a A group of high-quality contour points are shown.\n",
+    "Contournet [18] is based on the proposed modeling of text contour points to obtain a curved text detection frame. This method first uses Adaptive-RPN to obtain the proposal features of the text area, and then designs a local orthogonal texture perception LOTM module to learn horizontal and vertical textures. The feature is represented by contour points. Finally, by considering the feature responses in two orthogonal directions at the same time, the Point Re-Scoring algorithm can effectively filter out predictions with strong unidirectional or weak orthogonal activation, and the final text contour is represented by a group of high-quality contour points, as shown in the figure below.\n",
"

\n",
"
Figure 12: Contournet frame diagram\n",
diff --git a/notebook/notebook_en/5.ppocrv2_inference_deployment/ppocrv2_inference_deployment_practice.ipynb b/notebook/notebook_en/5.ppocrv2_inference_deployment/ppocrv2_inference_deployment_practice.ipynb
index 61cd4561514c52644648eb7fdd8b2473a0caa8cb..780f948579d188449f1b8d3da5967bb3072a2b49 100644
--- a/notebook/notebook_en/5.ppocrv2_inference_deployment/ppocrv2_inference_deployment_practice.ipynb
+++ b/notebook/notebook_en/5.ppocrv2_inference_deployment/ppocrv2_inference_deployment_practice.ipynb
@@ -1886,11 +1886,11 @@
" rec_res)\n",
" filter_boxes, filter_rec_res = [], []\n",
" #Filter the results according to the threshold of the recognition score, if the score is less than the threshold, filter out\n",
- " for box, rec_reuslt in zip(dt_boxes, rec_res):\n",
- " text, score = rec_reuslt\n",
+ " for box, rec_result in zip(dt_boxes, rec_res):\n",
+ " text, score = rec_result\n",
" if score >= self.drop_score:\n",
" filter_boxes.append(box)\n",
- " filter_rec_res.append(rec_reuslt)\n",
+ " filter_rec_res.append(rec_result)\n",
" return filter_boxes, filter_rec_res\n",
"\n",
"def sorted_boxes(dt_boxes):\n",
diff --git a/paddleocr.py b/paddleocr.py
index cb2c34f69f68d289b317d4737bd23385c77c3d95..417350839ac4d1e512c7396831f89ab4b2d6c724 100644
--- a/paddleocr.py
+++ b/paddleocr.py
@@ -47,16 +47,46 @@ __all__ = [
]
SUPPORT_DET_MODEL = ['DB']
-VERSION = '2.5'
+VERSION = '2.5.0.1'
SUPPORT_REC_MODEL = ['CRNN']
BASE_DIR = os.path.expanduser("~/.paddleocr/")
-DEFAULT_OCR_MODEL_VERSION = 'PP-OCR'
-SUPPORT_OCR_MODEL_VERSION = ['PP-OCR', 'PP-OCRv2']
-DEFAULT_STRUCTURE_MODEL_VERSION = 'STRUCTURE'
-SUPPORT_STRUCTURE_MODEL_VERSION = ['STRUCTURE']
+DEFAULT_OCR_MODEL_VERSION = 'PP-OCRv3'
+SUPPORT_OCR_MODEL_VERSION = ['PP-OCR', 'PP-OCRv2', 'PP-OCRv3']
+DEFAULT_STRUCTURE_MODEL_VERSION = 'PP-STRUCTURE'
+SUPPORT_STRUCTURE_MODEL_VERSION = ['PP-STRUCTURE']
MODEL_URLS = {
'OCR': {
+ 'PP-OCRv3': {
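+        # PP-OCRv3 inference models: Chinese and English detection/recognition, reusing the ch_ppocr_mobile_v2.0 direction classifier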
+ 'det': {
+ 'ch': {
+ 'url':
+ 'https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar',
+ },
+ 'en': {
+ 'url':
+ 'https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar',
+ },
+ },
+ 'rec': {
+ 'ch': {
+ 'url':
+ 'https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar',
+ 'dict_path': './ppocr/utils/ppocr_keys_v1.txt'
+ },
+ 'en': {
+ 'url':
+ 'https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar',
+ 'dict_path': './ppocr/utils/en_dict.txt'
+ },
+ },
+ 'cls': {
+ 'ch': {
+ 'url':
+ 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar',
+ }
+ },
+ },
'PP-OCRv2': {
'det': {
'ch': {
@@ -72,7 +102,7 @@ MODEL_URLS = {
}
}
},
- DEFAULT_OCR_MODEL_VERSION: {
+ 'PP-OCR': {
'det': {
'ch': {
'url':
@@ -173,7 +203,7 @@ MODEL_URLS = {
}
},
'STRUCTURE': {
- DEFAULT_STRUCTURE_MODEL_VERSION: {
+ 'PP-STRUCTURE': {
'table': {
'en': {
'url':
@@ -198,16 +228,17 @@ def parse_args(mMain=True):
"--ocr_version",
type=str,
choices=SUPPORT_OCR_MODEL_VERSION,
- default='PP-OCRv2',
+ default='PP-OCRv3',
help='OCR Model version, the current model support list is as follows: '
- '1. PP-OCRv2 Support Chinese detection and recognition model. '
- '2. PP-OCR support Chinese detection, recognition and direction classifier and multilingual recognition model.'
+    '1. PP-OCRv3 supports Chinese and English detection and recognition models, and the direction classifier model. '
+    '2. PP-OCRv2 supports Chinese detection and recognition models. '
+    '3. PP-OCR supports Chinese detection, recognition, direction classifier and multilingual recognition models.'
)
parser.add_argument(
"--structure_version",
type=str,
choices=SUPPORT_STRUCTURE_MODEL_VERSION,
- default='STRUCTURE',
+ default='PP-STRUCTURE',
help='Model version, the current model support list is as follows:'
' 1. STRUCTURE Support en table structure model.')
diff --git a/ppocr/data/imaug/__init__.py b/ppocr/data/imaug/__init__.py
index 20aaf48e119d68e6c37ce9246a87701fb149d5e7..548832fb0d116ba2de622bd97562b591d74501d8 100644
--- a/ppocr/data/imaug/__init__.py
+++ b/ppocr/data/imaug/__init__.py
@@ -23,7 +23,7 @@ from .random_crop_data import EastRandomCropData, RandomCropImgMask
from .make_pse_gt import MakePseGt
from .rec_img_aug import RecAug, RecConAug, RecResizeImg, ClsResizeImg, \
- SRNRecResizeImg, NRTRRecResizeImg, SARRecResizeImg, PRENResizeImg, SVTRRecResizeImg
+ SRNRecResizeImg, NRTRRecResizeImg, SARRecResizeImg, PRENResizeImg
from .ssl_img_aug import SSLRotateResize
from .randaugment import RandAugment
from .copy_paste import CopyPaste
diff --git a/ppocr/data/imaug/rec_img_aug.py b/ppocr/data/imaug/rec_img_aug.py
index 2f70b51a3b88422274353046209c6d0d4dc79489..7483dffe5b6d9a0a2204702757fcb49762a1cc7a 100644
--- a/ppocr/data/imaug/rec_img_aug.py
+++ b/ppocr/data/imaug/rec_img_aug.py
@@ -207,25 +207,6 @@ class PRENResizeImg(object):
return data
-class SVTRRecResizeImg(object):
- def __init__(self,
- image_shape,
- infer_mode=False,
- character_dict_path='./ppocr/utils/ppocr_keys_v1.txt',
- padding=True,
- **kwargs):
- self.image_shape = image_shape
- self.infer_mode = infer_mode
- self.character_dict_path = character_dict_path
- self.padding = padding
-
- def __call__(self, data):
- img = data['image']
- norm_img = resize_norm_img_svtr(img, self.image_shape, self.padding)
- data['image'] = norm_img
- return data
-
-
def resize_norm_img_sar(img, image_shape, width_downsample_ratio=0.25):
imgC, imgH, imgW_min, imgW_max = image_shape
h = img.shape[0]
@@ -344,58 +325,6 @@ def resize_norm_img_srn(img, image_shape):
return np.reshape(img_black, (c, row, col)).astype(np.float32)
-def resize_norm_img_svtr(img, image_shape, padding=False):
- imgC, imgH, imgW = image_shape
- h = img.shape[0]
- w = img.shape[1]
- if not padding:
- if h > 2.0 * w:
- image = Image.fromarray(img)
- image1 = image.rotate(90, expand=True)
- image2 = image.rotate(-90, expand=True)
- img1 = np.array(image1)
- img2 = np.array(image2)
- else:
- img1 = copy.deepcopy(img)
- img2 = copy.deepcopy(img)
-
- resized_image = cv2.resize(
- img, (imgW, imgH), interpolation=cv2.INTER_LINEAR)
- resized_image1 = cv2.resize(
- img1, (imgW, imgH), interpolation=cv2.INTER_LINEAR)
- resized_image2 = cv2.resize(
- img2, (imgW, imgH), interpolation=cv2.INTER_LINEAR)
- resized_w = imgW
- else:
- ratio = w / float(h)
- if math.ceil(imgH * ratio) > imgW:
- resized_w = imgW
- else:
- resized_w = int(math.ceil(imgH * ratio))
- resized_image = cv2.resize(img, (resized_w, imgH))
- resized_image = resized_image.astype('float32')
- resized_image1 = resized_image1.astype('float32')
- resized_image2 = resized_image2.astype('float32')
- if image_shape[0] == 1:
- resized_image = resized_image / 255
- resized_image = resized_image[np.newaxis, :]
- else:
- resized_image = resized_image.transpose((2, 0, 1)) / 255
- resized_image1 = resized_image1.transpose((2, 0, 1)) / 255
- resized_image2 = resized_image2.transpose((2, 0, 1)) / 255
- resized_image -= 0.5
- resized_image /= 0.5
- resized_image1 -= 0.5
- resized_image1 /= 0.5
- resized_image2 -= 0.5
- resized_image2 /= 0.5
- padding_im = np.zeros((3, imgC, imgH, imgW), dtype=np.float32)
- padding_im[0, :, :, 0:resized_w] = resized_image
- padding_im[1, :, :, 0:resized_w] = resized_image1
- padding_im[2, :, :, 0:resized_w] = resized_image2
- return padding_im
-
-
def srn_other_inputs(image_shape, num_heads, max_text_length):
imgC, imgH, imgW = image_shape
diff --git a/ppocr/metrics/det_metric.py b/ppocr/metrics/det_metric.py
index c9ec8dd2e9082d7fd00db1086a352a61f0239cb1..dca94c092790b121c46c4f1c0f0355391ef8dba9 100644
--- a/ppocr/metrics/det_metric.py
+++ b/ppocr/metrics/det_metric.py
@@ -64,9 +64,9 @@ class DetMetric(object):
}
"""
- metircs = self.evaluator.combine_results(self.results)
+ metrics = self.evaluator.combine_results(self.results)
self.reset()
- return metircs
+ return metrics
def reset(self):
self.results = [] # clear results
@@ -127,20 +127,20 @@ class DetFCEMetric(object):
'thr 0.9':'precision: 0 recall: 0 hmean: 0',
}
"""
- metircs = {}
+ metrics = {}
hmean = 0
for score_thr in self.results.keys():
- metirc = self.evaluator.combine_results(self.results[score_thr])
- # for key, value in metirc.items():
- # metircs['{}_{}'.format(key, score_thr)] = value
- metirc_str = 'precision:{:.5f} recall:{:.5f} hmean:{:.5f}'.format(
- metirc['precision'], metirc['recall'], metirc['hmean'])
- metircs['thr {}'.format(score_thr)] = metirc_str
- hmean = max(hmean, metirc['hmean'])
- metircs['hmean'] = hmean
+ metric = self.evaluator.combine_results(self.results[score_thr])
+ # for key, value in metric.items():
+ # metrics['{}_{}'.format(key, score_thr)] = value
+ metric_str = 'precision:{:.5f} recall:{:.5f} hmean:{:.5f}'.format(
+ metric['precision'], metric['recall'], metric['hmean'])
+ metrics['thr {}'.format(score_thr)] = metric_str
+ hmean = max(hmean, metric['hmean'])
+ metrics['hmean'] = hmean
self.reset()
- return metircs
+ return metrics
def reset(self):
self.results = {
diff --git a/ppocr/metrics/e2e_metric.py b/ppocr/metrics/e2e_metric.py
index 41b7ac2bad041295ad67a2de3461c109cf76a84a..2f8ba3b222022099501e6bda48266ef51a40e9db 100644
--- a/ppocr/metrics/e2e_metric.py
+++ b/ppocr/metrics/e2e_metric.py
@@ -78,9 +78,9 @@ class E2EMetric(object):
self.results.append(result)
def get_metric(self):
- metircs = combine_results(self.results)
+ metrics = combine_results(self.results)
self.reset()
- return metircs
+ return metrics
def reset(self):
self.results = [] # clear results
diff --git a/ppocr/metrics/kie_metric.py b/ppocr/metrics/kie_metric.py
index f3bce0411d6521b1756892cbd7b4c6fcb7bcfb6c..28ab22b807ffe4347ca95be5a44143539db4c97c 100644
--- a/ppocr/metrics/kie_metric.py
+++ b/ppocr/metrics/kie_metric.py
@@ -61,9 +61,9 @@ class KIEMetric(object):
def get_metric(self):
- metircs = self.combine_results(self.results)
+ metrics = self.combine_results(self.results)
self.reset()
- return metircs
+ return metrics
def reset(self):
self.results = [] # clear results
diff --git a/ppocr/metrics/vqa_token_ser_metric.py b/ppocr/metrics/vqa_token_ser_metric.py
index 92d80d0970dc2eab1d3fb82e2b4cfb8d930a60a0..286d8addaf32ba8e4fa12751c6e270c853775bd4 100644
--- a/ppocr/metrics/vqa_token_ser_metric.py
+++ b/ppocr/metrics/vqa_token_ser_metric.py
@@ -34,13 +34,13 @@ class VQASerTokenMetric(object):
def get_metric(self):
from seqeval.metrics import f1_score, precision_score, recall_score
- metircs = {
+ metrics = {
"precision": precision_score(self.gt_list, self.pred_list),
"recall": recall_score(self.gt_list, self.pred_list),
"hmean": f1_score(self.gt_list, self.pred_list),
}
self.reset()
- return metircs
+ return metrics
def reset(self):
self.pred_list = []
diff --git a/ppocr/modeling/architectures/base_model.py b/ppocr/modeling/architectures/base_model.py
index f5b29f94057d5b1f1fbec27686d5f1d679b15479..c6b50d4886daa9bfd2f863c1d8fd6dbc3d1e42c0 100644
--- a/ppocr/modeling/architectures/base_model.py
+++ b/ppocr/modeling/architectures/base_model.py
@@ -92,6 +92,9 @@ class BaseModel(nn.Layer):
else:
y["head_out"] = x
if self.return_all_feats:
- return y
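+            # return all intermediate features only during training; at inference only the head output is needed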
+ if self.training:
+ return y
+ else:
+ return {"head_out": y["head_out"]}
else:
return x
diff --git a/ppocr/modeling/backbones/rec_svtrnet.py b/ppocr/modeling/backbones/rec_svtrnet.py
index 5ded74378c60e6f08a4adf68671afaa1168737b6..c57bf46345d6e08f23b9258358f77f2285366314 100644
--- a/ppocr/modeling/backbones/rec_svtrnet.py
+++ b/ppocr/modeling/backbones/rec_svtrnet.py
@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-from collections import Callable
from paddle import ParamAttr
from paddle.nn.initializer import KaimingNormal
import numpy as np
@@ -170,17 +169,14 @@ class Attention(nn.Layer):
self.N = H * W
self.C = dim
if mixer == 'Local' and HW is not None:
-
hk = local_k[0]
wk = local_k[1]
- mask = np.ones([H * W, H * W])
- for h in range(H):
- for w in range(W):
- for kh in range(-(hk // 2), (hk // 2) + 1):
- for kw in range(-(wk // 2), (wk // 2) + 1):
- if H > (h + kh) >= 0 and W > (w + kw) >= 0:
- mask[h * W + w][(h + kh) * W + (w + kw)] = 0
- mask_paddle = paddle.to_tensor(mask, dtype='float32')
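+            # build the local attention mask on a padded grid: zero a hk x wk window around each position, then crop the padding and flatten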
+ mask = paddle.ones([H * W, H + hk - 1, W + wk - 1], dtype='float32')
+ for h in range(0, H):
+ for w in range(0, W):
+ mask[h * W + w, h:h + hk, w:w + wk] = 0.
+ mask_paddle = mask[:, hk // 2:H + hk // 2, wk // 2:W + wk //
+ 2].flatten(1)
mask_inf = paddle.full([H * W, H * W], '-inf', dtype='float32')
mask = paddle.where(mask_paddle < 1, mask_paddle, mask_inf)
self.mask = mask.unsqueeze([0, 1])
@@ -228,11 +224,8 @@ class Block(nn.Layer):
super().__init__()
if isinstance(norm_layer, str):
self.norm1 = eval(norm_layer)(dim, epsilon=epsilon)
- elif isinstance(norm_layer, Callable):
- self.norm1 = norm_layer(dim)
else:
- raise TypeError(
- "The norm_layer must be str or paddle.nn.layer.Layer class")
+ self.norm1 = norm_layer(dim)
if mixer == 'Global' or mixer == 'Local':
self.mixer = Attention(
dim,
@@ -250,15 +243,11 @@ class Block(nn.Layer):
else:
raise TypeError("The mixer must be one of [Global, Local, Conv]")
- # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity()
if isinstance(norm_layer, str):
self.norm2 = eval(norm_layer)(dim, epsilon=epsilon)
- elif isinstance(norm_layer, Callable):
- self.norm2 = norm_layer(dim)
else:
- raise TypeError(
- "The norm_layer must be str or paddle.nn.layer.Layer class")
+ self.norm2 = norm_layer(dim)
mlp_hidden_dim = int(dim * mlp_ratio)
self.mlp_ratio = mlp_ratio
self.mlp = Mlp(in_features=dim,
@@ -330,8 +319,6 @@ class PatchEmbed(nn.Layer):
act=nn.GELU,
bias_attr=None),
ConvBNLayer(
- embed_dim // 2,
- embed_dim,
in_channels=embed_dim // 2,
out_channels=embed_dim,
kernel_size=3,
diff --git a/ppocr/modeling/transforms/stn.py b/ppocr/modeling/transforms/stn.py
index 1b15d5b8a7b7a1b1ab686d20acea750437463939..6f2bdda050f217d8253740001901fbff4065782a 100644
--- a/ppocr/modeling/transforms/stn.py
+++ b/ppocr/modeling/transforms/stn.py
@@ -128,8 +128,6 @@ class STN_ON(nn.Layer):
self.out_channels = in_channels
def forward(self, image):
- if len(image.shape)==5:
- image = image.reshape([0, image.shape[-3], image.shape[-2], image.shape[-1]])
stn_input = paddle.nn.functional.interpolate(
image, self.tps_inputsize, mode="bilinear", align_corners=True)
stn_img_feat, ctrl_points = self.stn_head(stn_input)
diff --git a/ppocr/optimizer/optimizer.py b/ppocr/optimizer/optimizer.py
index c450a3a3684eb44cdc758a2b27783b5a81945c38..dd8544e2e7d39be33a9096cad16c4d58eb58bcad 100644
--- a/ppocr/optimizer/optimizer.py
+++ b/ppocr/optimizer/optimizer.py
@@ -43,12 +43,15 @@ class Momentum(object):
self.grad_clip = grad_clip
def __call__(self, model):
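+        # pass only trainable parameters to the optimizer so frozen layers are skipped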
+ train_params = [
+ param for param in model.parameters() if param.trainable is True
+ ]
opt = optim.Momentum(
learning_rate=self.learning_rate,
momentum=self.momentum,
weight_decay=self.weight_decay,
grad_clip=self.grad_clip,
- parameters=model.parameters())
+ parameters=train_params)
return opt
@@ -76,6 +79,9 @@ class Adam(object):
self.lazy_mode = lazy_mode
def __call__(self, model):
+ train_params = [
+ param for param in model.parameters() if param.trainable is True
+ ]
opt = optim.Adam(
learning_rate=self.learning_rate,
beta1=self.beta1,
@@ -85,7 +91,7 @@ class Adam(object):
grad_clip=self.grad_clip,
name=self.name,
lazy_mode=self.lazy_mode,
- parameters=model.parameters())
+ parameters=train_params)
return opt
@@ -118,6 +124,9 @@ class RMSProp(object):
self.grad_clip = grad_clip
def __call__(self, model):
+ train_params = [
+ param for param in model.parameters() if param.trainable is True
+ ]
opt = optim.RMSProp(
learning_rate=self.learning_rate,
momentum=self.momentum,
@@ -125,7 +134,7 @@ class RMSProp(object):
epsilon=self.epsilon,
weight_decay=self.weight_decay,
grad_clip=self.grad_clip,
- parameters=model.parameters())
+ parameters=train_params)
return opt
@@ -149,6 +158,9 @@ class Adadelta(object):
self.name = name
def __call__(self, model):
+ train_params = [
+ param for param in model.parameters() if param.trainable is True
+ ]
opt = optim.Adadelta(
learning_rate=self.learning_rate,
epsilon=self.epsilon,
@@ -156,7 +168,7 @@ class Adadelta(object):
weight_decay=self.weight_decay,
grad_clip=self.grad_clip,
name=self.name,
- parameters=model.parameters())
+ parameters=train_params)
return opt
@@ -190,17 +202,20 @@ class AdamW(object):
self.one_dim_param_no_weight_decay = one_dim_param_no_weight_decay
def __call__(self, model):
- parameters = model.parameters()
+ parameters = [
+ param for param in model.parameters() if param.trainable is True
+ ]
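+        # parameters whose names match no_weight_decay_name_list (and, optionally, all 1-D parameters) are excluded from weight decay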
self.no_weight_decay_param_name_list = [
- p.name for n, p in model.named_parameters() if any(nd in n for nd in self.no_weight_decay_name_list)
+ p.name for n, p in model.named_parameters()
+ if any(nd in n for nd in self.no_weight_decay_name_list)
]
if self.one_dim_param_no_weight_decay:
self.no_weight_decay_param_name_list += [
- p.name for n, p in model.named_parameters() if len(p.shape) == 1
+ p.name for n, p in model.named_parameters() if len(p.shape) == 1
]
-
+
opt = optim.AdamW(
learning_rate=self.learning_rate,
beta1=self.beta1,
@@ -216,4 +231,4 @@ class AdamW(object):
return opt
def _apply_decay_param_fun(self, name):
- return name not in self.no_weight_decay_param_name_list
\ No newline at end of file
+ return name not in self.no_weight_decay_param_name_list
diff --git a/ppocr/postprocess/__init__.py b/ppocr/postprocess/__init__.py
index 390f6f4560f9814a3af757a4fd16c55fe93d01f9..f50b5f1c5f8e617066bb47636c8f4d2b171b6ecb 100644
--- a/ppocr/postprocess/__init__.py
+++ b/ppocr/postprocess/__init__.py
@@ -27,7 +27,7 @@ from .sast_postprocess import SASTPostProcess
from .fce_postprocess import FCEPostProcess
from .rec_postprocess import CTCLabelDecode, AttnLabelDecode, SRNLabelDecode, \
DistillationCTCLabelDecode, TableLabelDecode, NRTRLabelDecode, SARLabelDecode, \
- SEEDLabelDecode, PRENLabelDecode, SVTRLabelDecode
+ SEEDLabelDecode, PRENLabelDecode
from .cls_postprocess import ClsPostProcess
from .pg_postprocess import PGPostProcess
from .vqa_token_ser_layoutlm_postprocess import VQASerTokenLayoutLMPostProcess
@@ -42,7 +42,7 @@ def build_post_process(config, global_config=None):
'DistillationDBPostProcess', 'NRTRLabelDecode', 'SARLabelDecode',
'SEEDLabelDecode', 'VQASerTokenLayoutLMPostProcess',
'VQAReTokenLayoutLMPostProcess', 'PRENLabelDecode',
- 'DistillationSARLabelDecode', 'SVTRLabelDecode'
+ 'DistillationSARLabelDecode'
]
if config['name'] == 'PSEPostProcess':
diff --git a/ppocr/postprocess/rec_postprocess.py b/ppocr/postprocess/rec_postprocess.py
index 50f11f899fb4dd49da75199095772a92cc4a8d7b..bf0fd890bf25949361665d212bf8e1a657054e5b 100644
--- a/ppocr/postprocess/rec_postprocess.py
+++ b/ppocr/postprocess/rec_postprocess.py
@@ -752,40 +752,3 @@ class PRENLabelDecode(BaseRecLabelDecode):
return text
label = self.decode(label)
return text, label
-
-
-class SVTRLabelDecode(BaseRecLabelDecode):
- """ Convert between text-label and text-index """
-
- def __init__(self, character_dict_path=None, use_space_char=False,
- **kwargs):
- super(SVTRLabelDecode, self).__init__(character_dict_path,
- use_space_char)
-
- def __call__(self, preds, label=None, *args, **kwargs):
- if isinstance(preds, tuple):
- preds = preds[-1]
- if isinstance(preds, paddle.Tensor):
- preds = preds.numpy()
- preds_idx = preds.argmax(axis=-1)
- preds_prob = preds.max(axis=-1)
-
- text = self.decode(preds_idx, preds_prob, is_remove_duplicate=True)
- return_text = []
- for i in range(0, len(text), 3):
- text0 = text[i]
- text1 = text[i + 1]
- text2 = text[i + 2]
-
- text_pred = [text0[0], text1[0], text2[0]]
- text_prob = [text0[1], text1[1], text2[1]]
- id_max = text_prob.index(max(text_prob))
- return_text.append((text_pred[id_max], text_prob[id_max]))
- if label is None:
- return return_text
- label = self.decode(label)
- return return_text, label
-
- def add_special_char(self, dict_character):
- dict_character = ['blank'] + dict_character
- return dict_character
\ No newline at end of file
diff --git a/ppocr/utils/utility.py b/ppocr/utils/utility.py
index dc2a6e7405e4d01ddffabf4a23c98bcf3ea3c205..4a25ff8b2fa182faaf4f4ce8909c9ec2e9b55ccc 100755
--- a/ppocr/utils/utility.py
+++ b/ppocr/utils/utility.py
@@ -49,18 +49,23 @@ def get_check_global_params(mode):
return check_params
+def _check_image_file(path):
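+    # a path counts as an image if its lower-cased name ends with one of the supported extensions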
+ img_end = {'jpg', 'bmp', 'png', 'jpeg', 'rgb', 'tif', 'tiff', 'gif'}
+ return any([path.lower().endswith(e) for e in img_end])
+
+
def get_image_file_list(img_file):
imgs_lists = []
if img_file is None or not os.path.exists(img_file):
raise Exception("not found any img file in {}".format(img_file))
- img_end = {'jpg', 'bmp', 'png', 'jpeg', 'rgb', 'tif', 'tiff', 'gif', 'GIF'}
- if os.path.isfile(img_file) and imghdr.what(img_file) in img_end:
+ img_end = {'jpg', 'bmp', 'png', 'jpeg', 'rgb', 'tif', 'tiff', 'gif'}
+ if os.path.isfile(img_file) and _check_image_file(img_file):
imgs_lists.append(img_file)
elif os.path.isdir(img_file):
for single_file in os.listdir(img_file):
file_path = os.path.join(img_file, single_file)
- if os.path.isfile(file_path) and imghdr.what(file_path) in img_end:
+ if os.path.isfile(file_path) and _check_image_file(file_path):
imgs_lists.append(file_path)
if len(imgs_lists) == 0:
raise Exception("not found any img file in {}".format(img_file))
diff --git a/ppstructure/docs/quickstart.md b/ppstructure/docs/quickstart.md
index 6610035d1442f988ac69763724ce78f6db35ae20..31e59416247b4f0e6b6d82fb13e0d3841a113a5f 100644
--- a/ppstructure/docs/quickstart.md
+++ b/ppstructure/docs/quickstart.md
@@ -194,5 +194,6 @@ dict 里各个字段说明如下
| layout | 前向中是否执行版面分析 | True |
| table | 前向中是否执行表格识别 | True |
| ocr | 对于版面分析中的非表格区域,是否执行ocr。当layout为False时会被自动设置为False | True |
+| structure_version | 表格结构化模型版本,可选 PP-STRUCTURE。PP-STRUCTURE支持表格结构化模型 | PP-STRUCTURE |
大部分参数和PaddleOCR whl包保持一致,见 [whl包文档](../../doc/doc_ch/whl.md)
diff --git a/ppstructure/docs/quickstart_en.md b/ppstructure/docs/quickstart_en.md
index 853436ff07e665fb140a749e8ccbde4392ea5c13..1f78b43ea3334648a37a37745737a6a26e27ece3 100644
--- a/ppstructure/docs/quickstart_en.md
+++ b/ppstructure/docs/quickstart_en.md
@@ -194,5 +194,5 @@ Please refer to: [Documentation Visual Q&A](../vqa/README.md) .
| layout | Whether to perform layout analysis in forward | True |
| table | Whether to perform table recognition in forward | True |
| ocr | Whether to perform ocr for non-table areas in layout analysis. When layout is False, it will be automatically set to False | True |
-
+| structure_version | Table structure model version. The current support list is as follows: PP-STRUCTURE supports the English table structure model | PP-STRUCTURE |
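+
+A minimal usage sketch for this parameter (assuming the whl-package `PPStructure` interface; `table.jpg` stands in for any local image):
+
+```python
+import cv2
+from paddleocr import PPStructure
+
+# select the table structure model version explicitly (PP-STRUCTURE is currently the only option)
+table_engine = PPStructure(structure_version='PP-STRUCTURE', show_log=True)
+result = table_engine(cv2.imread('table.jpg'))
+```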
Most of the parameters are consistent with the PaddleOCR whl package, see [whl package documentation](../../doc/doc_en/whl.md)
diff --git a/ppstructure/vqa/tools/eval_with_label_end2end.py b/ppstructure/vqa/tools/eval_with_label_end2end.py
index 3aa439acb269d74165543fac7e0042cfc213f08d..b13ffb568fd9610fee5d5a246c501ed5b90de91a 100644
--- a/ppstructure/vqa/tools/eval_with_label_end2end.py
+++ b/ppstructure/vqa/tools/eval_with_label_end2end.py
@@ -82,7 +82,7 @@ def polygon_iou(poly1, poly2):
except shapely.geos.TopologicalError:
# except Exception as e:
# print(e)
- print('shapely.geos.TopologicalError occured, iou set to 0')
+ print('shapely.geos.TopologicalError occurred, iou set to 0')
iou = 0
return iou
diff --git a/test_tipc/docs/jeston_test_train_inference_python.md b/test_tipc/docs/jeston_test_train_inference_python.md
index d96505985ea8a291b3579acb2aaee1b3d66c1baa..9e9d15fb674ca04558b1f8cb616dc4e44934dbb9 100644
--- a/test_tipc/docs/jeston_test_train_inference_python.md
+++ b/test_tipc/docs/jeston_test_train_inference_python.md
@@ -1,6 +1,6 @@
-# Jeston端基础训练预测功能测试
+# Jetson端基础训练预测功能测试
-Jeston端基础训练预测功能测试的主程序为`test_inference_inference.sh`,由于Jeston端CPU较差,Jeston只需要测试TIPC关于GPU和TensorRT预测推理的部分即可。
+Jetson端基础训练预测功能测试的主程序为`test_inference_inference.sh`,由于Jetson端CPU较差,Jetson只需要测试TIPC关于GPU和TensorRT预测推理的部分即可。
## 1. 测试结论汇总
@@ -42,7 +42,7 @@ Jeston端基础训练预测功能测试的主程序为`test_inference_inference.
先运行`prepare.sh`准备数据和模型,然后运行`test_inference_inference.sh`进行测试,最终在```test_tipc/output```目录下生成`python_infer_*.log`格式的日志文件。
-`test_inference_inference.sh`仅有一个模式`whole_infer`,在Jeston端,仅需要测试预测推理的模式即可:
+`test_inference_inference.sh`仅有一个模式`whole_infer`,在Jetson端,仅需要测试预测推理的模式即可:
```
- 模式3:whole_infer,不训练,全量数据预测,走通开源模型评估、动转静,检查inference model预测时间和精度;
@@ -51,7 +51,7 @@ bash test_tipc/prepare.sh ./test_tipc/configs/ch_ppocr_mobile_v2.0_det/model_lin
# 用法1:
bash test_tipc/test_inference_inference.sh ./test_tipc/configs/ch_ppocr_mobile_v2.0_det/model_linux_gpu_normal_normal_infer_python_jetson.txt 'whole_infer'
# 用法2: 指定GPU卡预测,第三个传入参数为GPU卡号
-bash test_tipc/test_inference_jeston.sh ./test_tipc/configs/ch_ppocr_mobile_v2.0_det/model_linux_gpu_normal_normal_infer_python_jetson.txt 'whole_infer' '1'
+bash test_tipc/test_inference_jetson.sh ./test_tipc/configs/ch_ppocr_mobile_v2.0_det/model_linux_gpu_normal_normal_infer_python_jetson.txt 'whole_infer' '1'
```
运行相应指令后,在`test_tipc/output`文件夹下自动会保存运行日志。如`whole_infer`模式下,会运行训练+inference的链条,因此,在`test_tipc/output`文件夹有以下文件:
diff --git a/tools/end2end/eval_end2end.py b/tools/end2end/eval_end2end.py
index 6e7573ca472503e5d2216723c056ddf42c77e0aa..dd37940845b60d55291b40b8cd50d6afd5b91f94 100644
--- a/tools/end2end/eval_end2end.py
+++ b/tools/end2end/eval_end2end.py
@@ -59,7 +59,7 @@ def polygon_iou(poly1, poly2):
except shapely.geos.TopologicalError:
# except Exception as e:
# print(e)
- print('shapely.geos.TopologicalError occured, iou set to 0')
+ print('shapely.geos.TopologicalError occurred, iou set to 0')
iou = 0
return iou
diff --git a/tools/eval.py b/tools/eval.py
index 7fd4fa7ada7b1550bcca8766f5acb9b4d4ed2049..cab28334396c54f1526f830044de0772b5402a11 100755
--- a/tools/eval.py
+++ b/tools/eval.py
@@ -20,8 +20,8 @@ import os
import sys
__dir__ = os.path.dirname(os.path.abspath(__file__))
-sys.path.append(__dir__)
-sys.path.append(os.path.abspath(os.path.join(__dir__, '..')))
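+# insert at the front of sys.path so the local ppocr package takes precedence over an installed copy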
+sys.path.insert(0, __dir__)
+sys.path.insert(0, os.path.abspath(os.path.join(__dir__, '..')))
from ppocr.data import build_dataloader
from ppocr.modeling.architectures import build_model
diff --git a/tools/export_model.py b/tools/export_model.py
index 1f9f29e396fe4960914ae802769b65d20c103bd3..e971f6cb20025d529d0387d287ec87a76abbdbe7 100755
--- a/tools/export_model.py
+++ b/tools/export_model.py
@@ -133,7 +133,7 @@ def main():
else:
config["Architecture"]["Models"][key]["Head"][
"out_channels"] = char_num
- # just one final tensor needs to to exported for inference
+    # just one final tensor needs to be exported for inference
config["Architecture"]["Models"][key][
"return_all_feats"] = False
elif config['Architecture']['Head'][
diff --git a/tools/infer/predict_system.py b/tools/infer/predict_system.py
index 63b635c1114d079478e34682d7c94c433bd21ffa..534f08fbcb3b90eea6a371d9f0ad128e276874c4 100755
--- a/tools/infer/predict_system.py
+++ b/tools/infer/predict_system.py
@@ -195,7 +195,7 @@ def main(args):
text_sys.text_detector.autolog.report()
text_sys.text_recognizer.autolog.report()
- with open(os.path.join(draw_img_save_dir, "system_results.txt"), 'w') as f:
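+    # write with an explicit UTF-8 encoding so non-ASCII transcriptions are saved correctly regardless of the locale default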
+ with open(os.path.join(draw_img_save_dir, "system_results.txt"), 'w', encoding='utf-8') as f:
f.writelines(save_results)
diff --git a/tools/infer/utility.py b/tools/infer/utility.py
index c92e8e152a9ee4d86d269aec7ff5645f23cad443..ce4e2d92c2851fc7c7ae2d2a371c755c82dc97e5 100644
--- a/tools/infer/utility.py
+++ b/tools/infer/utility.py
@@ -193,7 +193,7 @@ def create_predictor(args, mode, logger):
gpu_id = get_infer_gpuid()
if gpu_id is None:
logger.warning(
- "GPU is not found in current device by nvidia-smi. Please check your device or ignore it if run on jeston."
+            "GPU is not found in current device by nvidia-smi. Please check your device or ignore it if running on Jetson."
)
config.enable_use_gpu(args.gpu_mem, 0)
if args.use_tensorrt:
diff --git a/tools/infer_e2e.py b/tools/infer_e2e.py
index f3d5712fdda21faf04b95b7a2d5d3092af1b5011..d3e6b28fca0a3ff32ea940747712d6c71aa290fd 100755
--- a/tools/infer_e2e.py
+++ b/tools/infer_e2e.py
@@ -104,7 +104,7 @@ def main():
preds = model(images)
post_result = post_process_class(preds, shape_list)
points, strs = post_result['points'], post_result['texts']
- # write resule
+ # write result
dt_boxes_json = []
for poly, str in zip(points, strs):
tmp_json = {"transcription": str}
diff --git a/tools/infer_rec.py b/tools/infer_rec.py
index 63d410b627f3868191c9299f9bc99e7fcab69d35..193e24a4de12392130d16b86a3407db74602e1f4 100755
--- a/tools/infer_rec.py
+++ b/tools/infer_rec.py
@@ -150,7 +150,7 @@ def main():
"label": post_result[key][0][0],
"score": float(post_result[key][0][1]),
}
- info = json.dumps(rec_info)
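+            # ensure_ascii=False keeps non-ASCII (e.g. Chinese) text readable in the dumped JSON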
+ info = json.dumps(rec_info, ensure_ascii=False)
else:
if len(post_result[0]) >= 2:
info = post_result[0][0] + "\t" + str(post_result[0][1])
diff --git a/tools/infer_vqa_token_ser_re.py b/tools/infer_vqa_token_ser_re.py
index 2c7cb5e4251819c53e56ba74df530181836299a2..6210f7f3c24227c9d366b08ce93ccfe4df849ce1 100755
--- a/tools/infer_vqa_token_ser_re.py
+++ b/tools/infer_vqa_token_ser_re.py
@@ -193,7 +193,7 @@ if __name__ == '__main__':
result = result[0]
fout.write(img_path + "\t" + json.dumps(
{
- "ser_resule": result,
+ "ser_result": result,
}, ensure_ascii=False) + "\n")
img_res = draw_re_results(img_path, result)
cv2.imwrite(save_img_path, img_res)