diff --git a/PPOCRLabel/PPOCRLabel.py b/PPOCRLabel/PPOCRLabel.py
index 523661b635ed4eaaf8d2b4ac05aaf60b2abc8541..7c7802a73ed32680142f8119b10a0393d1fab9cc 100644
--- a/PPOCRLabel/PPOCRLabel.py
+++ b/PPOCRLabel/PPOCRLabel.py
@@ -1439,8 +1439,8 @@ class MainWindow(QMainWindow):
                                        DEFAULT_LOCK_COLOR, key_cls, box['difficult']))
         if imgidx in self.PPlabel.keys():
             for box in self.PPlabel[imgidx]:
-                key_cls = None if not self.kie_mode else box['key_cls']
-                shapes.append((box['transcription'], box['points'], None, key_cls, box['difficult']))
+                key_cls = None if not self.kie_mode else box.get('key_cls', 'None')
+                shapes.append((box['transcription'], box['points'], None, key_cls, box.get('difficult', False)))

         self.loadLabels(shapes)
         self.canvas.verified = False
@@ -1584,7 +1584,7 @@ class MainWindow(QMainWindow):
         for image, info in label_dict.items():
             for box in info:
                 if "key_cls" not in box:
-                    continue
+                    box.update({"key_cls": "None"})
                 self.existed_key_cls_set.add(box["key_cls"])
         if len(self.existed_key_cls_set) > 0:
             for key_text in self.existed_key_cls_set:
@@ -1606,8 +1606,6 @@ class MainWindow(QMainWindow):
                     fit_to_content={'column': True, 'row': False},
                     flags=None
                 )
-            else:
-                self.keyDialog.labelList.addItems(self.existed_key_cls_set)

     def importDirImages(self, dirpath, isDelete=False):
         if not self.mayContinue() or not dirpath:
diff --git a/PPOCRLabel/README.md b/PPOCRLabel/README.md
index 0a1a7536912b6591517050bfa62260aaef6077cd..95baf66b297bb17b9396a20b8aa156fbd8273e39 100644
--- a/PPOCRLabel/README.md
+++ b/PPOCRLabel/README.md
@@ -12,6 +12,7 @@ PPOCRLabel is a semi-automatic graphic annotation tool suitable for OCR field, w
 - Add KIE Mode by using `--kie`, for [detection + identification + keyword extraction] labeling.
 - 2022.01:(by [PeterH0323](https://github.com/peterh0323) )
   - Improve user experience: prompt for the number of files and labels, optimize interaction, and fix bugs such as only use CPU when inference
+  - New functions: Support using `C` or `X` to rotate boxes.
- 2021.11.17: - Support install and start PPOCRLabel through the whl package (by [d2623587501](https://github.com/d2623587501)) - Dataset segmentation: Divide the annotation file into training, verification and testing parts (refer to section 3.5 below, by [MrCuiHao](https://github.com/MrCuiHao)) diff --git a/PPOCRLabel/README_ch.md b/PPOCRLabel/README_ch.md index 99c088de83f9cba775733a0473b50596683c47ab..5534b308f3b49905bfaa33b24e128e2bbcc1ef5b 100644 --- a/PPOCRLabel/README_ch.md +++ b/PPOCRLabel/README_ch.md @@ -11,7 +11,8 @@ PPOCRLabel是一款适用于OCR领域的半自动化图形标注工具,内置P - 2022.02:(by [PeterH0323](https://github.com/peterh0323) ) - 新增:使用 `--kie` 进入 KIE 功能,用于打【检测+识别+关键字提取】的标签 - 2022.01:(by [PeterH0323](https://github.com/peterh0323) ) - - 提升用户体验:新增文件与标记数目提示、优化交互、修复gpu使用等问题 + - 提升用户体验:新增文件与标记数目提示、优化交互、修复gpu使用等问题。 + - 新增功能:使用 `C` 和 `X` 对标记框进行旋转。 - 2021.11.17: - 新增支持通过whl包安装和启动PPOCRLabel(by [d2623587501](https://github.com/d2623587501)) - 标注数据集切分:对标注数据进行训练、验证与测试集划分(参考下方3.5节,by [MrCuiHao](https://github.com/MrCuiHao)) diff --git a/README.md b/README.md index 259ccb5aa02352ca2a2b81bf81d858cec2b47081..7ff799db308c72530b825bafc53973f707be389b 100644 --- a/README.md +++ b/README.md @@ -33,6 +33,8 @@ PaddleOCR aims to create multilingual, awesome, leading, and practical OCR tools PaddleOCR support a variety of cutting-edge algorithms related to OCR, and developed industrial featured models/solution [PP-OCR](./doc/doc_en/ppocr_introduction_en.md) and [PP-Structure](./ppstructure/README.md) on this basis, and get through the whole process of data production, model training, compression, inference and deployment. +PaddleOCR also supports metric and model logging during training to [VisualDL](https://www.paddlepaddle.org.cn/documentation/docs/en/guides/03_VisualDL/visualdl_usage_en.html) and [Weights & Biases](https://docs.wandb.ai/). 
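As a minimal, hedged sketch of how this logging is typically switched on in a training config: the `use_visualdl` flag appears in the recognition configs added by this very PR, while the `use_wandb` flag and the `wandb:` section are assumptions based on the linked Weights & Biases documentation, not on this diff.

```yaml
# Sketch only. `use_visualdl` is taken from the configs in this PR;
# `use_wandb` and the `wandb:` block are assumptions from the W&B docs.
Global:
  use_visualdl: true   # write training metrics for VisualDL (`visualdl --logdir ...`)
  use_wandb: true      # assumption: enables the Weights & Biases logger

wandb:                 # assumption: run metadata forwarded to wandb.init()
  project: PaddleOCR   # hypothetical project name
  entity: my_team      # hypothetical team/entity
  name: ppocrv3_rec_baseline
```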
+ ![](./doc/features_en.png) > It is recommended to start with the “quick experience” in the document tutorial diff --git a/configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml b/configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml index 773a3649d8378cb39373b5b90837f17f9ecba335..e7cbae59a14af73639e1a74a14021b9b2ef60057 100644 --- a/configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml +++ b/configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml @@ -129,7 +129,7 @@ Loss: key: head_out multi_head: True - DistillationSARLoss: - weight: 0.5 + weight: 1.0 model_name_list: ["Student", "Teacher"] key: head_out multi_head: True diff --git a/configs/rec/PP-OCRv3/multi_language/arabic_PP-OCRv3_rec.yml b/configs/rec/PP-OCRv3/multi_language/arabic_PP-OCRv3_rec.yml new file mode 100644 index 0000000000000000000000000000000000000000..0ad1ab0adc189102ff07094fcda92d4f9ea9c662 --- /dev/null +++ b/configs/rec/PP-OCRv3/multi_language/arabic_PP-OCRv3_rec.yml @@ -0,0 +1,131 @@ +Global: + debug: false + use_gpu: true + epoch_num: 500 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/v3_arabic_mobile + save_epoch_step: 3 + eval_batch_step: [0, 2000] + cal_metric_during_train: true + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: false + infer_img: doc/imgs_words/ch/word_1.jpg + character_dict_path: ppocr/utils/dict/arabic_dict.txt + max_text_length: &max_text_length 25 + infer_mode: false + use_space_char: true + distributed: true + save_res_path: ./output/rec/predicts_ppocrv3_arabic.txt + + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + warmup_epoch: 5 + regularizer: + name: L2 + factor: 3.0e-05 + + +Architecture: + model_type: rec + algorithm: SVTR + Transform: + Backbone: + name: MobileNetV1Enhance + scale: 0.5 + last_conv_stride: [1, 2] + last_pool_type: avg + Head: + name: MultiHead + head_list: + - CTCHead: + Neck: + name: svtr + dims: 64 + depth: 2 + hidden_dims: 120 + use_guide: True + Head: + fc_decay: 0.00001 + - SARHead: + enc_dim: 512 + max_text_length: *max_text_length + +Loss: + name: MultiLoss + loss_config_list: + - CTCLoss: + - SARLoss: + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + ignore_space: False + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + ext_op_transform_idx: 1 + label_file_list: + - ./train_data/train_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - RecConAug: + prob: 0.5 + ext_data_num: 2 + image_shape: [48, 320, 3] + - RecAug: + - MultiLabelEncode: + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_sar + - length + - valid_ratio + loader: + shuffle: true + batch_size_per_card: 128 + drop_last: true + num_workers: 4 +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data + label_file_list: + - ./train_data/val_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - MultiLabelEncode: + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_sar + - length + - valid_ratio + loader: + shuffle: false + drop_last: false + batch_size_per_card: 128 + num_workers: 4 diff --git a/configs/rec/PP-OCRv3/multi_language/chinese_cht_PP-OCRv3_rec.yml b/configs/rec/PP-OCRv3/multi_language/chinese_cht_PP-OCRv3_rec.yml new file mode 100644 index 0000000000000000000000000000000000000000..28e0c10aa0f83fdf8e621aae04bf2b7374255adc --- /dev/null +++ 
b/configs/rec/PP-OCRv3/multi_language/chinese_cht_PP-OCRv3_rec.yml @@ -0,0 +1,131 @@ +Global: + debug: false + use_gpu: true + epoch_num: 500 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/v3_chinese_cht_mobile + save_epoch_step: 3 + eval_batch_step: [0, 2000] + cal_metric_during_train: true + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: false + infer_img: doc/imgs_words/ch/word_1.jpg + character_dict_path: ppocr/utils/dict/chinese_cht_dict.txt + max_text_length: &max_text_length 25 + infer_mode: false + use_space_char: true + distributed: true + save_res_path: ./output/rec/predicts_ppocrv3_chinese_cht.txt + + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + warmup_epoch: 5 + regularizer: + name: L2 + factor: 3.0e-05 + + +Architecture: + model_type: rec + algorithm: SVTR + Transform: + Backbone: + name: MobileNetV1Enhance + scale: 0.5 + last_conv_stride: [1, 2] + last_pool_type: avg + Head: + name: MultiHead + head_list: + - CTCHead: + Neck: + name: svtr + dims: 64 + depth: 2 + hidden_dims: 120 + use_guide: True + Head: + fc_decay: 0.00001 + - SARHead: + enc_dim: 512 + max_text_length: *max_text_length + +Loss: + name: MultiLoss + loss_config_list: + - CTCLoss: + - SARLoss: + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + ignore_space: False + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + ext_op_transform_idx: 1 + label_file_list: + - ./train_data/train_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - RecConAug: + prob: 0.5 + ext_data_num: 2 + image_shape: [48, 320, 3] + - RecAug: + - MultiLabelEncode: + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_sar + - length + - valid_ratio + loader: + shuffle: true + batch_size_per_card: 128 + drop_last: true + num_workers: 4 +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data + label_file_list: + - ./train_data/val_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - MultiLabelEncode: + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_sar + - length + - valid_ratio + loader: + shuffle: false + drop_last: false + batch_size_per_card: 128 + num_workers: 4 diff --git a/configs/rec/PP-OCRv3/multi_language/cyrillic_PP-OCRv3_rec.yml b/configs/rec/PP-OCRv3/multi_language/cyrillic_PP-OCRv3_rec.yml new file mode 100644 index 0000000000000000000000000000000000000000..fbdbe6c44c689ea267c9995f832305d800046edb --- /dev/null +++ b/configs/rec/PP-OCRv3/multi_language/cyrillic_PP-OCRv3_rec.yml @@ -0,0 +1,131 @@ +Global: + debug: false + use_gpu: true + epoch_num: 500 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/v3_cyrillic_mobile + save_epoch_step: 3 + eval_batch_step: [0, 2000] + cal_metric_during_train: true + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: false + infer_img: doc/imgs_words/ch/word_1.jpg + character_dict_path: ppocr/utils/dict/cyrillic_dict.txt + max_text_length: &max_text_length 25 + infer_mode: false + use_space_char: true + distributed: true + save_res_path: ./output/rec/predicts_ppocrv3_cyrillic.txt + + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + warmup_epoch: 5 + regularizer: + name: L2 + factor: 3.0e-05 + + +Architecture: + model_type: rec + algorithm: SVTR + Transform: + 
Backbone: + name: MobileNetV1Enhance + scale: 0.5 + last_conv_stride: [1, 2] + last_pool_type: avg + Head: + name: MultiHead + head_list: + - CTCHead: + Neck: + name: svtr + dims: 64 + depth: 2 + hidden_dims: 120 + use_guide: True + Head: + fc_decay: 0.00001 + - SARHead: + enc_dim: 512 + max_text_length: *max_text_length + +Loss: + name: MultiLoss + loss_config_list: + - CTCLoss: + - SARLoss: + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + ignore_space: False + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + ext_op_transform_idx: 1 + label_file_list: + - ./train_data/train_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - RecConAug: + prob: 0.5 + ext_data_num: 2 + image_shape: [48, 320, 3] + - RecAug: + - MultiLabelEncode: + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_sar + - length + - valid_ratio + loader: + shuffle: true + batch_size_per_card: 128 + drop_last: true + num_workers: 4 +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data + label_file_list: + - ./train_data/val_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - MultiLabelEncode: + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_sar + - length + - valid_ratio + loader: + shuffle: false + drop_last: false + batch_size_per_card: 128 + num_workers: 4 diff --git a/configs/rec/PP-OCRv3/multi_language/devanagari_PP-OCRv3_rec.yml b/configs/rec/PP-OCRv3/multi_language/devanagari_PP-OCRv3_rec.yml new file mode 100644 index 0000000000000000000000000000000000000000..48eb38df36f931b76b8e9fb8369daf06ad037d25 --- /dev/null +++ b/configs/rec/PP-OCRv3/multi_language/devanagari_PP-OCRv3_rec.yml @@ -0,0 +1,131 @@ +Global: + debug: false + use_gpu: true + epoch_num: 500 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/v3_devanagari_mobile + save_epoch_step: 3 + eval_batch_step: [0, 2000] + cal_metric_during_train: true + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: false + infer_img: doc/imgs_words/ch/word_1.jpg + character_dict_path: ppocr/utils/dict/devanagari_dict.txt + max_text_length: &max_text_length 25 + infer_mode: false + use_space_char: true + distributed: true + save_res_path: ./output/rec/predicts_ppocrv3_devanagari.txt + + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + warmup_epoch: 5 + regularizer: + name: L2 + factor: 3.0e-05 + + +Architecture: + model_type: rec + algorithm: SVTR + Transform: + Backbone: + name: MobileNetV1Enhance + scale: 0.5 + last_conv_stride: [1, 2] + last_pool_type: avg + Head: + name: MultiHead + head_list: + - CTCHead: + Neck: + name: svtr + dims: 64 + depth: 2 + hidden_dims: 120 + use_guide: True + Head: + fc_decay: 0.00001 + - SARHead: + enc_dim: 512 + max_text_length: *max_text_length + +Loss: + name: MultiLoss + loss_config_list: + - CTCLoss: + - SARLoss: + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + ignore_space: False + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + ext_op_transform_idx: 1 + label_file_list: + - ./train_data/train_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - RecConAug: + prob: 0.5 + ext_data_num: 2 + image_shape: [48, 320, 3] + - RecAug: + - MultiLabelEncode: + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + 
keep_keys: + - image + - label_ctc + - label_sar + - length + - valid_ratio + loader: + shuffle: true + batch_size_per_card: 128 + drop_last: true + num_workers: 4 +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data + label_file_list: + - ./train_data/val_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - MultiLabelEncode: + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_sar + - length + - valid_ratio + loader: + shuffle: false + drop_last: false + batch_size_per_card: 128 + num_workers: 4 diff --git a/configs/rec/PP-OCRv3/multi_language/japan_PP-OCRv3_rec.yml b/configs/rec/PP-OCRv3/multi_language/japan_PP-OCRv3_rec.yml new file mode 100644 index 0000000000000000000000000000000000000000..6cab0d447247e28bb58b30384d4f9d032d6ce9d0 --- /dev/null +++ b/configs/rec/PP-OCRv3/multi_language/japan_PP-OCRv3_rec.yml @@ -0,0 +1,131 @@ +Global: + debug: false + use_gpu: true + epoch_num: 500 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/v3_japan_mobile + save_epoch_step: 3 + eval_batch_step: [0, 2000] + cal_metric_during_train: true + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: false + infer_img: doc/imgs_words/ch/word_1.jpg + character_dict_path: ppocr/utils/dict/japan_dict.txt + max_text_length: &max_text_length 25 + infer_mode: false + use_space_char: true + distributed: true + save_res_path: ./output/rec/predicts_ppocrv3_japan.txt + + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + warmup_epoch: 5 + regularizer: + name: L2 + factor: 3.0e-05 + + +Architecture: + model_type: rec + algorithm: SVTR + Transform: + Backbone: + name: MobileNetV1Enhance + scale: 0.5 + last_conv_stride: [1, 2] + last_pool_type: avg + Head: + name: MultiHead + head_list: + - CTCHead: + Neck: + name: svtr + dims: 64 + depth: 2 + hidden_dims: 120 + use_guide: True + Head: + fc_decay: 0.00001 + - SARHead: + enc_dim: 512 + max_text_length: *max_text_length + +Loss: + name: MultiLoss + loss_config_list: + - CTCLoss: + - SARLoss: + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + ignore_space: False + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + ext_op_transform_idx: 1 + label_file_list: + - ./train_data/train_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - RecConAug: + prob: 0.5 + ext_data_num: 2 + image_shape: [48, 320, 3] + - RecAug: + - MultiLabelEncode: + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_sar + - length + - valid_ratio + loader: + shuffle: true + batch_size_per_card: 128 + drop_last: true + num_workers: 4 +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data + label_file_list: + - ./train_data/val_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - MultiLabelEncode: + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_sar + - length + - valid_ratio + loader: + shuffle: false + drop_last: false + batch_size_per_card: 128 + num_workers: 4 diff --git a/configs/rec/PP-OCRv3/multi_language/ka_PP-OCRv3_rec.yml b/configs/rec/PP-OCRv3/multi_language/ka_PP-OCRv3_rec.yml new file mode 100644 index 0000000000000000000000000000000000000000..7a9c8241d1564e5f1295655ba64694a117064bd8 --- /dev/null +++ b/configs/rec/PP-OCRv3/multi_language/ka_PP-OCRv3_rec.yml @@ -0,0 +1,131 
@@ +Global: + debug: false + use_gpu: true + epoch_num: 500 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/v3_ka_mobile + save_epoch_step: 3 + eval_batch_step: [0, 2000] + cal_metric_during_train: true + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: false + infer_img: doc/imgs_words/ch/word_1.jpg + character_dict_path: ppocr/utils/dict/ka_dict.txt + max_text_length: &max_text_length 25 + infer_mode: false + use_space_char: true + distributed: true + save_res_path: ./output/rec/predicts_ppocrv3_ka.txt + + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + warmup_epoch: 5 + regularizer: + name: L2 + factor: 3.0e-05 + + +Architecture: + model_type: rec + algorithm: SVTR + Transform: + Backbone: + name: MobileNetV1Enhance + scale: 0.5 + last_conv_stride: [1, 2] + last_pool_type: avg + Head: + name: MultiHead + head_list: + - CTCHead: + Neck: + name: svtr + dims: 64 + depth: 2 + hidden_dims: 120 + use_guide: True + Head: + fc_decay: 0.00001 + - SARHead: + enc_dim: 512 + max_text_length: *max_text_length + +Loss: + name: MultiLoss + loss_config_list: + - CTCLoss: + - SARLoss: + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + ignore_space: False + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + ext_op_transform_idx: 1 + label_file_list: + - ./train_data/train_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - RecConAug: + prob: 0.5 + ext_data_num: 2 + image_shape: [48, 320, 3] + - RecAug: + - MultiLabelEncode: + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_sar + - length + - valid_ratio + loader: + shuffle: true + batch_size_per_card: 128 + drop_last: true + num_workers: 4 +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data + label_file_list: + - ./train_data/val_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - MultiLabelEncode: + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_sar + - length + - valid_ratio + loader: + shuffle: false + drop_last: false + batch_size_per_card: 128 + num_workers: 4 diff --git a/configs/rec/PP-OCRv3/multi_language/korean_PP-OCRv3_rec.yml b/configs/rec/PP-OCRv3/multi_language/korean_PP-OCRv3_rec.yml new file mode 100644 index 0000000000000000000000000000000000000000..29ff570772a621ba747e0388bcc0c042db0dba43 --- /dev/null +++ b/configs/rec/PP-OCRv3/multi_language/korean_PP-OCRv3_rec.yml @@ -0,0 +1,131 @@ +Global: + debug: false + use_gpu: true + epoch_num: 500 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/v3_korean_mobile + save_epoch_step: 3 + eval_batch_step: [0, 2000] + cal_metric_during_train: true + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: false + infer_img: doc/imgs_words/ch/word_1.jpg + character_dict_path: ppocr/utils/dict/korean_dict.txt + max_text_length: &max_text_length 25 + infer_mode: false + use_space_char: true + distributed: true + save_res_path: ./output/rec/predicts_ppocrv3_korean.txt + + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + warmup_epoch: 5 + regularizer: + name: L2 + factor: 3.0e-05 + + +Architecture: + model_type: rec + algorithm: SVTR + Transform: + Backbone: + name: MobileNetV1Enhance + scale: 0.5 + last_conv_stride: [1, 2] + last_pool_type: avg + Head: + name: MultiHead + 
head_list: + - CTCHead: + Neck: + name: svtr + dims: 64 + depth: 2 + hidden_dims: 120 + use_guide: True + Head: + fc_decay: 0.00001 + - SARHead: + enc_dim: 512 + max_text_length: *max_text_length + +Loss: + name: MultiLoss + loss_config_list: + - CTCLoss: + - SARLoss: + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + ignore_space: False + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + ext_op_transform_idx: 1 + label_file_list: + - ./train_data/train_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - RecConAug: + prob: 0.5 + ext_data_num: 2 + image_shape: [48, 320, 3] + - RecAug: + - MultiLabelEncode: + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_sar + - length + - valid_ratio + loader: + shuffle: true + batch_size_per_card: 128 + drop_last: true + num_workers: 4 +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data + label_file_list: + - ./train_data/val_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - MultiLabelEncode: + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_sar + - length + - valid_ratio + loader: + shuffle: false + drop_last: false + batch_size_per_card: 128 + num_workers: 4 diff --git a/configs/rec/PP-OCRv3/multi_language/latin_PP-OCRv3_rec.yml b/configs/rec/PP-OCRv3/multi_language/latin_PP-OCRv3_rec.yml new file mode 100644 index 0000000000000000000000000000000000000000..1784bfe611366c45230fd2abf69ab16e3a1c3ae9 --- /dev/null +++ b/configs/rec/PP-OCRv3/multi_language/latin_PP-OCRv3_rec.yml @@ -0,0 +1,131 @@ +Global: + debug: false + use_gpu: true + epoch_num: 500 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/v3_latin_mobile + save_epoch_step: 3 + eval_batch_step: [0, 2000] + cal_metric_during_train: true + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: false + infer_img: doc/imgs_words/ch/word_1.jpg + character_dict_path: ppocr/utils/dict/latin_dict.txt + max_text_length: &max_text_length 25 + infer_mode: false + use_space_char: true + distributed: true + save_res_path: ./output/rec/predicts_ppocrv3_latin.txt + + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + warmup_epoch: 5 + regularizer: + name: L2 + factor: 3.0e-05 + + +Architecture: + model_type: rec + algorithm: SVTR + Transform: + Backbone: + name: MobileNetV1Enhance + scale: 0.5 + last_conv_stride: [1, 2] + last_pool_type: avg + Head: + name: MultiHead + head_list: + - CTCHead: + Neck: + name: svtr + dims: 64 + depth: 2 + hidden_dims: 120 + use_guide: True + Head: + fc_decay: 0.00001 + - SARHead: + enc_dim: 512 + max_text_length: *max_text_length + +Loss: + name: MultiLoss + loss_config_list: + - CTCLoss: + - SARLoss: + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + ignore_space: False + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + ext_op_transform_idx: 1 + label_file_list: + - ./train_data/train_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - RecConAug: + prob: 0.5 + ext_data_num: 2 + image_shape: [48, 320, 3] + - RecAug: + - MultiLabelEncode: + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_sar + - length + - valid_ratio + loader: + shuffle: true + batch_size_per_card: 128 + drop_last: true + 
num_workers: 4 +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data + label_file_list: + - ./train_data/val_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - MultiLabelEncode: + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_sar + - length + - valid_ratio + loader: + shuffle: false + drop_last: false + batch_size_per_card: 128 + num_workers: 4 diff --git a/configs/rec/PP-OCRv3/multi_language/ta_PP-OCRv3_rec.yml b/configs/rec/PP-OCRv3/multi_language/ta_PP-OCRv3_rec.yml new file mode 100644 index 0000000000000000000000000000000000000000..70b26aa84a2178111edab9f094c369c5d22e31a9 --- /dev/null +++ b/configs/rec/PP-OCRv3/multi_language/ta_PP-OCRv3_rec.yml @@ -0,0 +1,131 @@ +Global: + debug: false + use_gpu: true + epoch_num: 500 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/v3_ta_mobile + save_epoch_step: 3 + eval_batch_step: [0, 2000] + cal_metric_during_train: true + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: false + infer_img: doc/imgs_words/ch/word_1.jpg + character_dict_path: ppocr/utils/dict/ta_dict.txt + max_text_length: &max_text_length 25 + infer_mode: false + use_space_char: true + distributed: true + save_res_path: ./output/rec/predicts_ppocrv3_ta.txt + + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + warmup_epoch: 5 + regularizer: + name: L2 + factor: 3.0e-05 + + +Architecture: + model_type: rec + algorithm: SVTR + Transform: + Backbone: + name: MobileNetV1Enhance + scale: 0.5 + last_conv_stride: [1, 2] + last_pool_type: avg + Head: + name: MultiHead + head_list: + - CTCHead: + Neck: + name: svtr + dims: 64 + depth: 2 + hidden_dims: 120 + use_guide: True + Head: + fc_decay: 0.00001 + - SARHead: + enc_dim: 512 + max_text_length: *max_text_length + +Loss: + name: MultiLoss + loss_config_list: + - CTCLoss: + - SARLoss: + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + ignore_space: False + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + ext_op_transform_idx: 1 + label_file_list: + - ./train_data/train_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - RecConAug: + prob: 0.5 + ext_data_num: 2 + image_shape: [48, 320, 3] + - RecAug: + - MultiLabelEncode: + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_sar + - length + - valid_ratio + loader: + shuffle: true + batch_size_per_card: 128 + drop_last: true + num_workers: 4 +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data + label_file_list: + - ./train_data/val_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - MultiLabelEncode: + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_sar + - length + - valid_ratio + loader: + shuffle: false + drop_last: false + batch_size_per_card: 128 + num_workers: 4 diff --git a/configs/rec/PP-OCRv3/multi_language/te_PP-OCRv3_rec.yml b/configs/rec/PP-OCRv3/multi_language/te_PP-OCRv3_rec.yml new file mode 100644 index 0000000000000000000000000000000000000000..3617af79e3b9c5a55ef22d549465ba2109618e32 --- /dev/null +++ b/configs/rec/PP-OCRv3/multi_language/te_PP-OCRv3_rec.yml @@ -0,0 +1,131 @@ +Global: + debug: false + use_gpu: true + epoch_num: 500 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/v3_te_mobile + save_epoch_step: 3 
+  eval_batch_step: [0, 2000]
+  cal_metric_during_train: true
+  pretrained_model:
+  checkpoints:
+  save_inference_dir:
+  use_visualdl: false
+  infer_img: doc/imgs_words/ch/word_1.jpg
+  character_dict_path: ppocr/utils/dict/te_dict.txt
+  max_text_length: &max_text_length 25
+  infer_mode: false
+  use_space_char: true
+  distributed: true
+  save_res_path: ./output/rec/predicts_ppocrv3_te.txt
+
+
+Optimizer:
+  name: Adam
+  beta1: 0.9
+  beta2: 0.999
+  lr:
+    name: Cosine
+    learning_rate: 0.001
+    warmup_epoch: 5
+  regularizer:
+    name: L2
+    factor: 3.0e-05
+
+
+Architecture:
+  model_type: rec
+  algorithm: SVTR
+  Transform:
+  Backbone:
+    name: MobileNetV1Enhance
+    scale: 0.5
+    last_conv_stride: [1, 2]
+    last_pool_type: avg
+  Head:
+    name: MultiHead
+    head_list:
+      - CTCHead:
+          Neck:
+            name: svtr
+            dims: 64
+            depth: 2
+            hidden_dims: 120
+            use_guide: True
+          Head:
+            fc_decay: 0.00001
+      - SARHead:
+          enc_dim: 512
+          max_text_length: *max_text_length
+
+Loss:
+  name: MultiLoss
+  loss_config_list:
+    - CTCLoss:
+    - SARLoss:
+
+PostProcess:
+  name: CTCLabelDecode
+
+Metric:
+  name: RecMetric
+  main_indicator: acc
+  ignore_space: False
+
+Train:
+  dataset:
+    name: SimpleDataSet
+    data_dir: ./train_data/
+    ext_op_transform_idx: 1
+    label_file_list:
+    - ./train_data/train_list.txt
+    transforms:
+    - DecodeImage:
+        img_mode: BGR
+        channel_first: false
+    - RecConAug:
+        prob: 0.5
+        ext_data_num: 2
+        image_shape: [48, 320, 3]
+    - RecAug:
+    - MultiLabelEncode:
+    - RecResizeImg:
+        image_shape: [3, 48, 320]
+    - KeepKeys:
+        keep_keys:
+        - image
+        - label_ctc
+        - label_sar
+        - length
+        - valid_ratio
+  loader:
+    shuffle: true
+    batch_size_per_card: 128
+    drop_last: true
+    num_workers: 4
+Eval:
+  dataset:
+    name: SimpleDataSet
+    data_dir: ./train_data
+    label_file_list:
+    - ./train_data/val_list.txt
+    transforms:
+    - DecodeImage:
+        img_mode: BGR
+        channel_first: false
+    - MultiLabelEncode:
+    - RecResizeImg:
+        image_shape: [3, 48, 320]
+    - KeepKeys:
+        keep_keys:
+        - image
+        - label_ctc
+        - label_sar
+        - length
+        - valid_ratio
+  loader:
+    shuffle: false
+    drop_last: false
+    batch_size_per_card: 128
+    num_workers: 4
diff --git a/deploy/Jetson/readme.md b/deploy/Jetson/readme.md
index 1eb0ea44f77b522567100fc6b38a23dab68613af..14c88c3c3c8aa8e4bbfe6ce04cb51c58af762351 100644
--- a/deploy/Jetson/readme.md
+++ b/deploy/Jetson/readme.md
@@ -1,46 +1,45 @@
+English | [简体中文](readme_ch.md)

-# Jetson部署PaddleOCR模型
+# Jetson Deployment for PaddleOCR

-本节介绍PaddleOCR在Jetson NX、TX2、nano、AGX等系列硬件的部署。
+This section introduces the deployment of PaddleOCR on Jetson NX, TX2, nano, AGX and other series of hardware.

-## 1. 环境准备
+## 1. Prepare Environment

-需要准备一台Jetson开发板,如果需要TensorRT预测,需准备好TensorRT环境,建议使用7.1.3版本的TensorRT;
+You need to prepare a Jetson development board. If you need TensorRT inference, prepare the TensorRT environment as well; TensorRT 7.1.3 is recommended.

-1. Jetson安装PaddlePaddle
+1. Install PaddlePaddle in Jetson

-PaddlePaddle下载[链接](https://www.paddlepaddle.org.cn/inference/user_guides/download_lib.html#python)
-请选择适合的您Jetpack版本、cuda版本、trt版本的安装包。
+Download PaddlePaddle from this [link](https://www.paddlepaddle.org.cn/inference/user_guides/download_lib.html#python).
+Please select the installation package that matches your JetPack, CUDA, and TensorRT versions. Here, we download paddlepaddle_gpu-2.3.0rc0-cp36-cp36m-linux_aarch64.whl.
-安装命令:
+Install PaddlePaddle:
 ```shell
-# 安装paddle,以paddlepaddle_gpu-2.3.0rc0-cp36-cp36m-linux_aarch64.whl 为例
 pip3 install -U paddlepaddle_gpu-2.3.0rc0-cp36-cp36m-linux_aarch64.whl
 ```

-2. 下载PaddleOCR代码并安装依赖
+2. Download PaddleOCR code and install dependencies

-首先 clone PaddleOCR 代码:
+Clone the PaddleOCR code:
 ```
 git clone https://github.com/PaddlePaddle/PaddleOCR
 ```

-然后,安装依赖:
+Then install the dependencies:
 ```
 cd PaddleOCR
 pip3 install -r requirements.txt
 ```

-*注:jetson硬件CPU较差,依赖安装较慢,请耐心等待*
+*Note: The Jetson CPU is relatively weak, so installing the dependencies may take a while; please be patient.*

+## 2. Perform prediction

-## 2. 执行预测

+Obtain the PPOCR models from the model zoo in the [documentation](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/doc/doc_en/ppocr_introduction_en.md#6-model-zoo). The following takes the PP-OCRv3 models as an example to introduce how to use PPOCR models on Jetson:

-从[文档](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/doc/doc_ch/ppocr_introduction.md#6-%E6%A8%A1%E5%9E%8B%E5%BA%93) 模型库中获取PPOCR模型,下面以PP-OCRv3模型为例,介绍在PPOCR模型在jetson上的使用方式:
-
-下载并解压PP-OCRv3模型
+Download and unzip the PP-OCRv3 models.
 ```
 wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar
 wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar
@@ -48,38 +47,38 @@
 tar xf ch_PP-OCRv3_det_infer.tar
 tar xf ch_PP-OCRv3_rec_infer.tar
 ```

-执行文本检测预测:
+Run text detection inference:
 ```
 cd PaddleOCR
 python3 tools/infer/predict_det.py --det_model_dir=./inference/ch_PP-OCRv2_det_infer/ --image_dir=./doc/imgs/french_0.jpg --use_gpu=True
 ```

-执行命令后在终端会打印出预测的信息,并在 `./inference_results/` 下保存可视化结果。
+After executing the command, the predicted information will be printed out in the terminal, and the visualization results will be saved in the `./inference_results/` directory.
 ![](./images/det_res_french_0.jpg)

-执行文本识别预测:
+Run text recognition inference:
 ```
 python3 tools/infer/predict_det.py --rec_model_dir=./inference/ch_PP-OCRv2_rec_infer/ --image_dir=./doc/imgs_words/en/word_2.png --use_gpu=True --rec_image_shape="3,48,320"
 ```

-执行命令后在终端会打印出预测的信息,输出如下:
+After executing the command, the predicted information will be printed on the terminal, and the output is as follows:
 ```
 [2022/04/28 15:41:45] root INFO: Predicts of ./doc/imgs_words/en/word_2.png:('yourself', 0.98084533)
 ```

-执行文本检测+文本识别串联预测:
+Run the cascaded text detection and recognition inference:
 ```
-python3 tools/infer/predict_system.py --det_model_dir=./inference/ch_PP-OCRv2_det_infer/ --rec_model_dir=./inference/ch_PP-OCRv2_rec_infer/ --image_dir=./doc/imgs/ --use_gpu=True --rec_image_shape="3,48,320"
+python3 tools/infer/predict_system.py --det_model_dir=./inference/ch_PP-OCRv2_det_infer/ --rec_model_dir=./inference/ch_PP-OCRv2_rec_infer/ --image_dir=./doc/imgs/00057937.jpg --use_gpu=True --rec_image_shape="3,48,320"
 ```

-执行命令后在终端会打印出预测的信息,并在 `./inference_results/` 下保存可视化结果。
+After executing the command, the predicted information will be printed out in the terminal, and the visualization results will be saved in the `./inference_results/` directory.
 ![](./images/00057937.jpg)

-开启TRT预测只需要在以上命令基础上设置`--use_tensorrt=True`即可:
+To enable TensorRT (TRT) prediction, simply add `--use_tensorrt=True` to the above command:
 ```
-python3 tools/infer/predict_system.py --det_model_dir=./inference/ch_PP-OCRv2_det_infer/ --rec_model_dir=./inference/ch_PP-OCRv2_rec_infer/ --image_dir=./doc/imgs/00057937.jpg --use_gpu=True --use_tensorrt=True --rec_image_shape="3,48,320"
+python3 tools/infer/predict_system.py --det_model_dir=./inference/ch_PP-OCRv2_det_infer/ --rec_model_dir=./inference/ch_PP-OCRv2_rec_infer/ --image_dir=./doc/imgs/ --rec_image_shape="3,48,320" --use_gpu=True --use_tensorrt=True
 ```

-更多ppocr模型预测请参考[文档](../../doc/doc_ch/models_list.md)
+For more PPOCR model predictions, please refer to the [documentation](../../doc/doc_en/models_list_en.md).
diff --git a/deploy/Jetson/readme_ch.md b/deploy/Jetson/readme_ch.md
new file mode 100644
index 0000000000000000000000000000000000000000..7b0a344ffe3b4c0abefc78e50371b5c92aa25001
--- /dev/null
+++ b/deploy/Jetson/readme_ch.md
@@ -0,0 +1,86 @@
+[English](readme.md) | 简体中文
+
+# Jetson部署PaddleOCR模型
+
+本节介绍PaddleOCR在Jetson NX、TX2、nano、AGX等系列硬件的部署。
+
+
+## 1. 环境准备
+
+需要准备一台Jetson开发板,如果需要TensorRT预测,需准备好TensorRT环境,建议使用7.1.3版本的TensorRT;
+
+1. Jetson安装PaddlePaddle
+
+PaddlePaddle下载[链接](https://www.paddlepaddle.org.cn/inference/user_guides/download_lib.html#python)
+请选择适合您的Jetpack版本、cuda版本、trt版本的安装包。
+
+安装命令:
+```shell
+# 安装paddle,以paddlepaddle_gpu-2.3.0rc0-cp36-cp36m-linux_aarch64.whl 为例
+pip3 install -U paddlepaddle_gpu-2.3.0rc0-cp36-cp36m-linux_aarch64.whl
+```
+
+
+2. 下载PaddleOCR代码并安装依赖
+
+首先 clone PaddleOCR 代码:
+```
+git clone https://github.com/PaddlePaddle/PaddleOCR
+```
+
+然后,安装依赖:
+```
+cd PaddleOCR
+pip3 install -r requirements.txt
+```
+
+*注:jetson硬件CPU较差,依赖安装较慢,请耐心等待*
+
+
+## 2. 执行预测
+
+从[文档](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/doc/doc_ch/ppocr_introduction.md#6-%E6%A8%A1%E5%9E%8B%E5%BA%93) 模型库中获取PPOCR模型,下面以PP-OCRv3模型为例,介绍PPOCR模型在Jetson上的使用方式:
+
+下载并解压PP-OCRv3模型
+```
+wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar
+wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar
+tar xf ch_PP-OCRv3_det_infer.tar
+tar xf ch_PP-OCRv3_rec_infer.tar
+```
+
+执行文本检测预测:
+```
+cd PaddleOCR
+python3 tools/infer/predict_det.py --det_model_dir=./inference/ch_PP-OCRv2_det_infer/ --image_dir=./doc/imgs/french_0.jpg --use_gpu=True
+```
+
+执行命令后在终端会打印出预测的信息,并在 `./inference_results/` 下保存可视化结果。
+![](./images/det_res_french_0.jpg)
+
+
+执行文本识别预测:
+```
+python3 tools/infer/predict_rec.py --rec_model_dir=./inference/ch_PP-OCRv2_rec_infer/ --image_dir=./doc/imgs_words/en/word_2.png --use_gpu=True --rec_image_shape="3,48,320"
+```
+
+执行命令后在终端会打印出预测的信息,输出如下:
+```
+[2022/04/28 15:41:45] root INFO: Predicts of ./doc/imgs_words/en/word_2.png:('yourself', 0.98084533)
+```
+
+执行文本检测+文本识别串联预测:
+
+```
+python3 tools/infer/predict_system.py --det_model_dir=./inference/ch_PP-OCRv2_det_infer/ --rec_model_dir=./inference/ch_PP-OCRv2_rec_infer/ --image_dir=./doc/imgs/ --use_gpu=True --rec_image_shape="3,48,320"
+```
+
+执行命令后在终端会打印出预测的信息,并在 `./inference_results/` 下保存可视化结果。
+![](./images/00057937.jpg)
+
+开启TRT预测只需要在以上命令基础上设置`--use_tensorrt=True`即可:
+```
+python3 tools/infer/predict_system.py --det_model_dir=./inference/ch_PP-OCRv2_det_infer/ --rec_model_dir=./inference/ch_PP-OCRv2_rec_infer/ --image_dir=./doc/imgs/00057937.jpg --use_gpu=True --use_tensorrt=True --rec_image_shape="3,48,320"
+```
+
+更多ppocr模型预测请参考[文档](../../doc/doc_ch/models_list.md)
diff --git a/deploy/Jetson/readme_en.md b/deploy/Jetson/readme_en.md
deleted file mode 100644
index d499989160e13d6dd6bf1092c890e8ec11681ce4..0000000000000000000000000000000000000000
--- a/deploy/Jetson/readme_en.md
+++ /dev/null
@@ -1,83 +0,0 @@
-
-# Jetson Deployment for PaddleOCR
-
-This section introduces the deployment of PaddleOCR on Jetson NX, TX2, nano, AGX and other series of hardware.
-
-
-## 1. Prepare Environment
-
-You need to prepare a Jetson development hardware. If you need TensorRT, you need to prepare the TensorRT environment. It is recommended to use TensorRT version 7.1.3;
-
-1. Install PaddlePaddle in Jetson
-
-The PaddlePaddle download [link](https://www.paddlepaddle.org.cn/inference/user_guides/download_lib.html#python)
-Please select the appropriate installation package for your Jetpack version, cuda version, and trt version. Here, we download paddlepaddle_gpu-2.3.0rc0-cp36-cp36m-linux_aarch64.whl.
-
-Install PaddlePaddle:
-```shell
-pip3 install -U paddlepaddle_gpu-2.3.0rc0-cp36-cp36m-linux_aarch64.whl
-```
-
-
-2. Download PaddleOCR code and install dependencies
-
-Clone the PaddleOCR code:
-```
-git clone https://github.com/PaddlePaddle/PaddleOCR
-```
-
-and install dependencies:
-```
-cd PaddleOCR
-pip3 install -r requirements.txt
-```
-
-*Note: Jetson hardware CPU is poor, dependency installation is slow, please wait patiently*
-
-## 2. Perform prediction
-
-Obtain the PPOCR model from the [document](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/doc/doc_en/ppocr_introduction_en.md#6-model-zoo) model library. The following takes the PP-OCRv3 model as an example to introduce the use of the PPOCR model on Jetson:
-
-Download and unzip the PP-OCRv3 models.
-``` -wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar -wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar -tar xf ch_PP-OCRv3_det_infer.tar -tar xf ch_PP-OCRv3_rec_infer.tar -``` - -The text detection inference: -``` -cd PaddleOCR -python3 tools/infer/predict_det.py --det_model_dir=./inference/ch_PP-OCRv2_det_infer/ --image_dir=./doc/imgs/french_0.jpg --use_gpu=True -``` - -After executing the command, the predicted information will be printed out in the terminal, and the visualization results will be saved in the `./inference_results/` directory. -![](./images/det_res_french_0.jpg) - - -The text recognition inference: -``` -python3 tools/infer/predict_det.py --rec_model_dir=./inference/ch_PP-OCRv2_rec_infer/ --image_dir=./doc/imgs_words/en/word_2.png --use_gpu=True --rec_image_shape="3,48,320" -``` - -After executing the command, the predicted information will be printed on the terminal, and the output is as follows: -``` -[2022/04/28 15:41:45] root INFO: Predicts of ./doc/imgs_words/en/word_2.png:('yourself', 0.98084533) -``` - -The text detection and text recognition inference: - -``` -python3 tools/infer/predict_system.py --det_model_dir=./inference/ch_PP-OCRv2_det_infer/ --rec_model_dir=./inference/ch_PP-OCRv2_rec_infer/ --image_dir=./doc/imgs/00057937.jpg --use_gpu=True --rec_image_shape="3,48,320" -``` - -After executing the command, the predicted information will be printed out in the terminal, and the visualization results will be saved in the `./inference_results/` directory. -![](./images/00057937.jpg) - -To enable TRT prediction, you only need to set `--use_tensorrt=True` on the basis of the above command: -``` -python3 tools/infer/predict_system.py --det_model_dir=./inference/ch_PP-OCRv2_det_infer/ --rec_model_dir=./inference/ch_PP-OCRv2_rec_infer/ --image_dir=./doc/imgs/ --rec_image_shape="3,48,320" --use_gpu=True --use_tensorrt=True -``` - -For more ppocr model predictions, please refer to[document](../../doc/doc_en/models_list_en.md) diff --git a/deploy/README.md b/deploy/README.md index 3a77ff22a15b624ace56027f606eff61e9efb66c..0cfb793f92547de1215277f5e2348bc157ec8503 100644 --- a/deploy/README.md +++ b/deploy/README.md @@ -25,7 +25,7 @@ PP-OCR has supported muti deployment schemes. 
Click the link to get the specific - [Serving (Python/C++)](./pdserving/README.md) - [Paddle-Lite (ARM CPU/OpenCL ARM GPU)](./lite/readme.md) - [Paddle.js](./paddlejs/README.md) -- [Jetson Inference]() +- [Jetson Inference](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/deploy/Jetson/readme.md) - [Paddle2ONNX](./paddle2onnx/readme.md) If you need the deployment tutorial of academic algorithm models other than PP-OCR, please directly enter the main page of corresponding algorithms, [entrance](../doc/doc_en/algorithm_overview_en.md)。 diff --git a/deploy/README_ch.md b/deploy/README_ch.md index 9ec90bd36a05a53ae0f27386f6d2eba798e40842..1773aedc2c757499bfc1f950e468a98c477636ac 100644 --- a/deploy/README_ch.md +++ b/deploy/README_ch.md @@ -25,7 +25,7 @@ PP-OCR模型已打通多种场景部署方案,点击链接获取具体的使 - [Serving 服务化部署(Python/C++)](./pdserving/README_CN.md) - [Paddle-Lite 端侧部署(ARM CPU/OpenCL ARM GPU)](./lite/readme_ch.md) - [Paddle.js 部署](./paddlejs/README_ch.md) -- [Jetson 推理]() +- [Jetson 推理](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/deploy/Jetson/readme_ch.md) - [Paddle2ONNX 推理](./paddle2onnx/readme_ch.md) -需要PP-OCR以外的学术算法模型的推理部署,请直接进入相应算法主页面,[入口](../doc/doc_ch/algorithm_overview.md)。 +需要PP-OCR以外的学术算法模型的推理部署,请直接进入相应算法主页面,[入口](../doc/doc_ch/algorithm_overview.md)。 \ No newline at end of file diff --git a/deploy/pdserving/config.yml b/deploy/pdserving/config.yml index 6e30a626d0cdb0b4e5fe6feb737ea46c2bc59f90..19cd9325ee8b241fd591678b9ba6452de9bec025 100644 --- a/deploy/pdserving/config.yml +++ b/deploy/pdserving/config.yml @@ -37,7 +37,7 @@ op: model_config: ./ppocr_det_v3_serving #Fetch结果列表,以client_config中fetch_var的alias_name为准 - fetch_list: ["save_infer_model/scale_0.tmp_1"] + fetch_list: ["sigmoid_0.tmp_0"] #计算硬件ID,当devices为""或不写时为CPU预测;当devices为"0", "0,1,2"时为GPU预测,表示使用的GPU卡 devices: "0" diff --git a/deploy/pdserving/web_service.py b/deploy/pdserving/web_service.py index 07fd6102beaef4001f87574a2f0631e2b1012613..98e2dfba2f5abd3fc36bf3743b23f7eb7be3b9c4 100644 --- a/deploy/pdserving/web_service.py +++ b/deploy/pdserving/web_service.py @@ -56,7 +56,7 @@ class DetOp(Op): return {"x": det_img[np.newaxis, :].copy()}, False, None, "" def postprocess(self, input_dicts, fetch_dict, data_id, log_id): - det_out = fetch_dict["save_infer_model/scale_0.tmp_1"] + det_out = fetch_dict["sigmoid_0.tmp_0"] ratio_list = [ float(self.new_h) / self.ori_h, float(self.new_w) / self.ori_w ] diff --git a/deploy/pdserving/web_service_det.py b/deploy/pdserving/web_service_det.py index 0ca8dbc41bbdde4caf76bcfddabe4b9c2e94cb4b..7584608a9fed4bea93caa5c814c0450566696d56 100644 --- a/deploy/pdserving/web_service_det.py +++ b/deploy/pdserving/web_service_det.py @@ -55,7 +55,7 @@ class DetOp(Op): return {"x": det_img[np.newaxis, :].copy()}, False, None, "" def postprocess(self, input_dicts, fetch_dict, data_id, log_id): - det_out = fetch_dict["save_infer_model/scale_0.tmp_1"] + det_out = fetch_dict["sigmoid_0.tmp_0"] ratio_list = [ float(self.new_h) / self.ori_h, float(self.new_w) / self.ori_w ] diff --git a/doc/doc_ch/PP-OCRv3_introduction.md b/doc/doc_ch/PP-OCRv3_introduction.md index f65ee7ccf9cf813d20d9419cf3b8e62c22eeabf0..dc0271f294cf43a26477dbc974b77297e04122ac 100644 --- a/doc/doc_ch/PP-OCRv3_introduction.md +++ b/doc/doc_ch/PP-OCRv3_introduction.md @@ -1,38 +1,27 @@ [English](../doc_en/PP-OCRv3_introduction_en.md) | 简体中文 -# PP-OCR +# PP-OCRv3 - [1. 简介](#1) -- [2. 特性](#2) -- [3. benchmark](#3) +- [2. 检测优化](#2) +- [3. 识别优化](#3) +- [4. 端到端评估](#4) ## 1. 
简介

-PP-OCR是PaddleOCR自研的实用的超轻量OCR系统。在实现[前沿算法](algorithm.md)的基础上,考虑精度与速度的平衡,进行**模型瘦身**和**深度优化**,使其尽可能满足产业落地需求。
+PP-OCRv3在PP-OCRv2的基础上进一步升级。检测模型仍然基于DB算法,优化策略采用了带残差注意力机制的FPN结构RSEFPN、增大感受野的PAN结构LKPAN、基于DML训练的更优的教师模型;识别模型将base模型从CRNN替换成了IJCAI 2022论文[SVTR](https://arxiv.org/abs/2205.00159),并采用SVTR轻量化、带指导训练CTC、数据增广策略RecConAug、自监督训练的更好的预训练模型、无标签数据的使用进行模型加速和效果提升。更多细节请参考PP-OCRv3[技术报告](./PP-OCRv3_introduction.md)。

-#### PP-OCR
-
-PP-OCR是一个两阶段的OCR系统,其中文本检测算法选用[DB](algorithm_det_db.md),文本识别算法选用[CRNN](algorithm_rec_crnn.md),并在检测和识别模块之间添加[文本方向分类器](angle_class.md),以应对不同方向的文本识别。
-
-PP-OCRv2系统pipeline如下:
+PP-OCRv3系统pipeline如下:

<div align="center">
-    (图:PP-OCRv2系统pipeline)
+    (图:PP-OCRv3系统pipeline)
</div>
-
-PP-OCR系统在持续迭代优化,目前已发布PP-OCR、PP-OCRv2、PP-OCRv3三个版本:
-
-PP-OCR从骨干网络选择和调整、预测头部的设计、数据增强、学习率变换策略、正则化参数选择、预训练模型使用以及模型自动裁剪量化8个方面,采用19个有效策略,对各个模块的模型进行效果调优和瘦身(如绿框所示),最终得到整体大小为3.5M的超轻量中英文OCR和2.8M的英文数字OCR。更多细节请参考PP-OCR技术方案 https://arxiv.org/abs/2009.09941
-
-
-## PP-OCRv3策略简介
-
-
-### PP-OCRv3文本检测模型优化策略
+
+## 2. 检测优化

 PP-OCRv3采用PP-OCRv2的[CML](https://arxiv.org/pdf/2109.03144.pdf)蒸馏策略,在蒸馏的student模型、teacher模型精度提升,CML蒸馏策略上分别做了优化。

@@ -44,10 +44,11 @@
 RSEFPN的网络结构如下图所示,RSEFPN在PP-OCRv2的FPN基础上,将FPN中的卷积层更换为了通道注意力结构的RSEConv层。

 - RSEFPN将PP-OCR检测模型的精度hmean从81.3%提升到84.5%。模型大小从3M变为3.6M。
-
-- 在蒸馏的teacher模型精度提升方面,提出了LKPAN结构替换PP-OCRv2的FPN结构,并且使用ResNet50作为Backbone,更大的模型带来更多的精度提升。另外,对teacher模型使用[DML](https://arxiv.org/abs/1706.00384)蒸馏策略进一步提升teacher模型的精度。最终teacher的模型指标hmean从83.2%提升到了86.0%。
+
+*注:PP-OCRv2的FPN通道数仅为96和24,如果直接用SE模块代替FPN的卷积会导致精度下降,RSEConv引入残差结构可以防止训练中包含重要特征的通道被抑制。*
+
+- 在蒸馏的teacher模型精度提升方面,提出了LKPAN结构替换PP-OCRv2的FPN结构,并且使用ResNet50作为Backbone,更大的模型带来更多的精度提升。另外,对teacher模型使用[DML](https://arxiv.org/abs/1706.00384)蒸馏策略进一步提升teacher模型的精度。最终teacher的模型指标相比ppocr_server_v2.0从83.2%提升到了86.0%。

 *注:[PP-OCRv2的FPN结构](https://github.com/PaddlePaddle/PaddleOCR/blob/77acb3bfe51c8a46c684527f73cd218cefedb4a3/ppocr/modeling/necks/db_fpn.py#L107)对DB算法FPN结构做了轻量级设计*

@@ -57,7 +47,7 @@
 LKPAN的网络结构如下图所示:

-LKPAN(Large Kernel PAN)是一个具有更大感受野的轻量级[PAN](https://arxiv.org/pdf/1803.01534.pdf)结构。在LKPAN的path augmentation中,使用kernel size为`9*9`的卷积;更大的kernel size意味着更大的感受野,更容易检测大字体的文字以及极端长宽比的文字。LKPAN将base检测模型的精度hmean从81.3%提升到84.9%。
+LKPAN(Large Kernel PAN)是一个具有更大感受野的轻量级[PAN](https://arxiv.org/pdf/1803.01534.pdf)结构。在LKPAN的path augmentation中,使用kernel size为`9*9`的卷积;更大的kernel size意味着更大的感受野,更容易检测大字体的文字以及极端长宽比的文字。LKPAN将PP-OCR检测模型的精度hmean从81.3%提升到84.9%。

 *注:LKPAN相比RSEFPN有更多的精度提升,但是考虑到模型大小和预测速度等因素,在student模型中使用RSEFPN。*

@@ -71,23 +61,70 @@ LKPAN(Large Kernel PAN)是一个具有更大感受野的轻量级[PAN](https://a
 |1|PP-OCRV2|3M|83.3%|117ms|
 |2|0 + RESFPN|3.6M|84.5%|124ms|
 |3|0 + LKPAN|4.6M|84.9%|156ms|
-|4|teacher |124M|83.2%|-|
-|5|teacher + DML + LKPAN|124M|86.0%|-|
+|4|ppocr_server_v2.0 |124M|83.2%|171ms|
+|5|teacher + DML + LKPAN|124M|86.0%|396ms|
 |6|0 + 2 + 5 + CML|3.6M|85.4%|124ms|

-
-## 2. 特性
+
+## 3. 识别优化

-- 超轻量PP-OCRv2系列:检测(3.1M)+ 方向分类器(1.4M)+ 识别(8.5M)= 13.0M
-- 超轻量PP-OCR mobile移动端系列:检测(3.0M)+方向分类器(1.4M)+ 识别(5.0M)= 9.4M
-- 通用PP-OCR server系列:检测(47.1M)+方向分类器(1.4M)+ 识别(94.9M)= 143.4M
-- 支持中英文数字组合识别、竖排文本识别、长文本识别
-- 支持多语言识别:韩语、日语、德语、法语等约80种语言
+[SVTR](https://arxiv.org/abs/2205.00159) 证明了强大的单视觉模型(无需序列模型)即可高效准确完成文本识别任务,在中英文数据上均有优秀的表现。经过实验验证,SVTR_Tiny在自建的 [中文数据集上](https://arxiv.org/abs/2109.03144) ,识别精度可以提升10.7%,网络结构如下所示:
+
-
-## 3. benchmark
+由于 MKLDNN 加速库支持的模型结构有限,SVTR 在CPU+MKLDNN上相比PP-OCRv2慢了10倍。
+
+PP-OCRv3 期望在提升模型精度的同时,不带来额外的推理耗时。通过分析发现,SVTR_Tiny结构的主要耗时模块为Mixing Block,因此我们对 SVTR_Tiny 的结构进行了一系列优化(详细速度数据请参考下方消融实验表格):
+
+1. 将SVTR网络前半部分替换为PP-LCNet的前三个stage,保留4个 Global Mixing Block ,精度为76%,加速69%,网络结构如下所示:
+
+2. 将4个 Global Attention Block 减小到2个,精度为72.9%,加速69%,网络结构如下所示:
+
+3. 实验发现 Global Attention 的预测速度与输入其特征的shape有关,因此后移Global Mixing Block的位置到池化层之后,精度下降为71.9%,速度超越 CNN-based 的PP-OCRv2 22%,网络结构如下所示:
+
+
+为了提升模型精度同时不引入额外推理成本,PP-OCRv3参考GTC策略,使用Attention监督CTC训练,预测时完全去除Attention模块,在推理阶段不增加任何耗时,精度提升3.8%,训练流程如下所示:
+
+在训练策略方面,PP-OCRv3参考 [SSL](https://github.com/ku21fan/STR-Fewer-Labels) 设计了文本方向任务,训练了适用于文本识别的预训练模型,加速模型收敛过程,精度提升了0.6%; 使用UDML蒸馏策略,进一步提升精度1.5%,训练流程如下所示:
+
+
+
+数据增强方面:
+
+1. 基于 [ConCLR](https://www.cse.cuhk.edu.hk/~byu/papers/C139-AAAI2022-ConCLR.pdf) 中的ConAug方法,设计了 RecConAug 数据增强方法,增强数据多样性,精度提升0.5%,增强可视化效果如下所示:
+
+2. 使用训练好的 SVTR_large 预测 120W 的 lsvt 无标注数据,取出其中得分大于0.95的数据,共得到81W识别数据加入到PP-OCRv3的训练数据中,精度提升1%。
+
+总体来讲PP-OCRv3识别从网络结构、训练策略、数据增强三个方向做了进一步优化:
+
+- 网络结构上:考虑[SVTR](https://arxiv.org/abs/2205.00159) 在中英文效果上的优越性,采用SVTR_Tiny作为base,选取Global Mixing Block和卷积组合提取特征,并将Global Mixing Block位置后移进行加速; 参考 [GTC](https://arxiv.org/pdf/2002.01276.pdf) 策略,使用注意力机制模块指导CTC训练,定位和识别字符,提升不规则文本的识别精度。
+- 训练策略上:参考 [SSL](https://github.com/ku21fan/STR-Fewer-Labels) 设计了方向分类前序任务,获取更优预训练模型,加速模型收敛过程,提升精度; 使用UDML蒸馏策略、监督attention、ctc两个分支得到更优模型。
+- 数据增强上:基于 [ConCLR](https://www.cse.cuhk.edu.hk/~byu/papers/C139-AAAI2022-ConCLR.pdf) 中的ConAug方法,改进得到 RecConAug 数据增广方法,支持随机结合任意多张图片,提升训练数据的上下文信息丰富度,增强模型鲁棒性;使用 SVTR_large 预测无标签数据,向训练集中补充81w高质量真实数据。
+
+基于上述策略,PP-OCRv3识别模型相比PP-OCRv2,在速度可比的情况下,精度进一步提升4.5%。具体消融实验如下所示:
+
+实验细节:
+
+| id | 策略 | 模型大小 | 精度 | 速度(cpu + mkldnn)|
+|-----|-----|--------|----| --- |
+| 01 | PP-OCRv2 | 8M | 69.3% | 8.54ms |
+| 02 | SVTR_Tiny | 21M | 80.1% | 97ms |
+| 03 | LCNet_SVTR_G4 | 9.2M | 76% | 30ms |
+| 04 | LCNet_SVTR_G2 | 13M | 72.98% | 9.37ms |
+| 05 | PP-OCRv3 | 12M | 71.9% | 6.6ms |
+| 06 | + large input_shape | 12M | 73.98% | 7.6ms |
+| 07 | + GTC | 12M | 75.8% | 7.6ms |
+| 08 | + RecConAug | 12M | 76.3% | 7.6ms |
+| 09 | + SSL pretrain | 12M | 76.9% | 7.6ms |
+| 10 | + UDML | 12M | 78.4% | 7.6ms |
+| 11 | + unlabeled data | 12M | 79.4% | 7.6ms |
+
+注: 测试速度时,实验01-05输入图片尺寸均为(3,32,320),06-11输入图片尺寸均为(3,48,320)
-
-关于PP-OCR系列模型之间的性能对比,请查看[benchmark](./benchmark.md)文档。
+
+## 4. 端到端评估
diff --git a/doc/doc_ch/models_list.md b/doc/doc_ch/models_list.md
index e0356e89e7c1c9998c953f104891a2f4b577e2de..b7a93af5d9cc50c89d626e557d74b6d848125ccc 100644
--- a/doc/doc_ch/models_list.md
+++ b/doc/doc_ch/models_list.md
@@ -81,7 +81,7 @@ PaddleOCR提供的可下载模型包括`推理模型`、`训练模型`、`预训
 |模型名称|模型简介|配置文件|推理模型大小|下载地址|
 | --- | --- | --- | --- | --- |
-|ch_PP-OCRv3_rec_slim |【最新】slim量化版超轻量模型,支持中英文、数字识别|[ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml)| 4.9M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/ch/ch_PP-OCRv3_rec_slim_train.tar) / [slim模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.nb) |
+|ch_PP-OCRv3_rec_slim |【最新】slim量化版超轻量模型,支持中英文、数字识别|[ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml)| 4.9M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_train.tar) / [slim模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.nb) |
 |ch_PP-OCRv3_rec|【最新】原始超轻量模型,支持中英文、数字识别|[ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml)| 12.4M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_train.tar) |
 |ch_PP-OCRv2_rec_slim| slim量化版超轻量模型,支持中英文、数字识别|[ch_PP-OCRv2_rec.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec.yml)| 9M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_train.tar) |
 |ch_PP-OCRv2_rec| 原始超轻量模型,支持中英文、数字识别|[ch_PP-OCRv2_rec_distillation.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec_distillation.yml)|8.5M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer.tar) /
[训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_train.tar) | @@ -96,7 +96,7 @@ PaddleOCR提供的可下载模型包括`推理模型`、`训练模型`、`预训 |模型名称|模型简介|配置文件|推理模型大小|下载地址| | --- | --- | --- | --- | --- | -|en_PP-OCRv3_rec_slim |【最新】slim量化版超轻量模型,支持英文、数字识别 | [en_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml)| - |[推理模型(coming soon)](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_slim_infer.tar) / [训练模型(coming soon)](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_slim_train.tar) / [slim模型(coming soon)](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_slim_infer.nb) | +|en_PP-OCRv3_rec_slim |【最新】slim量化版超轻量模型,支持英文、数字识别 | [en_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml)| - |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_slim_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_slim_train.tar) / [slim模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_slim_infer.nb) | |ch_PP-OCRv3_rec |【最新】原始超轻量模型,支持英文、数字识别|[en_PP-OCRv3_rec.yml](../../configs/rec/en_PP-OCRv3/en_PP-OCRv3_rec.yml)| 9.6M | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_train.tar) | |en_number_mobile_slim_v2.0_rec|slim裁剪量化版超轻量模型,支持英文、数字识别|[rec_en_number_lite_train.yml](../../configs/rec/multi_language/rec_en_number_lite_train.yml)| 2.7M | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/en_number_mobile_v2.0_rec_slim_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/en_number_mobile_v2.0_rec_slim_train.tar) | |en_number_mobile_v2.0_rec|原始超轻量模型,支持英文、数字识别|[rec_en_number_lite_train.yml](../../configs/rec/multi_language/rec_en_number_lite_train.yml)|2.6M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/en_number_mobile_v2.0_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/en_number_mobile_v2.0_rec_train.tar) | @@ -107,18 +107,17 @@ PaddleOCR提供的可下载模型包括`推理模型`、`训练模型`、`预训 |模型名称|字典文件|模型简介|配置文件|推理模型大小|下载地址| | --- | --- | --- | --- |--- | --- | -| french_mobile_v2.0_rec | ppocr/utils/dict/french_dict.txt |法文识别|[rec_french_lite_train.yml](../../configs/rec/multi_language/rec_french_lite_train.yml)|2.65M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/french_mobile_v2.0_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/french_mobile_v2.0_rec_train.tar) | -| german_mobile_v2.0_rec | ppocr/utils/dict/german_dict.txt |德文识别|[rec_german_lite_train.yml](../../configs/rec/multi_language/rec_german_lite_train.yml)|2.65M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/german_mobile_v2.0_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/german_mobile_v2.0_rec_train.tar) | -| korean_mobile_v2.0_rec | ppocr/utils/dict/korean_dict.txt |韩文识别|[rec_korean_lite_train.yml](../../configs/rec/multi_language/rec_korean_lite_train.yml)|3.9M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/korean_mobile_v2.0_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/korean_mobile_v2.0_rec_train.tar) | -| japan_mobile_v2.0_rec | ppocr/utils/dict/japan_dict.txt |日文识别|[rec_japan_lite_train.yml](../../configs/rec/multi_language/rec_japan_lite_train.yml)|4.23M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/japan_mobile_v2.0_rec_infer.tar) / 
[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/japan_mobile_v2.0_rec_train.tar) | -| chinese_cht_mobile_v2.0_rec | ppocr/utils/dict/chinese_cht_dict.txt | 中文繁体识别|rec_chinese_cht_lite_train.yml|5.63M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/chinese_cht_mobile_v2.0_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/chinese_cht_mobile_v2.0_rec_train.tar) | -| te_mobile_v2.0_rec | ppocr/utils/dict/te_dict.txt | 泰卢固文识别|rec_te_lite_train.yml|2.63M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/te_mobile_v2.0_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/te_mobile_v2.0_rec_train.tar) | -| ka_mobile_v2.0_rec | ppocr/utils/dict/ka_dict.txt |卡纳达文识别|rec_ka_lite_train.yml|2.63M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/ka_mobile_v2.0_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/ka_mobile_v2.0_rec_train.tar) | -| ta_mobile_v2.0_rec | ppocr/utils/dict/ta_dict.txt |泰米尔文识别|rec_ta_lite_train.yml|2.63M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/ta_mobile_v2.0_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/ta_mobile_v2.0_rec_train.tar) | -| latin_mobile_v2.0_rec | ppocr/utils/dict/latin_dict.txt | 拉丁文识别 | [rec_latin_lite_train.yml](../../configs/rec/multi_language/rec_latin_lite_train.yml) |2.6M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/latin_ppocr_mobile_v2.0_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/latin_ppocr_mobile_v2.0_rec_train.tar) | -| arabic_mobile_v2.0_rec | ppocr/utils/dict/arabic_dict.txt | 阿拉伯字母 | [rec_arabic_lite_train.yml](../../configs/rec/multi_language/rec_arabic_lite_train.yml) |2.6M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/arabic_ppocr_mobile_v2.0_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/arabic_ppocr_mobile_v2.0_rec_train.tar) | -| cyrillic_mobile_v2.0_rec | ppocr/utils/dict/cyrillic_dict.txt | 斯拉夫字母 | [rec_cyrillic_lite_train.yml](../../configs/rec/multi_language/rec_cyrillic_lite_train.yml) |2.6M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/cyrillic_ppocr_mobile_v2.0_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/cyrillic_ppocr_mobile_v2.0_rec_train.tar) | -| devanagari_mobile_v2.0_rec | ppocr/utils/dict/devanagari_dict.txt |梵文字母 | [rec_devanagari_lite_train.yml](../../configs/rec/multi_language/rec_devanagari_lite_train.yml) |2.6M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/devanagari_ppocr_mobile_v2.0_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/devanagari_ppocr_mobile_v2.0_rec_train.tar) | +| korean_PP-OCRv3_rec | ppocr/utils/dict/korean_dict.txt |韩文识别|[korean_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/korean_PP-OCRv3_rec.yml)|11M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/korean_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/korean_PP-OCRv3_rec_train.tar) | +| japan_PP-OCRv3_rec | ppocr/utils/dict/japan_dict.txt |日文识别|[japan_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/japan_PP-OCRv3_rec.yml)|11M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/japan_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/japan_PP-OCRv3_rec_train.tar) | +| chinese_cht_PP-OCRv3_rec | 
ppocr/utils/dict/chinese_cht_dict.txt | 中文繁体识别|[chinese_cht_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/chinese_cht_PP-OCRv3_rec.yml)|12M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/chinese_cht_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/chinese_cht_PP-OCRv3_rec_train.tar) | +| te_PP-OCRv3_rec | ppocr/utils/dict/te_dict.txt | 泰卢固文识别|[te_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/te_PP-OCRv3_rec.yml)|9.6M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/te_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/te_PP-OCRv3_rec_train.tar) | +| ka_PP-OCRv3_rec | ppocr/utils/dict/ka_dict.txt |卡纳达文识别|[ka_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/ka_PP-OCRv3_rec.yml)|9.9M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/ka_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/ka_PP-OCRv3_rec_train.tar) | +| ta_PP-OCRv3_rec | ppocr/utils/dict/ta_dict.txt |泰米尔文识别|[ta_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/ta_PP-OCRv3_rec.yml)|9.6M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/ta_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/ta_PP-OCRv3_rec_train.tar) | +| latin_PP-OCRv3_rec | ppocr/utils/dict/latin_dict.txt | 拉丁文识别 | [latin_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/latin_PP-OCRv3_rec.yml) |9.6M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/latin_ppocr_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/latin_ppocr_PP-OCRv3_rec_train.tar) | +| arabic_PP-OCRv3_rec | ppocr/utils/dict/arabic_dict.txt | 阿拉伯字母 | [arabic_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/arabic_PP-OCRv3_rec.yml) |9.6M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/arabic_ppocr_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/arabic_ppocr_PP-OCRv3_rec_train.tar) | +| cyrillic_PP-OCRv3_rec | ppocr/utils/dict/cyrillic_dict.txt | 斯拉夫字母 | [cyrillic_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/cyrillic_PP-OCRv3_rec.yml) |9.6M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/cyrillic_ppocr_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/cyrillic_ppocr_PP-OCRv3_rec_train.tar) | +| devanagari_PP-OCRv3_rec | ppocr/utils/dict/devanagari_dict.txt |梵文字母 | [devanagari_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/devanagari_PP-OCRv3_rec.yml) |9.6M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/devanagari_ppocr_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/devanagari_ppocr_PP-OCRv3_rec_train.tar) | + 更多支持语种请参考: [多语言模型](./multi_languages.md) diff --git a/doc/doc_ch/ppocr_introduction.md b/doc/doc_ch/ppocr_introduction.md index c316302649664727479cb88a8f054e4f898352c8..6527c5803b3135bda922b5478ebe9ddbbb9ae0d9 100644 --- a/doc/doc_ch/ppocr_introduction.md +++ b/doc/doc_ch/ppocr_introduction.md @@ -34,48 +34,23 @@ PP-OCR从骨干网络选择和调整、预测头部的设计、数据增强、 #### PP-OCRv2 -PP-OCRv2在PP-OCR的基础上,进一步在5个方面重点优化,检测模型采用CML协同互学习知识蒸馏策略和CopyPaste数据增广策略;识别模型采用LCNet轻量级骨干网络、UDML 改进知识蒸馏策略和[Enhanced CTC loss](./doc/doc_ch/enhanced_ctc_loss.md)损失函数改进(如上图红框所示),进一步在推理速度和预测效果上取得明显提升。更多细节请参考PP-OCRv2[技术报告](https://arxiv.org/abs/2109.03144)。 +PP-OCRv2在PP-OCR的基础上,进一步在5个方面重点优化,检测模型采用CML协同互学习知识蒸馏策略和CopyPaste数据增广策略;识别模型采用LCNet轻量级骨干网络、UDML 
改进知识蒸馏策略和[Enhanced CTC loss](./enhanced_ctc_loss.md)损失函数改进(如上图红框所示),进一步在推理速度和预测效果上取得明显提升。更多细节请参考PP-OCRv2[技术报告](https://arxiv.org/abs/2109.03144)。 #### PP-OCRv3 -PP-OCRv3在PP-OCRv2的基础上进一步升级。 -PP-OCRv3文本检测从网络结构、蒸馏训练策略两个方向做了进一步优化: -- 网络结构改进:提出两种改进后的FPN网络结构,RSEFPN,LKPAN,分别从channel attention、更大感受野的角度优化FPN中的特征,优化FPN提取的特征。 -- 蒸馏训练策略:首先,以resnet50作为backbone,改进后的LKPAN网络结构作为FPN,使用DML自蒸馏策略得到精度更高的teacher模型;然后,student模型FPN部分采用RSEFPN,采用PPOCRV2提出的CML蒸馏方法蒸馏,在训练过程中,动态调整CML蒸馏teacher loss的占比。 - -|序号|策略|模型大小|hmean|Intel Gold 6148CPU+mkldnn预测耗时| -|-|-|-|-|-| -|0|ppocr_mobile|3M|81.3|117ms| -|1|PPOCRV2|3M|83.3|117ms| -|2|teacher DML|124M|86.0|-| -|3|1 + 2 + RESFPN|3.6M|85.4|124ms| -|4|1 + 2 + LKPAN|4.6M|86.0|156ms| - - -PP-OCRv3识别从网络结构、训练策略、数据增强三个方向做了进一步优化: -- 网络结构上:使用[SVTR](todo:add_link)中的 Transformer block 替换LSTM,提升模型精度和预测速度; -- 训练策略上:参考 [GTC](https://arxiv.org/pdf/2002.01276.pdf) 策略,使用注意力机制模块指导CTC训练,定位和识别字符,提升不规则文本的识别精度;设计方向分类前序任务,获取更优预训练模型,加速模型收敛过程,提升精度。 -- 数据增强上:使用[RecConAug](todo:add_link)数据增广方法,随机结合图片,提升训练数据的上下文信息丰富度,增强模型鲁棒性。 - -基于上述策略,PP-OCRv3识别模型相比上一版本,速度加速30%,精度进一步提升4.5%。 具体消融实验: - -| id | 策略 | 模型大小 | 精度 | CPU+mkldnn 预测耗时 | -|-----|-----|--------|----|------------| -| 01 | PP-OCRv2 | 8M | 69.3% | 26ms | -| 02 | SVTR_tiny | 19M | 80.1% | - | -| 03 | LCNet_SVTR_G6 | 8.2M | 76% | - | -| 04 | LCNet_SVTR_G1 | - | - | - | -| 05 | PP-OCRv3 | 12M | 71.9% | 19ms | -| 06 | + GTC | 12M | 75.8% | 19ms | -| 07 | + RecConAug | 12M | 76.3% | 19ms | -| 08 | + SSL pretrain | 12M | 76.9% | 19ms | -| 09 | + UDML | 12M | 78.4% | 19ms | -| 10 | + unlabeled data | 12M | 79.4% | 19ms | +PP-OCRv3在PP-OCRv2的基础上进一步升级。检测模型仍然基于DB算法,优化策略采用了带残差注意力机制的FPN结构RSEFPN、增大感受野的PAN结构LKPAN、基于DML训练的更优的教师模型;识别模型将base模型从CRNN替换成了IJCAI 2022论文[SVTR](https://arxiv.org/abs/2205.00159),并采用SVTR轻量化、带指导训练CTC、数据增广策略RecConAug、自监督训练的更好的预训练模型、无标签数据的使用进行模型加速和效果提升。更多细节请参考PP-OCRv3[技术报告](./PP-OCRv3_introduction.md)。 +PP-OCRv3系统pipeline如下: + +
+<div align="center"> +    <img src="../ppocrv3_framework.png"> +</div>
+ ## 2. 特性 +- 超轻量PP-OCRv3系列:检测(3.6M)+ 方向分类器(1.4M)+ 识别(12M)= 17.0M - 超轻量PP-OCRv2系列:检测(3.1M)+ 方向分类器(1.4M)+ 识别(8.5M)= 13.0M - 超轻量PP-OCR mobile移动端系列:检测(3.0M)+方向分类器(1.4M)+ 识别(5.0M)= 9.4M - 通用PP-OCR server系列:检测(47.1M)+方向分类器(1.4M)+ 识别(94.9M)= 143.4M diff --git a/doc/doc_en/PP-OCRv3_introduction_en.md b/doc/doc_en/PP-OCRv3_introduction_en.md new file mode 100644 index 0000000000000000000000000000000000000000..791c95a6b560aef54ce1c70a6ced7ee1cd0f0368 --- /dev/null +++ b/doc/doc_en/PP-OCRv3_introduction_en.md @@ -0,0 +1 @@ +English | [简体中文](../doc_ch/PP-OCRv3_introduction.md) diff --git a/doc/doc_en/config_en.md b/doc/doc_en/config_en.md index 68c2b5f0c14f0c9b09d854f5a8b33ca86cc4bdf7..d467a7f918ed57eb80754483715f3671fd2552c7 100644 --- a/doc/doc_en/config_en.md +++ b/doc/doc_en/config_en.md @@ -36,6 +36,7 @@ Take rec_chinese_lite_train_v2.0.yml as an example | pretrained_model | Set the path of the pre-trained model | ./pretrain_models/CRNN/best_accuracy | \ | | checkpoints | set model parameter path | None | Used to load parameters after interruption to continue training| | use_visualdl | Set whether to enable visualdl for visual log display | False | [Tutorial](https://www.paddlepaddle.org.cn/paddle/visualdl) | +| use_wandb | Set whether to enable W&B for visual log display | False | [Documentation](https://docs.wandb.ai/) | infer_img | Set inference image path or folder path | ./infer_img | \|| | character_dict_path | Set dictionary path | ./ppocr/utils/ppocr_keys_v1.txt | If the character_dict_path is None, model can only recognize number and lower letters | | max_text_length | Set the maximum length of text | 25 | \ | @@ -66,7 +67,7 @@ In PaddleOCR, the network is divided into four stages: Transform, Backbone, Neck | :---------------------: | :---------------------: | :--------------: | :--------------------: | | model_type | Network Type | rec | Currently support`rec`,`det`,`cls` | | algorithm | Model name | CRNN | See [algorithm_overview](./algorithm_overview_en.md) for the support list | -| **Transform** | Set the transformation method | - | Currently only recognition algorithms are supported, see [ppocr/modeling/transforms](../../ppocr/modeling/transforms) for details | +| **Transform** | Set the transformation method | - | Currently only recognition algorithms are supported, see [ppocr/modeling/transform](../../ppocr/modeling/transforms) for details | | name | Transformation class name | TPS | Currently supports `TPS` | | num_fiducial | Number of TPS control points | 20 | Ten on the top and bottom | | loc_lr | Localization network learning rate | 0.1 | \ | @@ -130,6 +131,17 @@ In PaddleOCR, the network is divided into four stages: Transform, Backbone, Neck | drop_last | Whether to discard the last incomplete mini-batch because the number of samples in the data set cannot be divisible by batch_size | True | \ | | num_workers | The number of sub-processes used to load data, if it is 0, the sub-process is not started, and the data is loaded in the main process | 8 | \ | +### Weights & Biases ([W&B](../../ppocr/utils/loggers/wandb_logger.py)) +| Parameter | Use | Defaults | Note | | :---------------------: | :---------------------: | :--------------: | :--------------------: | +| project | Project to which the run is to be logged | uncategorized | \ | +| name | Alias/Name of the run | Randomly generated by wandb | \ | +| id | ID of the run | Randomly generated by wandb | \ | +| entity | User or team to which the run is being logged | The logged in user | \ | +| save_dir | Local directory in which all 
the models and other data are saved | wandb | \ | +| config | Model configuration | None | \ | + + ## 3. Multilingual Config File Generation @@ -233,4 +245,4 @@ For more supported languages, please refer to : [Multi-language model](https://g The multi-language model training method is the same as the Chinese model. The training data set is 100w synthetic data. A small amount of fonts and test data can be downloaded using the following two methods. * [Baidu Netdisk](https://pan.baidu.com/s/1bS_u207Rm7YbY33wOECKDA),Extraction code:frgi. -* [Google drive](https://drive.google.com/file/d/18cSWX7wXSy4G0tbKJ0d9PuIaiwRLHpjA/view) +* [Google drive](https://drive.google.com/file/d/18cSWX7wXSy4G0tbKJ0d9PuIaiwRLHpjA/view) \ No newline at end of file diff --git a/doc/doc_en/logging_en.md b/doc/doc_en/logging_en.md new file mode 100644 index 0000000000000000000000000000000000000000..d00ab8bd561c1bb7e489642298e74180e0c66886 --- /dev/null +++ b/doc/doc_en/logging_en.md @@ -0,0 +1,61 @@ +## Logging metrics and models + +PaddleOCR comes with two metric logging tools integrated directly into the training API: [VisualDL](https://readthedocs.org/projects/visualdl/) and [Weights & Biases](https://docs.wandb.ai/). + +### VisualDL +VisualDL is a visualization analysis tool of PaddlePaddle. The integration allows all training metrics to be logged to a VisualDL dashboard. To use it, add the following line to the `Global` section of the config yaml file - + +``` +Global: + use_visualdl: True +``` + +To see the visualizations, run the following command in your terminal + +```shell +visualdl --logdir <save_model_dir>/vdl/ +``` + +Now open `localhost:8040` in your browser of choice! + +### Weights & Biases +W&B is an MLOps tool that can be used for experiment tracking, dataset/model versioning, visualizing results and collaborating with colleagues. A W&B logger is integrated directly into PaddleOCR; to use it, first install the `wandb` sdk and log in to your wandb account. + +```shell +pip install wandb +wandb login +``` + +If you do not have a wandb account, you can make one [here](https://wandb.ai/site). + +To visualize and track your model training, add the following flag to your config yaml file under the `Global` section - + +``` +Global: + use_wandb: True +``` + +To pass more arguments to the `WandbLogger` (listed [here](./config_en.md)), add the header `wandb` to the yaml file and add the arguments under it - + +``` +wandb: + project: my_project + entity: my_team +``` + +These config variables from the yaml file are used to instantiate the `WandbLogger` object with the project name, entity name (the logged-in user by default), directory to store metadata (`./wandb` by default) and more. During the training process, the `log_metrics` function is called to log training and evaluation metrics at the training and evaluation steps respectively, from the rank 0 process only. + +At every model saving step, the WandbLogger logs the model using the `log_model` function, along with relevant metadata and tags indicating the epoch in which the model was saved and whether it is the best model so far. + +All the logging mentioned above is integrated into the `program.train` function and will generate dashboards like this - + +![W&B Dashboard](../imgs_en/wandb_metrics.png) + +![W&B Models](../imgs_en/wandb_models.png) + +For more advanced usage to log images, audio, video or any other form of data, you can use `WandbLogger().run.log`. More examples on how to log different kinds of data are available [here](https://docs.wandb.ai/examples). 
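+
+For a quick check outside of a full training run, the short sketch below (a minimal example; the project/entity names, metric values and `save_dir` are placeholders) drives the same `WandbLogger` API that `program.train` calls internally -
+
+```python
+import os
+
+from ppocr.utils.loggers import WandbLogger
+
+# make sure the save directory exists before wandb.init writes into it
+os.makedirs("./output", exist_ok=True)
+
+# extra keyword arguments are forwarded to wandb.init
+logger = WandbLogger(project="my_project", entity="my_team", save_dir="./output")
+
+# scalar metrics are namespaced by the lower-cased prefix, e.g. "train/loss"
+logger.log_metrics({"loss": 0.27, "acc": 0.91}, prefix="TRAIN", step=100)
+
+# finish the underlying wandb run
+logger.close()
+```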
+ +A link to the dashboard is printed to the console at the beginning and end of every training job, and you can also access it by logging in to your W&B account in your browser. + +### Using Multiple Loggers +VisualDL and W&B can also be used simultaneously by setting both of the aforementioned flags to True. \ No newline at end of file diff --git a/doc/doc_en/ppocr_introduction_en.md b/doc/doc_en/ppocr_introduction_en.md index c0b3c1c96a1de6a5953984c845f2fcb735f390f6..d8af8d9ee31dd4ab63b8e22b8f1c59f64ee10f38 100644 --- a/doc/doc_en/ppocr_introduction_en.md +++ b/doc/doc_en/ppocr_introduction_en.md @@ -32,24 +32,18 @@ PP-OCR system is in continuous optimization. At present, PP-OCR and PP-OCRv2 hav [2] On the basis of PP-OCR, PP-OCRv2 is further optimized in five aspects. The detection model adopts CML(Collaborative Mutual Learning) knowledge distillation strategy and CopyPaste data expansion strategy. The recognition model adopts LCNet lightweight backbone network, U-DML knowledge distillation strategy and enhanced CTC loss function improvement (as shown in the red box above), which further improves the inference speed and prediction effect. For more details, please refer to the technical report of PP-OCRv2 (https://arxiv.org/abs/2109.03144). -[3] PP-OCRv3 is further upgraded on the basis of PP-OCRv2. -PP-OCRv3 text detection has been further optimized from the two directions of network structure and distillation training strategy: -- Network structure improvement: Two improved FPN network structures, RSEFPN and LKPAN, are proposed to optimize the features in the FPN from the perspective of channel attention and a larger receptive field, and optimize the features extracted by the FPN. -- Distillation training strategy: First, use resnet50 as the backbone, the improved LKPAN network structure as the FPN, and use the DML self-distillation strategy to obtain a teacher model with higher accuracy; then, the FPN part of the student model adopts RSEFPN, and adopts the CML distillation method proposed by PPOCRV2, during the training process, dynamically adjust the proportion of CML distillation teacher loss. +[3] PP-OCRv3 is further upgraded on the basis of PP-OCRv2. The detection model is still based on the DB algorithm, and the optimization strategies include a newly proposed FPN structure with a residual attention mechanism named RSEFPN, a PAN structure with an enlarged receptive field named LKPAN, and a better teacher model based on DML training. The recognition model replaces the CRNN base model with the model from the IJCAI 2022 paper [SVTR](https://arxiv.org/abs/2205.00159), and adopts lightweight SVTR, guided training of CTC, the data augmentation strategy RecConAug, a better pre-trained model obtained by self-supervised training, and the use of unlabeled data to accelerate the model and improve its accuracy. For more details, please refer to the PP-OCRv3 [technical report](./PP-OCRv3_introduction_en.md). -|Index|Method|Model SIze|Hmean|CPU inference time| -|-|-|-|-|-| -|0|ppocr_mobile|3M|81.3|117ms| -|1|PPOCRV2|3M|83.3|117ms| -|2|teacher DML|124M|86.0|-| -|3|1 + 2 + RESFPN|3.6M|85.4|124ms| -|4|1 + 2 + LKPAN|4.6M|86.0|156ms| +PP-OCRv3 pipeline is as follows: -*note: CPU inference time refers to the average inference time on an Intel Gold 6148CPU with mkldnn enabled.* +
+<div align="center"> +    <img src="../ppocrv3_framework.png"> +</div>
+ ## 2. Features +- Ultra lightweight PP-OCRv3 series models: detection (3.6M) + direction classifier (1.4M) + recognition (12M) = 17.0M - Ultra lightweight PP-OCRv2 series models: detection (3.1M) + direction classifier (1.4M) + recognition (8.5M) = 13.0M - Ultra lightweight PP-OCR mobile series models: detection (3.0M) + direction classifier (1.4M) + recognition (5.0M) = 9.4M - General PP-OCR server series models: detection (47.1M) + direction classifier (1.4M) + recognition (94.9M) = 143.4M diff --git a/doc/features.png b/doc/features.png index 7d6342c1eb83e0544df0045a0cfa71bc083022fe..6295c902adc007446f72a2ff08793198f9ac2c4a 100644 Binary files a/doc/features.png and b/doc/features.png differ diff --git a/doc/features_en.png b/doc/features_en.png index 9f0a66299bb5e922257e3327b0c6cf2d3ebfe05b..6ec1ed7bdb50cd05d1cd280c635b0aea19725cb2 100644 Binary files a/doc/features_en.png and b/doc/features_en.png differ diff --git a/doc/imgs_en/wandb_metrics.png b/doc/imgs_en/wandb_metrics.png new file mode 100644 index 0000000000000000000000000000000000000000..45f0041ae4d3819c2bf9c9fababcceb3ff20a115 Binary files /dev/null and b/doc/imgs_en/wandb_metrics.png differ diff --git a/doc/imgs_en/wandb_models.png b/doc/imgs_en/wandb_models.png new file mode 100644 index 0000000000000000000000000000000000000000..f9a7042bd59fa16179bd8a1f1e0eb49031300e4f Binary files /dev/null and b/doc/imgs_en/wandb_models.png differ diff --git a/doc/ppocr_v3/GTC.png b/doc/ppocr_v3/GTC.png new file mode 100644 index 0000000000000000000000000000000000000000..2af2261d51d2279f171727a5a0b5a8d974763d80 Binary files /dev/null and b/doc/ppocr_v3/GTC.png differ diff --git a/doc/ppocr_v3/LKPAN.png b/doc/ppocr_v3/LKPAN.png index 87f7f69fb516d496c9357d81b97e5bdb750f808a..ff0578f6901603185809e10c85793c212c40dc48 100644 Binary files a/doc/ppocr_v3/LKPAN.png and b/doc/ppocr_v3/LKPAN.png differ diff --git a/doc/ppocr_v3/SSL.png b/doc/ppocr_v3/SSL.png new file mode 100644 index 0000000000000000000000000000000000000000..1344a2a77cf50c4ffe044e9ebc565215a2d7efbc Binary files /dev/null and b/doc/ppocr_v3/SSL.png differ diff --git a/doc/ppocr_v3/UDML.png b/doc/ppocr_v3/UDML.png new file mode 100644 index 0000000000000000000000000000000000000000..3b59bc58bc1e7cb0ae0ffab30b41bc519410c794 Binary files /dev/null and b/doc/ppocr_v3/UDML.png differ diff --git a/doc/ppocr_v3/ppocr_v3.png b/doc/ppocr_v3/ppocr_v3.png new file mode 100644 index 0000000000000000000000000000000000000000..123c125acdcbc9e2ef6e4d6a0a1c92d01136ffde Binary files /dev/null and b/doc/ppocr_v3/ppocr_v3.png differ diff --git a/doc/ppocr_v3/recconaug.png b/doc/ppocr_v3/recconaug.png new file mode 100644 index 0000000000000000000000000000000000000000..899bc430de897cbd8741e07bfa796a58bf2c7715 Binary files /dev/null and b/doc/ppocr_v3/recconaug.png differ diff --git a/doc/ppocr_v3/svtr_g2.png b/doc/ppocr_v3/svtr_g2.png new file mode 100644 index 0000000000000000000000000000000000000000..d589891d5897533243845a993bd56d8f75726cfc Binary files /dev/null and b/doc/ppocr_v3/svtr_g2.png differ diff --git a/doc/ppocr_v3/svtr_g4.png b/doc/ppocr_v3/svtr_g4.png new file mode 100644 index 0000000000000000000000000000000000000000..234a85c44b2cc3d968942480a596b2be5e45f53d Binary files /dev/null and b/doc/ppocr_v3/svtr_g4.png differ diff --git a/doc/ppocr_v3/svtr_tiny.jpg b/doc/ppocr_v3/svtr_tiny.jpg new file mode 100644 index 0000000000000000000000000000000000000000..26261047ef253e9802956f4c64449870d10de850 Binary files /dev/null and b/doc/ppocr_v3/svtr_tiny.jpg differ diff --git 
a/doc/ppocr_v3/svtr_tiny.png b/doc/ppocr_v3/svtr_tiny.png new file mode 100644 index 0000000000000000000000000000000000000000..91b3eacb9f1242806ad3520cc36252351fc7baf1 Binary files /dev/null and b/doc/ppocr_v3/svtr_tiny.png differ diff --git a/doc/ppocrv3_framework.png b/doc/ppocrv3_framework.png new file mode 100644 index 0000000000000000000000000000000000000000..c05398248fa7273382e9691a26d932bddc3cf84f Binary files /dev/null and b/doc/ppocrv3_framework.png differ diff --git a/ppocr/utils/loggers/__init__.py b/ppocr/utils/loggers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b1e92f734e84b7e0278f8e7940ef3baf137c159e --- /dev/null +++ b/ppocr/utils/loggers/__init__.py @@ -0,0 +1,3 @@ +from .vdl_logger import VDLLogger +from .wandb_logger import WandbLogger +from .loggers import Loggers diff --git a/ppocr/utils/loggers/base_logger.py b/ppocr/utils/loggers/base_logger.py new file mode 100644 index 0000000000000000000000000000000000000000..3a7fc3593ba8e69fdd5bed386c7ae4ff0d459988 --- /dev/null +++ b/ppocr/utils/loggers/base_logger.py @@ -0,0 +1,15 @@ +import os +from abc import ABC, abstractmethod + +class BaseLogger(ABC): + def __init__(self, save_dir): + self.save_dir = save_dir + os.makedirs(self.save_dir, exist_ok=True) + + @abstractmethod + def log_metrics(self, metrics, prefix=None): + pass + + @abstractmethod + def close(self): + pass \ No newline at end of file diff --git a/ppocr/utils/loggers/loggers.py b/ppocr/utils/loggers/loggers.py new file mode 100644 index 0000000000000000000000000000000000000000..260146620811c8e72da66e9f2c7bbcbaef90b90d --- /dev/null +++ b/ppocr/utils/loggers/loggers.py @@ -0,0 +1,18 @@ +from .wandb_logger import WandbLogger + +class Loggers(object): + def __init__(self, loggers): + super().__init__() + self.loggers = loggers + + def log_metrics(self, metrics, prefix=None, step=None): + for logger in self.loggers: + logger.log_metrics(metrics, prefix=prefix, step=step) + + def log_model(self, is_best, prefix, metadata=None): + for logger in self.loggers: + logger.log_model(is_best=is_best, prefix=prefix, metadata=metadata) + + def close(self): + for logger in self.loggers: + logger.close() \ No newline at end of file diff --git a/ppocr/utils/loggers/vdl_logger.py b/ppocr/utils/loggers/vdl_logger.py new file mode 100644 index 0000000000000000000000000000000000000000..c345f93235b239873f0ddcd49c8b1b8966877a03 --- /dev/null +++ b/ppocr/utils/loggers/vdl_logger.py @@ -0,0 +1,21 @@ +from .base_logger import BaseLogger +from visualdl import LogWriter + +class VDLLogger(BaseLogger): + def __init__(self, save_dir): + super().__init__(save_dir) + self.vdl_writer = LogWriter(logdir=save_dir) + + def log_metrics(self, metrics, prefix=None, step=None): + if not prefix: + prefix = "" + updated_metrics = {prefix + "/" + k: v for k, v in metrics.items()} + + for k, v in updated_metrics.items(): + self.vdl_writer.add_scalar(k, v, step) + + def log_model(self, is_best, prefix, metadata=None): + pass + + def close(self): + self.vdl_writer.close() \ No newline at end of file diff --git a/ppocr/utils/loggers/wandb_logger.py b/ppocr/utils/loggers/wandb_logger.py new file mode 100644 index 0000000000000000000000000000000000000000..b9c6711696569e825638e0a27394071020b29cb5 --- /dev/null +++ b/ppocr/utils/loggers/wandb_logger.py @@ -0,0 +1,78 @@ +import os +from .base_logger import BaseLogger + +class WandbLogger(BaseLogger): + def __init__(self, + project=None, + name=None, + id=None, + entity=None, + save_dir=None, + config=None, + **kwargs): + try: 
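+            # wandb is imported lazily so the package is only required when W&B logging is actually enabled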
+ import wandb + self.wandb = wandb + except ModuleNotFoundError: + raise ModuleNotFoundError( + "Please install wandb using `pip install wandb`" + ) + + self.project = project + self.name = name + self.id = id + self.save_dir = save_dir + self.config = config + self.kwargs = kwargs + self.entity = entity + self._run = None + self._wandb_init = dict( + project=self.project, + name=self.name, + id=self.id, + entity=self.entity, + dir=self.save_dir, + resume="allow" + ) + self._wandb_init.update(**kwargs) + + _ = self.run + + if self.config: + self.run.config.update(self.config) + + @property + def run(self): + if self._run is None: + if self.wandb.run is not None: + self.wandb.termwarn( + "There is a wandb run already in progress " + "and newly created instances of `WandbLogger` will reuse" + " this run. If this is not desired, call `wandb.finish()` " + "before instantiating `WandbLogger`." + ) + self._run = self.wandb.run + else: + self._run = self.wandb.init(**self._wandb_init) + return self._run + + def log_metrics(self, metrics, prefix=None, step=None): + if not prefix: + prefix = "" + updated_metrics = {prefix.lower() + "/" + k: v for k, v in metrics.items()} + + self.run.log(updated_metrics, step=step) + + def log_model(self, is_best, prefix, metadata=None): + model_path = os.path.join(self.save_dir, prefix + '.pdparams') + artifact = self.wandb.Artifact('model-{}'.format(self.run.id), type='model', metadata=metadata) + artifact.add_file(model_path, name="model_ckpt.pdparams") + + aliases = [prefix] + if is_best: + aliases.append("best") + + self.run.log_artifact(artifact, aliases=aliases) + + def close(self): + self.run.finish() \ No newline at end of file diff --git a/tools/program.py b/tools/program.py index 90fd309ae9e1ae23723d8e67c62a905e79a073d3..7c02dc0149f36085ef05ca378b79d27e92d6dd57 100755 --- a/tools/program.py +++ b/tools/program.py @@ -31,6 +31,7 @@ from ppocr.utils.stats import TrainingStats from ppocr.utils.save_load import save_model from ppocr.utils.utility import print_dict, AverageMeter from ppocr.utils.logging import get_logger +from ppocr.utils.loggers import VDLLogger, WandbLogger, Loggers from ppocr.utils import profiler from ppocr.data import build_dataloader @@ -161,7 +162,7 @@ def train(config, eval_class, pre_best_model_dict, logger, - vdl_writer=None, + log_writer=None, scaler=None): cal_metric_during_train = config['Global'].get('cal_metric_during_train', False) @@ -300,10 +301,8 @@ def train(config, stats['lr'] = lr train_stats.update(stats) - if vdl_writer is not None and dist.get_rank() == 0: - for k, v in train_stats.get().items(): - vdl_writer.add_scalar('TRAIN/{}'.format(k), v, global_step) - vdl_writer.add_scalar('TRAIN/lr', lr, global_step) + if log_writer is not None and dist.get_rank() == 0: + log_writer.log_metrics(metrics=train_stats.get(), prefix="TRAIN", step=global_step) if dist.get_rank() == 0 and ( (global_step > 0 and global_step % print_batch_step == 0) or @@ -349,11 +348,9 @@ def train(config, logger.info(cur_metric_str) # logger metric - if vdl_writer is not None: - for k, v in cur_metric.items(): - if isinstance(v, (float, int)): - vdl_writer.add_scalar('EVAL/{}'.format(k), - cur_metric[k], global_step) + if log_writer is not None: + log_writer.log_metrics(metrics=cur_metric, prefix="EVAL", step=global_step) + if cur_metric[main_indicator] >= best_model_dict[ main_indicator]: best_model_dict.update(cur_metric) @@ -374,10 +371,12 @@ ])) logger.info(best_str) # logger best metric - if vdl_writer is not None: - 
vdl_writer.add_scalar('EVAL/best_{}'.format(main_indicator), - best_model_dict[main_indicator], - global_step) + if log_writer is not None: + log_writer.log_metrics(metrics={ + "best_{}".format(main_indicator): best_model_dict[main_indicator] + }, prefix="EVAL", step=global_step) + + log_writer.log_model(is_best=True, prefix="best_accuracy", metadata=best_model_dict) reader_start = time.time() if dist.get_rank() == 0: @@ -392,6 +391,10 @@ best_model_dict=best_model_dict, epoch=epoch, global_step=global_step) + + if log_writer is not None: + log_writer.log_model(is_best=False, prefix="latest") + if dist.get_rank() == 0 and epoch > 0 and epoch % save_epoch_step == 0: save_model( model, @@ -404,11 +407,14 @@ best_model_dict=best_model_dict, epoch=epoch, global_step=global_step) + if log_writer is not None: + log_writer.log_model(is_best=False, prefix='iter_epoch_{}'.format(epoch)) + best_str = 'best metric, {}'.format(', '.join( ['{}: {}'.format(k, v) for k, v in best_model_dict.items()])) logger.info(best_str) - if dist.get_rank() == 0 and vdl_writer is not None: - vdl_writer.close() + if dist.get_rank() == 0 and log_writer is not None: + log_writer.close() return @@ -565,15 +571,32 @@ def preprocess(is_train=False): config['Global']['distributed'] = dist.get_world_size() != 1 - if config['Global']['use_visualdl'] and dist.get_rank() == 0: - from visualdl import LogWriter + loggers = [] + + if 'use_visualdl' in config['Global'] and config['Global']['use_visualdl']: save_model_dir = config['Global']['save_model_dir'] vdl_writer_path = '{}/vdl/'.format(save_model_dir) - os.makedirs(vdl_writer_path, exist_ok=True) - vdl_writer = LogWriter(logdir=vdl_writer_path) + log_writer = VDLLogger(vdl_writer_path) + loggers.append(log_writer) + if ('use_wandb' in config['Global'] and config['Global']['use_wandb']) or 'wandb' in config: + save_dir = config['Global']['save_model_dir'] + wandb_writer_path = "{}/wandb".format(save_dir) + if "wandb" in config: + wandb_params = config['wandb'] + else: + wandb_params = dict() + wandb_params.update({'save_dir': save_dir}) + log_writer = WandbLogger(**wandb_params, config=config) + loggers.append(log_writer) else: - vdl_writer = None + log_writer = None print_dict(config, logger) + + if loggers: + log_writer = Loggers(loggers) + else: + log_writer = None + logger.info('train with paddle {} and device {}'.format(paddle.__version__, device)) - return config, device, logger, vdl_writer + return config, device, logger, log_writer