diff --git a/PPOCRLabel/PPOCRLabel.py b/PPOCRLabel/PPOCRLabel.py index 523661b635ed4eaaf8d2b4ac05aaf60b2abc8541..7c7802a73ed32680142f8119b10a0393d1fab9cc 100644 --- a/PPOCRLabel/PPOCRLabel.py +++ b/PPOCRLabel/PPOCRLabel.py @@ -1439,8 +1439,8 @@ class MainWindow(QMainWindow): DEFAULT_LOCK_COLOR, key_cls, box['difficult'])) if imgidx in self.PPlabel.keys(): for box in self.PPlabel[imgidx]: - key_cls = None if not self.kie_mode else box['key_cls'] - shapes.append((box['transcription'], box['points'], None, key_cls, box['difficult'])) + key_cls = None if not self.kie_mode else box.get('key_cls', 'None') + shapes.append((box['transcription'], box['points'], None, key_cls, box.get('difficult', False))) self.loadLabels(shapes) self.canvas.verified = False @@ -1584,7 +1584,7 @@ class MainWindow(QMainWindow): for image, info in label_dict.items(): for box in info: if "key_cls" not in box: - continue + box.update({"key_cls": "None"}) self.existed_key_cls_set.add(box["key_cls"]) if len(self.existed_key_cls_set) > 0: for key_text in self.existed_key_cls_set: @@ -1606,8 +1606,6 @@ class MainWindow(QMainWindow): fit_to_content={'column': True, 'row': False}, flags=None ) - else: - self.keyDialog.labelList.addItems(self.existed_key_cls_set) def importDirImages(self, dirpath, isDelete=False): if not self.mayContinue() or not dirpath: diff --git a/PPOCRLabel/README.md b/PPOCRLabel/README.md index 0a1a7536912b6591517050bfa62260aaef6077cd..95baf66b297bb17b9396a20b8aa156fbd8273e39 100644 --- a/PPOCRLabel/README.md +++ b/PPOCRLabel/README.md @@ -12,6 +12,7 @@ PPOCRLabel is a semi-automatic graphic annotation tool suitable for OCR field, w - Add KIE Mode by using `--kie`, for [detection + identification + keyword extraction] labeling. - 2022.01:(by [PeterH0323](https://github.com/peterh0323) ) - Improve user experience: prompt for the number of files and labels, optimize interaction, and fix bugs such as only use CPU when inference + - New functions: Support using `C` or `X` to rotate box. - 2021.11.17: - Support install and start PPOCRLabel through the whl package (by [d2623587501](https://github.com/d2623587501)) - Dataset segmentation: Divide the annotation file into training, verification and testing parts (refer to section 3.5 below, by [MrCuiHao](https://github.com/MrCuiHao)) diff --git a/PPOCRLabel/README_ch.md b/PPOCRLabel/README_ch.md index 99c088de83f9cba775733a0473b50596683c47ab..5534b308f3b49905bfaa33b24e128e2bbcc1ef5b 100644 --- a/PPOCRLabel/README_ch.md +++ b/PPOCRLabel/README_ch.md @@ -11,7 +11,8 @@ PPOCRLabel是一款适用于OCR领域的半自动化图形标注工具,内置P - 2022.02:(by [PeterH0323](https://github.com/peterh0323) ) - 新增:使用 `--kie` 进入 KIE 功能,用于打【检测+识别+关键字提取】的标签 - 2022.01:(by [PeterH0323](https://github.com/peterh0323) ) - - 提升用户体验:新增文件与标记数目提示、优化交互、修复gpu使用等问题 + - 提升用户体验:新增文件与标记数目提示、优化交互、修复gpu使用等问题。 + - 新增功能:使用 `C` 和 `X` 对标记框进行旋转。 - 2021.11.17: - 新增支持通过whl包安装和启动PPOCRLabel(by [d2623587501](https://github.com/d2623587501)) - 标注数据集切分:对标注数据进行训练、验证与测试集划分(参考下方3.5节,by [MrCuiHao](https://github.com/MrCuiHao)) diff --git a/README.md b/README.md index 259ccb5aa02352ca2a2b81bf81d858cec2b47081..7ff799db308c72530b825bafc53973f707be389b 100644 --- a/README.md +++ b/README.md @@ -33,6 +33,8 @@ PaddleOCR aims to create multilingual, awesome, leading, and practical OCR tools PaddleOCR support a variety of cutting-edge algorithms related to OCR, and developed industrial featured models/solution [PP-OCR](./doc/doc_en/ppocr_introduction_en.md) and [PP-Structure](./ppstructure/README.md) on this basis, and get through the whole process of data production, model training, compression, inference and deployment. +PaddleOCR also supports metric and model logging during training to [VisualDL](https://www.paddlepaddle.org.cn/documentation/docs/en/guides/03_VisualDL/visualdl_usage_en.html) and [Weights & Biases](https://docs.wandb.ai/). + ![](./doc/features_en.png) > It is recommended to start with the “quick experience” in the document tutorial diff --git a/configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml b/configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml index 773a3649d8378cb39373b5b90837f17f9ecba335..e7cbae59a14af73639e1a74a14021b9b2ef60057 100644 --- a/configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml +++ b/configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml @@ -129,7 +129,7 @@ Loss: key: head_out multi_head: True - DistillationSARLoss: - weight: 0.5 + weight: 1.0 model_name_list: ["Student", "Teacher"] key: head_out multi_head: True diff --git a/configs/rec/PP-OCRv3/multi_language/arabic_PP-OCRv3_rec.yml b/configs/rec/PP-OCRv3/multi_language/arabic_PP-OCRv3_rec.yml new file mode 100644 index 0000000000000000000000000000000000000000..0ad1ab0adc189102ff07094fcda92d4f9ea9c662 --- /dev/null +++ b/configs/rec/PP-OCRv3/multi_language/arabic_PP-OCRv3_rec.yml @@ -0,0 +1,131 @@ +Global: + debug: false + use_gpu: true + epoch_num: 500 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/v3_arabic_mobile + save_epoch_step: 3 + eval_batch_step: [0, 2000] + cal_metric_during_train: true + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: false + infer_img: doc/imgs_words/ch/word_1.jpg + character_dict_path: ppocr/utils/dict/arabic_dict.txt + max_text_length: &max_text_length 25 + infer_mode: false + use_space_char: true + distributed: true + save_res_path: ./output/rec/predicts_ppocrv3_arabic.txt + + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + warmup_epoch: 5 + regularizer: + name: L2 + factor: 3.0e-05 + + +Architecture: + model_type: rec + algorithm: SVTR + Transform: + Backbone: + name: MobileNetV1Enhance + scale: 0.5 + last_conv_stride: [1, 2] + last_pool_type: avg + Head: + name: MultiHead + head_list: + - CTCHead: + Neck: + name: svtr + dims: 64 + depth: 2 + hidden_dims: 120 + use_guide: True + Head: + fc_decay: 0.00001 + - SARHead: + enc_dim: 512 + max_text_length: *max_text_length + +Loss: + name: MultiLoss + loss_config_list: + - CTCLoss: + - SARLoss: + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + ignore_space: False + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + ext_op_transform_idx: 1 + label_file_list: + - ./train_data/train_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - RecConAug: + prob: 0.5 + ext_data_num: 2 + image_shape: [48, 320, 3] + - RecAug: + - MultiLabelEncode: + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_sar + - length + - valid_ratio + loader: + shuffle: true + batch_size_per_card: 128 + drop_last: true + num_workers: 4 +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data + label_file_list: + - ./train_data/val_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - MultiLabelEncode: + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_sar + - length + - valid_ratio + loader: + shuffle: false + drop_last: false + batch_size_per_card: 128 + num_workers: 4 diff --git a/configs/rec/PP-OCRv3/multi_language/chinese_cht_PP-OCRv3_rec.yml b/configs/rec/PP-OCRv3/multi_language/chinese_cht_PP-OCRv3_rec.yml new file mode 100644 index 0000000000000000000000000000000000000000..28e0c10aa0f83fdf8e621aae04bf2b7374255adc --- /dev/null +++ b/configs/rec/PP-OCRv3/multi_language/chinese_cht_PP-OCRv3_rec.yml @@ -0,0 +1,131 @@ +Global: + debug: false + use_gpu: true + epoch_num: 500 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/v3_chinese_cht_mobile + save_epoch_step: 3 + eval_batch_step: [0, 2000] + cal_metric_during_train: true + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: false + infer_img: doc/imgs_words/ch/word_1.jpg + character_dict_path: ppocr/utils/dict/chinese_cht_dict.txt + max_text_length: &max_text_length 25 + infer_mode: false + use_space_char: true + distributed: true + save_res_path: ./output/rec/predicts_ppocrv3_chinese_cht.txt + + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + warmup_epoch: 5 + regularizer: + name: L2 + factor: 3.0e-05 + + +Architecture: + model_type: rec + algorithm: SVTR + Transform: + Backbone: + name: MobileNetV1Enhance + scale: 0.5 + last_conv_stride: [1, 2] + last_pool_type: avg + Head: + name: MultiHead + head_list: + - CTCHead: + Neck: + name: svtr + dims: 64 + depth: 2 + hidden_dims: 120 + use_guide: True + Head: + fc_decay: 0.00001 + - SARHead: + enc_dim: 512 + max_text_length: *max_text_length + +Loss: + name: MultiLoss + loss_config_list: + - CTCLoss: + - SARLoss: + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + ignore_space: False + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + ext_op_transform_idx: 1 + label_file_list: + - ./train_data/train_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - RecConAug: + prob: 0.5 + ext_data_num: 2 + image_shape: [48, 320, 3] + - RecAug: + - MultiLabelEncode: + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_sar + - length + - valid_ratio + loader: + shuffle: true + batch_size_per_card: 128 + drop_last: true + num_workers: 4 +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data + label_file_list: + - ./train_data/val_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - MultiLabelEncode: + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_sar + - length + - valid_ratio + loader: + shuffle: false + drop_last: false + batch_size_per_card: 128 + num_workers: 4 diff --git a/configs/rec/PP-OCRv3/multi_language/cyrillic_PP-OCRv3_rec.yml b/configs/rec/PP-OCRv3/multi_language/cyrillic_PP-OCRv3_rec.yml new file mode 100644 index 0000000000000000000000000000000000000000..fbdbe6c44c689ea267c9995f832305d800046edb --- /dev/null +++ b/configs/rec/PP-OCRv3/multi_language/cyrillic_PP-OCRv3_rec.yml @@ -0,0 +1,131 @@ +Global: + debug: false + use_gpu: true + epoch_num: 500 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/v3_cyrillic_mobile + save_epoch_step: 3 + eval_batch_step: [0, 2000] + cal_metric_during_train: true + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: false + infer_img: doc/imgs_words/ch/word_1.jpg + character_dict_path: ppocr/utils/dict/cyrillic_dict.txt + max_text_length: &max_text_length 25 + infer_mode: false + use_space_char: true + distributed: true + save_res_path: ./output/rec/predicts_ppocrv3_cyrillic.txt + + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + warmup_epoch: 5 + regularizer: + name: L2 + factor: 3.0e-05 + + +Architecture: + model_type: rec + algorithm: SVTR + Transform: + Backbone: + name: MobileNetV1Enhance + scale: 0.5 + last_conv_stride: [1, 2] + last_pool_type: avg + Head: + name: MultiHead + head_list: + - CTCHead: + Neck: + name: svtr + dims: 64 + depth: 2 + hidden_dims: 120 + use_guide: True + Head: + fc_decay: 0.00001 + - SARHead: + enc_dim: 512 + max_text_length: *max_text_length + +Loss: + name: MultiLoss + loss_config_list: + - CTCLoss: + - SARLoss: + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + ignore_space: False + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + ext_op_transform_idx: 1 + label_file_list: + - ./train_data/train_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - RecConAug: + prob: 0.5 + ext_data_num: 2 + image_shape: [48, 320, 3] + - RecAug: + - MultiLabelEncode: + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_sar + - length + - valid_ratio + loader: + shuffle: true + batch_size_per_card: 128 + drop_last: true + num_workers: 4 +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data + label_file_list: + - ./train_data/val_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - MultiLabelEncode: + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_sar + - length + - valid_ratio + loader: + shuffle: false + drop_last: false + batch_size_per_card: 128 + num_workers: 4 diff --git a/configs/rec/PP-OCRv3/multi_language/devanagari_PP-OCRv3_rec.yml b/configs/rec/PP-OCRv3/multi_language/devanagari_PP-OCRv3_rec.yml new file mode 100644 index 0000000000000000000000000000000000000000..48eb38df36f931b76b8e9fb8369daf06ad037d25 --- /dev/null +++ b/configs/rec/PP-OCRv3/multi_language/devanagari_PP-OCRv3_rec.yml @@ -0,0 +1,131 @@ +Global: + debug: false + use_gpu: true + epoch_num: 500 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/v3_devanagari_mobile + save_epoch_step: 3 + eval_batch_step: [0, 2000] + cal_metric_during_train: true + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: false + infer_img: doc/imgs_words/ch/word_1.jpg + character_dict_path: ppocr/utils/dict/devanagari_dict.txt + max_text_length: &max_text_length 25 + infer_mode: false + use_space_char: true + distributed: true + save_res_path: ./output/rec/predicts_ppocrv3_devanagari.txt + + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + warmup_epoch: 5 + regularizer: + name: L2 + factor: 3.0e-05 + + +Architecture: + model_type: rec + algorithm: SVTR + Transform: + Backbone: + name: MobileNetV1Enhance + scale: 0.5 + last_conv_stride: [1, 2] + last_pool_type: avg + Head: + name: MultiHead + head_list: + - CTCHead: + Neck: + name: svtr + dims: 64 + depth: 2 + hidden_dims: 120 + use_guide: True + Head: + fc_decay: 0.00001 + - SARHead: + enc_dim: 512 + max_text_length: *max_text_length + +Loss: + name: MultiLoss + loss_config_list: + - CTCLoss: + - SARLoss: + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + ignore_space: False + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + ext_op_transform_idx: 1 + label_file_list: + - ./train_data/train_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - RecConAug: + prob: 0.5 + ext_data_num: 2 + image_shape: [48, 320, 3] + - RecAug: + - MultiLabelEncode: + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_sar + - length + - valid_ratio + loader: + shuffle: true + batch_size_per_card: 128 + drop_last: true + num_workers: 4 +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data + label_file_list: + - ./train_data/val_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - MultiLabelEncode: + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_sar + - length + - valid_ratio + loader: + shuffle: false + drop_last: false + batch_size_per_card: 128 + num_workers: 4 diff --git a/configs/rec/PP-OCRv3/multi_language/japan_PP-OCRv3_rec.yml b/configs/rec/PP-OCRv3/multi_language/japan_PP-OCRv3_rec.yml new file mode 100644 index 0000000000000000000000000000000000000000..6cab0d447247e28bb58b30384d4f9d032d6ce9d0 --- /dev/null +++ b/configs/rec/PP-OCRv3/multi_language/japan_PP-OCRv3_rec.yml @@ -0,0 +1,131 @@ +Global: + debug: false + use_gpu: true + epoch_num: 500 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/v3_japan_mobile + save_epoch_step: 3 + eval_batch_step: [0, 2000] + cal_metric_during_train: true + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: false + infer_img: doc/imgs_words/ch/word_1.jpg + character_dict_path: ppocr/utils/dict/japan_dict.txt + max_text_length: &max_text_length 25 + infer_mode: false + use_space_char: true + distributed: true + save_res_path: ./output/rec/predicts_ppocrv3_japan.txt + + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + warmup_epoch: 5 + regularizer: + name: L2 + factor: 3.0e-05 + + +Architecture: + model_type: rec + algorithm: SVTR + Transform: + Backbone: + name: MobileNetV1Enhance + scale: 0.5 + last_conv_stride: [1, 2] + last_pool_type: avg + Head: + name: MultiHead + head_list: + - CTCHead: + Neck: + name: svtr + dims: 64 + depth: 2 + hidden_dims: 120 + use_guide: True + Head: + fc_decay: 0.00001 + - SARHead: + enc_dim: 512 + max_text_length: *max_text_length + +Loss: + name: MultiLoss + loss_config_list: + - CTCLoss: + - SARLoss: + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + ignore_space: False + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + ext_op_transform_idx: 1 + label_file_list: + - ./train_data/train_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - RecConAug: + prob: 0.5 + ext_data_num: 2 + image_shape: [48, 320, 3] + - RecAug: + - MultiLabelEncode: + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_sar + - length + - valid_ratio + loader: + shuffle: true + batch_size_per_card: 128 + drop_last: true + num_workers: 4 +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data + label_file_list: + - ./train_data/val_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - MultiLabelEncode: + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_sar + - length + - valid_ratio + loader: + shuffle: false + drop_last: false + batch_size_per_card: 128 + num_workers: 4 diff --git a/configs/rec/PP-OCRv3/multi_language/ka_PP-OCRv3_rec.yml b/configs/rec/PP-OCRv3/multi_language/ka_PP-OCRv3_rec.yml new file mode 100644 index 0000000000000000000000000000000000000000..7a9c8241d1564e5f1295655ba64694a117064bd8 --- /dev/null +++ b/configs/rec/PP-OCRv3/multi_language/ka_PP-OCRv3_rec.yml @@ -0,0 +1,131 @@ +Global: + debug: false + use_gpu: true + epoch_num: 500 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/v3_ka_mobile + save_epoch_step: 3 + eval_batch_step: [0, 2000] + cal_metric_during_train: true + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: false + infer_img: doc/imgs_words/ch/word_1.jpg + character_dict_path: ppocr/utils/dict/ka_dict.txt + max_text_length: &max_text_length 25 + infer_mode: false + use_space_char: true + distributed: true + save_res_path: ./output/rec/predicts_ppocrv3_ka.txt + + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + warmup_epoch: 5 + regularizer: + name: L2 + factor: 3.0e-05 + + +Architecture: + model_type: rec + algorithm: SVTR + Transform: + Backbone: + name: MobileNetV1Enhance + scale: 0.5 + last_conv_stride: [1, 2] + last_pool_type: avg + Head: + name: MultiHead + head_list: + - CTCHead: + Neck: + name: svtr + dims: 64 + depth: 2 + hidden_dims: 120 + use_guide: True + Head: + fc_decay: 0.00001 + - SARHead: + enc_dim: 512 + max_text_length: *max_text_length + +Loss: + name: MultiLoss + loss_config_list: + - CTCLoss: + - SARLoss: + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + ignore_space: False + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + ext_op_transform_idx: 1 + label_file_list: + - ./train_data/train_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - RecConAug: + prob: 0.5 + ext_data_num: 2 + image_shape: [48, 320, 3] + - RecAug: + - MultiLabelEncode: + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_sar + - length + - valid_ratio + loader: + shuffle: true + batch_size_per_card: 128 + drop_last: true + num_workers: 4 +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data + label_file_list: + - ./train_data/val_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - MultiLabelEncode: + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_sar + - length + - valid_ratio + loader: + shuffle: false + drop_last: false + batch_size_per_card: 128 + num_workers: 4 diff --git a/configs/rec/PP-OCRv3/multi_language/korean_PP-OCRv3_rec.yml b/configs/rec/PP-OCRv3/multi_language/korean_PP-OCRv3_rec.yml new file mode 100644 index 0000000000000000000000000000000000000000..29ff570772a621ba747e0388bcc0c042db0dba43 --- /dev/null +++ b/configs/rec/PP-OCRv3/multi_language/korean_PP-OCRv3_rec.yml @@ -0,0 +1,131 @@ +Global: + debug: false + use_gpu: true + epoch_num: 500 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/v3_korean_mobile + save_epoch_step: 3 + eval_batch_step: [0, 2000] + cal_metric_during_train: true + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: false + infer_img: doc/imgs_words/ch/word_1.jpg + character_dict_path: ppocr/utils/dict/korean_dict.txt + max_text_length: &max_text_length 25 + infer_mode: false + use_space_char: true + distributed: true + save_res_path: ./output/rec/predicts_ppocrv3_korean.txt + + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + warmup_epoch: 5 + regularizer: + name: L2 + factor: 3.0e-05 + + +Architecture: + model_type: rec + algorithm: SVTR + Transform: + Backbone: + name: MobileNetV1Enhance + scale: 0.5 + last_conv_stride: [1, 2] + last_pool_type: avg + Head: + name: MultiHead + head_list: + - CTCHead: + Neck: + name: svtr + dims: 64 + depth: 2 + hidden_dims: 120 + use_guide: True + Head: + fc_decay: 0.00001 + - SARHead: + enc_dim: 512 + max_text_length: *max_text_length + +Loss: + name: MultiLoss + loss_config_list: + - CTCLoss: + - SARLoss: + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + ignore_space: False + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + ext_op_transform_idx: 1 + label_file_list: + - ./train_data/train_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - RecConAug: + prob: 0.5 + ext_data_num: 2 + image_shape: [48, 320, 3] + - RecAug: + - MultiLabelEncode: + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_sar + - length + - valid_ratio + loader: + shuffle: true + batch_size_per_card: 128 + drop_last: true + num_workers: 4 +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data + label_file_list: + - ./train_data/val_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - MultiLabelEncode: + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_sar + - length + - valid_ratio + loader: + shuffle: false + drop_last: false + batch_size_per_card: 128 + num_workers: 4 diff --git a/configs/rec/PP-OCRv3/multi_language/latin_PP-OCRv3_rec.yml b/configs/rec/PP-OCRv3/multi_language/latin_PP-OCRv3_rec.yml new file mode 100644 index 0000000000000000000000000000000000000000..1784bfe611366c45230fd2abf69ab16e3a1c3ae9 --- /dev/null +++ b/configs/rec/PP-OCRv3/multi_language/latin_PP-OCRv3_rec.yml @@ -0,0 +1,131 @@ +Global: + debug: false + use_gpu: true + epoch_num: 500 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/v3_latin_mobile + save_epoch_step: 3 + eval_batch_step: [0, 2000] + cal_metric_during_train: true + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: false + infer_img: doc/imgs_words/ch/word_1.jpg + character_dict_path: ppocr/utils/dict/latin_dict.txt + max_text_length: &max_text_length 25 + infer_mode: false + use_space_char: true + distributed: true + save_res_path: ./output/rec/predicts_ppocrv3_latin.txt + + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + warmup_epoch: 5 + regularizer: + name: L2 + factor: 3.0e-05 + + +Architecture: + model_type: rec + algorithm: SVTR + Transform: + Backbone: + name: MobileNetV1Enhance + scale: 0.5 + last_conv_stride: [1, 2] + last_pool_type: avg + Head: + name: MultiHead + head_list: + - CTCHead: + Neck: + name: svtr + dims: 64 + depth: 2 + hidden_dims: 120 + use_guide: True + Head: + fc_decay: 0.00001 + - SARHead: + enc_dim: 512 + max_text_length: *max_text_length + +Loss: + name: MultiLoss + loss_config_list: + - CTCLoss: + - SARLoss: + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + ignore_space: False + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + ext_op_transform_idx: 1 + label_file_list: + - ./train_data/train_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - RecConAug: + prob: 0.5 + ext_data_num: 2 + image_shape: [48, 320, 3] + - RecAug: + - MultiLabelEncode: + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_sar + - length + - valid_ratio + loader: + shuffle: true + batch_size_per_card: 128 + drop_last: true + num_workers: 4 +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data + label_file_list: + - ./train_data/val_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - MultiLabelEncode: + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_sar + - length + - valid_ratio + loader: + shuffle: false + drop_last: false + batch_size_per_card: 128 + num_workers: 4 diff --git a/configs/rec/PP-OCRv3/multi_language/ta_PP-OCRv3_rec.yml b/configs/rec/PP-OCRv3/multi_language/ta_PP-OCRv3_rec.yml new file mode 100644 index 0000000000000000000000000000000000000000..70b26aa84a2178111edab9f094c369c5d22e31a9 --- /dev/null +++ b/configs/rec/PP-OCRv3/multi_language/ta_PP-OCRv3_rec.yml @@ -0,0 +1,131 @@ +Global: + debug: false + use_gpu: true + epoch_num: 500 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/v3_ta_mobile + save_epoch_step: 3 + eval_batch_step: [0, 2000] + cal_metric_during_train: true + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: false + infer_img: doc/imgs_words/ch/word_1.jpg + character_dict_path: ppocr/utils/dict/ta_dict.txt + max_text_length: &max_text_length 25 + infer_mode: false + use_space_char: true + distributed: true + save_res_path: ./output/rec/predicts_ppocrv3_ta.txt + + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + warmup_epoch: 5 + regularizer: + name: L2 + factor: 3.0e-05 + + +Architecture: + model_type: rec + algorithm: SVTR + Transform: + Backbone: + name: MobileNetV1Enhance + scale: 0.5 + last_conv_stride: [1, 2] + last_pool_type: avg + Head: + name: MultiHead + head_list: + - CTCHead: + Neck: + name: svtr + dims: 64 + depth: 2 + hidden_dims: 120 + use_guide: True + Head: + fc_decay: 0.00001 + - SARHead: + enc_dim: 512 + max_text_length: *max_text_length + +Loss: + name: MultiLoss + loss_config_list: + - CTCLoss: + - SARLoss: + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + ignore_space: False + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + ext_op_transform_idx: 1 + label_file_list: + - ./train_data/train_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - RecConAug: + prob: 0.5 + ext_data_num: 2 + image_shape: [48, 320, 3] + - RecAug: + - MultiLabelEncode: + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_sar + - length + - valid_ratio + loader: + shuffle: true + batch_size_per_card: 128 + drop_last: true + num_workers: 4 +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data + label_file_list: + - ./train_data/val_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - MultiLabelEncode: + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_sar + - length + - valid_ratio + loader: + shuffle: false + drop_last: false + batch_size_per_card: 128 + num_workers: 4 diff --git a/configs/rec/PP-OCRv3/multi_language/te_PP-OCRv3_rec.yml b/configs/rec/PP-OCRv3/multi_language/te_PP-OCRv3_rec.yml new file mode 100644 index 0000000000000000000000000000000000000000..3617af79e3b9c5a55ef22d549465ba2109618e32 --- /dev/null +++ b/configs/rec/PP-OCRv3/multi_language/te_PP-OCRv3_rec.yml @@ -0,0 +1,131 @@ +Global: + debug: false + use_gpu: true + epoch_num: 500 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/v3_te_mobile + save_epoch_step: 3 + eval_batch_step: [0, 2000] + cal_metric_during_train: true + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: false + infer_img: doc/imgs_words/ch/word_1.jpg + character_dict_path: ppocr/utils/dict/te_dict.txt + max_text_length: &max_text_length 25 + infer_mode: false + use_space_char: true + distributed: true + save_res_path: ./output/rec/predicts_ppocrv3_te.txt + + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + warmup_epoch: 5 + regularizer: + name: L2 + factor: 3.0e-05 + + +Architecture: + model_type: rec + algorithm: SVTR + Transform: + Backbone: + name: MobileNetV1Enhance + scale: 0.5 + last_conv_stride: [1, 2] + last_pool_type: avg + Head: + name: MultiHead + head_list: + - CTCHead: + Neck: + name: svtr + dims: 64 + depth: 2 + hidden_dims: 120 + use_guide: True + Head: + fc_decay: 0.00001 + - SARHead: + enc_dim: 512 + max_text_length: *max_text_length + +Loss: + name: MultiLoss + loss_config_list: + - CTCLoss: + - SARLoss: + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + ignore_space: False + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + ext_op_transform_idx: 1 + label_file_list: + - ./train_data/train_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - RecConAug: + prob: 0.5 + ext_data_num: 2 + image_shape: [48, 320, 3] + - RecAug: + - MultiLabelEncode: + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_sar + - length + - valid_ratio + loader: + shuffle: true + batch_size_per_card: 128 + drop_last: true + num_workers: 4 +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data + label_file_list: + - ./train_data/val_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - MultiLabelEncode: + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_sar + - length + - valid_ratio + loader: + shuffle: false + drop_last: false + batch_size_per_card: 128 + num_workers: 4 diff --git a/deploy/Jetson/readme.md b/deploy/Jetson/readme.md index 1eb0ea44f77b522567100fc6b38a23dab68613af..14c88c3c3c8aa8e4bbfe6ce04cb51c58af762351 100644 --- a/deploy/Jetson/readme.md +++ b/deploy/Jetson/readme.md @@ -1,46 +1,45 @@ +English | [简体中文](readme_ch.md) -# Jetson部署PaddleOCR模型 +# Jetson Deployment for PaddleOCR -本节介绍PaddleOCR在Jetson NX、TX2、nano、AGX等系列硬件的部署。 +This section introduces the deployment of PaddleOCR on Jetson NX, TX2, nano, AGX and other series of hardware. -## 1. 环境准备 +## 1. Prepare Environment -需要准备一台Jetson开发板,如果需要TensorRT预测,需准备好TensorRT环境,建议使用7.1.3版本的TensorRT; +You need to prepare a Jetson development hardware. If you need TensorRT, you need to prepare the TensorRT environment. It is recommended to use TensorRT version 7.1.3; -1. Jetson安装PaddlePaddle +1. Install PaddlePaddle in Jetson -PaddlePaddle下载[链接](https://www.paddlepaddle.org.cn/inference/user_guides/download_lib.html#python) -请选择适合的您Jetpack版本、cuda版本、trt版本的安装包。 +The PaddlePaddle download [link](https://www.paddlepaddle.org.cn/inference/user_guides/download_lib.html#python) +Please select the appropriate installation package for your Jetpack version, cuda version, and trt version. Here, we download paddlepaddle_gpu-2.3.0rc0-cp36-cp36m-linux_aarch64.whl. -安装命令: +Install PaddlePaddle: ```shell -# 安装paddle,以paddlepaddle_gpu-2.3.0rc0-cp36-cp36m-linux_aarch64.whl 为例 pip3 install -U paddlepaddle_gpu-2.3.0rc0-cp36-cp36m-linux_aarch64.whl ``` -2. 下载PaddleOCR代码并安装依赖 +2. Download PaddleOCR code and install dependencies -首先 clone PaddleOCR 代码: +Clone the PaddleOCR code: ``` git clone https://github.com/PaddlePaddle/PaddleOCR ``` -然后,安装依赖: +and install dependencies: ``` cd PaddleOCR pip3 install -r requirements.txt ``` -*注:jetson硬件CPU较差,依赖安装较慢,请耐心等待* +*Note: Jetson hardware CPU is poor, dependency installation is slow, please wait patiently* +## 2. Perform prediction -## 2. 执行预测 +Obtain the PPOCR model from the [document](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/doc/doc_en/ppocr_introduction_en.md#6-model-zoo) model library. The following takes the PP-OCRv3 model as an example to introduce the use of the PPOCR model on Jetson: -从[文档](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/doc/doc_ch/ppocr_introduction.md#6-%E6%A8%A1%E5%9E%8B%E5%BA%93) 模型库中获取PPOCR模型,下面以PP-OCRv3模型为例,介绍在PPOCR模型在jetson上的使用方式: - -下载并解压PP-OCRv3模型 +Download and unzip the PP-OCRv3 models. ``` wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar @@ -48,38 +47,38 @@ tar xf ch_PP-OCRv3_det_infer.tar tar xf ch_PP-OCRv3_rec_infer.tar ``` -执行文本检测预测: +The text detection inference: ``` cd PaddleOCR python3 tools/infer/predict_det.py --det_model_dir=./inference/ch_PP-OCRv2_det_infer/ --image_dir=./doc/imgs/french_0.jpg --use_gpu=True ``` -执行命令后在终端会打印出预测的信息,并在 `./inference_results/` 下保存可视化结果。 +After executing the command, the predicted information will be printed out in the terminal, and the visualization results will be saved in the `./inference_results/` directory. ![](./images/det_res_french_0.jpg) -执行文本识别预测: +The text recognition inference: ``` python3 tools/infer/predict_det.py --rec_model_dir=./inference/ch_PP-OCRv2_rec_infer/ --image_dir=./doc/imgs_words/en/word_2.png --use_gpu=True --rec_image_shape="3,48,320" ``` -执行命令后在终端会打印出预测的信息,输出如下: +After executing the command, the predicted information will be printed on the terminal, and the output is as follows: ``` [2022/04/28 15:41:45] root INFO: Predicts of ./doc/imgs_words/en/word_2.png:('yourself', 0.98084533) ``` -执行文本检测+文本识别串联预测: +The text detection and text recognition inference: ``` -python3 tools/infer/predict_system.py --det_model_dir=./inference/ch_PP-OCRv2_det_infer/ --rec_model_dir=./inference/ch_PP-OCRv2_rec_infer/ --image_dir=./doc/imgs/ --use_gpu=True --rec_image_shape="3,48,320" +python3 tools/infer/predict_system.py --det_model_dir=./inference/ch_PP-OCRv2_det_infer/ --rec_model_dir=./inference/ch_PP-OCRv2_rec_infer/ --image_dir=./doc/imgs/00057937.jpg --use_gpu=True --rec_image_shape="3,48,320" ``` -执行命令后在终端会打印出预测的信息,并在 `./inference_results/` 下保存可视化结果。 +After executing the command, the predicted information will be printed out in the terminal, and the visualization results will be saved in the `./inference_results/` directory. ![](./images/00057937.jpg) -开启TRT预测只需要在以上命令基础上设置`--use_tensorrt=True`即可: +To enable TRT prediction, you only need to set `--use_tensorrt=True` on the basis of the above command: ``` -python3 tools/infer/predict_system.py --det_model_dir=./inference/ch_PP-OCRv2_det_infer/ --rec_model_dir=./inference/ch_PP-OCRv2_rec_infer/ --image_dir=./doc/imgs/00057937.jpg --use_gpu=True --use_tensorrt=True --rec_image_shape="3,48,320" +python3 tools/infer/predict_system.py --det_model_dir=./inference/ch_PP-OCRv2_det_infer/ --rec_model_dir=./inference/ch_PP-OCRv2_rec_infer/ --image_dir=./doc/imgs/ --rec_image_shape="3,48,320" --use_gpu=True --use_tensorrt=True ``` -更多ppocr模型预测请参考[文档](../../doc/doc_ch/models_list.md) +For more ppocr model predictions, please refer to[document](../../doc/doc_en/models_list_en.md) diff --git a/deploy/Jetson/readme_ch.md b/deploy/Jetson/readme_ch.md new file mode 100644 index 0000000000000000000000000000000000000000..7b0a344ffe3b4c0abefc78e50371b5c92aa25001 --- /dev/null +++ b/deploy/Jetson/readme_ch.md @@ -0,0 +1,86 @@ +[English](readme.md) | 简体中文 + +# Jetson部署PaddleOCR模型 + +本节介绍PaddleOCR在Jetson NX、TX2、nano、AGX等系列硬件的部署。 + + +## 1. 环境准备 + +需要准备一台Jetson开发板,如果需要TensorRT预测,需准备好TensorRT环境,建议使用7.1.3版本的TensorRT; + +1. Jetson安装PaddlePaddle + +PaddlePaddle下载[链接](https://www.paddlepaddle.org.cn/inference/user_guides/download_lib.html#python) +请选择适合的您Jetpack版本、cuda版本、trt版本的安装包。 + +安装命令: +```shell +# 安装paddle,以paddlepaddle_gpu-2.3.0rc0-cp36-cp36m-linux_aarch64.whl 为例 +pip3 install -U paddlepaddle_gpu-2.3.0rc0-cp36-cp36m-linux_aarch64.whl +``` + + +2. 下载PaddleOCR代码并安装依赖 + +首先 clone PaddleOCR 代码: +``` +git clone https://github.com/PaddlePaddle/PaddleOCR +``` + +然后,安装依赖: +``` +cd PaddleOCR +pip3 install -r requirements.txt +``` + +*注:jetson硬件CPU较差,依赖安装较慢,请耐心等待* + + +## 2. 执行预测 + +从[文档](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/doc/doc_ch/ppocr_introduction.md#6-%E6%A8%A1%E5%9E%8B%E5%BA%93) 模型库中获取PPOCR模型,下面以PP-OCRv3模型为例,介绍在PPOCR模型在jetson上的使用方式: + +下载并解压PP-OCRv3模型 +``` +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar +tar xf ch_PP-OCRv3_det_infer.tar +tar xf ch_PP-OCRv3_rec_infer.tar +``` + +执行文本检测预测: +``` +cd PaddleOCR +python3 tools/infer/predict_det.py --det_model_dir=./inference/ch_PP-OCRv2_det_infer/ --image_dir=./doc/imgs/french_0.jpg --use_gpu=True +``` + +执行命令后在终端会打印出预测的信息,并在 `./inference_results/` 下保存可视化结果。 +![](./images/det_res_french_0.jpg) + + +执行文本识别预测: +``` +python3 tools/infer/predict_det.py --rec_model_dir=./inference/ch_PP-OCRv2_rec_infer/ --image_dir=./doc/imgs_words/en/word_2.png --use_gpu=True --rec_image_shape="3,48,320" +``` + +执行命令后在终端会打印出预测的信息,输出如下: +``` +[2022/04/28 15:41:45] root INFO: Predicts of ./doc/imgs_words/en/word_2.png:('yourself', 0.98084533) +``` + +执行文本检测+文本识别串联预测: + +``` +python3 tools/infer/predict_system.py --det_model_dir=./inference/ch_PP-OCRv2_det_infer/ --rec_model_dir=./inference/ch_PP-OCRv2_rec_infer/ --image_dir=./doc/imgs/ --use_gpu=True --rec_image_shape="3,48,320" +``` + +执行命令后在终端会打印出预测的信息,并在 `./inference_results/` 下保存可视化结果。 +![](./images/00057937.jpg) + +开启TRT预测只需要在以上命令基础上设置`--use_tensorrt=True`即可: +``` +python3 tools/infer/predict_system.py --det_model_dir=./inference/ch_PP-OCRv2_det_infer/ --rec_model_dir=./inference/ch_PP-OCRv2_rec_infer/ --image_dir=./doc/imgs/00057937.jpg --use_gpu=True --use_tensorrt=True --rec_image_shape="3,48,320" +``` + +更多ppocr模型预测请参考[文档](../../doc/doc_ch/models_list.md) diff --git a/deploy/Jetson/readme_en.md b/deploy/Jetson/readme_en.md deleted file mode 100644 index d499989160e13d6dd6bf1092c890e8ec11681ce4..0000000000000000000000000000000000000000 --- a/deploy/Jetson/readme_en.md +++ /dev/null @@ -1,83 +0,0 @@ - -# Jetson Deployment for PaddleOCR - -This section introduces the deployment of PaddleOCR on Jetson NX, TX2, nano, AGX and other series of hardware. - - -## 1. Prepare Environment - -You need to prepare a Jetson development hardware. If you need TensorRT, you need to prepare the TensorRT environment. It is recommended to use TensorRT version 7.1.3; - -1. Install PaddlePaddle in Jetson - -The PaddlePaddle download [link](https://www.paddlepaddle.org.cn/inference/user_guides/download_lib.html#python) -Please select the appropriate installation package for your Jetpack version, cuda version, and trt version. Here, we download paddlepaddle_gpu-2.3.0rc0-cp36-cp36m-linux_aarch64.whl. - -Install PaddlePaddle: -```shell -pip3 install -U paddlepaddle_gpu-2.3.0rc0-cp36-cp36m-linux_aarch64.whl -``` - - -2. Download PaddleOCR code and install dependencies - -Clone the PaddleOCR code: -``` -git clone https://github.com/PaddlePaddle/PaddleOCR -``` - -and install dependencies: -``` -cd PaddleOCR -pip3 install -r requirements.txt -``` - -*Note: Jetson hardware CPU is poor, dependency installation is slow, please wait patiently* - -## 2. Perform prediction - -Obtain the PPOCR model from the [document](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/doc/doc_en/ppocr_introduction_en.md#6-model-zoo) model library. The following takes the PP-OCRv3 model as an example to introduce the use of the PPOCR model on Jetson: - -Download and unzip the PP-OCRv3 models. -``` -wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar -wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar -tar xf ch_PP-OCRv3_det_infer.tar -tar xf ch_PP-OCRv3_rec_infer.tar -``` - -The text detection inference: -``` -cd PaddleOCR -python3 tools/infer/predict_det.py --det_model_dir=./inference/ch_PP-OCRv2_det_infer/ --image_dir=./doc/imgs/french_0.jpg --use_gpu=True -``` - -After executing the command, the predicted information will be printed out in the terminal, and the visualization results will be saved in the `./inference_results/` directory. -![](./images/det_res_french_0.jpg) - - -The text recognition inference: -``` -python3 tools/infer/predict_det.py --rec_model_dir=./inference/ch_PP-OCRv2_rec_infer/ --image_dir=./doc/imgs_words/en/word_2.png --use_gpu=True --rec_image_shape="3,48,320" -``` - -After executing the command, the predicted information will be printed on the terminal, and the output is as follows: -``` -[2022/04/28 15:41:45] root INFO: Predicts of ./doc/imgs_words/en/word_2.png:('yourself', 0.98084533) -``` - -The text detection and text recognition inference: - -``` -python3 tools/infer/predict_system.py --det_model_dir=./inference/ch_PP-OCRv2_det_infer/ --rec_model_dir=./inference/ch_PP-OCRv2_rec_infer/ --image_dir=./doc/imgs/00057937.jpg --use_gpu=True --rec_image_shape="3,48,320" -``` - -After executing the command, the predicted information will be printed out in the terminal, and the visualization results will be saved in the `./inference_results/` directory. -![](./images/00057937.jpg) - -To enable TRT prediction, you only need to set `--use_tensorrt=True` on the basis of the above command: -``` -python3 tools/infer/predict_system.py --det_model_dir=./inference/ch_PP-OCRv2_det_infer/ --rec_model_dir=./inference/ch_PP-OCRv2_rec_infer/ --image_dir=./doc/imgs/ --rec_image_shape="3,48,320" --use_gpu=True --use_tensorrt=True -``` - -For more ppocr model predictions, please refer to[document](../../doc/doc_en/models_list_en.md) diff --git a/deploy/README.md b/deploy/README.md index 3a77ff22a15b624ace56027f606eff61e9efb66c..0cfb793f92547de1215277f5e2348bc157ec8503 100644 --- a/deploy/README.md +++ b/deploy/README.md @@ -25,7 +25,7 @@ PP-OCR has supported muti deployment schemes. Click the link to get the specific - [Serving (Python/C++)](./pdserving/README.md) - [Paddle-Lite (ARM CPU/OpenCL ARM GPU)](./lite/readme.md) - [Paddle.js](./paddlejs/README.md) -- [Jetson Inference]() +- [Jetson Inference](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/deploy/Jetson/readme.md) - [Paddle2ONNX](./paddle2onnx/readme.md) If you need the deployment tutorial of academic algorithm models other than PP-OCR, please directly enter the main page of corresponding algorithms, [entrance](../doc/doc_en/algorithm_overview_en.md)。 diff --git a/deploy/README_ch.md b/deploy/README_ch.md index 9ec90bd36a05a53ae0f27386f6d2eba798e40842..1773aedc2c757499bfc1f950e468a98c477636ac 100644 --- a/deploy/README_ch.md +++ b/deploy/README_ch.md @@ -25,7 +25,7 @@ PP-OCR模型已打通多种场景部署方案,点击链接获取具体的使 - [Serving 服务化部署(Python/C++)](./pdserving/README_CN.md) - [Paddle-Lite 端侧部署(ARM CPU/OpenCL ARM GPU)](./lite/readme_ch.md) - [Paddle.js 部署](./paddlejs/README_ch.md) -- [Jetson 推理]() +- [Jetson 推理](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/deploy/Jetson/readme_ch.md) - [Paddle2ONNX 推理](./paddle2onnx/readme_ch.md) -需要PP-OCR以外的学术算法模型的推理部署,请直接进入相应算法主页面,[入口](../doc/doc_ch/algorithm_overview.md)。 +需要PP-OCR以外的学术算法模型的推理部署,请直接进入相应算法主页面,[入口](../doc/doc_ch/algorithm_overview.md)。 \ No newline at end of file diff --git a/deploy/pdserving/config.yml b/deploy/pdserving/config.yml index 6e30a626d0cdb0b4e5fe6feb737ea46c2bc59f90..19cd9325ee8b241fd591678b9ba6452de9bec025 100644 --- a/deploy/pdserving/config.yml +++ b/deploy/pdserving/config.yml @@ -37,7 +37,7 @@ op: model_config: ./ppocr_det_v3_serving #Fetch结果列表,以client_config中fetch_var的alias_name为准 - fetch_list: ["save_infer_model/scale_0.tmp_1"] + fetch_list: ["sigmoid_0.tmp_0"] #计算硬件ID,当devices为""或不写时为CPU预测;当devices为"0", "0,1,2"时为GPU预测,表示使用的GPU卡 devices: "0" diff --git a/deploy/pdserving/web_service.py b/deploy/pdserving/web_service.py index 07fd6102beaef4001f87574a2f0631e2b1012613..98e2dfba2f5abd3fc36bf3743b23f7eb7be3b9c4 100644 --- a/deploy/pdserving/web_service.py +++ b/deploy/pdserving/web_service.py @@ -56,7 +56,7 @@ class DetOp(Op): return {"x": det_img[np.newaxis, :].copy()}, False, None, "" def postprocess(self, input_dicts, fetch_dict, data_id, log_id): - det_out = fetch_dict["save_infer_model/scale_0.tmp_1"] + det_out = fetch_dict["sigmoid_0.tmp_0"] ratio_list = [ float(self.new_h) / self.ori_h, float(self.new_w) / self.ori_w ] diff --git a/deploy/pdserving/web_service_det.py b/deploy/pdserving/web_service_det.py index 0ca8dbc41bbdde4caf76bcfddabe4b9c2e94cb4b..7584608a9fed4bea93caa5c814c0450566696d56 100644 --- a/deploy/pdserving/web_service_det.py +++ b/deploy/pdserving/web_service_det.py @@ -55,7 +55,7 @@ class DetOp(Op): return {"x": det_img[np.newaxis, :].copy()}, False, None, "" def postprocess(self, input_dicts, fetch_dict, data_id, log_id): - det_out = fetch_dict["save_infer_model/scale_0.tmp_1"] + det_out = fetch_dict["sigmoid_0.tmp_0"] ratio_list = [ float(self.new_h) / self.ori_h, float(self.new_w) / self.ori_w ] diff --git a/doc/doc_ch/PP-OCRv3_introduction.md b/doc/doc_ch/PP-OCRv3_introduction.md index f65ee7ccf9cf813d20d9419cf3b8e62c22eeabf0..dc0271f294cf43a26477dbc974b77297e04122ac 100644 --- a/doc/doc_ch/PP-OCRv3_introduction.md +++ b/doc/doc_ch/PP-OCRv3_introduction.md @@ -1,38 +1,27 @@ [English](../doc_en/PP-OCRv3_introduction_en.md) | 简体中文 -# PP-OCR +# PP-OCRv3 - [1. 简介](#1) -- [2. 特性](#2) -- [3. benchmark](#3) +- [2. 检测优化](#2) +- [3. 识别优化](#3) +- [4. 端到端评估](#4) ## 1. 简介 -PP-OCR是PaddleOCR自研的实用的超轻量OCR系统。在实现[前沿算法](algorithm.md)的基础上,考虑精度与速度的平衡,进行**模型瘦身**和**深度优化**,使其尽可能满足产业落地需求。 +PP-OCRv3在PP-OCRv2的基础上进一步升级。检测模型仍然基于DB算法,优化策略采用了带残差注意力机制的FPN结构RSEFPN、增大感受野的PAN结构LKPAN、基于DML训练的更优的教师模型;识别模型将base模型从CRNN替换成了IJCAI 2022论文[SVTR](),并采用SVTR轻量化、带指导训练CTC、数据增广策略RecConAug、自监督训练的更好的预训练模型、无标签数据的使用进行模型加速和效果提升。更多细节请参考PP-OCRv3[技术报告](./PP-OCRv3_introduction.md)。 -#### PP-OCR - -PP-OCR是一个两阶段的OCR系统,其中文本检测算法选用[DB](algorithm_det_db.md),文本识别算法选用[CRNN](algorithm_rec_crnn.md),并在检测和识别模块之间添加[文本方向分类器](angle_class.md),以应对不同方向的文本识别。 - -PP-OCRv2系统pipeline如下: +PP-OCRv3系统pipeline如下: